feat: implement layout-preserving PDF generation with table reconstruction

Major Features:
- Add PDF generation service with Chinese font support
- Parse HTML tables from PP-StructureV3 and rebuild with ReportLab
- Extract table text for translation purposes
- Auto-filter text regions inside tables to avoid overlaps

Backend Changes:
1. pdf_generator_service.py (NEW)
   - HTMLTableParser: Parse HTML tables to extract structure
   - PDFGeneratorService: Generate layout-preserving PDFs
   - Coordinate transformation: OCR (top-left) → PDF (bottom-left)
   - Font size heuristics: 75% of bbox height with width checking
   - Table reconstruction: Parse HTML → ReportLab Table
   - Image embedding: Extract bbox from filenames
2. ocr_service.py
   - Add _extract_table_text() for translation support
   - Add output_dir parameter to save images to result directory
   - Extract bbox from image filenames (img_in_table_box_x1_y1_x2_y2.jpg)
3. tasks.py
   - Update process_task_ocr to use save_results() with PDF generation
   - Fix download_pdf endpoint to use database-stored PDF paths
   - Support on-demand PDF generation from JSON
4. config.py
   - Add chinese_font_path configuration
   - Add pdf_enable_bbox_debug flag

Frontend Changes:
1. PDFViewer.tsx (NEW)
   - React PDF viewer with zoom and pagination
   - Memoized file config to prevent unnecessary reloads
2. TaskDetailPage.tsx & ResultsPage.tsx
   - Integrate PDF preview and download
3. main.tsx
   - Configure PDF.js worker via CDN
4. vite.config.ts
   - Add host: '0.0.0.0' for network access
   - Use VITE_API_URL environment variable for backend proxy

Dependencies:
- reportlab: PDF generation library
- Noto Sans SC font: Chinese character support

🤖 Generated with Claude Code https://claude.com/claude-code

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-17 20:21:56 +08:00
parent 012da1abc4
commit fa1abcd8e6
16 changed files with 1427 additions and 57 deletions
--- a/backend/app/services/ocr_service.py
+++ b/backend/app/services/ocr_service.py
@@ -284,7 +284,8 @@ class OCRService:
        image_path: Path,
        lang: str = 'ch',
        detect_layout: bool = True,
-        confidence_threshold: Optional[float] = None
+        confidence_threshold: Optional[float] = None,
+        output_dir: Optional[Path] = None
    ) -> Dict:
        """
        Process single image with OCR and layout analysis
@@ -340,7 +341,8 @@ class OCRService:
                        page_image_path,
                        lang=lang,
                        detect_layout=detect_layout,
-                        confidence_threshold=confidence_threshold
+                        confidence_threshold=confidence_threshold,
+                        output_dir=output_dir
                    )

                    # Accumulate results
@@ -458,7 +460,7 @@ class OCRService:
            images_metadata = []

            if detect_layout:
-                layout_data, images_metadata = self.analyze_layout(image_path)
+                layout_data, images_metadata = self.analyze_layout(image_path, output_dir=output_dir)

            # Generate Markdown
            markdown_content = self.generate_markdown(text_regions, layout_data)
@@ -500,12 +502,71 @@ class OCRService:
                'processing_time': (datetime.now() - start_time).total_seconds(),
            }

-    def analyze_layout(self, image_path: Path) -> Tuple[Optional[Dict], List[Dict]]:
+    def _extract_table_text(self, html_content: str) -> str:
+        """
+        Extract text from HTML table content for translation purposes
+
+        Text is collected only while inside a <table>. The fragments of one
+        cell are concatenated, the non-empty cells of a row are joined with
+        ' | ', and rows are joined with a newline. Rows are assembled
+        structurally instead of by post-processing separators with a regex,
+        so a row that starts with an empty cell is no longer merged into the
+        previous row and no dangling separator is left at the end of a row.
+
+        Args:
+            html_content: HTML content containing table
+
+        Returns:
+            Extracted text from table cells; an empty string for empty input.
+            If parsing raises, the input with tags stripped and whitespace
+            collapsed is returned instead.
+        """
+        # Nothing to parse; this also keeps None out of the regex fallback below
+        if not html_content:
+            return ''
+
+        try:
+            from html.parser import HTMLParser
+
+            class TableTextExtractor(HTMLParser):
+                def __init__(self):
+                    super().__init__()
+                    self.row_texts = []     # Finished rows, already joined with ' | '
+                    self.row_cells = []     # Non-empty cells of the row being read
+                    self.cell_chunks = []   # Text fragments of the cell being read
+                    self.table_depth = 0    # A counter, so a nested </table> does not end extraction
+
+                def _flush_cell(self, end_row=False):
+                    # Close the pending cell (empty cells are dropped) and,
+                    # when requested, close the pending row as well
+                    cell_text = ''.join(self.cell_chunks)
+                    self.cell_chunks = []
+                    if cell_text:
+                        self.row_cells.append(cell_text)
+                    if end_row and self.row_cells:
+                        self.row_texts.append(' | '.join(self.row_cells))
+                        self.row_cells = []
+
+                def handle_starttag(self, tag, attrs):
+                    if tag == 'table':
+                        self.table_depth += 1
+                    elif tag in ('td', 'th'):
+                        # Also closes a previous cell whose end tag was omitted
+                        self._flush_cell()
+                    elif tag == 'tr':
+                        self._flush_cell(end_row=True)
+
+                def handle_endtag(self, tag):
+                    if tag in ('td', 'th'):
+                        self._flush_cell()
+                    elif tag in ('tr', 'table'):
+                        self._flush_cell(end_row=True)
+                        if tag == 'table' and self.table_depth:
+                            self.table_depth -= 1
+
+                def handle_data(self, data):
+                    if self.table_depth and data.strip():
+                        # Collapse inner whitespace so one table row stays on one line
+                        self.cell_chunks.append(' '.join(data.split()))
+
+            parser = TableTextExtractor()
+            parser.feed(html_content)
+            parser.close()
+            # Emit whatever is still pending if the markup ended without closing tags
+            parser._flush_cell(end_row=True)
+
+            return '\n'.join(parser.row_texts)
+
+        except Exception as e:
+            logger.warning(f"Failed to extract table text: {e}")
+            # Fallback: just remove HTML tags
+            import re
+            text = re.sub(r'<[^>]+>', ' ', html_content)
+            text = re.sub(r'\s+', ' ', text)
+            return text.strip()
+
+    def analyze_layout(self, image_path: Path, output_dir: Optional[Path] = None) -> Tuple[Optional[Dict], List[Dict]]:
        """
        Analyze document layout using PP-StructureV3

        Args:
            image_path: Path to image file
+            output_dir: Optional output directory for saving extracted images (defaults to image_path.parent)

        Returns:
            Tuple of (layout_data, images_metadata)
@@ -548,16 +609,59 @@ class OCRService:
                                'page': page_idx,
                                'bbox': [],  # PP-StructureV3 doesn't provide individual bbox in this format
                            }
+
+                            # Extract text from table for translation purposes
+                            if has_table:
+                                table_text = self._extract_table_text(markdown_texts)
+                                element['extracted_text'] = table_text
+                                logger.info(f"Extracted {len(table_text)} characters from table")
+
                            layout_elements.append(element)

-                        # Add image metadata
+                        # Add image metadata and SAVE images to disk
                        for img_idx, (img_path, img_obj) in enumerate(markdown_images.items()):
+                            # Save image to disk
+                            try:
+                                # Determine base directory for saving images
+                                base_dir = output_dir if output_dir else image_path.parent
+
+                                # Create full path for image file
+                                full_img_path = base_dir / img_path
+
+                                # Create imgs/ subdirectory if it doesn't exist
+                                full_img_path.parent.mkdir(parents=True, exist_ok=True)
+
+                                # Save image object to disk
+                                if hasattr(img_obj, 'save'):
+                                    # img_obj is PIL Image
+                                    img_obj.save(str(full_img_path))
+                                    logger.info(f"Saved extracted image to {full_img_path}")
+                                else:
+                                    logger.warning(f"Image object for {img_path} does not have save() method, skipping")
+
+                            except Exception as e:
+                                logger.warning(f"Failed to save image {img_path}: {str(e)}")
+                                # Continue processing even if image save fails
+
+                            # Extract bbox from filename (format: img_in_table_box_x1_y1_x2_y2.jpg)
+                            bbox = []
+                            try:
+                                import re
+                                match = re.search(r'box_(\d+)_(\d+)_(\d+)_(\d+)', img_path)
+                                if match:
+                                    x1, y1, x2, y2 = map(int, match.groups())
+                                    # Convert to 4-point bbox format: [[x1,y1], [x2,y1], [x2,y2], [x1,y2]]
+                                    bbox = [[x1, y1], [x2, y1], [x2, y2], [x1, y2]]
+                                    logger.info(f"Extracted bbox from filename: {bbox}")
+                            except Exception as e:
+                                logger.warning(f"Failed to extract bbox from {img_path}: {e}")
+
                            images_metadata.append({
                                'element_id': len(layout_elements) + img_idx,
                                'image_path': img_path,
                                'type': 'image',
                                'page': page_idx,
-                                'bbox': [],
+                                'bbox': bbox,
                            })

            if layout_elements:
@@ -638,18 +742,20 @@ class OCRService:
        self,
        result: Dict,
        output_dir: Path,
-        file_id: str
-    ) -> Tuple[Optional[Path], Optional[Path]]:
+        file_id: str,
+        source_file_path: Optional[Path] = None
+    ) -> Tuple[Optional[Path], Optional[Path], Optional[Path]]:
        """
-        Save OCR results to JSON and Markdown files
+        Save OCR results to JSON, Markdown, and layout-preserving PDF files

        Args:
            result: OCR result dictionary
            output_dir: Output directory
            file_id: Unique file identifier
+            source_file_path: Optional path to original source file for PDF generation

        Returns:
-            Tuple of (json_path, markdown_path)
+            Tuple of (json_path, markdown_path, pdf_path)
        """
        try:
            output_dir.mkdir(parents=True, exist_ok=True)
@@ -666,8 +772,37 @@ class OCRService:
                f.write(markdown_content)

            logger.info(f"Results saved: {json_path.name}, {markdown_path.name}")
-            return json_path, markdown_path
+
+            # Generate layout-preserving PDF
+            pdf_path = None
+            try:
+                from app.services.pdf_generator_service import pdf_generator_service
+
+                pdf_filename = f"{file_id}_layout.pdf"
+                pdf_path = output_dir / pdf_filename
+
+                logger.info(f"Generating layout-preserving PDF: {pdf_filename}")
+
+                success = pdf_generator_service.generate_layout_pdf(
+                    json_path=json_path,
+                    output_path=pdf_path,
+                    source_file_path=source_file_path
+                )
+
+                if success:
+                    logger.info(f"✓ PDF generated successfully: {pdf_path.name}")
+                else:
+                    logger.warning(f"✗ PDF generation failed for {file_id}")
+                    pdf_path = None
+
+            except Exception as e:
+                logger.error(f"Error generating PDF for {file_id}: {str(e)}")
+                import traceback
+                traceback.print_exc()
+                pdf_path = None
+
+            return json_path, markdown_path, pdf_path

        except Exception as e:
            logger.error(f"Error saving results: {str(e)}")
-            return None, None
+            return None, None, None