feat: implement layout-preserving PDF generation with table reconstruction

Major Features:
- Add PDF generation service with Chinese font support
- Parse HTML tables from PP-StructureV3 and rebuild with ReportLab
- Extract table text for translation purposes
- Auto-filter text regions inside tables to avoid overlaps

Backend Changes:
1. pdf_generator_service.py (NEW)
   - HTMLTableParser: Parse HTML tables to extract structure
   - PDFGeneratorService: Generate layout-preserving PDFs
   - Coordinate transformation: OCR (top-left) → PDF (bottom-left)
   - Font size heuristics: 75% of bbox height with width checking
   - Table reconstruction: Parse HTML → ReportLab Table
   - Image embedding: Extract bbox from filenames
2. ocr_service.py
   - Add _extract_table_text() for translation support
   - Add output_dir parameter to save images to result directory
   - Extract bbox from image filenames (img_in_table_box_x1_y1_x2_y2.jpg)
3. tasks.py
   - Update process_task_ocr to use save_results() with PDF generation
   - Fix download_pdf endpoint to use database-stored PDF paths
   - Support on-demand PDF generation from JSON
4. config.py
   - Add chinese_font_path configuration
   - Add pdf_enable_bbox_debug flag

Frontend Changes:
1. PDFViewer.tsx (NEW)
   - React PDF viewer with zoom and pagination
   - Memoized file config to prevent unnecessary reloads
2. TaskDetailPage.tsx & ResultsPage.tsx
   - Integrate PDF preview and download
3. main.tsx
   - Configure PDF.js worker via CDN
4. vite.config.ts
   - Add host: '0.0.0.0' for network access
   - Use VITE_API_URL environment variable for backend proxy

Dependencies:
- reportlab: PDF generation library
- Noto Sans SC font: Chinese character support

🤖 Generated with Claude Code https://claude.com/claude-code

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-17 20:21:56 +08:00
parent 012da1abc4
commit fa1abcd8e6
16 changed files with 1427 additions and 57 deletions
--- a/backend/app/services/pdf_generator_service.py
+++ b/backend/app/services/pdf_generator_service.py
@@ -0,0 +1,626 @@
+"""
+Layout-Preserving PDF Generation Service
+Generates PDF files that preserve the original document layout using OCR JSON data
+"""
+
+import json
+import logging
+from pathlib import Path
+from typing import Dict, List, Optional, Tuple
+from datetime import datetime
+
+from reportlab.lib.pagesizes import A4, letter
+from reportlab.lib.units import mm
+from reportlab.pdfgen import canvas
+from reportlab.pdfbase import pdfmetrics
+from reportlab.pdfbase.ttfonts import TTFont
+from reportlab.platypus import Table, TableStyle
+from reportlab.lib import colors
+from reportlab.lib.enums import TA_CENTER, TA_LEFT, TA_RIGHT
+from reportlab.platypus import Paragraph
+from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
+from PIL import Image
+from html.parser import HTMLParser
+
+from app.core.config import settings
+
+# Module-level logger named after this module's import path.
+logger = logging.getLogger(__name__)
+
+
+class HTMLTableParser(HTMLParser):
+    """Parse HTML table markup into plain dict/list structures.
+
+    After ``feed()``, ``self.tables`` holds one entry per ``<table>`` that
+    contained at least one row::
+
+        {'rows': [{'cells': [{'text': str, 'is_header': bool,
+                              'colspan': int, 'rowspan': int}]}]}
+
+    Each text chunk inside a cell is stripped and appended followed by a
+    single space, so ``text`` normally ends with a trailing space.
+
+    Cells and rows whose closing tags are omitted (``<td>a<td>b``) are closed
+    implicitly when the next cell/row starts or the table ends.
+
+    Nested tables are not supported: an inner ``<table>`` discards the
+    table that was being collected.
+    """
+
+    def __init__(self):
+        super().__init__()
+        self.tables = []
+        self.current_table = None
+        self.current_row = None
+        self.current_cell = None
+        self.in_table = False
+
+    @staticmethod
+    def _parse_span(value):
+        """Return *value* as a span count >= 1; malformed values become 1."""
+        try:
+            return max(int(value), 1)
+        except (TypeError, ValueError):
+            # Covers valueless attributes (None) and non-numeric strings
+            return 1
+
+    def _close_cell(self):
+        """Attach the open cell (if any) to the open row and clear it."""
+        if self.current_cell is not None:
+            if self.current_row is not None:
+                self.current_row['cells'].append(self.current_cell)
+            self.current_cell = None
+
+    def _close_row(self):
+        """Attach the open row (if any) to the open table and clear it."""
+        self._close_cell()
+        if self.current_row is not None:
+            if self.current_table is not None:
+                self.current_table['rows'].append(self.current_row)
+            self.current_row = None
+
+    def handle_starttag(self, tag, attrs):
+        attrs_dict = dict(attrs)
+
+        if tag == 'table':
+            self.in_table = True
+            self.current_table = {'rows': []}
+            # Do not let a half-built row/cell of a previous table leak in
+            self.current_row = None
+            self.current_cell = None
+
+        elif tag == 'tr' and self.in_table:
+            # A new row implicitly ends a row whose </tr> was omitted
+            self._close_row()
+            self.current_row = {'cells': []}
+
+        elif tag in ('td', 'th') and self.in_table and self.current_row is not None:
+            # A new cell implicitly ends a cell whose </td>/</th> was omitted
+            self._close_cell()
+            self.current_cell = {
+                'text': '',
+                'is_header': tag == 'th',
+                'colspan': self._parse_span(attrs_dict.get('colspan', 1)),
+                'rowspan': self._parse_span(attrs_dict.get('rowspan', 1))
+            }
+
+    def handle_endtag(self, tag):
+        if tag == 'table' and self.in_table:
+            # Flush anything still open before deciding whether the table has rows
+            self._close_row()
+            if self.current_table and self.current_table['rows']:
+                self.tables.append(self.current_table)
+            self.current_table = None
+            self.in_table = False
+
+        elif tag == 'tr':
+            self._close_row()
+
+        elif tag in ('td', 'th'):
+            self._close_cell()
+
+    def handle_data(self, data):
+        # Text outside a cell is ignored
+        if self.current_cell is not None:
+            self.current_cell['text'] += data.strip() + ' '
+
+
+class PDFGeneratorService:
+    """Service for generating layout-preserving PDFs from OCR JSON data"""
+
+    def __init__(self):
+        """Set the default font state, then attempt to register the Chinese font."""
+        # Registration state; updated by _register_chinese_font() on success
+        self.font_registered = False
+        self.font_path = None
+        # Name under which the font is registered with ReportLab
+        self.font_name = 'NotoSansSC'
+
+        self._register_chinese_font()
+
+    def _register_chinese_font(self):
+        """Register the configured Chinese TrueType font with ReportLab.
+
+        The font location is read from ``settings.chinese_font_path``; a
+        relative path is resolved against the directory four levels above
+        this file. On success ``font_path`` is stored and ``font_registered``
+        becomes True. A missing font file is logged and leaves the state
+        untouched; any exception is logged and sets ``font_registered`` to
+        False.
+        """
+        try:
+            candidate = Path(settings.chinese_font_path)
+
+            if candidate.is_absolute():
+                resolved = candidate
+            else:
+                # Per the original note, the configured path is expected to
+                # start with ./backend/, so anchor it at the project root
+                here = Path(__file__).resolve()
+                resolved = here.parent.parent.parent.parent / candidate
+
+            if resolved.exists():
+                pdfmetrics.registerFont(TTFont(self.font_name, str(resolved)))
+                self.font_path = resolved
+                self.font_registered = True
+                logger.info(f"Chinese font registered: {self.font_name} from {resolved}")
+            else:
+                logger.error(f"Chinese font not found at {resolved}")
+
+        except Exception as e:
+            logger.error(f"Failed to register Chinese font: {e}")
+            self.font_registered = False
+
+    def load_ocr_json(self, json_path: Path) -> Optional[Dict]:
+        """
+        Read an OCR result file and decode it as JSON
+
+        Args:
+            json_path: Location of the UTF-8 encoded JSON file
+
+        Returns:
+            The decoded JSON content, or None when reading or decoding fails
+            (the failure is logged)
+        """
+        try:
+            with open(json_path, encoding='utf-8') as handle:
+                parsed = json.load(handle)
+            logger.info(f"Loaded OCR JSON: {json_path.name}")
+        except Exception as e:
+            logger.error(f"Failed to load JSON {json_path}: {e}")
+            return None
+
+        return parsed
+
+    def calculate_page_dimensions(self, text_regions: List[Dict], source_file_path: Optional[Path] = None) -> Tuple[float, float]:
+        """
+        Work out the page size, preferring the source file over OCR geometry
+
+        Args:
+            text_regions: Text regions, each optionally carrying a 'bbox' of corner points
+            source_file_path: Original file to measure first, when available
+
+        Returns:
+            (width, height) in points. Falls back to A4 when there are no
+            regions, and per axis when no positive coordinate is found.
+        """
+        # The source file, when readable, is authoritative
+        if source_file_path:
+            source_dims = self.get_original_page_size(source_file_path)
+            if source_dims:
+                return source_dims
+
+        if not text_regions:
+            return A4  # Default to A4 size
+
+        # Collect every usable corner; bbox format: [[x1,y1], [x2,y2], [x3,y3], [x4,y4]]
+        corners = []
+        for entry in text_regions:
+            quad = entry.get('bbox', [])
+            if not quad or len(quad) < 4:
+                continue
+            corners.extend(
+                (corner[0], corner[1])
+                for corner in quad
+                if isinstance(corner, (list, tuple)) and len(corner) >= 2
+            )
+
+        # Seed with 0 so negative or missing coordinates never shrink the page
+        far_x = max([0] + [cx for cx, _ in corners])
+        far_y = max([0] + [cy for _, cy in corners])
+
+        # OCR coordinates are in pixels and are used directly as points (1:1 mapping);
+        # no padding is added because it causes layout issues
+        width = far_x if far_x > 0 else A4[0]
+        height = far_y if far_y > 0 else A4[1]
+
+        logger.info(f"Calculated page dimensions from OCR: {width:.1f} x {height:.1f} points")
+        return (width, height)
+
+    def get_original_page_size(self, file_path: Path) -> Optional[Tuple[float, float]]:
+        """
+        Extract page dimensions from original source file
+
+        Only raster images are measured; any other file type (including PDF)
+        yields None so the caller falls back to OCR-derived dimensions.
+
+        Args:
+            file_path: Path to original file (image or PDF)
+
+        Returns:
+            Tuple of (width, height) in points, or None when the file is
+            missing, is not a supported image, or cannot be opened
+        """
+        try:
+            if not file_path.exists():
+                return None
+
+            # For images, get dimensions from PIL
+            if file_path.suffix.lower() in ('.png', '.jpg', '.jpeg', '.bmp', '.tif', '.tiff'):
+                # Image.open keeps the file handle open; the context manager
+                # releases it as soon as the size has been read
+                with Image.open(file_path) as img:
+                    # Use pixel dimensions directly as points (1:1 mapping)
+                    # This matches how PaddleOCR reports coordinates
+                    width_pt = float(img.width)
+                    height_pt = float(img.height)
+                logger.info(f"Extracted dimensions from image: {width_pt:.1f} x {height_pt:.1f} points (1:1 pixel mapping)")
+                return (width_pt, height_pt)
+
+            # For PDFs, would need PyPDF2 or similar
+            # For now, return None to use calculated dimensions
+
+        except Exception as e:
+            logger.warning(f"Failed to get page size from {file_path}: {e}")
+
+        return None
+
+    def draw_text_region(
+        self,
+        pdf_canvas: canvas.Canvas,
+        region: Dict,
+        page_height: float
+    ):
+        """
+        Draw a text region at precise coordinates
+
+        The font size starts at 75% of the bbox height (clamped to 4-72pt) and
+        is shrunk further, down to 3pt, when the text would be wider than the
+        bbox. Failures are logged as warnings and never propagate.
+
+        Args:
+            pdf_canvas: ReportLab canvas object
+            region: Text region dict; 'text' and 'bbox' are read here
+            page_height: Height of page (for coordinate transformation)
+        """
+        text = region.get('text', '')
+        bbox = region.get('bbox', [])
+
+        if not text or not bbox or len(bbox) < 4:
+            return
+
+        try:
+            # bbox from OCR: [[x1,y1], [x2,y2], [x3,y3], [x4,y4]]
+            # Points: top-left, top-right, bottom-right, bottom-left
+            # OCR coordinates: origin (0,0) at top-left, Y increases downward
+            ocr_x_left = bbox[0][0]    # Left X
+            ocr_y_top = bbox[0][1]     # Top Y in OCR coordinates
+            ocr_x_right = bbox[2][0]   # Right X
+            ocr_y_bottom = bbox[2][1]  # Bottom Y in OCR coordinates
+
+            # Calculate bbox dimensions
+            bbox_width = abs(ocr_x_right - ocr_x_left)
+            bbox_height = abs(ocr_y_bottom - ocr_y_top)
+
+            # Font size is typically 70-90% of bbox height; 0.75 was chosen empirically
+            font_size = bbox_height * 0.75
+            font_size = max(min(font_size, 72), 4)  # Clamp between 4pt and 72pt
+
+            # Transform coordinates: OCR (top-left origin) → PDF (bottom-left origin)
+            # The Y axis is flipped, anchoring the text at the bbox bottom edge
+            pdf_x = ocr_x_left
+            pdf_y = page_height - ocr_y_bottom
+
+            # Fall back to a built-in font when the Chinese font is unavailable
+            font_name = self.font_name if self.font_registered else 'Helvetica'
+            pdf_canvas.setFont(font_name, font_size)
+
+            # Calculate text width to prevent overflow
+            text_width = pdf_canvas.stringWidth(text, font_name, font_size)
+
+            # If text is too wide for bbox, scale down font
+            if text_width > bbox_width:
+                scale_factor = bbox_width / text_width
+                font_size = font_size * scale_factor * 0.95  # 95% to add small margin
+                font_size = max(font_size, 3)  # Minimum 3pt
+                pdf_canvas.setFont(font_name, font_size)
+
+            # Draw text at calculated position
+            pdf_canvas.drawString(pdf_x, pdf_y, text)
+
+            # Debug: Draw bounding box (optional)
+            if settings.pdf_enable_bbox_debug:
+                # Isolate the debug stroke colour / line width so they do not
+                # carry over to whatever is drawn on the canvas afterwards
+                pdf_canvas.saveState()
+                try:
+                    pdf_canvas.setStrokeColorRGB(1, 0, 0, 0.3)  # Red, semi-transparent
+                    pdf_canvas.setLineWidth(0.5)
+                    # Transform all bbox points to PDF coordinates
+                    pdf_points = [(p[0], page_height - p[1]) for p in bbox]
+                    # Connect the first four points into a closed outline
+                    for i in range(4):
+                        x1, y1 = pdf_points[i]
+                        x2, y2 = pdf_points[(i + 1) % 4]
+                        pdf_canvas.line(x1, y1, x2, y2)
+                finally:
+                    pdf_canvas.restoreState()
+
+        except Exception as e:
+            logger.warning(f"Failed to draw text region '{text[:20]}...': {e}")
+
+    def draw_table_region(
+        self,
+        pdf_canvas: canvas.Canvas,
+        table_element: Dict,
+        images_metadata: List[Dict],
+        page_height: float
+    ):
+        """
+        Draw a table region by parsing HTML and rebuilding with ReportLab Table
+
+        Only the first table found in the HTML is drawn. The table is placed
+        using the bbox of a table image from ``images_metadata``, preferring
+        one on the same page as ``table_element``. Merged cells (colspan /
+        rowspan) are reproduced with ReportLab SPAN commands. Failures are
+        logged as warnings (with traceback) and never propagate.
+
+        NOTE(review): several tables on the same page still resolve to the
+        same image bbox; a stable table-to-image key would be needed to
+        distinguish them.
+
+        Args:
+            pdf_canvas: ReportLab canvas object
+            table_element: Table element dict with HTML 'content' and 0-indexed 'page'
+            images_metadata: List of image metadata to find table bbox
+            page_height: Height of page
+        """
+        try:
+            html_content = table_element.get('content', '')
+            if not html_content:
+                return
+
+            # Parse HTML to extract table structure
+            parser = HTMLTableParser()
+            parser.feed(html_content)
+
+            if not parser.tables:
+                logger.warning("No tables found in HTML content")
+                return
+
+            # Get the first table (PP-StructureV3 usually provides one table per element)
+            rows = parser.tables[0]['rows']
+            if not rows:
+                return
+
+            # Find corresponding table image to get bbox
+            table_bbox = self._find_table_bbox(images_metadata, table_element.get('page', 0))
+            if not table_bbox:
+                logger.warning("No bbox found for table")
+                return
+
+            # Extract bbox coordinates (points: top-left, top-right, bottom-right, bottom-left)
+            ocr_x_left = table_bbox[0][0]
+            ocr_y_top = table_bbox[0][1]
+            ocr_x_right = table_bbox[2][0]
+            ocr_y_bottom = table_bbox[2][1]
+
+            table_width = abs(ocr_x_right - ocr_x_left)
+            table_height = abs(ocr_y_bottom - ocr_y_top)
+
+            # Transform coordinates: flip Y so the table is anchored at its bottom-left corner
+            pdf_x = ocr_x_left
+            pdf_y = page_height - ocr_y_bottom
+
+            # Expand the parsed cells into a rectangular grid that honours spans
+            reportlab_data, span_commands = self._build_table_grid(rows)
+            max_cols = len(reportlab_data[0]) if reportlab_data else 0
+            if max_cols == 0:
+                logger.warning("Table has no cells")
+                return
+
+            # Calculate column widths (equal distribution)
+            col_widths = [table_width / max_cols] * max_cols
+
+            # Use smaller font size to fit in bbox (kept within 6-10pt)
+            font_size = min(table_height / len(rows) * 0.5, 10)
+            font_size = max(font_size, 6)
+
+            body_font = self.font_name if self.font_registered else 'Helvetica'
+            style_commands = [
+                ('FONT', (0, 0), (-1, -1), body_font, font_size),
+                ('GRID', (0, 0), (-1, -1), 0.5, colors.black),
+                ('VALIGN', (0, 0), (-1, -1), 'MIDDLE'),
+                ('ALIGN', (0, 0), (-1, -1), 'CENTER'),
+                ('LEFTPADDING', (0, 0), (-1, -1), 2),
+                ('RIGHTPADDING', (0, 0), (-1, -1), 2),
+                ('TOPPADDING', (0, 0), (-1, -1), 2),
+                ('BOTTOMPADDING', (0, 0), (-1, -1), 2),
+            ]
+
+            # Add header style if first row has headers
+            if rows[0]['cells'] and rows[0]['cells'][0].get('is_header'):
+                header_font = self.font_name if self.font_registered else 'Helvetica-Bold'
+                style_commands.append(('BACKGROUND', (0, 0), (-1, 0), colors.lightgrey))
+                style_commands.append(('FONT', (0, 0), (-1, 0), header_font, font_size))
+
+            style_commands.extend(span_commands)
+
+            table = Table(reportlab_data, colWidths=col_widths)
+            table.setStyle(TableStyle(style_commands))
+
+            # Calculate table size
+            table.wrapOn(pdf_canvas, table_width, table_height)
+
+            # Draw table at position
+            table.drawOn(pdf_canvas, pdf_x, pdf_y)
+
+            logger.info(f"Drew table at ({pdf_x:.0f}, {pdf_y:.0f}) size {table_width:.0f}x{table_height:.0f} with {len(rows)} rows")
+
+        except Exception as e:
+            # exc_info routes the traceback through logging instead of stderr
+            logger.warning(f"Failed to draw table region: {e}", exc_info=True)
+
+    @staticmethod
+    def _find_table_bbox(images_metadata: List[Dict], page) -> Optional[List]:
+        """Return the bbox of a table image, preferring one whose 'page' equals *page*."""
+        fallback = None
+        for img_meta in images_metadata:
+            if 'table' not in img_meta.get('image_path', '').lower():
+                continue
+            bbox = img_meta.get('bbox', [])
+            if not bbox or len(bbox) < 4:
+                continue
+            if img_meta.get('page') == page:
+                return bbox
+            if fallback is None:
+                # Remember the first usable table image in case no page matches
+                fallback = bbox
+        return fallback
+
+    @staticmethod
+    def _build_table_grid(rows: List[Dict]) -> Tuple[List[List[str]], List[Tuple]]:
+        """Expand parsed rows into a rectangular text grid plus ReportLab SPAN commands."""
+        row_count = len(rows)
+        covered = set()  # (row, col) slots claimed by a rowspan anchored in an earlier row
+        grid = []
+        span_commands = []
+
+        for row_idx, row in enumerate(rows):
+            line = []
+            for cell in row['cells']:
+                # Step over slots already occupied by a cell spanning down from above
+                while (row_idx, len(line)) in covered:
+                    line.append('')
+                col_idx = len(line)
+
+                colspan = max(cell.get('colspan', 1), 1)
+                # Clamp so a span never reaches below the last row
+                rowspan = min(max(cell.get('rowspan', 1), 1), row_count - row_idx)
+
+                line.append(cell['text'].strip())
+                line.extend([''] * (colspan - 1))
+
+                if colspan > 1 or rowspan > 1:
+                    # SPAN coordinates are (col, row) pairs
+                    span_commands.append(
+                        ('SPAN', (col_idx, row_idx), (col_idx + colspan - 1, row_idx + rowspan - 1))
+                    )
+                    for below in range(row_idx + 1, row_idx + rowspan):
+                        for col in range(col_idx, col_idx + colspan):
+                            covered.add((below, col))
+            grid.append(line)
+
+        # Pad every row to the widest one so the grid is rectangular
+        max_cols = max((len(line) for line in grid), default=0)
+        for line in grid:
+            line.extend([''] * (max_cols - len(line)))
+
+        return grid, span_commands
+
+    def draw_image_region(
+        self,
+        pdf_canvas: canvas.Canvas,
+        region: Dict,
+        page_height: float,
+        result_dir: Path
+    ):
+        """
+        Embed an extracted image file at the position given by its bbox
+
+        Used for images extracted by PP-StructureV3 (tables, figures, charts, etc.).
+        Missing paths, missing files and missing bboxes are skipped; drawing
+        failures are logged as warnings and never propagate.
+
+        Args:
+            pdf_canvas: ReportLab canvas object
+            region: Image metadata dict with image_path and bbox
+            page_height: Height of page (for coordinate transformation)
+            result_dir: Directory that image_path is relative to
+        """
+        try:
+            relative_name = region.get('image_path', '')
+            if not relative_name:
+                return
+
+            # Resolve the image against the result directory
+            image_file = result_dir / relative_name
+            if not image_file.exists():
+                logger.warning(f"Image not found: {image_file}")
+                return
+
+            # Without a full bbox there is nowhere to place the image
+            quad = region.get('bbox', [])
+            if not quad or len(quad) < 4:
+                logger.warning(f"No bbox for image {relative_name}")
+                return
+
+            # bbox from OCR: [[x1,y1], [x2,y2], [x3,y3], [x4,y4]]
+            # OCR coordinates: origin (0,0) at top-left, Y increases downward
+            left, top = quad[0][0], quad[0][1]
+            right, bottom = quad[2][0], quad[2][1]
+
+            draw_width = abs(right - left)
+            draw_height = abs(bottom - top)
+
+            # OCR (top-left origin) → PDF (bottom-left origin): flip the Y axis.
+            # drawImage anchors at the bottom-left corner of the image.
+            anchor_x = left
+            anchor_y = page_height - bottom
+
+            pdf_canvas.drawImage(
+                str(image_file),
+                anchor_x,
+                anchor_y,
+                width=draw_width,
+                height=draw_height,
+                preserveAspectRatio=True,
+                mask='auto'  # Handle transparency
+            )
+
+            logger.info(f"Drew image: {relative_name} at ({anchor_x:.0f}, {anchor_y:.0f}) size {draw_width:.0f}x{draw_height:.0f}")
+
+        except Exception as e:
+            logger.warning(f"Failed to draw image region: {e}")
+
+    def generate_layout_pdf(
+        self,
+        json_path: Path,
+        output_path: Path,
+        source_file_path: Optional[Path] = None
+    ) -> bool:
+        """
+        Generate layout-preserving PDF from OCR JSON data
+
+        An existing ``output_path`` is treated as a cache hit and left alone.
+        Text regions whose centre lies inside a table image bbox on the same
+        page are skipped, because tables are redrawn from their HTML. One page
+        size is used for every page. Images are looked up relative to the
+        directory containing ``json_path``.
+
+        Args:
+            json_path: Path to OCR JSON file
+            output_path: Path to save generated PDF
+            source_file_path: Optional path to original source file for dimension extraction
+
+        Returns:
+            True if the PDF exists or was generated, False on missing data or error
+        """
+        try:
+            # Check if PDF already exists (caching)
+            if output_path.exists():
+                logger.info(f"PDF already exists: {output_path.name}")
+                return True
+
+            # Load JSON data
+            ocr_data = self.load_ocr_json(json_path)
+            if not ocr_data:
+                return False
+
+            # Get text regions
+            text_regions = ocr_data.get('text_regions', [])
+            if not text_regions:
+                logger.warning("No text regions found in JSON")
+                return False
+
+            images_metadata = ocr_data.get('images_metadata', [])
+            layout_data = ocr_data.get('layout_data', {})
+
+            # Determine page dimensions
+            page_width, page_height = self.calculate_page_dimensions(text_regions, source_file_path)
+
+            # Create PDF canvas
+            pdf_canvas = canvas.Canvas(str(output_path), pagesize=(page_width, page_height))
+
+            # Collect (page, bbox) of table images so overlapping text can be excluded
+            table_boxes = []
+            for img_meta in images_metadata:
+                img_path = img_meta.get('image_path', '')
+                if 'table' in img_path.lower():
+                    bbox = img_meta.get('bbox', [])
+                    if bbox and len(bbox) >= 4:
+                        table_boxes.append((img_meta.get('page'), bbox))
+
+            # Helper function to check if a point is inside a bbox
+            def point_in_bbox(x, y, bbox):
+                x1, y1 = bbox[0]
+                x2, y2 = bbox[2]
+                return min(x1, x2) <= x <= max(x1, x2) and min(y1, y2) <= y <= max(y1, y2)
+
+            # Image pages are 0-indexed while text region pages are 1-indexed.
+            # A table without a usable page number applies to every page.
+            def same_page(table_page, region_page):
+                return not isinstance(table_page, int) or table_page + 1 == region_page
+
+            # Group drawable text regions by page, excluding those inside tables
+            pages_data = {}
+            excluded_count = 0
+            for region in text_regions:
+                bbox = region.get('bbox', [])
+                if not bbox or len(bbox) < 4:
+                    continue
+
+                page_num = region.get('page', 1)
+
+                # Check if text region center is inside any table bbox on its page
+                center_x = (bbox[0][0] + bbox[2][0]) / 2
+                center_y = (bbox[0][1] + bbox[2][1]) / 2
+
+                is_in_table = any(
+                    same_page(table_page, page_num) and point_in_bbox(center_x, center_y, table_bbox)
+                    for table_page, table_bbox in table_boxes
+                )
+
+                if is_in_table:
+                    excluded_count += 1
+                    logger.debug(f"Excluded text '{region.get('text', '')[:20]}...' (inside table)")
+                else:
+                    pages_data.setdefault(page_num, []).append(region)
+
+            logger.info(f"Filtered {excluded_count} text regions inside tables")
+
+            # Get table elements from layout_data
+            table_elements = []
+            if layout_data and layout_data.get('elements'):
+                table_elements = [e for e in layout_data['elements'] if e.get('type') == 'table']
+
+            # Process each page
+            total_pages = ocr_data.get('total_pages', 1)
+            for page_num in range(1, total_pages + 1):
+                if page_num > 1:
+                    pdf_canvas.showPage()  # Start new page
+
+                # Draw text regions for this page (excluding table text)
+                for region in pages_data.get(page_num, []):
+                    self.draw_text_region(pdf_canvas, region, page_height)
+
+                # Draw tables for this page
+                for table_elem in table_elements:
+                    if table_elem.get('page', 0) == page_num - 1:  # page is 0-indexed
+                        self.draw_table_region(pdf_canvas, table_elem, images_metadata, page_height)
+
+                # Draw non-table images for this page (figure, chart, seal, etc.)
+                for img_meta in images_metadata:
+                    if img_meta.get('page') == page_num - 1:  # page is 0-indexed
+                        img_path = img_meta.get('image_path', '')
+                        # Skip table images (they're now rendered as tables)
+                        if 'table' not in img_path.lower():
+                            self.draw_image_region(
+                                pdf_canvas,
+                                img_meta,
+                                page_height,
+                                json_path.parent
+                            )
+
+            # Save PDF
+            pdf_canvas.save()
+
+            file_size = output_path.stat().st_size
+            logger.info(f"Generated layout-preserving PDF: {output_path.name} ({file_size} bytes)")
+            return True
+
+        except Exception as e:
+            # exc_info routes the traceback through logging instead of stderr
+            logger.error(f"Failed to generate PDF: {e}", exc_info=True)
+            return False
+
+
+# Singleton instance
+# Created at import time, so the font registration attempted in
+# PDFGeneratorService.__init__ runs as soon as this module is imported.
+pdf_generator_service = PDFGeneratorService()