feat: implement layout-preserving PDF generation with table reconstruction

Major Features:
- Add PDF generation service with Chinese font support
- Parse HTML tables from PP-StructureV3 and rebuild with ReportLab
- Extract table text for translation purposes
- Auto-filter text regions inside tables to avoid overlaps

Backend Changes:
1. pdf_generator_service.py (NEW)
   - HTMLTableParser: Parse HTML tables to extract structure
   - PDFGeneratorService: Generate layout-preserving PDFs
   - Coordinate transformation: OCR (top-left) → PDF (bottom-left)
   - Font size heuristics: 75% of bbox height with width checking
   - Table reconstruction: Parse HTML → ReportLab Table
   - Image embedding: Extract bbox from filenames
2. ocr_service.py
   - Add _extract_table_text() for translation support
   - Add output_dir parameter to save images to result directory
   - Extract bbox from image filenames (img_in_table_box_x1_y1_x2_y2.jpg)
3. tasks.py
   - Update process_task_ocr to use save_results() with PDF generation
   - Fix download_pdf endpoint to use database-stored PDF paths
   - Support on-demand PDF generation from JSON
4. config.py
   - Add chinese_font_path configuration
   - Add pdf_enable_bbox_debug flag

Frontend Changes:
1. PDFViewer.tsx (NEW)
   - React PDF viewer with zoom and pagination
   - Memoized file config to prevent unnecessary reloads
2. TaskDetailPage.tsx & ResultsPage.tsx
   - Integrate PDF preview and download
3. main.tsx
   - Configure PDF.js worker via CDN
4. vite.config.ts
   - Add host: '0.0.0.0' for network access
   - Use VITE_API_URL environment variable for backend proxy

Dependencies:
- reportlab: PDF generation library
- Noto Sans SC font: Chinese character support

🤖 Generated with Claude Code https://claude.com/claude-code

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-17 20:21:56 +08:00
parent 012da1abc4
commit fa1abcd8e6
16 changed files with 1427 additions and 57 deletions
--- a/backend/app/core/config.py
+++ b/backend/app/core/config.py
@@ -96,6 +96,11 @@ class Settings(BaseSettings):
    pdf_margin_left: int = Field(default=20)
    pdf_margin_right: int = Field(default=20)

+    # ===== Layout-Preserving PDF Configuration =====
+    chinese_font_path: str = Field(default="./backend/fonts/NotoSansSC-Regular.ttf")
+    pdf_font_size_base: int = Field(default=12)
+    pdf_enable_bbox_debug: bool = Field(default=False)  # Draw bounding boxes for debugging
+
    # ===== Translation Configuration (Reserved) =====
    enable_translation: bool = Field(default=False)
    translation_engine: str = Field(default="offline")
--- a/backend/app/routers/tasks.py
+++ b/backend/app/routers/tasks.py
@@ -66,34 +66,33 @@ def process_task_ocr(task_id: str, task_db_id: int, file_path: str, filename: st
        # Initialize OCR service
        ocr_service = OCRService()

+        # Create result directory before OCR processing (needed for saving extracted images)
+        result_dir = Path(settings.result_dir) / task_id
+        result_dir.mkdir(parents=True, exist_ok=True)
+
        # Process the file with OCR
        ocr_result = ocr_service.process_image(
            image_path=Path(file_path),
            lang='ch',
-            detect_layout=True
+            detect_layout=True,
+            output_dir=result_dir
        )

        # Calculate processing time
        processing_time_ms = int((datetime.now() - start_time).total_seconds() * 1000)

-        # Create result directory
-        result_dir = Path(settings.result_dir) / task_id
-        result_dir.mkdir(parents=True, exist_ok=True)
-
-        # Save JSON result
-        json_path = result_dir / f"{Path(filename).stem}_result.json"
-        with open(json_path, 'w', encoding='utf-8') as f:
-            json.dump(ocr_result, f, ensure_ascii=False, indent=2)
-
-        # Save Markdown result
-        markdown_path = result_dir / f"{Path(filename).stem}_result.md"
-        markdown_content = ocr_result.get('markdown_content', '')
-        with open(markdown_path, 'w', encoding='utf-8') as f:
-            f.write(markdown_content)
+        # Save results using OCR service (includes JSON, Markdown, and PDF generation)
+        json_path, markdown_path, pdf_path = ocr_service.save_results(
+            result=ocr_result,
+            output_dir=result_dir,
+            file_id=Path(filename).stem,
+            source_file_path=Path(file_path)
+        )

        # Update task with results (direct database update)
-        task.result_json_path = str(json_path)
-        task.result_markdown_path = str(markdown_path)
+        task.result_json_path = str(json_path) if json_path else None
+        task.result_markdown_path = str(markdown_path) if markdown_path else None
+        task.result_pdf_path = str(pdf_path) if pdf_path else None
        task.processing_time_ms = processing_time_ms
        task.status = TaskStatus.COMPLETED
        task.completed_at = datetime.utcnow()
@@ -468,10 +467,16 @@ async def download_pdf(
    current_user: User = Depends(get_current_user)
 ):
    """
-    Download task result as searchable PDF file
+    Download task result as layout-preserving PDF file

    - **task_id**: Task UUID
+
+    Returns a PDF that preserves the original document layout using OCR results.
+    The PDF is generated from OCR JSON data and cached for subsequent requests.
    """
+    from pathlib import Path
+    from app.services.pdf_generator_service import pdf_generator_service
+
    # Get task
    task = task_service.get_task_by_id(
        db=db,
@@ -485,12 +490,69 @@ async def download_pdf(
            detail="Task not found"
        )

+    # Check if task is completed
+    if task.status.value != "completed":
+        raise HTTPException(
+            status_code=status.HTTP_400_BAD_REQUEST,
+            detail="Task is not completed yet. Please wait for OCR processing to finish."
+        )
+
+    # Check if PDF path is stored in database
+    if task.result_pdf_path and Path(task.result_pdf_path).exists():
+        pdf_path = Path(task.result_pdf_path)
+        logger.info(f"Using pre-generated PDF from database: {pdf_path.name}")
+    else:
+        # Fallback: Try to generate PDF on-demand
+        result_dir = Path(settings.result_dir) / task_id
+
+        # Use stored JSON path or construct it
+        if task.result_json_path and Path(task.result_json_path).exists():
+            json_path = Path(task.result_json_path)
+        else:
+            # Try to find JSON file in result directory
+            json_files = list(result_dir.glob("*_result.json"))
+            if not json_files:
+                raise HTTPException(
+                    status_code=status.HTTP_404_NOT_FOUND,
+                    detail="OCR result JSON not found"
+                )
+            json_path = json_files[0]
+
+        # Construct PDF path based on JSON filename
+        pdf_filename = json_path.stem.replace("_result", "_layout") + ".pdf"
+        pdf_path = result_dir / pdf_filename
+
+        # Generate PDF if it doesn't exist
+        if not pdf_path.exists():
+            logger.info(f"Generating layout-preserving PDF for task {task_id}")
+
+            # Get source file path if available
+            source_file = None
+            task_file = db.query(TaskFile).filter(TaskFile.task_id == task.id).first()
+            if task_file and task_file.stored_path and Path(task_file.stored_path).exists():
+                source_file = Path(task_file.stored_path)
+
+            # Generate PDF
+            success = pdf_generator_service.generate_layout_pdf(
+                json_path=json_path,
+                output_path=pdf_path,
+                source_file_path=source_file
+            )
+
+            if not success:
+                raise HTTPException(
+                    status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+                    detail="Failed to generate PDF. Please check server logs."
+                )
+
+            logger.info(f"PDF generated successfully: {pdf_path.name}")
+
    # Validate file access
    is_valid, error_msg = file_access_service.validate_file_access(
        db=db,
        user_id=current_user.id,
        task_id=task_id,
-        file_path=task.result_pdf_path
+        file_path=str(pdf_path)
    )

    if not is_valid:
@@ -502,7 +564,7 @@ async def download_pdf(
    # Return file
    filename = f"{task.filename or task_id}_result.pdf"
    return FileResponse(
-        path=task.result_pdf_path,
+        path=str(pdf_path),
        filename=filename,
        media_type="application/pdf"
    )
--- a/backend/app/services/ocr_service.py
+++ b/backend/app/services/ocr_service.py
@@ -284,7 +284,8 @@ class OCRService:
        image_path: Path,
        lang: str = 'ch',
        detect_layout: bool = True,
-        confidence_threshold: Optional[float] = None
+        confidence_threshold: Optional[float] = None,
+        output_dir: Optional[Path] = None
    ) -> Dict:
        """
        Process single image with OCR and layout analysis
@@ -340,7 +341,8 @@ class OCRService:
                        page_image_path,
                        lang=lang,
                        detect_layout=detect_layout,
-                        confidence_threshold=confidence_threshold
+                        confidence_threshold=confidence_threshold,
+                        output_dir=output_dir
                    )

                    # Accumulate results
@@ -458,7 +460,7 @@ class OCRService:
            images_metadata = []

            if detect_layout:
-                layout_data, images_metadata = self.analyze_layout(image_path)
+                layout_data, images_metadata = self.analyze_layout(image_path, output_dir=output_dir)

            # Generate Markdown
            markdown_content = self.generate_markdown(text_regions, layout_data)
@@ -500,12 +502,71 @@ class OCRService:
                'processing_time': (datetime.now() - start_time).total_seconds(),
            }

-    def analyze_layout(self, image_path: Path) -> Tuple[Optional[Dict], List[Dict]]:
+    def _extract_table_text(self, html_content: str) -> str:
+        """
+        Extract text from HTML table content for translation purposes
+
+        Cells of one row are joined with ' | ' and rows are separated by a
+        newline. Empty cells and empty rows are skipped, and text outside of
+        any <table> element is ignored. Rows and cells are collected
+        structurally rather than by post-processing separators, so a row
+        that starts with an empty cell is never merged into the previous
+        row and no row ends with a dangling separator.
+
+        Args:
+            html_content: HTML content containing table
+
+        Returns:
+            Extracted text from table cells
+        """
+        import re
+        from html.parser import HTMLParser
+
+        class TableTextExtractor(HTMLParser):
+            """Collect table text as a list of rows, each a list of cell strings"""
+
+            def __init__(self):
+                super().__init__()
+                self.rows = []
+                self.row = None    # Cell texts of the row being read
+                self.cell = None   # Text fragments of the cell being read
+                self.depth = 0     # <table> nesting depth (0 = outside tables)
+
+            def end_cell(self):
+                """Finish the open cell, keeping it only when it has text"""
+                if self.cell is None:
+                    return
+                cell_text = ' '.join(self.cell)
+                self.cell = None
+                if cell_text:
+                    if self.row is None:
+                        self.row = []
+                    self.row.append(cell_text)
+
+            def end_row(self):
+                """Finish the open row, keeping it only when it has cells"""
+                self.end_cell()
+                if self.row:
+                    self.rows.append(self.row)
+                self.row = None
+
+            def handle_starttag(self, tag, attrs):
+                if tag == 'table':
+                    self.depth += 1
+                elif self.depth and tag == 'tr':
+                    # A new row implicitly closes an unterminated one
+                    self.end_row()
+                    self.row = []
+                elif self.depth and tag in ('td', 'th'):
+                    # A new cell implicitly closes an unterminated one
+                    self.end_cell()
+                    self.cell = []
+
+            def handle_endtag(self, tag):
+                if tag == 'table':
+                    if self.depth:
+                        self.end_row()
+                        self.depth -= 1
+                elif self.depth and tag in ('td', 'th'):
+                    self.end_cell()
+                elif self.depth and tag == 'tr':
+                    self.end_row()
+
+            def handle_data(self, data):
+                if not self.depth:
+                    return
+                # Collapse internal whitespace so one row stays on one line
+                fragment = ' '.join(data.split())
+                if fragment:
+                    if self.cell is None:
+                        # Loose text inside a table (e.g. a caption)
+                        self.cell = []
+                    self.cell.append(fragment)
+
+        try:
+            parser = TableTextExtractor()
+            parser.feed(html_content)
+            parser.close()     # Flush any text still buffered by the parser
+            parser.end_row()   # Flush a row left open by truncated HTML
+
+            return '\n'.join(' | '.join(row) for row in parser.rows)
+
+        except Exception as e:
+            logger.warning(f"Failed to extract table text: {e}")
+            # Fallback: just remove HTML tags
+            text = re.sub(r'<[^>]+>', ' ', html_content)
+            text = re.sub(r'\s+', ' ', text)
+            return text.strip()
+
+    def analyze_layout(self, image_path: Path, output_dir: Optional[Path] = None) -> Tuple[Optional[Dict], List[Dict]]:
        """
        Analyze document layout using PP-StructureV3

        Args:
            image_path: Path to image file
+            output_dir: Optional output directory for saving extracted images (defaults to image_path.parent)

        Returns:
            Tuple of (layout_data, images_metadata)
@@ -548,16 +609,59 @@ class OCRService:
                                'page': page_idx,
                                'bbox': [],  # PP-StructureV3 doesn't provide individual bbox in this format
                            }
+
+                            # Extract text from table for translation purposes
+                            if has_table:
+                                table_text = self._extract_table_text(markdown_texts)
+                                element['extracted_text'] = table_text
+                                logger.info(f"Extracted {len(table_text)} characters from table")
+
                            layout_elements.append(element)

-                        # Add image metadata
+                        # Add image metadata and SAVE images to disk
                        for img_idx, (img_path, img_obj) in enumerate(markdown_images.items()):
+                            # Save image to disk
+                            try:
+                                # Determine base directory for saving images
+                                base_dir = output_dir if output_dir else image_path.parent
+
+                                # Create full path for image file
+                                full_img_path = base_dir / img_path
+
+                                # Create imgs/ subdirectory if it doesn't exist
+                                full_img_path.parent.mkdir(parents=True, exist_ok=True)
+
+                                # Save image object to disk
+                                if hasattr(img_obj, 'save'):
+                                    # img_obj is PIL Image
+                                    img_obj.save(str(full_img_path))
+                                    logger.info(f"Saved extracted image to {full_img_path}")
+                                else:
+                                    logger.warning(f"Image object for {img_path} does not have save() method, skipping")
+
+                            except Exception as e:
+                                logger.warning(f"Failed to save image {img_path}: {str(e)}")
+                                # Continue processing even if image save fails
+
+                            # Extract bbox from filename (format: img_in_table_box_x1_y1_x2_y2.jpg)
+                            bbox = []
+                            try:
+                                import re
+                                match = re.search(r'box_(\d+)_(\d+)_(\d+)_(\d+)', img_path)
+                                if match:
+                                    x1, y1, x2, y2 = map(int, match.groups())
+                                    # Convert to 4-point bbox format: [[x1,y1], [x2,y1], [x2,y2], [x1,y2]]
+                                    bbox = [[x1, y1], [x2, y1], [x2, y2], [x1, y2]]
+                                    logger.info(f"Extracted bbox from filename: {bbox}")
+                            except Exception as e:
+                                logger.warning(f"Failed to extract bbox from {img_path}: {e}")
+
                            images_metadata.append({
                                'element_id': len(layout_elements) + img_idx,
                                'image_path': img_path,
                                'type': 'image',
                                'page': page_idx,
-                                'bbox': [],
+                                'bbox': bbox,
                            })

            if layout_elements:
@@ -638,18 +742,20 @@ class OCRService:
        self,
        result: Dict,
        output_dir: Path,
-        file_id: str
-    ) -> Tuple[Optional[Path], Optional[Path]]:
+        file_id: str,
+        source_file_path: Optional[Path] = None
+    ) -> Tuple[Optional[Path], Optional[Path], Optional[Path]]:
        """
-        Save OCR results to JSON and Markdown files
+        Save OCR results to JSON, Markdown, and layout-preserving PDF files

        Args:
            result: OCR result dictionary
            output_dir: Output directory
            file_id: Unique file identifier
+            source_file_path: Optional path to original source file for PDF generation

        Returns:
-            Tuple of (json_path, markdown_path)
+            Tuple of (json_path, markdown_path, pdf_path)
        """
        try:
            output_dir.mkdir(parents=True, exist_ok=True)
@@ -666,8 +772,37 @@ class OCRService:
                f.write(markdown_content)

            logger.info(f"Results saved: {json_path.name}, {markdown_path.name}")
-            return json_path, markdown_path
+
+            # Generate layout-preserving PDF
+            pdf_path = None
+            try:
+                from app.services.pdf_generator_service import pdf_generator_service
+
+                pdf_filename = f"{file_id}_layout.pdf"
+                pdf_path = output_dir / pdf_filename
+
+                logger.info(f"Generating layout-preserving PDF: {pdf_filename}")
+
+                success = pdf_generator_service.generate_layout_pdf(
+                    json_path=json_path,
+                    output_path=pdf_path,
+                    source_file_path=source_file_path
+                )
+
+                if success:
+                    logger.info(f"✓ PDF generated successfully: {pdf_path.name}")
+                else:
+                    logger.warning(f"✗ PDF generation failed for {file_id}")
+                    pdf_path = None
+
+            except Exception as e:
+                logger.error(f"Error generating PDF for {file_id}: {str(e)}")
+                import traceback
+                traceback.print_exc()
+                pdf_path = None
+
+            return json_path, markdown_path, pdf_path

        except Exception as e:
            logger.error(f"Error saving results: {str(e)}")
-            return None, None
+            return None, None, None
--- a/backend/app/services/pdf_generator_service.py
+++ b/backend/app/services/pdf_generator_service.py
@@ -0,0 +1,626 @@
+"""
+Layout-Preserving PDF Generation Service
+Generates PDF files that preserve the original document layout using OCR JSON data
+"""
+
+import json
+import logging
+from pathlib import Path
+from typing import Dict, List, Optional, Tuple
+from datetime import datetime
+
+from reportlab.lib.pagesizes import A4, letter
+from reportlab.lib.units import mm
+from reportlab.pdfgen import canvas
+from reportlab.pdfbase import pdfmetrics
+from reportlab.pdfbase.ttfonts import TTFont
+from reportlab.platypus import Table, TableStyle
+from reportlab.lib import colors
+from reportlab.lib.enums import TA_CENTER, TA_LEFT, TA_RIGHT
+from reportlab.platypus import Paragraph
+from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
+from PIL import Image
+from html.parser import HTMLParser
+
+from app.core.config import settings
+
+logger = logging.getLogger(__name__)
+
+
+class HTMLTableParser(HTMLParser):
+    """
+    Parse HTML table to extract structure and data
+
+    After feed(), `tables` holds one dict per <table> that contained rows:
+        {'rows': [{'cells': [{'text', 'is_header', 'colspan', 'rowspan'}]}]}
+
+    Cells and rows whose end tags are omitted (valid HTML) are closed
+    implicitly when the next cell, row, or the table end is encountered.
+    Nested tables are not supported: an inner <table> replaces the table
+    being collected.
+    """
+
+    def __init__(self):
+        super().__init__()
+        self.tables = []
+        self.current_table = None
+        self.current_row = None
+        self.current_cell = None
+        self.in_table = False
+
+    @staticmethod
+    def _parse_span(value):
+        """Convert a colspan/rowspan attribute value to an int >= 1 (default 1)"""
+        try:
+            return max(int(value), 1)
+        except (TypeError, ValueError):
+            # Missing value (e.g. <td colspan>) or non-numeric text
+            return 1
+
+    def _close_cell(self):
+        """Append the open cell (if any) to the open row"""
+        if self.current_cell is not None:
+            if self.current_row is not None:
+                self.current_row['cells'].append(self.current_cell)
+            self.current_cell = None
+
+    def _close_row(self):
+        """Append the open row (if any), including its open cell, to the table"""
+        self._close_cell()
+        if self.current_row is not None:
+            if self.current_table is not None:
+                self.current_table['rows'].append(self.current_row)
+            self.current_row = None
+
+    def handle_starttag(self, tag, attrs):
+        attrs_dict = dict(attrs)
+
+        if tag == 'table':
+            self.in_table = True
+            self.current_table = {'rows': []}
+
+        elif tag == 'tr' and self.in_table:
+            # Keep a previous row whose </tr> was omitted
+            self._close_row()
+            self.current_row = {'cells': []}
+
+        elif tag in ('td', 'th') and self.in_table and self.current_row is not None:
+            # Keep a previous cell whose </td> or </th> was omitted
+            self._close_cell()
+            self.current_cell = {
+                'text': '',
+                'is_header': tag == 'th',
+                'colspan': self._parse_span(attrs_dict.get('colspan', 1)),
+                'rowspan': self._parse_span(attrs_dict.get('rowspan', 1))
+            }
+
+    def handle_endtag(self, tag):
+        if tag == 'table' and self.in_table:
+            # Flush a row/cell left open when the table ends
+            self._close_row()
+            if self.current_table and self.current_table['rows']:
+                self.tables.append(self.current_table)
+            self.current_table = None
+            self.in_table = False
+
+        elif tag == 'tr' and self.current_row is not None:
+            self._close_row()
+
+        elif tag in ('td', 'th') and self.current_cell is not None:
+            self._close_cell()
+
+    def handle_data(self, data):
+        if self.current_cell is not None:
+            # Trailing space separates fragments split by inline tags
+            self.current_cell['text'] += data.strip() + ' '
+
+
+class PDFGeneratorService:
+    """Service for generating layout-preserving PDFs from OCR JSON data"""
+
+    def __init__(self):
+        """Set font defaults, then attempt to register the Chinese font"""
+        # Defaults describe the "no font registered" state; the registration
+        # call below updates font_path / font_registered when it succeeds.
+        self.font_registered = False
+        self.font_path = None
+        self.font_name = 'NotoSansSC'
+
+        self._register_chinese_font()
+
+    def _register_chinese_font(self):
+        """
+        Register the configured Chinese TrueType font with ReportLab
+
+        The path comes from settings.chinese_font_path. A relative path is
+        resolved against the directory four levels above this file. On
+        success font_path and font_registered are updated; on any failure
+        the error is logged and the service stays in the unregistered state.
+        """
+        try:
+            configured = Path(settings.chinese_font_path)
+
+            if configured.is_absolute():
+                resolved = configured
+            else:
+                # settings.chinese_font_path starts with ./backend/, so it is
+                # anchored at the project root rather than the working dir
+                repo_root = Path(__file__).resolve().parent.parent.parent.parent
+                resolved = repo_root / configured
+
+            if resolved.exists():
+                pdfmetrics.registerFont(TTFont(self.font_name, str(resolved)))
+                self.font_path = resolved
+                self.font_registered = True
+                logger.info(f"Chinese font registered: {self.font_name} from {resolved}")
+            else:
+                logger.error(f"Chinese font not found at {resolved}")
+
+        except Exception as e:
+            logger.error(f"Failed to register Chinese font: {e}")
+            self.font_registered = False
+
+    def load_ocr_json(self, json_path: Path) -> Optional[Dict]:
+        """
+        Read an OCR result file and return its decoded JSON content
+
+        Args:
+            json_path: Location of the OCR JSON result file
+
+        Returns:
+            The decoded JSON data, or None when reading or decoding fails
+        """
+        parsed = None
+        try:
+            with open(json_path, mode='r', encoding='utf-8') as handle:
+                parsed = json.load(handle)
+            logger.info(f"Loaded OCR JSON: {json_path.name}")
+        except Exception as e:
+            # Any failure (missing file, bad JSON, ...) is reported as None
+            logger.error(f"Failed to load JSON {json_path}: {e}")
+            parsed = None
+
+        return parsed
+
+    def calculate_page_dimensions(self, text_regions: List[Dict], source_file_path: Optional[Path] = None) -> Tuple[float, float]:
+        """
+        Work out the page size, preferring the source file over OCR boxes
+
+        Args:
+            text_regions: Text regions, each optionally carrying a 'bbox'
+                list of [x, y] points
+            source_file_path: Optional original file; when its size can be
+                read it is returned directly
+
+        Returns:
+            Tuple of (width, height) in points
+        """
+        # The source file, when readable, is the most accurate reference
+        if source_file_path:
+            source_dims = self.get_original_page_size(source_file_path)
+            if source_dims:
+                return source_dims
+
+        # Nothing to measure: fall back to A4
+        if not text_regions:
+            return A4
+
+        # Seed with 0 so the extent never drops below the origin
+        xs = [0]
+        ys = [0]
+
+        for region in text_regions:
+            quad = region.get('bbox', [])
+            if not quad or len(quad) < 4:
+                continue
+
+            # quad format: [[x1,y1], [x2,y2], [x3,y3], [x4,y4]]
+            for pt in quad:
+                if isinstance(pt, (list, tuple)) and len(pt) >= 2:
+                    xs.append(pt[0])
+                    ys.append(pt[1])
+
+        max_x = max(xs)
+        max_y = max(ys)
+
+        # OCR pixels are used directly as points (1:1 mapping), without any
+        # padding - padding causes layout issues
+        if max_x > 0:
+            width = max_x
+        else:
+            width = A4[0]
+
+        if max_y > 0:
+            height = max_y
+        else:
+            height = A4[1]
+
+        logger.info(f"Calculated page dimensions from OCR: {width:.1f} x {height:.1f} points")
+        return (width, height)
+
+    def get_original_page_size(self, file_path: Path) -> Optional[Tuple[float, float]]:
+        """
+        Extract page dimensions from original source file
+
+        Only raster images (.png, .jpg, .jpeg, .bmp, .tiff) are supported.
+        Any other file type, a missing file, or a read failure yields None
+        so that callers fall back to dimensions calculated from OCR data.
+
+        Args:
+            file_path: Path to original file (image or PDF)
+
+        Returns:
+            Tuple of (width, height) in points or None
+        """
+        try:
+            if not file_path.exists():
+                return None
+
+            # For images, get dimensions from PIL
+            if file_path.suffix.lower() in ['.png', '.jpg', '.jpeg', '.bmp', '.tiff']:
+                # Context manager closes the underlying file handle; only
+                # the header is needed to read the size
+                with Image.open(file_path) as img:
+                    # Use pixel dimensions directly as points (1:1 mapping)
+                    # This matches how PaddleOCR reports coordinates
+                    width_pt = float(img.width)
+                    height_pt = float(img.height)
+                logger.info(f"Extracted dimensions from image: {width_pt:.1f} x {height_pt:.1f} points (1:1 pixel mapping)")
+                return (width_pt, height_pt)
+
+            # For PDFs, would need PyPDF2 or similar
+            # For now, return None to use calculated dimensions
+
+        except Exception as e:
+            logger.warning(f"Failed to get page size from {file_path}: {e}")
+
+        return None
+
+    def draw_text_region(
+        self,
+        pdf_canvas: canvas.Canvas,
+        region: Dict,
+        page_height: float
+    ):
+        """
+        Draw a text region at precise coordinates
+
+        The font size starts at 75% of the bbox height (clamped to 4-72pt)
+        and is scaled down when the rendered text would be wider than the
+        bbox. Regions without text or with fewer than 4 bbox points are
+        skipped. Drawing errors are logged and never propagated.
+
+        Args:
+            pdf_canvas: ReportLab canvas object
+            region: Text region dict with text and bbox
+            page_height: Height of page (for coordinate transformation)
+        """
+        text = region.get('text', '')
+        bbox = region.get('bbox', [])
+
+        if not text or not bbox or len(bbox) < 4:
+            return
+
+        try:
+            # bbox from OCR: [[x1,y1], [x2,y2], [x3,y3], [x4,y4]]
+            # Points: top-left, top-right, bottom-right, bottom-left
+            # OCR coordinates: origin (0,0) at top-left, Y increases downward
+            ocr_x_left = bbox[0][0]    # Left X
+            ocr_y_top = bbox[0][1]     # Top Y in OCR coordinates
+            ocr_x_right = bbox[2][0]   # Right X
+            ocr_y_bottom = bbox[2][1]  # Bottom Y in OCR coordinates
+
+            # Calculate bbox dimensions
+            bbox_width = abs(ocr_x_right - ocr_x_left)
+            bbox_height = abs(ocr_y_bottom - ocr_y_top)
+
+            # Calculate font size using heuristics
+            # Font size is typically 70-90% of bbox height
+            # Testing shows 0.75 works well for most cases
+            font_size = bbox_height * 0.75
+            font_size = max(min(font_size, 72), 4)  # Clamp between 4pt and 72pt
+
+            # Transform coordinates: OCR (top-left origin) → PDF (bottom-left origin)
+            # CRITICAL: Y-axis flip!
+            pdf_x = ocr_x_left
+            pdf_y = page_height - ocr_y_bottom  # Flip Y-axis using bottom coordinate
+
+            # Set font
+            font_name = self.font_name if self.font_registered else 'Helvetica'
+            pdf_canvas.setFont(font_name, font_size)
+
+            # Calculate text width to prevent overflow
+            text_width = pdf_canvas.stringWidth(text, font_name, font_size)
+
+            # If text is too wide for bbox, scale down font
+            if text_width > bbox_width:
+                scale_factor = bbox_width / text_width
+                font_size = font_size * scale_factor * 0.95  # 95% to add small margin
+                font_size = max(font_size, 3)  # Minimum 3pt
+                pdf_canvas.setFont(font_name, font_size)
+
+            # Draw text at calculated position
+            pdf_canvas.drawString(pdf_x, pdf_y, text)
+
+            # Debug: Draw bounding box (optional)
+            if settings.pdf_enable_bbox_debug:
+                # Isolate stroke colour / line width so the debug overlay
+                # does not alter anything drawn on the canvas afterwards
+                pdf_canvas.saveState()
+                try:
+                    pdf_canvas.setStrokeColorRGB(1, 0, 0, 0.3)  # Red, semi-transparent
+                    pdf_canvas.setLineWidth(0.5)
+                    # Transform all bbox points to PDF coordinates
+                    pdf_points = [(p[0], page_height - p[1]) for p in bbox]
+                    # Draw bbox rectangle
+                    for i in range(4):
+                        x1, y1 = pdf_points[i]
+                        x2, y2 = pdf_points[(i + 1) % 4]
+                        pdf_canvas.line(x1, y1, x2, y2)
+                finally:
+                    pdf_canvas.restoreState()
+
+        except Exception as e:
+            logger.warning(f"Failed to draw text region '{text[:20]}...': {e}")
+
+    def draw_table_region(
+        self,
+        pdf_canvas: canvas.Canvas,
+        table_element: Dict,
+        images_metadata: List[Dict],
+        page_height: float
+    ):
+        """
+        Draw a table region by parsing HTML and rebuilding with ReportLab Table
+
+        Only the first table found in the HTML is drawn. Its position is
+        taken from the first image metadata entry whose path contains
+        'table' and that has a 4-point bbox. Merged cells (colspan/rowspan)
+        are reproduced with ReportLab SPAN commands. Columns share the bbox
+        width equally. Errors are logged and never propagated.
+
+        Args:
+            pdf_canvas: ReportLab canvas object
+            table_element: Table element dict with HTML content
+            images_metadata: List of image metadata to find table bbox
+            page_height: Height of page
+        """
+        try:
+            html_content = table_element.get('content', '')
+            if not html_content:
+                return
+
+            # Parse HTML to extract table structure
+            parser = HTMLTableParser()
+            parser.feed(html_content)
+
+            if not parser.tables:
+                logger.warning("No tables found in HTML content")
+                return
+
+            # Get the first table (PP-StructureV3 usually provides one table per element)
+            table_data = parser.tables[0]
+            rows = table_data['rows']
+
+            if not rows:
+                return
+
+            # Find corresponding table image to get bbox
+            table_bbox = None
+            for img_meta in images_metadata:
+                img_path = img_meta.get('image_path', '')
+                if 'table' in img_path.lower():
+                    bbox = img_meta.get('bbox', [])
+                    if bbox and len(bbox) >= 4:
+                        table_bbox = bbox
+                        break
+
+            if not table_bbox:
+                logger.warning("No bbox found for table")
+                return
+
+            # Extract bbox coordinates
+            ocr_x_left = table_bbox[0][0]
+            ocr_y_top = table_bbox[0][1]
+            ocr_x_right = table_bbox[2][0]
+            ocr_y_bottom = table_bbox[2][1]
+
+            table_width = abs(ocr_x_right - ocr_x_left)
+            table_height = abs(ocr_y_bottom - ocr_y_top)
+
+            # Transform coordinates: OCR (top-left origin) → PDF (bottom-left origin)
+            pdf_x = ocr_x_left
+            pdf_y = page_height - ocr_y_bottom
+
+            # Build a rectangular grid that honours colspan/rowspan
+            reportlab_data, span_commands = self._build_table_grid(rows)
+            max_cols = len(reportlab_data[0]) if reportlab_data else 0
+
+            if max_cols == 0:
+                # Rows without any cells: nothing to draw (and no width to divide)
+                logger.warning("Table has no cells, skipping")
+                return
+
+            # Calculate column widths (equal distribution)
+            col_widths = [table_width / max_cols] * max_cols
+
+            # Use smaller font size to fit in bbox (clamped to 6-10pt)
+            font_size = min(table_height / len(rows) * 0.5, 10)
+            font_size = max(font_size, 6)
+
+            # Create table with font
+            table = Table(reportlab_data, colWidths=col_widths)
+
+            # Apply table style
+            style = TableStyle([
+                ('FONT', (0, 0), (-1, -1), self.font_name if self.font_registered else 'Helvetica', font_size),
+                ('GRID', (0, 0), (-1, -1), 0.5, colors.black),
+                ('VALIGN', (0, 0), (-1, -1), 'MIDDLE'),
+                ('ALIGN', (0, 0), (-1, -1), 'CENTER'),
+                ('LEFTPADDING', (0, 0), (-1, -1), 2),
+                ('RIGHTPADDING', (0, 0), (-1, -1), 2),
+                ('TOPPADDING', (0, 0), (-1, -1), 2),
+                ('BOTTOMPADDING', (0, 0), (-1, -1), 2),
+            ])
+
+            # Merge cells that span several columns and/or rows
+            for span_start, span_end in span_commands:
+                style.add('SPAN', span_start, span_end)
+
+            # Add header style if first row has headers
+            if rows and rows[0]['cells'] and rows[0]['cells'][0].get('is_header'):
+                style.add('BACKGROUND', (0, 0), (-1, 0), colors.lightgrey)
+                style.add('FONT', (0, 0), (-1, 0), self.font_name if self.font_registered else 'Helvetica-Bold', font_size)
+
+            table.setStyle(style)
+
+            # Calculate table size
+            table.wrapOn(pdf_canvas, table_width, table_height)
+
+            # Draw table at position
+            table.drawOn(pdf_canvas, pdf_x, pdf_y)
+
+            logger.info(f"Drew table at ({pdf_x:.0f}, {pdf_y:.0f}) size {table_width:.0f}x{table_height:.0f} with {len(rows)} rows")
+
+        except Exception as e:
+            logger.warning(f"Failed to draw table region: {e}")
+
+    @staticmethod
+    def _build_table_grid(rows: List[Dict]) -> Tuple[List[List[str]], List[Tuple[Tuple[int, int], Tuple[int, int]]]]:
+        """
+        Expand parsed table rows into a rectangular grid honouring spans
+
+        Args:
+            rows: Parsed rows, each {'cells': [{'text', 'colspan', 'rowspan'}]}
+
+        Returns:
+            Tuple of (grid, spans): grid is a list of equally long rows of
+            cell text; spans is a list of ((col, row), (col, row)) start/end
+            pairs, one per merged cell
+        """
+        occupied = set()   # (row, col) slots already taken by a cell or span
+        placed = []        # (row, col, text, colspan, rowspan)
+        n_cols = 0
+
+        for row_idx, row in enumerate(rows):
+            col_idx = 0
+            for cell in row['cells']:
+                # Skip slots covered by a rowspan from an earlier row
+                while (row_idx, col_idx) in occupied:
+                    col_idx += 1
+
+                colspan = max(int(cell.get('colspan') or 1), 1)
+                rowspan = max(int(cell.get('rowspan') or 1), 1)
+                # A span cannot extend past the last row of the table
+                rowspan = min(rowspan, len(rows) - row_idx)
+
+                for d_row in range(rowspan):
+                    for d_col in range(colspan):
+                        occupied.add((row_idx + d_row, col_idx + d_col))
+
+                placed.append((row_idx, col_idx, cell['text'].strip(), colspan, rowspan))
+                col_idx += colspan
+                n_cols = max(n_cols, col_idx)
+
+        # Slots without a cell (short rows, spanned-over slots) stay empty
+        grid = [[''] * n_cols for _ in rows]
+        spans = []
+        for row_idx, col_idx, text, colspan, rowspan in placed:
+            grid[row_idx][col_idx] = text
+            if colspan > 1 or rowspan > 1:
+                spans.append((
+                    (col_idx, row_idx),
+                    (col_idx + colspan - 1, row_idx + rowspan - 1)
+                ))
+
+        return grid, spans
+            import traceback
+            traceback.print_exc()
+
+    def draw_image_region(
+        self,
+        pdf_canvas: canvas.Canvas,
+        region: Dict,
+        page_height: float,
+        result_dir: Path
+    ):
+        """
+        Draw an image region by embedding the extracted image
+
+        Handles images extracted by PP-StructureV3 (tables, figures, charts, etc.)
+
+        Args:
+            pdf_canvas: ReportLab canvas object
+            region: Image metadata dict with image_path and bbox
+            page_height: Height of page (for coordinate transformation)
+            result_dir: Directory containing result files
+        """
+        try:
+            image_path_str = region.get('image_path', '')
+            if not image_path_str:
+                return
+
+            # Construct full path to image
+            image_path = result_dir / image_path_str
+
+            if not image_path.exists():
+                logger.warning(f"Image not found: {image_path}")
+                return
+
+            # Get bbox for positioning
+            bbox = region.get('bbox', [])
+            if not bbox or len(bbox) < 4:
+                # If no bbox, skip for now
+                logger.warning(f"No bbox for image {image_path_str}")
+                return
+
+            # bbox from OCR: [[x1,y1], [x2,y2], [x3,y3], [x4,y4]]
+            # OCR coordinates: origin (0,0) at top-left, Y increases downward
+            ocr_x_left = bbox[0][0]
+            ocr_y_top = bbox[0][1]
+            ocr_x_right = bbox[2][0]
+            ocr_y_bottom = bbox[2][1]
+
+            # Calculate bbox dimensions
+            bbox_width = abs(ocr_x_right - ocr_x_left)
+            bbox_height = abs(ocr_y_bottom - ocr_y_top)
+
+            # Transform coordinates: OCR (top-left origin) → PDF (bottom-left origin)
+            # CRITICAL: Y-axis flip!
+            # For images, we position at bottom-left corner
+            pdf_x_left = ocr_x_left
+            pdf_y_bottom = page_height - ocr_y_bottom  # Flip Y-axis
+
+            # Draw image using ReportLab
+            # drawImage expects: (path, x, y, width, height)
+            # where (x, y) is the bottom-left corner of the image
+            pdf_canvas.drawImage(
+                str(image_path),
+                pdf_x_left,
+                pdf_y_bottom,
+                width=bbox_width,
+                height=bbox_height,
+                preserveAspectRatio=True,
+                mask='auto'  # Handle transparency
+            )
+
+            logger.info(f"Drew image: {image_path_str} at ({pdf_x_left:.0f}, {pdf_y_bottom:.0f}) size {bbox_width:.0f}x{bbox_height:.0f}")
+
+        except Exception as e:
+            logger.warning(f"Failed to draw image region: {e}")
+
+    def generate_layout_pdf(
+        self,
+        json_path: Path,
+        output_path: Path,
+        source_file_path: Optional[Path] = None
+    ) -> bool:
+        """
+        Generate layout-preserving PDF from OCR JSON data
+
+        Args:
+            json_path: Path to OCR JSON file
+            output_path: Path to save generated PDF
+            source_file_path: Optional path to original source file for dimension extraction
+
+        Returns:
+            True if successful, False otherwise
+        """
+        try:
+            # Check if PDF already exists (caching)
+            if output_path.exists():
+                logger.info(f"PDF already exists: {output_path.name}")
+                return True
+
+            # Load JSON data
+            ocr_data = self.load_ocr_json(json_path)
+            if not ocr_data:
+                return False
+
+            # Get text regions
+            text_regions = ocr_data.get('text_regions', [])
+            if not text_regions:
+                logger.warning("No text regions found in JSON")
+                return False
+
+            # Get images metadata
+            images_metadata = ocr_data.get('images_metadata', [])
+
+            # Get layout data
+            layout_data = ocr_data.get('layout_data', {})
+
+            # Determine page dimensions
+            page_size = self.calculate_page_dimensions(text_regions, source_file_path)
+
+            page_width, page_height = page_size
+
+            # Create PDF canvas
+            pdf_canvas = canvas.Canvas(str(output_path), pagesize=(page_width, page_height))
+
+            # Extract table bboxes to exclude text in those regions
+            table_bboxes = []
+            for img_meta in images_metadata:
+                img_path = img_meta.get('image_path', '')
+                if 'table' in img_path.lower():
+                    bbox = img_meta.get('bbox', [])
+                    if bbox and len(bbox) >= 4:
+                        table_bboxes.append(bbox)
+
+            # Helper function to check if a point is inside a bbox
+            def point_in_bbox(x, y, bbox):
+                x1, y1 = bbox[0]
+                x2, y2 = bbox[2]
+                return min(x1, x2) <= x <= max(x1, x2) and min(y1, y2) <= y <= max(y1, y2)
+
+            # Filter text regions to exclude those inside tables
+            filtered_text_regions = []
+            for region in text_regions:
+                bbox = region.get('bbox', [])
+                if not bbox or len(bbox) < 4:
+                    continue
+
+                # Check if text region center is inside any table bbox
+                center_x = (bbox[0][0] + bbox[2][0]) / 2
+                center_y = (bbox[0][1] + bbox[2][1]) / 2
+
+                is_in_table = any(point_in_bbox(center_x, center_y, table_bbox) for table_bbox in table_bboxes)
+
+                if not is_in_table:
+                    filtered_text_regions.append(region)
+                else:
+                    logger.debug(f"Excluded text '{region.get('text', '')[:20]}...' (inside table)")
+
+            logger.info(f"Filtered {len(text_regions) - len(filtered_text_regions)} text regions inside tables")
+
+            # Group regions by page
+            pages_data = {}
+            for region in filtered_text_regions:
+                page_num = region.get('page', 1)
+                if page_num not in pages_data:
+                    pages_data[page_num] = []
+                pages_data[page_num].append(region)
+
+            # Get table elements from layout_data
+            table_elements = []
+            if layout_data and layout_data.get('elements'):
+                table_elements = [e for e in layout_data['elements'] if e.get('type') == 'table']
+
+            # Process each page
+            total_pages = ocr_data.get('total_pages', 1)
+            for page_num in range(1, total_pages + 1):
+                if page_num > 1:
+                    pdf_canvas.showPage()  # Start new page
+
+                # Draw text regions for this page (excluding table text)
+                page_regions = pages_data.get(page_num, [])
+                for region in page_regions:
+                    self.draw_text_region(pdf_canvas, region, page_height)
+
+                # Draw tables for this page
+                for table_elem in table_elements:
+                    if table_elem.get('page', 0) == page_num - 1:  # page is 0-indexed
+                        self.draw_table_region(pdf_canvas, table_elem, images_metadata, page_height)
+
+                # Draw non-table images for this page (figure, chart, seal, etc.)
+                for img_meta in images_metadata:
+                    if img_meta.get('page') == page_num - 1:  # page is 0-indexed
+                        img_path = img_meta.get('image_path', '')
+                        # Skip table images (they're now rendered as tables)
+                        if 'table' not in img_path.lower():
+                            self.draw_image_region(
+                                pdf_canvas,
+                                img_meta,
+                                page_height,
+                                json_path.parent
+                            )
+
+            # Save PDF
+            pdf_canvas.save()
+
+            file_size = output_path.stat().st_size
+            logger.info(f"Generated layout-preserving PDF: {output_path.name} ({file_size} bytes)")
+            return True
+
+        except Exception as e:
+            logger.error(f"Failed to generate PDF: {e}")
+            import traceback
+            traceback.print_exc()
+            return False
+
+
+# Module-level singleton: constructed once, when this module is first imported
+pdf_generator_service = PDFGeneratorService()