""" Layout-Preserving PDF Generation Service Generates PDF files that preserve the original document layout using OCR JSON data """ import json import logging from pathlib import Path from typing import Dict, List, Optional, Tuple from datetime import datetime from reportlab.lib.pagesizes import A4, letter from reportlab.lib.units import mm from reportlab.pdfgen import canvas from reportlab.pdfbase import pdfmetrics from reportlab.pdfbase.ttfonts import TTFont from reportlab.platypus import Table, TableStyle from reportlab.lib import colors from reportlab.lib.enums import TA_CENTER, TA_LEFT, TA_RIGHT from reportlab.platypus import Paragraph from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle from PIL import Image from html.parser import HTMLParser from app.core.config import settings logger = logging.getLogger(__name__) class HTMLTableParser(HTMLParser): """Parse HTML table to extract structure and data""" def __init__(self): super().__init__() self.tables = [] self.current_table = None self.current_row = None self.current_cell = None self.in_table = False def handle_starttag(self, tag, attrs): attrs_dict = dict(attrs) if tag == 'table': self.in_table = True self.current_table = {'rows': []} elif tag == 'tr' and self.in_table: self.current_row = {'cells': []} elif tag in ('td', 'th') and self.in_table and self.current_row is not None: colspan = int(attrs_dict.get('colspan', 1)) rowspan = int(attrs_dict.get('rowspan', 1)) self.current_cell = { 'text': '', 'is_header': tag == 'th', 'colspan': colspan, 'rowspan': rowspan } def handle_endtag(self, tag): if tag == 'table' and self.in_table: if self.current_table and self.current_table['rows']: self.tables.append(self.current_table) self.current_table = None self.in_table = False elif tag == 'tr' and self.current_row is not None: if self.current_table is not None: self.current_table['rows'].append(self.current_row) self.current_row = None elif tag in ('td', 'th') and self.current_cell is not None: if self.current_row is not None: self.current_row['cells'].append(self.current_cell) self.current_cell = None def handle_data(self, data): if self.current_cell is not None: self.current_cell['text'] += data.strip() + ' ' class PDFGeneratorService: """Service for generating layout-preserving PDFs from OCR JSON data""" def __init__(self): """Initialize PDF generator with font configuration""" self.font_name = 'NotoSansSC' self.font_path = None self.font_registered = False self._register_chinese_font() def _register_chinese_font(self): """Register Chinese font for PDF generation""" try: # Get font path from settings font_path = Path(settings.chinese_font_path) # Try relative path from project root if not font_path.is_absolute(): # Adjust path - settings.chinese_font_path starts with ./backend/ project_root = Path(__file__).resolve().parent.parent.parent.parent font_path = project_root / font_path if not font_path.exists(): logger.error(f"Chinese font not found at {font_path}") return # Register font pdfmetrics.registerFont(TTFont(self.font_name, str(font_path))) self.font_path = font_path self.font_registered = True logger.info(f"Chinese font registered: {self.font_name} from {font_path}") except Exception as e: logger.error(f"Failed to register Chinese font: {e}") self.font_registered = False def load_ocr_json(self, json_path: Path) -> Optional[Dict]: """ Load and parse OCR JSON result file Args: json_path: Path to JSON file Returns: Parsed JSON data or None if failed """ try: with open(json_path, 'r', encoding='utf-8') as f: data = json.load(f) logger.info(f"Loaded OCR JSON: {json_path.name}") return data except Exception as e: logger.error(f"Failed to load JSON {json_path}: {e}") return None def calculate_page_dimensions(self, text_regions: List[Dict], source_file_path: Optional[Path] = None) -> Tuple[float, float]: """ Calculate page dimensions from source file or text region bounding boxes Args: text_regions: List of text regions with bbox coordinates source_file_path: Optional path to source file for accurate dimensions Returns: Tuple of (width, height) in points """ # First try to get dimensions from source file if source_file_path: dims = self.get_original_page_size(source_file_path) if dims: return dims if not text_regions: return A4 # Default to A4 size max_x = 0 max_y = 0 for region in text_regions: bbox = region.get('bbox', []) if not bbox or len(bbox) < 4: continue # bbox format: [[x1,y1], [x2,y2], [x3,y3], [x4,y4]] for point in bbox: if isinstance(point, (list, tuple)) and len(point) >= 2: x, y = point[0], point[1] max_x = max(max_x, x) max_y = max(max_y, y) # OCR coordinates are in pixels, use them directly as points (1:1 mapping) # Do NOT add padding - this causes layout issues width = max_x if max_x > 0 else A4[0] height = max_y if max_y > 0 else A4[1] logger.info(f"Calculated page dimensions from OCR: {width:.1f} x {height:.1f} points") return (width, height) def get_original_page_size(self, file_path: Path) -> Optional[Tuple[float, float]]: """ Extract page dimensions from original source file Args: file_path: Path to original file (image or PDF) Returns: Tuple of (width, height) in points or None """ try: if not file_path.exists(): return None # For images, get dimensions from PIL if file_path.suffix.lower() in ['.png', '.jpg', '.jpeg', '.bmp', '.tiff']: img = Image.open(file_path) # Use pixel dimensions directly as points (1:1 mapping) # This matches how PaddleOCR reports coordinates width_pt = float(img.width) height_pt = float(img.height) logger.info(f"Extracted dimensions from image: {width_pt:.1f} x {height_pt:.1f} points (1:1 pixel mapping)") return (width_pt, height_pt) # For PDFs, would need PyPDF2 or similar # For now, return None to use calculated dimensions except Exception as e: logger.warning(f"Failed to get page size from {file_path}: {e}") return None def draw_text_region( self, pdf_canvas: canvas.Canvas, region: Dict, page_height: float ): """ Draw a text region at precise coordinates Args: pdf_canvas: ReportLab canvas object region: Text region dict with text, bbox, confidence page_height: Height of page (for coordinate transformation) """ text = region.get('text', '') bbox = region.get('bbox', []) confidence = region.get('confidence', 1.0) if not text or not bbox or len(bbox) < 4: return try: # bbox from OCR: [[x1,y1], [x2,y2], [x3,y3], [x4,y4]] # Points: top-left, top-right, bottom-right, bottom-left # OCR coordinates: origin (0,0) at top-left, Y increases downward ocr_x_left = bbox[0][0] # Left X ocr_y_top = bbox[0][1] # Top Y in OCR coordinates ocr_x_right = bbox[2][0] # Right X ocr_y_bottom = bbox[2][1] # Bottom Y in OCR coordinates # Calculate bbox dimensions bbox_width = abs(ocr_x_right - ocr_x_left) bbox_height = abs(ocr_y_bottom - ocr_y_top) # Calculate font size using heuristics # Font size is typically 70-90% of bbox height # Testing shows 0.75 works well for most cases font_size = bbox_height * 0.75 font_size = max(min(font_size, 72), 4) # Clamp between 4pt and 72pt # Transform coordinates: OCR (top-left origin) → PDF (bottom-left origin) # CRITICAL: Y-axis flip! pdf_x = ocr_x_left pdf_y = page_height - ocr_y_bottom # Flip Y-axis using bottom coordinate # Set font font_name = self.font_name if self.font_registered else 'Helvetica' pdf_canvas.setFont(font_name, font_size) # Calculate text width to prevent overflow text_width = pdf_canvas.stringWidth(text, font_name, font_size) # If text is too wide for bbox, scale down font if text_width > bbox_width: scale_factor = bbox_width / text_width font_size = font_size * scale_factor * 0.95 # 95% to add small margin font_size = max(font_size, 3) # Minimum 3pt pdf_canvas.setFont(font_name, font_size) # Draw text at calculated position pdf_canvas.drawString(pdf_x, pdf_y, text) # Debug: Draw bounding box (optional) if settings.pdf_enable_bbox_debug: pdf_canvas.setStrokeColorRGB(1, 0, 0, 0.3) # Red, semi-transparent pdf_canvas.setLineWidth(0.5) # Transform all bbox points to PDF coordinates pdf_points = [(p[0], page_height - p[1]) for p in bbox] # Draw bbox rectangle for i in range(4): x1, y1 = pdf_points[i] x2, y2 = pdf_points[(i + 1) % 4] pdf_canvas.line(x1, y1, x2, y2) except Exception as e: logger.warning(f"Failed to draw text region '{text[:20]}...': {e}") def draw_table_region( self, pdf_canvas: canvas.Canvas, table_element: Dict, images_metadata: List[Dict], page_height: float ): """ Draw a table region by parsing HTML and rebuilding with ReportLab Table Args: pdf_canvas: ReportLab canvas object table_element: Table element dict with HTML content images_metadata: List of image metadata to find table bbox page_height: Height of page """ try: html_content = table_element.get('content', '') if not html_content: return # Parse HTML to extract table structure parser = HTMLTableParser() parser.feed(html_content) if not parser.tables: logger.warning("No tables found in HTML content") return # Get the first table (PP-StructureV3 usually provides one table per element) table_data = parser.tables[0] rows = table_data['rows'] if not rows: return # Find corresponding table image to get bbox table_bbox = None for img_meta in images_metadata: img_path = img_meta.get('image_path', '') if 'table' in img_path.lower(): bbox = img_meta.get('bbox', []) if bbox and len(bbox) >= 4: table_bbox = bbox break if not table_bbox: logger.warning("No bbox found for table") return # Extract bbox coordinates ocr_x_left = table_bbox[0][0] ocr_y_top = table_bbox[0][1] ocr_x_right = table_bbox[2][0] ocr_y_bottom = table_bbox[2][1] table_width = abs(ocr_x_right - ocr_x_left) table_height = abs(ocr_y_bottom - ocr_y_top) # Transform coordinates pdf_x = ocr_x_left pdf_y = page_height - ocr_y_bottom # Build table data for ReportLab # Convert parsed structure to simple 2D array max_cols = max(len(row['cells']) for row in rows) reportlab_data = [] for row in rows: row_data = [] for cell in row['cells']: text = cell['text'].strip() row_data.append(text) # Pad row if needed while len(row_data) < max_cols: row_data.append('') reportlab_data.append(row_data) # Calculate column widths (equal distribution) col_widths = [table_width / max_cols] * max_cols # Create ReportLab Table # Use smaller font size to fit in bbox font_size = min(table_height / len(rows) * 0.5, 10) font_size = max(font_size, 6) # Create table with font table = Table(reportlab_data, colWidths=col_widths) # Apply table style style = TableStyle([ ('FONT', (0, 0), (-1, -1), self.font_name if self.font_registered else 'Helvetica', font_size), ('GRID', (0, 0), (-1, -1), 0.5, colors.black), ('VALIGN', (0, 0), (-1, -1), 'MIDDLE'), ('ALIGN', (0, 0), (-1, -1), 'CENTER'), ('LEFTPADDING', (0, 0), (-1, -1), 2), ('RIGHTPADDING', (0, 0), (-1, -1), 2), ('TOPPADDING', (0, 0), (-1, -1), 2), ('BOTTOMPADDING', (0, 0), (-1, -1), 2), ]) # Add header style if first row has headers if rows and rows[0]['cells'] and rows[0]['cells'][0].get('is_header'): style.add('BACKGROUND', (0, 0), (-1, 0), colors.lightgrey) style.add('FONT', (0, 0), (-1, 0), self.font_name if self.font_registered else 'Helvetica-Bold', font_size) table.setStyle(style) # Calculate table size table.wrapOn(pdf_canvas, table_width, table_height) # Draw table at position table.drawOn(pdf_canvas, pdf_x, pdf_y) logger.info(f"Drew table at ({pdf_x:.0f}, {pdf_y:.0f}) size {table_width:.0f}x{table_height:.0f} with {len(rows)} rows") except Exception as e: logger.warning(f"Failed to draw table region: {e}") import traceback traceback.print_exc() def draw_image_region( self, pdf_canvas: canvas.Canvas, region: Dict, page_height: float, result_dir: Path ): """ Draw an image region by embedding the extracted image Handles images extracted by PP-StructureV3 (tables, figures, charts, etc.) Args: pdf_canvas: ReportLab canvas object region: Image metadata dict with image_path and bbox page_height: Height of page (for coordinate transformation) result_dir: Directory containing result files """ try: image_path_str = region.get('image_path', '') if not image_path_str: return # Construct full path to image image_path = result_dir / image_path_str if not image_path.exists(): logger.warning(f"Image not found: {image_path}") return # Get bbox for positioning bbox = region.get('bbox', []) if not bbox or len(bbox) < 4: # If no bbox, skip for now logger.warning(f"No bbox for image {image_path_str}") return # bbox from OCR: [[x1,y1], [x2,y2], [x3,y3], [x4,y4]] # OCR coordinates: origin (0,0) at top-left, Y increases downward ocr_x_left = bbox[0][0] ocr_y_top = bbox[0][1] ocr_x_right = bbox[2][0] ocr_y_bottom = bbox[2][1] # Calculate bbox dimensions bbox_width = abs(ocr_x_right - ocr_x_left) bbox_height = abs(ocr_y_bottom - ocr_y_top) # Transform coordinates: OCR (top-left origin) → PDF (bottom-left origin) # CRITICAL: Y-axis flip! # For images, we position at bottom-left corner pdf_x_left = ocr_x_left pdf_y_bottom = page_height - ocr_y_bottom # Flip Y-axis # Draw image using ReportLab # drawImage expects: (path, x, y, width, height) # where (x, y) is the bottom-left corner of the image pdf_canvas.drawImage( str(image_path), pdf_x_left, pdf_y_bottom, width=bbox_width, height=bbox_height, preserveAspectRatio=True, mask='auto' # Handle transparency ) logger.info(f"Drew image: {image_path_str} at ({pdf_x_left:.0f}, {pdf_y_bottom:.0f}) size {bbox_width:.0f}x{bbox_height:.0f}") except Exception as e: logger.warning(f"Failed to draw image region: {e}") def generate_layout_pdf( self, json_path: Path, output_path: Path, source_file_path: Optional[Path] = None ) -> bool: """ Generate layout-preserving PDF from OCR JSON data Args: json_path: Path to OCR JSON file output_path: Path to save generated PDF source_file_path: Optional path to original source file for dimension extraction Returns: True if successful, False otherwise """ try: # Check if PDF already exists (caching) if output_path.exists(): logger.info(f"PDF already exists: {output_path.name}") return True # Load JSON data ocr_data = self.load_ocr_json(json_path) if not ocr_data: return False # Get text regions text_regions = ocr_data.get('text_regions', []) if not text_regions: logger.warning("No text regions found in JSON") return False # Get images metadata images_metadata = ocr_data.get('images_metadata', []) # Get layout data layout_data = ocr_data.get('layout_data', {}) # Determine page dimensions page_size = self.calculate_page_dimensions(text_regions, source_file_path) page_width, page_height = page_size # Create PDF canvas pdf_canvas = canvas.Canvas(str(output_path), pagesize=(page_width, page_height)) # Extract table bboxes to exclude text in those regions table_bboxes = [] for img_meta in images_metadata: img_path = img_meta.get('image_path', '') if 'table' in img_path.lower(): bbox = img_meta.get('bbox', []) if bbox and len(bbox) >= 4: table_bboxes.append(bbox) # Helper function to check if a point is inside a bbox def point_in_bbox(x, y, bbox): x1, y1 = bbox[0] x2, y2 = bbox[2] return min(x1, x2) <= x <= max(x1, x2) and min(y1, y2) <= y <= max(y1, y2) # Filter text regions to exclude those inside tables filtered_text_regions = [] for region in text_regions: bbox = region.get('bbox', []) if not bbox or len(bbox) < 4: continue # Check if text region center is inside any table bbox center_x = (bbox[0][0] + bbox[2][0]) / 2 center_y = (bbox[0][1] + bbox[2][1]) / 2 is_in_table = any(point_in_bbox(center_x, center_y, table_bbox) for table_bbox in table_bboxes) if not is_in_table: filtered_text_regions.append(region) else: logger.debug(f"Excluded text '{region.get('text', '')[:20]}...' (inside table)") logger.info(f"Filtered {len(text_regions) - len(filtered_text_regions)} text regions inside tables") # Group regions by page pages_data = {} for region in filtered_text_regions: page_num = region.get('page', 1) if page_num not in pages_data: pages_data[page_num] = [] pages_data[page_num].append(region) # Get table elements from layout_data table_elements = [] if layout_data and layout_data.get('elements'): table_elements = [e for e in layout_data['elements'] if e.get('type') == 'table'] # Process each page total_pages = ocr_data.get('total_pages', 1) for page_num in range(1, total_pages + 1): if page_num > 1: pdf_canvas.showPage() # Start new page # Draw text regions for this page (excluding table text) page_regions = pages_data.get(page_num, []) for region in page_regions: self.draw_text_region(pdf_canvas, region, page_height) # Draw tables for this page for table_elem in table_elements: if table_elem.get('page', 0) == page_num - 1: # page is 0-indexed self.draw_table_region(pdf_canvas, table_elem, images_metadata, page_height) # Draw non-table images for this page (figure, chart, seal, etc.) for img_meta in images_metadata: if img_meta.get('page') == page_num - 1: # page is 0-indexed img_path = img_meta.get('image_path', '') # Skip table images (they're now rendered as tables) if 'table' not in img_path.lower(): self.draw_image_region( pdf_canvas, img_meta, page_height, json_path.parent ) # Save PDF pdf_canvas.save() file_size = output_path.stat().st_size logger.info(f"Generated layout-preserving PDF: {output_path.name} ({file_size} bytes)") return True except Exception as e: logger.error(f"Failed to generate PDF: {e}") import traceback traceback.print_exc() return False # Singleton instance pdf_generator_service = PDFGeneratorService()