""" Layout-Preserving PDF Generation Service Generates PDF files that preserve the original document layout using OCR JSON data """ import json import logging import re from pathlib import Path from typing import Dict, List, Optional, Tuple, Union from datetime import datetime from reportlab.lib.pagesizes import A4, letter from reportlab.lib.units import mm from reportlab.pdfgen import canvas from reportlab.pdfbase import pdfmetrics from reportlab.pdfbase.ttfonts import TTFont from reportlab.platypus import Table, TableStyle, SimpleDocTemplate, Spacer from reportlab.platypus import Image as PlatypusImage from reportlab.lib import colors from reportlab.lib.enums import TA_CENTER, TA_LEFT, TA_RIGHT from reportlab.platypus import Paragraph from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle from PIL import Image from html.parser import HTMLParser from app.core.config import settings from app.utils.bbox_utils import normalize_bbox # Import table column corrector for column alignment fix try: from app.services.table_column_corrector import TableColumnCorrector TABLE_COLUMN_CORRECTOR_AVAILABLE = True except ImportError: TABLE_COLUMN_CORRECTOR_AVAILABLE = False TableColumnCorrector = None # Import text region renderer for simple text positioning try: from app.services.text_region_renderer import TextRegionRenderer, load_raw_ocr_regions TEXT_REGION_RENDERER_AVAILABLE = True except ImportError: TEXT_REGION_RENDERER_AVAILABLE = False TextRegionRenderer = None load_raw_ocr_regions = None # Import UnifiedDocument for dual-track support try: from app.models.unified_document import ( UnifiedDocument, DocumentElement, ElementType, BoundingBox, TableData, ProcessingTrack, DocumentMetadata, Dimensions, Page, StyleInfo ) UNIFIED_DOCUMENT_AVAILABLE = True except ImportError: UNIFIED_DOCUMENT_AVAILABLE = False UnifiedDocument = None logger = logging.getLogger(__name__) class HTMLTableParser(HTMLParser): """Parse HTML table to extract structure and data""" def __init__(self): super().__init__() self.tables = [] self.current_table = None self.current_row = None self.current_cell = None self.in_table = False def handle_starttag(self, tag, attrs): attrs_dict = dict(attrs) if tag == 'table': self.in_table = True self.current_table = {'rows': []} elif tag == 'tr' and self.in_table: self.current_row = {'cells': []} elif tag in ('td', 'th') and self.in_table and self.current_row is not None: colspan = int(attrs_dict.get('colspan', 1)) rowspan = int(attrs_dict.get('rowspan', 1)) self.current_cell = { 'text': '', 'is_header': tag == 'th', 'colspan': colspan, 'rowspan': rowspan } def handle_endtag(self, tag): if tag == 'table' and self.in_table: if self.current_table and self.current_table['rows']: self.tables.append(self.current_table) self.current_table = None self.in_table = False elif tag == 'tr' and self.current_row is not None: if self.current_table is not None: self.current_table['rows'].append(self.current_row) self.current_row = None elif tag in ('td', 'th') and self.current_cell is not None: if self.current_row is not None: self.current_row['cells'].append(self.current_cell) self.current_cell = None def handle_data(self, data): if self.current_cell is not None: self.current_cell['text'] += data.strip() + ' ' class PDFGeneratorService: """Service for generating layout-preserving PDFs from OCR JSON data""" # Font mapping from common fonts to PDF standard fonts FONT_MAPPING = { 'Arial': 'Helvetica', 'Arial Black': 'Helvetica-Bold', 'Times New Roman': 'Times-Roman', 'Times': 'Times-Roman', 'Courier New': 'Courier', 'Courier': 'Courier', 'Calibri': 'Helvetica', 'Cambria': 'Times-Roman', 'Georgia': 'Times-Roman', 'Verdana': 'Helvetica', 'Tahoma': 'Helvetica', 'Trebuchet MS': 'Helvetica', 'Comic Sans MS': 'Helvetica', 'Impact': 'Helvetica-Bold', 'Lucida Console': 'Courier', 'Palatino': 'Times-Roman', 'Garamond': 'Times-Roman', 'Bookman': 'Times-Roman', 'Century Gothic': 'Helvetica', 'Franklin Gothic': 'Helvetica', } # Style flags for text formatting STYLE_FLAG_BOLD = 1 STYLE_FLAG_ITALIC = 2 STYLE_FLAG_UNDERLINE = 4 STYLE_FLAG_STRIKETHROUGH = 8 def __init__(self): """Initialize PDF generator with font configuration""" self.font_name = 'NotoSansSC' self.font_path = None self.font_registered = False self.current_processing_track = None # Track type for current document self._register_chinese_font() def _register_chinese_font(self): """Register Chinese font for PDF generation""" try: # Get font path from settings font_path = Path(settings.chinese_font_path) # Try relative path from project root if not font_path.is_absolute(): # Adjust path - settings.chinese_font_path starts with ./backend/ project_root = Path(__file__).resolve().parent.parent.parent.parent font_path = project_root / font_path if not font_path.exists(): logger.error(f"Chinese font not found at {font_path}") return # Register font pdfmetrics.registerFont(TTFont(self.font_name, str(font_path))) self.font_path = font_path self.font_registered = True logger.info(f"Chinese font registered: {self.font_name} from {font_path}") except Exception as e: logger.error(f"Failed to register Chinese font: {e}") self.font_registered = False def _detect_content_orientation( self, page_width: float, page_height: float, ocr_data: Dict ) -> Tuple[bool, float, float]: """ Detect if content orientation differs from page dimensions. This handles cases where a document is scanned in portrait orientation but the actual content is landscape (or vice versa). PP-StructureV3 may return bounding boxes in the "corrected" orientation while the image remains in its scanned orientation. Args: page_width: Declared page width from image dimensions page_height: Declared page height from image dimensions ocr_data: OCR data dictionary containing bounding boxes Returns: Tuple of (needs_rotation, adjusted_width, adjusted_height) - needs_rotation: True if page orientation should be swapped - adjusted_width: Width to use for PDF page - adjusted_height: Height to use for PDF page """ # Find max content bounds from all regions max_x = 0 max_y = 0 all_regions = [] # Collect regions from various sources if 'text_regions' in ocr_data and isinstance(ocr_data['text_regions'], list): all_regions.extend(ocr_data['text_regions']) if 'layout_data' in ocr_data and isinstance(ocr_data['layout_data'], dict): elements = ocr_data['layout_data'].get('elements', []) if elements: all_regions.extend(elements) if 'images_metadata' in ocr_data and isinstance(ocr_data['images_metadata'], list): all_regions.extend(ocr_data['images_metadata']) for region in all_regions: try: bbox = region.get('bbox') if not bbox: continue # Handle different bbox formats if isinstance(bbox, dict): # BoundingBox object format max_x = max(max_x, float(bbox.get('x1', bbox.get('x0', 0) + bbox.get('width', 0)))) max_y = max(max_y, float(bbox.get('y1', bbox.get('y0', 0) + bbox.get('height', 0)))) elif isinstance(bbox, (list, tuple)): if len(bbox) >= 4 and isinstance(bbox[0], (int, float)): # [x1, y1, x2, y2] format max_x = max(max_x, float(bbox[2])) max_y = max(max_y, float(bbox[3])) elif isinstance(bbox[0], (list, tuple)): # Polygon format [[x, y], ...] x_coords = [p[0] for p in bbox if len(p) >= 2] y_coords = [p[1] for p in bbox if len(p) >= 2] if x_coords and y_coords: max_x = max(max_x, max(x_coords)) max_y = max(max_y, max(y_coords)) except Exception as e: logger.debug(f"Error processing bbox for orientation detection: {e}") continue if max_x == 0 or max_y == 0: # No valid bboxes found, use original dimensions return (False, page_width, page_height) logger.info(f"內容邊界偵測: max_x={max_x:.1f}, max_y={max_y:.1f}, " f"page_dims={page_width:.1f}x{page_height:.1f}") # Calculate how much content extends beyond page boundaries x_overflow = max_x / page_width if page_width > 0 else 1 y_overflow = max_y / page_height if page_height > 0 else 1 # Check if content significantly exceeds page dimensions in one direction # This suggests the content is in a different orientation OVERFLOW_THRESHOLD = 1.15 # Content extends >15% beyond declared dimensions if x_overflow > OVERFLOW_THRESHOLD and y_overflow <= 1.05: # Content is wider than page but fits in height # This suggests portrait image with landscape content logger.warning(f"偵測到內容方向可能與頁面不符: " f"x_overflow={x_overflow:.2f}, y_overflow={y_overflow:.2f}") # Check if swapping dimensions would help # If max_x fits better in page_height, swap if max_x <= page_height * 1.05: logger.info(f"建議頁面旋轉: {page_width:.1f}x{page_height:.1f} -> " f"{page_height:.1f}x{page_width:.1f}") return (True, page_height, page_width) else: # Content still doesn't fit, just scale to fit content logger.info(f"內容超出頁面邊界,調整頁面大小以容納內容") return (False, max_x * 1.02, page_height) elif y_overflow > OVERFLOW_THRESHOLD and x_overflow <= 1.05: # Content is taller than page but fits in width # Less common - landscape image with portrait content logger.warning(f"偵測到內容方向可能與頁面不符 (高度溢出): " f"x_overflow={x_overflow:.2f}, y_overflow={y_overflow:.2f}") if max_y <= page_width * 1.05: logger.info(f"建議頁面旋轉: {page_width:.1f}x{page_height:.1f} -> " f"{page_height:.1f}x{page_width:.1f}") return (True, page_height, page_width) else: logger.info(f"內容超出頁面邊界,調整頁面大小以容納內容") return (False, page_width, max_y * 1.02) # No orientation issue detected return (False, page_width, page_height) def _parse_color(self, color_value) -> Tuple[float, float, float]: """ Parse color value to RGB tuple. Args: color_value: Color as hex string (#RRGGBB), RGB tuple, or color name Returns: RGB tuple with values 0-1 for ReportLab """ if not color_value: return (0, 0, 0) # Default to black try: # Handle hex color (#RRGGBB or #RGB) if isinstance(color_value, str) and color_value.startswith('#'): hex_color = color_value.lstrip('#') # Expand short form (#RGB -> #RRGGBB) if len(hex_color) == 3: hex_color = ''.join([c*2 for c in hex_color]) if len(hex_color) == 6: r = int(hex_color[0:2], 16) / 255.0 g = int(hex_color[2:4], 16) / 255.0 b = int(hex_color[4:6], 16) / 255.0 return (r, g, b) # Handle RGB tuple or list elif isinstance(color_value, (tuple, list)) and len(color_value) >= 3: r, g, b = color_value[0:3] # Normalize to 0-1 if values are 0-255 if any(v > 1 for v in [r, g, b]): return (r/255.0, g/255.0, b/255.0) return (r, g, b) except (ValueError, TypeError) as e: logger.warning(f"Failed to parse color {color_value}: {e}") # Default to black return (0, 0, 0) def _map_font(self, font_name: Optional[str]) -> str: """ Map font name to PDF standard font. Args: font_name: Original font name Returns: PDF standard font name """ if not font_name: return 'Helvetica' # Direct lookup if font_name in self.FONT_MAPPING: return self.FONT_MAPPING[font_name] # Case-insensitive lookup font_lower = font_name.lower() for orig_font, pdf_font in self.FONT_MAPPING.items(): if orig_font.lower() == font_lower: return pdf_font # Partial match for common patterns if 'arial' in font_lower: return 'Helvetica' elif 'times' in font_lower: return 'Times-Roman' elif 'courier' in font_lower: return 'Courier' # Default fallback - use NotoSansSC for CJK support if registered if self.font_registered: logger.debug(f"Font '{font_name}' not found in mapping, using {self.font_name} for CJK support") return self.font_name else: logger.debug(f"Font '{font_name}' not found in mapping, using Helvetica") return 'Helvetica' def _apply_text_style(self, c: canvas.Canvas, style_info, default_size: float = 12): """ Apply text styling from StyleInfo to PDF canvas. Args: c: ReportLab canvas object style_info: StyleInfo object or dict with font, size, color, flags default_size: Default font size if not specified """ if not style_info: # Apply default styling c.setFont('Helvetica', default_size) c.setFillColorRGB(0, 0, 0) return try: # Extract style attributes if hasattr(style_info, '__dict__'): # StyleInfo object font_family = getattr(style_info, 'font_name', None) font_size = getattr(style_info, 'font_size', default_size) color = getattr(style_info, 'text_color', None) font_weight = getattr(style_info, 'font_weight', 'normal') font_style = getattr(style_info, 'font_style', 'normal') # Legacy flags support flags = getattr(style_info, 'flags', 0) elif isinstance(style_info, dict): # Dictionary font_family = style_info.get('font_name') font_size = style_info.get('font_size', default_size) color = style_info.get('text_color') font_weight = style_info.get('font_weight', 'normal') font_style = style_info.get('font_style', 'normal') # Legacy flags support flags = style_info.get('flags', 0) else: # Unknown format, use defaults c.setFont('Helvetica', default_size) c.setFillColorRGB(0, 0, 0) return # Map font name base_font = self._map_font(font_family) if font_family else 'Helvetica' # Determine bold and italic from font_weight/font_style (preferred) or flags (legacy) is_bold = font_weight == 'bold' if font_weight else bool(flags & self.STYLE_FLAG_BOLD) is_italic = font_style == 'italic' if font_style else bool(flags & self.STYLE_FLAG_ITALIC) # Apply bold/italic modifiers if is_bold or is_italic: if is_bold and is_italic: # Try bold-italic variant if 'Helvetica' in base_font: base_font = 'Helvetica-BoldOblique' elif 'Times' in base_font: base_font = 'Times-BoldItalic' elif 'Courier' in base_font: base_font = 'Courier-BoldOblique' elif is_bold: # Try bold variant if 'Helvetica' in base_font: base_font = 'Helvetica-Bold' elif 'Times' in base_font: base_font = 'Times-Bold' elif 'Courier' in base_font: base_font = 'Courier-Bold' elif is_italic: # Try italic variant if 'Helvetica' in base_font: base_font = 'Helvetica-Oblique' elif 'Times' in base_font: base_font = 'Times-Italic' elif 'Courier' in base_font: base_font = 'Courier-Oblique' # Apply font and size actual_size = font_size if font_size and font_size > 0 else default_size try: c.setFont(base_font, actual_size) except KeyError: # Font not available, fallback logger.warning(f"Font '{base_font}' not available, using Helvetica") c.setFont('Helvetica', actual_size) # Apply color rgb_color = None if hasattr(style_info, 'get_rgb_color'): # Use StyleInfo method if available rgb_color = style_info.get_rgb_color() elif color is not None: # Parse from extracted color value r, g, b = self._parse_color(color) rgb_color = (r, g, b) if rgb_color: # text_color is in 0-255 range, convert to 0-1 for ReportLab r, g, b = rgb_color if any(v > 1 for v in [r, g, b]): r, g, b = r/255.0, g/255.0, b/255.0 c.setFillColorRGB(r, g, b) else: c.setFillColorRGB(0, 0, 0) # Default black except Exception as e: logger.error(f"Failed to apply text style: {e}") # Fallback to defaults c.setFont('Helvetica', default_size) c.setFillColorRGB(0, 0, 0) def load_ocr_json(self, json_path: Path) -> Optional[Dict]: """ Load and parse OCR JSON result file Args: json_path: Path to JSON file Returns: Parsed JSON data or None if failed """ try: with open(json_path, 'r', encoding='utf-8') as f: data = json.load(f) logger.info(f"Loaded OCR JSON: {json_path.name}") return data except Exception as e: logger.error(f"Failed to load JSON {json_path}: {e}") return None def _get_image_path(self, element) -> Optional[str]: """ Get image path with fallback logic. Checks multiple locations in order: 1. element.content["saved_path"] - Direct track saved path 2. element.content["path"] - Legacy path 3. element.content["image_path"] - Alternative path 4. element.saved_path - Direct attribute 5. element.metadata["path"] - Metadata fallback Args: element: DocumentElement object Returns: Path to image file or None if not found """ # Check content dictionary if isinstance(element.content, dict): for key in ['saved_path', 'path', 'image_path']: if key in element.content: return element.content[key] # Check direct attribute if hasattr(element, 'saved_path') and element.saved_path: return element.saved_path # Check metadata if element.metadata and isinstance(element.metadata, dict): if 'path' in element.metadata: return element.metadata['path'] if 'saved_path' in element.metadata: return element.metadata['saved_path'] return None def convert_unified_document_to_ocr_data(self, unified_doc: 'UnifiedDocument') -> Dict: """ Convert UnifiedDocument to OCR data format for PDF generation. This method transforms the UnifiedDocument structure into the legacy OCR data format that the PDF generator expects, supporting both OCR and DIRECT processing tracks. Args: unified_doc: UnifiedDocument object from either processing track Returns: Dictionary in OCR data format with text_regions, images_metadata, layout_data """ text_regions = [] images_metadata = [] layout_elements = [] for page in unified_doc.pages: page_num = page.page_number # 1-based for element in page.elements: # Convert BoundingBox to polygon format [[x,y], [x,y], [x,y], [x,y]] bbox_polygon = [ [element.bbox.x0, element.bbox.y0], # top-left [element.bbox.x1, element.bbox.y0], # top-right [element.bbox.x1, element.bbox.y1], # bottom-right [element.bbox.x0, element.bbox.y1], # bottom-left ] # Handle text elements if element.is_text or element.type in [ ElementType.TEXT, ElementType.TITLE, ElementType.HEADER, ElementType.FOOTER, ElementType.PARAGRAPH, ElementType.CAPTION, ElementType.LIST_ITEM, ElementType.FOOTNOTE, ElementType.REFERENCE ]: text_content = element.get_text() if text_content: text_region = { 'text': text_content, 'bbox': bbox_polygon, 'confidence': element.confidence or 1.0, 'page': page_num, 'element_type': element.type.value # Include element type for styling } # Include style information if available (for Direct track) if hasattr(element, 'style') and element.style: text_region['style'] = element.style text_regions.append(text_region) # Handle table elements elif element.type == ElementType.TABLE: # Convert TableData to HTML for layout_data if isinstance(element.content, TableData): html_content = element.content.to_html() elif isinstance(element.content, dict): html_content = element.content.get('html', str(element.content)) else: html_content = str(element.content) table_element = { 'type': 'table', 'content': html_content, 'bbox': [element.bbox.x0, element.bbox.y0, element.bbox.x1, element.bbox.y1], 'page': page_num - 1, # layout uses 0-based 'element_id': element.element_id # For _use_border_only matching } # Preserve cell_boxes and embedded_images from metadata # These are extracted by PP-StructureV3 and used for accurate table rendering if element.metadata: if 'cell_boxes' in element.metadata: table_element['cell_boxes'] = element.metadata['cell_boxes'] table_element['cell_boxes_source'] = element.metadata.get('cell_boxes_source', 'metadata') if 'embedded_images' in element.metadata: table_element['embedded_images'] = element.metadata['embedded_images'] # Pass through rebuild flag - rebuilt tables should use HTML content if element.metadata.get('was_rebuilt'): table_element['was_rebuilt'] = True logger.debug(f"Table {element.element_id}: marked as rebuilt") layout_elements.append(table_element) # Add bbox to images_metadata for text overlap filtering # (no actual image file, just bbox for filtering) img_metadata = { 'image_path': None, # No fake table image 'bbox': bbox_polygon, 'page': page_num - 1, # 0-based for images_metadata 'type': 'table', 'element_id': element.element_id } # Also copy cell_boxes for quality checking if element.metadata and 'cell_boxes' in element.metadata: img_metadata['cell_boxes'] = element.metadata['cell_boxes'] # Mark if table was rebuilt if element.metadata and element.metadata.get('was_rebuilt'): img_metadata['was_rebuilt'] = True images_metadata.append(img_metadata) # Handle image/visual elements (including stamps/seals) elif element.is_visual or element.type in [ ElementType.IMAGE, ElementType.FIGURE, ElementType.CHART, ElementType.DIAGRAM, ElementType.LOGO, ElementType.STAMP ]: # Get image path using fallback logic image_path = self._get_image_path(element) # Only add if we found a valid path if image_path: images_metadata.append({ 'image_path': image_path, 'bbox': bbox_polygon, 'page': page_num - 1, # 0-based 'type': element.type.value }) logger.debug(f"Found image path: {image_path} for element {element.element_id}") else: logger.warning(f"No image path found for visual element {element.element_id}") # Build page dimensions mapping for multi-page support page_dimensions = {} for page in unified_doc.pages: page_dimensions[page.page_number - 1] = { # 0-based index 'width': page.dimensions.width, 'height': page.dimensions.height } # Build OCR data structure ocr_data = { 'text_regions': text_regions, 'images_metadata': images_metadata, 'layout_data': { 'elements': layout_elements, 'total_elements': len(layout_elements) }, 'total_pages': unified_doc.page_count, 'ocr_dimensions': { 'width': unified_doc.pages[0].dimensions.width if unified_doc.pages else 0, 'height': unified_doc.pages[0].dimensions.height if unified_doc.pages else 0 }, 'page_dimensions': page_dimensions, # Per-page dimensions for multi-page support # Metadata for tracking '_from_unified_document': True, '_processing_track': unified_doc.metadata.processing_track.value } logger.info(f"Converted UnifiedDocument to OCR data: " f"{len(text_regions)} text regions, " f"{len(images_metadata)} images, " f"{len(layout_elements)} layout elements, " f"track={unified_doc.metadata.processing_track.value}") return ocr_data def generate_from_unified_document( self, unified_doc: 'UnifiedDocument', output_path: Path, source_file_path: Optional[Path] = None ) -> bool: """ Generate layout-preserving PDF directly from UnifiedDocument. This method supports both OCR and DIRECT processing tracks, preserving layout and coordinate information from either source. Args: unified_doc: UnifiedDocument object output_path: Path to save generated PDF source_file_path: Optional path to original source file Returns: True if successful, False otherwise """ if not UNIFIED_DOCUMENT_AVAILABLE: logger.error("UnifiedDocument support not available") return False try: # Detect processing track for track-specific rendering processing_track = None if hasattr(unified_doc, 'metadata') and unified_doc.metadata: if hasattr(unified_doc.metadata, 'processing_track'): processing_track = unified_doc.metadata.processing_track elif isinstance(unified_doc.metadata, dict): processing_track = unified_doc.metadata.get('processing_track') # Route to track-specific rendering method # ProcessingTrack is (str, Enum), so comparing with enum value works for both string and enum # HYBRID track uses Direct track rendering (Direct text/tables + OCR images) is_direct_track = (processing_track == ProcessingTrack.DIRECT or processing_track == ProcessingTrack.HYBRID) logger.info(f"Processing track: {processing_track}, using {'Direct' if is_direct_track else 'OCR'} track rendering") if is_direct_track: # Direct track: Rich formatting preservation return self._generate_direct_track_pdf( unified_doc=unified_doc, output_path=output_path, source_file_path=source_file_path ) else: # OCR track: Simplified rendering (backward compatible) return self._generate_ocr_track_pdf( unified_doc=unified_doc, output_path=output_path, source_file_path=source_file_path ) except Exception as e: logger.error(f"Failed to generate PDF from UnifiedDocument: {e}") import traceback traceback.print_exc() return False def _is_element_inside_regions(self, element_bbox, regions_elements, overlap_threshold=0.5) -> bool: """ Check if an element overlaps significantly with any exclusion region (table, image). This prevents duplicate rendering when text overlaps with tables/images. Direct extraction often extracts both the structured element (table/image) AND its text content as separate text blocks. Uses overlap ratio detection instead of strict containment, since text blocks from DirectExtractionEngine may be larger than detected table/image regions (e.g., text block includes heading above table). Args: element_bbox: BBox of the element to check regions_elements: List of region elements (tables, images) to check against overlap_threshold: Minimum overlap percentage to trigger filtering (default 0.5 = 50%) Returns: True if element overlaps ≥50% with any region, False otherwise """ if not element_bbox: return False e_x0, e_y0, e_x1, e_y1 = element_bbox.x0, element_bbox.y0, element_bbox.x1, element_bbox.y1 elem_area = (e_x1 - e_x0) * (e_y1 - e_y0) if elem_area <= 0: return False for region in regions_elements: r_bbox = region.bbox if not r_bbox: continue # Calculate overlap rectangle overlap_x0 = max(e_x0, r_bbox.x0) overlap_y0 = max(e_y0, r_bbox.y0) overlap_x1 = min(e_x1, r_bbox.x1) overlap_y1 = min(e_y1, r_bbox.y1) # Check if there is any overlap if overlap_x0 < overlap_x1 and overlap_y0 < overlap_y1: # Calculate overlap area overlap_area = (overlap_x1 - overlap_x0) * (overlap_y1 - overlap_y0) overlap_ratio = overlap_area / elem_area # If element overlaps more than threshold, filter it out if overlap_ratio >= overlap_threshold: return True return False def _generate_direct_track_pdf( self, unified_doc: 'UnifiedDocument', output_path: Path, source_file_path: Optional[Path] = None ) -> bool: """ Generate PDF with rich formatting preservation for Direct track. This method processes UnifiedDocument directly without converting to legacy OCR format, preserving StyleInfo and applying proper text formatting including line breaks. Args: unified_doc: UnifiedDocument from Direct extraction output_path: Path to save generated PDF source_file_path: Optional path to original source file Returns: True if successful, False otherwise """ try: logger.info("=== Direct Track PDF Generation ===") logger.info(f"Total pages: {len(unified_doc.pages)}") # Set current track for helper methods (may be DIRECT or HYBRID) if hasattr(unified_doc, 'metadata') and unified_doc.metadata: self.current_processing_track = unified_doc.metadata.processing_track else: self.current_processing_track = ProcessingTrack.DIRECT # Get page dimensions from first page (for canvas initialization) if not unified_doc.pages: logger.error("No pages in document") return False first_page = unified_doc.pages[0] page_width = first_page.dimensions.width page_height = first_page.dimensions.height logger.info(f"First page dimensions: {page_width} x {page_height}") # Create PDF canvas with first page dimensions (will be updated per page) from reportlab.pdfgen import canvas pdf_canvas = canvas.Canvas(str(output_path), pagesize=(page_width, page_height)) # Process each page for page_idx, page in enumerate(unified_doc.pages): logger.info(f">>> Processing page {page_idx + 1}/{len(unified_doc.pages)}") # Get current page dimensions current_page_width = page.dimensions.width current_page_height = page.dimensions.height logger.info(f"Page {page_idx + 1} dimensions: {current_page_width} x {current_page_height}") if page_idx > 0: pdf_canvas.showPage() # Set page size for current page pdf_canvas.setPageSize((current_page_width, current_page_height)) # Separate elements by type text_elements = [] table_elements = [] image_elements = [] list_elements = [] # FIX: Collect exclusion regions (tables, images) to prevent duplicate rendering regions_to_avoid = [] # Calculate page area for background detection page_area = current_page_width * current_page_height for element in page.elements: if element.type == ElementType.TABLE: table_elements.append(element) regions_to_avoid.append(element) # Tables are exclusion regions elif element.is_visual or element.type in [ ElementType.IMAGE, ElementType.FIGURE, ElementType.CHART, ElementType.DIAGRAM, ElementType.LOGO, ElementType.STAMP ]: # Skip large vector_graphics charts in Direct track # These are visual decorations (borders, lines, frames) that would cover text # PyMuPDF extracts both vector graphics as images AND text layer separately if element.type == ElementType.CHART and element.bbox: content = element.content is_vector_graphics = ( isinstance(content, dict) and content.get('source') == 'vector_graphics' ) if is_vector_graphics: elem_area = (element.bbox.x1 - element.bbox.x0) * (element.bbox.y1 - element.bbox.y0) coverage_ratio = elem_area / page_area if page_area > 0 else 0 if coverage_ratio > 0.5: logger.info(f"Skipping large vector_graphics chart {element.element_id} " f"(covers {coverage_ratio*100:.1f}% of page) - text provides actual content") continue image_elements.append(element) # Only add real images to exclusion regions, NOT charts/diagrams # Charts often have large bounding boxes that include text labels # which should be rendered as selectable text on top if element.type in [ElementType.IMAGE, ElementType.FIGURE, ElementType.LOGO, ElementType.STAMP]: # Check if this is Direct track (text from PDF text layer, not OCR) is_direct = (self.current_processing_track == ProcessingTrack.DIRECT or self.current_processing_track == ProcessingTrack.HYBRID) if is_direct: # Direct track: text is from PDF text layer, not OCR'd from images # Don't exclude any images - text should be rendered on top # This is critical for Office documents with background images logger.debug(f"Direct track: not excluding {element.element_id} from text regions") continue # OCR track: Skip full-page background images from exclusion regions # Smaller images that might contain OCR'd text should still be excluded if element.bbox: elem_area = (element.bbox.x1 - element.bbox.x0) * (element.bbox.y1 - element.bbox.y0) coverage_ratio = elem_area / page_area if page_area > 0 else 0 # If image covers >70% of page, it's likely a background - don't exclude text if coverage_ratio > 0.7: logger.debug(f"OCR track: skipping background image {element.element_id} from exclusion " f"(covers {coverage_ratio*100:.1f}% of page)") continue regions_to_avoid.append(element) elif element.type == ElementType.LIST_ITEM: list_elements.append(element) elif self._is_list_item_fallback(element): # Fallback detection: Check metadata and text patterns list_elements.append(element) # Mark as list item for downstream processing element.type = ElementType.LIST_ITEM elif element.is_text or element.type in [ ElementType.TEXT, ElementType.TITLE, ElementType.HEADER, ElementType.FOOTER, ElementType.PARAGRAPH, ElementType.FOOTNOTE, ElementType.REFERENCE, ElementType.EQUATION, ElementType.CAPTION ]: text_elements.append(element) logger.info(f"Page {page_idx + 1}: {len(text_elements)} text, " f"{len(table_elements)} tables, {len(image_elements)} images, " f"{len(list_elements)} list items") # Use original element order from extraction engine # The extraction engine has already sorted elements by reading order, # handling multi-column layouts correctly (top-to-bottom, left-to-right) all_elements = [] # Preserve original order by iterating through page.elements for elem in page.elements: if elem in image_elements: all_elements.append(('image', elem)) elif elem in table_elements: all_elements.append(('table', elem)) elif elem in list_elements: all_elements.append(('list', elem)) elif elem in text_elements: all_elements.append(('text', elem)) logger.debug(f"Drawing {len(all_elements)} elements in extraction order (preserves multi-column reading order)") logger.debug(f"Exclusion regions: {len(regions_to_avoid)} (tables/images/charts)") # Debug: Log exclusion region types region_types = {} for region in regions_to_avoid: region_type = region.type.name region_types[region_type] = region_types.get(region_type, 0) + 1 if region_types: logger.debug(f" Exclusion region breakdown: {region_types}") # Draw elements in document order for elem_type, elem in all_elements: if elem_type == 'image': self._draw_image_element_direct(pdf_canvas, elem, current_page_height, output_path.parent) elif elem_type == 'table': self._draw_table_element_direct(pdf_canvas, elem, current_page_height) elif elem_type == 'list': # FIX: Check if list item overlaps with table/image if not self._is_element_inside_regions(elem.bbox, regions_to_avoid): self._draw_text_element_direct(pdf_canvas, elem, current_page_height) else: logger.debug(f"Skipping list element {elem.element_id} inside table/image region") elif elem_type == 'text': # FIX: Check if text overlaps with table/image before drawing if not self._is_element_inside_regions(elem.bbox, regions_to_avoid): self._draw_text_element_direct(pdf_canvas, elem, current_page_height) else: logger.debug(f"Skipping text element {elem.element_id} inside table/image region") # Save PDF pdf_canvas.save() logger.info(f"Direct track PDF saved to {output_path}") # Reset track self.current_processing_track = None return True except Exception as e: logger.error(f"Failed to generate Direct track PDF: {e}") import traceback traceback.print_exc() self.current_processing_track = None return False def _generate_ocr_track_pdf( self, unified_doc: 'UnifiedDocument', output_path: Path, source_file_path: Optional[Path] = None ) -> bool: """ Generate PDF with simplified rendering for OCR track. This method uses the existing OCR data conversion and rendering pipeline for backward compatibility. Args: unified_doc: UnifiedDocument from OCR processing output_path: Path to save generated PDF source_file_path: Optional path to original source file Returns: True if successful, False otherwise """ try: logger.info("=== OCR Track PDF Generation ===") # Set current track self.current_processing_track = 'ocr' # Check if simple text positioning mode is enabled if (settings.simple_text_positioning_enabled and TEXT_REGION_RENDERER_AVAILABLE): logger.info("Using simple text positioning mode") result = self._generate_simple_text_pdf( unified_doc=unified_doc, output_path=output_path, source_file_path=source_file_path ) else: # Convert UnifiedDocument to OCR data format (legacy) ocr_data = self.convert_unified_document_to_ocr_data(unified_doc) # Use existing generation pipeline result = self._generate_pdf_from_data( ocr_data=ocr_data, output_path=output_path, source_file_path=source_file_path ) # Reset track self.current_processing_track = None return result except Exception as e: logger.error(f"Failed to generate OCR track PDF: {e}") import traceback traceback.print_exc() self.current_processing_track = None return False def _generate_simple_text_pdf( self, unified_doc: 'UnifiedDocument', output_path: Path, source_file_path: Optional[Path] = None ) -> bool: """ Generate PDF using simple text positioning from raw OCR regions. This approach bypasses complex table structure reconstruction and renders raw OCR text directly at detected positions with rotation correction. Images, charts, figures, seals, and formulas are still rendered normally. Args: unified_doc: UnifiedDocument from OCR processing output_path: Path to save generated PDF source_file_path: Optional path to original source file Returns: True if successful, False otherwise """ try: logger.info("=== Simple Text Positioning PDF Generation ===") # Initialize text region renderer text_renderer = TextRegionRenderer( font_name=self.font_name, debug=settings.simple_text_positioning_debug ) # Get result directory from output_path result_dir = output_path.parent # Try to determine task_id from result directory or output filename # Output path is typically: result_dir/task_id_edited.pdf task_id = None if output_path.stem.endswith('_edited'): task_id = output_path.stem.replace('_edited', '') elif result_dir.name: # result_dir is typically the task_id directory task_id = result_dir.name if not task_id: logger.warning("Could not determine task_id, falling back to legacy method") ocr_data = self.convert_unified_document_to_ocr_data(unified_doc) return self._generate_pdf_from_data( ocr_data=ocr_data, output_path=output_path, source_file_path=source_file_path ) logger.info(f"Task ID: {task_id}, Result dir: {result_dir}") # Get total pages from UnifiedDocument total_pages = len(unified_doc.pages) if unified_doc.pages else 1 # Get page dimensions from first page (for canvas initialization) if not unified_doc.pages: logger.error("No pages in document") return False first_page = unified_doc.pages[0] if hasattr(first_page, 'dimensions') and first_page.dimensions: page_width = float(first_page.dimensions.width) page_height = float(first_page.dimensions.height) else: # Fallback to default size page_width = 612.0 # Letter width page_height = 792.0 # Letter height logger.warning(f"No page dimensions found, using default {page_width}x{page_height}") logger.info(f"Initial page size: {page_width:.1f} x {page_height:.1f}") # Create PDF canvas pdf_canvas = canvas.Canvas(str(output_path), pagesize=(page_width, page_height)) # Collect image-type elements from UnifiedDocument for rendering # Types that should be rendered as images: figure, image, chart, seal, formula image_element_types = {'figure', 'image', 'chart', 'seal', 'formula'} # Process each page for page_num in range(1, total_pages + 1): logger.info(f">>> Processing page {page_num}/{total_pages}") # Get page dimensions for current page if page_num <= len(unified_doc.pages): current_page = unified_doc.pages[page_num - 1] if hasattr(current_page, 'dimensions') and current_page.dimensions: current_width = float(current_page.dimensions.width) current_height = float(current_page.dimensions.height) else: current_width = page_width current_height = page_height else: current_width = page_width current_height = page_height if page_num > 1: pdf_canvas.showPage() # Set page size pdf_canvas.setPageSize((current_width, current_height)) # === Layer 1: Render images, charts, figures, seals, formulas === # Also collect exclusion zones for text avoidance exclusion_zones = [] # List of (x0, y0, x1, y1) tuples if page_num <= len(unified_doc.pages): current_page = unified_doc.pages[page_num - 1] page_elements = current_page.elements if hasattr(current_page, 'elements') else [] image_elements_rendered = 0 for elem in page_elements: elem_type = elem.type if hasattr(elem, 'type') else elem.get('type', '') # Handle enum type if hasattr(elem_type, 'value'): elem_type = elem_type.value if elem_type in image_element_types: # Get image path from element content content = elem.content if hasattr(elem, 'content') else elem.get('content', {}) if isinstance(content, dict): saved_path = content.get('saved_path') or content.get('path') else: saved_path = None # Get bbox for exclusion zone (even if image file not found) bbox = elem.bbox if hasattr(elem, 'bbox') else elem.get('bbox', {}) if hasattr(bbox, 'x0'): x0, y0, x1, y1 = bbox.x0, bbox.y0, bbox.x1, bbox.y1 elif isinstance(bbox, dict): x0 = bbox.get('x0', 0) y0 = bbox.get('y0', 0) x1 = bbox.get('x1', x0 + bbox.get('width', 0)) y1 = bbox.get('y1', y0 + bbox.get('height', 0)) else: continue # Add to exclusion zones for text avoidance # Use original image coordinates (not PDF flipped) exclusion_zones.append((x0, y0, x1, y1)) if saved_path: # Try to find the image file image_path = result_dir / saved_path if not image_path.exists(): # Try in imgs subdirectory image_path = result_dir / 'imgs' / saved_path if not image_path.exists(): # Try just the filename image_path = result_dir / Path(saved_path).name if image_path.exists(): try: # Convert coordinates (flip Y for PDF) pdf_x = x0 pdf_y = current_height - y1 # Bottom of image in PDF coords img_width = x1 - x0 img_height = y1 - y0 # Draw image pdf_canvas.drawImage( str(image_path), pdf_x, pdf_y, width=img_width, height=img_height, preserveAspectRatio=True, mask='auto' ) image_elements_rendered += 1 logger.debug(f"Rendered {elem_type}: {saved_path} at ({pdf_x:.1f}, {pdf_y:.1f})") except Exception as e: logger.warning(f"Failed to render {elem_type} {saved_path}: {e}") else: logger.warning(f"Image file not found: {saved_path}") # Also check for embedded images in table elements # These are images detected inside table regions by PP-Structure elif elem_type == 'table': metadata = elem.metadata if hasattr(elem, 'metadata') else elem.get('metadata', {}) embedded_images = metadata.get('embedded_images', []) if metadata else [] for emb_img in embedded_images: emb_bbox = emb_img.get('bbox', []) if emb_bbox and len(emb_bbox) >= 4: ex0, ey0, ex1, ey1 = emb_bbox[0], emb_bbox[1], emb_bbox[2], emb_bbox[3] exclusion_zones.append((ex0, ey0, ex1, ey1)) # Also render the embedded image saved_path = emb_img.get('saved_path', '') if saved_path: image_path = result_dir / saved_path if not image_path.exists(): image_path = result_dir / Path(saved_path).name if image_path.exists(): try: pdf_x = ex0 pdf_y = current_height - ey1 img_width = ex1 - ex0 img_height = ey1 - ey0 pdf_canvas.drawImage( str(image_path), pdf_x, pdf_y, width=img_width, height=img_height, preserveAspectRatio=True, mask='auto' ) image_elements_rendered += 1 logger.debug(f"Rendered embedded image: {saved_path} at ({pdf_x:.1f}, {pdf_y:.1f})") except Exception as e: logger.warning(f"Failed to render embedded image {saved_path}: {e}") if image_elements_rendered > 0: logger.info(f"Rendered {image_elements_rendered} image elements (figures/charts/seals/formulas/embedded)") if exclusion_zones: logger.info(f"Collected {len(exclusion_zones)} exclusion zones for text avoidance") # === Layer 2: Render text from raw OCR regions === raw_regions = load_raw_ocr_regions(str(result_dir), task_id, page_num) if not raw_regions: logger.warning(f"No raw OCR regions found for page {page_num}") else: logger.info(f"Loaded {len(raw_regions)} raw OCR regions for page {page_num}") # Collect texts inside exclusion zones for position-aware deduplication # This prevents duplicate axis labels from being rendered near charts zone_texts = None if exclusion_zones: zone_texts = text_renderer.collect_zone_texts( raw_regions, exclusion_zones, threshold=0.5, include_axis_labels=True ) if zone_texts: logger.info(f"Collected {len(zone_texts)} zone texts for deduplication: {list(zone_texts)[:10]}...") # Render all text regions, avoiding exclusion zones (images/charts) # Scale factors are 1.0 since OCR dimensions match page dimensions rendered = text_renderer.render_all_regions( pdf_canvas=pdf_canvas, regions=raw_regions, page_height=current_height, scale_x=1.0, scale_y=1.0, exclusion_zones=exclusion_zones, zone_texts=zone_texts ) logger.info(f"Rendered {rendered} text regions") logger.info(f"<<< Page {page_num} complete") # Save PDF pdf_canvas.save() file_size = output_path.stat().st_size logger.info(f"Generated PDF: {output_path.name} ({file_size} bytes)") return True except Exception as e: logger.error(f"Failed to generate simple text PDF: {e}") import traceback traceback.print_exc() return False def _generate_pdf_from_data( self, ocr_data: Dict, output_path: Path, source_file_path: Optional[Path] = None, json_parent_dir: Optional[Path] = None ) -> bool: """ Internal method to generate PDF from OCR data dictionary. This is the core generation logic extracted for reuse by both JSON-based and UnifiedDocument-based generation paths. Args: ocr_data: OCR data dictionary output_path: Path to save generated PDF source_file_path: Optional path to original source file json_parent_dir: Directory containing images (for JSON-based generation) Returns: True if successful, False otherwise """ try: # Note: Removed PDF caching - always regenerate to ensure latest code changes take effect # If caching is needed, implement at a higher level with proper cache invalidation # Get text regions text_regions = ocr_data.get('text_regions', []) if not text_regions: logger.warning("No text regions found in data") # Don't fail - might have only tables/images # Get images metadata images_metadata = ocr_data.get('images_metadata', []) # Get layout data layout_data = ocr_data.get('layout_data', {}) # Step 1: Get OCR processing dimensions (for first page / default) ocr_width, ocr_height = self.calculate_page_dimensions(ocr_data, source_file_path=None) logger.info(f"OCR 處理時使用的座標系尺寸 (第一頁): {ocr_width:.1f} x {ocr_height:.1f}") # Step 2: Get page dimensions mapping for multi-page support page_dimensions = ocr_data.get('page_dimensions', {}) if not page_dimensions: # Fallback: use first page dimensions for all pages page_dimensions = {0: {'width': ocr_width, 'height': ocr_height}} logger.info("No page_dimensions found, using first page size for all pages") # Step 3: Get original file dimensions for all pages # For OCR track, we use OCR coordinate system dimensions directly to avoid scaling issues original_page_sizes = {} use_ocr_dimensions_for_pdf = (self.current_processing_track == 'ocr') if use_ocr_dimensions_for_pdf: # OCR Track: Use OCR coordinate system dimensions directly # This ensures no scaling is needed (scale = 1.0) logger.info(f"OCR Track: 使用 OCR 座標系尺寸作為 PDF 頁面尺寸(避免縮放)") elif source_file_path: original_page_sizes = self.get_all_page_sizes(source_file_path) if original_page_sizes: logger.info(f"從原始文件獲取到 {len(original_page_sizes)} 頁尺寸") else: logger.warning(f"無法獲取原始文件尺寸,將使用 OCR/UnifiedDocument 尺寸") else: logger.info(f"無原始文件,將使用 OCR/UnifiedDocument 尺寸") # Determine initial canvas size (will be updated per page) # Priority for OCR track: OCR dimensions (no scaling) # Priority for Direct track: original file first page > OCR/UnifiedDocument first page if use_ocr_dimensions_for_pdf: target_width, target_height = ocr_width, ocr_height logger.info(f"初始 PDF 尺寸(OCR Track, 使用 OCR 座標系): {target_width:.1f} x {target_height:.1f}") elif 0 in original_page_sizes: target_width, target_height = original_page_sizes[0] logger.info(f"初始 PDF 尺寸(來自原始文件首頁): {target_width:.1f} x {target_height:.1f}") else: target_width, target_height = ocr_width, ocr_height logger.info(f"初始 PDF 尺寸(來自 OCR/UnifiedDocument): {target_width:.1f} x {target_height:.1f}") # Step 4: Detect content orientation mismatch # This handles rotated scans where content bbox exceeds page dimensions # IMPORTANT: Use OCR dimensions (pixels) for detection, not PDF points # because content bboxes are in the same coordinate system as OCR dimensions needs_rotation, adjusted_ocr_width, adjusted_ocr_height = self._detect_content_orientation( ocr_width, ocr_height, ocr_data ) # If orientation change detected, calculate the adjusted target dimensions if needs_rotation: # Swap target dimensions to match the detected orientation adjusted_width = target_height adjusted_height = target_width elif adjusted_ocr_width != ocr_width or adjusted_ocr_height != ocr_height: # Content extends beyond OCR dimensions, scale target proportionally scale_w = adjusted_ocr_width / ocr_width if ocr_width > 0 else 1.0 scale_h = adjusted_ocr_height / ocr_height if ocr_height > 0 else 1.0 adjusted_width = target_width * scale_w adjusted_height = target_height * scale_h else: adjusted_width = target_width adjusted_height = target_height if needs_rotation or (adjusted_width != target_width or adjusted_height != target_height): logger.info(f"頁面尺寸調整: {target_width:.1f}x{target_height:.1f} -> " f"{adjusted_width:.1f}x{adjusted_height:.1f} (旋轉={needs_rotation})") target_width, target_height = adjusted_width, adjusted_height # Update original_page_sizes with the new TARGET dimensions if 0 in original_page_sizes: original_page_sizes[0] = (target_width, target_height) logger.info(f"覆蓋原始文件尺寸以適應內容方向") # CRITICAL: Update page_dimensions with SWAPPED OCR dimensions # This is the coordinate system that the content bboxes are in # When content is rotated, width and height are effectively swapped if needs_rotation and 0 in page_dimensions: # Swap the OCR dimensions to match the rotated content coordinate system original_ocr_w = page_dimensions[0]['width'] original_ocr_h = page_dimensions[0]['height'] page_dimensions[0] = {'width': original_ocr_h, 'height': original_ocr_w} logger.info(f"旋轉 OCR 座標系: {original_ocr_w:.1f}x{original_ocr_h:.1f} -> " f"{original_ocr_h:.1f}x{original_ocr_w:.1f}") # Create PDF canvas with initial page size (will be updated per page) pdf_canvas = canvas.Canvas(str(output_path), pagesize=(target_width, target_height)) # Smart filtering: only include tables with good cell_boxes quality in regions_to_avoid # Tables with bad cell_boxes will use raw OCR text positioning instead # Exception: Rebuilt tables always use HTML content and filter text regions_to_avoid = [] good_quality_tables = [] bad_quality_tables = [] rebuilt_tables = [] for img in images_metadata: if img.get('type') == 'table': elem_id = img.get('element_id', 'unknown') # Check if this table was rebuilt - rebuilt tables have good content was_rebuilt = img.get('was_rebuilt', False) if was_rebuilt: # Rebuilt tables have accurate content - filter text, use HTML regions_to_avoid.append(img) rebuilt_tables.append(elem_id) else: # Check cell_boxes quality for non-rebuilt tables cell_boxes = img.get('cell_boxes', []) quality = self._check_cell_boxes_quality(cell_boxes, elem_id) if quality == 'good': # Good quality: filter text, render with cell_boxes regions_to_avoid.append(img) good_quality_tables.append(elem_id) else: # Bad quality: don't filter text, just draw border bad_quality_tables.append(elem_id) img['_use_border_only'] = True # Mark for border-only rendering else: # Non-table elements (images, figures, charts) always avoid regions_to_avoid.append(img) logger.info(f"過濾文字區域: {len(regions_to_avoid)} 個區域需要避免") if rebuilt_tables: logger.info(f" 重建表格用 HTML: {rebuilt_tables}") if good_quality_tables: logger.info(f" 表格用 cell_boxes: {good_quality_tables}") if bad_quality_tables: logger.info(f" 表格用 raw OCR text (border only): {bad_quality_tables}") filtered_text_regions = self._filter_text_in_regions(text_regions, regions_to_avoid) # Group regions by page pages_data = {} for region in filtered_text_regions: page_num = region.get('page', 1) if page_num not in pages_data: pages_data[page_num] = [] pages_data[page_num].append(region) # Get table elements from layout_data and copy _use_border_only flags table_elements = [] if layout_data and layout_data.get('elements'): # Create a lookup for _use_border_only flags from images_metadata border_only_tables = {img.get('element_id') for img in images_metadata if img.get('type') == 'table' and img.get('_use_border_only')} logger.debug(f"[DEBUG] border_only_tables from images_metadata: {border_only_tables}") for e in layout_data['elements']: if e.get('type') == 'table': elem_id = e.get('element_id') logger.debug(f"[DEBUG] layout_data table element_id: {elem_id}") # Copy the flag if this table should use border only if elem_id in border_only_tables: e['_use_border_only'] = True logger.info(f"[DEBUG] Set _use_border_only=True for table {elem_id}") table_elements.append(e) # Process each page total_pages = ocr_data.get('total_pages', 1) logger.info(f"開始處理 {total_pages} 頁 PDF") # Determine image directory if json_parent_dir is None: json_parent_dir = output_path.parent for page_num in range(1, total_pages + 1): logger.info(f">>> 處理第 {page_num}/{total_pages} 頁") # Get current page dimensions with priority order: # For OCR Track: always use OCR dimensions (scale = 1.0) # For Direct Track: # 1. Original file dimensions (highest priority) # 2. OCR/UnifiedDocument dimensions # 3. Fallback to first page dimensions page_idx = page_num - 1 dimension_source = "unknown" # For OCR Track: always use OCR dimensions if use_ocr_dimensions_for_pdf and page_idx in page_dimensions: current_page_dims = page_dimensions[page_idx] current_target_w = float(current_page_dims['width']) current_target_h = float(current_page_dims['height']) dimension_source = "ocr_track_direct" # Priority 1: Original file dimensions (Direct Track only) elif page_idx in original_page_sizes: current_target_w, current_target_h = original_page_sizes[page_idx] dimension_source = "original_file" # Priority 2: OCR/UnifiedDocument dimensions (which may have been adjusted for orientation) elif page_idx in page_dimensions: current_page_dims = page_dimensions[page_idx] current_target_w = float(current_page_dims['width']) current_target_h = float(current_page_dims['height']) dimension_source = "ocr_unified_doc" # Priority 3: Fallback to first page else: current_target_w = ocr_width current_target_h = ocr_height dimension_source = "fallback_first_page" logger.warning(f"No dimensions for page {page_num}, using first page size") # For pages after the first, check if orientation adjustment is needed # (First page was already handled above) if page_num > 1 and dimension_source == "original_file": # Build per-page data for orientation detection page_ocr_data = { 'text_regions': [r for r in text_regions if r.get('page', 1) == page_num], 'layout_data': { 'elements': [e for e in layout_data.get('elements', []) if e.get('page', 0) == page_idx] }, 'images_metadata': [i for i in images_metadata if i.get('page', 0) == page_idx] } needs_page_rotation, adj_w, adj_h = self._detect_content_orientation( current_target_w, current_target_h, page_ocr_data ) if needs_page_rotation or (adj_w != current_target_w or adj_h != current_target_h): logger.info(f"第 {page_num} 頁尺寸調整: " f"{current_target_w:.1f}x{current_target_h:.1f} -> " f"{adj_w:.1f}x{adj_h:.1f}") current_target_w, current_target_h = adj_w, adj_h # Calculate scale factors for coordinate transformation # OCR coordinates need to be scaled if original file dimensions differ if dimension_source == "original_file": # Get OCR dimensions for this page to calculate scale if page_idx in page_dimensions: ocr_page_w = float(page_dimensions[page_idx]['width']) ocr_page_h = float(page_dimensions[page_idx]['height']) else: ocr_page_w = ocr_width ocr_page_h = ocr_height current_scale_w = current_target_w / ocr_page_w if ocr_page_w > 0 else 1.0 current_scale_h = current_target_h / ocr_page_h if ocr_page_h > 0 else 1.0 else: # Using OCR/UnifiedDocument dimensions directly, no scaling needed current_scale_w = 1.0 current_scale_h = 1.0 logger.info(f"第 {page_num} 頁尺寸: {current_target_w:.1f} x {current_target_h:.1f} " f"(來源: {dimension_source}, 縮放: {current_scale_w:.3f}x{current_scale_h:.3f})") if page_num > 1: pdf_canvas.showPage() # Set page size for current page pdf_canvas.setPageSize((current_target_w, current_target_h)) # Get regions for this page page_text_regions = pages_data.get(page_num, []) page_table_regions = [t for t in table_elements if t.get('page') == page_num - 1] page_image_regions = [ img for img in images_metadata if img.get('page') == page_num - 1 and img.get('type') != 'table' and img.get('image_path') is not None # Skip table placeholders ] # Draw in layers: images → tables → text # 1. Draw images (bottom layer) for img_meta in page_image_regions: self.draw_image_region( pdf_canvas, img_meta, current_target_h, json_parent_dir, current_scale_w, current_scale_h ) # 2. Draw tables (middle layer) for table_elem in page_table_regions: self.draw_table_region( pdf_canvas, table_elem, images_metadata, current_target_h, current_scale_w, current_scale_h, result_dir=json_parent_dir ) # 3. Draw text (top layer) for region in page_text_regions: self.draw_text_region( pdf_canvas, region, current_target_h, current_scale_w, current_scale_h ) logger.info(f"<<< 第 {page_num} 頁完成") # Save PDF pdf_canvas.save() file_size = output_path.stat().st_size logger.info(f"Generated PDF: {output_path.name} ({file_size} bytes)") return True except Exception as e: logger.error(f"Failed to generate PDF: {e}") import traceback traceback.print_exc() return False def calculate_page_dimensions(self, ocr_data: Dict, source_file_path: Optional[Path] = None) -> Tuple[float, float]: """ 從 OCR JSON 數據中取得頁面尺寸。 優先使用明確的 dimensions 欄位,失敗時才回退到 bbox 推斷。 Args: ocr_data: Complete OCR data dictionary with text_regions and layout source_file_path: Optional path to source file (fallback only) Returns: Tuple of (width, height) in points """ # *** 優先級 1: 檢查 ocr_dimensions (UnifiedDocument 轉換來的) *** if 'ocr_dimensions' in ocr_data: dims = ocr_data['ocr_dimensions'] # Handle both dict format {'width': w, 'height': h} and # list format [{'page': 1, 'width': w, 'height': h}, ...] if isinstance(dims, list) and len(dims) > 0: dims = dims[0] # Use first page dimensions if isinstance(dims, dict): w = float(dims.get('width', 0)) h = float(dims.get('height', 0)) if w > 0 and h > 0: logger.info(f"使用 ocr_dimensions 欄位的頁面尺寸: {w:.1f} x {h:.1f}") return (w, h) # *** 優先級 2: 檢查原始 JSON 的 dimensions *** if 'dimensions' in ocr_data: dims = ocr_data['dimensions'] w = float(dims.get('width', 0)) h = float(dims.get('height', 0)) if w > 0 and h > 0: logger.info(f"使用 dimensions 欄位的頁面尺寸: {w:.1f} x {h:.1f}") return (w, h) # *** 優先級 3: Fallback - 從 bbox 推斷 (僅當上述皆缺失時使用) *** logger.info("dimensions 欄位不可用,回退到 bbox 推斷") max_x = 0 max_y = 0 # *** 關鍵修復:檢查所有可能包含 bbox 的字段 *** # 不同版本的 OCR 輸出可能使用不同的字段名 all_regions = [] # 1. text_regions - 包含所有文字區域(最常見) if 'text_regions' in ocr_data and isinstance(ocr_data['text_regions'], list): all_regions.extend(ocr_data['text_regions']) # 2. image_regions - 包含圖片區域 if 'image_regions' in ocr_data and isinstance(ocr_data['image_regions'], list): all_regions.extend(ocr_data['image_regions']) # 3. tables - 包含表格區域 if 'tables' in ocr_data and isinstance(ocr_data['tables'], list): all_regions.extend(ocr_data['tables']) # 4. layout - 可能包含布局信息(可能是空列表) if 'layout' in ocr_data and isinstance(ocr_data['layout'], list): all_regions.extend(ocr_data['layout']) # 5. layout_data.elements - PP-StructureV3 格式 if 'layout_data' in ocr_data and isinstance(ocr_data['layout_data'], dict): elements = ocr_data['layout_data'].get('elements', []) if elements: all_regions.extend(elements) if not all_regions: # 如果 JSON 為空,回退到原始檔案尺寸 logger.warning("JSON 中沒有找到 text_regions, image_regions, tables, layout 或 layout_data.elements,回退到原始檔案尺寸。") if source_file_path: dims = self.get_original_page_size(source_file_path) if dims: return dims return A4 region_count = 0 for region in all_regions: try: bbox = region.get('bbox') if not bbox: continue region_count += 1 # *** 關鍵修復:正確處理多邊形 [[x, y], ...] 格式 *** if isinstance(bbox[0], (int, float)): # 處理簡單的 [x1, y1, x2, y2] 格式 max_x = max(max_x, bbox[2]) max_y = max(max_y, bbox[3]) elif isinstance(bbox[0], (list, tuple)): # 處理多邊形 [[x, y], ...] 格式 x_coords = [p[0] for p in bbox if isinstance(p, (list, tuple)) and len(p) >= 2] y_coords = [p[1] for p in bbox if isinstance(p, (list, tuple)) and len(p) >= 2] if x_coords and y_coords: max_x = max(max_x, max(x_coords)) max_y = max(max_y, max(y_coords)) except Exception as e: logger.warning(f"Error processing bbox {bbox}: {e}") if max_x > 0 and max_y > 0: logger.info(f"從 {region_count} 個區域中推斷出的 OCR 座標系尺寸: {max_x:.1f} x {max_y:.1f}") return (max_x, max_y) else: # 如果所有 bbox 都解析失敗,才回退 logger.warning("無法從 bbox 推斷尺寸,回退到原始檔案尺寸。") if source_file_path: dims = self.get_original_page_size(source_file_path) if dims: return dims return A4 def get_all_page_sizes(self, file_path: Path) -> Dict[int, Tuple[float, float]]: """ Extract dimensions for all pages from original source file Args: file_path: Path to original file (image or PDF) Returns: Dict mapping page index (0-based) to (width, height) in points Empty dict if extraction fails """ page_sizes = {} try: if not file_path.exists(): logger.warning(f"File not found: {file_path}") return page_sizes # For images, single page with dimensions from PIL if file_path.suffix.lower() in ['.png', '.jpg', '.jpeg', '.bmp', '.tiff']: img = Image.open(file_path) # Use pixel dimensions directly as points (1:1 mapping) # This matches how PaddleOCR reports coordinates width_pt = float(img.width) height_pt = float(img.height) page_sizes[0] = (width_pt, height_pt) logger.info(f"Extracted dimensions from image: {width_pt:.1f} x {height_pt:.1f} points (1:1 pixel mapping)") return page_sizes # For PDFs, extract dimensions for all pages using PyPDF2 if file_path.suffix.lower() == '.pdf': try: from PyPDF2 import PdfReader reader = PdfReader(file_path) total_pages = len(reader.pages) for page_idx in range(total_pages): page = reader.pages[page_idx] # MediaBox gives [x1, y1, x2, y2] in points mediabox = page.mediabox width_pt = float(mediabox.width) height_pt = float(mediabox.height) # IMPORTANT: Consider page rotation! # PDF pages can have /Rotate attribute (0, 90, 180, 270) # When rotation is 90 or 270 degrees, width and height should be swapped # because pdf2image and PDF viewers apply this rotation when rendering rotation = page.get('/Rotate', 0) if rotation is None: rotation = 0 rotation = int(rotation) % 360 if rotation in (90, 270): # Swap width and height for 90/270 degree rotation width_pt, height_pt = height_pt, width_pt logger.info(f"Page {page_idx}: Rotation={rotation}°, swapped dimensions to {width_pt:.1f} x {height_pt:.1f}") page_sizes[page_idx] = (width_pt, height_pt) logger.info(f"Extracted dimensions from PDF: {total_pages} pages") for idx, (w, h) in page_sizes.items(): logger.debug(f" Page {idx}: {w:.1f} x {h:.1f} points") return page_sizes except ImportError: logger.warning("PyPDF2 not available, cannot extract PDF dimensions") except Exception as e: logger.warning(f"Failed to extract PDF dimensions: {e}") except Exception as e: logger.warning(f"Failed to get page sizes from {file_path}: {e}") return page_sizes def get_original_page_size(self, file_path: Path) -> Optional[Tuple[float, float]]: """ Extract first page dimensions from original source file (backward compatibility) Args: file_path: Path to original file (image or PDF) Returns: Tuple of (width, height) in points or None """ page_sizes = self.get_all_page_sizes(file_path) if 0 in page_sizes: return page_sizes[0] return None def _get_bbox_coords(self, bbox: Union[Dict, List[List[float]], List[float]]) -> Optional[Tuple[float, float, float, float]]: """將任何 bbox 格式 (dict, 多邊形或 [x1,y1,x2,y2]) 轉換為 [x_min, y_min, x_max, y_max]. Uses shared bbox utility.""" return normalize_bbox(bbox) def _is_bbox_inside(self, inner_bbox_data: Dict, outer_bbox_data: Dict, tolerance: float = 5.0) -> bool: """ 檢查 'inner_bbox' 是否在 'outer_bbox' 內部(帶有容錯)。 此版本可處理多邊形和矩形。 """ inner_coords = self._get_bbox_coords(inner_bbox_data.get('bbox')) outer_coords = self._get_bbox_coords(outer_bbox_data.get('bbox')) if not inner_coords or not outer_coords: return False inner_x1, inner_y1, inner_x2, inner_y2 = inner_coords outer_x1, outer_y1, outer_x2, outer_y2 = outer_coords # 檢查 inner 是否在 outer 內部 (加入 tolerance) is_inside = ( (inner_x1 >= outer_x1 - tolerance) and (inner_y1 >= outer_y1 - tolerance) and (inner_x2 <= outer_x2 + tolerance) and (inner_y2 <= outer_y2 + tolerance) ) return is_inside def _bbox_overlaps(self, bbox1_data: Dict, bbox2_data: Dict, tolerance: float = 5.0) -> bool: """ 檢查兩個 bbox 是否有重疊(帶有容錯)。 如果有任何重疊,返回 True。 Args: bbox1_data: 第一個 bbox 數據 bbox2_data: 第二個 bbox 數據 tolerance: 容錯值(像素) Returns: True 如果兩個 bbox 有重疊 """ coords1 = self._get_bbox_coords(bbox1_data.get('bbox')) coords2 = self._get_bbox_coords(bbox2_data.get('bbox')) if not coords1 or not coords2: return False x1_min, y1_min, x1_max, y1_max = coords1 x2_min, y2_min, x2_max, y2_max = coords2 # 擴展 bbox2(表格/圖片區域)的範圍 x2_min -= tolerance y2_min -= tolerance x2_max += tolerance y2_max += tolerance # 檢查是否有重疊:如果沒有重疊,則必定滿足以下條件之一 no_overlap = ( x1_max < x2_min or # bbox1 在 bbox2 左側 x1_min > x2_max or # bbox1 在 bbox2 右側 y1_max < y2_min or # bbox1 在 bbox2 上方 y1_min > y2_max # bbox1 在 bbox2 下方 ) return not no_overlap def _calculate_overlap_ratio(self, text_bbox_data: Dict, avoid_bbox_data: Dict) -> float: """ 計算文字區域與避免區域的重疊比例。 Args: text_bbox_data: 文字區域 bbox 數據 avoid_bbox_data: 避免區域 bbox 數據 Returns: 重疊面積佔文字區域面積的比例 (0.0 - 1.0) """ text_coords = self._get_bbox_coords(text_bbox_data.get('bbox')) avoid_coords = self._get_bbox_coords(avoid_bbox_data.get('bbox')) if not text_coords or not avoid_coords: return 0.0 tx0, ty0, tx1, ty1 = text_coords ax0, ay0, ax1, ay1 = avoid_coords # Calculate text area text_area = (tx1 - tx0) * (ty1 - ty0) if text_area <= 0: return 0.0 # Calculate intersection inter_x0 = max(tx0, ax0) inter_y0 = max(ty0, ay0) inter_x1 = min(tx1, ax1) inter_y1 = min(ty1, ay1) # Check if there's actual intersection if inter_x1 <= inter_x0 or inter_y1 <= inter_y0: return 0.0 inter_area = (inter_x1 - inter_x0) * (inter_y1 - inter_y0) return inter_area / text_area def _filter_text_in_regions(self, text_regions: List[Dict], regions_to_avoid: List[Dict], overlap_threshold: float = 0.5) -> List[Dict]: """ 過濾掉與 'regions_to_avoid'(例如表格、圖片)顯著重疊的文字區域。 使用重疊比例閾值來判斷是否過濾,避免過濾掉僅相鄰但不重疊的文字。 Args: text_regions: 文字區域列表 regions_to_avoid: 需要避免的區域列表(表格、圖片) overlap_threshold: 重疊比例閾值 (0.0-1.0),只有當文字區域 與避免區域的重疊比例超過此閾值時才會被過濾 預設 0.5 表示超過 50% 重疊才過濾 Returns: 過濾後的文字區域列表 """ filtered_text = [] filtered_count = 0 for text_region in text_regions: should_filter = False max_overlap = 0.0 for avoid_region in regions_to_avoid: # 計算重疊比例 overlap_ratio = self._calculate_overlap_ratio(text_region, avoid_region) max_overlap = max(max_overlap, overlap_ratio) # 只有當重疊比例超過閾值時才過濾 if overlap_ratio > overlap_threshold: should_filter = True filtered_count += 1 logger.debug(f"過濾掉重疊文字 (重疊比例: {overlap_ratio:.1%}): {text_region.get('text', '')[:30]}...") break if not should_filter: filtered_text.append(text_region) if max_overlap > 0: logger.debug(f"保留文字 (最大重疊比例: {max_overlap:.1%}): {text_region.get('text', '')[:30]}...") logger.info(f"原始文字區域: {len(text_regions)}, 過濾後: {len(filtered_text)}, 移除: {filtered_count}") return filtered_text def draw_text_region( self, pdf_canvas: canvas.Canvas, region: Dict, page_height: float, scale_w: float = 1.0, scale_h: float = 1.0 ): """ Draw a text region at precise coordinates Args: pdf_canvas: ReportLab canvas object region: Text region dict with text, bbox, confidence page_height: Height of page (for coordinate transformation) scale_w: Scale factor for X coordinates (PDF width / OCR width) scale_h: Scale factor for Y coordinates (PDF height / OCR height) """ text = region.get('text', '') bbox = region.get('bbox', []) confidence = region.get('confidence', 1.0) if not text or not bbox: return try: # Handle different bbox formats if isinstance(bbox, dict): # Dict format from UnifiedDocument: {"x0": ..., "y0": ..., "x1": ..., "y1": ...} if 'x0' in bbox and 'y0' in bbox and 'x1' in bbox and 'y1' in bbox: ocr_x_left = float(bbox['x0']) ocr_y_top = float(bbox['y0']) ocr_x_right = float(bbox['x1']) ocr_y_bottom = float(bbox['y1']) else: logger.warning(f"Dict bbox missing required keys: {bbox}") return elif isinstance(bbox, list): if len(bbox) < 4: return # Polygon format [[x,y], [x,y], [x,y], [x,y]] (4 points) if isinstance(bbox[0], list): ocr_x_left = bbox[0][0] # Left X ocr_y_top = bbox[0][1] # Top Y in OCR coordinates ocr_x_right = bbox[2][0] # Right X ocr_y_bottom = bbox[2][1] # Bottom Y in OCR coordinates # Simple list format [x0, y0, x1, y1] elif isinstance(bbox[0], (int, float)): ocr_x_left = bbox[0] ocr_y_top = bbox[1] ocr_x_right = bbox[2] ocr_y_bottom = bbox[3] else: logger.warning(f"Unexpected bbox list format: {bbox}") return else: logger.warning(f"Invalid bbox format: {bbox}") return logger.info(f"[文字] '{text[:20]}...' OCR原始座標: L={ocr_x_left:.0f}, T={ocr_y_top:.0f}, R={ocr_x_right:.0f}, B={ocr_y_bottom:.0f}") # Apply scale factors to convert from OCR space to PDF space scaled_x_left = ocr_x_left * scale_w scaled_y_top = ocr_y_top * scale_h scaled_x_right = ocr_x_right * scale_w scaled_y_bottom = ocr_y_bottom * scale_h logger.info(f"[文字] '{text[:20]}...' 縮放後(scale={scale_w:.3f},{scale_h:.3f}): L={scaled_x_left:.1f}, T={scaled_y_top:.1f}, R={scaled_x_right:.1f}, B={scaled_y_bottom:.1f}") # Calculate bbox dimensions (after scaling) bbox_width = abs(scaled_x_right - scaled_x_left) bbox_height = abs(scaled_y_bottom - scaled_y_top) # Calculate font size using heuristics # For multi-line text, divide bbox height by number of lines lines = text.split('\n') non_empty_lines = [l for l in lines if l.strip()] num_lines = max(len(non_empty_lines), 1) # Font size calculation with stabilization # Use 0.8 factor to leave room for line spacing raw_font_size = (bbox_height / num_lines) * 0.8 # Stabilize font size for body text (most common case) # Normal body text should be 9-11pt, only deviate for clear outliers element_type = region.get('element_type', 'text') if element_type in ('text', 'paragraph'): # For body text, bias toward 10pt baseline if 7 <= raw_font_size <= 14: # Near-normal range: use weighted average toward 10pt font_size = raw_font_size * 0.7 + 10 * 0.3 else: # Clear outlier: use raw but clamp more aggressively font_size = max(min(raw_font_size, 14), 7) else: # For titles/headers/etc, use raw calculation with wider range font_size = max(min(raw_font_size, 72), 4) logger.debug(f"Text has {num_lines} non-empty lines, bbox_height={bbox_height:.1f}, raw={raw_font_size:.1f}, final={font_size:.1f}") # Transform coordinates: OCR (top-left origin) → PDF (bottom-left origin) # CRITICAL: Y-axis flip! # For multi-line text, start from TOP of bbox and go downward pdf_x = scaled_x_left pdf_y_top = page_height - scaled_y_top # Top of bbox in PDF coordinates # Adjust for font baseline: first line starts below the top edge pdf_y = pdf_y_top - font_size # Start first line one font size below top logger.info(f"[文字] '{text[:30]}' → PDF位置: ({pdf_x:.1f}, {pdf_y:.1f}), 字體:{font_size:.1f}pt, 寬x高:{bbox_width:.0f}x{bbox_height:.0f}, 行數:{num_lines}") # Set font with track-specific styling style_info = region.get('style') element_type = region.get('element_type', 'text') is_direct_track = (self.current_processing_track == ProcessingTrack.DIRECT or self.current_processing_track == ProcessingTrack.HYBRID) if style_info and is_direct_track: # Direct track: Apply rich styling from StyleInfo self._apply_text_style(pdf_canvas, style_info, default_size=font_size) # Get current font for width calculation font_name = pdf_canvas._fontname font_size = pdf_canvas._fontsize logger.debug(f"Applied Direct track style: font={font_name}, size={font_size}") else: # OCR track or no style: Use simple font selection with element-type based styling font_name = self.font_name if self.font_registered else 'Helvetica' # Apply element-type specific styling (for OCR track) if element_type == 'title': # Titles: use larger, bold font font_size = min(font_size * 1.3, 36) # 30% larger, max 36pt pdf_canvas.setFont(font_name, font_size) logger.debug(f"Applied title style: size={font_size:.1f}") elif element_type == 'header': # Headers: slightly larger font_size = min(font_size * 1.15, 24) # 15% larger, max 24pt pdf_canvas.setFont(font_name, font_size) elif element_type == 'caption': # Captions: slightly smaller, italic if available font_size = max(font_size * 0.9, 6) # 10% smaller, min 6pt pdf_canvas.setFont(font_name, font_size) else: pdf_canvas.setFont(font_name, font_size) # Handle line breaks (split text by newlines) # OCR track: simple left-aligned rendering # Note: non_empty_lines was already calculated above for font sizing line_height = font_size * 1.2 # 120% of font size for line spacing # Draw each non-empty line (using proper line index for positioning) for i, line in enumerate(non_empty_lines): line_y = pdf_y - (i * line_height) # Calculate text width to prevent overflow text_width = pdf_canvas.stringWidth(line, font_name, font_size) # If text is too wide for bbox, scale down font for this line current_font_size = font_size if text_width > bbox_width: scale_factor = bbox_width / text_width current_font_size = font_size * scale_factor * 0.95 # 95% to add small margin current_font_size = max(current_font_size, 3) # Minimum 3pt pdf_canvas.setFont(font_name, current_font_size) # Draw text at left-aligned position (OCR track uses simple left alignment) pdf_canvas.drawString(pdf_x, line_y, line) # Reset font size for next line if text_width > bbox_width: pdf_canvas.setFont(font_name, font_size) # Debug: Draw bounding box (optional) if settings.pdf_enable_bbox_debug: pdf_canvas.setStrokeColorRGB(1, 0, 0, 0.3) # Red, semi-transparent pdf_canvas.setLineWidth(0.5) # Use already-extracted coordinates (works for all bbox formats) # Draw rectangle using the scaled coordinates pdf_x1 = ocr_x_left * scale_w pdf_y1 = page_height - ocr_y_top * scale_h pdf_x2 = ocr_x_right * scale_w pdf_y2 = page_height - ocr_y_bottom * scale_h # Draw bbox rectangle pdf_canvas.line(pdf_x1, pdf_y1, pdf_x2, pdf_y1) # top pdf_canvas.line(pdf_x2, pdf_y1, pdf_x2, pdf_y2) # right pdf_canvas.line(pdf_x2, pdf_y2, pdf_x1, pdf_y2) # bottom pdf_canvas.line(pdf_x1, pdf_y2, pdf_x1, pdf_y1) # left except Exception as e: logger.warning(f"Failed to draw text region '{text[:20]}...': {e}") def _compute_table_grid_from_cell_boxes( self, cell_boxes: List[List[float]], table_bbox: List[float], num_rows: int, num_cols: int ) -> Tuple[Optional[List[float]], Optional[List[float]]]: """ Compute column widths and row heights from cell bounding boxes. This uses the cell boxes extracted by SLANeXt to calculate the actual column widths and row heights, which provides more accurate table rendering than uniform distribution. Args: cell_boxes: List of cell bboxes [[x1,y1,x2,y2], ...] table_bbox: Table bounding box [x1,y1,x2,y2] num_rows: Number of rows in the table num_cols: Number of columns in the table Returns: Tuple of (col_widths, row_heights) or (None, None) if calculation fails """ if not cell_boxes or len(cell_boxes) < 2: return None, None try: table_x1, table_y1, table_x2, table_y2 = table_bbox table_width = table_x2 - table_x1 table_height = table_y2 - table_y1 # Collect all unique X and Y boundaries from cell boxes x_boundaries = set() y_boundaries = set() for box in cell_boxes: if len(box) >= 4: x1, y1, x2, y2 = box[:4] # Convert to relative coordinates within table x_boundaries.add(x1 - table_x1) x_boundaries.add(x2 - table_x1) y_boundaries.add(y1 - table_y1) y_boundaries.add(y2 - table_y1) # Sort boundaries x_boundaries = sorted(x_boundaries) y_boundaries = sorted(y_boundaries) # Ensure we have boundaries at table edges if x_boundaries and x_boundaries[0] > 5: x_boundaries.insert(0, 0) if x_boundaries and x_boundaries[-1] < table_width - 5: x_boundaries.append(table_width) if y_boundaries and y_boundaries[0] > 5: y_boundaries.insert(0, 0) if y_boundaries and y_boundaries[-1] < table_height - 5: y_boundaries.append(table_height) # Calculate column widths from X boundaries # Merge boundaries that are too close (< 5px) merged_x = [x_boundaries[0]] if x_boundaries else [] for x in x_boundaries[1:]: if x - merged_x[-1] > 5: merged_x.append(x) x_boundaries = merged_x # Calculate row heights from Y boundaries merged_y = [y_boundaries[0]] if y_boundaries else [] for y in y_boundaries[1:]: if y - merged_y[-1] > 5: merged_y.append(y) y_boundaries = merged_y # Calculate widths and heights col_widths = [] for i in range(len(x_boundaries) - 1): col_widths.append(x_boundaries[i + 1] - x_boundaries[i]) row_heights = [] for i in range(len(y_boundaries) - 1): row_heights.append(y_boundaries[i + 1] - y_boundaries[i]) # Validate: number of columns/rows should match expected if len(col_widths) == num_cols and len(row_heights) == num_rows: logger.info(f"[TABLE] Cell boxes grid: {num_cols} cols, {num_rows} rows") logger.debug(f"[TABLE] Col widths from cell_boxes: {[f'{w:.1f}' for w in col_widths]}") logger.debug(f"[TABLE] Row heights from cell_boxes: {[f'{h:.1f}' for h in row_heights]}") return col_widths, row_heights else: # Grid doesn't match, might be due to merged cells logger.debug( f"[TABLE] Cell boxes grid mismatch: " f"got {len(col_widths)}x{len(row_heights)}, expected {num_cols}x{num_rows}" ) # Still return the widths/heights if counts are close if abs(len(col_widths) - num_cols) <= 1 and abs(len(row_heights) - num_rows) <= 1: # Adjust to match expected count while len(col_widths) < num_cols: col_widths.append(col_widths[-1] if col_widths else table_width / num_cols) while len(col_widths) > num_cols: col_widths.pop() while len(row_heights) < num_rows: row_heights.append(row_heights[-1] if row_heights else table_height / num_rows) while len(row_heights) > num_rows: row_heights.pop() return col_widths, row_heights return None, None except Exception as e: logger.warning(f"[TABLE] Failed to compute grid from cell boxes: {e}") return None, None def draw_table_region( self, pdf_canvas: canvas.Canvas, table_element: Dict, images_metadata: List[Dict], page_height: float, scale_w: float = 1.0, scale_h: float = 1.0, result_dir: Optional[Path] = None ): """ Draw a table region by parsing HTML and rebuilding with ReportLab Table Args: pdf_canvas: ReportLab canvas object table_element: Table element dict with HTML content images_metadata: List of image metadata to find table bbox page_height: Height of page scale_w: Scale factor for X coordinates (PDF width / OCR width) scale_h: Scale factor for Y coordinates (PDF height / OCR height) result_dir: Directory containing result files (for embedded images) """ try: elem_id = table_element.get('element_id', 'unknown') use_border_only = table_element.get('_use_border_only', False) logger.info(f"[DEBUG] draw_table_region: elem_id={elem_id}, _use_border_only={use_border_only}") html_content = table_element.get('content', '') if not html_content: # Even without HTML, draw border if requested if use_border_only: self._draw_table_border_only(pdf_canvas, table_element, page_height, scale_w, scale_h) return # Apply column correction if enabled cell_boxes = table_element.get('cell_boxes', []) if (settings.table_column_correction_enabled and TABLE_COLUMN_CORRECTOR_AVAILABLE and cell_boxes): try: corrector = TableColumnCorrector( correction_threshold=settings.table_column_correction_threshold, vertical_merge_enabled=settings.vertical_fragment_merge_enabled, vertical_aspect_ratio=settings.vertical_fragment_aspect_ratio ) # Get table bbox for vertical fragment detection table_bbox = table_element.get('bbox', []) if isinstance(table_bbox, dict): table_bbox = [table_bbox['x0'], table_bbox['y0'], table_bbox['x1'], table_bbox['y1']] corrected_html, stats = corrector.correct( html=html_content, cell_boxes=cell_boxes, table_bbox=table_bbox if isinstance(table_bbox, list) and len(table_bbox) >= 4 else None ) if stats.get('column_corrections', 0) > 0: logger.info(f"[TABLE] {elem_id}: Column correction applied - {stats}") html_content = corrected_html except Exception as e: logger.warning(f"[TABLE] {elem_id}: Column correction failed: {e}, using original HTML") # Parse HTML first to get table structure for grid validation parser = HTMLTableParser() parser.feed(html_content) if not parser.tables: logger.warning("No tables found in HTML content") return # Get the first table (PP-StructureV3 usually provides one table per element) table_data = parser.tables[0] rows = table_data['rows'] if not rows: return # Calculate number of rows and columns from HTML for grid validation num_rows = len(rows) max_cols = 0 for row in rows: row_cols = sum(cell.get('colspan', 1) for cell in row['cells']) max_cols = max(max_cols, row_cols) # Check if table was rebuilt - if so, use HTML content directly was_rebuilt = table_element.get('was_rebuilt', False) cell_boxes_rendered = False # Track if we rendered borders with cell_boxes if was_rebuilt: logger.info(f"[TABLE] {elem_id}: Table was rebuilt, using HTML content directly") elif use_border_only: # Bad quality cell_boxes: skip cell_boxes rendering, use ReportLab Table with borders logger.info(f"[TABLE] {elem_id}: Bad cell_boxes quality, using ReportLab Table with borders") else: # Check if cell_boxes can produce a valid grid before rendering borders cell_boxes = table_element.get('cell_boxes', []) if cell_boxes: # Get table bbox for grid calculation temp_bbox = table_element.get('bbox', []) if isinstance(temp_bbox, dict): raw_bbox = [temp_bbox['x0'], temp_bbox['y0'], temp_bbox['x1'], temp_bbox['y1']] elif isinstance(temp_bbox, list) and len(temp_bbox) >= 4: if isinstance(temp_bbox[0], (int, float)): raw_bbox = temp_bbox[:4] else: raw_bbox = [temp_bbox[0][0], temp_bbox[0][1], temp_bbox[2][0], temp_bbox[2][1]] else: raw_bbox = None # Pre-check: can we compute a valid grid from cell_boxes? if raw_bbox: test_col_widths, test_row_heights = self._compute_table_grid_from_cell_boxes( cell_boxes, raw_bbox, num_rows, max_cols ) grid_valid = test_col_widths is not None and test_row_heights is not None if grid_valid: logger.info(f"[TABLE] Grid validation passed, rendering borders with cell_boxes") success = self._draw_table_with_cell_boxes( pdf_canvas, table_element, page_height, scale_w, scale_h, result_dir ) if success: cell_boxes_rendered = True logger.info("[TABLE] cell_boxes rendered borders, continuing with text-only ReportLab Table") else: logger.info("[TABLE] cell_boxes rendering failed, using ReportLab Table with borders") else: logger.info(f"[TABLE] Grid validation failed (mismatch), using ReportLab Table with borders") else: logger.info("[TABLE] No valid bbox for grid validation, using ReportLab Table with borders") # Get bbox directly from table element table_bbox = table_element.get('bbox') # If no bbox directly, check for bbox_polygon if not table_bbox: bbox_polygon = table_element.get('bbox_polygon') if bbox_polygon and len(bbox_polygon) >= 4: # Convert polygon format to simple bbox [x0, y0, x1, y1] table_bbox = [ bbox_polygon[0][0], # x0 bbox_polygon[0][1], # y0 bbox_polygon[2][0], # x1 bbox_polygon[2][1] # y1 ] if not table_bbox: logger.warning(f"No bbox found for table element") return # Handle different bbox formats if isinstance(table_bbox, dict): # Dict format from UnifiedDocument: {"x0": ..., "y0": ..., "x1": ..., "y1": ...} if 'x0' in table_bbox and 'y0' in table_bbox and 'x1' in table_bbox and 'y1' in table_bbox: ocr_x_left_raw = float(table_bbox['x0']) ocr_y_top_raw = float(table_bbox['y0']) ocr_x_right_raw = float(table_bbox['x1']) ocr_y_bottom_raw = float(table_bbox['y1']) else: logger.error(f"Dict bbox missing required keys (x0, y0, x1, y1): {table_bbox}") return elif isinstance(table_bbox, list) and len(table_bbox) == 4: # Simple bbox format [x0, y0, x1, y1] if isinstance(table_bbox[0], (int, float)): ocr_x_left_raw = table_bbox[0] ocr_y_top_raw = table_bbox[1] ocr_x_right_raw = table_bbox[2] ocr_y_bottom_raw = table_bbox[3] # Polygon format [[x,y], [x,y], [x,y], [x,y]] elif isinstance(table_bbox[0], list): ocr_x_left_raw = table_bbox[0][0] ocr_y_top_raw = table_bbox[0][1] ocr_x_right_raw = table_bbox[2][0] ocr_y_bottom_raw = table_bbox[2][1] else: logger.error(f"Unexpected bbox format: {table_bbox}") return else: logger.error(f"Invalid table_bbox format: {table_bbox}") return logger.info(f"[表格] OCR原始座標: L={ocr_x_left_raw:.0f}, T={ocr_y_top_raw:.0f}, R={ocr_x_right_raw:.0f}, B={ocr_y_bottom_raw:.0f}") # Apply scaling ocr_x_left = ocr_x_left_raw * scale_w ocr_y_top = ocr_y_top_raw * scale_h ocr_x_right = ocr_x_right_raw * scale_w ocr_y_bottom = ocr_y_bottom_raw * scale_h table_width = abs(ocr_x_right - ocr_x_left) table_height = abs(ocr_y_bottom - ocr_y_top) # Transform coordinates pdf_x = ocr_x_left pdf_y = page_height - ocr_y_bottom # Build table data for ReportLab with proper colspan/rowspan handling # num_rows and max_cols already calculated above for grid validation logger.info(f"[表格] {num_rows}行x{max_cols}列 → PDF位置: ({pdf_x:.1f}, {pdf_y:.1f}), 寬x高: {table_width:.0f}x{table_height:.0f}") # Create a grid to track occupied cells (for rowspan handling) # occupied[row][col] = True if cell is occupied by a span from above occupied = [[False] * max_cols for _ in range(num_rows)] # Build the 2D data array and collect span commands reportlab_data = [] span_commands = [] for row_idx, row in enumerate(rows): row_data = [''] * max_cols col_idx = 0 for cell in row['cells']: # Skip occupied cells (from rowspan above) while col_idx < max_cols and occupied[row_idx][col_idx]: col_idx += 1 if col_idx >= max_cols: break text = cell['text'].strip() colspan = cell.get('colspan', 1) rowspan = cell.get('rowspan', 1) # Place text in the top-left cell of the span row_data[col_idx] = text # Mark cells as occupied for rowspan for r in range(row_idx, min(row_idx + rowspan, num_rows)): for c in range(col_idx, min(col_idx + colspan, max_cols)): if r > row_idx or c > col_idx: occupied[r][c] = True # Add SPAN command if cell spans multiple rows/cols if colspan > 1 or rowspan > 1: span_end_col = min(col_idx + colspan - 1, max_cols - 1) span_end_row = min(row_idx + rowspan - 1, num_rows - 1) span_commands.append(('SPAN', (col_idx, row_idx), (span_end_col, span_end_row))) col_idx += colspan reportlab_data.append(row_data) # Calculate column widths and row heights # First, try to use cell_boxes if available for more accurate layout cell_boxes = table_element.get('cell_boxes') raw_table_bbox = [ocr_x_left_raw, ocr_y_top_raw, ocr_x_right_raw, ocr_y_bottom_raw] computed_col_widths = None computed_row_heights = None if cell_boxes: cell_boxes_source = table_element.get('cell_boxes_source', 'unknown') logger.info(f"[TABLE] Using {len(cell_boxes)} cell boxes from {cell_boxes_source}") computed_col_widths, computed_row_heights = self._compute_table_grid_from_cell_boxes( cell_boxes, raw_table_bbox, num_rows, max_cols ) # Use computed widths if available, otherwise fall back to equal distribution if computed_col_widths: # Scale col_widths to PDF coordinates col_widths = [w * scale_w for w in computed_col_widths] logger.info(f"[TABLE] Using cell_boxes col widths (scaled)") else: col_widths = [table_width / max_cols] * max_cols logger.info(f"[TABLE] Using equal distribution col widths: {table_width/max_cols:.1f} each") # Row heights - ALWAYS use to ensure table fits bbox properly # Use computed heights from cell_boxes, or uniform distribution as fallback if computed_row_heights: # Scale row_heights to PDF coordinates row_heights = [h * scale_h for h in computed_row_heights] logger.info(f"[TABLE] Using cell_boxes row heights (scaled)") else: # Uniform distribution based on table bbox - ensures table fills its allocated space row_heights = [table_height / num_rows] * num_rows logger.info(f"[TABLE] Using uniform row heights: {table_height/num_rows:.1f} each") # Create ReportLab Table # Use smaller font to fit content with auto-wrap font_size = 8 # Fixed reasonable font size for table content # Create paragraph style for text wrapping in cells cell_style = ParagraphStyle( 'CellStyle', fontName=self.font_name if self.font_registered else 'Helvetica', fontSize=font_size, leading=font_size * 1.2, alignment=TA_CENTER, wordWrap='CJK', # Better wrapping for Chinese text ) # Convert text to Paragraph objects for auto-wrapping for row_idx, row_data in enumerate(reportlab_data): for col_idx, cell_text in enumerate(row_data): if cell_text: # Escape HTML special characters and create Paragraph escaped_text = cell_text.replace('&', '&').replace('<', '<').replace('>', '>') reportlab_data[row_idx][col_idx] = Paragraph(escaped_text, cell_style) # Create table with col widths and row heights # Always use row_heights to ensure table fits bbox properly table = Table(reportlab_data, colWidths=col_widths, rowHeights=row_heights) logger.info(f"[TABLE] Created with {len(col_widths)} cols, {len(row_heights)} rows") # Apply table style # If cell_boxes rendered borders, skip GRID style (text-only rendering) style_commands = [ ('FONT', (0, 0), (-1, -1), self.font_name if self.font_registered else 'Helvetica', font_size), ('VALIGN', (0, 0), (-1, -1), 'MIDDLE'), ('ALIGN', (0, 0), (-1, -1), 'CENTER'), ('LEFTPADDING', (0, 0), (-1, -1), 2), ('RIGHTPADDING', (0, 0), (-1, -1), 2), ('TOPPADDING', (0, 0), (-1, -1), 2), ('BOTTOMPADDING', (0, 0), (-1, -1), 2), ] # Only add GRID if cell_boxes didn't render borders if not cell_boxes_rendered: style_commands.insert(1, ('GRID', (0, 0), (-1, -1), 0.5, colors.black)) logger.info("[TABLE] Adding GRID style (cell_boxes not used)") else: logger.info("[TABLE] Skipping GRID style (cell_boxes rendered borders)") style = TableStyle(style_commands) # Add header style if first row has headers if rows and rows[0]['cells'] and rows[0]['cells'][0].get('is_header'): style.add('BACKGROUND', (0, 0), (-1, 0), colors.lightgrey) style.add('FONT', (0, 0), (-1, 0), self.font_name if self.font_registered else 'Helvetica-Bold', font_size) # Add span commands for merged cells for span_cmd in span_commands: style.add(*span_cmd) table.setStyle(style) logger.info(f"[表格] 套用 {len(span_commands)} 個合併儲存格 (SPAN)") # Calculate actual table size after wrapping actual_width, actual_height = table.wrapOn(pdf_canvas, table_width, table_height) logger.info(f"[表格] 目標尺寸: {table_width:.0f}x{table_height:.0f}, 實際尺寸: {actual_width:.0f}x{actual_height:.0f}") # Scale table to fit bbox if it exceeds the target size scale_x = table_width / actual_width if actual_width > table_width else 1.0 scale_y = table_height / actual_height if actual_height > table_height else 1.0 scale_factor = min(scale_x, scale_y) # Use smaller scale to fit both dimensions # Calculate the table top position in PDF coordinates # ReportLab uses bottom-left origin, so we need to position from TOP pdf_y_top = page_height - ocr_y_top # Top of table in PDF coords # Calculate the actual bottom position based on scaled height # Table should be positioned so its TOP aligns with the bbox top scaled_height = actual_height * scale_factor pdf_y_bottom = pdf_y_top - scaled_height # Bottom of scaled table logger.info(f"[表格] PDF座標: top={pdf_y_top:.0f}, bottom={pdf_y_bottom:.0f}, scaled_height={scaled_height:.0f}") if scale_factor < 1.0: logger.info(f"[表格] 縮放比例: {scale_factor:.2f} (需要縮小以適應 bbox)") # Apply scaling transformation pdf_canvas.saveState() pdf_canvas.translate(pdf_x, pdf_y_bottom) pdf_canvas.scale(scale_factor, scale_factor) # Draw at origin since we've already translated table.drawOn(pdf_canvas, 0, 0) pdf_canvas.restoreState() else: # Draw table at position without scaling # pdf_y should be the bottom of the table table.drawOn(pdf_canvas, pdf_x, pdf_y_bottom) logger.info(f"Drew table at ({pdf_x:.0f}, {pdf_y_bottom:.0f}) size {table_width:.0f}x{scaled_height:.0f} with {len(rows)} rows") # Draw embedded images (images detected inside the table region) embedded_images = table_element.get('embedded_images', []) if embedded_images and result_dir: logger.info(f"[TABLE] Drawing {len(embedded_images)} embedded images") for emb_img in embedded_images: self._draw_embedded_image( pdf_canvas, emb_img, page_height, result_dir, scale_w, scale_h ) except Exception as e: logger.warning(f"Failed to draw table region: {e}") import traceback traceback.print_exc() def _draw_embedded_image( self, pdf_canvas: canvas.Canvas, emb_img: Dict, page_height: float, result_dir: Path, scale_w: float = 1.0, scale_h: float = 1.0 ): """Draw an embedded image inside a table region.""" try: # Get image path saved_path = emb_img.get('saved_path', '') if not saved_path: return # Construct full path image_path = result_dir / saved_path if not image_path.exists(): image_path = result_dir / Path(saved_path).name if not image_path.exists(): logger.warning(f"Embedded image not found: {saved_path}") return # Get bbox from embedded image data bbox = emb_img.get('bbox', []) if not bbox or len(bbox) < 4: logger.warning(f"No bbox for embedded image: {saved_path}") return # Calculate position (bbox is [x0, y0, x1, y1]) x0, y0, x1, y1 = bbox[0], bbox[1], bbox[2], bbox[3] # Apply scaling x0_scaled = x0 * scale_w y0_scaled = y0 * scale_h x1_scaled = x1 * scale_w y1_scaled = y1 * scale_h width = x1_scaled - x0_scaled height = y1_scaled - y0_scaled # Transform Y coordinate (ReportLab uses bottom-left origin) pdf_x = x0_scaled pdf_y = page_height - y1_scaled # Draw the image from reportlab.lib.utils import ImageReader img_reader = ImageReader(str(image_path)) pdf_canvas.drawImage( img_reader, pdf_x, pdf_y, width, height, preserveAspectRatio=True, mask='auto' ) logger.info(f"Drew embedded image at ({pdf_x:.0f}, {pdf_y:.0f}) size {width:.0f}x{height:.0f}") except Exception as e: logger.warning(f"Failed to draw embedded image: {e}") def _normalize_cell_boxes_to_grid( self, cell_boxes: List[List[float]], threshold: float = 10.0 ) -> List[List[float]]: """ Normalize cell boxes to create a proper aligned grid. Groups nearby coordinates and snaps them to a common value, eliminating the 2-11 pixel variations that cause skewed tables. Args: cell_boxes: List of cell bboxes [[x1,y1,x2,y2], ...] threshold: Maximum distance to consider coordinates as "same line" Returns: Normalized cell_boxes with aligned coordinates """ if not cell_boxes or len(cell_boxes) < 2: return cell_boxes # Collect all X and Y coordinates x_coords = [] # (value, box_idx, is_x1) y_coords = [] # (value, box_idx, is_y1) for i, box in enumerate(cell_boxes): x1, y1, x2, y2 = box[0], box[1], box[2], box[3] x_coords.append((x1, i, True)) # x1 (left) x_coords.append((x2, i, False)) # x2 (right) y_coords.append((y1, i, True)) # y1 (top) y_coords.append((y2, i, False)) # y2 (bottom) def cluster_and_normalize(coords, threshold): """Cluster nearby coordinates and return mapping to normalized values.""" if not coords: return {} # Sort by value sorted_coords = sorted(coords, key=lambda x: x[0]) # Cluster nearby values clusters = [] current_cluster = [sorted_coords[0]] for coord in sorted_coords[1:]: if coord[0] - current_cluster[-1][0] <= threshold: current_cluster.append(coord) else: clusters.append(current_cluster) current_cluster = [coord] clusters.append(current_cluster) # Create mapping: (box_idx, is_first) -> normalized value mapping = {} for cluster in clusters: # Use average of cluster as normalized value avg_value = sum(c[0] for c in cluster) / len(cluster) for _, box_idx, is_first in cluster: mapping[(box_idx, is_first)] = avg_value return mapping x_mapping = cluster_and_normalize(x_coords, threshold) y_mapping = cluster_and_normalize(y_coords, threshold) # Create normalized cell boxes normalized_boxes = [] for i, box in enumerate(cell_boxes): x1_norm = x_mapping.get((i, True), box[0]) x2_norm = x_mapping.get((i, False), box[2]) y1_norm = y_mapping.get((i, True), box[1]) y2_norm = y_mapping.get((i, False), box[3]) normalized_boxes.append([x1_norm, y1_norm, x2_norm, y2_norm]) logger.debug(f"[TABLE] Normalized {len(cell_boxes)} cell boxes to grid") return normalized_boxes def _draw_table_border_only( self, pdf_canvas: canvas.Canvas, table_element: Dict, page_height: float, scale_w: float = 1.0, scale_h: float = 1.0 ): """ Draw only the outer border of a table (for tables with bad cell_boxes quality). Text inside the table will be rendered using raw OCR positions. Args: pdf_canvas: ReportLab canvas object table_element: Table element dict page_height: Height of page in PDF coordinates scale_w: Scale factor for X coordinates scale_h: Scale factor for Y coordinates """ table_bbox = table_element.get('bbox', []) if not table_bbox or len(table_bbox) < 4: return element_id = table_element.get('element_id', 'unknown') # Handle different bbox formats if isinstance(table_bbox, dict): x0, y0, x1, y1 = table_bbox['x0'], table_bbox['y0'], table_bbox['x1'], table_bbox['y1'] elif isinstance(table_bbox[0], (int, float)): x0, y0, x1, y1 = table_bbox[0], table_bbox[1], table_bbox[2], table_bbox[3] else: return # Apply scaling pdf_x0 = x0 * scale_w pdf_y0 = y0 * scale_h pdf_x1 = x1 * scale_w pdf_y1 = y1 * scale_h # Convert to PDF coordinates (flip Y) pdf_top = page_height - pdf_y0 pdf_bottom = page_height - pdf_y1 width = pdf_x1 - pdf_x0 height = pdf_y1 - pdf_y0 # Draw outer border only pdf_canvas.setStrokeColor(colors.black) pdf_canvas.setLineWidth(0.5) pdf_canvas.rect(pdf_x0, pdf_bottom, width, height, stroke=1, fill=0) logger.info(f"[TABLE] {element_id}: Drew border only (bad cell_boxes quality)") def _check_cell_boxes_quality(self, cell_boxes: List, element_id: str = "") -> str: """ Check the quality of cell_boxes to determine rendering strategy. Always returns 'good' to use pure PP-Structure output (quality check removed). Args: cell_boxes: List of cell bounding boxes element_id: Optional element ID for logging Returns: 'good' - always use cell_boxes rendering """ logger.debug(f"[TABLE QUALITY] {element_id}: good - pure PP-Structure mode") return 'good' def _draw_table_with_cell_boxes( self, pdf_canvas: canvas.Canvas, table_element: Dict, page_height: float, scale_w: float = 1.0, scale_h: float = 1.0, result_dir: Optional[Path] = None ): """ Draw table borders using cell_boxes for accurate positioning. LAYERED RENDERING APPROACH: - This method ONLY draws cell borders and embedded images - Text is rendered separately using raw OCR positions (via GapFillingService) - This decouples visual structure (borders) from content (text) FALLBACK: If cell_boxes are incomplete, always draws the outer table border using the table's bbox to ensure table boundaries are visible. Args: pdf_canvas: ReportLab canvas object table_element: Table element dict with cell_boxes page_height: Height of page in PDF coordinates scale_w: Scale factor for X coordinates scale_h: Scale factor for Y coordinates result_dir: Directory containing result files (for embedded images) """ try: cell_boxes = table_element.get('cell_boxes', []) table_bbox = table_element.get('bbox', []) # Check cell_boxes quality - skip if they don't form a proper grid if cell_boxes and len(cell_boxes) > 2: # Count overlapping cell pairs overlap_count = 0 for i, box1 in enumerate(cell_boxes): for j, box2 in enumerate(cell_boxes): if i >= j: continue x_overlap = box1[0] < box2[2] and box1[2] > box2[0] y_overlap = box1[1] < box2[3] and box1[3] > box2[1] if x_overlap and y_overlap: overlap_count += 1 # If more than 25% of cell pairs overlap, cell_boxes are unreliable # Increased from 10% to 25% to allow more tables to use cell_boxes rendering # which provides better visual fidelity than ReportLab Table fallback total_pairs = len(cell_boxes) * (len(cell_boxes) - 1) // 2 overlap_ratio = overlap_count / total_pairs if total_pairs > 0 else 0 if overlap_ratio > 0.25: logger.warning( f"[TABLE] Skipping cell_boxes rendering: {overlap_count}/{total_pairs} " f"({overlap_ratio:.1%}) cell pairs overlap - using ReportLab Table fallback" ) return False # Return False to trigger ReportLab Table fallback if not cell_boxes: # Fallback: draw outer border only when no cell_boxes if table_bbox and len(table_bbox) >= 4: # Handle different bbox formats (list or dict) if isinstance(table_bbox, dict): tx1 = float(table_bbox.get('x0', 0)) ty1 = float(table_bbox.get('y0', 0)) tx2 = float(table_bbox.get('x1', 0)) ty2 = float(table_bbox.get('y1', 0)) else: tx1, ty1, tx2, ty2 = table_bbox[:4] # Apply scaling tx1_scaled = tx1 * scale_w ty1_scaled = ty1 * scale_h tx2_scaled = tx2 * scale_w ty2_scaled = ty2 * scale_h table_width = tx2_scaled - tx1_scaled table_height = ty2_scaled - ty1_scaled # Transform Y coordinate (PDF uses bottom-left origin) pdf_x = tx1_scaled pdf_y = page_height - ty2_scaled # Bottom of table in PDF coords # Draw outer table border (slightly thicker for visibility) pdf_canvas.setStrokeColor(colors.black) pdf_canvas.setLineWidth(1.0) pdf_canvas.rect(pdf_x, pdf_y, table_width, table_height, stroke=1, fill=0) logger.info(f"[TABLE] Drew outer table border at [{int(tx1)},{int(ty1)},{int(tx2)},{int(ty2)}]") logger.warning("[TABLE] No cell_boxes available, only outer border drawn") # Still draw embedded images even without cell borders embedded_images = table_element.get('embedded_images', []) if embedded_images and result_dir: for emb_img in embedded_images: self._draw_embedded_image( pdf_canvas, emb_img, page_height, result_dir, scale_w, scale_h ) return True # Outer border drawn successfully # Normalize cell boxes to create aligned grid cell_boxes = self._normalize_cell_boxes_to_grid(cell_boxes) logger.info(f"[TABLE] Drawing {len(cell_boxes)} cells using grid lines (avoiding duplicates)") # Collect unique grid lines to avoid drawing duplicate/overlapping lines h_lines = set() # Horizontal lines: (y, x_start, x_end) v_lines = set() # Vertical lines: (x, y_start, y_end) for box in cell_boxes: x1, y1, x2, y2 = box[0], box[1], box[2], box[3] # Apply scaling x1_s = x1 * scale_w y1_s = y1 * scale_h x2_s = x2 * scale_w y2_s = y2 * scale_h # Round to 1 decimal place to help with deduplication x1_s, y1_s, x2_s, y2_s = round(x1_s, 1), round(y1_s, 1), round(x2_s, 1), round(y2_s, 1) # Add horizontal lines (top and bottom of cell) h_lines.add((y1_s, x1_s, x2_s)) # Top line h_lines.add((y2_s, x1_s, x2_s)) # Bottom line # Add vertical lines (left and right of cell) v_lines.add((x1_s, y1_s, y2_s)) # Left line v_lines.add((x2_s, y1_s, y2_s)) # Right line # Draw unique horizontal lines pdf_canvas.setStrokeColor(colors.black) pdf_canvas.setLineWidth(0.5) for y, x_start, x_end in h_lines: pdf_y = page_height - y # Transform Y coordinate pdf_canvas.line(x_start, pdf_y, x_end, pdf_y) # Draw unique vertical lines for x, y_start, y_end in v_lines: pdf_y_start = page_height - y_start pdf_y_end = page_height - y_end pdf_canvas.line(x, pdf_y_start, x, pdf_y_end) logger.info(f"[TABLE] Drew {len(h_lines)} horizontal + {len(v_lines)} vertical grid lines") # Draw embedded images embedded_images = table_element.get('embedded_images', []) if embedded_images and result_dir: logger.info(f"[TABLE] Drawing {len(embedded_images)} embedded images") for emb_img in embedded_images: self._draw_embedded_image( pdf_canvas, emb_img, page_height, result_dir, scale_w, scale_h ) return True except Exception as e: logger.warning(f"[TABLE] Failed to draw cell borders: {e}") import traceback traceback.print_exc() return False def draw_image_region( self, pdf_canvas: canvas.Canvas, region: Dict, page_height: float, result_dir: Path, scale_w: float = 1.0, scale_h: float = 1.0 ): """ Draw an image region by embedding the extracted image Handles images extracted by PP-StructureV3 (tables, figures, charts, etc.) Args: pdf_canvas: ReportLab canvas object region: Image metadata dict with image_path and bbox page_height: Height of page (for coordinate transformation) result_dir: Directory containing result files scale_w: Scale factor for X coordinates (PDF width / OCR width) scale_h: Scale factor for Y coordinates (PDF height / OCR height) """ try: image_path_str = region.get('image_path', '') if not image_path_str: return # Construct full path to image # saved_path is relative to result_dir (e.g., "imgs/element_id.png") image_path = result_dir / image_path_str # Fallback for legacy data if not image_path.exists(): image_path = result_dir / Path(image_path_str).name if not image_path.exists(): logger.warning(f"Image not found: {image_path_str} (in {result_dir})") return # Get bbox for positioning bbox = region.get('bbox', []) if not bbox: logger.warning(f"No bbox for image {image_path_str}") return # Handle different bbox formats if isinstance(bbox, dict): # Dict format from UnifiedDocument: {"x0": ..., "y0": ..., "x1": ..., "y1": ...} if 'x0' in bbox and 'y0' in bbox and 'x1' in bbox and 'y1' in bbox: ocr_x_left_raw = float(bbox['x0']) ocr_y_top_raw = float(bbox['y0']) ocr_x_right_raw = float(bbox['x1']) ocr_y_bottom_raw = float(bbox['y1']) else: logger.warning(f"Dict bbox missing required keys for image: {bbox}") return elif isinstance(bbox, list): if len(bbox) < 4: logger.warning(f"List bbox too short for image: {bbox}") return # Polygon format [[x,y], [x,y], [x,y], [x,y]] if isinstance(bbox[0], list): ocr_x_left_raw = bbox[0][0] ocr_y_top_raw = bbox[0][1] ocr_x_right_raw = bbox[2][0] ocr_y_bottom_raw = bbox[2][1] # Simple list format [x0, y0, x1, y1] elif isinstance(bbox[0], (int, float)): ocr_x_left_raw = bbox[0] ocr_y_top_raw = bbox[1] ocr_x_right_raw = bbox[2] ocr_y_bottom_raw = bbox[3] else: logger.warning(f"Unexpected bbox list format for image: {bbox}") return else: logger.warning(f"Invalid bbox format for image: {bbox}") return logger.info(f"[圖片] '{image_path_str}' OCR原始座標: L={ocr_x_left_raw:.0f}, T={ocr_y_top_raw:.0f}, R={ocr_x_right_raw:.0f}, B={ocr_y_bottom_raw:.0f}") # Apply scaling ocr_x_left = ocr_x_left_raw * scale_w ocr_y_top = ocr_y_top_raw * scale_h ocr_x_right = ocr_x_right_raw * scale_w ocr_y_bottom = ocr_y_bottom_raw * scale_h # Calculate bbox dimensions (after scaling) bbox_width = abs(ocr_x_right - ocr_x_left) bbox_height = abs(ocr_y_bottom - ocr_y_top) # Transform coordinates: OCR (top-left origin) → PDF (bottom-left origin) # CRITICAL: Y-axis flip! # For images, we position at bottom-left corner pdf_x_left = ocr_x_left pdf_y_bottom = page_height - ocr_y_bottom # Flip Y-axis logger.info(f"[圖片] '{image_path_str}' → PDF位置: ({pdf_x_left:.1f}, {pdf_y_bottom:.1f}), 寬x高: {bbox_width:.0f}x{bbox_height:.0f}") # Draw image using ReportLab # drawImage expects: (path, x, y, width, height) # where (x, y) is the bottom-left corner of the image pdf_canvas.drawImage( str(image_path), pdf_x_left, pdf_y_bottom, width=bbox_width, height=bbox_height, preserveAspectRatio=True, mask='auto' # Handle transparency ) logger.info(f"[圖片] ✓ 成功繪製 '{image_path_str}'") except Exception as e: logger.warning(f"Failed to draw image region: {e}") def generate_layout_pdf( self, json_path: Path, output_path: Path, source_file_path: Optional[Path] = None ) -> bool: """ Generate layout-preserving PDF from OCR JSON data Args: json_path: Path to OCR JSON file output_path: Path to save generated PDF source_file_path: Optional path to original source file for dimension extraction Returns: True if successful, False otherwise """ try: # Load JSON data ocr_data = self.load_ocr_json(json_path) if not ocr_data: return False # Check if this is new UnifiedDocument format (has 'pages' with elements) # vs old OCR format (has 'text_regions') if 'pages' in ocr_data and isinstance(ocr_data.get('pages'), list): # New UnifiedDocument format - convert and use Direct track rendering logger.info("Detected UnifiedDocument JSON format, using Direct track rendering") unified_doc = self._json_to_unified_document(ocr_data, json_path.parent) if unified_doc: return self.generate_from_unified_document( unified_doc=unified_doc, output_path=output_path, source_file_path=source_file_path ) else: logger.error("Failed to convert JSON to UnifiedDocument") return False else: # Old OCR format - use legacy generation logger.info("Detected legacy OCR JSON format, using OCR track rendering") return self._generate_pdf_from_data( ocr_data=ocr_data, output_path=output_path, source_file_path=source_file_path, json_parent_dir=json_path.parent ) except Exception as e: logger.error(f"Failed to generate PDF: {e}") import traceback traceback.print_exc() return False def _json_to_unified_document(self, json_data: Dict, result_dir: Path) -> Optional['UnifiedDocument']: """ Convert JSON dict to UnifiedDocument object. Args: json_data: Loaded JSON dictionary in UnifiedDocument format result_dir: Directory containing image files Returns: UnifiedDocument object or None if conversion fails """ try: from datetime import datetime # Parse metadata metadata_dict = json_data.get('metadata', {}) # Parse processing track track_str = metadata_dict.get('processing_track', 'direct') try: processing_track = ProcessingTrack(track_str) except ValueError: processing_track = ProcessingTrack.DIRECT # Create DocumentMetadata metadata = DocumentMetadata( filename=metadata_dict.get('filename', ''), file_type=metadata_dict.get('file_type', 'pdf'), file_size=metadata_dict.get('file_size', 0), created_at=datetime.fromisoformat(metadata_dict.get('created_at', datetime.now().isoformat()).replace('Z', '+00:00')), processing_track=processing_track, processing_time=metadata_dict.get('processing_time', 0), language=metadata_dict.get('language'), title=metadata_dict.get('title'), author=metadata_dict.get('author'), subject=metadata_dict.get('subject'), keywords=metadata_dict.get('keywords'), producer=metadata_dict.get('producer'), creator=metadata_dict.get('creator'), creation_date=datetime.fromisoformat(metadata_dict['creation_date'].replace('Z', '+00:00')) if metadata_dict.get('creation_date') else None, modification_date=datetime.fromisoformat(metadata_dict['modification_date'].replace('Z', '+00:00')) if metadata_dict.get('modification_date') else None, ) # Parse pages pages = [] for page_dict in json_data.get('pages', []): # Parse page dimensions dims = page_dict.get('dimensions', {}) if not dims: # Fallback dimensions dims = {'width': 595.32, 'height': 841.92} dimensions = Dimensions( width=dims.get('width', 595.32), height=dims.get('height', 841.92), dpi=dims.get('dpi') ) # Parse elements elements = [] for elem_dict in page_dict.get('elements', []): element = self._json_to_document_element(elem_dict) if element: elements.append(element) page = Page( page_number=page_dict.get('page_number', 1), dimensions=dimensions, elements=elements, metadata=page_dict.get('metadata', {}) ) pages.append(page) # Create UnifiedDocument unified_doc = UnifiedDocument( document_id=json_data.get('document_id', ''), metadata=metadata, pages=pages, processing_errors=json_data.get('processing_errors', []) ) logger.info(f"Converted JSON to UnifiedDocument: {len(pages)} pages, track={processing_track.value}") return unified_doc except Exception as e: logger.error(f"Failed to convert JSON to UnifiedDocument: {e}") import traceback traceback.print_exc() return None def _json_to_document_element(self, elem_dict: Dict) -> Optional['DocumentElement']: """ Convert JSON dict to DocumentElement. Args: elem_dict: Element dictionary from JSON Returns: DocumentElement or None if conversion fails """ try: # Parse element type type_str = elem_dict.get('type', 'text') try: elem_type = ElementType(type_str) except ValueError: # Fallback to TEXT for unknown types elem_type = ElementType.TEXT logger.warning(f"Unknown element type '{type_str}', falling back to TEXT") # Content-based HTML table detection: reclassify text elements with HTML table content content = elem_dict.get('content', '') if elem_type == ElementType.TEXT and isinstance(content, str) and '