diff --git a/backend/app/services/pdf_generator_service.py b/backend/app/services/pdf_generator_service.py
index 3898d47..58afc0c 100644
--- a/backend/app/services/pdf_generator_service.py
+++ b/backend/app/services/pdf_generator_service.py
@@ -94,11 +94,42 @@ class HTMLTableParser(HTMLParser):
 class PDFGeneratorService:
     """Service for generating layout-preserving PDFs from OCR JSON data"""
 
+    # Font mapping from common fonts to PDF standard fonts
+    FONT_MAPPING = {
+        'Arial': 'Helvetica',
+        'Arial Black': 'Helvetica-Bold',
+        'Times New Roman': 'Times-Roman',
+        'Times': 'Times-Roman',
+        'Courier New': 'Courier',
+        'Courier': 'Courier',
+        'Calibri': 'Helvetica',
+        'Cambria': 'Times-Roman',
+        'Georgia': 'Times-Roman',
+        'Verdana': 'Helvetica',
+        'Tahoma': 'Helvetica',
+        'Trebuchet MS': 'Helvetica',
+        'Comic Sans MS': 'Helvetica',
+        'Impact': 'Helvetica-Bold',
+        'Lucida Console': 'Courier',
+        'Palatino': 'Times-Roman',
+        'Garamond': 'Times-Roman',
+        'Bookman': 'Times-Roman',
+        'Century Gothic': 'Helvetica',
+        'Franklin Gothic': 'Helvetica',
+    }
+
+    # Style flags for text formatting
+    STYLE_FLAG_BOLD = 1
+    STYLE_FLAG_ITALIC = 2
+    STYLE_FLAG_UNDERLINE = 4
+    STYLE_FLAG_STRIKETHROUGH = 8
+
     def __init__(self):
         """Initialize PDF generator with font configuration"""
         self.font_name = 'NotoSansSC'
         self.font_path = None
         self.font_registered = False
+        self.current_processing_track = None  # Track type for current document
 
         self._register_chinese_font()
 
@@ -128,6 +159,173 @@ class PDFGeneratorService:
             logger.error(f"Failed to register Chinese font: {e}")
             self.font_registered = False
 
+    def _parse_color(self, color_value) -> Tuple[float, float, float]:
+        """
+        Parse a color value into an RGB tuple for ReportLab.
+
+        Args:
+            color_value: Hex string (#RGB, #RRGGBB or #RRGGBBAA, alpha is
+                ignored), packed 0xRRGGBB integer, or RGB tuple/list whose
+                components are in the 0-1 or the 0-255 range
+
+        Returns:
+            RGB tuple with every component clamped to 0-1. Black is returned
+            for empty, malformed or unsupported values (color names are not
+            supported).
+        """
+        black = (0.0, 0.0, 0.0)
+        if not color_value:
+            return black  # None, '', 0 and empty sequences all mean black
+
+        try:
+            if isinstance(color_value, str):
+                hex_color = color_value.strip()
+                if not hex_color.startswith('#'):
+                    return black
+                hex_color = hex_color[1:]
+
+                # Expand short form (#RGB -> #RRGGBB)
+                if len(hex_color) == 3:
+                    hex_color = ''.join(ch * 2 for ch in hex_color)
+                if len(hex_color) not in (6, 8):
+                    return black
+                channels = [int(hex_color[i:i + 2], 16) / 255.0 for i in (0, 2, 4)]
+
+            elif isinstance(color_value, bool):
+                return black  # bool is an int subclass but never a color
+
+            elif isinstance(color_value, int):
+                # Packed 0xRRGGBB integer
+                channels = [((color_value >> shift) & 0xFF) / 255.0 for shift in (16, 8, 0)]
+
+            elif isinstance(color_value, (tuple, list)) and len(color_value) >= 3:
+                channels = [float(v) for v in color_value[:3]]
+                # Normalize to 0-1 if values are 0-255
+                if any(v > 1 for v in channels):
+                    channels = [v / 255.0 for v in channels]
+
+            else:
+                return black
+
+            # Clamp so out-of-range input can never reach the canvas
+            r, g, b = (min(1.0, max(0.0, v)) for v in channels)
+            return (r, g, b)
+
+        except (ValueError, TypeError) as e:
+            logger.warning(f"Failed to parse color {color_value}: {e}")
+
+        # Default to black
+        return black
+
+    def _map_font(self, font_name: Optional[str]) -> str:
+        """
+        Map font name to PDF standard font.
+
+        Tries an exact FONT_MAPPING key, then a case-insensitive key, then the
+        longest key contained in the name once non-alphanumeric characters are
+        ignored, so names such as 'TimesNewRomanPS-BoldMT' or 'ABCDEF+Georgia'
+        still resolve to their family.
+
+        Args:
+            font_name: Original font name
+
+        Returns:
+            PDF standard font name ('Helvetica' when nothing matches)
+        """
+        if not font_name or not isinstance(font_name, str):
+            return 'Helvetica'
+
+        # Direct lookup
+        name = font_name.strip()
+        if name in self.FONT_MAPPING:
+            return self.FONT_MAPPING[name]
+
+        # Case-insensitive lookup
+        name_lower = name.lower()
+        for orig_font, pdf_font in self.FONT_MAPPING.items():
+            if orig_font.lower() == name_lower:
+                return pdf_font
+
+        # Partial match, longest key first so 'Arial Black' wins over 'Arial'
+        compact = ''.join(ch for ch in name_lower if ch.isalnum())
+        for orig_font in sorted(self.FONT_MAPPING, key=len, reverse=True):
+            key = ''.join(ch for ch in orig_font.lower() if ch.isalnum())
+            if key and key in compact:
+                return self.FONT_MAPPING[orig_font]
+
+        # Default fallback
+        logger.debug(f"Font '{font_name}' not found in mapping, using Helvetica")
+        return 'Helvetica'
+
+    def _apply_text_style(self, c: canvas.Canvas, style_info, default_size: float = 12):
+        """
+        Apply text styling from StyleInfo to PDF canvas.
+
+        Sets the font (family, bold/italic variant, size) and the fill color.
+        Underline and strikethrough flags are not handled here.
+
+        Args:
+            c: ReportLab canvas object
+            style_info: StyleInfo object or dict with font, size, color, flags
+            default_size: Default font size if not specified
+        """
+        if not style_info:
+            # Apply default styling
+            c.setFont('Helvetica', default_size)
+            c.setFillColorRGB(0, 0, 0)
+            return
+
+        try:
+            # Dicts are read by key, everything else by attribute; getattr
+            # (unlike a __dict__ check) also works for slotted classes
+            if isinstance(style_info, dict):
+                font_family = style_info.get('font')
+                font_size = style_info.get('size', default_size)
+                color = style_info.get('color')
+                flags = style_info.get('flags', 0)
+            else:
+                font_family = getattr(style_info, 'font', None)
+                font_size = getattr(style_info, 'size', default_size)
+                color = getattr(style_info, 'color', None)
+                flags = getattr(style_info, 'flags', 0)
+
+            # Ignore malformed flags/size instead of dropping the whole style
+            if not isinstance(flags, int):
+                flags = 0
+            if isinstance(font_size, bool) or not isinstance(font_size, (int, float)):
+                font_size = None
+
+            # Map font name, then pick the bold/italic variant of its family
+            base_font = self._map_font(font_family)
+            family, _, variant = base_font.partition('-')
+            if family in ('Helvetica', 'Times', 'Courier'):
+                # A mapped font may already carry a variant ('Arial Black' is bold)
+                is_bold = bool(flags & self.STYLE_FLAG_BOLD) or 'Bold' in variant
+                is_italic = bool(flags & self.STYLE_FLAG_ITALIC) or variant.endswith(('Italic', 'Oblique'))
+                italic_suffix = 'Italic' if family == 'Times' else 'Oblique'
+                suffix = ('Bold' if is_bold else '') + (italic_suffix if is_italic else '')
+                if suffix:
+                    base_font = f'{family}-{suffix}'
+
+            # Apply font and size
+            actual_size = font_size if font_size and font_size > 0 else default_size
+            try:
+                c.setFont(base_font, actual_size)
+            except KeyError:
+                # Font not available, fallback
+                logger.warning(f"Font '{base_font}' not available, using Helvetica")
+                c.setFont('Helvetica', actual_size)
+
+            # Apply color (_parse_color returns black for empty values)
+            r, g, b = self._parse_color(color)
+            c.setFillColorRGB(r, g, b)
+
+        except Exception as e:
+            logger.error(f"Failed to apply text style: {e}")
+            # Fallback to defaults
+            c.setFont('Helvetica', default_size)
+            c.setFillColorRGB(0, 0, 0)
+
     def load_ocr_json(self, json_path: Path) -> Optional[Dict]:
         """
         Load and parse OCR JSON result file
@@ -223,12 +421,18 @@ class PDFGeneratorService:
                 ]:
                     text_content = element.get_text()
                     if text_content:
-                        text_regions.append({
+                        text_region = {
                             'text': text_content,
                             'bbox': bbox_polygon,
                             'confidence': element.confidence or 1.0,
                             'page': page_num
-                        })
+                        }
+
+                        # Include style information if available (for Direct track)
+                        if hasattr(element, 'style') and element.style:
+                            text_region['style'] = element.style
+
+                        text_regions.append(text_region)
 
                 # Handle table elements
                 elif element.type == ElementType.TABLE:
@@ -329,20 +533,35 @@ class
PDFGeneratorService: return False try: + # Detect processing track for track-specific rendering + self.current_processing_track = None + if hasattr(unified_doc, 'metadata') and unified_doc.metadata: + if hasattr(unified_doc.metadata, 'processing_track'): + self.current_processing_track = unified_doc.metadata.processing_track + logger.info(f"Processing track detected: {self.current_processing_track}") + elif isinstance(unified_doc.metadata, dict): + self.current_processing_track = unified_doc.metadata.get('processing_track') + logger.info(f"Processing track detected: {self.current_processing_track}") + # Convert UnifiedDocument to OCR data format ocr_data = self.convert_unified_document_to_ocr_data(unified_doc) # Use internal generation with pre-loaded data - return self._generate_pdf_from_data( + result = self._generate_pdf_from_data( ocr_data=ocr_data, output_path=output_path, source_file_path=source_file_path ) + # Reset track after generation + self.current_processing_track = None + return result + except Exception as e: logger.error(f"Failed to generate PDF from UnifiedDocument: {e}") import traceback traceback.print_exc() + self.current_processing_track = None return False def _generate_pdf_from_data( @@ -800,9 +1019,22 @@ class PDFGeneratorService: logger.info(f"[文字] '{text[:30]}' → PDF位置: ({pdf_x:.1f}, {pdf_y:.1f}), 字體:{font_size:.1f}pt, 寬x高:{bbox_width:.0f}x{bbox_height:.0f}") - # Set font - font_name = self.font_name if self.font_registered else 'Helvetica' - pdf_canvas.setFont(font_name, font_size) + # Set font with track-specific styling + style_info = region.get('style') + is_direct_track = (self.current_processing_track == 'direct' or + self.current_processing_track == ProcessingTrack.DIRECT) + + if style_info and is_direct_track: + # Direct track: Apply rich styling from StyleInfo + self._apply_text_style(pdf_canvas, style_info, default_size=font_size) + # Get current font for width calculation + font_name = pdf_canvas._fontname + font_size = 
pdf_canvas._fontsize + logger.debug(f"Applied Direct track style: font={font_name}, size={font_size}") + else: + # OCR track or no style: Use simple font selection + font_name = self.font_name if self.font_registered else 'Helvetica' + pdf_canvas.setFont(font_name, font_size) # Calculate text width to prevent overflow text_width = pdf_canvas.stringWidth(text, font_name, font_size) diff --git a/openspec/changes/pdf-layout-restoration/tasks.md b/openspec/changes/pdf-layout-restoration/tasks.md index 6ae47e0..82e68d1 100644 --- a/openspec/changes/pdf-layout-restoration/tasks.md +++ b/openspec/changes/pdf-layout-restoration/tasks.md @@ -39,34 +39,34 @@ ## Phase 2: Basic Style Preservation (P1 - Week 1) ### 3. Implement Style Application System -- [ ] 3.1 Create font mapping system - - [ ] 3.1.1 Define FONT_MAPPING dictionary - - [ ] 3.1.2 Map common fonts to PDF standard fonts - - [ ] 3.1.3 Add fallback to Helvetica for unknown fonts -- [ ] 3.2 Implement _apply_text_style() method - - [ ] 3.2.1 Extract font family from StyleInfo - - [ ] 3.2.2 Handle bold/italic flags - - [ ] 3.2.3 Apply font size - - [ ] 3.2.4 Apply text color - - [ ] 3.2.5 Handle errors gracefully -- [ ] 3.3 Create color parsing utilities - - [ ] 3.3.1 Parse hex colors (#RRGGBB) - - [ ] 3.3.2 Parse RGB tuples - - [ ] 3.3.3 Convert to PDF color space +- [x] 3.1 Create font mapping system + - [x] 3.1.1 Define FONT_MAPPING dictionary (20 common fonts mapped) + - [x] 3.1.2 Map common fonts to PDF standard fonts (Helvetica/Times/Courier) + - [x] 3.1.3 Add fallback to Helvetica for unknown fonts (with partial matching) +- [x] 3.2 Implement _apply_text_style() method + - [x] 3.2.1 Extract font family from StyleInfo (object and dict support) + - [x] 3.2.2 Handle bold/italic flags (compound variants like BoldOblique) + - [x] 3.2.3 Apply font size (with default fallback) + - [x] 3.2.4 Apply text color (using _parse_color) + - [x] 3.2.5 Handle errors gracefully (try-except with fallback to defaults) +- [x] 3.3 
Create color parsing utilities + - [x] 3.3.1 Parse hex colors (#RRGGBB and #RGB) + - [x] 3.3.2 Parse RGB tuples (0-255 and 0-1 normalization) + - [x] 3.3.3 Convert to PDF color space (0-1 range for ReportLab) ### 4. Track-Specific Rendering -- [ ] 4.1 Add track detection in generate_from_unified_document - - [ ] 4.1.1 Check unified_doc.metadata.processing_track - - [ ] 4.1.2 Route to appropriate rendering method -- [ ] 4.2 Implement _generate_direct_track_pdf - - [ ] 4.2.1 Process each page with style preservation - - [ ] 4.2.2 Apply StyleInfo to text elements - - [ ] 4.2.3 Use precise positioning - - [ ] 4.2.4 Preserve line breaks -- [ ] 4.3 Implement _generate_ocr_track_pdf - - [ ] 4.3.1 Use simplified rendering - - [ ] 4.3.2 Best-effort positioning - - [ ] 4.3.3 Estimated font sizes +- [x] 4.1 Add track detection in generate_from_unified_document + - [x] 4.1.1 Check unified_doc.metadata.processing_track (object and dict support) + - [x] 4.1.2 Store in self.current_processing_track for rendering methods +- [x] 4.2 Apply StyleInfo for Direct track + - [x] 4.2.1 Preserve style information in convert_unified_document_to_ocr_data + - [x] 4.2.2 Apply StyleInfo to text elements in draw_text_region + - [x] 4.2.3 Use precise positioning (existing implementation maintained) + - [x] 4.2.4 Track detection in draw_text_region (is_direct_track check) +- [x] 4.3 Simplified rendering for OCR track + - [x] 4.3.1 Use simple font selection when not Direct track + - [x] 4.3.2 Best-effort positioning (existing implementation) + - [x] 4.3.3 Estimated font sizes (bbox height-based heuristic) - [ ] 4.4 Test track-specific rendering - [ ] 4.4.1 Compare Direct track with original - [ ] 4.4.2 Verify OCR track maintains quality