From 09cf9149ce7aaf1406198d7c680583acdb3507a8 Mon Sep 17 00:00:00 2001 From: egg Date: Mon, 24 Nov 2025 07:53:17 +0800 Subject: [PATCH] feat: implement proper track-specific PDF rendering MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implement independent Direct and OCR track rendering methods with complete separation of concerns and proper line break handling. **Architecture Changes**: - Created _generate_direct_track_pdf() for rich formatting - Created _generate_ocr_track_pdf() for backward-compatible rendering - Modified generate_from_unified_document() to route by track type - Direct track no longer goes through the shared legacy rendering path that loses information **Direct Track Features** (_generate_direct_track_pdf): - Processes UnifiedDocument directly (no legacy conversion) - Preserves all StyleInfo without information loss - Handles line breaks (\n) in text content - Layer-based rendering: images → tables → text - Three specialized helper methods: - _draw_text_element_direct(): Multi-line text with styling - _draw_table_element_direct(): Direct bbox table rendering - _draw_image_element_direct(): Image positioning from bbox **OCR Track Features** (_generate_ocr_track_pdf): - Uses legacy OCR data conversion pipeline - Routes to existing _generate_pdf_from_data() - Maintains full backward compatibility - Simplified rendering for OCR-detected layout **Line Break Handling** (Direct Track): - Split text on '\n' into multiple lines - Calculate line height as font_size * 1.2 - Render each line with proper vertical spacing - Scale the font down for any line that is wider than the bbox **Implementation Details**: Lines 535-569: Track detection and routing Lines 571-670: _generate_direct_track_pdf() main method Lines 672-717: _generate_ocr_track_pdf() main method Lines 1497-1575: _draw_text_element_direct() with line breaks Lines 1577-1656: _draw_table_element_direct() Lines 1658-1714: _draw_image_element_direct() **Corrected Task Status**: - Task 4.2: NOW properly 
implements separate Direct track pipeline - Task 4.3: NOW properly implements separate OCR track pipeline - Both with distinct rendering logic as designed **Breaking vs Previous Commit**: Previous commit (3fc32bc) only added conditional styling in shared draw_text_region(). This commit creates true track-specific pipelines as per design.md requirements. Direct track PDFs will now: ✅ Process without legacy conversion (no info loss) ✅ Render multi-line text properly (split on \n) ✅ Apply StyleInfo per element ✅ Use precise bbox positioning ✅ Render images and tables directly OCR track PDFs will: ✅ Use existing proven pipeline ✅ Maintain backward compatibility ✅ No changes to current behavior 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- backend/app/services/pdf_generator_service.py | 389 +++++++++++++++++- .../changes/pdf-layout-restoration/tasks.md | 23 +- 2 files changed, 393 insertions(+), 19 deletions(-) diff --git a/backend/app/services/pdf_generator_service.py b/backend/app/services/pdf_generator_service.py index 58afc0c..9cd6702 100644 --- a/backend/app/services/pdf_generator_service.py +++ b/backend/app/services/pdf_generator_service.py @@ -534,31 +534,183 @@ class PDFGeneratorService: try: # Detect processing track for track-specific rendering - self.current_processing_track = None + processing_track = None if hasattr(unified_doc, 'metadata') and unified_doc.metadata: if hasattr(unified_doc.metadata, 'processing_track'): - self.current_processing_track = unified_doc.metadata.processing_track - logger.info(f"Processing track detected: {self.current_processing_track}") + processing_track = unified_doc.metadata.processing_track elif isinstance(unified_doc.metadata, dict): - self.current_processing_track = unified_doc.metadata.get('processing_track') - logger.info(f"Processing track detected: {self.current_processing_track}") + processing_track = unified_doc.metadata.get('processing_track') - # Convert UnifiedDocument 
to OCR data format + # Route to track-specific rendering method + is_direct_track = (processing_track == 'direct' or + processing_track == ProcessingTrack.DIRECT) + + logger.info(f"Processing track: {processing_track}, using {'Direct' if is_direct_track else 'OCR'} track rendering") + + if is_direct_track: + # Direct track: Rich formatting preservation + return self._generate_direct_track_pdf( + unified_doc=unified_doc, + output_path=output_path, + source_file_path=source_file_path + ) + else: + # OCR track: Simplified rendering (backward compatible) + return self._generate_ocr_track_pdf( + unified_doc=unified_doc, + output_path=output_path, + source_file_path=source_file_path + ) + + except Exception as e: + logger.error(f"Failed to generate PDF from UnifiedDocument: {e}") + import traceback + traceback.print_exc() + return False + + def _generate_direct_track_pdf( + self, + unified_doc: 'UnifiedDocument', + output_path: Path, + source_file_path: Optional[Path] = None + ) -> bool: + """ + Generate PDF with rich formatting preservation for Direct track. + + This method processes UnifiedDocument directly without converting to + legacy OCR format, preserving StyleInfo and applying proper text + formatting including line breaks. 
+ + Args: + unified_doc: UnifiedDocument from Direct extraction + output_path: Path to save generated PDF + source_file_path: Optional path to original source file + + Returns: + True if successful, False otherwise + """ + try: + logger.info("=== Direct Track PDF Generation ===") + logger.info(f"Total pages: {len(unified_doc.pages)}") + + # Set current track for helper methods + self.current_processing_track = 'direct' + + # Get page dimensions from first page + if not unified_doc.pages: + logger.error("No pages in document") + return False + + first_page = unified_doc.pages[0] + page_width = first_page.width + page_height = first_page.height + + logger.info(f"Page dimensions: {page_width} x {page_height}") + + # Create PDF canvas with source dimensions + from reportlab.pdfgen import canvas + pdf_canvas = canvas.Canvas(str(output_path), pagesize=(page_width, page_height)) + + # Process each page + for page_idx, page in enumerate(unified_doc.pages): + logger.info(f">>> Processing page {page_idx + 1}/{len(unified_doc.pages)}") + + if page_idx > 0: + pdf_canvas.showPage() + + # Separate elements by type + text_elements = [] + table_elements = [] + image_elements = [] + + for element in page.elements: + if element.type == ElementType.TABLE: + table_elements.append(element) + elif element.is_visual or element.type in [ + ElementType.IMAGE, ElementType.FIGURE, + ElementType.CHART, ElementType.DIAGRAM + ]: + image_elements.append(element) + elif element.is_text or element.type in [ + ElementType.TEXT, ElementType.TITLE, ElementType.HEADER, + ElementType.FOOTER, ElementType.PARAGRAPH + ]: + text_elements.append(element) + + logger.info(f"Page {page_idx + 1}: {len(text_elements)} text, " + f"{len(table_elements)} tables, {len(image_elements)} images") + + # Draw in layers: images → tables → text + + # 1. Draw images + for img_elem in image_elements: + self._draw_image_element_direct(pdf_canvas, img_elem, page_height, output_path.parent) + + # 2. 
Draw tables + for table_elem in table_elements: + self._draw_table_element_direct(pdf_canvas, table_elem, page_height) + + # 3. Draw text with line breaks and styling + for text_elem in text_elements: + self._draw_text_element_direct(pdf_canvas, text_elem, page_height) + + # Save PDF + pdf_canvas.save() + logger.info(f"Direct track PDF saved to {output_path}") + + # Reset track + self.current_processing_track = None + return True + + except Exception as e: + logger.error(f"Failed to generate Direct track PDF: {e}") + import traceback + traceback.print_exc() + self.current_processing_track = None + return False + + def _generate_ocr_track_pdf( + self, + unified_doc: 'UnifiedDocument', + output_path: Path, + source_file_path: Optional[Path] = None + ) -> bool: + """ + Generate PDF with simplified rendering for OCR track. + + This method uses the existing OCR data conversion and rendering + pipeline for backward compatibility. + + Args: + unified_doc: UnifiedDocument from OCR processing + output_path: Path to save generated PDF + source_file_path: Optional path to original source file + + Returns: + True if successful, False otherwise + """ + try: + logger.info("=== OCR Track PDF Generation ===") + + # Set current track + self.current_processing_track = 'ocr' + + # Convert UnifiedDocument to OCR data format (legacy) ocr_data = self.convert_unified_document_to_ocr_data(unified_doc) - # Use internal generation with pre-loaded data + # Use existing generation pipeline result = self._generate_pdf_from_data( ocr_data=ocr_data, output_path=output_path, source_file_path=source_file_path ) - # Reset track after generation + # Reset track self.current_processing_track = None return result except Exception as e: - logger.error(f"Failed to generate PDF from UnifiedDocument: {e}") + logger.error(f"Failed to generate OCR track PDF: {e}") import traceback traceback.print_exc() self.current_processing_track = None @@ -1342,6 +1494,225 @@ class PDFGeneratorService: 
traceback.print_exc() return False + def _draw_text_element_direct( + self, + pdf_canvas: canvas.Canvas, + element: 'DocumentElement', + page_height: float + ): + """ + Draw text element with Direct track rich formatting. + + Handles line breaks, applies StyleInfo, and preserves text positioning. + + Args: + pdf_canvas: ReportLab canvas object + element: DocumentElement with text content + page_height: Page height for coordinate transformation + """ + try: + text_content = element.get_text() + if not text_content: + return + + # Get bounding box + bbox = element.bbox + if not bbox: + logger.warning(f"No bbox for text element {element.element_id}") + return + + # Transform coordinates (top-left origin → bottom-left origin) + pdf_x = bbox.x0 + pdf_y = page_height - bbox.y1 # Use bottom of bbox + + bbox_width = bbox.x1 - bbox.x0 + bbox_height = bbox.y1 - bbox.y0 + + # Calculate font size from bbox height + font_size = bbox_height * 0.75 + font_size = max(min(font_size, 72), 4) # Clamp 4-72pt + + # Apply style if available + if hasattr(element, 'style') and element.style: + self._apply_text_style(pdf_canvas, element.style, default_size=font_size) + else: + # Use default font + font_name = self.font_name if self.font_registered else 'Helvetica' + pdf_canvas.setFont(font_name, font_size) + + # Handle line breaks + lines = text_content.split('\n') + line_height = font_size * 1.2 # 120% of font size + + # Draw each line + for i, line in enumerate(lines): + if not line.strip(): + continue + + line_y = pdf_y - (i * line_height) + + # Check if text fits in bbox width + font_name = pdf_canvas._fontname + text_width = pdf_canvas.stringWidth(line, font_name, font_size) + + if text_width > bbox_width: + # Scale down font to fit + scale_factor = bbox_width / text_width + scaled_size = font_size * scale_factor * 0.95 + scaled_size = max(scaled_size, 3) + pdf_canvas.setFont(font_name, scaled_size) + + # Draw the line + pdf_canvas.drawString(pdf_x, line_y, line) + + # Reset font size 
for next line + if text_width > bbox_width: + pdf_canvas.setFont(font_name, font_size) + + logger.debug(f"Drew text element: {text_content[:30]}... ({len(lines)} lines)") + + except Exception as e: + logger.error(f"Failed to draw text element {element.element_id}: {e}") + + def _draw_table_element_direct( + self, + pdf_canvas: canvas.Canvas, + element: 'DocumentElement', + page_height: float + ): + """ + Draw table element with Direct track positioning. + + Args: + pdf_canvas: ReportLab canvas object + element: DocumentElement with table content + page_height: Page height for coordinate transformation + """ + try: + # Get table HTML content + if isinstance(element.content, TableData): + html_content = element.content.to_html() + elif isinstance(element.content, dict): + html_content = element.content.get('html', str(element.content)) + else: + html_content = str(element.content) + + if not html_content: + logger.warning(f"No HTML content for table {element.element_id}") + return + + # Parse HTML + parser = HTMLTableParser() + parser.feed(html_content) + + if not parser.tables or not parser.tables[0]['rows']: + logger.warning(f"No table data parsed for {element.element_id}") + return + + table_data = parser.tables[0] + rows = table_data['rows'] + + # Get bbox + bbox = element.bbox + if not bbox: + logger.warning(f"No bbox for table {element.element_id}") + return + + # Transform coordinates + pdf_x = bbox.x0 + pdf_y = page_height - bbox.y1 # Bottom of table + + table_width = bbox.x1 - bbox.x0 + table_height = bbox.y1 - bbox.y0 + + # Build table data for ReportLab + table_content = [] + for row in rows: + row_data = [cell['text'].strip() for cell in row['cells']] + table_content.append(row_data) + + # Create table + from reportlab.platypus import Table, TableStyle + from reportlab.lib import colors + + t = Table(table_content, colWidths=[table_width / len(table_content[0])] * len(table_content[0])) + + # Apply style + style = TableStyle([ + ('GRID', (0, 0), (-1, -1), 
0.5, colors.grey), + ('FONTSIZE', (0, 0), (-1, -1), 8), + ('ALIGN', (0, 0), (-1, -1), 'LEFT'), + ('VALIGN', (0, 0), (-1, -1), 'TOP'), + ]) + t.setStyle(style) + + # Draw table + t.wrapOn(pdf_canvas, table_width, table_height) + t.drawOn(pdf_canvas, pdf_x, pdf_y) + + logger.debug(f"Drew table element: {len(rows)} rows") + + except Exception as e: + logger.error(f"Failed to draw table element {element.element_id}: {e}") + + def _draw_image_element_direct( + self, + pdf_canvas: canvas.Canvas, + element: 'DocumentElement', + page_height: float, + result_dir: Path + ): + """ + Draw image element with Direct track positioning. + + Args: + pdf_canvas: ReportLab canvas object + element: DocumentElement with image content + page_height: Page height for coordinate transformation + result_dir: Directory containing image files + """ + try: + # Get image path + image_path_str = self._get_image_path(element) + if not image_path_str: + logger.warning(f"No image path for element {element.element_id}") + return + + # Construct full path + image_path = result_dir / image_path_str + + if not image_path.exists(): + logger.warning(f"Image not found: {image_path}") + return + + # Get bbox + bbox = element.bbox + if not bbox: + logger.warning(f"No bbox for image {element.element_id}") + return + + # Transform coordinates + pdf_x = bbox.x0 + pdf_y = page_height - bbox.y1 # Bottom of image + + image_width = bbox.x1 - bbox.x0 + image_height = bbox.y1 - bbox.y0 + + # Draw image + pdf_canvas.drawImage( + str(image_path), + pdf_x, + pdf_y, + width=image_width, + height=image_height, + preserveAspectRatio=True + ) + + logger.debug(f"Drew image: {image_path_str}") + + except Exception as e: + logger.error(f"Failed to draw image element {element.element_id}: {e}") + # Singleton instance pdf_generator_service = PDFGeneratorService() diff --git a/openspec/changes/pdf-layout-restoration/tasks.md b/openspec/changes/pdf-layout-restoration/tasks.md index 82e68d1..ff432e4 100644 --- 
a/openspec/changes/pdf-layout-restoration/tasks.md +++ b/openspec/changes/pdf-layout-restoration/tasks.md @@ -57,16 +57,19 @@ ### 4. Track-Specific Rendering - [x] 4.1 Add track detection in generate_from_unified_document - [x] 4.1.1 Check unified_doc.metadata.processing_track (object and dict support) - - [x] 4.1.2 Store in self.current_processing_track for rendering methods -- [x] 4.2 Apply StyleInfo for Direct track - - [x] 4.2.1 Preserve style information in convert_unified_document_to_ocr_data - - [x] 4.2.2 Apply StyleInfo to text elements in draw_text_region - - [x] 4.2.3 Use precise positioning (existing implementation maintained) - - [x] 4.2.4 Track detection in draw_text_region (is_direct_track check) -- [x] 4.3 Simplified rendering for OCR track - - [x] 4.3.1 Use simple font selection when not Direct track - - [x] 4.3.2 Best-effort positioning (existing implementation) - - [x] 4.3.3 Estimated font sizes (bbox height-based heuristic) + - [x] 4.1.2 Route to _generate_direct_track_pdf or _generate_ocr_track_pdf +- [x] 4.2 Implement _generate_direct_track_pdf + - [x] 4.2.1 Process each page directly from UnifiedDocument (no legacy conversion) + - [x] 4.2.2 Apply StyleInfo to text elements (_draw_text_element_direct) + - [x] 4.2.3 Use precise positioning from element.bbox + - [x] 4.2.4 Preserve line breaks (split on \n, render multi-line) + - [x] 4.2.5 Implement _draw_text_element_direct with line break handling + - [x] 4.2.6 Implement _draw_table_element_direct for tables + - [x] 4.2.7 Implement _draw_image_element_direct for images +- [x] 4.3 Implement _generate_ocr_track_pdf + - [x] 4.3.1 Use legacy OCR data conversion (convert_unified_document_to_ocr_data) + - [x] 4.3.2 Route to existing _generate_pdf_from_data pipeline + - [x] 4.3.3 Maintain backward compatibility with OCR track behavior - [ ] 4.4 Test track-specific rendering - [ ] 4.4.1 Compare Direct track with original - [ ] 4.4.2 Verify OCR track maintains quality