From ecdce961cad1d1b16bff0f4df038f71097c5ff4b Mon Sep 17 00:00:00 2001 From: egg Date: Wed, 19 Nov 2025 08:48:25 +0800 Subject: [PATCH] feat: update PDF generator to support UnifiedDocument directly MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add generate_from_unified_document() method for direct UnifiedDocument processing - Create convert_unified_document_to_ocr_data() for format conversion - Extract _generate_pdf_from_data() as reusable core logic - Support both OCR and DIRECT processing tracks in PDF generation - Handle coordinate transformations (BoundingBox to polygon format) - Update OCR service to use appropriate PDF generation method Completes Section 4 (Unified Processing Pipeline) of dual-track proposal. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- backend/app/services/ocr_service.py | 20 +- backend/app/services/pdf_generator_service.py | 451 +++++++++++++----- .../dual-track-document-processing/tasks.md | 8 +- 3 files changed, 341 insertions(+), 138 deletions(-) diff --git a/backend/app/services/ocr_service.py b/backend/app/services/ocr_service.py index 7a92752..b861a57 100644 --- a/backend/app/services/ocr_service.py +++ b/backend/app/services/ocr_service.py @@ -1223,11 +1223,21 @@ class OCRService: logger.info(f"Generating layout-preserving PDF: {pdf_filename}") - success = pdf_generator_service.generate_layout_pdf( - json_path=json_path, - output_path=pdf_path, - source_file_path=source_file_path - ) + # Use appropriate method based on result type + if isinstance(result, UnifiedDocument): + # Use direct UnifiedDocument generation for better accuracy + success = pdf_generator_service.generate_from_unified_document( + unified_doc=result, + output_path=pdf_path, + source_file_path=source_file_path + ) + else: + # Legacy path: use JSON file + success = pdf_generator_service.generate_layout_pdf( + json_path=json_path, + output_path=pdf_path, + source_file_path=source_file_path + ) if success: logger.info(f"✓ PDF generated successfully: {pdf_path.name}") diff --git a/backend/app/services/pdf_generator_service.py b/backend/app/services/pdf_generator_service.py index b76bf1f..a029023 100644 --- a/backend/app/services/pdf_generator_service.py +++ b/backend/app/services/pdf_generator_service.py @@ -24,6 +24,17 @@ from html.parser import HTMLParser from app.core.config import settings +# Import UnifiedDocument for dual-track support +try: + from app.models.unified_document import ( + UnifiedDocument, DocumentElement, ElementType, + BoundingBox, TableData, ProcessingTrack + ) + UNIFIED_DOCUMENT_AVAILABLE = True +except ImportError: + UNIFIED_DOCUMENT_AVAILABLE = False + UnifiedDocument = None + logger = logging.getLogger(__name__) @@ -138,6 +149,310 @@ class PDFGeneratorService: logger.error(f"Failed to load JSON {json_path}: {e}") return None + def convert_unified_document_to_ocr_data(self, unified_doc: 'UnifiedDocument') -> Dict: + """ + Convert UnifiedDocument to OCR data format for PDF generation. + + This method transforms the UnifiedDocument structure into the legacy + OCR data format that the PDF generator expects, supporting both + OCR and DIRECT processing tracks. + + Args: + unified_doc: UnifiedDocument object from either processing track + + Returns: + Dictionary in OCR data format with text_regions, images_metadata, layout_data + """ + text_regions = [] + images_metadata = [] + layout_elements = [] + + for page in unified_doc.pages: + page_num = page.page_number # 1-based + + for element in page.elements: + # Convert BoundingBox to polygon format [[x,y], [x,y], [x,y], [x,y]] + bbox_polygon = [ + [element.bbox.x0, element.bbox.y0], # top-left + [element.bbox.x1, element.bbox.y0], # top-right + [element.bbox.x1, element.bbox.y1], # bottom-right + [element.bbox.x0, element.bbox.y1], # bottom-left + ] + + # Handle text elements + if element.is_text or element.type in [ + ElementType.TEXT, ElementType.TITLE, ElementType.HEADER, + ElementType.FOOTER, ElementType.PARAGRAPH, ElementType.CAPTION, + ElementType.LIST_ITEM, ElementType.FOOTNOTE, ElementType.REFERENCE + ]: + text_content = element.get_text() + if text_content: + text_regions.append({ + 'text': text_content, + 'bbox': bbox_polygon, + 'confidence': element.confidence or 1.0, + 'page': page_num + }) + + # Handle table elements + elif element.type == ElementType.TABLE: + # Convert TableData to HTML for layout_data + if isinstance(element.content, TableData): + html_content = element.content.to_html() + elif isinstance(element.content, dict): + html_content = element.content.get('html', str(element.content)) + else: + html_content = str(element.content) + + layout_elements.append({ + 'type': 'table', + 'content': html_content, + 'bbox': [element.bbox.x0, element.bbox.y0, + element.bbox.x1, element.bbox.y1], + 'page': page_num - 1 # layout uses 0-based + }) + + # Also add to images_metadata for overlap filtering + # Tables are often rendered as images + table_id = element.element_id or f"table_{page_num}_{len(images_metadata)}" + images_metadata.append({ + 'image_path': f"table_{table_id}.png", + 'bbox': bbox_polygon, + 'page': page_num - 1, # 0-based for images_metadata + 'type': 'table' + }) + + # Handle image/visual elements + elif element.is_visual or element.type in [ + ElementType.IMAGE, ElementType.FIGURE, ElementType.CHART, + ElementType.DIAGRAM, ElementType.LOGO + ]: + # Get image path from content or metadata + if isinstance(element.content, dict): + image_path = element.content.get('path', '') + else: + image_path = element.metadata.get('path', f"image_{element.element_id}.png") + + images_metadata.append({ + 'image_path': image_path, + 'bbox': bbox_polygon, + 'page': page_num - 1, # 0-based + 'type': element.type.value + }) + + # Build OCR data structure + ocr_data = { + 'text_regions': text_regions, + 'images_metadata': images_metadata, + 'layout_data': { + 'elements': layout_elements, + 'total_elements': len(layout_elements) + }, + 'total_pages': unified_doc.page_count, + 'ocr_dimensions': { + 'width': unified_doc.pages[0].dimensions.width if unified_doc.pages else 0, + 'height': unified_doc.pages[0].dimensions.height if unified_doc.pages else 0 + }, + # Metadata for tracking + '_from_unified_document': True, + '_processing_track': unified_doc.metadata.processing_track.value + } + + logger.info(f"Converted UnifiedDocument to OCR data: " + f"{len(text_regions)} text regions, " + f"{len(images_metadata)} images, " + f"{len(layout_elements)} layout elements, " + f"track={unified_doc.metadata.processing_track.value}") + + return ocr_data + + def generate_from_unified_document( + self, + unified_doc: 'UnifiedDocument', + output_path: Path, + source_file_path: Optional[Path] = None + ) -> bool: + """ + Generate layout-preserving PDF directly from UnifiedDocument. + + This method supports both OCR and DIRECT processing tracks, + preserving layout and coordinate information from either source. + + Args: + unified_doc: UnifiedDocument object + output_path: Path to save generated PDF + source_file_path: Optional path to original source file + + Returns: + True if successful, False otherwise + """ + if not UNIFIED_DOCUMENT_AVAILABLE: + logger.error("UnifiedDocument support not available") + return False + + try: + # Convert UnifiedDocument to OCR data format + ocr_data = self.convert_unified_document_to_ocr_data(unified_doc) + + # Use internal generation with pre-loaded data + return self._generate_pdf_from_data( + ocr_data=ocr_data, + output_path=output_path, + source_file_path=source_file_path + ) + + except Exception as e: + logger.error(f"Failed to generate PDF from UnifiedDocument: {e}") + import traceback + traceback.print_exc() + return False + + def _generate_pdf_from_data( + self, + ocr_data: Dict, + output_path: Path, + source_file_path: Optional[Path] = None, + json_parent_dir: Optional[Path] = None + ) -> bool: + """ + Internal method to generate PDF from OCR data dictionary. + + This is the core generation logic extracted for reuse by both + JSON-based and UnifiedDocument-based generation paths. + + Args: + ocr_data: OCR data dictionary + output_path: Path to save generated PDF + source_file_path: Optional path to original source file + json_parent_dir: Directory containing images (for JSON-based generation) + + Returns: + True if successful, False otherwise + """ + try: + # Check if PDF already exists (caching) + if output_path.exists(): + logger.info(f"PDF already exists: {output_path.name}") + return True + + # Get text regions + text_regions = ocr_data.get('text_regions', []) + if not text_regions: + logger.warning("No text regions found in data") + # Don't fail - might have only tables/images + + # Get images metadata + images_metadata = ocr_data.get('images_metadata', []) + + # Get layout data + layout_data = ocr_data.get('layout_data', {}) + + # Step 1: Get OCR processing dimensions + ocr_width, ocr_height = self.calculate_page_dimensions(ocr_data, source_file_path=None) + logger.info(f"OCR 處理時使用的座標系尺寸: {ocr_width:.1f} x {ocr_height:.1f}") + + # Step 2: Get target PDF dimensions + if source_file_path: + target_dims = self.get_original_page_size(source_file_path) + if target_dims: + target_width, target_height = target_dims + logger.info(f"目標 PDF 尺寸(來自原始文件): {target_width:.1f} x {target_height:.1f}") + else: + target_width, target_height = ocr_width, ocr_height + logger.warning(f"無法獲取原始文件尺寸,使用 OCR 尺寸作為目標") + else: + target_width, target_height = ocr_width, ocr_height + logger.info(f"無原始文件,使用 OCR 尺寸作為目標: {target_width:.1f} x {target_height:.1f}") + + # Step 3: Calculate scale factors + scale_w = target_width / ocr_width if ocr_width > 0 else 1.0 + scale_h = target_height / ocr_height if ocr_height > 0 else 1.0 + logger.info(f"縮放因子: X={scale_w:.3f}, Y={scale_h:.3f}") + + # Create PDF canvas + pdf_canvas = canvas.Canvas(str(output_path), pagesize=(target_width, target_height)) + + # Filter text regions to avoid overlap with tables/images + regions_to_avoid = images_metadata + table_count = len([img for img in images_metadata if 'table' in img.get('image_path', '').lower()]) + + logger.info(f"過濾文字區域: {len(regions_to_avoid)} 個區域需要避免") + + filtered_text_regions = self._filter_text_in_regions(text_regions, regions_to_avoid) + + # Group regions by page + pages_data = {} + for region in filtered_text_regions: + page_num = region.get('page', 1) + if page_num not in pages_data: + pages_data[page_num] = [] + pages_data[page_num].append(region) + + # Get table elements from layout_data + table_elements = [] + if layout_data and layout_data.get('elements'): + table_elements = [e for e in layout_data['elements'] if e.get('type') == 'table'] + + # Process each page + total_pages = ocr_data.get('total_pages', 1) + logger.info(f"開始處理 {total_pages} 頁 PDF") + + # Determine image directory + if json_parent_dir is None: + json_parent_dir = output_path.parent + + for page_num in range(1, total_pages + 1): + logger.info(f">>> 處理第 {page_num}/{total_pages} 頁") + if page_num > 1: + pdf_canvas.showPage() + + # Get regions for this page + page_text_regions = pages_data.get(page_num, []) + page_table_regions = [t for t in table_elements if t.get('page') == page_num - 1] + page_image_regions = [ + img for img in images_metadata + if img.get('page') == page_num - 1 + and 'table' not in img.get('image_path', '').lower() + ] + + # Draw in layers: images → tables → text + + # 1. Draw images (bottom layer) + for img_meta in page_image_regions: + self.draw_image_region( + pdf_canvas, img_meta, target_height, + json_parent_dir, scale_w, scale_h + ) + + # 2. Draw tables (middle layer) + for table_elem in page_table_regions: + self.draw_table_region( + pdf_canvas, table_elem, images_metadata, + target_height, scale_w, scale_h + ) + + # 3. Draw text (top layer) + for region in page_text_regions: + self.draw_text_region( + pdf_canvas, region, target_height, + scale_w, scale_h + ) + + logger.info(f"<<< 第 {page_num} 頁完成") + + # Save PDF + pdf_canvas.save() + + file_size = output_path.stat().st_size + logger.info(f"Generated PDF: {output_path.name} ({file_size} bytes)") + return True + + except Exception as e: + logger.error(f"Failed to generate PDF: {e}") + import traceback + traceback.print_exc() + return False + def calculate_page_dimensions(self, ocr_data: Dict, source_file_path: Optional[Path] = None) -> Tuple[float, float]: """ 從 OCR JSON 數據中推斷 OCR 處理時的實際頁面尺寸。 @@ -717,140 +1032,18 @@ class PDFGeneratorService: True if successful, False otherwise """ try: - # Check if PDF already exists (caching) - if output_path.exists(): - logger.info(f"PDF already exists: {output_path.name}") - return True - # Load JSON data ocr_data = self.load_ocr_json(json_path) if not ocr_data: return False - # Get text regions - text_regions = ocr_data.get('text_regions', []) - if not text_regions: - logger.warning("No text regions found in JSON") - return False - - # Get images metadata - images_metadata = ocr_data.get('images_metadata', []) - - # Get layout data - layout_data = ocr_data.get('layout_data', {}) - - # Step 1: Get OCR processing dimensions (the large image OCR actually used) - # This comes from analyzing all bbox coordinates in the OCR data - ocr_width, ocr_height = self.calculate_page_dimensions(ocr_data, source_file_path=None) - logger.info(f"OCR 處理時使用的座標系尺寸: {ocr_width:.1f} x {ocr_height:.1f}") - - # Step 2: Get target PDF dimensions (usually the original file size) - # This is what we want the final PDF size to be - if source_file_path: - target_dims = self.get_original_page_size(source_file_path) - if target_dims: - target_width, target_height = target_dims - logger.info(f"目標 PDF 尺寸(來自原始文件): {target_width:.1f} x {target_height:.1f}") - else: - # If we can't get original size, use OCR dimensions as target - target_width, target_height = ocr_width, ocr_height - logger.warning(f"無法獲取原始文件尺寸,使用 OCR 尺寸作為目標: {target_width:.1f} x {target_height:.1f}") - else: - # No source file, use OCR dimensions as target (1:1 mapping) - target_width, target_height = ocr_width, ocr_height - logger.info(f"無原始文件,使用 OCR 尺寸作為目標: {target_width:.1f} x {target_height:.1f}") - - # Step 3: Calculate scale factors to convert OCR coordinates to PDF coordinates - scale_w = target_width / ocr_width - scale_h = target_height / ocr_height - logger.info(f"縮放因子: X={scale_w:.3f}, Y={scale_h:.3f} (OCR座標 → PDF座標)") - - # Create PDF canvas with target dimensions - pdf_canvas = canvas.Canvas(str(output_path), pagesize=(target_width, target_height)) - - # *** 關鍵修復:收集所有需要避免的區域(表格 + 圖片)*** - # 注意:OCR JSON 中沒有 'tables' 和 'image_regions' 頂層欄位 - # 重要發現: - # - layout_data.elements 中的表格元素沒有 bbox(都是空列表) - # - images_metadata 包含所有表格和圖片,並且有正確的 bbox - # - 因此,只需使用 images_metadata 來過濾文字即可 - - # 使用 images_metadata 作為要避免的區域(包含表格圖片和其他圖片) - regions_to_avoid = images_metadata - - table_count = len([img for img in images_metadata if 'table' in img.get('image_path', '').lower()]) - other_count = len(images_metadata) - table_count - - logger.info(f"使用 images_metadata 過濾文字區域:") - logger.info(f" - 表格圖片: {table_count}") - logger.info(f" - 其他圖片: {other_count}") - logger.info(f" - 總計需要避免的區域: {len(regions_to_avoid)}") - - # 使用新的過濾函式過濾文字區域 - filtered_text_regions = self._filter_text_in_regions(text_regions, regions_to_avoid) - - # Group regions by page - pages_data = {} - for region in filtered_text_regions: - page_num = region.get('page', 1) - if page_num not in pages_data: - pages_data[page_num] = [] - pages_data[page_num].append(region) - - # Get table elements from layout_data - table_elements = [] - if layout_data and layout_data.get('elements'): - table_elements = [e for e in layout_data['elements'] if e.get('type') == 'table'] - - # Process each page - total_pages = ocr_data.get('total_pages', 1) - logger.info(f"=" * 70) - logger.info(f"開始處理 {total_pages} 頁 PDF") - logger.info(f"=" * 70) - - for page_num in range(1, total_pages + 1): - logger.info(f"\n>>> 處理第 {page_num}/{total_pages} 頁") - if page_num > 1: - pdf_canvas.showPage() # Start new page - - # Get filtered regions for this page - page_text_regions = pages_data.get(page_num, []) - page_table_regions = [t for t in table_elements if t.get('page') == page_num - 1] - page_image_regions = [img for img in images_metadata if img.get('page') == page_num - 1 and 'table' not in img.get('image_path', '').lower()] - - # 繪製順序:圖片(底層) → 表格(中間層) → 文字(最上層) - - # 1. Draw images first (bottom layer) - logger.info(f"第 {page_num} 頁: 繪製 {len(page_image_regions)} 個圖片") - for img_meta in page_image_regions: - self.draw_image_region( - pdf_canvas, - img_meta, - target_height, - json_path.parent, - scale_w, - scale_h - ) - - # 2. Draw tables (middle layer) - logger.info(f"第 {page_num} 頁: 繪製 {len(page_table_regions)} 個表格") - for table_elem in page_table_regions: - self.draw_table_region(pdf_canvas, table_elem, images_metadata, target_height, scale_w, scale_h) - - # 3. Draw text regions last (top layer) - excluding table text - logger.info(f"第 {page_num} 頁: 繪製 {len(page_text_regions)} 個文字區域") - for i, region in enumerate(page_text_regions, 1): - logger.debug(f" 文字 {i}/{len(page_text_regions)}") - self.draw_text_region(pdf_canvas, region, target_height, scale_w, scale_h) - - logger.info(f"<<< 第 {page_num} 頁完成") - - # Save PDF - pdf_canvas.save() - - file_size = output_path.stat().st_size - logger.info(f"Generated layout-preserving PDF: {output_path.name} ({file_size} bytes)") - return True + # Use internal generation with pre-loaded data + return self._generate_pdf_from_data( + ocr_data=ocr_data, + output_path=output_path, + source_file_path=source_file_path, + json_parent_dir=json_path.parent + ) except Exception as e: logger.error(f"Failed to generate PDF: {e}") diff --git a/openspec/changes/dual-track-document-processing/tasks.md b/openspec/changes/dual-track-document-processing/tasks.md index 338437f..e45cd02 100644 --- a/openspec/changes/dual-track-document-processing/tasks.md +++ b/openspec/changes/dual-track-document-processing/tasks.md @@ -63,10 +63,10 @@ - [x] 4.2.1 Define standardized JSON schema - [x] 4.2.2 Include processing metadata - [x] 4.2.3 Support both track outputs -- [ ] 4.3 Update PDF generator for UnifiedDocument - - [ ] 4.3.1 Adapt PDF generation to use UnifiedDocument - - [ ] 4.3.2 Preserve layout from both tracks - - [ ] 4.3.3 Handle coordinate transformations +- [x] 4.3 Update PDF generator for UnifiedDocument + - [x] 4.3.1 Adapt PDF generation to use UnifiedDocument + - [x] 4.3.2 Preserve layout from both tracks + - [x] 4.3.3 Handle coordinate transformations ## 5. Translation System Foundation - [ ] 5.1 Create TranslationEngine interface