feat: update PDF generator to support UnifiedDocument directly

- Add generate_from_unified_document() method for direct UnifiedDocument processing
- Create convert_unified_document_to_ocr_data() for format conversion
- Extract _generate_pdf_from_data() as reusable core logic
- Support both OCR and DIRECT processing tracks in PDF generation
- Handle coordinate transformations (BoundingBox to polygon format)
- Update OCR service to use appropriate PDF generation method

Completes Section 4 (Unified Processing Pipeline) of dual-track proposal.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-19 08:48:25 +08:00
parent ab89a40e8d
commit ecdce961ca
3 changed files with 341 additions and 138 deletions
--- a/backend/app/services/ocr_service.py
+++ b/backend/app/services/ocr_service.py
@@ -1223,11 +1223,21 @@ class OCRService:

                logger.info(f"Generating layout-preserving PDF: {pdf_filename}")

-                success = pdf_generator_service.generate_layout_pdf(
-                    json_path=json_path,
-                    output_path=pdf_path,
-                    source_file_path=source_file_path
-                )
+                # Use appropriate method based on result type
+                if isinstance(result, UnifiedDocument):
+                    # Use direct UnifiedDocument generation for better accuracy
+                    success = pdf_generator_service.generate_from_unified_document(
+                        unified_doc=result,
+                        output_path=pdf_path,
+                        source_file_path=source_file_path
+                    )
+                else:
+                    # Legacy path: use JSON file
+                    success = pdf_generator_service.generate_layout_pdf(
+                        json_path=json_path,
+                        output_path=pdf_path,
+                        source_file_path=source_file_path
+                    )

                if success:
                    logger.info(f"✓ PDF generated successfully: {pdf_path.name}")