feat: add unified JSON export with standardized schema

- Create JSON Schema definition for UnifiedDocument format
- Implement UnifiedDocumentExporter service with multiple export formats
- Include comprehensive processing metadata and statistics
- Update OCR service to use new exporter for dual-track outputs
- Support JSON, Markdown, Text, and legacy format exports

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-19 08:36:24 +08:00
parent 5bcf3dfd42
commit ab89a40e8d
5 changed files with 999 additions and 21 deletions
--- a/backend/app/services/ocr_service.py
+++ b/backend/app/services/ocr_service.py
@@ -23,6 +23,7 @@ try:
    from app.services.document_type_detector import DocumentTypeDetector, ProcessingTrackRecommendation
    from app.services.direct_extraction_engine import DirectExtractionEngine
    from app.services.ocr_to_unified_converter import OCRToUnifiedConverter
+    from app.services.unified_document_exporter import UnifiedDocumentExporter
    from app.models.unified_document import (
        UnifiedDocument, DocumentMetadata,
        ProcessingTrack, ElementType, DocumentElement, Page, Dimensions,
@@ -30,8 +31,9 @@ try:
    )
    DUAL_TRACK_AVAILABLE = True
 except ImportError as e:
-    logger.warning(f"Dual-track components not available: {e}")
+    logging.getLogger(__name__).warning(f"Dual-track components not available: {e}")
    DUAL_TRACK_AVAILABLE = False
+    UnifiedDocumentExporter = None

 logger = logging.getLogger(__name__)

@@ -1175,26 +1177,39 @@ class OCRService:
        try:
            output_dir.mkdir(parents=True, exist_ok=True)

-            # Convert UnifiedDocument to dict if needed
-            if isinstance(result, UnifiedDocument):
-                result_dict = result.to_dict()
-                legacy_result = result.to_legacy_format()
+            # Use UnifiedDocumentExporter for standardized export
+            if isinstance(result, UnifiedDocument) and UnifiedDocumentExporter is not None:
+                # Use the new exporter for UnifiedDocument
+                json_path = output_dir / f"{file_id}_result.json"
+                UnifiedDocumentExporter.export_to_json(
+                    result,
+                    json_path,
+                    include_metadata=True,
+                    include_statistics=True
+                )
+
+                markdown_path = output_dir / f"{file_id}_output.md"
+                UnifiedDocumentExporter.export_to_markdown(
+                    result,
+                    markdown_path,
+                    include_metadata_header=False  # Keep output clean
+                )
+
                markdown_content = result.extract_all_text()
            else:
-                result_dict = result
-                legacy_result = result
-                markdown_content = result.get('markdown_content', '')
+                # Legacy path for dict results
+                result_dict = result if isinstance(result, dict) else result.to_dict()
+                markdown_content = result.get('markdown_content', '') if isinstance(result, dict) else ''

-            # Save JSON (use dict format for compatibility)
-            json_path = output_dir / f"{file_id}_result.json"
-            with open(json_path, 'w', encoding='utf-8') as f:
-                json.dump(result_dict if isinstance(result, UnifiedDocument) else result,
-                         f, ensure_ascii=False, indent=2)
+                # Save JSON
+                json_path = output_dir / f"{file_id}_result.json"
+                with open(json_path, 'w', encoding='utf-8') as f:
+                    json.dump(result_dict, f, ensure_ascii=False, indent=2)

-            # Save Markdown
-            markdown_path = output_dir / f"{file_id}_output.md"
-            with open(markdown_path, 'w', encoding='utf-8') as f:
-                f.write(markdown_content)
+                # Save Markdown
+                markdown_path = output_dir / f"{file_id}_output.md"
+                with open(markdown_path, 'w', encoding='utf-8') as f:
+                    f.write(markdown_content)

            logger.info(f"Results saved: {json_path.name}, {markdown_path.name}")