feat: add unified JSON export with standardized schema
- Create JSON Schema definition for UnifiedDocument format
- Implement UnifiedDocumentExporter service with multiple export formats
- Include comprehensive processing metadata and statistics
- Update OCR service to use new exporter for dual-track outputs
- Support JSON, Markdown, Text, and legacy format exports

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
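A minimal usage sketch of the new exporter (the import path, the `doc` variable, and the output locations below are illustrative; `doc` stands for an already-built `UnifiedDocument` from either processing track):

```python
from pathlib import Path

# Hypothetical import path; adjust to the project's package layout
from app.services.unified_document_exporter import save_unified_document

# doc: a UnifiedDocument produced by the OCR or direct-extraction pipeline
paths = save_unified_document(
    doc,
    output_dir=Path("output/exports"),  # illustrative output directory
    file_id="sample",                   # base filename for generated files
    formats=["json", "markdown", "text"],
)
# e.g. {'json': PosixPath('output/exports/sample_result.json'), ...}
print(paths)
```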
This commit is contained in:
backend/app/services/unified_document_exporter.py (506 additions, normal file)
@@ -0,0 +1,506 @@
"""
Unified Document Exporter Service

Provides standardized export functionality for UnifiedDocument,
supporting both OCR and Direct extraction track outputs with
comprehensive processing metadata.
"""

import json
import logging
from pathlib import Path
from typing import Dict, Any, Optional, Tuple, Union
from datetime import datetime

from ..models.unified_document import (
    UnifiedDocument,
    ProcessingTrack,
    ElementType
)

logger = logging.getLogger(__name__)


class ExportFormat:
    """Supported export formats"""
    JSON = "json"
    JSON_MINIMAL = "json_minimal"
    JSON_LEGACY = "json_legacy"
    MARKDOWN = "markdown"
    TEXT = "text"
    HTML = "html"


class UnifiedDocumentExporter:
    """
    Exporter service for UnifiedDocument with comprehensive metadata support.

    Supports both OCR and DIRECT processing tracks with consistent output format.
    """

    # Schema version for tracking format changes
    SCHEMA_VERSION = "1.0.0"

    @staticmethod
    def export_to_json(
        document: UnifiedDocument,
        output_path: Optional[Path] = None,
        include_metadata: bool = True,
        include_statistics: bool = True,
        include_binary_content: bool = False,
        pretty_print: bool = True
    ) -> Union[str, Path]:
        """
        Export UnifiedDocument to standardized JSON format.

        Args:
            document: The UnifiedDocument to export
            output_path: Optional path to save the JSON file
            include_metadata: Include processing metadata
            include_statistics: Include document statistics
            include_binary_content: Include base64-encoded binary content
            pretty_print: Format JSON with indentation

        Returns:
            JSON string if no output_path, otherwise the output Path
        """
        export_data = UnifiedDocumentExporter._build_export_data(
            document,
            include_metadata=include_metadata,
            include_statistics=include_statistics,
            include_binary_content=include_binary_content
        )

        json_str = json.dumps(
            export_data,
            ensure_ascii=False,
            indent=2 if pretty_print else None,
            default=str
        )

        if output_path:
            output_path = Path(output_path)
            output_path.parent.mkdir(parents=True, exist_ok=True)
            output_path.write_text(json_str, encoding='utf-8')
            logger.info(f"Exported JSON to: {output_path}")
            return output_path

        return json_str
    @staticmethod
    def export_to_markdown(
        document: UnifiedDocument,
        output_path: Optional[Path] = None,
        include_metadata_header: bool = True,
        include_page_breaks: bool = True
    ) -> Union[str, Path]:
        """
        Export UnifiedDocument to Markdown format.

        Args:
            document: The UnifiedDocument to export
            output_path: Optional path to save the Markdown file
            include_metadata_header: Include document metadata as header
            include_page_breaks: Include page break markers

        Returns:
            Markdown string if no output_path, otherwise the output Path
        """
        lines = []

        # Add metadata header
        if include_metadata_header:
            lines.append(f"# {document.metadata.filename}")
            lines.append("")
            lines.append("## Document Info")
            lines.append(f"- **Processing Track**: {document.metadata.processing_track.value}")
            lines.append(f"- **Processing Time**: {document.metadata.processing_time:.2f}s")
            lines.append(f"- **Pages**: {document.page_count}")
            lines.append(f"- **Total Elements**: {document.total_elements}")
            if document.metadata.language:
                lines.append(f"- **Language**: {document.metadata.language}")
            lines.append("")
            lines.append("---")
            lines.append("")

        # Export each page
        for page in document.pages:
            if include_page_breaks and page.page_number > 1:
                lines.append("")
                lines.append("---")
                lines.append("")

            lines.append(f"## Page {page.page_number}")
            lines.append("")

            # Get elements in reading order
            for element in page.get_reading_order():
                content = UnifiedDocumentExporter._element_to_markdown(element)
                if content:
                    lines.append(content)
                    lines.append("")

        md_content = "\n".join(lines)

        if output_path:
            output_path = Path(output_path)
            output_path.parent.mkdir(parents=True, exist_ok=True)
            output_path.write_text(md_content, encoding='utf-8')
            logger.info(f"Exported Markdown to: {output_path}")
            return output_path

        return md_content
    @staticmethod
    def export_to_text(
        document: UnifiedDocument,
        output_path: Optional[Path] = None,
        page_separator: str = "\n\n--- Page Break ---\n\n"
    ) -> Union[str, Path]:
        """
        Export UnifiedDocument to plain text format.

        Args:
            document: The UnifiedDocument to export
            output_path: Optional path to save the text file
            page_separator: Separator between pages

        Returns:
            Text string if no output_path, otherwise the output Path
        """
        pages_text = []

        for page in document.pages:
            page_text = page.extract_text()
            if page_text:
                pages_text.append(page_text)

        text_content = page_separator.join(pages_text)

        if output_path:
            output_path = Path(output_path)
            output_path.parent.mkdir(parents=True, exist_ok=True)
            output_path.write_text(text_content, encoding='utf-8')
            logger.info(f"Exported text to: {output_path}")
            return output_path

        return text_content
    @staticmethod
    def export_to_legacy_json(
        document: UnifiedDocument,
        output_path: Optional[Path] = None
    ) -> Union[str, Path]:
        """
        Export UnifiedDocument to legacy JSON format for backward compatibility.

        Args:
            document: The UnifiedDocument to export
            output_path: Optional path to save the JSON file

        Returns:
            JSON string if no output_path, otherwise the output Path
        """
        legacy_data = document.to_legacy_format()

        json_str = json.dumps(
            legacy_data,
            ensure_ascii=False,
            indent=2,
            default=str
        )

        if output_path:
            output_path = Path(output_path)
            output_path.parent.mkdir(parents=True, exist_ok=True)
            output_path.write_text(json_str, encoding='utf-8')
            logger.info(f"Exported legacy JSON to: {output_path}")
            return output_path

        return json_str
    @staticmethod
    def export_all_formats(
        document: UnifiedDocument,
        output_dir: Path,
        file_id: str
    ) -> Dict[str, Optional[Path]]:
        """
        Export UnifiedDocument to all standard formats.

        Args:
            document: The UnifiedDocument to export
            output_dir: Directory to save output files
            file_id: Base filename for outputs

        Returns:
            Dictionary mapping format names to output paths
        """
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

        results = {}

        # Export JSON
        try:
            json_path = output_dir / f"{file_id}_result.json"
            UnifiedDocumentExporter.export_to_json(
                document,
                json_path,
                include_metadata=True,
                include_statistics=True
            )
            results['json'] = json_path
        except Exception as e:
            logger.error(f"Failed to export JSON: {e}")
            results['json'] = None

        # Export Markdown
        try:
            md_path = output_dir / f"{file_id}_output.md"
            UnifiedDocumentExporter.export_to_markdown(document, md_path)
            results['markdown'] = md_path
        except Exception as e:
            logger.error(f"Failed to export Markdown: {e}")
            results['markdown'] = None

        # Export plain text
        try:
            txt_path = output_dir / f"{file_id}_text.txt"
            UnifiedDocumentExporter.export_to_text(document, txt_path)
            results['text'] = txt_path
        except Exception as e:
            logger.error(f"Failed to export text: {e}")
            results['text'] = None

        return results
    @staticmethod
    def _build_export_data(
        document: UnifiedDocument,
        include_metadata: bool = True,
        include_statistics: bool = True,
        include_binary_content: bool = False
    ) -> Dict[str, Any]:
        """
        Build the export data structure with processing metadata.

        Supports both OCR and DIRECT track outputs with consistent format.
        """
        # Base document data
        export_data = {
            "schema_version": UnifiedDocumentExporter.SCHEMA_VERSION,
            "document_id": document.document_id,
            "export_timestamp": datetime.utcnow().isoformat() + "Z"
        }

        # Add metadata
        if include_metadata:
            export_data["metadata"] = document.metadata.to_dict()

            # Add extended processing metadata
            export_data["metadata"]["processing_info"] = {
                "track_description": UnifiedDocumentExporter._get_track_description(
                    document.metadata.processing_track
                ),
                "schema_version": UnifiedDocumentExporter.SCHEMA_VERSION,
                "export_format": "unified_document_v1"
            }

        # Add pages
        export_data["pages"] = [page.to_dict() for page in document.pages]

        # Add statistics
        if include_statistics:
            export_data["statistics"] = {
                "page_count": document.page_count,
                "total_elements": document.total_elements,
                "total_tables": len(document.get_all_tables()),
                "total_images": len(document.get_all_images()),
                "element_type_counts": UnifiedDocumentExporter._count_element_types(document),
                "text_stats": UnifiedDocumentExporter._calculate_text_stats(document)
            }

        # Add processing errors if any
        if document.processing_errors:
            export_data["processing_errors"] = document.processing_errors

        return export_data
    @staticmethod
    def _get_track_description(track: ProcessingTrack) -> str:
        """Get human-readable description for processing track."""
        descriptions = {
            ProcessingTrack.OCR: "PaddleOCR PP-StructureV3 - Used for scanned documents and images",
            ProcessingTrack.DIRECT: "PyMuPDF Direct Extraction - Used for editable PDFs with embedded text",
            ProcessingTrack.HYBRID: "Hybrid Processing - Combined OCR and direct extraction"
        }
        return descriptions.get(track, "Unknown processing track")

    @staticmethod
    def _count_element_types(document: UnifiedDocument) -> Dict[str, int]:
        """Count occurrences of each element type in the document."""
        counts = {}
        for page in document.pages:
            for element in page.elements:
                type_name = element.type.value
                counts[type_name] = counts.get(type_name, 0) + 1
        return counts
    @staticmethod
    def _calculate_text_stats(document: UnifiedDocument) -> Dict[str, Any]:
        """Calculate text statistics for the document."""
        full_text = document.extract_all_text()
        words = full_text.split()
        chars = len(full_text)

        # Calculate average confidence
        confidences = []
        for page in document.pages:
            for element in page.elements:
                if element.confidence is not None:
                    confidences.append(element.confidence)

        avg_confidence = sum(confidences) / len(confidences) if confidences else None

        return {
            "total_characters": chars,
            "total_words": len(words),
            # Check against None explicitly so a legitimate 0.0 average is not dropped
            "average_confidence": round(avg_confidence, 4) if avg_confidence is not None else None
        }
    @staticmethod
    def _element_to_markdown(element) -> str:
        """Convert a document element to Markdown format."""
        content = element.get_text()

        if not content and element.type not in [ElementType.TABLE, ElementType.IMAGE]:
            return ""

        # Format based on element type
        if element.type == ElementType.TITLE:
            return f"# {content}"
        elif element.type == ElementType.HEADER:
            return f"### {content}"
        elif element.type in [ElementType.TEXT, ElementType.PARAGRAPH, ElementType.BODY]:
            return content
        elif element.type == ElementType.LIST_ITEM:
            return f"- {content}"
        elif element.type == ElementType.TABLE:
            # Use HTML table if available
            if hasattr(element.content, 'to_html'):
                return element.content.to_html()
            return f"[Table: {content}]"
        elif element.type == ElementType.IMAGE:
            # Image content may be a path/URI or a description; emit a markdown image reference
            return f"![Image]({content or 'image'})"
        elif element.type == ElementType.FIGURE:
            return f"[Figure: {content or 'No caption'}]"
        elif element.type == ElementType.CODE:
            return f"```\n{content}\n```"
        elif element.type == ElementType.EQUATION:
            return f"$${content}$$"
        elif element.type == ElementType.CAPTION:
            return f"*{content}*"
        elif element.type == ElementType.FOOTNOTE:
            return f"[^{content}]"
        elif element.type == ElementType.REFERENCE:
            return f"> {content}"
        else:
            return content if content else ""
class JSONSchemaValidator:
    """
    Validator for UnifiedDocument JSON exports.

    Uses the JSON Schema definition to validate exported data.
    """

    _schema = None

    @classmethod
    def get_schema(cls) -> Dict[str, Any]:
        """Load and return the JSON Schema for UnifiedDocument."""
        if cls._schema is None:
            schema_path = Path(__file__).parent.parent / "schemas" / "unified_document_schema.json"
            if schema_path.exists():
                cls._schema = json.loads(schema_path.read_text(encoding='utf-8'))
            else:
                logger.warning(f"Schema file not found: {schema_path}")
                cls._schema = {}
        return cls._schema

    @classmethod
    def validate(cls, data: Dict[str, Any]) -> Tuple[bool, Optional[str]]:
        """
        Validate JSON data against the UnifiedDocument schema.

        Args:
            data: The JSON data to validate

        Returns:
            Tuple of (is_valid, error_message)
        """
        try:
            import jsonschema
            schema = cls.get_schema()
            if not schema:
                return True, None  # Skip validation if schema not available

            jsonschema.validate(data, schema)
            return True, None
        except ImportError:
            logger.warning("jsonschema package not installed, skipping validation")
            return True, None
        except Exception as e:
            return False, str(e)
def save_unified_document(
    document: UnifiedDocument,
    output_dir: Path,
    file_id: str,
    formats: Optional[list] = None
) -> Dict[str, Optional[Path]]:
    """
    Convenience function to save UnifiedDocument to multiple formats.

    Args:
        document: The UnifiedDocument to save
        output_dir: Output directory
        file_id: Base filename
        formats: List of formats to export (default: ['json', 'markdown'])

    Returns:
        Dictionary mapping format names to output paths
    """
    if formats is None:
        formats = ['json', 'markdown']

    results = {}
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    for fmt in formats:
        try:
            if fmt == 'json':
                path = output_dir / f"{file_id}_result.json"
                UnifiedDocumentExporter.export_to_json(document, path)
                results['json'] = path
            elif fmt == 'markdown':
                path = output_dir / f"{file_id}_output.md"
                UnifiedDocumentExporter.export_to_markdown(document, path)
                results['markdown'] = path
            elif fmt == 'text':
                path = output_dir / f"{file_id}_text.txt"
                UnifiedDocumentExporter.export_to_text(document, path)
                results['text'] = path
            elif fmt == 'legacy':
                path = output_dir / f"{file_id}_legacy.json"
                UnifiedDocumentExporter.export_to_legacy_json(document, path)
                results['legacy'] = path
        except Exception as e:
            logger.error(f"Failed to export {fmt}: {e}")
            results[fmt] = None

    return results
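For completeness, a sketch of validating an in-memory export against the bundled schema (assumes the optional `jsonschema` dependency is installed and that `doc` is a `UnifiedDocument`; the import path is illustrative):

```python
import json

# Hypothetical import path; adjust to the project's package layout
from app.services.unified_document_exporter import (
    UnifiedDocumentExporter,
    JSONSchemaValidator,
)

# Export in memory (no output_path) and check it against unified_document_schema.json
json_str = UnifiedDocumentExporter.export_to_json(doc, pretty_print=False)
is_valid, error = JSONSchemaValidator.validate(json.loads(json_str))
if not is_valid:
    print(f"Export failed schema validation: {error}")
```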