feat: add unified JSON export with standardized schema
- Create JSON Schema definition for UnifiedDocument format
- Implement UnifiedDocumentExporter service with multiple export formats
- Include comprehensive processing metadata and statistics
- Update OCR service to use new exporter for dual-track outputs
- Support JSON, Markdown, Text, and legacy format exports

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
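For orientation, a minimal usage sketch of the exporter this commit introduces. The `doc` variable stands in for a UnifiedDocument produced by either processing track and is assumed rather than constructed here; output paths are illustrative. The calls mirror the `UnifiedDocumentExporter` API defined in the new service file below.

```python
from pathlib import Path

from app.services.unified_document_exporter import UnifiedDocumentExporter

# `doc` is an assumed UnifiedDocument instance from either the OCR
# or the Direct extraction track.
json_str = UnifiedDocumentExporter.export_to_json(doc, pretty_print=True)       # returns a JSON string
json_path = UnifiedDocumentExporter.export_to_json(doc, Path("out/doc.json"))   # writes a file, returns the Path
md_path = UnifiedDocumentExporter.export_to_markdown(doc, Path("out/doc.md"))
```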
backend/app/schemas/unified_document_schema.json (new file, 443 lines)

@@ -0,0 +1,443 @@
{
  "$schema": "http://json-schema.org/draft-07/schema#",
  "$id": "https://tool-ocr.local/schemas/unified-document.json",
  "title": "UnifiedDocument",
  "description": "Unified document representation for dual-track OCR/Direct extraction processing",
  "type": "object",
  "required": ["document_id", "metadata", "pages", "statistics"],
  "properties": {
    "document_id": {
      "type": "string",
      "description": "Unique identifier for the document"
    },
    "metadata": {
      "$ref": "#/definitions/DocumentMetadata"
    },
    "pages": {
      "type": "array",
      "items": {
        "$ref": "#/definitions/Page"
      },
      "description": "List of pages in the document"
    },
    "statistics": {
      "$ref": "#/definitions/DocumentStatistics"
    },
    "processing_errors": {
      "type": "array",
      "items": {
        "$ref": "#/definitions/ProcessingError"
      },
      "default": [],
      "description": "List of any errors encountered during processing"
    }
  },
  "definitions": {
    "DocumentMetadata": {
      "type": "object",
      "required": ["filename", "file_type", "file_size", "created_at", "processing_track", "processing_time"],
      "properties": {
        "filename": {
          "type": "string",
          "description": "Original filename"
        },
        "file_type": {
          "type": "string",
          "description": "File MIME type or extension"
        },
        "file_size": {
          "type": "integer",
          "minimum": 0,
          "description": "File size in bytes"
        },
        "created_at": {
          "type": "string",
          "format": "date-time",
          "description": "Processing timestamp (ISO 8601)"
        },
        "processing_track": {
          "type": "string",
          "enum": ["ocr", "direct", "hybrid"],
          "description": "Processing track used"
        },
        "processing_time": {
          "type": "number",
          "minimum": 0,
          "description": "Processing time in seconds"
        },
        "language": {
          "type": "string",
          "description": "Detected or specified language code"
        },
        "title": {
          "type": "string",
          "description": "Document title from metadata"
        },
        "author": {
          "type": "string",
          "description": "Document author"
        },
        "subject": {
          "type": "string",
          "description": "Document subject"
        },
        "keywords": {
          "type": "array",
          "items": {"type": "string"},
          "description": "Document keywords"
        },
        "producer": {
          "type": "string",
          "description": "PDF producer application"
        },
        "creator": {
          "type": "string",
          "description": "Document creator application"
        },
        "creation_date": {
          "type": "string",
          "format": "date-time",
          "description": "Document creation date"
        },
        "modification_date": {
          "type": "string",
          "format": "date-time",
          "description": "Document last modification date"
        }
      }
    },
    "Page": {
      "type": "object",
      "required": ["page_number", "elements", "dimensions"],
      "properties": {
        "page_number": {
          "type": "integer",
          "minimum": 1,
          "description": "1-based page number"
        },
        "elements": {
          "type": "array",
          "items": {
            "$ref": "#/definitions/DocumentElement"
          },
          "description": "List of elements on the page"
        },
        "dimensions": {
          "$ref": "#/definitions/Dimensions"
        },
        "metadata": {
          "type": "object",
          "additionalProperties": true,
          "description": "Additional page-specific metadata"
        },
        "statistics": {
          "$ref": "#/definitions/PageStatistics"
        }
      }
    },
    "DocumentElement": {
      "type": "object",
      "required": ["element_id", "type", "bbox"],
      "properties": {
        "element_id": {
          "type": "string",
          "description": "Unique identifier for the element"
        },
        "type": {
          "type": "string",
          "enum": [
            "text", "title", "header", "footer", "reference", "equation", "footnote", "caption",
            "list", "list_item",
            "table", "table_cell", "table_caption",
            "image", "figure", "chart", "diagram",
            "section", "paragraph", "page_number", "watermark", "header_group", "body",
            "code", "formula", "signature", "stamp", "logo", "barcode", "qr_code"
          ],
          "description": "Element type (supports all 23 PP-StructureV3 types plus custom ones)"
        },
        "content": {
          "oneOf": [
            {"type": "string"},
            {"$ref": "#/definitions/TableData"},
            {"type": "object"}
          ],
          "description": "Element content (text, table data, or structured data)"
        },
        "content_type": {
          "type": "string",
          "enum": ["text", "table", "binary"],
          "description": "Type of content when not a simple string"
        },
        "content_length": {
          "type": "integer",
          "description": "Length of binary content in bytes"
        },
        "bbox": {
          "$ref": "#/definitions/BoundingBox"
        },
        "confidence": {
          "type": "number",
          "minimum": 0,
          "maximum": 1,
          "description": "OCR confidence score (0-1)"
        },
        "style": {
          "$ref": "#/definitions/StyleInfo"
        },
        "metadata": {
          "type": "object",
          "additionalProperties": true,
          "description": "Additional element metadata"
        },
        "children": {
          "type": "array",
          "items": {
            "$ref": "#/definitions/DocumentElement"
          },
          "description": "Nested child elements"
        }
      }
    },
    "BoundingBox": {
      "type": "object",
      "required": ["x0", "y0", "x1", "y1"],
      "properties": {
        "x0": {
          "type": "number",
          "description": "Left coordinate"
        },
        "y0": {
          "type": "number",
          "description": "Top coordinate"
        },
        "x1": {
          "type": "number",
          "description": "Right coordinate"
        },
        "y1": {
          "type": "number",
          "description": "Bottom coordinate"
        },
        "width": {
          "type": "number",
          "description": "Width (calculated)"
        },
        "height": {
          "type": "number",
          "description": "Height (calculated)"
        }
      }
    },
    "StyleInfo": {
      "type": "object",
      "properties": {
        "font_name": {
          "type": "string",
          "description": "Font family name"
        },
        "font_size": {
          "type": "number",
          "minimum": 0,
          "description": "Font size in points"
        },
        "font_weight": {
          "type": "string",
          "enum": ["normal", "bold"],
          "description": "Font weight"
        },
        "font_style": {
          "type": "string",
          "enum": ["normal", "italic"],
          "description": "Font style"
        },
        "text_color": {
          "type": "integer",
          "description": "Text color as RGB integer"
        },
        "text_color_rgb": {
          "type": "array",
          "items": {"type": "integer", "minimum": 0, "maximum": 255},
          "minItems": 3,
          "maxItems": 3,
          "description": "Text color as [R, G, B] array"
        },
        "bg_color": {
          "type": "integer",
          "description": "Background color as RGB integer"
        },
        "alignment": {
          "type": "string",
          "enum": ["left", "center", "right", "justify"],
          "description": "Text alignment"
        }
      }
    },
    "TableData": {
      "type": "object",
      "required": ["rows", "cols", "cells"],
      "properties": {
        "rows": {
          "type": "integer",
          "minimum": 0,
          "description": "Number of rows"
        },
        "cols": {
          "type": "integer",
          "minimum": 0,
          "description": "Number of columns"
        },
        "cells": {
          "type": "array",
          "items": {
            "$ref": "#/definitions/TableCell"
          },
          "description": "Table cells"
        },
        "headers": {
          "type": "array",
          "items": {"type": "string"},
          "description": "Header row labels"
        },
        "caption": {
          "type": "string",
          "description": "Table caption"
        }
      }
    },
    "TableCell": {
      "type": "object",
      "required": ["row", "col"],
      "properties": {
        "row": {
          "type": "integer",
          "minimum": 0,
          "description": "Row index (0-based)"
        },
        "col": {
          "type": "integer",
          "minimum": 0,
          "description": "Column index (0-based)"
        },
        "row_span": {
          "type": "integer",
          "minimum": 1,
          "default": 1,
          "description": "Number of rows spanned"
        },
        "col_span": {
          "type": "integer",
          "minimum": 1,
          "default": 1,
          "description": "Number of columns spanned"
        },
        "content": {
          "type": "string",
          "default": "",
          "description": "Cell text content"
        },
        "bbox": {
          "$ref": "#/definitions/BoundingBox"
        },
        "style": {
          "$ref": "#/definitions/StyleInfo"
        }
      }
    },
    "Dimensions": {
      "type": "object",
      "required": ["width", "height"],
      "properties": {
        "width": {
          "type": "number",
          "minimum": 0,
          "description": "Width in pixels or points"
        },
        "height": {
          "type": "number",
          "minimum": 0,
          "description": "Height in pixels or points"
        },
        "dpi": {
          "type": "integer",
          "minimum": 1,
          "description": "Resolution in DPI"
        }
      }
    },
    "DocumentStatistics": {
      "type": "object",
      "required": ["page_count", "total_elements", "total_tables", "total_images"],
      "properties": {
        "page_count": {
          "type": "integer",
          "minimum": 0,
          "description": "Total number of pages"
        },
        "total_elements": {
          "type": "integer",
          "minimum": 0,
          "description": "Total elements across all pages"
        },
        "total_tables": {
          "type": "integer",
          "minimum": 0,
          "description": "Total tables across all pages"
        },
        "total_images": {
          "type": "integer",
          "minimum": 0,
          "description": "Total images across all pages"
        }
      }
    },
    "PageStatistics": {
      "type": "object",
      "required": ["total_elements", "text_elements", "tables", "images"],
      "properties": {
        "total_elements": {
          "type": "integer",
          "minimum": 0
        },
        "text_elements": {
          "type": "integer",
          "minimum": 0
        },
        "tables": {
          "type": "integer",
          "minimum": 0
        },
        "images": {
          "type": "integer",
          "minimum": 0
        }
      }
    },
    "ProcessingError": {
      "type": "object",
      "required": ["error_type", "message"],
      "properties": {
        "error_type": {
          "type": "string",
          "description": "Error classification"
        },
        "message": {
          "type": "string",
          "description": "Error description"
        },
        "page": {
          "type": "integer",
          "description": "Page number where error occurred"
        },
        "element_id": {
          "type": "string",
          "description": "Element ID if applicable"
        },
        "timestamp": {
          "type": "string",
          "format": "date-time",
          "description": "When the error occurred"
        }
      }
    }
  }
}
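For reference, a minimal instance that satisfies this schema's required fields; every value is illustrative, and the sketch assumes the repository layout above plus the optional `jsonschema` package:

```python
import json
from pathlib import Path

import jsonschema

schema = json.loads(
    Path("backend/app/schemas/unified_document_schema.json").read_text(encoding="utf-8")
)

# Smallest document meeting the schema's required fields (values made up).
instance = {
    "document_id": "doc-0001",
    "metadata": {
        "filename": "sample.pdf",
        "file_type": "application/pdf",
        "file_size": 12345,
        "created_at": "2025-01-01T00:00:00Z",
        "processing_track": "direct",
        "processing_time": 0.42,
    },
    "pages": [
        {
            "page_number": 1,
            "dimensions": {"width": 595, "height": 842},
            "elements": [
                {
                    "element_id": "p1-e1",
                    "type": "paragraph",
                    "content": "Hello, world.",
                    "bbox": {"x0": 72, "y0": 72, "x1": 300, "y1": 90},
                    "confidence": 0.98,
                }
            ],
        }
    ],
    "statistics": {
        "page_count": 1,
        "total_elements": 1,
        "total_tables": 0,
        "total_images": 0,
    },
}

jsonschema.validate(instance, schema)  # raises ValidationError on mismatch
```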
backend/app/services/__init__.py

@@ -1,3 +1,17 @@
 """
 Tool_OCR - Services Package
 """
+
+from .unified_document_exporter import (
+    UnifiedDocumentExporter,
+    ExportFormat,
+    JSONSchemaValidator,
+    save_unified_document
+)
+
+__all__ = [
+    'UnifiedDocumentExporter',
+    'ExportFormat',
+    'JSONSchemaValidator',
+    'save_unified_document'
+]
@@ -23,6 +23,7 @@ try:
     from app.services.document_type_detector import DocumentTypeDetector, ProcessingTrackRecommendation
     from app.services.direct_extraction_engine import DirectExtractionEngine
     from app.services.ocr_to_unified_converter import OCRToUnifiedConverter
+    from app.services.unified_document_exporter import UnifiedDocumentExporter
     from app.models.unified_document import (
         UnifiedDocument, DocumentMetadata,
         ProcessingTrack, ElementType, DocumentElement, Page, Dimensions,
@@ -30,8 +31,9 @@ try:
     )
     DUAL_TRACK_AVAILABLE = True
 except ImportError as e:
-    logger.warning(f"Dual-track components not available: {e}")
+    logging.getLogger(__name__).warning(f"Dual-track components not available: {e}")
     DUAL_TRACK_AVAILABLE = False
+    UnifiedDocumentExporter = None

 logger = logging.getLogger(__name__)

@@ -1175,26 +1177,36 @@ class OCRService:
         try:
             output_dir.mkdir(parents=True, exist_ok=True)

-            # Convert UnifiedDocument to dict if needed
-            if isinstance(result, UnifiedDocument):
-                result_dict = result.to_dict()
-                legacy_result = result.to_legacy_format()
-                markdown_content = result.extract_all_text()
-            else:
-                result_dict = result
-                legacy_result = result
-                markdown_content = result.get('markdown_content', '')
-
-            # Save JSON (use dict format for compatibility)
-            json_path = output_dir / f"{file_id}_result.json"
-            with open(json_path, 'w', encoding='utf-8') as f:
-                json.dump(result_dict if isinstance(result, UnifiedDocument) else result,
-                          f, ensure_ascii=False, indent=2)
-
-            # Save Markdown
-            markdown_path = output_dir / f"{file_id}_output.md"
-            with open(markdown_path, 'w', encoding='utf-8') as f:
-                f.write(markdown_content)
+            # Use UnifiedDocumentExporter for standardized export
+            if isinstance(result, UnifiedDocument) and UnifiedDocumentExporter is not None:
+                json_path = output_dir / f"{file_id}_result.json"
+                UnifiedDocumentExporter.export_to_json(
+                    result,
+                    json_path,
+                    include_metadata=True,
+                    include_statistics=True
+                )
+
+                markdown_path = output_dir / f"{file_id}_output.md"
+                UnifiedDocumentExporter.export_to_markdown(
+                    result,
+                    markdown_path,
+                    include_metadata_header=False  # Keep output clean
+                )
+            else:
+                # Legacy path for dict results
+                result_dict = result if isinstance(result, dict) else result.to_dict()
+                markdown_content = result.get('markdown_content', '') if isinstance(result, dict) else ''
+
+                # Save JSON
+                json_path = output_dir / f"{file_id}_result.json"
+                with open(json_path, 'w', encoding='utf-8') as f:
+                    json.dump(result_dict, f, ensure_ascii=False, indent=2)
+
+                # Save Markdown
+                markdown_path = output_dir / f"{file_id}_output.md"
+                with open(markdown_path, 'w', encoding='utf-8') as f:
+                    f.write(markdown_content)

             logger.info(f"Results saved: {json_path.name}, {markdown_path.name}")
backend/app/services/unified_document_exporter.py (new file, 506 lines)

@@ -0,0 +1,506 @@
"""
Unified Document Exporter Service

Provides standardized export functionality for UnifiedDocument,
supporting both OCR and Direct extraction track outputs with
comprehensive processing metadata.
"""

import json
import logging
from pathlib import Path
from typing import Dict, Any, Optional, Tuple, Union
from datetime import datetime

from ..models.unified_document import (
    UnifiedDocument,
    ProcessingTrack,
    ElementType
)

logger = logging.getLogger(__name__)


class ExportFormat:
    """Supported export formats"""
    JSON = "json"
    JSON_MINIMAL = "json_minimal"
    JSON_LEGACY = "json_legacy"
    MARKDOWN = "markdown"
    TEXT = "text"
    HTML = "html"


class UnifiedDocumentExporter:
    """
    Exporter service for UnifiedDocument with comprehensive metadata support.

    Supports both OCR and DIRECT processing tracks with consistent output format.
    """

    # Schema version for tracking format changes
    SCHEMA_VERSION = "1.0.0"

    @staticmethod
    def export_to_json(
        document: UnifiedDocument,
        output_path: Optional[Path] = None,
        include_metadata: bool = True,
        include_statistics: bool = True,
        include_binary_content: bool = False,
        pretty_print: bool = True
    ) -> Union[str, Path]:
        """
        Export UnifiedDocument to standardized JSON format.

        Args:
            document: The UnifiedDocument to export
            output_path: Optional path to save the JSON file
            include_metadata: Include processing metadata
            include_statistics: Include document statistics
            include_binary_content: Include base64-encoded binary content
            pretty_print: Format JSON with indentation

        Returns:
            JSON string if no output_path, otherwise the output Path
        """
        export_data = UnifiedDocumentExporter._build_export_data(
            document,
            include_metadata=include_metadata,
            include_statistics=include_statistics,
            include_binary_content=include_binary_content
        )

        json_str = json.dumps(
            export_data,
            ensure_ascii=False,
            indent=2 if pretty_print else None,
            default=str
        )

        if output_path:
            output_path = Path(output_path)
            output_path.parent.mkdir(parents=True, exist_ok=True)
            output_path.write_text(json_str, encoding='utf-8')
            logger.info(f"Exported JSON to: {output_path}")
            return output_path

        return json_str

    @staticmethod
    def export_to_markdown(
        document: UnifiedDocument,
        output_path: Optional[Path] = None,
        include_metadata_header: bool = True,
        include_page_breaks: bool = True
    ) -> Union[str, Path]:
        """
        Export UnifiedDocument to Markdown format.

        Args:
            document: The UnifiedDocument to export
            output_path: Optional path to save the Markdown file
            include_metadata_header: Include document metadata as header
            include_page_breaks: Include page break markers

        Returns:
            Markdown string if no output_path, otherwise the output Path
        """
        lines = []

        # Add metadata header
        if include_metadata_header:
            lines.append(f"# {document.metadata.filename}")
            lines.append("")
            lines.append("## Document Info")
            lines.append(f"- **Processing Track**: {document.metadata.processing_track.value}")
            lines.append(f"- **Processing Time**: {document.metadata.processing_time:.2f}s")
            lines.append(f"- **Pages**: {document.page_count}")
            lines.append(f"- **Total Elements**: {document.total_elements}")
            if document.metadata.language:
                lines.append(f"- **Language**: {document.metadata.language}")
            lines.append("")
            lines.append("---")
            lines.append("")

        # Export each page
        for page in document.pages:
            if include_page_breaks and page.page_number > 1:
                lines.append("")
                lines.append("---")
                lines.append("")

            lines.append(f"## Page {page.page_number}")
            lines.append("")

            # Get elements in reading order
            for element in page.get_reading_order():
                content = UnifiedDocumentExporter._element_to_markdown(element)
                if content:
                    lines.append(content)
                    lines.append("")

        md_content = "\n".join(lines)

        if output_path:
            output_path = Path(output_path)
            output_path.parent.mkdir(parents=True, exist_ok=True)
            output_path.write_text(md_content, encoding='utf-8')
            logger.info(f"Exported Markdown to: {output_path}")
            return output_path

        return md_content

    @staticmethod
    def export_to_text(
        document: UnifiedDocument,
        output_path: Optional[Path] = None,
        page_separator: str = "\n\n--- Page Break ---\n\n"
    ) -> Union[str, Path]:
        """
        Export UnifiedDocument to plain text format.

        Args:
            document: The UnifiedDocument to export
            output_path: Optional path to save the text file
            page_separator: Separator between pages

        Returns:
            Text string if no output_path, otherwise the output Path
        """
        pages_text = []

        for page in document.pages:
            page_text = page.extract_text()
            if page_text:
                pages_text.append(page_text)

        text_content = page_separator.join(pages_text)

        if output_path:
            output_path = Path(output_path)
            output_path.parent.mkdir(parents=True, exist_ok=True)
            output_path.write_text(text_content, encoding='utf-8')
            logger.info(f"Exported text to: {output_path}")
            return output_path

        return text_content

    @staticmethod
    def export_to_legacy_json(
        document: UnifiedDocument,
        output_path: Optional[Path] = None
    ) -> Union[str, Path]:
        """
        Export UnifiedDocument to legacy JSON format for backward compatibility.

        Args:
            document: The UnifiedDocument to export
            output_path: Optional path to save the JSON file

        Returns:
            JSON string if no output_path, otherwise the output Path
        """
        legacy_data = document.to_legacy_format()

        json_str = json.dumps(
            legacy_data,
            ensure_ascii=False,
            indent=2,
            default=str
        )

        if output_path:
            output_path = Path(output_path)
            output_path.parent.mkdir(parents=True, exist_ok=True)
            output_path.write_text(json_str, encoding='utf-8')
            logger.info(f"Exported legacy JSON to: {output_path}")
            return output_path

        return json_str

    @staticmethod
    def export_all_formats(
        document: UnifiedDocument,
        output_dir: Path,
        file_id: str
    ) -> Dict[str, Optional[Path]]:
        """
        Export UnifiedDocument to all standard formats.

        Args:
            document: The UnifiedDocument to export
            output_dir: Directory to save output files
            file_id: Base filename for outputs

        Returns:
            Dictionary mapping format names to output paths
        """
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

        results = {}

        # Export JSON
        try:
            json_path = output_dir / f"{file_id}_result.json"
            UnifiedDocumentExporter.export_to_json(
                document,
                json_path,
                include_metadata=True,
                include_statistics=True
            )
            results['json'] = json_path
        except Exception as e:
            logger.error(f"Failed to export JSON: {e}")
            results['json'] = None

        # Export Markdown
        try:
            md_path = output_dir / f"{file_id}_output.md"
            UnifiedDocumentExporter.export_to_markdown(document, md_path)
            results['markdown'] = md_path
        except Exception as e:
            logger.error(f"Failed to export Markdown: {e}")
            results['markdown'] = None

        # Export plain text
        try:
            txt_path = output_dir / f"{file_id}_text.txt"
            UnifiedDocumentExporter.export_to_text(document, txt_path)
            results['text'] = txt_path
        except Exception as e:
            logger.error(f"Failed to export text: {e}")
            results['text'] = None

        return results

    @staticmethod
    def _build_export_data(
        document: UnifiedDocument,
        include_metadata: bool = True,
        include_statistics: bool = True,
        include_binary_content: bool = False
    ) -> Dict[str, Any]:
        """
        Build the export data structure with processing metadata.

        Supports both OCR and DIRECT track outputs with consistent format.
        """
        # Base document data
        export_data = {
            "schema_version": UnifiedDocumentExporter.SCHEMA_VERSION,
            "document_id": document.document_id,
            "export_timestamp": datetime.utcnow().isoformat() + "Z"
        }

        # Add metadata
        if include_metadata:
            export_data["metadata"] = document.metadata.to_dict()

            # Add extended processing metadata
            export_data["metadata"]["processing_info"] = {
                "track_description": UnifiedDocumentExporter._get_track_description(
                    document.metadata.processing_track
                ),
                "schema_version": UnifiedDocumentExporter.SCHEMA_VERSION,
                "export_format": "unified_document_v1"
            }

        # Add pages
        export_data["pages"] = [page.to_dict() for page in document.pages]

        # Add statistics
        if include_statistics:
            export_data["statistics"] = {
                "page_count": document.page_count,
                "total_elements": document.total_elements,
                "total_tables": len(document.get_all_tables()),
                "total_images": len(document.get_all_images()),
                "element_type_counts": UnifiedDocumentExporter._count_element_types(document),
                "text_stats": UnifiedDocumentExporter._calculate_text_stats(document)
            }

        # Add processing errors if any
        if document.processing_errors:
            export_data["processing_errors"] = document.processing_errors

        return export_data

    @staticmethod
    def _get_track_description(track: ProcessingTrack) -> str:
        """Get human-readable description for processing track."""
        descriptions = {
            ProcessingTrack.OCR: "PaddleOCR PP-StructureV3 - Used for scanned documents and images",
            ProcessingTrack.DIRECT: "PyMuPDF Direct Extraction - Used for editable PDFs with embedded text",
            ProcessingTrack.HYBRID: "Hybrid Processing - Combined OCR and direct extraction"
        }
        return descriptions.get(track, "Unknown processing track")

    @staticmethod
    def _count_element_types(document: UnifiedDocument) -> Dict[str, int]:
        """Count occurrences of each element type in the document."""
        counts = {}
        for page in document.pages:
            for element in page.elements:
                type_name = element.type.value
                counts[type_name] = counts.get(type_name, 0) + 1
        return counts

    @staticmethod
    def _calculate_text_stats(document: UnifiedDocument) -> Dict[str, Any]:
        """Calculate text statistics for the document."""
        full_text = document.extract_all_text()
        words = full_text.split()
        chars = len(full_text)

        # Calculate average confidence
        confidences = []
        for page in document.pages:
            for element in page.elements:
                if element.confidence is not None:
                    confidences.append(element.confidence)

        avg_confidence = sum(confidences) / len(confidences) if confidences else None

        return {
            "total_characters": chars,
            "total_words": len(words),
            "average_confidence": round(avg_confidence, 4) if avg_confidence is not None else None
        }

    @staticmethod
    def _element_to_markdown(element) -> str:
        """Convert a document element to Markdown format."""
        content = element.get_text()

        if not content and element.type not in [ElementType.TABLE, ElementType.IMAGE]:
            return ""

        # Format based on element type
        if element.type == ElementType.TITLE:
            return f"# {content}"
        elif element.type == ElementType.HEADER:
            return f"### {content}"
        elif element.type in [ElementType.TEXT, ElementType.PARAGRAPH, ElementType.BODY]:
            return content
        elif element.type == ElementType.LIST_ITEM:
            return f"- {content}"
        elif element.type == ElementType.TABLE:
            # Use HTML table if available
            if hasattr(element.content, 'to_html'):
                return element.content.to_html()
            return f"[Table: {content}]"
        elif element.type == ElementType.IMAGE:
            return f"![Image]({content})"
        elif element.type == ElementType.FIGURE:
            return f"[Figure: {content or 'No caption'}]"
        elif element.type == ElementType.CODE:
            return f"```\n{content}\n```"
        elif element.type == ElementType.EQUATION:
            return f"$${content}$$"
        elif element.type == ElementType.CAPTION:
            return f"*{content}*"
        elif element.type == ElementType.FOOTNOTE:
            return f"[^{content}]"
        elif element.type == ElementType.REFERENCE:
            return f"> {content}"
        else:
            return content if content else ""


class JSONSchemaValidator:
    """
    Validator for UnifiedDocument JSON exports.

    Uses the JSON Schema definition to validate exported data.
    """

    _schema = None

    @classmethod
    def get_schema(cls) -> Dict[str, Any]:
        """Load and return the JSON Schema for UnifiedDocument."""
        if cls._schema is None:
            schema_path = Path(__file__).parent.parent / "schemas" / "unified_document_schema.json"
            if schema_path.exists():
                cls._schema = json.loads(schema_path.read_text(encoding='utf-8'))
            else:
                logger.warning(f"Schema file not found: {schema_path}")
                cls._schema = {}
        return cls._schema

    @classmethod
    def validate(cls, data: Dict[str, Any]) -> Tuple[bool, Optional[str]]:
        """
        Validate JSON data against the UnifiedDocument schema.

        Args:
            data: The JSON data to validate

        Returns:
            Tuple of (is_valid, error_message)
        """
        try:
            import jsonschema
            schema = cls.get_schema()
            if not schema:
                return True, None  # Skip validation if schema not available

            jsonschema.validate(data, schema)
            return True, None
        except ImportError:
            logger.warning("jsonschema package not installed, skipping validation")
            return True, None
        except Exception as e:
            return False, str(e)


def save_unified_document(
    document: UnifiedDocument,
    output_dir: Path,
    file_id: str,
    formats: Optional[list] = None
) -> Dict[str, Optional[Path]]:
    """
    Convenience function to save UnifiedDocument to multiple formats.

    Args:
        document: The UnifiedDocument to save
        output_dir: Output directory
        file_id: Base filename
        formats: List of formats to export (default: ['json', 'markdown'])

    Returns:
        Dictionary mapping format names to output paths
    """
    if formats is None:
        formats = ['json', 'markdown']

    results = {}
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    for fmt in formats:
        try:
            if fmt == 'json':
                path = output_dir / f"{file_id}_result.json"
                UnifiedDocumentExporter.export_to_json(document, path)
                results['json'] = path
            elif fmt == 'markdown':
                path = output_dir / f"{file_id}_output.md"
                UnifiedDocumentExporter.export_to_markdown(document, path)
                results['markdown'] = path
            elif fmt == 'text':
                path = output_dir / f"{file_id}_text.txt"
                UnifiedDocumentExporter.export_to_text(document, path)
                results['text'] = path
            elif fmt == 'legacy':
                path = output_dir / f"{file_id}_legacy.json"
                UnifiedDocumentExporter.export_to_legacy_json(document, path)
                results['legacy'] = path
        except Exception as e:
            logger.error(f"Failed to export {fmt}: {e}")
            results[fmt] = None

    return results
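Taken together, a hedged sketch of the convenience path plus a round-trip validation against the shipped schema; `doc` is again an assumed UnifiedDocument instance, and the output directory and file id are illustrative:

```python
import json
from pathlib import Path

from app.services import JSONSchemaValidator, save_unified_document

# `doc` is an assumed UnifiedDocument from either processing track.
paths = save_unified_document(
    doc,
    output_dir=Path("output"),
    file_id="doc-0001",
    formats=["json", "markdown", "text", "legacy"],
)

# Validate the standardized JSON export against the schema in this commit.
exported = json.loads(paths["json"].read_text(encoding="utf-8"))
is_valid, error = JSONSchemaValidator.validate(exported)
if not is_valid:
    print(f"Export failed validation: {error}")
```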
@@ -59,10 +59,10 @@
 - [x] 4.1.2 Route to appropriate processing engine
 - [x] 4.1.3 Return UnifiedDocument from both tracks
 - [x] 4.1.4 Maintain backward compatibility
-- [ ] 4.2 Create unified JSON export
-- [ ] 4.2.1 Define standardized JSON schema
-- [ ] 4.2.2 Include processing metadata
-- [ ] 4.2.3 Support both track outputs
+- [x] 4.2 Create unified JSON export
+- [x] 4.2.1 Define standardized JSON schema
+- [x] 4.2.2 Include processing metadata
+- [x] 4.2.3 Support both track outputs
 - [ ] 4.3 Update PDF generator for UnifiedDocument
 - [ ] 4.3.1 Adapt PDF generation to use UnifiedDocument
 - [ ] 4.3.2 Preserve layout from both tracks
|||||||
Reference in New Issue
Block a user