feat: add unified JSON export with standardized schema

- Create JSON Schema definition for UnifiedDocument format
- Implement UnifiedDocumentExporter service with multiple export formats
- Include comprehensive processing metadata and statistics
- Update OCR service to use new exporter for dual-track outputs
- Support JSON, Markdown, Text, and legacy format exports

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-19 08:36:24 +08:00
parent 5bcf3dfd42
commit ab89a40e8d
5 changed files with 999 additions and 21 deletions
--- a/backend/app/schemas/unified_document_schema.json
+++ b/backend/app/schemas/unified_document_schema.json
@@ -0,0 +1,443 @@
+{
+  "$schema": "http://json-schema.org/draft-07/schema#",
+  "$id": "https://tool-ocr.local/schemas/unified-document.json",
+  "title": "UnifiedDocument",
+  "description": "Unified document representation for dual-track OCR/Direct extraction processing",
+  "type": "object",
+  "required": ["document_id", "metadata", "pages", "statistics"],
+  "properties": {
+    "document_id": {
+      "type": "string",
+      "description": "Unique identifier for the document"
+    },
+    "metadata": {
+      "$ref": "#/definitions/DocumentMetadata"
+    },
+    "pages": {
+      "type": "array",
+      "items": {
+        "$ref": "#/definitions/Page"
+      },
+      "description": "List of pages in the document"
+    },
+    "statistics": {
+      "$ref": "#/definitions/DocumentStatistics"
+    },
+    "processing_errors": {
+      "type": "array",
+      "items": {
+        "$ref": "#/definitions/ProcessingError"
+      },
+      "default": [],
+      "description": "List of any errors encountered during processing"
+    }
+  },
+  "definitions": {
+    "DocumentMetadata": {
+      "type": "object",
+      "required": ["filename", "file_type", "file_size", "created_at", "processing_track", "processing_time"],
+      "properties": {
+        "filename": {
+          "type": "string",
+          "description": "Original filename"
+        },
+        "file_type": {
+          "type": "string",
+          "description": "File MIME type or extension"
+        },
+        "file_size": {
+          "type": "integer",
+          "minimum": 0,
+          "description": "File size in bytes"
+        },
+        "created_at": {
+          "type": "string",
+          "format": "date-time",
+          "description": "Processing timestamp (ISO 8601)"
+        },
+        "processing_track": {
+          "type": "string",
+          "enum": ["ocr", "direct", "hybrid"],
+          "description": "Processing track used"
+        },
+        "processing_time": {
+          "type": "number",
+          "minimum": 0,
+          "description": "Processing time in seconds"
+        },
+        "language": {
+          "type": "string",
+          "description": "Detected or specified language code"
+        },
+        "title": {
+          "type": "string",
+          "description": "Document title from metadata"
+        },
+        "author": {
+          "type": "string",
+          "description": "Document author"
+        },
+        "subject": {
+          "type": "string",
+          "description": "Document subject"
+        },
+        "keywords": {
+          "type": "array",
+          "items": {"type": "string"},
+          "description": "Document keywords"
+        },
+        "producer": {
+          "type": "string",
+          "description": "PDF producer application"
+        },
+        "creator": {
+          "type": "string",
+          "description": "Document creator application"
+        },
+        "creation_date": {
+          "type": "string",
+          "format": "date-time",
+          "description": "Document creation date"
+        },
+        "modification_date": {
+          "type": "string",
+          "format": "date-time",
+          "description": "Document last modification date"
+        }
+      }
+    },
+    "Page": {
+      "type": "object",
+      "required": ["page_number", "elements", "dimensions"],
+      "properties": {
+        "page_number": {
+          "type": "integer",
+          "minimum": 1,
+          "description": "1-based page number"
+        },
+        "elements": {
+          "type": "array",
+          "items": {
+            "$ref": "#/definitions/DocumentElement"
+          },
+          "description": "List of elements on the page"
+        },
+        "dimensions": {
+          "$ref": "#/definitions/Dimensions"
+        },
+        "metadata": {
+          "type": "object",
+          "additionalProperties": true,
+          "description": "Additional page-specific metadata"
+        },
+        "statistics": {
+          "$ref": "#/definitions/PageStatistics"
+        }
+      }
+    },
+    "DocumentElement": {
+      "type": "object",
+      "required": ["element_id", "type", "bbox"],
+      "properties": {
+        "element_id": {
+          "type": "string",
+          "description": "Unique identifier for the element"
+        },
+        "type": {
+          "type": "string",
+          "enum": [
+            "text", "title", "header", "footer", "reference", "equation", "footnote", "caption",
+            "list", "list_item",
+            "table", "table_cell", "table_caption",
+            "image", "figure", "chart", "diagram",
+            "section", "paragraph", "page_number", "watermark", "header_group", "body",
+            "code", "formula", "signature", "stamp", "logo", "barcode", "qr_code"
+          ],
+          "description": "Element type (supports all 23 PP-StructureV3 types plus custom ones)"
+        },
+        "content": {
+          "anyOf": [
+            {"type": "string"},
+            {"$ref": "#/definitions/TableData"},
+            {"type": "object"}
+          ],
+          "description": "Element content (text, table data, or structured data)"
+        },
+        "content_type": {
+          "type": "string",
+          "enum": ["text", "table", "binary"],
+          "description": "Type of content when not a simple string"
+        },
+        "content_length": {
+          "type": "integer",
+          "description": "Length of binary content in bytes"
+        },
+        "bbox": {
+          "$ref": "#/definitions/BoundingBox"
+        },
+        "confidence": {
+          "type": "number",
+          "minimum": 0,
+          "maximum": 1,
+          "description": "OCR confidence score (0-1)"
+        },
+        "style": {
+          "$ref": "#/definitions/StyleInfo"
+        },
+        "metadata": {
+          "type": "object",
+          "additionalProperties": true,
+          "description": "Additional element metadata"
+        },
+        "children": {
+          "type": "array",
+          "items": {
+            "$ref": "#/definitions/DocumentElement"
+          },
+          "description": "Nested child elements"
+        }
+      }
+    },
+    "BoundingBox": {
+      "type": "object",
+      "required": ["x0", "y0", "x1", "y1"],
+      "properties": {
+        "x0": {
+          "type": "number",
+          "description": "Left coordinate"
+        },
+        "y0": {
+          "type": "number",
+          "description": "Top coordinate"
+        },
+        "x1": {
+          "type": "number",
+          "description": "Right coordinate"
+        },
+        "y1": {
+          "type": "number",
+          "description": "Bottom coordinate"
+        },
+        "width": {
+          "type": "number",
+          "description": "Width (calculated)"
+        },
+        "height": {
+          "type": "number",
+          "description": "Height (calculated)"
+        }
+      }
+    },
+    "StyleInfo": {
+      "type": "object",
+      "properties": {
+        "font_name": {
+          "type": "string",
+          "description": "Font family name"
+        },
+        "font_size": {
+          "type": "number",
+          "minimum": 0,
+          "description": "Font size in points"
+        },
+        "font_weight": {
+          "type": "string",
+          "enum": ["normal", "bold"],
+          "description": "Font weight"
+        },
+        "font_style": {
+          "type": "string",
+          "enum": ["normal", "italic"],
+          "description": "Font style"
+        },
+        "text_color": {
+          "type": "integer",
+          "description": "Text color as RGB integer"
+        },
+        "text_color_rgb": {
+          "type": "array",
+          "items": {"type": "integer", "minimum": 0, "maximum": 255},
+          "minItems": 3,
+          "maxItems": 3,
+          "description": "Text color as [R, G, B] array"
+        },
+        "bg_color": {
+          "type": "integer",
+          "description": "Background color as RGB integer"
+        },
+        "alignment": {
+          "type": "string",
+          "enum": ["left", "center", "right", "justify"],
+          "description": "Text alignment"
+        }
+      }
+    },
+    "TableData": {
+      "type": "object",
+      "required": ["rows", "cols", "cells"],
+      "properties": {
+        "rows": {
+          "type": "integer",
+          "minimum": 0,
+          "description": "Number of rows"
+        },
+        "cols": {
+          "type": "integer",
+          "minimum": 0,
+          "description": "Number of columns"
+        },
+        "cells": {
+          "type": "array",
+          "items": {
+            "$ref": "#/definitions/TableCell"
+          },
+          "description": "Table cells"
+        },
+        "headers": {
+          "type": "array",
+          "items": {"type": "string"},
+          "description": "Header row labels"
+        },
+        "caption": {
+          "type": "string",
+          "description": "Table caption"
+        }
+      }
+    },
+    "TableCell": {
+      "type": "object",
+      "required": ["row", "col"],
+      "properties": {
+        "row": {
+          "type": "integer",
+          "minimum": 0,
+          "description": "Row index (0-based)"
+        },
+        "col": {
+          "type": "integer",
+          "minimum": 0,
+          "description": "Column index (0-based)"
+        },
+        "row_span": {
+          "type": "integer",
+          "minimum": 1,
+          "default": 1,
+          "description": "Number of rows spanned"
+        },
+        "col_span": {
+          "type": "integer",
+          "minimum": 1,
+          "default": 1,
+          "description": "Number of columns spanned"
+        },
+        "content": {
+          "type": "string",
+          "default": "",
+          "description": "Cell text content"
+        },
+        "bbox": {
+          "$ref": "#/definitions/BoundingBox"
+        },
+        "style": {
+          "$ref": "#/definitions/StyleInfo"
+        }
+      }
+    },
+    "Dimensions": {
+      "type": "object",
+      "required": ["width", "height"],
+      "properties": {
+        "width": {
+          "type": "number",
+          "minimum": 0,
+          "description": "Width in pixels or points"
+        },
+        "height": {
+          "type": "number",
+          "minimum": 0,
+          "description": "Height in pixels or points"
+        },
+        "dpi": {
+          "type": "integer",
+          "minimum": 1,
+          "description": "Resolution in DPI"
+        }
+      }
+    },
+    "DocumentStatistics": {
+      "type": "object",
+      "required": ["page_count", "total_elements", "total_tables", "total_images"],
+      "properties": {
+        "page_count": {
+          "type": "integer",
+          "minimum": 0,
+          "description": "Total number of pages"
+        },
+        "total_elements": {
+          "type": "integer",
+          "minimum": 0,
+          "description": "Total elements across all pages"
+        },
+        "total_tables": {
+          "type": "integer",
+          "minimum": 0,
+          "description": "Total tables across all pages"
+        },
+        "total_images": {
+          "type": "integer",
+          "minimum": 0,
+          "description": "Total images across all pages"
+        }
+      }
+    },
+    "PageStatistics": {
+      "type": "object",
+      "required": ["total_elements", "text_elements", "tables", "images"],
+      "properties": {
+        "total_elements": {
+          "type": "integer",
+          "minimum": 0
+        },
+        "text_elements": {
+          "type": "integer",
+          "minimum": 0
+        },
+        "tables": {
+          "type": "integer",
+          "minimum": 0
+        },
+        "images": {
+          "type": "integer",
+          "minimum": 0
+        }
+      }
+    },
+    "ProcessingError": {
+      "type": "object",
+      "required": ["error_type", "message"],
+      "properties": {
+        "error_type": {
+          "type": "string",
+          "description": "Error classification"
+        },
+        "message": {
+          "type": "string",
+          "description": "Error description"
+        },
+        "page": {
+          "type": "integer",
+          "description": "Page number where error occurred"
+        },
+        "element_id": {
+          "type": "string",
+          "description": "Element ID if applicable"
+        },
+        "timestamp": {
+          "type": "string",
+          "format": "date-time",
+          "description": "When the error occurred"
+        }
+      }
+    }
+  }
+}
--- a/backend/app/services/__init__.py
+++ b/backend/app/services/__init__.py
@@ -1,3 +1,17 @@
 """
 Tool_OCR - Services Package
 """
+
+from .unified_document_exporter import (
+    UnifiedDocumentExporter,
+    ExportFormat,
+    JSONSchemaValidator,
+    save_unified_document
+)
+
+__all__ = [
+    'UnifiedDocumentExporter',
+    'ExportFormat',
+    'JSONSchemaValidator',
+    'save_unified_document'
+]
--- a/backend/app/services/ocr_service.py
+++ b/backend/app/services/ocr_service.py
@@ -23,6 +23,7 @@ try:
    from app.services.document_type_detector import DocumentTypeDetector, ProcessingTrackRecommendation
    from app.services.direct_extraction_engine import DirectExtractionEngine
    from app.services.ocr_to_unified_converter import OCRToUnifiedConverter
+    from app.services.unified_document_exporter import UnifiedDocumentExporter
    from app.models.unified_document import (
        UnifiedDocument, DocumentMetadata,
        ProcessingTrack, ElementType, DocumentElement, Page, Dimensions,
@@ -30,8 +31,9 @@ try:
    )
    DUAL_TRACK_AVAILABLE = True
 except ImportError as e:
-    logger.warning(f"Dual-track components not available: {e}")
+    logging.getLogger(__name__).warning(f"Dual-track components not available: {e}")
    DUAL_TRACK_AVAILABLE = False
+    UnifiedDocumentExporter = None

 logger = logging.getLogger(__name__)

@@ -1175,21 +1177,34 @@ class OCRService:
        try:
            output_dir.mkdir(parents=True, exist_ok=True)

-            # Convert UnifiedDocument to dict if needed
-            if isinstance(result, UnifiedDocument):
-                result_dict = result.to_dict()
-                legacy_result = result.to_legacy_format()
+            # Use UnifiedDocumentExporter for standardized export
+            if isinstance(result, UnifiedDocument) and UnifiedDocumentExporter is not None:
+                # Use the new exporter for UnifiedDocument
+                json_path = output_dir / f"{file_id}_result.json"
+                UnifiedDocumentExporter.export_to_json(
+                    result,
+                    json_path,
+                    include_metadata=True,
+                    include_statistics=True
+                )
+
+                markdown_path = output_dir / f"{file_id}_output.md"
+                UnifiedDocumentExporter.export_to_markdown(
+                    result,
+                    markdown_path,
+                    include_metadata_header=False  # Keep output clean
+                )
+
                markdown_content = result.extract_all_text()
            else:
-                result_dict = result
-                legacy_result = result
-                markdown_content = result.get('markdown_content', '')
+                # Legacy path for dict results
+                result_dict = result if isinstance(result, dict) else result.to_dict()
+                markdown_content = result.get('markdown_content', '') if isinstance(result, dict) else ''

-            # Save JSON (use dict format for compatibility)
+                # Save JSON
                json_path = output_dir / f"{file_id}_result.json"
                with open(json_path, 'w', encoding='utf-8') as f:
-                json.dump(result_dict if isinstance(result, UnifiedDocument) else result,
-                         f, ensure_ascii=False, indent=2)
+                    json.dump(result_dict, f, ensure_ascii=False, indent=2)

                # Save Markdown
                markdown_path = output_dir / f"{file_id}_output.md"
--- a/backend/app/services/unified_document_exporter.py
+++ b/backend/app/services/unified_document_exporter.py
@@ -0,0 +1,506 @@
+"""
+Unified Document Exporter Service
+
+Provides standardized export functionality for UnifiedDocument,
+supporting both OCR and Direct extraction track outputs with
+comprehensive processing metadata.
+"""
+
+import json
+import logging
+from pathlib import Path
+from typing import Dict, Any, Optional, Tuple, Union
+from datetime import datetime
+
+from ..models.unified_document import (
+    UnifiedDocument,
+    ProcessingTrack,
+    ElementType
+)
+
+logger = logging.getLogger(__name__)
+
+
+class ExportFormat:
+    """
+    String identifiers for export formats.
+
+    These are plain class-level string constants (not an Enum), so each
+    member compares equal to its literal value, e.g. ExportFormat.JSON == "json".
+    """
+    # NOTE(review): nothing in this module reads these constants;
+    # save_unified_document dispatches on its own literals ('json',
+    # 'markdown', 'text', 'legacy'), and no exporter for JSON_MINIMAL or HTML
+    # is defined here - presumably reserved for callers/future use; confirm.
+    JSON = "json"
+    JSON_MINIMAL = "json_minimal"
+    JSON_LEGACY = "json_legacy"
+    MARKDOWN = "markdown"
+    TEXT = "text"
+    HTML = "html"
+
+
+class UnifiedDocumentExporter:
+    """
+    Exporter service for UnifiedDocument with comprehensive metadata support.
+
+    Supports both OCR and DIRECT processing tracks with consistent output format.
+    """
+
+    # Schema version for tracking format changes
+    SCHEMA_VERSION = "1.0.0"
+
+    @staticmethod
+    def export_to_json(
+        document: UnifiedDocument,
+        output_path: Optional[Path] = None,
+        include_metadata: bool = True,
+        include_statistics: bool = True,
+        include_binary_content: bool = False,
+        pretty_print: bool = True
+    ) -> Union[str, Path]:
+        """
+        Serialize a UnifiedDocument to the standardized JSON layout.
+
+        Args:
+            document: Document to serialize.
+            output_path: Destination file. When falsy, nothing is written and
+                the JSON text is returned instead.
+            include_metadata: Forwarded to _build_export_data.
+            include_statistics: Forwarded to _build_export_data.
+            include_binary_content: Forwarded to _build_export_data.
+            pretty_print: Indent with two spaces when True; otherwise use
+                json.dumps' un-indented form.
+
+        Returns:
+            The JSON string when no output_path is given, otherwise the Path
+            that was written.
+        """
+        json_str = json.dumps(
+            UnifiedDocumentExporter._build_export_data(
+                document,
+                include_metadata=include_metadata,
+                include_statistics=include_statistics,
+                include_binary_content=include_binary_content
+            ),
+            ensure_ascii=False,
+            indent=2 if pretty_print else None,
+            default=str  # values json cannot encode natively fall back to str()
+        )
+
+        # Guard clause: without a destination the caller only wants the text.
+        if not output_path:
+            return json_str
+
+        output_path = Path(output_path)
+        # Create any missing parent directories before writing.
+        output_path.parent.mkdir(parents=True, exist_ok=True)
+        output_path.write_text(json_str, encoding='utf-8')
+        logger.info(f"Exported JSON to: {output_path}")
+        return output_path
+
+    @staticmethod
+    def export_to_markdown(
+        document: UnifiedDocument,
+        output_path: Optional[Path] = None,
+        include_metadata_header: bool = True,
+        include_page_breaks: bool = True
+    ) -> Union[str, Path]:
+        """
+        Render a UnifiedDocument as Markdown.
+
+        Every page gets a "## Page N" heading followed by its elements in
+        reading order; elements that render to an empty string are skipped.
+
+        Args:
+            document: Document to render.
+            output_path: Destination file. When falsy, the Markdown text is
+                returned instead of being written.
+            include_metadata_header: Prepend a title plus a "Document Info"
+                list (track, time, page and element counts, and the language
+                when one is set).
+            include_page_breaks: Emit a "---" rule before every page whose
+                page_number is greater than 1.
+
+        Returns:
+            The Markdown string when no output_path is given, otherwise the
+            Path that was written.
+        """
+        lines = []
+
+        # Optional document-level header block.
+        if include_metadata_header:
+            lines += [
+                f"# {document.metadata.filename}",
+                "",
+                "## Document Info",
+                f"- **Processing Track**: {document.metadata.processing_track.value}",
+                f"- **Processing Time**: {document.metadata.processing_time:.2f}s",
+                f"- **Pages**: {document.page_count}",
+                f"- **Total Elements**: {document.total_elements}",
+            ]
+            if document.metadata.language:
+                lines.append(f"- **Language**: {document.metadata.language}")
+            lines += ["", "---", ""]
+
+        for page in document.pages:
+            # Horizontal rule between pages (never before page 1).
+            if include_page_breaks and page.page_number > 1:
+                lines += ["", "---", ""]
+
+            lines += [f"## Page {page.page_number}", ""]
+
+            rendered = (
+                UnifiedDocumentExporter._element_to_markdown(element)
+                for element in page.get_reading_order()
+            )
+            for content in rendered:
+                if content:
+                    lines += [content, ""]
+
+        md_content = "\n".join(lines)
+
+        if not output_path:
+            return md_content
+
+        output_path = Path(output_path)
+        output_path.parent.mkdir(parents=True, exist_ok=True)
+        output_path.write_text(md_content, encoding='utf-8')
+        logger.info(f"Exported Markdown to: {output_path}")
+        return output_path
+
+    @staticmethod
+    def export_to_text(
+        document: UnifiedDocument,
+        output_path: Optional[Path] = None,
+        page_separator: str = "\n\n--- Page Break ---\n\n"
+    ) -> Union[str, Path]:
+        """
+        Flatten a UnifiedDocument into plain text.
+
+        Each page contributes the result of its extract_text(); pages whose
+        text is empty are dropped, so no separator is emitted for them.
+
+        Args:
+            document: Document to flatten.
+            output_path: Destination file. When falsy, the text is returned
+                instead of being written.
+            page_separator: String placed between consecutive non-empty pages.
+
+        Returns:
+            The text when no output_path is given, otherwise the Path that
+            was written.
+        """
+        extracted = (page.extract_text() for page in document.pages)
+        text_content = page_separator.join([chunk for chunk in extracted if chunk])
+
+        if not output_path:
+            return text_content
+
+        output_path = Path(output_path)
+        output_path.parent.mkdir(parents=True, exist_ok=True)
+        output_path.write_text(text_content, encoding='utf-8')
+        logger.info(f"Exported text to: {output_path}")
+        return output_path
+
+    @staticmethod
+    def export_to_legacy_json(
+        document: UnifiedDocument,
+        output_path: Optional[Path] = None
+    ) -> Union[str, Path]:
+        """
+        Serialize the document's legacy representation as indented JSON.
+
+        The payload is whatever document.to_legacy_format() returns; values
+        json cannot encode natively are converted with str().
+
+        Args:
+            document: Document to serialize.
+            output_path: Destination file. When falsy, the JSON text is
+                returned instead of being written.
+
+        Returns:
+            The JSON string when no output_path is given, otherwise the Path
+            that was written.
+        """
+        json_str = json.dumps(
+            document.to_legacy_format(),
+            ensure_ascii=False,
+            indent=2,
+            default=str
+        )
+
+        if not output_path:
+            return json_str
+
+        output_path = Path(output_path)
+        output_path.parent.mkdir(parents=True, exist_ok=True)
+        output_path.write_text(json_str, encoding='utf-8')
+        logger.info(f"Exported legacy JSON to: {output_path}")
+        return output_path
+
+    @staticmethod
+    def export_all_formats(
+        document: UnifiedDocument,
+        output_dir: Path,
+        file_id: str
+    ) -> Dict[str, Optional[Path]]:
+        """
+        Write the JSON, Markdown and plain-text exports of one document.
+
+        Each export is attempted independently: a failure is logged and
+        recorded as None without stopping the remaining exports.
+
+        Args:
+            document: Document to export.
+            output_dir: Directory for the output files (created if missing).
+            file_id: Filename prefix; outputs are "<file_id>_result.json",
+                "<file_id>_output.md" and "<file_id>_text.txt".
+
+        Returns:
+            Dict with keys 'json', 'markdown' and 'text', each mapped to the
+            written Path or to None when that export raised.
+        """
+        output_dir = Path(output_dir)
+        output_dir.mkdir(parents=True, exist_ok=True)
+
+        exporter = UnifiedDocumentExporter
+        # (result key, label used in the error log, filename suffix, export call)
+        jobs = (
+            ('json', "JSON", "_result.json",
+             lambda target: exporter.export_to_json(
+                 document, target, include_metadata=True, include_statistics=True)),
+            ('markdown', "Markdown", "_output.md",
+             lambda target: exporter.export_to_markdown(document, target)),
+            ('text', "text", "_text.txt",
+             lambda target: exporter.export_to_text(document, target)),
+        )
+
+        results = {}
+        for key, label, suffix, run_export in jobs:
+            try:
+                target = output_dir / f"{file_id}{suffix}"
+                run_export(target)
+                results[key] = target
+            except Exception as e:
+                # Best effort: keep going so one bad format does not block the rest.
+                logger.error(f"Failed to export {label}: {e}")
+                results[key] = None
+
+        return results
+
+    @staticmethod
+    def _build_export_data(
+        document: UnifiedDocument,
+        include_metadata: bool = True,
+        include_statistics: bool = True,
+        include_binary_content: bool = False
+    ) -> Dict[str, Any]:
+        """
+        Assemble the dictionary that export_to_json serializes.
+
+        The result always contains "schema_version", "document_id",
+        "export_timestamp" and "pages". "metadata" and "statistics" are added
+        only when the matching flag is True, and "processing_errors" only
+        when the document has any.
+
+        Args:
+            document: Source document (OCR, direct or hybrid track).
+            include_metadata: Add document.metadata.to_dict() plus a
+                "processing_info" sub-object describing track and schema.
+            include_statistics: Add page/element/table/image counts,
+                per-type element counts and text statistics.
+            include_binary_content: Accepted for interface compatibility;
+                this method does not currently read it.
+
+        Returns:
+            A dict ready to be passed to json.dumps.
+        """
+        # Local import: the module header only imports `datetime` itself.
+        from datetime import timezone
+
+        # datetime.utcnow() is deprecated since Python 3.12. Take an aware UTC
+        # "now" and drop tzinfo so isoformat() yields the same text as before
+        # (no "+00:00" suffix) ahead of the explicit "Z".
+        now_utc = datetime.now(timezone.utc).replace(tzinfo=None)
+
+        # Base document data
+        export_data = {
+            "schema_version": UnifiedDocumentExporter.SCHEMA_VERSION,
+            "document_id": document.document_id,
+            "export_timestamp": now_utc.isoformat() + "Z"
+        }
+
+        # Add metadata
+        if include_metadata:
+            export_data["metadata"] = document.metadata.to_dict()
+
+            # Add extended processing metadata
+            export_data["metadata"]["processing_info"] = {
+                "track_description": UnifiedDocumentExporter._get_track_description(
+                    document.metadata.processing_track
+                ),
+                "schema_version": UnifiedDocumentExporter.SCHEMA_VERSION,
+                "export_format": "unified_document_v1"
+            }
+
+        # Add pages
+        export_data["pages"] = [page.to_dict() for page in document.pages]
+
+        # Add statistics
+        if include_statistics:
+            export_data["statistics"] = {
+                "page_count": document.page_count,
+                "total_elements": document.total_elements,
+                "total_tables": len(document.get_all_tables()),
+                "total_images": len(document.get_all_images()),
+                "element_type_counts": UnifiedDocumentExporter._count_element_types(document),
+                "text_stats": UnifiedDocumentExporter._calculate_text_stats(document)
+            }
+
+        # Add processing errors if any (key is omitted for an empty list)
+        if document.processing_errors:
+            export_data["processing_errors"] = document.processing_errors
+
+        return export_data
+
+    @staticmethod
+    def _get_track_description(track: ProcessingTrack) -> str:
+        """
+        Map a processing track to the descriptive sentence used in exports.
+
+        Returns "Unknown processing track" for any value that is not one of
+        the three tracks listed below.
+        """
+        known = {
+            ProcessingTrack.OCR: "PaddleOCR PP-StructureV3 - Used for scanned documents and images",
+            ProcessingTrack.DIRECT: "PyMuPDF Direct Extraction - Used for editable PDFs with embedded text",
+            ProcessingTrack.HYBRID: "Hybrid Processing - Combined OCR and direct extraction"
+        }
+        if track in known:
+            return known[track]
+        return "Unknown processing track"
+
+    @staticmethod
+    def _count_element_types(document: UnifiedDocument) -> Dict[str, int]:
+        """
+        Tally the elements listed directly on each page by element type.
+
+        Keys are element.type.value strings in first-seen order. Only
+        page.elements is iterated; nested children are not visited here.
+        """
+        tally = {}
+        type_names = (
+            element.type.value
+            for page in document.pages
+            for element in page.elements
+        )
+        for type_name in type_names:
+            if type_name in tally:
+                tally[type_name] += 1
+            else:
+                tally[type_name] = 1
+        return tally
+
+    @staticmethod
+    def _calculate_text_stats(document: UnifiedDocument) -> Dict[str, Any]:
+        """
+        Compute character, word and confidence statistics for the document.
+
+        Args:
+            document: Document whose extract_all_text() output and page
+                elements are inspected.
+
+        Returns:
+            Dict with:
+              - "total_characters": length of the extracted text,
+              - "total_words": number of whitespace-separated tokens,
+              - "average_confidence": mean of all non-None element
+                confidences rounded to 4 decimals, or None when no element
+                on any page carries a confidence value.
+        """
+        full_text = document.extract_all_text()
+
+        # Only elements listed directly on a page contribute; None means
+        # "no confidence reported" and is excluded from the mean.
+        confidences = [
+            element.confidence
+            for page in document.pages
+            for element in page.elements
+            if element.confidence is not None
+        ]
+        avg_confidence = sum(confidences) / len(confidences) if confidences else None
+
+        return {
+            "total_characters": len(full_text),
+            "total_words": len(full_text.split()),
+            # Compare against None explicitly: a genuine average of 0.0 is
+            # falsy and must still be reported as 0.0, not as "unknown".
+            "average_confidence": round(avg_confidence, 4) if avg_confidence is not None else None
+        }
+
+    @staticmethod
+    def _element_to_markdown(element) -> str:
+        """
+        Render one document element as a Markdown fragment.
+
+        Elements without text yield "" unless they are tables or images,
+        which are rendered from their content/metadata instead. Types with
+        no dedicated rule fall through to their plain text.
+        """
+        content = element.get_text()
+        kind = element.type
+
+        # Nothing to show for text-less elements other than tables/images.
+        if not content and kind not in [ElementType.TABLE, ElementType.IMAGE]:
+            return ""
+
+        # Format based on element type (guard clauses, first match wins)
+        if kind == ElementType.TITLE:
+            return f"# {content}"
+        if kind == ElementType.HEADER:
+            return f"### {content}"
+        if kind in [ElementType.TEXT, ElementType.PARAGRAPH, ElementType.BODY]:
+            return content
+        if kind == ElementType.LIST_ITEM:
+            return f"- {content}"
+        if kind == ElementType.TABLE:
+            # Prefer the table object's own HTML rendering when it offers one.
+            if hasattr(element.content, 'to_html'):
+                return element.content.to_html()
+            return f"[Table: {content}]"
+        if kind == ElementType.IMAGE:
+            return f"![Image]({element.metadata.get('path', 'image')})"
+        if kind == ElementType.FIGURE:
+            return f"[Figure: {content or 'No caption'}]"
+        if kind == ElementType.CODE:
+            return f"```\n{content}\n```"
+        if kind == ElementType.EQUATION:
+            return f"$${content}$$"
+        if kind == ElementType.CAPTION:
+            return f"*{content}*"
+        if kind == ElementType.FOOTNOTE:
+            return f"[^{content}]"
+        if kind == ElementType.REFERENCE:
+            return f"> {content}"
+
+        return content if content else ""
+
+
+class JSONSchemaValidator:
+    """
+    Checks exported dictionaries against the UnifiedDocument JSON Schema.
+
+    The parsed schema is cached on the class after the first load.
+    """
+
+    # Parsed schema cache: None = not loaded yet, {} = schema file missing.
+    _schema = None
+
+    @classmethod
+    def get_schema(cls) -> Dict[str, Any]:
+        """
+        Return the parsed schema, reading it from disk on first use.
+
+        The file is looked up at ../schemas/unified_document_schema.json
+        relative to this module. If it does not exist, a warning is logged
+        and an empty dict is cached and returned.
+        """
+        if cls._schema is not None:
+            return cls._schema
+
+        schema_path = Path(__file__).parent.parent / "schemas" / "unified_document_schema.json"
+        if not schema_path.exists():
+            logger.warning(f"Schema file not found: {schema_path}")
+            cls._schema = {}
+        else:
+            cls._schema = json.loads(schema_path.read_text(encoding='utf-8'))
+        return cls._schema
+
+    @classmethod
+    def validate(cls, data: Dict[str, Any]) -> Tuple[bool, Optional[str]]:
+        """
+        Validate data against the UnifiedDocument schema.
+
+        Validation is skipped, and reported as passing, when the jsonschema
+        package cannot be imported or when no schema could be loaded.
+
+        Args:
+            data: The exported document as a dict.
+
+        Returns:
+            (True, None) when the data is valid or validation was skipped;
+            (False, message) when validation, or loading the schema, raised.
+        """
+        try:
+            import jsonschema
+
+            schema = cls.get_schema()
+            # An empty schema means the file was unavailable: nothing to check.
+            if schema:
+                jsonschema.validate(data, schema)
+        except ImportError:
+            logger.warning("jsonschema package not installed, skipping validation")
+            return True, None
+        except Exception as e:
+            return False, str(e)
+        return True, None
+
+
+def save_unified_document(
+    document: UnifiedDocument,
+    output_dir: Path,
+    file_id: str,
+    formats: Optional[list] = None
+) -> Dict[str, Optional[Path]]:
+    """
+    Convenience function to save a UnifiedDocument in several formats.
+
+    Supported format names and their output files:
+        'json'     -> "<file_id>_result.json"
+        'markdown' -> "<file_id>_output.md"
+        'text'     -> "<file_id>_text.txt"
+        'legacy'   -> "<file_id>_legacy.json"
+
+    Args:
+        document: The UnifiedDocument to save
+        output_dir: Output directory (created if missing)
+        file_id: Base filename
+        formats: List of formats to export (default: ['json', 'markdown'])
+
+    Returns:
+        Dictionary mapping each requested, supported format name to the
+        written Path, or to None when that export raised. Unsupported names
+        are logged as a warning and do not appear in the result.
+    """
+    if formats is None:
+        formats = ['json', 'markdown']
+
+    # format name -> (output filename, exporter taking (document, path))
+    exporters = {
+        'json': (f"{file_id}_result.json", UnifiedDocumentExporter.export_to_json),
+        'markdown': (f"{file_id}_output.md", UnifiedDocumentExporter.export_to_markdown),
+        'text': (f"{file_id}_text.txt", UnifiedDocumentExporter.export_to_text),
+        'legacy': (f"{file_id}_legacy.json", UnifiedDocumentExporter.export_to_legacy_json),
+    }
+
+    results = {}
+    output_dir = Path(output_dir)
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    for fmt in formats:
+        # isinstance guard keeps unhashable entries from raising in the lookup.
+        entry = exporters.get(fmt) if isinstance(fmt, str) else None
+        if entry is None:
+            # Previously dropped without a trace; surface it so typos such as
+            # 'md' or 'json_legacy' are visible in the logs.
+            logger.warning(f"Unsupported export format requested: {fmt!r}")
+            continue
+
+        filename, export = entry
+        try:
+            path = output_dir / filename
+            export(document, path)
+            results[fmt] = path
+        except Exception as e:
+            # Best effort: one failing format must not block the others.
+            logger.error(f"Failed to export {fmt}: {e}")
+            results[fmt] = None
+
+    return results
--- a/openspec/changes/dual-track-document-processing/tasks.md
+++ b/openspec/changes/dual-track-document-processing/tasks.md
@@ -59,10 +59,10 @@
  - [x] 4.1.2 Route to appropriate processing engine
  - [x] 4.1.3 Return UnifiedDocument from both tracks
  - [x] 4.1.4 Maintain backward compatibility
- [ ] 4.2 Create unified JSON export
-  - [ ] 4.2.1 Define standardized JSON schema
-  - [ ] 4.2.2 Include processing metadata
-  - [ ] 4.2.3 Support both track outputs
+- [x] 4.2 Create unified JSON export
+  - [x] 4.2.1 Define standardized JSON schema
+  - [x] 4.2.2 Include processing metadata
+  - [x] 4.2.3 Support both track outputs
 - [ ] 4.3 Update PDF generator for UnifiedDocument
  - [ ] 4.3.1 Adapt PDF generation to use UnifiedDocument
  - [ ] 4.3.2 Preserve layout from both tracks