diff --git a/backend/app/schemas/unified_document_schema.json b/backend/app/schemas/unified_document_schema.json
new file mode 100644
index 0000000..0ed708e
--- /dev/null
+++ b/backend/app/schemas/unified_document_schema.json
@@ -0,0 +1,443 @@
+{
+  "$schema": "http://json-schema.org/draft-07/schema#",
+  "$id": "https://tool-ocr.local/schemas/unified-document.json",
+  "title": "UnifiedDocument",
+  "description": "Unified document representation for dual-track OCR/Direct extraction processing",
+  "type": "object",
+  "required": ["document_id", "metadata", "pages", "statistics"],
+  "properties": {
+    "document_id": {
+      "type": "string",
+      "description": "Unique identifier for the document"
+    },
+    "metadata": {
+      "$ref": "#/definitions/DocumentMetadata"
+    },
+    "pages": {
+      "type": "array",
+      "items": {
+        "$ref": "#/definitions/Page"
+      },
+      "description": "List of pages in the document"
+    },
+    "statistics": {
+      "$ref": "#/definitions/DocumentStatistics"
+    },
+    "processing_errors": {
+      "type": "array",
+      "items": {
+        "$ref": "#/definitions/ProcessingError"
+      },
+      "default": [],
+      "description": "List of any errors encountered during processing"
+    }
+  },
+  "definitions": {
+    "DocumentMetadata": {
+      "type": "object",
+      "required": ["filename", "file_type", "file_size", "created_at", "processing_track", "processing_time"],
+      "properties": {
+        "filename": {
+          "type": "string",
+          "description": "Original filename"
+        },
+        "file_type": {
+          "type": "string",
+          "description": "File MIME type or extension"
+        },
+        "file_size": {
+          "type": "integer",
+          "minimum": 0,
+          "description": "File size in bytes"
+        },
+        "created_at": {
+          "type": "string",
+          "format": "date-time",
+          "description": "Processing timestamp (ISO 8601)"
+        },
+        "processing_track": {
+          "type": "string",
+          "enum": ["ocr", "direct", "hybrid"],
+          "description": "Processing track used"
+        },
+        "processing_time": {
+          "type": "number",
+          "minimum": 0,
+          "description": "Processing time in seconds"
+        },
+        "language": {
+          "type": "string",
+          "description": "Detected or specified language code"
+        },
+        "title": {
+          "type": "string",
+          "description": "Document title from metadata"
+        },
+        "author": {
+          "type": "string",
+          "description": "Document author"
+        },
+        "subject": {
+          "type": "string",
+          "description": "Document subject"
+        },
+        "keywords": {
+          "type": "array",
+          "items": {"type": "string"},
+          "description": "Document keywords"
+        },
+        "producer": {
+          "type": "string",
+          "description": "PDF producer application"
+        },
+        "creator": {
+          "type": "string",
+          "description": "Document creator application"
+        },
+        "creation_date": {
+          "type": "string",
+          "format": "date-time",
+          "description": "Document creation date"
+        },
+        "modification_date": {
+          "type": "string",
+          "format": "date-time",
+          "description": "Document last modification date"
+        }
+      }
+    },
+    "Page": {
+      "type": "object",
+      "required": ["page_number", "elements", "dimensions"],
+      "properties": {
+        "page_number": {
+          "type": "integer",
+          "minimum": 1,
+          "description": "1-based page number"
+        },
+        "elements": {
+          "type": "array",
+          "items": {
+            "$ref": "#/definitions/DocumentElement"
+          },
+          "description": "List of elements on the page"
+        },
+        "dimensions": {
+          "$ref": "#/definitions/Dimensions"
+        },
+        "metadata": {
+          "type": "object",
+          "additionalProperties": true,
+          "description": "Additional page-specific metadata"
+        },
+        "statistics": {
+          "$ref": "#/definitions/PageStatistics"
+        }
+      }
+    },
+    "DocumentElement": {
+      "type": "object",
+      "required": ["element_id", "type", "bbox"],
+      "properties": {
+        "element_id": {
+          "type": "string",
+          "description": "Unique identifier for the element"
+        },
+        "type": {
+          "type": "string",
+          "enum": [
+            "text", "title", "header", "footer", "reference", "equation", "footnote", "caption",
+            "list", "list_item",
+            "table", "table_cell", "table_caption",
+            "image", "figure", "chart", "diagram",
+            "section", "paragraph", "page_number", "watermark", "header_group", "body",
+            "code", "formula",
+            "signature", "stamp", "logo", "barcode", "qr_code"
+          ],
+          "description": "Element type (supports all 23 PP-StructureV3 types plus custom ones)"
+        },
+        "content": {
+          "anyOf": [
+            {"type": "string"},
+            {"$ref": "#/definitions/TableData"},
+            {"type": "object"}
+          ],
+          "description": "Element content (text, table data, or structured data)"
+        },
+        "content_type": {
+          "type": "string",
+          "enum": ["text", "table", "binary"],
+          "description": "Type of content when not a simple string"
+        },
+        "content_length": {
+          "type": "integer",
+          "description": "Length of binary content in bytes"
+        },
+        "bbox": {
+          "$ref": "#/definitions/BoundingBox"
+        },
+        "confidence": {
+          "type": "number",
+          "minimum": 0,
+          "maximum": 1,
+          "description": "OCR confidence score (0-1)"
+        },
+        "style": {
+          "$ref": "#/definitions/StyleInfo"
+        },
+        "metadata": {
+          "type": "object",
+          "additionalProperties": true,
+          "description": "Additional element metadata"
+        },
+        "children": {
+          "type": "array",
+          "items": {
+            "$ref": "#/definitions/DocumentElement"
+          },
+          "description": "Nested child elements"
+        }
+      }
+    },
+    "BoundingBox": {
+      "type": "object",
+      "required": ["x0", "y0", "x1", "y1"],
+      "properties": {
+        "x0": {
+          "type": "number",
+          "description": "Left coordinate"
+        },
+        "y0": {
+          "type": "number",
+          "description": "Top coordinate"
+        },
+        "x1": {
+          "type": "number",
+          "description": "Right coordinate"
+        },
+        "y1": {
+          "type": "number",
+          "description": "Bottom coordinate"
+        },
+        "width": {
+          "type": "number",
+          "description": "Width (calculated)"
+        },
+        "height": {
+          "type": "number",
+          "description": "Height (calculated)"
+        }
+      }
+    },
+    "StyleInfo": {
+      "type": "object",
+      "properties": {
+        "font_name": {
+          "type": "string",
+          "description": "Font family name"
+        },
+        "font_size": {
+          "type": "number",
+          "minimum": 0,
+          "description": "Font size in points"
+        },
+        "font_weight": {
+          "type": "string",
+          "enum": ["normal", "bold"],
+          "description": "Font weight"
+        },
+        "font_style": {
+          "type": "string",
+          "enum": ["normal", "italic"],
+          "description": "Font style"
+        },
+        "text_color": {
+          "type": "integer",
+          "description": "Text color as RGB integer"
+        },
+        "text_color_rgb": {
+          "type": "array",
+          "items": {"type": "integer", "minimum": 0, "maximum": 255},
+          "minItems": 3,
+          "maxItems": 3,
+          "description": "Text color as [R, G, B] array"
+        },
+        "bg_color": {
+          "type": "integer",
+          "description": "Background color as RGB integer"
+        },
+        "alignment": {
+          "type": "string",
+          "enum": ["left", "center", "right", "justify"],
+          "description": "Text alignment"
+        }
+      }
+    },
+    "TableData": {
+      "type": "object",
+      "required": ["rows", "cols", "cells"],
+      "properties": {
+        "rows": {
+          "type": "integer",
+          "minimum": 0,
+          "description": "Number of rows"
+        },
+        "cols": {
+          "type": "integer",
+          "minimum": 0,
+          "description": "Number of columns"
+        },
+        "cells": {
+          "type": "array",
+          "items": {
+            "$ref": "#/definitions/TableCell"
+          },
+          "description": "Table cells"
+        },
+        "headers": {
+          "type": "array",
+          "items": {"type": "string"},
+          "description": "Header row labels"
+        },
+        "caption": {
+          "type": "string",
+          "description": "Table caption"
+        }
+      }
+    },
+    "TableCell": {
+      "type": "object",
+      "required": ["row", "col"],
+      "properties": {
+        "row": {
+          "type": "integer",
+          "minimum": 0,
+          "description": "Row index (0-based)"
+        },
+        "col": {
+          "type": "integer",
+          "minimum": 0,
+          "description": "Column index (0-based)"
+        },
+        "row_span": {
+          "type": "integer",
+          "minimum": 1,
+          "default": 1,
+          "description": "Number of rows spanned"
+        },
+        "col_span": {
+          "type": "integer",
+          "minimum": 1,
+          "default": 1,
+          "description": "Number of columns spanned"
+        },
+        "content": {
+          "type": "string",
+          "default": "",
+          "description": "Cell text content"
+        },
+        "bbox": {
+          "$ref": "#/definitions/BoundingBox"
+        },
+        "style": {
+          "$ref": "#/definitions/StyleInfo"
+        }
+      }
+    },
+    "Dimensions": {
+      "type": "object",
+      "required": ["width", "height"],
+      "properties": {
+        "width": {
+          "type": "number",
+          "minimum": 0,
+          "description": "Width in pixels or points"
+        },
+        "height": {
+          "type": "number",
+          "minimum": 0,
+          "description": "Height in pixels or points"
+        },
+        "dpi": {
+          "type": "integer",
+          "minimum": 1,
+          "description": "Resolution in DPI"
+        }
+      }
+    },
+    "DocumentStatistics": {
+      "type": "object",
+      "required": ["page_count", "total_elements", "total_tables", "total_images"],
+      "properties": {
+        "page_count": {
+          "type": "integer",
+          "minimum": 0,
+          "description": "Total number of pages"
+        },
+        "total_elements": {
+          "type": "integer",
+          "minimum": 0,
+          "description": "Total elements across all pages"
+        },
+        "total_tables": {
+          "type": "integer",
+          "minimum": 0,
+          "description": "Total tables across all pages"
+        },
+        "total_images": {
+          "type": "integer",
+          "minimum": 0,
+          "description": "Total images across all pages"
+        }
+      }
+    },
+    "PageStatistics": {
+      "type": "object",
+      "required": ["total_elements", "text_elements", "tables", "images"],
+      "properties": {
+        "total_elements": {
+          "type": "integer",
+          "minimum": 0
+        },
+        "text_elements": {
+          "type": "integer",
+          "minimum": 0
+        },
+        "tables": {
+          "type": "integer",
+          "minimum": 0
+        },
+        "images": {
+          "type": "integer",
+          "minimum": 0
+        }
+      }
+    },
+    "ProcessingError": {
+      "type": "object",
+      "required": ["error_type", "message"],
+      "properties": {
+        "error_type": {
+          "type": "string",
+          "description": "Error classification"
+        },
+        "message": {
+          "type": "string",
+          "description": "Error description"
+        },
+        "page": {
+          "type": "integer",
+          "description": "Page number where error occurred"
+        },
+        "element_id": {
+          "type": "string",
+          "description": "Element ID if applicable"
+        },
+        "timestamp": {
+          "type": "string",
+          "format": "date-time",
+          "description": "When the error occurred"
+        }
+      }
+    }
+  }
+}
diff --git a/backend/app/services/__init__.py b/backend/app/services/__init__.py
index e986066..6461ffa 100644
--- a/backend/app/services/__init__.py
+++ b/backend/app/services/__init__.py
@@ -1,3 +1,17 @@
 """
 Tool_OCR - Services Package
 """
+
+from .unified_document_exporter import (
+    UnifiedDocumentExporter,
+    ExportFormat,
+    JSONSchemaValidator,
+    save_unified_document
+)
+
+__all__ = [
+    'UnifiedDocumentExporter',
+    'ExportFormat',
+    'JSONSchemaValidator',
+    'save_unified_document'
+]
diff --git a/backend/app/services/ocr_service.py b/backend/app/services/ocr_service.py
index 72ddfec..7a92752 100644
--- a/backend/app/services/ocr_service.py
+++ b/backend/app/services/ocr_service.py
@@ -23,6 +23,7 @@ try:
     from app.services.document_type_detector import DocumentTypeDetector, ProcessingTrackRecommendation
     from app.services.direct_extraction_engine import DirectExtractionEngine
     from app.services.ocr_to_unified_converter import OCRToUnifiedConverter
+    from app.services.unified_document_exporter import UnifiedDocumentExporter
     from app.models.unified_document import (
         UnifiedDocument, DocumentMetadata,
         ProcessingTrack, ElementType, DocumentElement, Page, Dimensions,
@@ -30,8 +31,9 @@ try:
     )
     DUAL_TRACK_AVAILABLE = True
 except ImportError as e:
-    logger.warning(f"Dual-track components not available: {e}")
+    logging.getLogger(__name__).warning(f"Dual-track components not available: {e}")
     DUAL_TRACK_AVAILABLE = False
+    UnifiedDocumentExporter = None
 
 logger = logging.getLogger(__name__)
 
@@ -1175,26 +1177,39 @@ class OCRService:
         try:
             output_dir.mkdir(parents=True, exist_ok=True)
 
-            # Convert UnifiedDocument to dict if needed
-            if isinstance(result, UnifiedDocument):
-                result_dict = result.to_dict()
-                legacy_result = result.to_legacy_format()
+            # Test the exporter sentinel first: if the dual-track import failed, UnifiedDocument is unbound
+            if UnifiedDocumentExporter is not None and isinstance(result, UnifiedDocument):
+                # Use the new exporter for UnifiedDocument
+                json_path = output_dir / f"{file_id}_result.json"
+                UnifiedDocumentExporter.export_to_json(
+                    result,
+                    json_path,
+                    include_metadata=True,
+                    include_statistics=True
+                )
+
+                markdown_path = output_dir / f"{file_id}_output.md"
+                UnifiedDocumentExporter.export_to_markdown(
+                    result,
+                    markdown_path,
+                    include_metadata_header=False  # Keep output clean
+                )
+
                 markdown_content = result.extract_all_text()
             else:
-                result_dict = result
-                legacy_result = result
-                markdown_content = result.get('markdown_content', '')
+                # Legacy path: plain dicts, or document objects when the exporter is unavailable
+                result_dict = result if isinstance(result, dict) else result.to_dict()
+                markdown_content = result.get('markdown_content', '') if isinstance(result, dict) else result.extract_all_text()
 
-            # Save JSON (use dict format for compatibility)
-            json_path = output_dir / f"{file_id}_result.json"
-            with open(json_path, 'w', encoding='utf-8') as f:
-                json.dump(result_dict if isinstance(result, UnifiedDocument) else result,
-                          f, ensure_ascii=False, indent=2)
+                # Save JSON
+                json_path = output_dir / f"{file_id}_result.json"
+                with open(json_path, 'w', encoding='utf-8') as f:
+                    json.dump(result_dict, f, ensure_ascii=False, indent=2)
 
-            # Save Markdown
-            markdown_path = output_dir / f"{file_id}_output.md"
-            with open(markdown_path, 'w', encoding='utf-8') as f:
-                f.write(markdown_content)
+                # Save Markdown
+                markdown_path = output_dir / f"{file_id}_output.md"
+                with open(markdown_path, 'w', encoding='utf-8') as f:
+                    f.write(markdown_content)
 
             logger.info(f"Results saved: {json_path.name}, {markdown_path.name}")
 
diff --git a/backend/app/services/unified_document_exporter.py b/backend/app/services/unified_document_exporter.py
new file mode 100644
index 0000000..10dd809
--- /dev/null
+++ b/backend/app/services/unified_document_exporter.py
@@ -0,0 +1,506 @@
+"""
+Unified Document Exporter Service
+
+Provides standardized export functionality for UnifiedDocument,
+supporting both OCR and Direct extraction track outputs with
+comprehensive processing metadata.
+"""
+
+import json
+import logging
+from pathlib import Path
+from typing import Dict, Any, Optional, Tuple, Union
+from datetime import datetime, timezone
+
+from ..models.unified_document import (
+    UnifiedDocument,
+    ProcessingTrack,
+    ElementType
+)
+
+logger = logging.getLogger(__name__)
+
+
+class ExportFormat:
+    """Supported export formats"""
+    JSON = "json"
+    JSON_MINIMAL = "json_minimal"
+    JSON_LEGACY = "json_legacy"
+    MARKDOWN = "markdown"
+    TEXT = "text"
+    HTML = "html"
+
+
+class UnifiedDocumentExporter:
+    """
+    Exporter service for UnifiedDocument with comprehensive metadata support.
+
+    Supports both OCR and DIRECT processing tracks with consistent output format.
+    """
+
+    # Schema version for tracking format changes
+    SCHEMA_VERSION = "1.0.0"
+
+    @staticmethod
+    def export_to_json(
+        document: UnifiedDocument,
+        output_path: Optional[Path] = None,
+        include_metadata: bool = True,
+        include_statistics: bool = True,
+        include_binary_content: bool = False,
+        pretty_print: bool = True
+    ) -> Union[str, Path]:
+        """
+        Export UnifiedDocument to standardized JSON format.
+
+        Args:
+            document: The UnifiedDocument to export
+            output_path: Optional path to save the JSON file
+            include_metadata: Include processing metadata
+            include_statistics: Include document statistics
+            include_binary_content: Include base64-encoded binary content
+            pretty_print: Format JSON with indentation
+
+        Returns:
+            JSON string if no output_path, otherwise the output Path
+        """
+        export_data = UnifiedDocumentExporter._build_export_data(
+            document,
+            include_metadata=include_metadata,
+            include_statistics=include_statistics,
+            include_binary_content=include_binary_content
+        )
+
+        json_str = json.dumps(
+            export_data,
+            ensure_ascii=False,
+            indent=2 if pretty_print else None,
+            default=str
+        )
+
+        if output_path:
+            output_path = Path(output_path)
+            output_path.parent.mkdir(parents=True, exist_ok=True)
+            output_path.write_text(json_str, encoding='utf-8')
+            logger.info(f"Exported JSON to: {output_path}")
+            return output_path
+
+        return json_str
+
+    @staticmethod
+    def export_to_markdown(
+        document: UnifiedDocument,
+        output_path: Optional[Path] = None,
+        include_metadata_header: bool = True,
+        include_page_breaks: bool = True
+    ) -> Union[str, Path]:
+        """
+        Export UnifiedDocument to Markdown format.
+
+        Args:
+            document: The UnifiedDocument to export
+            output_path: Optional path to save the Markdown file
+            include_metadata_header: Include document metadata as header
+            include_page_breaks: Include page break markers
+
+        Returns:
+            Markdown string if no output_path, otherwise the output Path
+        """
+        lines = []
+
+        # Add metadata header
+        if include_metadata_header:
+            lines.append(f"# {document.metadata.filename}")
+            lines.append("")
+            lines.append("## Document Info")
+            lines.append(f"- **Processing Track**: {document.metadata.processing_track.value}")
+            lines.append(f"- **Processing Time**: {document.metadata.processing_time:.2f}s")
+            lines.append(f"- **Pages**: {document.page_count}")
+            lines.append(f"- **Total Elements**: {document.total_elements}")
+            if document.metadata.language:
+                lines.append(f"- **Language**: {document.metadata.language}")
+            lines.append("")
+            lines.append("---")
+            lines.append("")
+
+        # Export each page
+        for page in document.pages:
+            if include_page_breaks and page.page_number > 1:
+                lines.append("")
+                lines.append("---")
+                lines.append("")
+
+            lines.append(f"## Page {page.page_number}")
+            lines.append("")
+
+            # Get elements in reading order
+            for element in page.get_reading_order():
+                content = UnifiedDocumentExporter._element_to_markdown(element)
+                if content:
+                    lines.append(content)
+                    lines.append("")
+
+        md_content = "\n".join(lines)
+
+        if output_path:
+            output_path = Path(output_path)
+            output_path.parent.mkdir(parents=True, exist_ok=True)
+            output_path.write_text(md_content, encoding='utf-8')
+            logger.info(f"Exported Markdown to: {output_path}")
+            return output_path
+
+        return md_content
+
+    @staticmethod
+    def export_to_text(
+        document: UnifiedDocument,
+        output_path: Optional[Path] = None,
+        page_separator: str = "\n\n--- Page Break ---\n\n"
+    ) -> Union[str, Path]:
+        """
+        Export UnifiedDocument to plain text format.
+
+        Args:
+            document: The UnifiedDocument to export
+            output_path: Optional path to save the text file
+            page_separator: Separator between pages
+
+        Returns:
+            Text string if no output_path, otherwise the output Path
+        """
+        pages_text = []
+
+        for page in document.pages:
+            page_text = page.extract_text()
+            if page_text:
+                pages_text.append(page_text)
+
+        text_content = page_separator.join(pages_text)
+
+        if output_path:
+            output_path = Path(output_path)
+            output_path.parent.mkdir(parents=True, exist_ok=True)
+            output_path.write_text(text_content, encoding='utf-8')
+            logger.info(f"Exported text to: {output_path}")
+            return output_path
+
+        return text_content
+
+    @staticmethod
+    def export_to_legacy_json(
+        document: UnifiedDocument,
+        output_path: Optional[Path] = None
+    ) -> Union[str, Path]:
+        """
+        Export UnifiedDocument to legacy JSON format for backward compatibility.
+
+        Args:
+            document: The UnifiedDocument to export
+            output_path: Optional path to save the JSON file
+
+        Returns:
+            JSON string if no output_path, otherwise the output Path
+        """
+        legacy_data = document.to_legacy_format()
+
+        json_str = json.dumps(
+            legacy_data,
+            ensure_ascii=False,
+            indent=2,
+            default=str
+        )
+
+        if output_path:
+            output_path = Path(output_path)
+            output_path.parent.mkdir(parents=True, exist_ok=True)
+            output_path.write_text(json_str, encoding='utf-8')
+            logger.info(f"Exported legacy JSON to: {output_path}")
+            return output_path
+
+        return json_str
+
+    @staticmethod
+    def export_all_formats(
+        document: UnifiedDocument,
+        output_dir: Path,
+        file_id: str
+    ) -> Dict[str, Optional[Path]]:
+        """
+        Export UnifiedDocument to all standard formats.
+
+        Args:
+            document: The UnifiedDocument to export
+            output_dir: Directory to save output files
+            file_id: Base filename for outputs
+
+        Returns:
+            Dictionary mapping format names to output paths
+        """
+        output_dir = Path(output_dir)
+        output_dir.mkdir(parents=True, exist_ok=True)
+
+        results = {}
+
+        # Export JSON
+        try:
+            json_path = output_dir / f"{file_id}_result.json"
+            UnifiedDocumentExporter.export_to_json(
+                document,
+                json_path,
+                include_metadata=True,
+                include_statistics=True
+            )
+            results['json'] = json_path
+        except Exception as e:
+            logger.error(f"Failed to export JSON: {e}")
+            results['json'] = None
+
+        # Export Markdown
+        try:
+            md_path = output_dir / f"{file_id}_output.md"
+            UnifiedDocumentExporter.export_to_markdown(document, md_path)
+            results['markdown'] = md_path
+        except Exception as e:
+            logger.error(f"Failed to export Markdown: {e}")
+            results['markdown'] = None
+
+        # Export plain text
+        try:
+            txt_path = output_dir / f"{file_id}_text.txt"
+            UnifiedDocumentExporter.export_to_text(document, txt_path)
+            results['text'] = txt_path
+        except Exception as e:
+            logger.error(f"Failed to export text: {e}")
+            results['text'] = None
+
+        return results
+
+    @staticmethod
+    def _build_export_data(
+        document: UnifiedDocument,
+        include_metadata: bool = True,
+        include_statistics: bool = True,
+        include_binary_content: bool = False
+    ) -> Dict[str, Any]:
+        """
+        Build the export data structure with processing metadata.
+
+        Supports both OCR and DIRECT track outputs with consistent format.
+        """
+        # Base document data; the timestamp is timezone-aware UTC rendered with a trailing "Z"
+        export_data = {
+            "schema_version": UnifiedDocumentExporter.SCHEMA_VERSION,
+            "document_id": document.document_id,
+            "export_timestamp": datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")
+        }
+
+        # Add metadata
+        if include_metadata:
+            export_data["metadata"] = document.metadata.to_dict()
+
+            # Add extended processing metadata
+            export_data["metadata"]["processing_info"] = {
+                "track_description": UnifiedDocumentExporter._get_track_description(
+                    document.metadata.processing_track
+                ),
+                "schema_version": UnifiedDocumentExporter.SCHEMA_VERSION,
+                "export_format": "unified_document_v1"
+            }
+
+        # Add pages
+        export_data["pages"] = [page.to_dict() for page in document.pages]
+
+        # Add statistics
+        if include_statistics:
+            export_data["statistics"] = {
+                "page_count": document.page_count,
+                "total_elements": document.total_elements,
+                "total_tables": len(document.get_all_tables()),
+                "total_images": len(document.get_all_images()),
+                "element_type_counts": UnifiedDocumentExporter._count_element_types(document),
+                "text_stats": UnifiedDocumentExporter._calculate_text_stats(document)
+            }
+
+        # Add processing errors if any
+        if document.processing_errors:
+            export_data["processing_errors"] = document.processing_errors
+
+        return export_data
+
+    @staticmethod
+    def _get_track_description(track: ProcessingTrack) -> str:
+        """Get human-readable description for processing track."""
+        descriptions = {
+            ProcessingTrack.OCR: "PaddleOCR PP-StructureV3 - Used for scanned documents and images",
+            ProcessingTrack.DIRECT: "PyMuPDF Direct Extraction - Used for editable PDFs with embedded text",
+            ProcessingTrack.HYBRID: "Hybrid Processing - Combined OCR and direct extraction"
+        }
+        return descriptions.get(track, "Unknown processing track")
+
+    @staticmethod
+    def _count_element_types(document: UnifiedDocument) -> Dict[str, int]:
+        """Count occurrences of each element type in the document."""
+        counts = {}
+        for page in document.pages:
+            for element in page.elements:
+                type_name = element.type.value
+                counts[type_name] = counts.get(type_name, 0) + 1
+        return counts
+
+    @staticmethod
+    def _calculate_text_stats(document: UnifiedDocument) -> Dict[str, Any]:
+        """Calculate text statistics for the document."""
+        full_text = document.extract_all_text()
+        words = full_text.split()
+        chars = len(full_text)
+
+        # Average confidence over the page-level elements that report a score
+        confidences = []
+        for page in document.pages:
+            for element in page.elements:
+                if element.confidence is not None:
+                    confidences.append(element.confidence)
+
+        avg_confidence = sum(confidences) / len(confidences) if confidences else None
+        # Compare against None (not truthiness) so a genuine 0.0 average is still reported
+        return {
+            "total_characters": chars,
+            "total_words": len(words),
+            "average_confidence": round(avg_confidence, 4) if avg_confidence is not None else None
+        }
+
+    @staticmethod
+    def _element_to_markdown(element) -> str:
+        """Convert a document element to Markdown format."""
+        content = element.get_text()
+        # Tables, images and figures have a rendering without text; other empty elements are dropped
+        if not content and element.type not in [ElementType.TABLE, ElementType.IMAGE, ElementType.FIGURE]:
+            return ""
+
+        # Format based on element type
+        if element.type == ElementType.TITLE:
+            return f"# {content}"
+        elif element.type == ElementType.HEADER:
+            return f"### {content}"
+        elif element.type in [ElementType.TEXT, ElementType.PARAGRAPH, ElementType.BODY]:
+            return content
+        elif element.type == ElementType.LIST_ITEM:
+            return f"- {content}"
+        elif element.type == ElementType.TABLE:
+            # Use HTML table if available
+            if hasattr(element.content, 'to_html'):
+                return element.content.to_html()
+            return f"[Table: {content}]"
+        elif element.type == ElementType.IMAGE:
+            return f"![Image]({element.metadata.get('path', 'image')})"
+        elif element.type == ElementType.FIGURE:
+            return f"[Figure: {content or 'No caption'}]"
+        elif element.type == ElementType.CODE:
+            return f"```\n{content}\n```"
+        elif element.type == ElementType.EQUATION:
+            return f"$${content}$$"
+        elif element.type == ElementType.CAPTION:
+            return f"*{content}*"
+        elif element.type == ElementType.FOOTNOTE:
+            return f"[^{content}]"
+        elif element.type == ElementType.REFERENCE:
+            return f"> {content}"
+        else:
+            return content if content else ""
+
+
+class JSONSchemaValidator:
+    """
+    Validator for UnifiedDocument JSON exports.
+
+    Uses the JSON Schema definition to validate exported data.
+    """
+
+    _schema = None
+
+    @classmethod
+    def get_schema(cls) -> Dict[str, Any]:
+        """Load and return the JSON Schema for UnifiedDocument."""
+        if cls._schema is None:
+            schema_path = Path(__file__).parent.parent / "schemas" / "unified_document_schema.json"
+            if schema_path.exists():
+                cls._schema = json.loads(schema_path.read_text(encoding='utf-8'))
+            else:
+                logger.warning(f"Schema file not found: {schema_path}")
+                cls._schema = {}
+        return cls._schema
+
+    @classmethod
+    def validate(cls, data: Dict[str, Any]) -> Tuple[bool, Optional[str]]:
+        """
+        Validate JSON data against the UnifiedDocument schema.
+
+        Args:
+            data: The JSON data to validate
+
+        Returns:
+            Tuple of (is_valid, error_message)
+        """
+        try:
+            import jsonschema
+            schema = cls.get_schema()
+            if not schema:
+                return True, None  # Skip validation if schema not available
+
+            jsonschema.validate(data, schema)
+            return True, None
+        except ImportError:
+            logger.warning("jsonschema package not installed, skipping validation")
+            return True, None
+        except Exception as e:
+            return False, str(e)
+
+
+def save_unified_document(
+    document: UnifiedDocument,
+    output_dir: Path,
+    file_id: str,
+    formats: Optional[list] = None
+) -> Dict[str, Optional[Path]]:
+    """
+    Convenience function to save UnifiedDocument to multiple formats.
+
+    Args:
+        document: The UnifiedDocument to save
+        output_dir: Output directory
+        file_id: Base filename
+        formats: Formats to export: 'json', 'markdown', 'text', 'legacy'/'json_legacy' (default: ['json', 'markdown'])
+
+    Returns:
+        Dictionary mapping format names to output paths
+    """
+    if formats is None:
+        formats = ['json', 'markdown']
+
+    results = {}
+    output_dir = Path(output_dir)
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    for fmt in formats:
+        try:
+            if fmt == 'json':
+                path = output_dir / f"{file_id}_result.json"
+                UnifiedDocumentExporter.export_to_json(document, path)
+                results['json'] = path
+            elif fmt == 'markdown':
+                path = output_dir / f"{file_id}_output.md"
+                UnifiedDocumentExporter.export_to_markdown(document, path)
+                results['markdown'] = path
+            elif fmt == 'text':
+                path = output_dir / f"{file_id}_text.txt"
+                UnifiedDocumentExporter.export_to_text(document, path)
+                results['text'] = path
+            elif fmt in ('legacy', ExportFormat.JSON_LEGACY):
+                path = output_dir / f"{file_id}_legacy.json"
+                UnifiedDocumentExporter.export_to_legacy_json(document, path)
+                results[fmt] = path
+        except Exception as e:
+            logger.error(f"Failed to export {fmt}: {e}")
+            results[fmt] = None
+
+    return results
diff --git a/openspec/changes/dual-track-document-processing/tasks.md b/openspec/changes/dual-track-document-processing/tasks.md
index 85fbfa7..338437f 100644
--- a/openspec/changes/dual-track-document-processing/tasks.md
+++ b/openspec/changes/dual-track-document-processing/tasks.md
@@ -59,10 +59,10 @@
   - [x] 4.1.2 Route to appropriate processing engine
   - [x] 4.1.3 Return UnifiedDocument from both tracks
   - [x] 4.1.4 Maintain backward compatibility
-- [ ] 4.2 Create unified JSON export
-  - [ ] 4.2.1 Define standardized JSON schema
-  - [ ] 4.2.2 Include processing metadata
-  - [ ] 4.2.3 Support both track outputs
+- [x] 4.2 Create unified JSON export
+  - [x] 4.2.1 Define standardized JSON schema
+  - [x] 4.2.2 Include processing metadata
+  - [x] 4.2.3 Support both track outputs
 - [ ] 4.3 Update PDF generator for UnifiedDocument
   - [ ] 4.3.1 Adapt PDF generation to use UnifiedDocument
   - [ ] 4.3.2 Preserve layout from both tracks