""" Unified Document Exporter Service Provides standardized export functionality for UnifiedDocument, supporting both OCR and Direct extraction track outputs with comprehensive processing metadata. """ import json import logging from pathlib import Path from typing import Dict, Any, Optional, Tuple, Union from datetime import datetime from ..models.unified_document import ( UnifiedDocument, ProcessingTrack, ElementType ) logger = logging.getLogger(__name__) class ExportFormat: """Supported export formats""" JSON = "json" JSON_MINIMAL = "json_minimal" JSON_LEGACY = "json_legacy" MARKDOWN = "markdown" TEXT = "text" HTML = "html" class UnifiedDocumentExporter: """ Exporter service for UnifiedDocument with comprehensive metadata support. Supports both OCR and DIRECT processing tracks with consistent output format. """ # Schema version for tracking format changes SCHEMA_VERSION = "1.0.0" @staticmethod def export_to_json( document: UnifiedDocument, output_path: Optional[Path] = None, include_metadata: bool = True, include_statistics: bool = True, include_binary_content: bool = False, pretty_print: bool = True ) -> Union[str, Path]: """ Export UnifiedDocument to standardized JSON format. Args: document: The UnifiedDocument to export output_path: Optional path to save the JSON file include_metadata: Include processing metadata include_statistics: Include document statistics include_binary_content: Include base64-encoded binary content pretty_print: Format JSON with indentation Returns: JSON string if no output_path, otherwise the output Path """ export_data = UnifiedDocumentExporter._build_export_data( document, include_metadata=include_metadata, include_statistics=include_statistics, include_binary_content=include_binary_content ) json_str = json.dumps( export_data, ensure_ascii=False, indent=2 if pretty_print else None, default=str ) if output_path: output_path = Path(output_path) output_path.parent.mkdir(parents=True, exist_ok=True) output_path.write_text(json_str, encoding='utf-8') logger.info(f"Exported JSON to: {output_path}") return output_path return json_str @staticmethod def export_to_markdown( document: UnifiedDocument, output_path: Optional[Path] = None, include_metadata_header: bool = True, include_page_breaks: bool = True ) -> Union[str, Path]: """ Export UnifiedDocument to Markdown format. Args: document: The UnifiedDocument to export output_path: Optional path to save the Markdown file include_metadata_header: Include document metadata as header include_page_breaks: Include page break markers Returns: Markdown string if no output_path, otherwise the output Path """ lines = [] # Add metadata header if include_metadata_header: lines.append(f"# {document.metadata.filename}") lines.append("") lines.append("## Document Info") lines.append(f"- **Processing Track**: {document.metadata.processing_track.value}") lines.append(f"- **Processing Time**: {document.metadata.processing_time:.2f}s") lines.append(f"- **Pages**: {document.page_count}") lines.append(f"- **Total Elements**: {document.total_elements}") if document.metadata.language: lines.append(f"- **Language**: {document.metadata.language}") lines.append("") lines.append("---") lines.append("") # Export each page for page in document.pages: if include_page_breaks and page.page_number > 1: lines.append("") lines.append(f"---") lines.append("") lines.append(f"## Page {page.page_number}") lines.append("") # Get elements in reading order for element in page.get_reading_order(): content = UnifiedDocumentExporter._element_to_markdown(element) if content: lines.append(content) lines.append("") md_content = "\n".join(lines) if output_path: output_path = Path(output_path) output_path.parent.mkdir(parents=True, exist_ok=True) output_path.write_text(md_content, encoding='utf-8') logger.info(f"Exported Markdown to: {output_path}") return output_path return md_content @staticmethod def export_to_text( document: UnifiedDocument, output_path: Optional[Path] = None, page_separator: str = "\n\n--- Page Break ---\n\n" ) -> Union[str, Path]: """ Export UnifiedDocument to plain text format. Args: document: The UnifiedDocument to export output_path: Optional path to save the text file page_separator: Separator between pages Returns: Text string if no output_path, otherwise the output Path """ pages_text = [] for page in document.pages: page_text = page.extract_text() if page_text: pages_text.append(page_text) text_content = page_separator.join(pages_text) if output_path: output_path = Path(output_path) output_path.parent.mkdir(parents=True, exist_ok=True) output_path.write_text(text_content, encoding='utf-8') logger.info(f"Exported text to: {output_path}") return output_path return text_content @staticmethod def export_to_legacy_json( document: UnifiedDocument, output_path: Optional[Path] = None ) -> Union[str, Path]: """ Export UnifiedDocument to legacy JSON format for backward compatibility. Args: document: The UnifiedDocument to export output_path: Optional path to save the JSON file Returns: JSON string if no output_path, otherwise the output Path """ legacy_data = document.to_legacy_format() json_str = json.dumps( legacy_data, ensure_ascii=False, indent=2, default=str ) if output_path: output_path = Path(output_path) output_path.parent.mkdir(parents=True, exist_ok=True) output_path.write_text(json_str, encoding='utf-8') logger.info(f"Exported legacy JSON to: {output_path}") return output_path return json_str @staticmethod def export_all_formats( document: UnifiedDocument, output_dir: Path, file_id: str ) -> Dict[str, Optional[Path]]: """ Export UnifiedDocument to all standard formats. Args: document: The UnifiedDocument to export output_dir: Directory to save output files file_id: Base filename for outputs Returns: Dictionary mapping format names to output paths """ output_dir = Path(output_dir) output_dir.mkdir(parents=True, exist_ok=True) results = {} # Export JSON try: json_path = output_dir / f"{file_id}_result.json" UnifiedDocumentExporter.export_to_json( document, json_path, include_metadata=True, include_statistics=True ) results['json'] = json_path except Exception as e: logger.error(f"Failed to export JSON: {e}") results['json'] = None # Export Markdown try: md_path = output_dir / f"{file_id}_output.md" UnifiedDocumentExporter.export_to_markdown(document, md_path) results['markdown'] = md_path except Exception as e: logger.error(f"Failed to export Markdown: {e}") results['markdown'] = None # Export plain text try: txt_path = output_dir / f"{file_id}_text.txt" UnifiedDocumentExporter.export_to_text(document, txt_path) results['text'] = txt_path except Exception as e: logger.error(f"Failed to export text: {e}") results['text'] = None return results @staticmethod def _build_export_data( document: UnifiedDocument, include_metadata: bool = True, include_statistics: bool = True, include_binary_content: bool = False ) -> Dict[str, Any]: """ Build the export data structure with processing metadata. Supports both OCR and DIRECT track outputs with consistent format. """ # Base document data export_data = { "schema_version": UnifiedDocumentExporter.SCHEMA_VERSION, "document_id": document.document_id, "export_timestamp": datetime.utcnow().isoformat() + "Z" } # Add metadata if include_metadata: export_data["metadata"] = document.metadata.to_dict() # Add extended processing metadata export_data["metadata"]["processing_info"] = { "track_description": UnifiedDocumentExporter._get_track_description( document.metadata.processing_track ), "schema_version": UnifiedDocumentExporter.SCHEMA_VERSION, "export_format": "unified_document_v1" } # Add pages export_data["pages"] = [page.to_dict() for page in document.pages] # Add statistics if include_statistics: export_data["statistics"] = { "page_count": document.page_count, "total_elements": document.total_elements, "total_tables": len(document.get_all_tables()), "total_images": len(document.get_all_images()), "element_type_counts": UnifiedDocumentExporter._count_element_types(document), "text_stats": UnifiedDocumentExporter._calculate_text_stats(document) } # Add processing errors if any if document.processing_errors: export_data["processing_errors"] = document.processing_errors return export_data @staticmethod def _get_track_description(track: ProcessingTrack) -> str: """Get human-readable description for processing track.""" descriptions = { ProcessingTrack.OCR: "PaddleOCR PP-StructureV3 - Used for scanned documents and images", ProcessingTrack.DIRECT: "PyMuPDF Direct Extraction - Used for editable PDFs with embedded text", ProcessingTrack.HYBRID: "Hybrid Processing - Combined OCR and direct extraction" } return descriptions.get(track, "Unknown processing track") @staticmethod def _count_element_types(document: UnifiedDocument) -> Dict[str, int]: """Count occurrences of each element type in the document.""" counts = {} for page in document.pages: for element in page.elements: type_name = element.type.value counts[type_name] = counts.get(type_name, 0) + 1 return counts @staticmethod def _calculate_text_stats(document: UnifiedDocument) -> Dict[str, Any]: """Calculate text statistics for the document.""" full_text = document.extract_all_text() words = full_text.split() chars = len(full_text) # Calculate average confidence confidences = [] for page in document.pages: for element in page.elements: if element.confidence is not None: confidences.append(element.confidence) avg_confidence = sum(confidences) / len(confidences) if confidences else None return { "total_characters": chars, "total_words": len(words), "average_confidence": round(avg_confidence, 4) if avg_confidence else None } @staticmethod def _element_to_markdown(element) -> str: """Convert a document element to Markdown format.""" content = element.get_text() if not content and element.type not in [ElementType.TABLE, ElementType.IMAGE]: return "" # Format based on element type if element.type == ElementType.TITLE: return f"# {content}" elif element.type == ElementType.HEADER: return f"### {content}" elif element.type in [ElementType.TEXT, ElementType.PARAGRAPH, ElementType.BODY]: return content elif element.type == ElementType.LIST_ITEM: return f"- {content}" elif element.type == ElementType.TABLE: # Use HTML table if available if hasattr(element.content, 'to_html'): return element.content.to_html() return f"[Table: {content}]" elif element.type == ElementType.IMAGE: return f"![Image]({element.metadata.get('path', 'image')})" elif element.type == ElementType.FIGURE: return f"[Figure: {content or 'No caption'}]" elif element.type == ElementType.CODE: return f"```\n{content}\n```" elif element.type == ElementType.EQUATION: return f"$${content}$$" elif element.type == ElementType.CAPTION: return f"*{content}*" elif element.type == ElementType.FOOTNOTE: return f"[^{content}]" elif element.type == ElementType.REFERENCE: return f"> {content}" else: return content if content else "" class JSONSchemaValidator: """ Validator for UnifiedDocument JSON exports. Uses the JSON Schema definition to validate exported data. """ _schema = None @classmethod def get_schema(cls) -> Dict[str, Any]: """Load and return the JSON Schema for UnifiedDocument.""" if cls._schema is None: schema_path = Path(__file__).parent.parent / "schemas" / "unified_document_schema.json" if schema_path.exists(): cls._schema = json.loads(schema_path.read_text(encoding='utf-8')) else: logger.warning(f"Schema file not found: {schema_path}") cls._schema = {} return cls._schema @classmethod def validate(cls, data: Dict[str, Any]) -> Tuple[bool, Optional[str]]: """ Validate JSON data against the UnifiedDocument schema. Args: data: The JSON data to validate Returns: Tuple of (is_valid, error_message) """ try: import jsonschema schema = cls.get_schema() if not schema: return True, None # Skip validation if schema not available jsonschema.validate(data, schema) return True, None except ImportError: logger.warning("jsonschema package not installed, skipping validation") return True, None except Exception as e: return False, str(e) def save_unified_document( document: UnifiedDocument, output_dir: Path, file_id: str, formats: Optional[list] = None ) -> Dict[str, Optional[Path]]: """ Convenience function to save UnifiedDocument to multiple formats. Args: document: The UnifiedDocument to save output_dir: Output directory file_id: Base filename formats: List of formats to export (default: ['json', 'markdown']) Returns: Dictionary mapping format names to output paths """ if formats is None: formats = ['json', 'markdown'] results = {} output_dir = Path(output_dir) output_dir.mkdir(parents=True, exist_ok=True) for fmt in formats: try: if fmt == 'json': path = output_dir / f"{file_id}_result.json" UnifiedDocumentExporter.export_to_json(document, path) results['json'] = path elif fmt == 'markdown': path = output_dir / f"{file_id}_output.md" UnifiedDocumentExporter.export_to_markdown(document, path) results['markdown'] = path elif fmt == 'text': path = output_dir / f"{file_id}_text.txt" UnifiedDocumentExporter.export_to_text(document, path) results['text'] = path elif fmt == 'legacy': path = output_dir / f"{file_id}_legacy.json" UnifiedDocumentExporter.export_to_legacy_json(document, path) results['legacy'] = path except Exception as e: logger.error(f"Failed to export {fmt}: {e}") results[fmt] = None return results