feat: add unified JSON export with standardized schema

- Create JSON Schema definition for UnifiedDocument format
- Implement UnifiedDocumentExporter service with multiple export formats
- Include comprehensive processing metadata and statistics
- Update OCR service to use new exporter for dual-track outputs
- Support JSON, Markdown, Text, and legacy format exports

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
egg
2025-11-19 08:36:24 +08:00
parent 5bcf3dfd42
commit ab89a40e8d
5 changed files with 999 additions and 21 deletions

View File

@@ -1,3 +1,17 @@
"""
Tool_OCR - Services Package
"""
from .unified_document_exporter import (
UnifiedDocumentExporter,
ExportFormat,
JSONSchemaValidator,
save_unified_document
)
__all__ = [
'UnifiedDocumentExporter',
'ExportFormat',
'JSONSchemaValidator',
'save_unified_document'
]

View File

@@ -23,6 +23,7 @@ try:
from app.services.document_type_detector import DocumentTypeDetector, ProcessingTrackRecommendation
from app.services.direct_extraction_engine import DirectExtractionEngine
from app.services.ocr_to_unified_converter import OCRToUnifiedConverter
from app.services.unified_document_exporter import UnifiedDocumentExporter
from app.models.unified_document import (
UnifiedDocument, DocumentMetadata,
ProcessingTrack, ElementType, DocumentElement, Page, Dimensions,
@@ -30,8 +31,9 @@ try:
)
DUAL_TRACK_AVAILABLE = True
except ImportError as e:
logger.warning(f"Dual-track components not available: {e}")
logging.getLogger(__name__).warning(f"Dual-track components not available: {e}")
DUAL_TRACK_AVAILABLE = False
UnifiedDocumentExporter = None
logger = logging.getLogger(__name__)
@@ -1175,26 +1177,39 @@ class OCRService:
try:
output_dir.mkdir(parents=True, exist_ok=True)
# Convert UnifiedDocument to dict if needed
if isinstance(result, UnifiedDocument):
result_dict = result.to_dict()
legacy_result = result.to_legacy_format()
# Use UnifiedDocumentExporter for standardized export
if isinstance(result, UnifiedDocument) and UnifiedDocumentExporter is not None:
# Use the new exporter for UnifiedDocument
json_path = output_dir / f"{file_id}_result.json"
UnifiedDocumentExporter.export_to_json(
result,
json_path,
include_metadata=True,
include_statistics=True
)
markdown_path = output_dir / f"{file_id}_output.md"
UnifiedDocumentExporter.export_to_markdown(
result,
markdown_path,
include_metadata_header=False # Keep output clean
)
markdown_content = result.extract_all_text()
else:
result_dict = result
legacy_result = result
markdown_content = result.get('markdown_content', '')
# Legacy path for dict results
result_dict = result if isinstance(result, dict) else result.to_dict()
markdown_content = result.get('markdown_content', '') if isinstance(result, dict) else ''
# Save JSON (use dict format for compatibility)
json_path = output_dir / f"{file_id}_result.json"
with open(json_path, 'w', encoding='utf-8') as f:
json.dump(result_dict if isinstance(result, UnifiedDocument) else result,
f, ensure_ascii=False, indent=2)
# Save JSON
json_path = output_dir / f"{file_id}_result.json"
with open(json_path, 'w', encoding='utf-8') as f:
json.dump(result_dict, f, ensure_ascii=False, indent=2)
# Save Markdown
markdown_path = output_dir / f"{file_id}_output.md"
with open(markdown_path, 'w', encoding='utf-8') as f:
f.write(markdown_content)
# Save Markdown
markdown_path = output_dir / f"{file_id}_output.md"
with open(markdown_path, 'w', encoding='utf-8') as f:
f.write(markdown_content)
logger.info(f"Results saved: {json_path.name}, {markdown_path.name}")

View File

@@ -0,0 +1,506 @@
"""
Unified Document Exporter Service
Provides standardized export functionality for UnifiedDocument,
supporting both OCR and Direct extraction track outputs with
comprehensive processing metadata.
"""
import json
import logging
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, Any, Optional, Tuple, Union

from ..models.unified_document import (
    UnifiedDocument,
    ProcessingTrack,
    ElementType
)
logger = logging.getLogger(__name__)
class ExportFormat:
    """Supported export formats (string identifiers)."""
    JSON = "json"                  # standardized export (export_to_json)
    JSON_MINIMAL = "json_minimal"  # presumably JSON without optional sections — no dedicated exporter visible here
    JSON_LEGACY = "json_legacy"    # pre-unified format (export_to_legacy_json)
    MARKDOWN = "markdown"          # export_to_markdown
    TEXT = "text"                  # export_to_text
    HTML = "html"                  # NOTE(review): declared but no HTML exporter exists in this module yet
class UnifiedDocumentExporter:
    """
    Exporter service for UnifiedDocument with comprehensive metadata support.

    Supports both OCR and DIRECT processing tracks with consistent output
    format. All methods are static; the class is a stateless namespace.
    """

    # Schema version for tracking format changes
    SCHEMA_VERSION = "1.0.0"

    @staticmethod
    def export_to_json(
        document: UnifiedDocument,
        output_path: Optional[Path] = None,
        include_metadata: bool = True,
        include_statistics: bool = True,
        include_binary_content: bool = False,
        pretty_print: bool = True
    ) -> Union[str, Path]:
        """
        Export UnifiedDocument to standardized JSON format.

        Args:
            document: The UnifiedDocument to export
            output_path: Optional path to save the JSON file
            include_metadata: Include processing metadata
            include_statistics: Include document statistics
            include_binary_content: Include base64-encoded binary content
                (NOTE(review): forwarded to _build_export_data but not
                consumed there — confirm intended behaviour)
            pretty_print: Format JSON with indentation

        Returns:
            JSON string if no output_path, otherwise the output Path
        """
        export_data = UnifiedDocumentExporter._build_export_data(
            document,
            include_metadata=include_metadata,
            include_statistics=include_statistics,
            include_binary_content=include_binary_content
        )
        # default=str stringifies values json cannot serialize natively
        # (e.g. datetimes, enums) instead of raising TypeError.
        json_str = json.dumps(
            export_data,
            ensure_ascii=False,
            indent=2 if pretty_print else None,
            default=str
        )
        if output_path:
            output_path = Path(output_path)
            output_path.parent.mkdir(parents=True, exist_ok=True)
            output_path.write_text(json_str, encoding='utf-8')
            logger.info(f"Exported JSON to: {output_path}")
            return output_path
        return json_str

    @staticmethod
    def export_to_markdown(
        document: UnifiedDocument,
        output_path: Optional[Path] = None,
        include_metadata_header: bool = True,
        include_page_breaks: bool = True
    ) -> Union[str, Path]:
        """
        Export UnifiedDocument to Markdown format.

        Args:
            document: The UnifiedDocument to export
            output_path: Optional path to save the Markdown file
            include_metadata_header: Include document metadata as header
            include_page_breaks: Include page break markers between pages

        Returns:
            Markdown string if no output_path, otherwise the output Path
        """
        lines = []
        # Optional document-level header with processing metadata.
        if include_metadata_header:
            lines.append(f"# {document.metadata.filename}")
            lines.append("")
            lines.append("## Document Info")
            lines.append(f"- **Processing Track**: {document.metadata.processing_track.value}")
            lines.append(f"- **Processing Time**: {document.metadata.processing_time:.2f}s")
            lines.append(f"- **Pages**: {document.page_count}")
            lines.append(f"- **Total Elements**: {document.total_elements}")
            if document.metadata.language:
                lines.append(f"- **Language**: {document.metadata.language}")
            lines.append("")
            lines.append("---")
            lines.append("")
        # Export each page in reading order.
        for page in document.pages:
            if include_page_breaks and page.page_number > 1:
                lines.append("")
                lines.append("---")  # fixed: was an f-string with no placeholder
                lines.append("")
            lines.append(f"## Page {page.page_number}")
            lines.append("")
            for element in page.get_reading_order():
                content = UnifiedDocumentExporter._element_to_markdown(element)
                if content:
                    lines.append(content)
                    lines.append("")
        md_content = "\n".join(lines)
        if output_path:
            output_path = Path(output_path)
            output_path.parent.mkdir(parents=True, exist_ok=True)
            output_path.write_text(md_content, encoding='utf-8')
            logger.info(f"Exported Markdown to: {output_path}")
            return output_path
        return md_content

    @staticmethod
    def export_to_text(
        document: UnifiedDocument,
        output_path: Optional[Path] = None,
        page_separator: str = "\n\n--- Page Break ---\n\n"
    ) -> Union[str, Path]:
        """
        Export UnifiedDocument to plain text format.

        Args:
            document: The UnifiedDocument to export
            output_path: Optional path to save the text file
            page_separator: Separator inserted between non-empty pages

        Returns:
            Text string if no output_path, otherwise the output Path
        """
        # Empty pages are skipped so separators never appear back-to-back.
        pages_text = []
        for page in document.pages:
            page_text = page.extract_text()
            if page_text:
                pages_text.append(page_text)
        text_content = page_separator.join(pages_text)
        if output_path:
            output_path = Path(output_path)
            output_path.parent.mkdir(parents=True, exist_ok=True)
            output_path.write_text(text_content, encoding='utf-8')
            logger.info(f"Exported text to: {output_path}")
            return output_path
        return text_content

    @staticmethod
    def export_to_legacy_json(
        document: UnifiedDocument,
        output_path: Optional[Path] = None
    ) -> Union[str, Path]:
        """
        Export UnifiedDocument to legacy JSON format for backward compatibility.

        Args:
            document: The UnifiedDocument to export
            output_path: Optional path to save the JSON file

        Returns:
            JSON string if no output_path, otherwise the output Path
        """
        legacy_data = document.to_legacy_format()
        json_str = json.dumps(
            legacy_data,
            ensure_ascii=False,
            indent=2,
            default=str
        )
        if output_path:
            output_path = Path(output_path)
            output_path.parent.mkdir(parents=True, exist_ok=True)
            output_path.write_text(json_str, encoding='utf-8')
            logger.info(f"Exported legacy JSON to: {output_path}")
            return output_path
        return json_str

    @staticmethod
    def export_all_formats(
        document: UnifiedDocument,
        output_dir: Path,
        file_id: str
    ) -> Dict[str, Optional[Path]]:
        """
        Export UnifiedDocument to all standard formats (JSON, Markdown, text).

        Each format is exported independently: a failure in one format is
        logged and recorded as None without aborting the others.

        Args:
            document: The UnifiedDocument to export
            output_dir: Directory to save output files
            file_id: Base filename for outputs

        Returns:
            Dictionary mapping format names to output paths (None on failure)
        """
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        results = {}
        # Export JSON
        try:
            json_path = output_dir / f"{file_id}_result.json"
            UnifiedDocumentExporter.export_to_json(
                document,
                json_path,
                include_metadata=True,
                include_statistics=True
            )
            results['json'] = json_path
        except Exception as e:
            logger.error(f"Failed to export JSON: {e}")
            results['json'] = None
        # Export Markdown
        try:
            md_path = output_dir / f"{file_id}_output.md"
            UnifiedDocumentExporter.export_to_markdown(document, md_path)
            results['markdown'] = md_path
        except Exception as e:
            logger.error(f"Failed to export Markdown: {e}")
            results['markdown'] = None
        # Export plain text
        try:
            txt_path = output_dir / f"{file_id}_text.txt"
            UnifiedDocumentExporter.export_to_text(document, txt_path)
            results['text'] = txt_path
        except Exception as e:
            logger.error(f"Failed to export text: {e}")
            results['text'] = None
        return results

    @staticmethod
    def _build_export_data(
        document: UnifiedDocument,
        include_metadata: bool = True,
        include_statistics: bool = True,
        include_binary_content: bool = False
    ) -> Dict[str, Any]:
        """
        Build the export data structure with processing metadata.

        Supports both OCR and DIRECT track outputs with consistent format.
        NOTE(review): include_binary_content is accepted but not yet used
        here — confirm whether page serialization should honour it.
        """
        # Base document data
        export_data = {
            "schema_version": UnifiedDocumentExporter.SCHEMA_VERSION,
            "document_id": document.document_id,
            # Timezone-aware UTC timestamp (datetime.utcnow() is deprecated
            # since Python 3.12); normalize "+00:00" to "Z" to preserve the
            # previous wire format.
            "export_timestamp": datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")
        }
        # Add metadata
        if include_metadata:
            export_data["metadata"] = document.metadata.to_dict()
            # Add extended processing metadata
            export_data["metadata"]["processing_info"] = {
                "track_description": UnifiedDocumentExporter._get_track_description(
                    document.metadata.processing_track
                ),
                "schema_version": UnifiedDocumentExporter.SCHEMA_VERSION,
                "export_format": "unified_document_v1"
            }
        # Add pages
        export_data["pages"] = [page.to_dict() for page in document.pages]
        # Add statistics
        if include_statistics:
            export_data["statistics"] = {
                "page_count": document.page_count,
                "total_elements": document.total_elements,
                "total_tables": len(document.get_all_tables()),
                "total_images": len(document.get_all_images()),
                "element_type_counts": UnifiedDocumentExporter._count_element_types(document),
                "text_stats": UnifiedDocumentExporter._calculate_text_stats(document)
            }
        # Add processing errors if any
        if document.processing_errors:
            export_data["processing_errors"] = document.processing_errors
        return export_data

    @staticmethod
    def _get_track_description(track: ProcessingTrack) -> str:
        """Get human-readable description for processing track."""
        descriptions = {
            ProcessingTrack.OCR: "PaddleOCR PP-StructureV3 - Used for scanned documents and images",
            ProcessingTrack.DIRECT: "PyMuPDF Direct Extraction - Used for editable PDFs with embedded text",
            ProcessingTrack.HYBRID: "Hybrid Processing - Combined OCR and direct extraction"
        }
        return descriptions.get(track, "Unknown processing track")

    @staticmethod
    def _count_element_types(document: UnifiedDocument) -> Dict[str, int]:
        """Count occurrences of each element type across all pages."""
        counts = {}
        for page in document.pages:
            for element in page.elements:
                type_name = element.type.value
                counts[type_name] = counts.get(type_name, 0) + 1
        return counts

    @staticmethod
    def _calculate_text_stats(document: UnifiedDocument) -> Dict[str, Any]:
        """Calculate character/word counts and average element confidence."""
        full_text = document.extract_all_text()
        words = full_text.split()
        chars = len(full_text)
        # Average confidence over all elements that report one.
        confidences = []
        for page in document.pages:
            for element in page.elements:
                if element.confidence is not None:
                    confidences.append(element.confidence)
        avg_confidence = sum(confidences) / len(confidences) if confidences else None
        return {
            "total_characters": chars,
            "total_words": len(words),
            # "is not None" check: an average confidence of exactly 0.0 is a
            # valid value and must not be collapsed to None (the original
            # truthiness test dropped it).
            "average_confidence": round(avg_confidence, 4) if avg_confidence is not None else None
        }

    @staticmethod
    def _element_to_markdown(element) -> str:
        """Convert a document element to its Markdown representation."""
        content = element.get_text()
        # Tables and images may render without text content; everything else
        # with no text produces nothing.
        if not content and element.type not in [ElementType.TABLE, ElementType.IMAGE]:
            return ""
        # Format based on element type
        if element.type == ElementType.TITLE:
            return f"# {content}"
        elif element.type == ElementType.HEADER:
            return f"### {content}"
        elif element.type in [ElementType.TEXT, ElementType.PARAGRAPH, ElementType.BODY]:
            return content
        elif element.type == ElementType.LIST_ITEM:
            return f"- {content}"
        elif element.type == ElementType.TABLE:
            # Use HTML table if available
            if hasattr(element.content, 'to_html'):
                return element.content.to_html()
            return f"[Table: {content}]"
        elif element.type == ElementType.IMAGE:
            return f"![Image]({element.metadata.get('path', 'image')})"
        elif element.type == ElementType.FIGURE:
            return f"[Figure: {content or 'No caption'}]"
        elif element.type == ElementType.CODE:
            return f"```\n{content}\n```"
        elif element.type == ElementType.EQUATION:
            return f"$${content}$$"
        elif element.type == ElementType.CAPTION:
            return f"*{content}*"
        elif element.type == ElementType.FOOTNOTE:
            return f"[^{content}]"
        elif element.type == ElementType.REFERENCE:
            return f"> {content}"
        else:
            return content if content else ""
class JSONSchemaValidator:
    """
    Validator for UnifiedDocument JSON exports.

    Lazily loads the JSON Schema definition from the package's ``schemas``
    directory and caches it on the class for subsequent calls.
    """

    # Cached schema dict; stays None until the first get_schema() call.
    _schema = None

    @classmethod
    def get_schema(cls) -> Dict[str, Any]:
        """Load and return the JSON Schema for UnifiedDocument."""
        if cls._schema is not None:
            return cls._schema
        schema_file = Path(__file__).parent.parent / "schemas" / "unified_document_schema.json"
        if schema_file.exists():
            cls._schema = json.loads(schema_file.read_text(encoding='utf-8'))
        else:
            # Cache an empty dict so the missing file is only reported once.
            logger.warning(f"Schema file not found: {schema_file}")
            cls._schema = {}
        return cls._schema

    @classmethod
    def validate(cls, data: Dict[str, Any]) -> Tuple[bool, Optional[str]]:
        """
        Validate JSON data against the UnifiedDocument schema.

        Args:
            data: The JSON data to validate

        Returns:
            Tuple of (is_valid, error_message); validation is skipped — and
            treated as success — when jsonschema is unavailable or the
            schema file is missing.
        """
        try:
            import jsonschema
        except ImportError:
            logger.warning("jsonschema package not installed, skipping validation")
            return True, None
        try:
            schema = cls.get_schema()
            if schema:
                jsonschema.validate(data, schema)
            return True, None
        except Exception as exc:
            return False, str(exc)
def save_unified_document(
    document: UnifiedDocument,
    output_dir: Path,
    file_id: str,
    formats: Optional[list] = None
) -> Dict[str, Optional[Path]]:
    """
    Convenience function to save UnifiedDocument to multiple formats.

    Args:
        document: The UnifiedDocument to save
        output_dir: Output directory
        file_id: Base filename
        formats: List of formats to export (default: ['json', 'markdown'])

    Returns:
        Dictionary mapping format names to output paths (None on failure);
        unrecognized format names are silently skipped.
    """
    requested = ['json', 'markdown'] if formats is None else formats
    target_dir = Path(output_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    # Filename suffix and exporter per supported format; lambdas keep the
    # exporter lookup lazy until a format is actually requested.
    dispatch = {
        'json': ("_result.json", lambda doc, p: UnifiedDocumentExporter.export_to_json(doc, p)),
        'markdown': ("_output.md", lambda doc, p: UnifiedDocumentExporter.export_to_markdown(doc, p)),
        'text': ("_text.txt", lambda doc, p: UnifiedDocumentExporter.export_to_text(doc, p)),
        'legacy': ("_legacy.json", lambda doc, p: UnifiedDocumentExporter.export_to_legacy_json(doc, p)),
    }

    results: Dict[str, Optional[Path]] = {}
    for fmt in requested:
        entry = dispatch.get(fmt)
        if entry is None:
            continue  # unknown format: no results entry, matching the if/elif original
        suffix, export_fn = entry
        try:
            destination = target_dir / f"{file_id}{suffix}"
            export_fn(document, destination)
            results[fmt] = destination
        except Exception as exc:
            logger.error(f"Failed to export {fmt}: {exc}")
            results[fmt] = None
    return results