feat: add unified JSON export with standardized schema
- Create JSON Schema definition for UnifiedDocument format
- Implement UnifiedDocumentExporter service with multiple export formats
- Include comprehensive processing metadata and statistics
- Update OCR service to use new exporter for dual-track outputs
- Support JSON, Markdown, Text, and legacy format exports

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
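For reviewers, a minimal usage sketch of the new exporter, assuming only the call signatures that appear in the diff below (`export_to_json` / `export_to_markdown` and their keyword arguments); the `save_unified_outputs` helper, the paths, and the `file_id` value are illustrative, not part of the commit:

```python
# Hypothetical sketch -- only the exporter call signatures are taken from the
# diff below; the helper name, output paths, and file_id are illustrative.
from pathlib import Path

from app.models.unified_document import UnifiedDocument
from app.services.unified_document_exporter import UnifiedDocumentExporter


def save_unified_outputs(result: UnifiedDocument, output_dir: Path, file_id: str) -> None:
    """Write the standardized JSON and Markdown outputs for one processed document."""
    output_dir.mkdir(parents=True, exist_ok=True)

    # Full JSON export, including processing metadata and statistics.
    UnifiedDocumentExporter.export_to_json(
        result,
        output_dir / f"{file_id}_result.json",
        include_metadata=True,
        include_statistics=True,
    )

    # Markdown export without the metadata header, to keep the output clean.
    UnifiedDocumentExporter.export_to_markdown(
        result,
        output_dir / f"{file_id}_output.md",
        include_metadata_header=False,
    )
```

This mirrors how the OCR service invokes the exporter in the change below: a class-level export API replaces the previous manual `json.dump` / file-write path when a `UnifiedDocument` is available.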
@@ -23,6 +23,7 @@ try:
     from app.services.document_type_detector import DocumentTypeDetector, ProcessingTrackRecommendation
     from app.services.direct_extraction_engine import DirectExtractionEngine
     from app.services.ocr_to_unified_converter import OCRToUnifiedConverter
+    from app.services.unified_document_exporter import UnifiedDocumentExporter
     from app.models.unified_document import (
         UnifiedDocument, DocumentMetadata,
         ProcessingTrack, ElementType, DocumentElement, Page, Dimensions,
@@ -30,8 +31,9 @@ try:
     )
     DUAL_TRACK_AVAILABLE = True
 except ImportError as e:
-    logger.warning(f"Dual-track components not available: {e}")
+    logging.getLogger(__name__).warning(f"Dual-track components not available: {e}")
     DUAL_TRACK_AVAILABLE = False
+    UnifiedDocumentExporter = None
 
 logger = logging.getLogger(__name__)
 
@@ -1175,26 +1177,39 @@ class OCRService:
         try:
             output_dir.mkdir(parents=True, exist_ok=True)
 
-            # Convert UnifiedDocument to dict if needed
-            if isinstance(result, UnifiedDocument):
-                result_dict = result.to_dict()
-                legacy_result = result.to_legacy_format()
-                markdown_content = result.extract_all_text()
+            # Use UnifiedDocumentExporter for standardized export
+            if isinstance(result, UnifiedDocument) and UnifiedDocumentExporter is not None:
+                # Use the new exporter for UnifiedDocument
+                json_path = output_dir / f"{file_id}_result.json"
+                UnifiedDocumentExporter.export_to_json(
+                    result,
+                    json_path,
+                    include_metadata=True,
+                    include_statistics=True
+                )
+
+                markdown_path = output_dir / f"{file_id}_output.md"
+                UnifiedDocumentExporter.export_to_markdown(
+                    result,
+                    markdown_path,
+                    include_metadata_header=False  # Keep output clean
+                )
             else:
-                result_dict = result
-                legacy_result = result
-                markdown_content = result.get('markdown_content', '')
-
-            # Save JSON (use dict format for compatibility)
-            json_path = output_dir / f"{file_id}_result.json"
-            with open(json_path, 'w', encoding='utf-8') as f:
-                json.dump(result_dict if isinstance(result, UnifiedDocument) else result,
-                          f, ensure_ascii=False, indent=2)
-
-            # Save Markdown
-            markdown_path = output_dir / f"{file_id}_output.md"
-            with open(markdown_path, 'w', encoding='utf-8') as f:
-                f.write(markdown_content)
+                # Legacy path for dict results
+                result_dict = result if isinstance(result, dict) else result.to_dict()
+                markdown_content = result.get('markdown_content', '') if isinstance(result, dict) else ''
+
+                # Save JSON
+                json_path = output_dir / f"{file_id}_result.json"
+                with open(json_path, 'w', encoding='utf-8') as f:
+                    json.dump(result_dict, f, ensure_ascii=False, indent=2)
+
+                # Save Markdown
+                markdown_path = output_dir / f"{file_id}_output.md"
+                with open(markdown_path, 'w', encoding='utf-8') as f:
+                    f.write(markdown_content)
 
             logger.info(f"Results saved: {json_path.name}, {markdown_path.name}")
 