feat: add unified JSON export with standardized schema
- Create JSON Schema definition for UnifiedDocument format
- Implement UnifiedDocumentExporter service with multiple export formats
- Include comprehensive processing metadata and statistics
- Update OCR service to use new exporter for dual-track outputs
- Support JSON, Markdown, Text, and legacy format exports

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
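For orientation, a minimal usage sketch of the exporter this commit introduces. The `doc` variable stands in for a UnifiedDocument produced by either processing track and is assumed rather than constructed here; output paths are illustrative. The calls mirror the `UnifiedDocumentExporter` API defined in the new service file below.

```python
from pathlib import Path

from app.services.unified_document_exporter import UnifiedDocumentExporter

# `doc` is an assumed UnifiedDocument instance from either the OCR
# or the Direct extraction track.
json_str = UnifiedDocumentExporter.export_to_json(doc, pretty_print=True)       # returns a JSON string
json_path = UnifiedDocumentExporter.export_to_json(doc, Path("out/doc.json"))   # writes a file, returns the Path
md_path = UnifiedDocumentExporter.export_to_markdown(doc, Path("out/doc.md"))
```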
backend/app/schemas/unified_document_schema.json (new file, 443 lines)

@@ -0,0 +1,443 @@
{
  "$schema": "http://json-schema.org/draft-07/schema#",
  "$id": "https://tool-ocr.local/schemas/unified-document.json",
  "title": "UnifiedDocument",
  "description": "Unified document representation for dual-track OCR/Direct extraction processing",
  "type": "object",
  "required": ["document_id", "metadata", "pages", "statistics"],
  "properties": {
    "document_id": {
      "type": "string",
      "description": "Unique identifier for the document"
    },
    "metadata": {
      "$ref": "#/definitions/DocumentMetadata"
    },
    "pages": {
      "type": "array",
      "items": {
        "$ref": "#/definitions/Page"
      },
      "description": "List of pages in the document"
    },
    "statistics": {
      "$ref": "#/definitions/DocumentStatistics"
    },
    "processing_errors": {
      "type": "array",
      "items": {
        "$ref": "#/definitions/ProcessingError"
      },
      "default": [],
      "description": "List of any errors encountered during processing"
    }
  },
  "definitions": {
    "DocumentMetadata": {
      "type": "object",
      "required": ["filename", "file_type", "file_size", "created_at", "processing_track", "processing_time"],
      "properties": {
        "filename": {
          "type": "string",
          "description": "Original filename"
        },
        "file_type": {
          "type": "string",
          "description": "File MIME type or extension"
        },
        "file_size": {
          "type": "integer",
          "minimum": 0,
          "description": "File size in bytes"
        },
        "created_at": {
          "type": "string",
          "format": "date-time",
          "description": "Processing timestamp (ISO 8601)"
        },
        "processing_track": {
          "type": "string",
          "enum": ["ocr", "direct", "hybrid"],
          "description": "Processing track used"
        },
        "processing_time": {
          "type": "number",
          "minimum": 0,
          "description": "Processing time in seconds"
        },
        "language": {
          "type": "string",
          "description": "Detected or specified language code"
        },
        "title": {
          "type": "string",
          "description": "Document title from metadata"
        },
        "author": {
          "type": "string",
          "description": "Document author"
        },
        "subject": {
          "type": "string",
          "description": "Document subject"
        },
        "keywords": {
          "type": "array",
          "items": {"type": "string"},
          "description": "Document keywords"
        },
        "producer": {
          "type": "string",
          "description": "PDF producer application"
        },
        "creator": {
          "type": "string",
          "description": "Document creator application"
        },
        "creation_date": {
          "type": "string",
          "format": "date-time",
          "description": "Document creation date"
        },
        "modification_date": {
          "type": "string",
          "format": "date-time",
          "description": "Document last modification date"
        }
      }
    },
    "Page": {
      "type": "object",
      "required": ["page_number", "elements", "dimensions"],
      "properties": {
        "page_number": {
          "type": "integer",
          "minimum": 1,
          "description": "1-based page number"
        },
        "elements": {
          "type": "array",
          "items": {
            "$ref": "#/definitions/DocumentElement"
          },
          "description": "List of elements on the page"
        },
        "dimensions": {
          "$ref": "#/definitions/Dimensions"
        },
        "metadata": {
          "type": "object",
          "additionalProperties": true,
          "description": "Additional page-specific metadata"
        },
        "statistics": {
          "$ref": "#/definitions/PageStatistics"
        }
      }
    },
    "DocumentElement": {
      "type": "object",
      "required": ["element_id", "type", "bbox"],
      "properties": {
        "element_id": {
          "type": "string",
          "description": "Unique identifier for the element"
        },
        "type": {
          "type": "string",
          "enum": [
            "text", "title", "header", "footer", "reference", "equation", "footnote", "caption",
            "list", "list_item",
            "table", "table_cell", "table_caption",
            "image", "figure", "chart", "diagram",
            "section", "paragraph", "page_number", "watermark", "header_group", "body",
            "code", "formula", "signature", "stamp", "logo", "barcode", "qr_code"
          ],
          "description": "Element type (supports all 23 PP-StructureV3 types plus custom ones)"
        },
        "content": {
          "oneOf": [
            {"type": "string"},
            {"$ref": "#/definitions/TableData"},
            {"type": "object"}
          ],
          "description": "Element content (text, table data, or structured data)"
        },
        "content_type": {
          "type": "string",
          "enum": ["text", "table", "binary"],
          "description": "Type of content when not a simple string"
        },
        "content_length": {
          "type": "integer",
          "description": "Length of binary content in bytes"
        },
        "bbox": {
          "$ref": "#/definitions/BoundingBox"
        },
        "confidence": {
          "type": "number",
          "minimum": 0,
          "maximum": 1,
          "description": "OCR confidence score (0-1)"
        },
        "style": {
          "$ref": "#/definitions/StyleInfo"
        },
        "metadata": {
          "type": "object",
          "additionalProperties": true,
          "description": "Additional element metadata"
        },
        "children": {
          "type": "array",
          "items": {
            "$ref": "#/definitions/DocumentElement"
          },
          "description": "Nested child elements"
        }
      }
    },
    "BoundingBox": {
      "type": "object",
      "required": ["x0", "y0", "x1", "y1"],
      "properties": {
        "x0": {
          "type": "number",
          "description": "Left coordinate"
        },
        "y0": {
          "type": "number",
          "description": "Top coordinate"
        },
        "x1": {
          "type": "number",
          "description": "Right coordinate"
        },
        "y1": {
          "type": "number",
          "description": "Bottom coordinate"
        },
        "width": {
          "type": "number",
          "description": "Width (calculated)"
        },
        "height": {
          "type": "number",
          "description": "Height (calculated)"
        }
      }
    },
    "StyleInfo": {
      "type": "object",
      "properties": {
        "font_name": {
          "type": "string",
          "description": "Font family name"
        },
        "font_size": {
          "type": "number",
          "minimum": 0,
          "description": "Font size in points"
        },
        "font_weight": {
          "type": "string",
          "enum": ["normal", "bold"],
          "description": "Font weight"
        },
        "font_style": {
          "type": "string",
          "enum": ["normal", "italic"],
          "description": "Font style"
        },
        "text_color": {
          "type": "integer",
          "description": "Text color as RGB integer"
        },
        "text_color_rgb": {
          "type": "array",
          "items": {"type": "integer", "minimum": 0, "maximum": 255},
          "minItems": 3,
          "maxItems": 3,
          "description": "Text color as [R, G, B] array"
        },
        "bg_color": {
          "type": "integer",
          "description": "Background color as RGB integer"
        },
        "alignment": {
          "type": "string",
          "enum": ["left", "center", "right", "justify"],
          "description": "Text alignment"
        }
      }
    },
    "TableData": {
      "type": "object",
      "required": ["rows", "cols", "cells"],
      "properties": {
        "rows": {
          "type": "integer",
          "minimum": 0,
          "description": "Number of rows"
        },
        "cols": {
          "type": "integer",
          "minimum": 0,
          "description": "Number of columns"
        },
        "cells": {
          "type": "array",
          "items": {
            "$ref": "#/definitions/TableCell"
          },
          "description": "Table cells"
        },
        "headers": {
          "type": "array",
          "items": {"type": "string"},
          "description": "Header row labels"
        },
        "caption": {
          "type": "string",
          "description": "Table caption"
        }
      }
    },
    "TableCell": {
      "type": "object",
      "required": ["row", "col"],
      "properties": {
        "row": {
          "type": "integer",
          "minimum": 0,
          "description": "Row index (0-based)"
        },
        "col": {
          "type": "integer",
          "minimum": 0,
          "description": "Column index (0-based)"
        },
        "row_span": {
          "type": "integer",
          "minimum": 1,
          "default": 1,
          "description": "Number of rows spanned"
        },
        "col_span": {
          "type": "integer",
          "minimum": 1,
          "default": 1,
          "description": "Number of columns spanned"
        },
        "content": {
          "type": "string",
          "default": "",
          "description": "Cell text content"
        },
        "bbox": {
          "$ref": "#/definitions/BoundingBox"
        },
        "style": {
          "$ref": "#/definitions/StyleInfo"
        }
      }
    },
    "Dimensions": {
      "type": "object",
      "required": ["width", "height"],
      "properties": {
        "width": {
          "type": "number",
          "minimum": 0,
          "description": "Width in pixels or points"
        },
        "height": {
          "type": "number",
          "minimum": 0,
          "description": "Height in pixels or points"
        },
        "dpi": {
          "type": "integer",
          "minimum": 1,
          "description": "Resolution in DPI"
        }
      }
    },
    "DocumentStatistics": {
      "type": "object",
      "required": ["page_count", "total_elements", "total_tables", "total_images"],
      "properties": {
        "page_count": {
          "type": "integer",
          "minimum": 0,
          "description": "Total number of pages"
        },
        "total_elements": {
          "type": "integer",
          "minimum": 0,
          "description": "Total elements across all pages"
        },
        "total_tables": {
          "type": "integer",
          "minimum": 0,
          "description": "Total tables across all pages"
        },
        "total_images": {
          "type": "integer",
          "minimum": 0,
          "description": "Total images across all pages"
        }
      }
    },
    "PageStatistics": {
      "type": "object",
      "required": ["total_elements", "text_elements", "tables", "images"],
      "properties": {
        "total_elements": {
          "type": "integer",
          "minimum": 0
        },
        "text_elements": {
          "type": "integer",
          "minimum": 0
        },
        "tables": {
          "type": "integer",
          "minimum": 0
        },
        "images": {
          "type": "integer",
          "minimum": 0
        }
      }
    },
    "ProcessingError": {
      "type": "object",
      "required": ["error_type", "message"],
      "properties": {
        "error_type": {
          "type": "string",
          "description": "Error classification"
        },
        "message": {
          "type": "string",
          "description": "Error description"
        },
        "page": {
          "type": "integer",
          "description": "Page number where error occurred"
        },
        "element_id": {
          "type": "string",
          "description": "Element ID if applicable"
        },
        "timestamp": {
          "type": "string",
          "format": "date-time",
          "description": "When the error occurred"
        }
      }
    }
  }
}
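For reference, a minimal instance that satisfies this schema's required fields; every value is illustrative, and the sketch assumes the repository layout above plus the optional `jsonschema` package:

```python
import json
from pathlib import Path

import jsonschema

schema = json.loads(
    Path("backend/app/schemas/unified_document_schema.json").read_text(encoding="utf-8")
)

# Smallest document meeting the schema's required fields (values made up).
instance = {
    "document_id": "doc-0001",
    "metadata": {
        "filename": "sample.pdf",
        "file_type": "application/pdf",
        "file_size": 12345,
        "created_at": "2025-01-01T00:00:00Z",
        "processing_track": "direct",
        "processing_time": 0.42,
    },
    "pages": [
        {
            "page_number": 1,
            "dimensions": {"width": 595, "height": 842},
            "elements": [
                {
                    "element_id": "p1-e1",
                    "type": "paragraph",
                    "content": "Hello, world.",
                    "bbox": {"x0": 72, "y0": 72, "x1": 300, "y1": 90},
                    "confidence": 0.98,
                }
            ],
        }
    ],
    "statistics": {
        "page_count": 1,
        "total_elements": 1,
        "total_tables": 0,
        "total_images": 0,
    },
}

jsonschema.validate(instance, schema)  # raises ValidationError on mismatch
```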
backend/app/services/__init__.py

@@ -1,3 +1,17 @@
 """
 Tool_OCR - Services Package
 """
+
+from .unified_document_exporter import (
+    UnifiedDocumentExporter,
+    ExportFormat,
+    JSONSchemaValidator,
+    save_unified_document
+)
+
+__all__ = [
+    'UnifiedDocumentExporter',
+    'ExportFormat',
+    'JSONSchemaValidator',
+    'save_unified_document'
+]
@@ -23,6 +23,7 @@ try:
     from app.services.document_type_detector import DocumentTypeDetector, ProcessingTrackRecommendation
     from app.services.direct_extraction_engine import DirectExtractionEngine
     from app.services.ocr_to_unified_converter import OCRToUnifiedConverter
+    from app.services.unified_document_exporter import UnifiedDocumentExporter
     from app.models.unified_document import (
         UnifiedDocument, DocumentMetadata,
         ProcessingTrack, ElementType, DocumentElement, Page, Dimensions,
@@ -30,8 +31,9 @@ try:
     )
     DUAL_TRACK_AVAILABLE = True
 except ImportError as e:
-    logger.warning(f"Dual-track components not available: {e}")
+    logging.getLogger(__name__).warning(f"Dual-track components not available: {e}")
     DUAL_TRACK_AVAILABLE = False
+    UnifiedDocumentExporter = None

 logger = logging.getLogger(__name__)

@@ -1175,26 +1177,36 @@ class OCRService:
         try:
             output_dir.mkdir(parents=True, exist_ok=True)

-            # Convert UnifiedDocument to dict if needed
-            if isinstance(result, UnifiedDocument):
-                result_dict = result.to_dict()
-                legacy_result = result.to_legacy_format()
-                markdown_content = result.extract_all_text()
-            else:
-                result_dict = result
-                legacy_result = result
-                markdown_content = result.get('markdown_content', '')
-
-            # Save JSON (use dict format for compatibility)
-            json_path = output_dir / f"{file_id}_result.json"
-            with open(json_path, 'w', encoding='utf-8') as f:
-                json.dump(result_dict if isinstance(result, UnifiedDocument) else result,
-                          f, ensure_ascii=False, indent=2)
-
-            # Save Markdown
-            markdown_path = output_dir / f"{file_id}_output.md"
-            with open(markdown_path, 'w', encoding='utf-8') as f:
-                f.write(markdown_content)
+            # Use UnifiedDocumentExporter for standardized export
+            if isinstance(result, UnifiedDocument) and UnifiedDocumentExporter is not None:
+                json_path = output_dir / f"{file_id}_result.json"
+                UnifiedDocumentExporter.export_to_json(
+                    result,
+                    json_path,
+                    include_metadata=True,
+                    include_statistics=True
+                )
+
+                markdown_path = output_dir / f"{file_id}_output.md"
+                UnifiedDocumentExporter.export_to_markdown(
+                    result,
+                    markdown_path,
+                    include_metadata_header=False  # Keep output clean
+                )
+            else:
+                # Legacy path for dict results
+                result_dict = result if isinstance(result, dict) else result.to_dict()
+                markdown_content = result.get('markdown_content', '') if isinstance(result, dict) else ''
+
+                # Save JSON
+                json_path = output_dir / f"{file_id}_result.json"
+                with open(json_path, 'w', encoding='utf-8') as f:
+                    json.dump(result_dict, f, ensure_ascii=False, indent=2)
+
+                # Save Markdown
+                markdown_path = output_dir / f"{file_id}_output.md"
+                with open(markdown_path, 'w', encoding='utf-8') as f:
+                    f.write(markdown_content)

             logger.info(f"Results saved: {json_path.name}, {markdown_path.name}")
backend/app/services/unified_document_exporter.py (new file, 506 lines)

@@ -0,0 +1,506 @@
"""
Unified Document Exporter Service

Provides standardized export functionality for UnifiedDocument,
supporting both OCR and Direct extraction track outputs with
comprehensive processing metadata.
"""

import json
import logging
from pathlib import Path
from typing import Dict, Any, Optional, Tuple, Union
from datetime import datetime

from ..models.unified_document import (
    UnifiedDocument,
    ProcessingTrack,
    ElementType
)

logger = logging.getLogger(__name__)


class ExportFormat:
    """Supported export formats"""
    JSON = "json"
    JSON_MINIMAL = "json_minimal"
    JSON_LEGACY = "json_legacy"
    MARKDOWN = "markdown"
    TEXT = "text"
    HTML = "html"


class UnifiedDocumentExporter:
    """
    Exporter service for UnifiedDocument with comprehensive metadata support.

    Supports both OCR and DIRECT processing tracks with consistent output format.
    """

    # Schema version for tracking format changes
    SCHEMA_VERSION = "1.0.0"

    @staticmethod
    def export_to_json(
        document: UnifiedDocument,
        output_path: Optional[Path] = None,
        include_metadata: bool = True,
        include_statistics: bool = True,
        include_binary_content: bool = False,
        pretty_print: bool = True
    ) -> Union[str, Path]:
        """
        Export UnifiedDocument to standardized JSON format.

        Args:
            document: The UnifiedDocument to export
            output_path: Optional path to save the JSON file
            include_metadata: Include processing metadata
            include_statistics: Include document statistics
            include_binary_content: Include base64-encoded binary content
            pretty_print: Format JSON with indentation

        Returns:
            JSON string if no output_path, otherwise the output Path
        """
        export_data = UnifiedDocumentExporter._build_export_data(
            document,
            include_metadata=include_metadata,
            include_statistics=include_statistics,
            include_binary_content=include_binary_content
        )

        json_str = json.dumps(
            export_data,
            ensure_ascii=False,
            indent=2 if pretty_print else None,
            default=str
        )

        if output_path:
            output_path = Path(output_path)
            output_path.parent.mkdir(parents=True, exist_ok=True)
            output_path.write_text(json_str, encoding='utf-8')
            logger.info(f"Exported JSON to: {output_path}")
            return output_path

        return json_str

    @staticmethod
    def export_to_markdown(
        document: UnifiedDocument,
        output_path: Optional[Path] = None,
        include_metadata_header: bool = True,
        include_page_breaks: bool = True
    ) -> Union[str, Path]:
        """
        Export UnifiedDocument to Markdown format.

        Args:
            document: The UnifiedDocument to export
            output_path: Optional path to save the Markdown file
            include_metadata_header: Include document metadata as header
            include_page_breaks: Include page break markers

        Returns:
            Markdown string if no output_path, otherwise the output Path
        """
        lines = []

        # Add metadata header
        if include_metadata_header:
            lines.append(f"# {document.metadata.filename}")
            lines.append("")
            lines.append("## Document Info")
            lines.append(f"- **Processing Track**: {document.metadata.processing_track.value}")
            lines.append(f"- **Processing Time**: {document.metadata.processing_time:.2f}s")
            lines.append(f"- **Pages**: {document.page_count}")
            lines.append(f"- **Total Elements**: {document.total_elements}")
            if document.metadata.language:
                lines.append(f"- **Language**: {document.metadata.language}")
            lines.append("")
            lines.append("---")
            lines.append("")

        # Export each page
        for page in document.pages:
            if include_page_breaks and page.page_number > 1:
                lines.append("")
                lines.append("---")
                lines.append("")

            lines.append(f"## Page {page.page_number}")
            lines.append("")

            # Get elements in reading order
            for element in page.get_reading_order():
                content = UnifiedDocumentExporter._element_to_markdown(element)
                if content:
                    lines.append(content)
                    lines.append("")

        md_content = "\n".join(lines)

        if output_path:
            output_path = Path(output_path)
            output_path.parent.mkdir(parents=True, exist_ok=True)
            output_path.write_text(md_content, encoding='utf-8')
            logger.info(f"Exported Markdown to: {output_path}")
            return output_path

        return md_content

    @staticmethod
    def export_to_text(
        document: UnifiedDocument,
        output_path: Optional[Path] = None,
        page_separator: str = "\n\n--- Page Break ---\n\n"
    ) -> Union[str, Path]:
        """
        Export UnifiedDocument to plain text format.

        Args:
            document: The UnifiedDocument to export
            output_path: Optional path to save the text file
            page_separator: Separator between pages

        Returns:
            Text string if no output_path, otherwise the output Path
        """
        pages_text = []

        for page in document.pages:
            page_text = page.extract_text()
            if page_text:
                pages_text.append(page_text)

        text_content = page_separator.join(pages_text)

        if output_path:
            output_path = Path(output_path)
            output_path.parent.mkdir(parents=True, exist_ok=True)
            output_path.write_text(text_content, encoding='utf-8')
            logger.info(f"Exported text to: {output_path}")
            return output_path

        return text_content

    @staticmethod
    def export_to_legacy_json(
        document: UnifiedDocument,
        output_path: Optional[Path] = None
    ) -> Union[str, Path]:
        """
        Export UnifiedDocument to legacy JSON format for backward compatibility.

        Args:
            document: The UnifiedDocument to export
            output_path: Optional path to save the JSON file

        Returns:
            JSON string if no output_path, otherwise the output Path
        """
        legacy_data = document.to_legacy_format()

        json_str = json.dumps(
            legacy_data,
            ensure_ascii=False,
            indent=2,
            default=str
        )

        if output_path:
            output_path = Path(output_path)
            output_path.parent.mkdir(parents=True, exist_ok=True)
            output_path.write_text(json_str, encoding='utf-8')
            logger.info(f"Exported legacy JSON to: {output_path}")
            return output_path

        return json_str

    @staticmethod
    def export_all_formats(
        document: UnifiedDocument,
        output_dir: Path,
        file_id: str
    ) -> Dict[str, Optional[Path]]:
        """
        Export UnifiedDocument to all standard formats.

        Args:
            document: The UnifiedDocument to export
            output_dir: Directory to save output files
            file_id: Base filename for outputs

        Returns:
            Dictionary mapping format names to output paths
        """
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

        results = {}

        # Export JSON
        try:
            json_path = output_dir / f"{file_id}_result.json"
            UnifiedDocumentExporter.export_to_json(
                document,
                json_path,
                include_metadata=True,
                include_statistics=True
            )
            results['json'] = json_path
        except Exception as e:
            logger.error(f"Failed to export JSON: {e}")
            results['json'] = None

        # Export Markdown
        try:
            md_path = output_dir / f"{file_id}_output.md"
            UnifiedDocumentExporter.export_to_markdown(document, md_path)
            results['markdown'] = md_path
        except Exception as e:
            logger.error(f"Failed to export Markdown: {e}")
            results['markdown'] = None

        # Export plain text
        try:
            txt_path = output_dir / f"{file_id}_text.txt"
            UnifiedDocumentExporter.export_to_text(document, txt_path)
            results['text'] = txt_path
        except Exception as e:
            logger.error(f"Failed to export text: {e}")
            results['text'] = None

        return results

    @staticmethod
    def _build_export_data(
        document: UnifiedDocument,
        include_metadata: bool = True,
        include_statistics: bool = True,
        include_binary_content: bool = False
    ) -> Dict[str, Any]:
        """
        Build the export data structure with processing metadata.

        Supports both OCR and DIRECT track outputs with consistent format.
        """
        # Base document data
        export_data = {
            "schema_version": UnifiedDocumentExporter.SCHEMA_VERSION,
            "document_id": document.document_id,
            "export_timestamp": datetime.utcnow().isoformat() + "Z"
        }

        # Add metadata
        if include_metadata:
            export_data["metadata"] = document.metadata.to_dict()

            # Add extended processing metadata
            export_data["metadata"]["processing_info"] = {
                "track_description": UnifiedDocumentExporter._get_track_description(
                    document.metadata.processing_track
                ),
                "schema_version": UnifiedDocumentExporter.SCHEMA_VERSION,
                "export_format": "unified_document_v1"
            }

        # Add pages
        export_data["pages"] = [page.to_dict() for page in document.pages]

        # Add statistics
        if include_statistics:
            export_data["statistics"] = {
                "page_count": document.page_count,
                "total_elements": document.total_elements,
                "total_tables": len(document.get_all_tables()),
                "total_images": len(document.get_all_images()),
                "element_type_counts": UnifiedDocumentExporter._count_element_types(document),
                "text_stats": UnifiedDocumentExporter._calculate_text_stats(document)
            }

        # Add processing errors if any
        if document.processing_errors:
            export_data["processing_errors"] = document.processing_errors

        return export_data

    @staticmethod
    def _get_track_description(track: ProcessingTrack) -> str:
        """Get human-readable description for processing track."""
        descriptions = {
            ProcessingTrack.OCR: "PaddleOCR PP-StructureV3 - Used for scanned documents and images",
            ProcessingTrack.DIRECT: "PyMuPDF Direct Extraction - Used for editable PDFs with embedded text",
            ProcessingTrack.HYBRID: "Hybrid Processing - Combined OCR and direct extraction"
        }
        return descriptions.get(track, "Unknown processing track")

    @staticmethod
    def _count_element_types(document: UnifiedDocument) -> Dict[str, int]:
        """Count occurrences of each element type in the document."""
        counts = {}
        for page in document.pages:
            for element in page.elements:
                type_name = element.type.value
                counts[type_name] = counts.get(type_name, 0) + 1
        return counts

    @staticmethod
    def _calculate_text_stats(document: UnifiedDocument) -> Dict[str, Any]:
        """Calculate text statistics for the document."""
        full_text = document.extract_all_text()
        words = full_text.split()
        chars = len(full_text)

        # Calculate average confidence
        confidences = []
        for page in document.pages:
            for element in page.elements:
                if element.confidence is not None:
                    confidences.append(element.confidence)

        avg_confidence = sum(confidences) / len(confidences) if confidences else None

        return {
            "total_characters": chars,
            "total_words": len(words),
            "average_confidence": round(avg_confidence, 4) if avg_confidence is not None else None
        }

    @staticmethod
    def _element_to_markdown(element) -> str:
        """Convert a document element to Markdown format."""
        content = element.get_text()

        if not content and element.type not in [ElementType.TABLE, ElementType.IMAGE]:
            return ""

        # Format based on element type
        if element.type == ElementType.TITLE:
            return f"# {content}"
        elif element.type == ElementType.HEADER:
            return f"### {content}"
        elif element.type in [ElementType.TEXT, ElementType.PARAGRAPH, ElementType.BODY]:
            return content
        elif element.type == ElementType.LIST_ITEM:
            return f"- {content}"
        elif element.type == ElementType.TABLE:
            # Use HTML table if available
            if hasattr(element.content, 'to_html'):
                return element.content.to_html()
            return f"[Table: {content}]"
        elif element.type == ElementType.IMAGE:
            return f"![Image]({content})"
        elif element.type == ElementType.FIGURE:
            return f"[Figure: {content or 'No caption'}]"
        elif element.type == ElementType.CODE:
            return f"```\n{content}\n```"
        elif element.type == ElementType.EQUATION:
            return f"$${content}$$"
        elif element.type == ElementType.CAPTION:
            return f"*{content}*"
        elif element.type == ElementType.FOOTNOTE:
            return f"[^{content}]"
        elif element.type == ElementType.REFERENCE:
            return f"> {content}"
        else:
            return content if content else ""


class JSONSchemaValidator:
    """
    Validator for UnifiedDocument JSON exports.

    Uses the JSON Schema definition to validate exported data.
    """

    _schema = None

    @classmethod
    def get_schema(cls) -> Dict[str, Any]:
        """Load and return the JSON Schema for UnifiedDocument."""
        if cls._schema is None:
            schema_path = Path(__file__).parent.parent / "schemas" / "unified_document_schema.json"
            if schema_path.exists():
                cls._schema = json.loads(schema_path.read_text(encoding='utf-8'))
            else:
                logger.warning(f"Schema file not found: {schema_path}")
                cls._schema = {}
        return cls._schema

    @classmethod
    def validate(cls, data: Dict[str, Any]) -> Tuple[bool, Optional[str]]:
        """
        Validate JSON data against the UnifiedDocument schema.

        Args:
            data: The JSON data to validate

        Returns:
            Tuple of (is_valid, error_message)
        """
        try:
            import jsonschema
            schema = cls.get_schema()
            if not schema:
                return True, None  # Skip validation if schema not available

            jsonschema.validate(data, schema)
            return True, None
        except ImportError:
            logger.warning("jsonschema package not installed, skipping validation")
            return True, None
        except Exception as e:
            return False, str(e)


def save_unified_document(
    document: UnifiedDocument,
    output_dir: Path,
    file_id: str,
    formats: Optional[list] = None
) -> Dict[str, Optional[Path]]:
    """
    Convenience function to save UnifiedDocument to multiple formats.

    Args:
        document: The UnifiedDocument to save
        output_dir: Output directory
        file_id: Base filename
        formats: List of formats to export (default: ['json', 'markdown'])

    Returns:
        Dictionary mapping format names to output paths
    """
    if formats is None:
        formats = ['json', 'markdown']

    results = {}
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    for fmt in formats:
        try:
            if fmt == 'json':
                path = output_dir / f"{file_id}_result.json"
                UnifiedDocumentExporter.export_to_json(document, path)
                results['json'] = path
            elif fmt == 'markdown':
                path = output_dir / f"{file_id}_output.md"
                UnifiedDocumentExporter.export_to_markdown(document, path)
                results['markdown'] = path
            elif fmt == 'text':
                path = output_dir / f"{file_id}_text.txt"
                UnifiedDocumentExporter.export_to_text(document, path)
                results['text'] = path
            elif fmt == 'legacy':
                path = output_dir / f"{file_id}_legacy.json"
                UnifiedDocumentExporter.export_to_legacy_json(document, path)
                results['legacy'] = path
        except Exception as e:
            logger.error(f"Failed to export {fmt}: {e}")
            results[fmt] = None

    return results
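Taken together, a hedged sketch of the convenience path plus a round-trip validation against the shipped schema; `doc` is again an assumed UnifiedDocument instance, and the output directory and file id are illustrative:

```python
import json
from pathlib import Path

from app.services import JSONSchemaValidator, save_unified_document

# `doc` is an assumed UnifiedDocument from either processing track.
paths = save_unified_document(
    doc,
    output_dir=Path("output"),
    file_id="doc-0001",
    formats=["json", "markdown", "text", "legacy"],
)

# Validate the standardized JSON export against the schema in this commit.
exported = json.loads(paths["json"].read_text(encoding="utf-8"))
is_valid, error = JSONSchemaValidator.validate(exported)
if not is_valid:
    print(f"Export failed validation: {error}")
```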
@@ -59,10 +59,10 @@
 - [x] 4.1.2 Route to appropriate processing engine
 - [x] 4.1.3 Return UnifiedDocument from both tracks
 - [x] 4.1.4 Maintain backward compatibility
-- [ ] 4.2 Create unified JSON export
-- [ ] 4.2.1 Define standardized JSON schema
-- [ ] 4.2.2 Include processing metadata
-- [ ] 4.2.3 Support both track outputs
+- [x] 4.2 Create unified JSON export
+- [x] 4.2.1 Define standardized JSON schema
+- [x] 4.2.2 Include processing metadata
+- [x] 4.2.3 Support both track outputs
 - [ ] 4.3 Update PDF generator for UnifiedDocument
 - [ ] 4.3.1 Adapt PDF generation to use UnifiedDocument
 - [ ] 4.3.2 Preserve layout from both tracks
|||||||
Reference in New Issue
Block a user