feat: add unified JSON export with standardized schema
- Create JSON Schema definition for UnifiedDocument format
- Implement UnifiedDocumentExporter service with multiple export formats
- Include comprehensive processing metadata and statistics
- Update OCR service to use new exporter for dual-track outputs
- Support JSON, Markdown, Text, and legacy format exports

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
443
backend/app/schemas/unified_document_schema.json
Normal file
443
backend/app/schemas/unified_document_schema.json
Normal file
@@ -0,0 +1,443 @@
|
||||
{
|
||||
"$schema": "http://json-schema.org/draft-07/schema#",
|
||||
"$id": "https://tool-ocr.local/schemas/unified-document.json",
|
||||
"title": "UnifiedDocument",
|
||||
"description": "Unified document representation for dual-track OCR/Direct extraction processing",
|
||||
"type": "object",
|
||||
"required": ["document_id", "metadata", "pages", "statistics"],
|
||||
"properties": {
|
||||
"document_id": {
|
||||
"type": "string",
|
||||
"description": "Unique identifier for the document"
|
||||
},
|
||||
"metadata": {
|
||||
"$ref": "#/definitions/DocumentMetadata"
|
||||
},
|
||||
"pages": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"$ref": "#/definitions/Page"
|
||||
},
|
||||
"description": "List of pages in the document"
|
||||
},
|
||||
"statistics": {
|
||||
"$ref": "#/definitions/DocumentStatistics"
|
||||
},
|
||||
"processing_errors": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"$ref": "#/definitions/ProcessingError"
|
||||
},
|
||||
"default": [],
|
||||
"description": "List of any errors encountered during processing"
|
||||
}
|
||||
},
|
||||
"definitions": {
|
||||
"DocumentMetadata": {
|
||||
"type": "object",
|
||||
"required": ["filename", "file_type", "file_size", "created_at", "processing_track", "processing_time"],
|
||||
"properties": {
|
||||
"filename": {
|
||||
"type": "string",
|
||||
"description": "Original filename"
|
||||
},
|
||||
"file_type": {
|
||||
"type": "string",
|
||||
"description": "File MIME type or extension"
|
||||
},
|
||||
"file_size": {
|
||||
"type": "integer",
|
||||
"minimum": 0,
|
||||
"description": "File size in bytes"
|
||||
},
|
||||
"created_at": {
|
||||
"type": "string",
|
||||
"format": "date-time",
|
||||
"description": "Processing timestamp (ISO 8601)"
|
||||
},
|
||||
"processing_track": {
|
||||
"type": "string",
|
||||
"enum": ["ocr", "direct", "hybrid"],
|
||||
"description": "Processing track used"
|
||||
},
|
||||
"processing_time": {
|
||||
"type": "number",
|
||||
"minimum": 0,
|
||||
"description": "Processing time in seconds"
|
||||
},
|
||||
"language": {
|
||||
"type": "string",
|
||||
"description": "Detected or specified language code"
|
||||
},
|
||||
"title": {
|
||||
"type": "string",
|
||||
"description": "Document title from metadata"
|
||||
},
|
||||
"author": {
|
||||
"type": "string",
|
||||
"description": "Document author"
|
||||
},
|
||||
"subject": {
|
||||
"type": "string",
|
||||
"description": "Document subject"
|
||||
},
|
||||
"keywords": {
|
||||
"type": "array",
|
||||
"items": {"type": "string"},
|
||||
"description": "Document keywords"
|
||||
},
|
||||
"producer": {
|
||||
"type": "string",
|
||||
"description": "PDF producer application"
|
||||
},
|
||||
"creator": {
|
||||
"type": "string",
|
||||
"description": "Document creator application"
|
||||
},
|
||||
"creation_date": {
|
||||
"type": "string",
|
||||
"format": "date-time",
|
||||
"description": "Document creation date"
|
||||
},
|
||||
"modification_date": {
|
||||
"type": "string",
|
||||
"format": "date-time",
|
||||
"description": "Document last modification date"
|
||||
}
|
||||
}
|
||||
},
|
||||
"Page": {
|
||||
"type": "object",
|
||||
"required": ["page_number", "elements", "dimensions"],
|
||||
"properties": {
|
||||
"page_number": {
|
||||
"type": "integer",
|
||||
"minimum": 1,
|
||||
"description": "1-based page number"
|
||||
},
|
||||
"elements": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"$ref": "#/definitions/DocumentElement"
|
||||
},
|
||||
"description": "List of elements on the page"
|
||||
},
|
||||
"dimensions": {
|
||||
"$ref": "#/definitions/Dimensions"
|
||||
},
|
||||
"metadata": {
|
||||
"type": "object",
|
||||
"additionalProperties": true,
|
||||
"description": "Additional page-specific metadata"
|
||||
},
|
||||
"statistics": {
|
||||
"$ref": "#/definitions/PageStatistics"
|
||||
}
|
||||
}
|
||||
},
|
||||
"DocumentElement": {
|
||||
"type": "object",
|
||||
"required": ["element_id", "type", "bbox"],
|
||||
"properties": {
|
||||
"element_id": {
|
||||
"type": "string",
|
||||
"description": "Unique identifier for the element"
|
||||
},
|
||||
"type": {
|
||||
"type": "string",
|
||||
"enum": [
|
||||
"text", "title", "header", "footer", "reference", "equation", "footnote", "caption",
|
||||
"list", "list_item",
|
||||
"table", "table_cell", "table_caption",
|
||||
"image", "figure", "chart", "diagram",
|
||||
"section", "paragraph", "page_number", "watermark", "header_group", "body",
|
||||
"code", "formula", "signature", "stamp", "logo", "barcode", "qr_code"
|
||||
],
|
||||
"description": "Element type (supports all 23 PP-StructureV3 types plus custom ones)"
|
||||
},
|
||||
"content": {
  "$comment": "anyOf rather than oneOf: a TableData instance is also a generic object, so it matches two branches and oneOf (exactly one match) would reject every table.",
  "anyOf": [
    {"type": "string"},
    {"$ref": "#/definitions/TableData"},
    {"type": "object"}
  ],
  "description": "Element content (text, table data, or structured data)"
},
|
||||
"content_type": {
|
||||
"type": "string",
|
||||
"enum": ["text", "table", "binary"],
|
||||
"description": "Type of content when not a simple string"
|
||||
},
|
||||
"content_length": {
  "type": "integer",
  "minimum": 0,
  "description": "Length of binary content in bytes"
},
|
||||
"bbox": {
|
||||
"$ref": "#/definitions/BoundingBox"
|
||||
},
|
||||
"confidence": {
|
||||
"type": "number",
|
||||
"minimum": 0,
|
||||
"maximum": 1,
|
||||
"description": "OCR confidence score (0-1)"
|
||||
},
|
||||
"style": {
|
||||
"$ref": "#/definitions/StyleInfo"
|
||||
},
|
||||
"metadata": {
|
||||
"type": "object",
|
||||
"additionalProperties": true,
|
||||
"description": "Additional element metadata"
|
||||
},
|
||||
"children": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"$ref": "#/definitions/DocumentElement"
|
||||
},
|
||||
"description": "Nested child elements"
|
||||
}
|
||||
}
|
||||
},
|
||||
"BoundingBox": {
|
||||
"type": "object",
|
||||
"required": ["x0", "y0", "x1", "y1"],
|
||||
"properties": {
|
||||
"x0": {
|
||||
"type": "number",
|
||||
"description": "Left coordinate"
|
||||
},
|
||||
"y0": {
|
||||
"type": "number",
|
||||
"description": "Top coordinate"
|
||||
},
|
||||
"x1": {
|
||||
"type": "number",
|
||||
"description": "Right coordinate"
|
||||
},
|
||||
"y1": {
|
||||
"type": "number",
|
||||
"description": "Bottom coordinate"
|
||||
},
|
||||
"width": {
|
||||
"type": "number",
|
||||
"description": "Width (calculated)"
|
||||
},
|
||||
"height": {
|
||||
"type": "number",
|
||||
"description": "Height (calculated)"
|
||||
}
|
||||
}
|
||||
},
|
||||
"StyleInfo": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"font_name": {
|
||||
"type": "string",
|
||||
"description": "Font family name"
|
||||
},
|
||||
"font_size": {
|
||||
"type": "number",
|
||||
"minimum": 0,
|
||||
"description": "Font size in points"
|
||||
},
|
||||
"font_weight": {
|
||||
"type": "string",
|
||||
"enum": ["normal", "bold"],
|
||||
"description": "Font weight"
|
||||
},
|
||||
"font_style": {
|
||||
"type": "string",
|
||||
"enum": ["normal", "italic"],
|
||||
"description": "Font style"
|
||||
},
|
||||
"text_color": {
|
||||
"type": "integer",
|
||||
"description": "Text color as RGB integer"
|
||||
},
|
||||
"text_color_rgb": {
|
||||
"type": "array",
|
||||
"items": {"type": "integer", "minimum": 0, "maximum": 255},
|
||||
"minItems": 3,
|
||||
"maxItems": 3,
|
||||
"description": "Text color as [R, G, B] array"
|
||||
},
|
||||
"bg_color": {
|
||||
"type": "integer",
|
||||
"description": "Background color as RGB integer"
|
||||
},
|
||||
"alignment": {
|
||||
"type": "string",
|
||||
"enum": ["left", "center", "right", "justify"],
|
||||
"description": "Text alignment"
|
||||
}
|
||||
}
|
||||
},
|
||||
"TableData": {
|
||||
"type": "object",
|
||||
"required": ["rows", "cols", "cells"],
|
||||
"properties": {
|
||||
"rows": {
|
||||
"type": "integer",
|
||||
"minimum": 0,
|
||||
"description": "Number of rows"
|
||||
},
|
||||
"cols": {
|
||||
"type": "integer",
|
||||
"minimum": 0,
|
||||
"description": "Number of columns"
|
||||
},
|
||||
"cells": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"$ref": "#/definitions/TableCell"
|
||||
},
|
||||
"description": "Table cells"
|
||||
},
|
||||
"headers": {
|
||||
"type": "array",
|
||||
"items": {"type": "string"},
|
||||
"description": "Header row labels"
|
||||
},
|
||||
"caption": {
|
||||
"type": "string",
|
||||
"description": "Table caption"
|
||||
}
|
||||
}
|
||||
},
|
||||
"TableCell": {
|
||||
"type": "object",
|
||||
"required": ["row", "col"],
|
||||
"properties": {
|
||||
"row": {
|
||||
"type": "integer",
|
||||
"minimum": 0,
|
||||
"description": "Row index (0-based)"
|
||||
},
|
||||
"col": {
|
||||
"type": "integer",
|
||||
"minimum": 0,
|
||||
"description": "Column index (0-based)"
|
||||
},
|
||||
"row_span": {
|
||||
"type": "integer",
|
||||
"minimum": 1,
|
||||
"default": 1,
|
||||
"description": "Number of rows spanned"
|
||||
},
|
||||
"col_span": {
|
||||
"type": "integer",
|
||||
"minimum": 1,
|
||||
"default": 1,
|
||||
"description": "Number of columns spanned"
|
||||
},
|
||||
"content": {
|
||||
"type": "string",
|
||||
"default": "",
|
||||
"description": "Cell text content"
|
||||
},
|
||||
"bbox": {
|
||||
"$ref": "#/definitions/BoundingBox"
|
||||
},
|
||||
"style": {
|
||||
"$ref": "#/definitions/StyleInfo"
|
||||
}
|
||||
}
|
||||
},
|
||||
"Dimensions": {
|
||||
"type": "object",
|
||||
"required": ["width", "height"],
|
||||
"properties": {
|
||||
"width": {
|
||||
"type": "number",
|
||||
"minimum": 0,
|
||||
"description": "Width in pixels or points"
|
||||
},
|
||||
"height": {
|
||||
"type": "number",
|
||||
"minimum": 0,
|
||||
"description": "Height in pixels or points"
|
||||
},
|
||||
"dpi": {
|
||||
"type": "integer",
|
||||
"minimum": 1,
|
||||
"description": "Resolution in DPI"
|
||||
}
|
||||
}
|
||||
},
|
||||
"DocumentStatistics": {
|
||||
"type": "object",
|
||||
"required": ["page_count", "total_elements", "total_tables", "total_images"],
|
||||
"properties": {
|
||||
"page_count": {
|
||||
"type": "integer",
|
||||
"minimum": 0,
|
||||
"description": "Total number of pages"
|
||||
},
|
||||
"total_elements": {
|
||||
"type": "integer",
|
||||
"minimum": 0,
|
||||
"description": "Total elements across all pages"
|
||||
},
|
||||
"total_tables": {
|
||||
"type": "integer",
|
||||
"minimum": 0,
|
||||
"description": "Total tables across all pages"
|
||||
},
|
||||
"total_images": {
|
||||
"type": "integer",
|
||||
"minimum": 0,
|
||||
"description": "Total images across all pages"
|
||||
}
|
||||
}
|
||||
},
|
||||
"PageStatistics": {
|
||||
"type": "object",
|
||||
"required": ["total_elements", "text_elements", "tables", "images"],
|
||||
"properties": {
|
||||
"total_elements": {
|
||||
"type": "integer",
|
||||
"minimum": 0
|
||||
},
|
||||
"text_elements": {
|
||||
"type": "integer",
|
||||
"minimum": 0
|
||||
},
|
||||
"tables": {
|
||||
"type": "integer",
|
||||
"minimum": 0
|
||||
},
|
||||
"images": {
|
||||
"type": "integer",
|
||||
"minimum": 0
|
||||
}
|
||||
}
|
||||
},
|
||||
"ProcessingError": {
|
||||
"type": "object",
|
||||
"required": ["error_type", "message"],
|
||||
"properties": {
|
||||
"error_type": {
|
||||
"type": "string",
|
||||
"description": "Error classification"
|
||||
},
|
||||
"message": {
|
||||
"type": "string",
|
||||
"description": "Error description"
|
||||
},
|
||||
"page": {
|
||||
"type": "integer",
|
||||
"description": "Page number where error occurred"
|
||||
},
|
||||
"element_id": {
|
||||
"type": "string",
|
||||
"description": "Element ID if applicable"
|
||||
},
|
||||
"timestamp": {
|
||||
"type": "string",
|
||||
"format": "date-time",
|
||||
"description": "When the error occurred"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user