Files
OCR/backend/app/schemas/unified_document_schema.json
egg ab89a40e8d feat: add unified JSON export with standardized schema
- Create JSON Schema definition for UnifiedDocument format
- Implement UnifiedDocumentExporter service with multiple export formats
- Include comprehensive processing metadata and statistics
- Update OCR service to use new exporter for dual-track outputs
- Support JSON, Markdown, Text, and legacy format exports

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-19 08:36:24 +08:00

444 lines
12 KiB
JSON

{
"$schema": "http://json-schema.org/draft-07/schema#",
"$id": "https://tool-ocr.local/schemas/unified-document.json",
"title": "UnifiedDocument",
"description": "Unified document representation for dual-track OCR/Direct extraction processing",
"type": "object",
"required": ["document_id", "metadata", "pages", "statistics"],
"properties": {
"document_id": {
"type": "string",
"description": "Unique identifier for the document"
},
"metadata": {
"$ref": "#/definitions/DocumentMetadata"
},
"pages": {
"type": "array",
"items": {
"$ref": "#/definitions/Page"
},
"description": "List of pages in the document"
},
"statistics": {
"$ref": "#/definitions/DocumentStatistics"
},
"processing_errors": {
"type": "array",
"items": {
"$ref": "#/definitions/ProcessingError"
},
"default": [],
"description": "List of any errors encountered during processing"
}
},
"definitions": {
"DocumentMetadata": {
"type": "object",
"required": ["filename", "file_type", "file_size", "created_at", "processing_track", "processing_time"],
"properties": {
"filename": {
"type": "string",
"description": "Original filename"
},
"file_type": {
"type": "string",
"description": "File MIME type or extension"
},
"file_size": {
"type": "integer",
"minimum": 0,
"description": "File size in bytes"
},
"created_at": {
"type": "string",
"format": "date-time",
"description": "Processing timestamp (ISO 8601)"
},
"processing_track": {
"type": "string",
"enum": ["ocr", "direct", "hybrid"],
"description": "Processing track used"
},
"processing_time": {
"type": "number",
"minimum": 0,
"description": "Processing time in seconds"
},
"language": {
"type": "string",
"description": "Detected or specified language code"
},
"title": {
"type": "string",
"description": "Document title from metadata"
},
"author": {
"type": "string",
"description": "Document author"
},
"subject": {
"type": "string",
"description": "Document subject"
},
"keywords": {
"type": "array",
"items": {"type": "string"},
"description": "Document keywords"
},
"producer": {
"type": "string",
"description": "PDF producer application"
},
"creator": {
"type": "string",
"description": "Document creator application"
},
"creation_date": {
"type": "string",
"format": "date-time",
"description": "Document creation date"
},
"modification_date": {
"type": "string",
"format": "date-time",
"description": "Document last modification date"
}
}
},
"Page": {
"type": "object",
"required": ["page_number", "elements", "dimensions"],
"properties": {
"page_number": {
"type": "integer",
"minimum": 1,
"description": "1-based page number"
},
"elements": {
"type": "array",
"items": {
"$ref": "#/definitions/DocumentElement"
},
"description": "List of elements on the page"
},
"dimensions": {
"$ref": "#/definitions/Dimensions"
},
"metadata": {
"type": "object",
"additionalProperties": true,
"description": "Additional page-specific metadata"
},
"statistics": {
"$ref": "#/definitions/PageStatistics"
}
}
},
"DocumentElement": {
"type": "object",
"required": ["element_id", "type", "bbox"],
"properties": {
"element_id": {
"type": "string",
"description": "Unique identifier for the element"
},
"type": {
"type": "string",
"enum": [
"text", "title", "header", "footer", "reference", "equation", "footnote", "caption",
"list", "list_item",
"table", "table_cell", "table_caption",
"image", "figure", "chart", "diagram",
"section", "paragraph", "page_number", "watermark", "header_group", "body",
"code", "formula", "signature", "stamp", "logo", "barcode", "qr_code"
],
"description": "Element type (supports all 23 PP-StructureV3 types plus custom ones)"
},
"content": {
"oneOf": [
{"type": "string"},
{"$ref": "#/definitions/TableData"},
{"type": "object"}
],
"description": "Element content (text, table data, or structured data)"
},
"content_type": {
"type": "string",
"enum": ["text", "table", "binary"],
"description": "Type of content when not a simple string"
},
"content_length": {
"type": "integer",
"description": "Length of binary content in bytes"
},
"bbox": {
"$ref": "#/definitions/BoundingBox"
},
"confidence": {
"type": "number",
"minimum": 0,
"maximum": 1,
"description": "OCR confidence score (0-1)"
},
"style": {
"$ref": "#/definitions/StyleInfo"
},
"metadata": {
"type": "object",
"additionalProperties": true,
"description": "Additional element metadata"
},
"children": {
"type": "array",
"items": {
"$ref": "#/definitions/DocumentElement"
},
"description": "Nested child elements"
}
}
},
"BoundingBox": {
"type": "object",
"required": ["x0", "y0", "x1", "y1"],
"properties": {
"x0": {
"type": "number",
"description": "Left coordinate"
},
"y0": {
"type": "number",
"description": "Top coordinate"
},
"x1": {
"type": "number",
"description": "Right coordinate"
},
"y1": {
"type": "number",
"description": "Bottom coordinate"
},
"width": {
"type": "number",
"description": "Width (calculated)"
},
"height": {
"type": "number",
"description": "Height (calculated)"
}
}
},
"StyleInfo": {
"type": "object",
"properties": {
"font_name": {
"type": "string",
"description": "Font family name"
},
"font_size": {
"type": "number",
"minimum": 0,
"description": "Font size in points"
},
"font_weight": {
"type": "string",
"enum": ["normal", "bold"],
"description": "Font weight"
},
"font_style": {
"type": "string",
"enum": ["normal", "italic"],
"description": "Font style"
},
"text_color": {
"type": "integer",
"description": "Text color as RGB integer"
},
"text_color_rgb": {
"type": "array",
"items": {"type": "integer", "minimum": 0, "maximum": 255},
"minItems": 3,
"maxItems": 3,
"description": "Text color as [R, G, B] array"
},
"bg_color": {
"type": "integer",
"description": "Background color as RGB integer"
},
"alignment": {
"type": "string",
"enum": ["left", "center", "right", "justify"],
"description": "Text alignment"
}
}
},
"TableData": {
"type": "object",
"required": ["rows", "cols", "cells"],
"properties": {
"rows": {
"type": "integer",
"minimum": 0,
"description": "Number of rows"
},
"cols": {
"type": "integer",
"minimum": 0,
"description": "Number of columns"
},
"cells": {
"type": "array",
"items": {
"$ref": "#/definitions/TableCell"
},
"description": "Table cells"
},
"headers": {
"type": "array",
"items": {"type": "string"},
"description": "Header row labels"
},
"caption": {
"type": "string",
"description": "Table caption"
}
}
},
"TableCell": {
"type": "object",
"required": ["row", "col"],
"properties": {
"row": {
"type": "integer",
"minimum": 0,
"description": "Row index (0-based)"
},
"col": {
"type": "integer",
"minimum": 0,
"description": "Column index (0-based)"
},
"row_span": {
"type": "integer",
"minimum": 1,
"default": 1,
"description": "Number of rows spanned"
},
"col_span": {
"type": "integer",
"minimum": 1,
"default": 1,
"description": "Number of columns spanned"
},
"content": {
"type": "string",
"default": "",
"description": "Cell text content"
},
"bbox": {
"$ref": "#/definitions/BoundingBox"
},
"style": {
"$ref": "#/definitions/StyleInfo"
}
}
},
"Dimensions": {
"type": "object",
"required": ["width", "height"],
"properties": {
"width": {
"type": "number",
"minimum": 0,
"description": "Width in pixels or points"
},
"height": {
"type": "number",
"minimum": 0,
"description": "Height in pixels or points"
},
"dpi": {
"type": "integer",
"minimum": 1,
"description": "Resolution in DPI"
}
}
},
"DocumentStatistics": {
"type": "object",
"required": ["page_count", "total_elements", "total_tables", "total_images"],
"properties": {
"page_count": {
"type": "integer",
"minimum": 0,
"description": "Total number of pages"
},
"total_elements": {
"type": "integer",
"minimum": 0,
"description": "Total elements across all pages"
},
"total_tables": {
"type": "integer",
"minimum": 0,
"description": "Total tables across all pages"
},
"total_images": {
"type": "integer",
"minimum": 0,
"description": "Total images across all pages"
}
}
},
"PageStatistics": {
"type": "object",
"required": ["total_elements", "text_elements", "tables", "images"],
"properties": {
"total_elements": {
"type": "integer",
"minimum": 0
},
"text_elements": {
"type": "integer",
"minimum": 0
},
"tables": {
"type": "integer",
"minimum": 0
},
"images": {
"type": "integer",
"minimum": 0
}
}
},
"ProcessingError": {
"type": "object",
"required": ["error_type", "message"],
"properties": {
"error_type": {
"type": "string",
"description": "Error classification"
},
"message": {
"type": "string",
"description": "Error description"
},
"page": {
"type": "integer",
"description": "Page number where error occurred"
},
"element_id": {
"type": "string",
"description": "Element ID if applicable"
},
"timestamp": {
"type": "string",
"format": "date-time",
"description": "When the error occurred"
}
}
}
}
}