- Create JSON Schema definition for UnifiedDocument format
- Implement UnifiedDocumentExporter service with multiple export formats
- Include comprehensive processing metadata and statistics
- Update OCR service to use new exporter for dual-track outputs
- Support JSON, Markdown, Text, and legacy format exports

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
444 lines
12 KiB
JSON
444 lines
12 KiB
JSON
{
|
|
"$schema": "http://json-schema.org/draft-07/schema#",
"$id": "https://tool-ocr.local/schemas/unified-document.json",
"title": "UnifiedDocument",
"description": "Unified document representation for dual-track OCR/Direct extraction processing",
"type": "object",
"properties": {
  "document_id": { "type": "string", "description": "Unique identifier for the document" },
  "metadata": { "$ref": "#/definitions/DocumentMetadata" },
  "pages": {
    "type": "array",
    "description": "List of pages in the document",
    "items": { "$ref": "#/definitions/Page" }
  },
  "statistics": { "$ref": "#/definitions/DocumentStatistics" },
  "processing_errors": {
    "type": "array",
    "description": "List of any errors encountered during processing",
    "default": [],
    "items": { "$ref": "#/definitions/ProcessingError" }
  }
},
"required": ["document_id", "metadata", "pages", "statistics"],
|
|
"definitions": {
|
|
"DocumentMetadata": {
  "type": "object",
  "properties": {
    "filename": { "type": "string", "description": "Original filename" },
    "file_type": { "type": "string", "description": "File MIME type or extension" },
    "file_size": { "type": "integer", "minimum": 0, "description": "File size in bytes" },
    "created_at": { "type": "string", "format": "date-time", "description": "Processing timestamp (ISO 8601)" },
    "processing_track": {
      "type": "string",
      "enum": ["ocr", "direct", "hybrid"],
      "description": "Processing track used"
    },
    "processing_time": { "type": "number", "minimum": 0, "description": "Processing time in seconds" },
    "language": { "type": "string", "description": "Detected or specified language code" },
    "title": { "type": "string", "description": "Document title from metadata" },
    "author": { "type": "string", "description": "Document author" },
    "subject": { "type": "string", "description": "Document subject" },
    "keywords": {
      "type": "array",
      "description": "Document keywords",
      "items": { "type": "string" }
    },
    "producer": { "type": "string", "description": "PDF producer application" },
    "creator": { "type": "string", "description": "Document creator application" },
    "creation_date": { "type": "string", "format": "date-time", "description": "Document creation date" },
    "modification_date": { "type": "string", "format": "date-time", "description": "Document last modification date" }
  },
  "required": ["filename", "file_type", "file_size", "created_at", "processing_track", "processing_time"]
},
|
|
"Page": {
  "type": "object",
  "properties": {
    "page_number": { "type": "integer", "minimum": 1, "description": "1-based page number" },
    "elements": {
      "type": "array",
      "description": "List of elements on the page",
      "items": { "$ref": "#/definitions/DocumentElement" }
    },
    "dimensions": { "$ref": "#/definitions/Dimensions" },
    "metadata": {
      "type": "object",
      "description": "Additional page-specific metadata",
      "additionalProperties": true
    },
    "statistics": { "$ref": "#/definitions/PageStatistics" }
  },
  "required": ["page_number", "elements", "dimensions"]
},
|
|
"DocumentElement": {
  "type": "object",
  "required": ["element_id", "type", "bbox"],
  "properties": {
    "element_id": {
      "type": "string",
      "description": "Unique identifier for the element"
    },
    "type": {
      "type": "string",
      "enum": [
        "text", "title", "header", "footer", "reference", "equation", "footnote", "caption",
        "list", "list_item",
        "table", "table_cell", "table_caption",
        "image", "figure", "chart", "diagram",
        "section", "paragraph", "page_number", "watermark", "header_group", "body",
        "code", "formula", "signature", "stamp", "logo", "barcode", "qr_code"
      ],
      "description": "Element type (supports all 23 PP-StructureV3 types plus custom ones)"
    },
    "content": {
      "$comment": "anyOf, not oneOf: a TableData instance is also a plain object, so with oneOf it would match two branches and every table would fail validation.",
      "anyOf": [
        {"type": "string"},
        {"$ref": "#/definitions/TableData"},
        {"type": "object"}
      ],
      "description": "Element content (text, table data, or structured data)"
    },
    "content_type": {
      "type": "string",
      "enum": ["text", "table", "binary"],
      "description": "Type of content when not a simple string"
    },
    "content_length": {
      "type": "integer",
      "minimum": 0,
      "description": "Length of binary content in bytes"
    },
    "bbox": {
      "$ref": "#/definitions/BoundingBox"
    },
    "confidence": {
      "type": "number",
      "minimum": 0,
      "maximum": 1,
      "description": "OCR confidence score (0-1)"
    },
    "style": {
      "$ref": "#/definitions/StyleInfo"
    },
    "metadata": {
      "type": "object",
      "additionalProperties": true,
      "description": "Additional element metadata"
    },
    "children": {
      "type": "array",
      "items": {
        "$ref": "#/definitions/DocumentElement"
      },
      "description": "Nested child elements"
    }
  }
},
|
|
"BoundingBox": {
  "type": "object",
  "properties": {
    "x0": { "type": "number", "description": "Left coordinate" },
    "y0": { "type": "number", "description": "Top coordinate" },
    "x1": { "type": "number", "description": "Right coordinate" },
    "y1": { "type": "number", "description": "Bottom coordinate" },
    "width": { "type": "number", "description": "Width (calculated)" },
    "height": { "type": "number", "description": "Height (calculated)" }
  },
  "required": ["x0", "y0", "x1", "y1"]
},
|
|
"StyleInfo": {
  "type": "object",
  "properties": {
    "font_name": { "type": "string", "description": "Font family name" },
    "font_size": { "type": "number", "minimum": 0, "description": "Font size in points" },
    "font_weight": {
      "type": "string",
      "enum": ["normal", "bold"],
      "description": "Font weight"
    },
    "font_style": {
      "type": "string",
      "enum": ["normal", "italic"],
      "description": "Font style"
    },
    "text_color": { "type": "integer", "description": "Text color as RGB integer" },
    "text_color_rgb": {
      "type": "array",
      "description": "Text color as [R, G, B] array",
      "minItems": 3,
      "maxItems": 3,
      "items": { "type": "integer", "minimum": 0, "maximum": 255 }
    },
    "bg_color": { "type": "integer", "description": "Background color as RGB integer" },
    "alignment": {
      "type": "string",
      "enum": ["left", "center", "right", "justify"],
      "description": "Text alignment"
    }
  }
},
|
|
"TableData": {
  "type": "object",
  "properties": {
    "rows": { "type": "integer", "minimum": 0, "description": "Number of rows" },
    "cols": { "type": "integer", "minimum": 0, "description": "Number of columns" },
    "cells": {
      "type": "array",
      "description": "Table cells",
      "items": { "$ref": "#/definitions/TableCell" }
    },
    "headers": {
      "type": "array",
      "description": "Header row labels",
      "items": { "type": "string" }
    },
    "caption": { "type": "string", "description": "Table caption" }
  },
  "required": ["rows", "cols", "cells"]
},
|
|
"TableCell": {
  "type": "object",
  "properties": {
    "row": { "type": "integer", "minimum": 0, "description": "Row index (0-based)" },
    "col": { "type": "integer", "minimum": 0, "description": "Column index (0-based)" },
    "row_span": { "type": "integer", "minimum": 1, "default": 1, "description": "Number of rows spanned" },
    "col_span": { "type": "integer", "minimum": 1, "default": 1, "description": "Number of columns spanned" },
    "content": { "type": "string", "default": "", "description": "Cell text content" },
    "bbox": { "$ref": "#/definitions/BoundingBox" },
    "style": { "$ref": "#/definitions/StyleInfo" }
  },
  "required": ["row", "col"]
},
|
|
"Dimensions": {
  "type": "object",
  "properties": {
    "width": { "type": "number", "minimum": 0, "description": "Width in pixels or points" },
    "height": { "type": "number", "minimum": 0, "description": "Height in pixels or points" },
    "dpi": { "type": "integer", "minimum": 1, "description": "Resolution in DPI" }
  },
  "required": ["width", "height"]
},
|
|
"DocumentStatistics": {
  "type": "object",
  "properties": {
    "page_count": { "type": "integer", "minimum": 0, "description": "Total number of pages" },
    "total_elements": { "type": "integer", "minimum": 0, "description": "Total elements across all pages" },
    "total_tables": { "type": "integer", "minimum": 0, "description": "Total tables across all pages" },
    "total_images": { "type": "integer", "minimum": 0, "description": "Total images across all pages" }
  },
  "required": ["page_count", "total_elements", "total_tables", "total_images"]
},
|
|
"PageStatistics": {
  "type": "object",
  "required": ["total_elements", "text_elements", "tables", "images"],
  "properties": {
    "total_elements": {
      "type": "integer",
      "minimum": 0,
      "description": "Total elements on the page"
    },
    "text_elements": {
      "type": "integer",
      "minimum": 0,
      "description": "Number of text elements on the page"
    },
    "tables": {
      "type": "integer",
      "minimum": 0,
      "description": "Number of tables on the page"
    },
    "images": {
      "type": "integer",
      "minimum": 0,
      "description": "Number of images on the page"
    }
  }
},
|
|
"ProcessingError": {
  "type": "object",
  "properties": {
    "error_type": { "type": "string", "description": "Error classification" },
    "message": { "type": "string", "description": "Error description" },
    "page": { "type": "integer", "description": "Page number where error occurred" },
    "element_id": { "type": "string", "description": "Element ID if applicable" },
    "timestamp": { "type": "string", "format": "date-time", "description": "When the error occurred" }
  },
  "required": ["error_type", "message"]
}
|
|
}
|
|
}
|