{
  "$schema": "http://json-schema.org/draft-07/schema#",
  "$id": "https://tool-ocr.local/schemas/unified-document.json",
  "title": "UnifiedDocument",
  "description": "Unified document representation for dual-track OCR/Direct extraction processing",
  "type": "object",
  "required": ["document_id", "metadata", "pages", "statistics"],
  "properties": {
    "document_id": {
      "type": "string",
      "description": "Unique identifier for the document"
    },
    "metadata": {
      "$ref": "#/definitions/DocumentMetadata"
    },
    "pages": {
      "type": "array",
      "items": {
        "$ref": "#/definitions/Page"
      },
      "description": "List of pages in the document"
    },
    "statistics": {
      "$ref": "#/definitions/DocumentStatistics"
    },
    "processing_errors": {
      "type": "array",
      "items": {
        "$ref": "#/definitions/ProcessingError"
      },
      "default": [],
      "description": "List of any errors encountered during processing"
    }
  },
  "definitions": {
    "DocumentMetadata": {
      "type": "object",
      "description": "File-level and processing metadata for the document",
      "required": ["filename", "file_type", "file_size", "created_at", "processing_track", "processing_time"],
      "properties": {
        "filename": {
          "type": "string",
          "description": "Original filename"
        },
        "file_type": {
          "type": "string",
          "description": "File MIME type or extension"
        },
        "file_size": {
          "type": "integer",
          "minimum": 0,
          "description": "File size in bytes"
        },
        "created_at": {
          "type": "string",
          "format": "date-time",
          "description": "Processing timestamp (ISO 8601)"
        },
        "processing_track": {
          "type": "string",
          "enum": ["ocr", "direct", "hybrid"],
          "description": "Processing track used"
        },
        "processing_time": {
          "type": "number",
          "minimum": 0,
          "description": "Processing time in seconds"
        },
        "language": {
          "type": "string",
          "description": "Detected or specified language code"
        },
        "title": {
          "type": "string",
          "description": "Document title from metadata"
        },
        "author": {
          "type": "string",
          "description": "Document author"
        },
        "subject": {
          "type": "string",
          "description": "Document subject"
        },
        "keywords": {
          "type": "array",
          "items": {"type": "string"},
          "description": "Document keywords"
        },
        "producer": {
          "type": "string",
          "description": "PDF producer application"
        },
        "creator": {
          "type": "string",
          "description": "Document creator application"
        },
        "creation_date": {
          "type": "string",
          "format": "date-time",
          "description": "Document creation date"
        },
        "modification_date": {
          "type": "string",
          "format": "date-time",
          "description": "Document last modification date"
        }
      }
    },
    "Page": {
      "type": "object",
      "description": "A single page of the document together with the elements found on it",
      "required": ["page_number", "elements", "dimensions"],
      "properties": {
        "page_number": {
          "type": "integer",
          "minimum": 1,
          "description": "1-based page number"
        },
        "elements": {
          "type": "array",
          "items": {
            "$ref": "#/definitions/DocumentElement"
          },
          "description": "List of elements on the page"
        },
        "dimensions": {
          "$ref": "#/definitions/Dimensions"
        },
        "metadata": {
          "type": "object",
          "additionalProperties": true,
          "description": "Additional page-specific metadata"
        },
        "statistics": {
          "$ref": "#/definitions/PageStatistics"
        }
      }
    },
    "DocumentElement": {
      "type": "object",
      "description": "A typed, positioned element on a page; may contain nested child elements",
      "required": ["element_id", "type", "bbox"],
      "properties": {
        "element_id": {
          "type": "string",
          "description": "Unique identifier for the element"
        },
        "type": {
          "type": "string",
          "enum": [
            "text",
            "title",
            "header",
            "footer",
            "reference",
            "equation",
            "footnote",
            "caption",
            "list",
            "list_item",
            "table",
            "table_cell",
            "table_caption",
            "image",
            "figure",
            "chart",
            "diagram",
            "section",
            "paragraph",
            "page_number",
            "watermark",
            "header_group",
            "body",
            "code",
            "formula",
            "signature",
            "stamp",
            "logo",
            "barcode",
            "qr_code"
          ],
          "description": "Element type (supports all 23 PP-StructureV3 types plus custom ones)"
        },
        "content": {
          "$comment": "anyOf rather than oneOf: every valid TableData instance is also a plain object, so with oneOf it matched two branches and valid table content was rejected.",
          "anyOf": [
            {"type": "string"},
            {"$ref": "#/definitions/TableData"},
            {"type": "object"}
          ],
          "description": "Element content (text, table data, or structured data)"
        },
        "content_type": {
          "type": "string",
          "enum": ["text", "table", "binary"],
          "description": "Type of content when not a simple string"
        },
        "content_length": {
          "type": "integer",
          "description": "Length of binary content in bytes"
        },
        "bbox": {
          "$ref": "#/definitions/BoundingBox"
        },
        "confidence": {
          "type": "number",
          "minimum": 0,
          "maximum": 1,
          "description": "OCR confidence score (0-1)"
        },
        "style": {
          "$ref": "#/definitions/StyleInfo"
        },
        "metadata": {
          "type": "object",
          "additionalProperties": true,
          "description": "Additional element metadata"
        },
        "children": {
          "type": "array",
          "items": {
            "$ref": "#/definitions/DocumentElement"
          },
          "description": "Nested child elements"
        }
      }
    },
    "BoundingBox": {
      "type": "object",
      "description": "Rectangle given by its left/top (x0, y0) and right/bottom (x1, y1) coordinates",
      "required": ["x0", "y0", "x1", "y1"],
      "properties": {
        "x0": {
          "type": "number",
          "description": "Left coordinate"
        },
        "y0": {
          "type": "number",
          "description": "Top coordinate"
        },
        "x1": {
          "type": "number",
          "description": "Right coordinate"
        },
        "y1": {
          "type": "number",
          "description": "Bottom coordinate"
        },
        "width": {
          "type": "number",
          "description": "Width (calculated)"
        },
        "height": {
          "type": "number",
          "description": "Height (calculated)"
        }
      }
    },
    "StyleInfo": {
      "type": "object",
      "description": "Font, color and alignment attributes; all fields are optional",
      "properties": {
        "font_name": {
          "type": "string",
          "description": "Font family name"
        },
        "font_size": {
          "type": "number",
          "minimum": 0,
          "description": "Font size in points"
        },
        "font_weight": {
          "type": "string",
          "enum": ["normal", "bold"],
          "description": "Font weight"
        },
        "font_style": {
          "type": "string",
          "enum": ["normal", "italic"],
          "description": "Font style"
        },
        "text_color": {
          "type": "integer",
          "description": "Text color as RGB integer"
        },
        "text_color_rgb": {
          "type": "array",
          "items": {"type": "integer", "minimum": 0, "maximum": 255},
          "minItems": 3,
          "maxItems": 3,
          "description": "Text color as [R, G, B] array"
        },
        "bg_color": {
          "type": "integer",
          "description": "Background color as RGB integer"
        },
        "alignment": {
          "type": "string",
          "enum": ["left", "center", "right", "justify"],
          "description": "Text alignment"
        }
      }
    },
    "TableData": {
      "type": "object",
      "description": "Table structure: row and column counts, cells, and optional headers and caption",
      "required": ["rows", "cols", "cells"],
      "properties": {
        "rows": {
          "type": "integer",
          "minimum": 0,
          "description": "Number of rows"
        },
        "cols": {
          "type": "integer",
          "minimum": 0,
          "description": "Number of columns"
        },
        "cells": {
          "type": "array",
          "items": {
            "$ref": "#/definitions/TableCell"
          },
          "description": "Table cells"
        },
        "headers": {
          "type": "array",
          "items": {"type": "string"},
          "description": "Header row labels"
        },
        "caption": {
          "type": "string",
          "description": "Table caption"
        }
      }
    },
    "TableCell": {
      "type": "object",
      "description": "A single table cell identified by its 0-based row and column index",
      "required": ["row", "col"],
      "properties": {
        "row": {
          "type": "integer",
          "minimum": 0,
          "description": "Row index (0-based)"
        },
        "col": {
          "type": "integer",
          "minimum": 0,
          "description": "Column index (0-based)"
        },
        "row_span": {
          "type": "integer",
          "minimum": 1,
          "default": 1,
          "description": "Number of rows spanned"
        },
        "col_span": {
          "type": "integer",
          "minimum": 1,
          "default": 1,
          "description": "Number of columns spanned"
        },
        "content": {
          "type": "string",
          "default": "",
          "description": "Cell text content"
        },
        "bbox": {
          "$ref": "#/definitions/BoundingBox"
        },
        "style": {
          "$ref": "#/definitions/StyleInfo"
        }
      }
    },
    "Dimensions": {
      "type": "object",
      "description": "Width and height of a page, with an optional resolution",
      "required": ["width", "height"],
      "properties": {
        "width": {
          "type": "number",
          "minimum": 0,
          "description": "Width in pixels or points"
        },
        "height": {
          "type": "number",
          "minimum": 0,
          "description": "Height in pixels or points"
        },
        "dpi": {
          "type": "integer",
          "minimum": 1,
          "description": "Resolution in DPI"
        }
      }
    },
    "DocumentStatistics": {
      "type": "object",
      "description": "Document-wide page and element counts",
      "required": ["page_count", "total_elements", "total_tables", "total_images"],
      "properties": {
        "page_count": {
          "type": "integer",
          "minimum": 0,
          "description": "Total number of pages"
        },
        "total_elements": {
          "type": "integer",
          "minimum": 0,
          "description": "Total elements across all pages"
        },
        "total_tables": {
          "type": "integer",
          "minimum": 0,
          "description": "Total tables across all pages"
        },
        "total_images": {
          "type": "integer",
          "minimum": 0,
          "description": "Total images across all pages"
        }
      }
    },
    "PageStatistics": {
      "type": "object",
      "description": "Element counts for a single page",
      "required": ["total_elements", "text_elements", "tables", "images"],
      "properties": {
        "total_elements": {
          "type": "integer",
          "minimum": 0,
          "description": "Total elements on the page"
        },
        "text_elements": {
          "type": "integer",
          "minimum": 0,
          "description": "Number of text elements on the page"
        },
        "tables": {
          "type": "integer",
          "minimum": 0,
          "description": "Number of tables on the page"
        },
        "images": {
          "type": "integer",
          "minimum": 0,
          "description": "Number of images on the page"
        }
      }
    },
    "ProcessingError": {
      "type": "object",
      "description": "An error recorded during processing, optionally tied to a page or element",
      "required": ["error_type", "message"],
      "properties": {
        "error_type": {
          "type": "string",
          "description": "Error classification"
        },
        "message": {
          "type": "string",
          "description": "Error description"
        },
        "page": {
          "type": "integer",
          "description": "Page number where error occurred"
        },
        "element_id": {
          "type": "string",
          "description": "Element ID if applicable"
        },
        "timestamp": {
          "type": "string",
          "format": "date-time",
          "description": "When the error occurred"
        }
      }
    }
  }
}