feat: implement core dual-track processing infrastructure

Added foundation for dual-track document processing: 1. UnifiedDocument Model (backend/app/models/unified_document.py) - Common output format for both OCR and direct extraction - Comprehensive element types (23+ types from PP-StructureV3) - BoundingBox, StyleInfo, TableData structures - Backward compatibility with legacy format 2. DocumentTypeDetector Service (backend/app/services/document_type_detector.py) - Intelligent document type detection using python-magic - PDF editability analysis using PyMuPDF - Processing track recommendation with confidence scores - Support for PDF, images, Office docs, and text files 3. DirectExtractionEngine Service (backend/app/services/direct_extraction_engine.py) - Fast extraction from editable PDFs using PyMuPDF - Preserves fonts, colors, and exact positioning - Native and positional table detection - Image extraction with coordinates - Hyperlink and metadata extraction 4. Dependencies - Added PyMuPDF>=1.23.0 for PDF extraction - Added pdfplumber>=0.10.0 as fallback - Added python-magic-bin>=0.4.14 for file detection Next: Integrate with OCR service for complete dual-track processing 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-18 20:17:50 +08:00
parent cd3cbea49d
commit 2d50c128f7
4 changed files with 1729 additions and 0 deletions
--- a/backend/app/models/unified_document.py
+++ b/backend/app/models/unified_document.py
@@ -0,0 +1,694 @@
+"""
+Unified Document Model for Dual-track Processing
+
+This module defines the common data structure used by both OCR and direct extraction tracks
+to ensure consistent output format regardless of processing method.
+"""
+
+from dataclasses import dataclass, field
+from typing import List, Dict, Optional, Union, Literal, Any
+from datetime import datetime
+from enum import Enum
+
+
+class ElementType(str, Enum):
+    """Kind of content held by a document element.
+
+    Per the original author, the members cover the 23 PP-StructureV3 layout
+    types plus custom additions. The class also derives from ``str``, so a
+    member compares equal to its lowercase string value; that ``.value``
+    string is what ``DocumentElement.to_dict`` writes under ``"type"``.
+
+    NOTE(review): the members are not marked as PP-StructureV3 vs. custom, so
+    the split cannot be told from this file alone.
+    """
+    # Text elements
+    TEXT = "text"
+    TITLE = "title"
+    HEADER = "header"
+    FOOTER = "footer"
+    REFERENCE = "reference"
+    EQUATION = "equation"
+    FOOTNOTE = "footnote"
+    CAPTION = "caption"
+
+    # List elements
+    LIST = "list"
+    LIST_ITEM = "list_item"
+
+    # Table elements
+    TABLE = "table"
+    TABLE_CELL = "table_cell"
+    TABLE_CAPTION = "table_caption"
+
+    # Visual elements
+    IMAGE = "image"
+    FIGURE = "figure"
+    CHART = "chart"
+    DIAGRAM = "diagram"
+
+    # Structural elements
+    SECTION = "section"
+    PARAGRAPH = "paragraph"
+    PAGE_NUMBER = "page_number"
+    WATERMARK = "watermark"
+    HEADER_GROUP = "header_group"
+    BODY = "body"
+
+    # Special elements
+    CODE = "code"
+    # NOTE(review): FORMULA and EQUATION (text group above) look like they
+    # describe the same kind of content - confirm the intended distinction.
+    FORMULA = "formula"
+    SIGNATURE = "signature"
+    STAMP = "stamp"
+    LOGO = "logo"
+    BARCODE = "barcode"
+    QR_CODE = "qr_code"
+
+
+class ProcessingTrack(str, Enum):
+    """Processing track used for the document.
+
+    Stored on ``DocumentMetadata.processing_track``. The member's ``.value``
+    string is what the serializers in this module emit.
+    """
+    OCR = "ocr"           # PaddleOCR PP-StructureV3 track
+    DIRECT = "direct"     # PyMuPDF direct extraction track
+    HYBRID = "hybrid"     # Mixed processing (future)
+
+
+@dataclass
+class BoundingBox:
+    """Axis-aligned rectangle locating an element on a page.
+
+    ``(x0, y0)`` is the top-left corner and ``(x1, y1)`` the bottom-right
+    corner. No ordering of the coordinates is enforced, so ``width`` and
+    ``height`` can come out negative for an inverted box.
+    """
+    x0: float  # Left edge
+    y0: float  # Top edge
+    x1: float  # Right edge
+    y1: float  # Bottom edge
+
+    @property
+    def width(self) -> float:
+        """Horizontal extent, ``x1 - x0``."""
+        return self.x1 - self.x0
+
+    @property
+    def height(self) -> float:
+        """Vertical extent, ``y1 - y0``."""
+        return self.y1 - self.y0
+
+    @property
+    def center_x(self) -> float:
+        """Midpoint between the left and right edges."""
+        return (self.x0 + self.x1) / 2
+
+    @property
+    def center_y(self) -> float:
+        """Midpoint between the top and bottom edges."""
+        return (self.y0 + self.y1) / 2
+
+    def to_dict(self) -> Dict[str, float]:
+        """Return the four coordinates plus the derived width and height."""
+        exported = ("x0", "y0", "x1", "y1", "width", "height")
+        return {name: getattr(self, name) for name in exported}
+
+    def overlaps(self, other: 'BoundingBox', tolerance: float = 0) -> bool:
+        """Tell whether the two boxes intersect.
+
+        This box is grown by ``tolerance`` on every side first. Boxes that
+        merely touch along an edge count as overlapping.
+        """
+        left = self.x0 - tolerance
+        right = self.x1 + tolerance
+        top = self.y0 - tolerance
+        bottom = self.y1 + tolerance
+
+        # The boxes are disjoint as soon as one lies entirely beside the
+        # other on either axis.
+        apart_horizontally = right < other.x0 or left > other.x1
+        apart_vertically = bottom < other.y0 or top > other.y1
+        return not (apart_horizontally or apart_vertically)
+
+    def contains(self, other: 'BoundingBox', tolerance: float = 0) -> bool:
+        """Tell whether ``other`` lies fully inside this box.
+
+        This box is grown by ``tolerance`` on every side before the check.
+        Shared edges count as contained.
+        """
+        fits_horizontally = (
+            self.x0 - tolerance <= other.x0 and self.x1 + tolerance >= other.x1
+        )
+        fits_vertically = (
+            self.y0 - tolerance <= other.y0 and self.y1 + tolerance >= other.y1
+        )
+        return fits_horizontally and fits_vertically
+
+
+@dataclass
+class StyleInfo:
+    """Optional styling attributes attached to a text element.
+
+    Every field defaults to ``None``, meaning "not known".
+    """
+    font_name: Optional[str] = None
+    font_size: Optional[float] = None
+    font_weight: Optional[str] = None  # normal, bold
+    font_style: Optional[str] = None   # normal, italic
+    text_color: Optional[int] = None   # RGB packed into one integer
+    bg_color: Optional[int] = None     # Background color
+    alignment: Optional[str] = None    # left, center, right, justify
+
+    @property
+    def is_bold(self) -> bool:
+        """True only when ``font_weight`` is exactly ``"bold"``."""
+        return self.font_weight == "bold"
+
+    @property
+    def is_italic(self) -> bool:
+        """True only when ``font_style`` is exactly ``"italic"``."""
+        return self.font_style == "italic"
+
+    def get_rgb_color(self) -> Optional[tuple]:
+        """Unpack ``text_color`` into an ``(r, g, b)`` tuple of 0-255 values.
+
+        Returns ``None`` when no text color is set.
+        """
+        packed = self.text_color
+        if packed is None:
+            return None
+        # Red sits in bits 16-23, green in bits 8-15, blue in bits 0-7.
+        return tuple((packed >> shift) & 0xFF for shift in (16, 8, 0))
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Serialize the attributes that are set.
+
+        Font fields and ``alignment`` are skipped when falsy. The two color
+        fields are skipped only when ``None``, so a color of ``0`` is kept.
+        A set text color is also exported unpacked as ``text_color_rgb``.
+        """
+        payload = {}
+
+        for name in ("font_name", "font_size", "font_weight", "font_style"):
+            value = getattr(self, name)
+            if value:
+                payload[name] = value
+
+        if self.text_color is not None:
+            payload["text_color"] = self.text_color
+            payload["text_color_rgb"] = self.get_rgb_color()
+
+        if self.bg_color is not None:
+            payload["bg_color"] = self.bg_color
+
+        if self.alignment:
+            payload["alignment"] = self.alignment
+
+        return payload
+
+
+@dataclass
+class TableCell:
+    """One cell of a table, addressed by its row and column index."""
+    row: int
+    col: int
+    row_span: int = 1
+    col_span: int = 1
+    content: str = ""
+    bbox: Optional[BoundingBox] = None
+    style: Optional[StyleInfo] = None
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Serialize the cell.
+
+        ``bbox`` and ``style`` are always present in the result and are
+        ``None`` when the cell has none.
+        """
+        bbox_payload = None
+        if self.bbox:
+            bbox_payload = self.bbox.to_dict()
+
+        style_payload = None
+        if self.style:
+            style_payload = self.style.to_dict()
+
+        payload = {
+            "row": self.row,
+            "col": self.col,
+            "row_span": self.row_span,
+            "col_span": self.col_span,
+            "content": self.content,
+        }
+        payload["bbox"] = bbox_payload
+        payload["style"] = style_payload
+        return payload
+
+
+@dataclass
+class TableData:
+    """Structured table data: grid size, cells, and optional headers/caption."""
+    rows: int
+    cols: int
+    cells: List[TableCell] = field(default_factory=list)
+    headers: Optional[List[str]] = None
+    caption: Optional[str] = None
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Serialize the table, including every cell, for JSON output."""
+        return {
+            "rows": self.rows,
+            "cols": self.cols,
+            "cells": [cell.to_dict() for cell in self.cells],
+            "headers": self.headers,
+            "caption": self.caption
+        }
+
+    def to_html(self) -> str:
+        """Render the table as newline-separated HTML.
+
+        One ``<tr>`` is emitted per index in ``range(self.rows)``. Cells
+        whose ``row`` falls outside that range are not rendered. Within a
+        row, cells are ordered by ``col``. ``rowspan``/``colspan``
+        attributes appear only for spans greater than 1.
+
+        ``headers`` is used only as a flag: when it is non-empty, row 0 is
+        rendered with ``<th>`` instead of ``<td>``. The header strings
+        themselves are not written out.
+
+        The caption and cell text are HTML-escaped, because they come from
+        document content and may contain ``<``, ``>`` or ``&``.
+        """
+        # Imported locally so the block does not rely on a module-level
+        # import that the file does not have.
+        from html import escape
+
+        parts = ["<table>"]
+
+        if self.caption:
+            parts.append(f"<caption>{escape(str(self.caption), quote=False)}</caption>")
+
+        # Group cells by row index.
+        cells_by_row = {}
+        for cell in self.cells:
+            cells_by_row.setdefault(cell.row, []).append(cell)
+
+        for row_idx in range(self.rows):
+            tag = "th" if row_idx == 0 and self.headers else "td"
+            parts.append("<tr>")
+            for cell in sorted(cells_by_row.get(row_idx, ()), key=lambda c: c.col):
+                # Each attribute carries its own leading space, so a cell
+                # without spans renders as "<td>" rather than "<td >".
+                attrs = ""
+                if cell.row_span > 1:
+                    attrs += f' rowspan="{cell.row_span}"'
+                if cell.col_span > 1:
+                    attrs += f' colspan="{cell.col_span}"'
+                text = escape(str(cell.content), quote=False)
+                parts.append(f"<{tag}{attrs}>{text}</{tag}>")
+            parts.append("</tr>")
+
+        parts.append("</table>")
+        return "\n".join(parts)
+
+
+@dataclass
+class DocumentElement:
+    """A single positioned piece of page content (text, image, table, ...).
+
+    ``content`` may be a plain string, a ``TableData``, raw bytes, or a
+    dictionary. ``children`` allows elements to be nested.
+    """
+    element_id: str
+    type: ElementType
+    content: Union[str, TableData, bytes, Dict[str, Any]]
+    bbox: BoundingBox
+    confidence: Optional[float] = None  # OCR confidence (0-1)
+    style: Optional[StyleInfo] = None
+    metadata: Dict[str, Any] = field(default_factory=dict)
+    children: List['DocumentElement'] = field(default_factory=list)
+
+    @property
+    def is_text(self) -> bool:
+        """True for the TEXT, TITLE, HEADER, FOOTER, CAPTION and PARAGRAPH types."""
+        textual_types = (
+            ElementType.TEXT,
+            ElementType.TITLE,
+            ElementType.HEADER,
+            ElementType.FOOTER,
+            ElementType.CAPTION,
+            ElementType.PARAGRAPH,
+        )
+        return self.type in textual_types
+
+    @property
+    def is_visual(self) -> bool:
+        """True for the IMAGE, FIGURE, CHART, DIAGRAM and LOGO types."""
+        visual_types = (
+            ElementType.IMAGE,
+            ElementType.FIGURE,
+            ElementType.CHART,
+            ElementType.DIAGRAM,
+            ElementType.LOGO,
+        )
+        return self.type in visual_types
+
+    @property
+    def is_table(self) -> bool:
+        """True for the TABLE and TABLE_CELL types."""
+        return self.type in (ElementType.TABLE, ElementType.TABLE_CELL)
+
+    def get_text(self) -> str:
+        """Return the textual content of the element.
+
+        A string is returned as is. A ``TableData`` yields its non-empty
+        cell contents joined by single spaces. A dictionary yields its
+        ``"text"`` entry when it has one. Anything else (including bytes)
+        yields an empty string.
+        """
+        body = self.content
+        if isinstance(body, str):
+            return body
+        if isinstance(body, TableData):
+            return " ".join(cell.content for cell in body.cells if cell.content)
+        if isinstance(body, dict) and "text" in body:
+            return body["text"]
+        return ""
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Build a JSON-friendly dictionary for the element.
+
+        Binary content is never embedded; only its length is reported.
+        ``confidence`` is included whenever it is not ``None``. ``style``,
+        ``metadata`` and ``children`` are included only when non-empty.
+        """
+        payload = {
+            "element_id": self.element_id,
+            "type": self.type.value,
+            "bbox": self.bbox.to_dict(),
+        }
+
+        # The representation of the content depends on its runtime type.
+        body = self.content
+        if isinstance(body, str):
+            payload["content"] = body
+        elif isinstance(body, TableData):
+            payload.update(content=body.to_dict(), content_type="table")
+        elif isinstance(body, bytes):
+            payload.update(content_type="binary", content_length=len(body))
+        elif isinstance(body, dict):
+            payload["content"] = body
+
+        if self.confidence is not None:
+            payload["confidence"] = self.confidence
+        if self.style:
+            payload["style"] = self.style.to_dict()
+        if self.metadata:
+            payload["metadata"] = self.metadata
+        if self.children:
+            payload["children"] = [child.to_dict() for child in self.children]
+
+        return payload
+
+
+@dataclass
+class Dimensions:
+    """Width and height of a page or image, with an optional DPI."""
+    width: float
+    height: float
+    dpi: Optional[int] = None
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Serialize the size; ``dpi`` is added only when it is truthy."""
+        size = {"width": self.width, "height": self.height}
+        if not self.dpi:
+            return size
+        return {**size, "dpi": self.dpi}
+
+
+@dataclass
+class Page:
+    """One page of a document together with the elements found on it."""
+    page_number: int  # 1-based page number
+    elements: List[DocumentElement]
+    dimensions: Dimensions
+    metadata: Dict[str, Any] = field(default_factory=dict)
+
+    def get_reading_order(self) -> List[DocumentElement]:
+        """Return the elements sorted top-to-bottom, then left-to-right.
+
+        Ordering uses each element's ``bbox.y0`` first and ``bbox.x0``
+        second. ``self.elements`` itself is left untouched.
+        """
+        def position(element):
+            return (element.bbox.y0, element.bbox.x0)
+
+        ordered = list(self.elements)
+        ordered.sort(key=position)
+        return ordered
+
+    def get_elements_by_type(self, element_type: ElementType) -> List[DocumentElement]:
+        """Return the elements whose type equals ``element_type``."""
+        return [item for item in self.elements if item.type == element_type]
+
+    def get_text_elements(self) -> List[DocumentElement]:
+        """Return the elements flagged as text by ``is_text``."""
+        return [item for item in self.elements if item.is_text]
+
+    def get_tables(self) -> List[DocumentElement]:
+        """Return the elements of type ``ElementType.TABLE``."""
+        return [item for item in self.elements if item.type == ElementType.TABLE]
+
+    def get_images(self) -> List[DocumentElement]:
+        """Return the elements flagged as visual by ``is_visual``."""
+        return [item for item in self.elements if item.is_visual]
+
+    def extract_text(self, separator: str = "\n") -> str:
+        """Join the non-empty text of every element, in reading order."""
+        candidates = (item.get_text() for item in self.get_reading_order())
+        return separator.join([text for text in candidates if text])
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Serialize the page along with simple per-page element counts."""
+        statistics = {
+            "total_elements": len(self.elements),
+            "text_elements": len(self.get_text_elements()),
+            "tables": len(self.get_tables()),
+            "images": len(self.get_images()),
+        }
+        return {
+            "page_number": self.page_number,
+            "elements": [item.to_dict() for item in self.elements],
+            "dimensions": self.dimensions.to_dict(),
+            "metadata": self.metadata,
+            "statistics": statistics,
+        }
+
+
+@dataclass
+class DocumentMetadata:
+    """Information about the source file and about how it was processed."""
+    filename: str
+    file_type: str
+    file_size: int
+    created_at: datetime
+    processing_track: ProcessingTrack
+    processing_time: float  # seconds
+    language: Optional[str] = None
+    title: Optional[str] = None
+    author: Optional[str] = None
+    subject: Optional[str] = None
+    keywords: Optional[List[str]] = None
+    producer: Optional[str] = None
+    creator: Optional[str] = None
+    creation_date: Optional[datetime] = None
+    modification_date: Optional[datetime] = None
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Serialize the metadata for JSON output.
+
+        Required fields are always present. Optional descriptive fields are
+        added only when they are not ``None``. Datetimes are written as
+        ISO 8601 strings, and the two optional dates are added only when set.
+        """
+        payload = {
+            "filename": self.filename,
+            "file_type": self.file_type,
+            "file_size": self.file_size,
+            "created_at": self.created_at.isoformat(),
+            "processing_track": self.processing_track.value,
+            "processing_time": self.processing_time,
+        }
+
+        # Optional descriptive fields, copied verbatim when provided.
+        descriptive = (
+            "language", "title", "author", "subject",
+            "keywords", "producer", "creator",
+        )
+        for name in descriptive:
+            value = getattr(self, name)
+            if value is not None:
+                payload[name] = value
+
+        # Optional dates, converted to ISO strings when provided.
+        for name in ("creation_date", "modification_date"):
+            stamp = getattr(self, name)
+            if stamp:
+                payload[name] = stamp.isoformat()
+
+        return payload
+
+
+@dataclass
+class UnifiedDocument:
+    """
+    Unified document representation for both OCR and direct extraction tracks.
+
+    Holds the document-level metadata, the list of pages, and any errors
+    recorded during processing. ``to_dict`` produces the unified JSON-ready
+    layout; ``to_legacy_format`` produces the older flat result shape.
+    """
+    document_id: str
+    metadata: DocumentMetadata
+    pages: List[Page]
+    processing_errors: List[Dict[str, Any]] = field(default_factory=list)
+
+    @property
+    def page_count(self) -> int:
+        """Number of pages in the document."""
+        return len(self.pages)
+
+    @property
+    def total_elements(self) -> int:
+        """Number of top-level elements over all pages (children not counted)."""
+        return sum(len(page.elements) for page in self.pages)
+
+    def get_page(self, page_number: int) -> Optional[Page]:
+        """Return the first page whose ``page_number`` matches, else ``None``."""
+        for page in self.pages:
+            if page.page_number == page_number:
+                return page
+        return None
+
+    def extract_all_text(self, page_separator: str = "\n\n") -> str:
+        """Join the text of every page, skipping pages that yield no text."""
+        page_texts = (page.extract_text() for page in self.pages)
+        return page_separator.join([text for text in page_texts if text])
+
+    def get_all_tables(self) -> List[DocumentElement]:
+        """Collect the table elements of every page, in page order."""
+        return [table for page in self.pages for table in page.get_tables()]
+
+    def get_all_images(self) -> List[DocumentElement]:
+        """Collect the visual elements of every page, in page order."""
+        return [image for page in self.pages for image in page.get_images()]
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Convert to dictionary for JSON serialization."""
+        return {
+            "document_id": self.document_id,
+            "metadata": self.metadata.to_dict(),
+            "pages": [page.to_dict() for page in self.pages],
+            "statistics": {
+                "page_count": self.page_count,
+                "total_elements": self.total_elements,
+                "total_tables": len(self.get_all_tables()),
+                "total_images": len(self.get_all_images()),
+            },
+            "processing_errors": self.processing_errors
+        }
+
+    def to_legacy_format(self) -> Dict[str, Any]:
+        """
+        Convert to legacy format for backward compatibility.
+
+        Returns a flat dictionary with three collections built from all
+        pages: ``text_regions`` (text elements only), ``layout_data``
+        (every element) and ``images_metadata`` (visual elements only).
+
+        A missing confidence (``None``) is reported as ``1.0``. A real
+        confidence of ``0.0`` is kept and counts toward
+        ``average_confidence``. ``ocr_dimensions`` comes from the first page
+        only, and ``language`` falls back to ``"ch"`` when unset.
+
+        NOTE(review): ``text_regions`` carries the 1-based page number while
+        ``layout_data`` and ``images_metadata`` carry a 0-based one. This
+        mirrors the original code - confirm against the legacy consumers.
+        """
+        text_regions = []
+        layout_data = []
+        images_metadata = []
+
+        for page in self.pages:
+            page_num = page.page_number
+
+            for element in page.elements:
+                flat_bbox = [element.bbox.x0, element.bbox.y0,
+                             element.bbox.x1, element.bbox.y1]
+
+                if element.is_text:
+                    # Only an absent score defaults to 1.0. Using `or` here
+                    # would also turn a genuine 0.0 confidence into 1.0.
+                    confidence = element.confidence
+                    if confidence is None:
+                        confidence = 1.0
+
+                    # Legacy text region format
+                    text_regions.append({
+                        "page": page_num,
+                        "text": element.get_text(),
+                        "confidence": confidence,
+                        "bbox": {
+                            "x_min": element.bbox.x0,
+                            "y_min": element.bbox.y0,
+                            "x_max": element.bbox.x1,
+                            "y_max": element.bbox.y1
+                        }
+                    })
+
+                # Legacy layout data
+                layout_item = {
+                    "element_id": element.element_id,
+                    "type": element.type.value,
+                    "page": page_num - 1,  # Legacy uses 0-based
+                    "bbox": flat_bbox
+                }
+
+                if element.is_table and isinstance(element.content, TableData):
+                    layout_item["content"] = element.content.to_html()
+                elif element.is_text:
+                    layout_item["content"] = element.get_text()
+
+                layout_data.append(layout_item)
+
+                # Legacy image metadata
+                if element.is_visual:
+                    images_metadata.append({
+                        "element_id": element.element_id,
+                        "type": "image",
+                        "page": page_num - 1,  # Legacy uses 0-based
+                        # Separate list so the two records do not share state.
+                        "bbox": list(flat_bbox)
+                    })
+
+        # Every region carries a numeric confidence at this point, so zero
+        # scores are included in the mean instead of being filtered out.
+        confidences = [region["confidence"] for region in text_regions]
+        avg_confidence = sum(confidences) / len(confidences) if confidences else 0.0
+
+        first_page = self.pages[0] if self.pages else None
+
+        return {
+            "status": "success",
+            "filename": self.metadata.filename,
+            "text_regions": text_regions,
+            "total_text_regions": len(text_regions),
+            "average_confidence": avg_confidence,
+            "processing_time": self.metadata.processing_time,
+            "language": self.metadata.language or "ch",
+            "layout_data": {
+                "elements": layout_data,
+                "total_elements": len(layout_data)
+            },
+            "images_metadata": images_metadata,
+            "ocr_dimensions": {
+                "width": first_page.dimensions.width if first_page else 0,
+                "height": first_page.dimensions.height if first_page else 0
+            },
+            # New fields that won't break existing clients
+            "_unified_format": True,
+            "_processing_track": self.metadata.processing_track.value
+        }
+
+
+class UnifiedDocumentConverter:
+    """Converter utilities for UnifiedDocument format"""
+
+    @staticmethod
+    def from_ocr_result(ocr_result: Dict[str, Any],
+                        document_id: str,
+                        metadata: DocumentMetadata) -> UnifiedDocument:
+        """
+        Convert an OCR result dictionary to UnifiedDocument format.
+
+        When ``ocr_result`` has a ``"pages"`` key, each entry becomes one
+        page, numbered from 1 in list order. Otherwise ``ocr_result`` itself
+        is treated as a single page. For every page:
+
+        * each ``text_regions`` entry becomes a ``TEXT`` element with id
+          ``text_<page>_<index>``;
+        * each ``layout_data["elements"]`` entry becomes an element whose
+          type is chosen by substring match on its ``"type"`` string
+          (table, image/figure, title, list; anything else is ``TEXT``);
+        * ``ocr_dimensions`` supplies the page width and height.
+
+        Keys that are missing or explicitly ``None`` fall back to empty
+        defaults (zero coordinates, empty text, type ``"text"``).
+
+        NOTE(review): a text region's ``bbox`` is assumed to be a mapping
+        with ``x_min``/``y_min``/``x_max``/``y_max`` keys, and a layout
+        ``bbox`` a sequence ``[x0, y0, x1, y1]`` - confirm against the OCR
+        service output. Any ``"page"`` key on a region or layout element is
+        not consulted.
+
+        Args:
+            ocr_result: Result dictionary produced by the OCR track.
+            document_id: Identifier stored on the resulting document.
+            metadata: Document-level metadata, stored unchanged.
+
+        Returns:
+            A ``UnifiedDocument`` with one ``Page`` per page result.
+        """
+        # Handle single page or multi-page results
+        if "pages" in ocr_result:
+            page_results = ocr_result["pages"]
+        else:
+            page_results = [ocr_result]
+
+        pages = []
+        for page_num, page_data in enumerate(page_results, start=1):
+            elements = []
+
+            # Convert text regions
+            for idx, text_region in enumerate(page_data.get("text_regions") or []):
+                bbox_data = text_region.get("bbox") or {}
+                elements.append(DocumentElement(
+                    element_id=f"text_{page_num}_{idx}",
+                    type=ElementType.TEXT,
+                    content=text_region.get("text", ""),
+                    bbox=BoundingBox(
+                        x0=bbox_data.get("x_min", 0),
+                        y0=bbox_data.get("y_min", 0),
+                        x1=bbox_data.get("x_max", 0),
+                        y1=bbox_data.get("y_max", 0)
+                    ),
+                    confidence=text_region.get("confidence")
+                ))
+
+            # Convert layout elements if available
+            layout_data = page_data.get("layout_data") or {}
+            for layout_elem in layout_data.get("elements") or []:
+                # `or "text"` covers an explicit None; str() covers a
+                # non-string label. Lower-cased once for all checks below.
+                layout_type = str(layout_elem.get("type") or "text").lower()
+
+                # Map layout type to ElementType; first matching keyword wins.
+                if "table" in layout_type:
+                    element_type = ElementType.TABLE
+                elif "image" in layout_type or "figure" in layout_type:
+                    element_type = ElementType.IMAGE
+                elif "title" in layout_type:
+                    element_type = ElementType.TITLE
+                elif "list" in layout_type:
+                    element_type = ElementType.LIST
+                else:
+                    element_type = ElementType.TEXT
+
+                # Pad a missing or short bbox with zeros to four coordinates.
+                coords = list(layout_elem.get("bbox") or [])[:4]
+                coords += [0] * (4 - len(coords))
+
+                elements.append(DocumentElement(
+                    element_id=layout_elem.get(
+                        "element_id", f"layout_{page_num}_{len(elements)}"
+                    ),
+                    type=element_type,
+                    content=layout_elem.get("content", ""),
+                    bbox=BoundingBox(
+                        x0=coords[0],
+                        y0=coords[1],
+                        x1=coords[2],
+                        y1=coords[3]
+                    )
+                ))
+
+            # Get page dimensions
+            ocr_dims = page_data.get("ocr_dimensions") or {}
+            dimensions = Dimensions(
+                width=ocr_dims.get("width", 0),
+                height=ocr_dims.get("height", 0)
+            )
+
+            pages.append(Page(
+                page_number=page_num,
+                elements=elements,
+                dimensions=dimensions
+            ))
+
+        return UnifiedDocument(
+            document_id=document_id,
+            metadata=metadata,
+            pages=pages
+        )
+
+    @staticmethod
+    def from_direct_extraction(extraction_result: Dict[str, Any],
+                              document_id: str,
+                              metadata: DocumentMetadata) -> UnifiedDocument:
+        """
+        Convert a direct (PyMuPDF) extraction result to UnifiedDocument format.
+
+        Not implemented yet: ``extraction_result`` is currently ignored and
+        the returned document has no pages.
+
+        Args:
+            extraction_result: Result dictionary from the direct track (unused).
+            document_id: Identifier stored on the resulting document.
+            metadata: Document-level metadata, stored unchanged.
+
+        Returns:
+            A ``UnifiedDocument`` with an empty page list.
+        """
+        # TODO: Build pages from extraction_result.
+        pages = []
+        return UnifiedDocument(
+            document_id=document_id,
+            metadata=metadata,
+            pages=pages
+        )