feat: implement core dual-track processing infrastructure
Added foundation for dual-track document processing: 1. UnifiedDocument Model (backend/app/models/unified_document.py) - Common output format for both OCR and direct extraction - Comprehensive element types (23+ types from PP-StructureV3) - BoundingBox, StyleInfo, TableData structures - Backward compatibility with legacy format 2. DocumentTypeDetector Service (backend/app/services/document_type_detector.py) - Intelligent document type detection using python-magic - PDF editability analysis using PyMuPDF - Processing track recommendation with confidence scores - Support for PDF, images, Office docs, and text files 3. DirectExtractionEngine Service (backend/app/services/direct_extraction_engine.py) - Fast extraction from editable PDFs using PyMuPDF - Preserves fonts, colors, and exact positioning - Native and positional table detection - Image extraction with coordinates - Hyperlink and metadata extraction 4. Dependencies - Added PyMuPDF>=1.23.0 for PDF extraction - Added pdfplumber>=0.10.0 as fallback - Added python-magic-bin>=0.4.14 for file detection Next: Integrate with OCR service for complete dual-track processing 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
694
backend/app/models/unified_document.py
Normal file
694
backend/app/models/unified_document.py
Normal file
@@ -0,0 +1,694 @@
|
||||
"""
|
||||
Unified Document Model for Dual-track Processing
|
||||
|
||||
This module defines the common data structure used by both OCR and direct extraction tracks
|
||||
to ensure consistent output format regardless of processing method.
|
||||
"""
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from typing import List, Dict, Optional, Union, Literal, Any
|
||||
from datetime import datetime
|
||||
from enum import Enum
|
||||
|
||||
|
||||
class ElementType(str, Enum):
    """Document element types supporting all 23 PP-StructureV3 types plus custom ones.

    Inherits from ``str`` so members serialize directly to JSON and compare
    equal to their plain string values.
    """
    # Text elements
    TEXT = "text"
    TITLE = "title"
    HEADER = "header"
    FOOTER = "footer"
    REFERENCE = "reference"
    EQUATION = "equation"
    FOOTNOTE = "footnote"
    CAPTION = "caption"

    # List elements
    LIST = "list"
    LIST_ITEM = "list_item"

    # Table elements
    TABLE = "table"
    TABLE_CELL = "table_cell"
    TABLE_CAPTION = "table_caption"

    # Visual elements
    IMAGE = "image"
    FIGURE = "figure"
    CHART = "chart"
    DIAGRAM = "diagram"

    # Structural elements
    SECTION = "section"
    PARAGRAPH = "paragraph"
    PAGE_NUMBER = "page_number"
    WATERMARK = "watermark"
    HEADER_GROUP = "header_group"
    BODY = "body"

    # Special elements
    CODE = "code"
    FORMULA = "formula"
    SIGNATURE = "signature"
    STAMP = "stamp"
    LOGO = "logo"
    BARCODE = "barcode"
    QR_CODE = "qr_code"
|
||||
|
||||
|
||||
class ProcessingTrack(str, Enum):
    """Processing track used for the document.

    Inherits from ``str`` so the value serializes directly to JSON
    (see ``DocumentMetadata.to_dict`` / ``UnifiedDocument.to_legacy_format``).
    """
    OCR = "ocr"        # PaddleOCR PP-StructureV3 track
    DIRECT = "direct"  # PyMuPDF direct extraction track
    HYBRID = "hybrid"  # Mixed processing (future)
|
||||
|
||||
|
||||
@dataclass
class BoundingBox:
    """Axis-aligned bounding box for a document element.

    (x0, y0) is the top-left corner and (x1, y1) the bottom-right corner,
    following the PDF/image coordinate convention used throughout this module.
    """
    x0: float  # Left coordinate
    y0: float  # Top coordinate
    x1: float  # Right coordinate
    y1: float  # Bottom coordinate

    @property
    def width(self) -> float:
        """Horizontal extent (x1 - x0)."""
        return self.x1 - self.x0

    @property
    def height(self) -> float:
        """Vertical extent (y1 - y0)."""
        return self.y1 - self.y0

    @property
    def center_x(self) -> float:
        """Horizontal midpoint of the box."""
        return (self.x0 + self.x1) / 2

    @property
    def center_y(self) -> float:
        """Vertical midpoint of the box."""
        return (self.y0 + self.y1) / 2

    def to_dict(self) -> Dict[str, float]:
        """Serialize to a plain dict, including the derived width/height."""
        serialized = {"x0": self.x0, "y0": self.y0, "x1": self.x1, "y1": self.y1}
        serialized["width"] = self.width
        serialized["height"] = self.height
        return serialized

    def overlaps(self, other: 'BoundingBox', tolerance: float = 0) -> bool:
        """Check if this bbox overlaps with another (within *tolerance*)."""
        # Two boxes overlap iff their projections overlap on both axes.
        horizontal = self.x1 + tolerance >= other.x0 and other.x1 >= self.x0 - tolerance
        vertical = self.y1 + tolerance >= other.y0 and other.y1 >= self.y0 - tolerance
        return horizontal and vertical

    def contains(self, other: 'BoundingBox', tolerance: float = 0) -> bool:
        """Check if this bbox fully contains another (within *tolerance*)."""
        return all((
            other.x0 >= self.x0 - tolerance,
            other.y0 >= self.y0 - tolerance,
            other.x1 <= self.x1 + tolerance,
            other.y1 <= self.y1 + tolerance,
        ))
|
||||
|
||||
|
||||
@dataclass
class StyleInfo:
    """Style information for text elements.

    All fields are optional; ``to_dict`` emits only the fields that are set.
    """
    font_name: Optional[str] = None
    font_size: Optional[float] = None
    font_weight: Optional[str] = None  # "normal" or "bold"
    font_style: Optional[str] = None   # "normal" or "italic"
    text_color: Optional[int] = None   # RGB packed as a 0xRRGGBB integer
    bg_color: Optional[int] = None     # Background color, same packing
    alignment: Optional[str] = None    # left, center, right, justify

    @property
    def is_bold(self) -> bool:
        """True when the font weight is exactly "bold"."""
        return self.font_weight == "bold"

    @property
    def is_italic(self) -> bool:
        """True when the font style is exactly "italic"."""
        return self.font_style == "italic"

    def get_rgb_color(self) -> Optional[tuple]:
        """Convert the packed integer text color to an (r, g, b) tuple."""
        if self.text_color is None:
            return None
        r = (self.text_color >> 16) & 0xFF
        g = (self.text_color >> 8) & 0xFF
        b = self.text_color & 0xFF
        return (r, g, b)

    def to_dict(self) -> Dict[str, Any]:
        """Serialize set fields to a dict; unset (None) fields are omitted.

        ``text_color`` additionally gets a decoded ``text_color_rgb`` tuple.
        """
        result = {}
        if self.font_name:
            result["font_name"] = self.font_name
        # FIX: use an explicit None check. A font size of 0.0 is falsy and
        # was previously dropped from the serialized form, unlike the
        # colors below which already used "is not None".
        if self.font_size is not None:
            result["font_size"] = self.font_size
        if self.font_weight:
            result["font_weight"] = self.font_weight
        if self.font_style:
            result["font_style"] = self.font_style
        if self.text_color is not None:
            result["text_color"] = self.text_color
            result["text_color_rgb"] = self.get_rgb_color()
        if self.bg_color is not None:
            result["bg_color"] = self.bg_color
        if self.alignment:
            result["alignment"] = self.alignment
        return result
|
||||
|
||||
|
||||
@dataclass
class TableCell:
    """A single table cell, addressed by (row, col) with optional spans."""
    row: int
    col: int
    row_span: int = 1
    col_span: int = 1
    content: str = ""
    bbox: Optional[BoundingBox] = None
    style: Optional[StyleInfo] = None

    def to_dict(self) -> Dict[str, Any]:
        """Serialize to a plain dict; nested objects use their own ``to_dict``."""
        serialized_bbox = None if self.bbox is None else self.bbox.to_dict()
        serialized_style = None if self.style is None else self.style.to_dict()
        return {
            "row": self.row,
            "col": self.col,
            "row_span": self.row_span,
            "col_span": self.col_span,
            "content": self.content,
            "bbox": serialized_bbox,
            "style": serialized_style,
        }
|
||||
|
||||
|
||||
@dataclass
class TableData:
    """Structured table data: a rows x cols grid of TableCell objects."""
    rows: int
    cols: int
    cells: List[TableCell] = field(default_factory=list)
    headers: Optional[List[str]] = None
    caption: Optional[str] = None

    def to_dict(self) -> Dict[str, Any]:
        """Serialize to a plain dict for JSON output."""
        return {
            "rows": self.rows,
            "cols": self.cols,
            "cells": [cell.to_dict() for cell in self.cells],
            "headers": self.headers,
            "caption": self.caption
        }

    def to_html(self) -> str:
        """Convert table to HTML representation.

        The first row is rendered with ``<th>`` when ``headers`` is set.
        NOTE(review): cell content and caption are NOT HTML-escaped; do not
        feed untrusted input to this method without escaping first.
        """
        html = ["<table>"]

        if self.caption:
            html.append(f"<caption>{self.caption}</caption>")

        # Group cells by row index
        rows_data = {}
        for cell in self.cells:
            rows_data.setdefault(cell.row, []).append(cell)

        # Generate one <tr> per declared row (missing rows stay empty)
        for row_idx in range(self.rows):
            html.append("<tr>")
            for cell in sorted(rows_data.get(row_idx, []), key=lambda c: c.col):
                # FIX: build attributes with a leading space only when present;
                # previously span-less cells were emitted as malformed "<td >".
                attrs = ""
                if cell.row_span > 1:
                    attrs += f' rowspan="{cell.row_span}"'
                if cell.col_span > 1:
                    attrs += f' colspan="{cell.col_span}"'
                tag = "th" if row_idx == 0 and self.headers else "td"
                html.append(f"<{tag}{attrs}>{cell.content}</{tag}>")
            html.append("</tr>")

        html.append("</table>")
        return "\n".join(html)
|
||||
|
||||
|
||||
@dataclass
class DocumentElement:
    """Individual document element (text, image, table, etc.)."""
    element_id: str
    type: ElementType
    content: Union[str, TableData, bytes, Dict[str, Any]]
    bbox: BoundingBox
    confidence: Optional[float] = None  # OCR confidence (0-1)
    style: Optional[StyleInfo] = None
    metadata: Dict[str, Any] = field(default_factory=dict)
    children: List['DocumentElement'] = field(default_factory=list)

    @property
    def is_text(self) -> bool:
        """True for elements whose content is primarily text."""
        return self.type in (
            ElementType.TEXT,
            ElementType.TITLE,
            ElementType.HEADER,
            ElementType.FOOTER,
            ElementType.CAPTION,
            ElementType.PARAGRAPH,
        )

    @property
    def is_visual(self) -> bool:
        """True for image-like (visual) elements."""
        return self.type in (
            ElementType.IMAGE,
            ElementType.FIGURE,
            ElementType.CHART,
            ElementType.DIAGRAM,
            ElementType.LOGO,
        )

    @property
    def is_table(self) -> bool:
        """True for table-related elements."""
        return self.type in (ElementType.TABLE, ElementType.TABLE_CELL)

    def get_text(self) -> str:
        """Extract the textual content of this element ("" when non-textual)."""
        content = self.content
        if isinstance(content, str):
            return content
        if isinstance(content, TableData):
            # Concatenate the text of all non-empty table cells
            return " ".join(cell.content for cell in content.cells if cell.content)
        if isinstance(content, dict) and "text" in content:
            return content["text"]
        return ""

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary for JSON serialization."""
        result = {
            "element_id": self.element_id,
            "type": self.type.value,
            "bbox": self.bbox.to_dict(),
        }

        # Serialize the content according to its runtime type
        content = self.content
        if isinstance(content, str):
            result["content"] = content
        elif isinstance(content, TableData):
            result["content"] = content.to_dict()
            result["content_type"] = "table"
        elif isinstance(content, bytes):
            # Binary payloads are not embedded; only their size is recorded
            result["content_type"] = "binary"
            result["content_length"] = len(content)
        elif isinstance(content, dict):
            result["content"] = content

        if self.confidence is not None:
            result["confidence"] = self.confidence
        if self.style is not None:
            result["style"] = self.style.to_dict()
        if self.metadata:
            result["metadata"] = self.metadata
        if self.children:
            result["children"] = [child.to_dict() for child in self.children]

        return result
|
||||
|
||||
|
||||
@dataclass
class Dimensions:
    """Page or image dimensions, with an optional DPI value."""
    width: float
    height: float
    dpi: Optional[int] = None

    def to_dict(self) -> Dict[str, Any]:
        """Serialize to a dict; ``dpi`` is included only when set and non-zero."""
        serialized = {"width": self.width, "height": self.height}
        if self.dpi:
            serialized["dpi"] = self.dpi
        return serialized
|
||||
|
||||
|
||||
@dataclass
class Page:
    """Single page in a document."""
    page_number: int  # 1-based page number
    elements: List[DocumentElement]
    dimensions: Dimensions
    metadata: Dict[str, Any] = field(default_factory=dict)

    def get_reading_order(self) -> List[DocumentElement]:
        """Elements sorted top-to-bottom, then left-to-right."""
        def position(element: DocumentElement):
            return element.bbox.y0, element.bbox.x0
        return sorted(self.elements, key=position)

    def get_elements_by_type(self, element_type: ElementType) -> List[DocumentElement]:
        """All elements whose type equals *element_type*."""
        return [elem for elem in self.elements if elem.type == element_type]

    def get_text_elements(self) -> List[DocumentElement]:
        """All text-bearing elements on the page."""
        return [elem for elem in self.elements if elem.is_text]

    def get_tables(self) -> List[DocumentElement]:
        """All table elements on the page."""
        return self.get_elements_by_type(ElementType.TABLE)

    def get_images(self) -> List[DocumentElement]:
        """All visual (image-like) elements on the page."""
        return [elem for elem in self.elements if elem.is_visual]

    def extract_text(self, separator: str = "\n") -> str:
        """Join the text of all elements, in reading order, with *separator*."""
        fragments = [elem.get_text() for elem in self.get_reading_order()]
        return separator.join(fragment for fragment in fragments if fragment)

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary for JSON serialization."""
        statistics = {
            "total_elements": len(self.elements),
            "text_elements": len(self.get_text_elements()),
            "tables": len(self.get_tables()),
            "images": len(self.get_images()),
        }
        return {
            "page_number": self.page_number,
            "elements": [elem.to_dict() for elem in self.elements],
            "dimensions": self.dimensions.to_dict(),
            "metadata": self.metadata,
            "statistics": statistics,
        }
|
||||
|
||||
|
||||
@dataclass
class DocumentMetadata:
    """Document-level metadata for a processed document."""
    filename: str
    file_type: str
    file_size: int                      # bytes
    created_at: datetime                # when processing started
    processing_track: ProcessingTrack
    processing_time: float              # seconds
    language: Optional[str] = None
    title: Optional[str] = None
    author: Optional[str] = None
    subject: Optional[str] = None
    keywords: Optional[List[str]] = None
    producer: Optional[str] = None
    creator: Optional[str] = None
    creation_date: Optional[datetime] = None
    modification_date: Optional[datetime] = None

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary for JSON serialization.

        Required fields are always present; optional fields are emitted only
        when set. Datetimes are serialized as ISO-8601 strings.
        """
        result = {
            "filename": self.filename,
            "file_type": self.file_type,
            "file_size": self.file_size,
            "created_at": self.created_at.isoformat(),
            "processing_track": self.processing_track.value,
            "processing_time": self.processing_time,
        }

        # Add optional fields if present.
        # FIX: the loop variable was previously named ``field``, shadowing
        # ``dataclasses.field`` imported at module level.
        optional_fields = [
            "language", "title", "author", "subject",
            "keywords", "producer", "creator"
        ]
        for attr_name in optional_fields:
            value = getattr(self, attr_name)
            if value is not None:
                result[attr_name] = value

        if self.creation_date:
            result["creation_date"] = self.creation_date.isoformat()
        if self.modification_date:
            result["modification_date"] = self.modification_date.isoformat()

        return result
|
||||
|
||||
|
||||
@dataclass
class UnifiedDocument:
    """
    Unified document representation for both OCR and direct extraction tracks.

    This is the primary output format that ensures consistency across different
    processing methods and enables seamless downstream processing.
    """
    document_id: str
    metadata: DocumentMetadata
    pages: List[Page]
    # Non-fatal errors collected during processing ({"error": ..., "type": ...})
    processing_errors: List[Dict[str, Any]] = field(default_factory=list)

    @property
    def page_count(self) -> int:
        # Number of pages in the document
        return len(self.pages)

    @property
    def total_elements(self) -> int:
        # Total element count across all pages
        return sum(len(page.elements) for page in self.pages)

    def get_page(self, page_number: int) -> Optional[Page]:
        """Get page by number (1-based); returns None when not found."""
        for page in self.pages:
            if page.page_number == page_number:
                return page
        return None

    def extract_all_text(self, page_separator: str = "\n\n") -> str:
        """Extract all text from the document, pages joined by *page_separator*."""
        texts = []
        for page in self.pages:
            page_text = page.extract_text()
            if page_text:
                texts.append(page_text)
        return page_separator.join(texts)

    def get_all_tables(self) -> List[DocumentElement]:
        """Get all tables from all pages"""
        tables = []
        for page in self.pages:
            tables.extend(page.get_tables())
        return tables

    def get_all_images(self) -> List[DocumentElement]:
        """Get all images from all pages"""
        images = []
        for page in self.pages:
            images.extend(page.get_images())
        return images

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary for JSON serialization"""
        return {
            "document_id": self.document_id,
            "metadata": self.metadata.to_dict(),
            "pages": [page.to_dict() for page in self.pages],
            "statistics": {
                "page_count": self.page_count,
                "total_elements": self.total_elements,
                "total_tables": len(self.get_all_tables()),
                "total_images": len(self.get_all_images()),
            },
            "processing_errors": self.processing_errors
        }

    def to_legacy_format(self) -> Dict[str, Any]:
        """
        Convert to legacy format for backward compatibility.

        This ensures existing API clients continue to work while we transition
        to the new unified format.

        Legacy conventions (deliberately preserved):
        - ``text_regions`` use 1-based page numbers; ``layout_data`` and
          ``images_metadata`` use 0-based page numbers.
        - Missing confidences default to 1.0 in text regions.
        """
        # Extract text regions in legacy format
        text_regions = []
        layout_data = []
        images_metadata = []

        for page in self.pages:
            page_num = page.page_number

            for element in page.elements:
                if element.is_text:
                    # Legacy text region format
                    text_regions.append({
                        "page": page_num,
                        "text": element.get_text(),
                        "confidence": element.confidence or 1.0,
                        "bbox": {
                            "x_min": element.bbox.x0,
                            "y_min": element.bbox.y0,
                            "x_max": element.bbox.x1,
                            "y_max": element.bbox.y1
                        }
                    })

                # Legacy layout data (every element, regardless of type)
                layout_item = {
                    "element_id": element.element_id,
                    "type": element.type.value,
                    "page": page_num - 1,  # Legacy uses 0-based
                    "bbox": [element.bbox.x0, element.bbox.y0,
                             element.bbox.x1, element.bbox.y1]
                }

                # Tables are rendered to HTML; plain text elements carry text
                if element.is_table and isinstance(element.content, TableData):
                    layout_item["content"] = element.content.to_html()
                elif element.is_text:
                    layout_item["content"] = element.get_text()

                layout_data.append(layout_item)

                # Legacy image metadata
                if element.is_visual:
                    images_metadata.append({
                        "element_id": element.element_id,
                        "type": "image",
                        "page": page_num - 1,  # Legacy uses 0-based
                        "bbox": [element.bbox.x0, element.bbox.y0,
                                 element.bbox.x1, element.bbox.y1]
                    })

        # Calculate average confidence
        # NOTE(review): regions with confidence 0 are excluded from the
        # average by the truthiness filter below -- confirm this is intended.
        confidences = [r["confidence"] for r in text_regions if r.get("confidence")]
        avg_confidence = sum(confidences) / len(confidences) if confidences else 0.0

        return {
            "status": "success",
            "filename": self.metadata.filename,
            "text_regions": text_regions,
            "total_text_regions": len(text_regions),
            "average_confidence": avg_confidence,
            "processing_time": self.metadata.processing_time,
            "language": self.metadata.language or "ch",
            "layout_data": {
                "elements": layout_data,
                "total_elements": len(layout_data)
            },
            "images_metadata": images_metadata,
            # Legacy clients expect a single page size; the first page's is used
            "ocr_dimensions": {
                "width": self.pages[0].dimensions.width if self.pages else 0,
                "height": self.pages[0].dimensions.height if self.pages else 0
            },
            # New fields that won't break existing clients
            "_unified_format": True,
            "_processing_track": self.metadata.processing_track.value
        }
|
||||
|
||||
|
||||
class UnifiedDocumentConverter:
    """Converter utilities for UnifiedDocument format"""

    @staticmethod
    def from_ocr_result(ocr_result: Dict[str, Any],
                        document_id: str,
                        metadata: DocumentMetadata) -> UnifiedDocument:
        """
        Convert PaddleOCR result to UnifiedDocument format.

        This handles the conversion from PP-StructureV3 output to our unified format.
        Accepts either a multi-page result (``{"pages": [...]}``) or a single
        page dict. Text regions and layout elements are converted
        independently, so an element present in both may appear twice.
        """
        pages = []

        # Handle single page or multi-page results
        if "pages" in ocr_result:
            page_results = ocr_result["pages"]
        else:
            page_results = [ocr_result]

        for page_idx, page_data in enumerate(page_results):
            page_num = page_idx + 1  # pages are numbered from 1
            elements = []

            # Convert text regions
            for idx, text_region in enumerate(page_data.get("text_regions", [])):
                bbox_data = text_region.get("bbox", {})
                element = DocumentElement(
                    element_id=f"text_{page_num}_{idx}",
                    type=ElementType.TEXT,
                    content=text_region.get("text", ""),
                    bbox=BoundingBox(
                        x0=bbox_data.get("x_min", 0),
                        y0=bbox_data.get("y_min", 0),
                        x1=bbox_data.get("x_max", 0),
                        y1=bbox_data.get("y_max", 0)
                    ),
                    confidence=text_region.get("confidence")
                )
                elements.append(element)

            # Convert layout elements if available
            if "layout_data" in page_data and page_data["layout_data"]:
                for layout_elem in page_data["layout_data"].get("elements", []):
                    # Map layout type to ElementType by substring match;
                    # unrecognized types fall back to TEXT
                    layout_type = layout_elem.get("type", "text")
                    element_type = ElementType.TEXT  # Default

                    if "table" in layout_type.lower():
                        element_type = ElementType.TABLE
                    elif "image" in layout_type.lower() or "figure" in layout_type.lower():
                        element_type = ElementType.IMAGE
                    elif "title" in layout_type.lower():
                        element_type = ElementType.TITLE
                    elif "list" in layout_type.lower():
                        element_type = ElementType.LIST

                    # Create element (bbox defaults to zeros for short lists)
                    bbox_list = layout_elem.get("bbox", [0, 0, 0, 0])
                    element = DocumentElement(
                        element_id=layout_elem.get("element_id", f"layout_{page_num}_{len(elements)}"),
                        type=element_type,
                        content=layout_elem.get("content", ""),
                        bbox=BoundingBox(
                            x0=bbox_list[0] if len(bbox_list) > 0 else 0,
                            y0=bbox_list[1] if len(bbox_list) > 1 else 0,
                            x1=bbox_list[2] if len(bbox_list) > 2 else 0,
                            y1=bbox_list[3] if len(bbox_list) > 3 else 0
                        )
                    )
                    elements.append(element)

            # Get page dimensions
            ocr_dims = page_data.get("ocr_dimensions", {})
            dimensions = Dimensions(
                width=ocr_dims.get("width", 0),
                height=ocr_dims.get("height", 0)
            )

            pages.append(Page(
                page_number=page_num,
                elements=elements,
                dimensions=dimensions
            ))

        return UnifiedDocument(
            document_id=document_id,
            metadata=metadata,
            pages=pages
        )

    @staticmethod
    def from_direct_extraction(extraction_result: Dict[str, Any],
                               document_id: str,
                               metadata: DocumentMetadata) -> UnifiedDocument:
        """
        Convert PyMuPDF extraction result to UnifiedDocument format.

        This will be implemented when we create the DirectExtractionEngine.
        Currently returns an empty document (no pages) carrying only the
        supplied id and metadata.
        """
        # TODO: Implement when DirectExtractionEngine is created
        pages = []
        return UnifiedDocument(
            document_id=document_id,
            metadata=metadata,
            pages=pages
        )
|
||||
633
backend/app/services/direct_extraction_engine.py
Normal file
633
backend/app/services/direct_extraction_engine.py
Normal file
@@ -0,0 +1,633 @@
|
||||
"""
|
||||
Direct Extraction Engine using PyMuPDF
|
||||
|
||||
Handles direct text and structure extraction from editable PDFs without OCR.
|
||||
This provides much faster processing and perfect accuracy for documents with
|
||||
extractable text.
|
||||
"""
|
||||
|
||||
import os
|
||||
import logging
|
||||
import fitz # PyMuPDF
|
||||
import uuid
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Optional, Tuple, Any, Union
|
||||
from datetime import datetime
|
||||
import re
|
||||
|
||||
from ..models.unified_document import (
|
||||
UnifiedDocument, DocumentElement, Page, DocumentMetadata,
|
||||
BoundingBox, StyleInfo, TableData, TableCell, Dimensions,
|
||||
ElementType, ProcessingTrack
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class DirectExtractionEngine:
|
||||
"""
|
||||
Engine for direct text extraction from editable PDFs using PyMuPDF.
|
||||
|
||||
This engine provides:
|
||||
- Fast text extraction with exact positioning
|
||||
- Font and style information preservation
|
||||
- Table structure detection
|
||||
- Image extraction with coordinates
|
||||
- Hyperlink and annotation extraction
|
||||
"""
|
||||
|
||||
    def __init__(self,
                 enable_table_detection: bool = True,
                 enable_image_extraction: bool = True,
                 min_table_rows: int = 2,
                 min_table_cols: int = 2):
        """
        Initialize the extraction engine.

        Args:
            enable_table_detection: Whether to detect and extract tables
            enable_image_extraction: Whether to extract images
            min_table_rows: Minimum rows for table detection
            min_table_cols: Minimum columns for table detection
        """
        # Feature toggles consulted per page in _extract_page
        self.enable_table_detection = enable_table_detection
        self.enable_image_extraction = enable_image_extraction
        # Thresholds for the positional (fallback) table detector
        self.min_table_rows = min_table_rows
        self.min_table_cols = min_table_cols
|
||||
|
||||
    def extract(self,
                file_path: Path,
                output_dir: Optional[Path] = None) -> UnifiedDocument:
        """
        Extract content from PDF file to UnifiedDocument format.

        Args:
            file_path: Path to PDF file
            output_dir: Optional directory to save extracted images

        Returns:
            UnifiedDocument with extracted content. On failure, a partial
            document is returned whose ``processing_errors`` records the
            exception instead of raising.
        """
        start_time = datetime.now()
        document_id = str(uuid.uuid4())

        try:
            doc = fitz.open(str(file_path))

            # Extract document metadata
            metadata = self._extract_metadata(file_path, doc, start_time)

            # Extract pages (page numbers are 1-based in the output)
            pages = []
            for page_num in range(len(doc)):
                logger.info(f"Extracting page {page_num + 1}/{len(doc)}")
                page = self._extract_page(
                    doc[page_num],
                    page_num + 1,
                    document_id,
                    output_dir
                )
                pages.append(page)

            doc.close()

            # Calculate processing time
            processing_time = (datetime.now() - start_time).total_seconds()
            metadata.processing_time = processing_time

            logger.info(f"Direct extraction completed in {processing_time:.2f}s")

            return UnifiedDocument(
                document_id=document_id,
                metadata=metadata,
                pages=pages
            )

        except Exception as e:
            logger.error(f"Error during direct extraction: {e}")
            # Return partial result with error information
            processing_time = (datetime.now() - start_time).total_seconds()

            # ``metadata``/``pages`` may not exist if the failure happened
            # before they were assigned; synthesize minimal placeholders.
            if 'metadata' not in locals():
                metadata = DocumentMetadata(
                    filename=file_path.name,
                    file_type="pdf",
                    file_size=file_path.stat().st_size if file_path.exists() else 0,
                    created_at=datetime.now(),
                    processing_track=ProcessingTrack.DIRECT,
                    processing_time=processing_time
                )

            return UnifiedDocument(
                document_id=document_id,
                metadata=metadata,
                pages=pages if 'pages' in locals() else [],
                processing_errors=[{
                    "error": str(e),
                    "type": type(e).__name__
                }]
            )
|
||||
|
||||
    def _extract_metadata(self,
                          file_path: Path,
                          doc: fitz.Document,
                          start_time: datetime) -> DocumentMetadata:
        """Build DocumentMetadata from the file and the PDF's own metadata.

        ``processing_time`` is initialized to 0.0 and overwritten by the
        caller (``extract``) once processing finishes.
        """
        pdf_metadata = doc.metadata

        return DocumentMetadata(
            filename=file_path.name,
            file_type="pdf",
            file_size=file_path.stat().st_size,
            created_at=start_time,
            processing_track=ProcessingTrack.DIRECT,
            processing_time=0.0,  # Will be updated later
            title=pdf_metadata.get("title"),
            author=pdf_metadata.get("author"),
            subject=pdf_metadata.get("subject"),
            # NOTE(review): split keywords keep surrounding whitespace
            # (e.g. "a, b" -> ["a", " b"]) -- confirm downstream expects this
            keywords=pdf_metadata.get("keywords", "").split(",") if pdf_metadata.get("keywords") else None,
            producer=pdf_metadata.get("producer"),
            creator=pdf_metadata.get("creator"),
            creation_date=self._parse_pdf_date(pdf_metadata.get("creationDate")),
            modification_date=self._parse_pdf_date(pdf_metadata.get("modDate"))
        )
|
||||
|
||||
def _parse_pdf_date(self, date_str: str) -> Optional[datetime]:
|
||||
"""Parse PDF date string to datetime"""
|
||||
if not date_str:
|
||||
return None
|
||||
|
||||
try:
|
||||
# PDF date format: D:YYYYMMDDHHmmSSOHH'mm
|
||||
# Example: D:20240101120000+09'00
|
||||
if date_str.startswith("D:"):
|
||||
date_str = date_str[2:]
|
||||
|
||||
# Extract just the date/time part (first 14 characters)
|
||||
if len(date_str) >= 14:
|
||||
date_part = date_str[:14]
|
||||
return datetime.strptime(date_part, "%Y%m%d%H%M%S")
|
||||
except:
|
||||
pass
|
||||
|
||||
return None
|
||||
|
||||
    def _extract_page(self,
                      page: fitz.Page,
                      page_num: int,
                      document_id: str,
                      output_dir: Optional[Path]) -> Page:
        """Extract text, tables, images and links from a single PDF page.

        Args:
            page: The PyMuPDF page object.
            page_num: 1-based page number used in element ids and the output.
            document_id: Parent document id (forwarded to image extraction).
            output_dir: Optional directory for saving extracted images.

        Returns:
            A Page whose metadata records drawing and link counts.
        """
        elements = []
        element_counter = 0  # running index used to build unique element ids

        # Get page dimensions
        rect = page.rect
        dimensions = Dimensions(
            width=rect.width,
            height=rect.height,
            dpi=72  # PDF standard DPI
        )

        # Extract text blocks with formatting ("dict" mode keeps spans/styles)
        text_dict = page.get_text("dict")
        for block_idx, block in enumerate(text_dict.get("blocks", [])):
            if block.get("type") == 0:  # Text block (type 1 would be an image)
                element = self._process_text_block(
                    block, page_num, element_counter
                )
                if element:
                    elements.append(element)
                    element_counter += 1

        # Extract tables (if enabled)
        if self.enable_table_detection:
            try:
                # Try native table detection (PyMuPDF 1.23.0+)
                tables = page.find_tables()
                for table_idx, table in enumerate(tables):
                    element = self._process_native_table(
                        table, page_num, element_counter
                    )
                    if element:
                        elements.append(element)
                        element_counter += 1
            except AttributeError:
                # Older PyMuPDF without find_tables():
                # fall back to positional table detection
                logger.debug("Native table detection not available, using positional detection")
                table_elements = self._detect_tables_by_position(page, page_num, element_counter)
                elements.extend(table_elements)
                element_counter += len(table_elements)

        # Extract images (if enabled)
        if self.enable_image_extraction:
            image_elements = self._extract_images(
                page, page_num, document_id, element_counter, output_dir
            )
            elements.extend(image_elements)
            element_counter += len(image_elements)

        # Extract hyperlinks
        links = page.get_links()
        for link_idx, link in enumerate(links):
            # Create link annotation element if it has URI
            # (internal GoTo links without a "uri" key are skipped)
            if link.get("uri"):
                from_rect = link.get("from")  # clickable area on the page
                if from_rect:
                    element = DocumentElement(
                        element_id=f"link_{page_num}_{element_counter}",
                        type=ElementType.REFERENCE,
                        content={"uri": link["uri"], "type": "hyperlink"},
                        bbox=BoundingBox(
                            x0=from_rect.x0,
                            y0=from_rect.y0,
                            x1=from_rect.x1,
                            y1=from_rect.y1
                        ),
                        metadata={"link_type": "external" if link["uri"].startswith("http") else "internal"}
                    )
                    elements.append(element)
                    element_counter += 1

        # Extract vector graphics (recorded as page metadata only,
        # not as document elements)
        drawings = page.get_drawings()
        if drawings:
            logger.debug(f"Page {page_num} contains {len(drawings)} vector drawing commands")

        return Page(
            page_number=page_num,
            elements=elements,
            dimensions=dimensions,
            metadata={
                "has_drawings": len(drawings) > 0,
                "drawing_count": len(drawings),
                "link_count": len(links)
            }
        )
|
||||
|
||||
def _process_text_block(self, block: Dict, page_num: int, counter: int) -> Optional[DocumentElement]:
    """Convert a PyMuPDF text block dict into a DocumentElement.

    Collects the text and per-span styling from every line/span in the
    block, infers a semantic element type from content and font size,
    and returns None when the block contains no text at all. Direct
    extraction is lossless, so the element carries confidence 1.0.
    """
    # Block-level bounding box straight from PyMuPDF's dict output.
    bd = block.get("bbox", [0, 0, 0, 0])
    block_bbox = BoundingBox(x0=bd[0], y0=bd[1], x1=bd[2], y1=bd[3])

    fragments = []
    span_styles = []

    for line in block.get("lines", []):
        for span in line.get("spans", []):
            span_text = span.get("text", "")
            if not span_text:
                continue
            fragments.append(span_text)

            # PyMuPDF span flags: bit 4 = bold, bit 1 = italic.
            flags = span.get("flags", 0)
            span_styles.append(StyleInfo(
                font_name=span.get("font"),
                font_size=span.get("size"),
                font_weight="bold" if flags & (1 << 4) else "normal",
                font_style="italic" if flags & (1 << 1) else "normal",
                text_color=span.get("color"),
            ))

    if not fragments:
        return None

    merged_text = "".join(fragments)
    inferred_type = self._infer_element_type(merged_text, span_styles)

    # Represent the whole block with its first span's style; a smarter
    # merge (e.g. majority vote) could be added later.
    representative_style = span_styles[0] if span_styles else None

    return DocumentElement(
        element_id=f"text_{page_num}_{counter}",
        type=inferred_type,
        content=merged_text,
        bbox=block_bbox,
        style=representative_style,
        confidence=1.0,  # direct extraction has perfect confidence
    )
def _infer_element_type(self, text: str, styles: List[StyleInfo]) -> ElementType:
    """Heuristically classify a text block into a semantic element type.

    Uses average font size for title/header detection, then regex
    patterns for list items, page numbers and footnotes, and finally
    falls back to PARAGRAPH/TEXT based on text length.

    Args:
        text: The merged text content of the block.
        styles: Per-span style information collected for the block.

    Returns:
        The inferred ElementType.
    """
    text_lower = text.lower().strip()

    # Short text rendered in a large font is likely a heading.
    if len(text_lower) < 100 and styles:
        avg_size = sum(s.font_size or 12 for s in styles) / len(styles)
        if avg_size > 16:
            return ElementType.TITLE
        elif avg_size > 14:
            return ElementType.HEADER

    # Bullet / numbered list markers followed by whitespace.
    if re.match(r'^[\d•·▪▫◦‣⁃]\s', text_lower):
        return ElementType.LIST_ITEM

    # Standalone page numbers: "page 3", "3", "- 3 -".
    if re.match(r'^page\s+\d+|^\d+\s*$|^-\s*\d+\s*-$', text_lower):
        return ElementType.PAGE_NUMBER

    # Footnote markers: "[1] ..." or "1) ...".
    # BUG FIX: the previous pattern r'^[\[\d+\]]' was a character class
    # matching any single leading digit (or '[', '+', ']'), so ordinary
    # text such as "1.2 results" was misclassified as a footnote.
    if re.match(r'^\[\d+\]|^\d+\)', text_lower):
        return ElementType.FOOTNOTE

    # Default: longer runs of text are paragraphs, short ones plain text.
    return ElementType.PARAGRAPH if len(text) > 150 else ElementType.TEXT
def _process_native_table(self, table, page_num: int, counter: int) -> Optional[DocumentElement]:
    """Build a table DocumentElement from a PyMuPDF native table object.

    Returns None when the table is empty, has fewer rows than
    ``min_table_rows``, or extraction raises. Native detection is
    structural, so the element carries confidence 1.0.
    """
    try:
        grid = table.extract()
        if not grid or len(grid) < self.min_table_rows:
            return None

        tb = table.bbox
        table_bbox = BoundingBox(x0=tb[0], y0=tb[1], x1=tb[2], y1=tb[3])

        # Only materialise non-empty cells.
        cell_objects = [
            TableCell(row=r, col=c, content=str(value) if value else "")
            for r, row_values in enumerate(grid)
            for c, value in enumerate(row_values)
            if value
        ]

        table_data = TableData(
            rows=len(grid),
            cols=max(len(row_values) for row_values in grid) if grid else 0,
            cells=cell_objects,
            headers=grid[0] if grid else None,  # assume first row holds headers
        )

        return DocumentElement(
            element_id=f"table_{page_num}_{counter}",
            type=ElementType.TABLE,
            content=table_data,
            bbox=table_bbox,
            confidence=1.0,
        )

    except Exception as e:
        logger.error(f"Error processing native table: {e}")
        return None
def _detect_tables_by_position(self, page: fitz.Page, page_num: int, counter: int) -> List[DocumentElement]:
    """Detect tables by clustering word positions on the page.

    Fallback used when PyMuPDF's native table detection is unavailable.
    Words are bucketed into rows by rounded y-coordinate; consecutive
    rows that have at least ``min_table_cols`` regularly spaced words
    are treated as table rows. Detected tables carry confidence 0.8
    because this is a heuristic, not a structural detection.

    Args:
        page: Source page.
        page_num: 0-based page index (used in element ids).
        counter: Running element counter for unique element ids.

    Returns:
        List of table DocumentElements (possibly empty).
    """
    tables: List[DocumentElement] = []

    # Each word is (x0, y0, x1, y1, "word", block_no, line_no, word_no).
    words = page.get_text("words")
    if not words:
        return tables

    # Bucket words into rows, snapping y to a 5-point grid to absorb
    # small baseline jitter.
    rows: Dict[int, List[Dict]] = {}
    for word in words:
        y_key = round(word[1] / 5) * 5
        rows.setdefault(y_key, []).append({
            'x0': word[0],
            'y0': word[1],
            'x1': word[2],
            'y1': word[3],
            'text': word[4],
            'block': word[5] if len(word) > 5 else 0,
        })

    sorted_rows = sorted(rows.items(), key=lambda item: item[0])

    # Collect runs of consecutive table-like rows.
    current_table_rows = []
    tables_found = []

    def _flush_run():
        """Close the current run if it is tall enough to be a table."""
        nonlocal current_table_rows
        if len(current_table_rows) >= self.min_table_rows:
            tables_found.append(current_table_rows)
        current_table_rows = []

    for y_key, row_words in sorted_rows:
        row_words.sort(key=lambda w: w['x0'])
        if (len(row_words) >= self.min_table_cols
                and self._has_regular_spacing([w['x0'] for w in row_words])):
            current_table_rows.append((y_key, row_words))
        else:
            _flush_run()
    _flush_run()  # don't forget a table ending at the last row

    # Convert each detected run of rows into a table element.
    for table_idx, table_rows in enumerate(tables_found):
        if not table_rows:
            continue

        all_words = [w for _, row_words in table_rows for w in row_words]
        bbox = BoundingBox(
            x0=min(w['x0'] for w in all_words),
            y0=min(w['y0'] for w in all_words),
            x1=max(w['x1'] for w in all_words),
            y1=max(w['y1'] for w in all_words),
        )

        # PERF FIX: compute the column split once per row. The original
        # re-ran _group_into_columns a second time for every row just to
        # count columns, even though the shared column grid makes that
        # count identical across rows.
        row_columns = [self._group_into_columns(row_words, table_rows)
                       for _, row_words in table_rows]

        cells = [
            TableCell(row=row_idx, col=col_idx, content=col_text)
            for row_idx, columns in enumerate(row_columns)
            for col_idx, col_text in enumerate(columns)
            if col_text
        ]

        table_data = TableData(
            rows=len(table_rows),
            cols=max(len(columns) for columns in row_columns),
            cells=cells,
        )

        tables.append(DocumentElement(
            element_id=f"table_{page_num}_{counter + table_idx}",
            type=ElementType.TABLE,
            content=table_data,
            bbox=bbox,
            confidence=0.8,  # lower confidence for positional detection
            metadata={"detection_method": "positional"},
        ))

    return tables
def _has_regular_spacing(self, x_positions: List[float], tolerance: float = 0.3) -> bool:
|
||||
"""Check if x positions have somewhat regular spacing"""
|
||||
if len(x_positions) < 3:
|
||||
return False
|
||||
|
||||
spacings = [x_positions[i+1] - x_positions[i] for i in range(len(x_positions)-1)]
|
||||
avg_spacing = sum(spacings) / len(spacings)
|
||||
|
||||
# Check if spacings are within tolerance of average
|
||||
for spacing in spacings:
|
||||
if abs(spacing - avg_spacing) > avg_spacing * tolerance:
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
def _group_into_columns(self, words: List[Dict], all_rows: List) -> List[str]:
    """Assign one row's words to table columns.

    Column anchor positions are clustered from the x-coordinates of
    every row in the table, so all rows share the same column grid;
    each word is appended to the column whose anchor is nearest.
    """
    if not words:
        return []

    # Collect left edges across the whole table, then cluster them
    # into shared column anchor positions.
    table_x_edges = [w['x0'] for _, row_words in all_rows for w in row_words]
    anchors = self._cluster_positions(table_x_edges)

    columns = [""] * len(anchors)
    for word in words:
        # First nearest anchor wins on ties, matching strict-< search.
        nearest = min(range(len(anchors)),
                      key=lambda i: abs(word['x0'] - anchors[i]))
        if columns[nearest]:
            columns[nearest] += " " + word['text']
        else:
            columns[nearest] = word['text']

    return columns
def _cluster_positions(self, positions: List[float], threshold: float = 20) -> List[float]:
|
||||
"""Cluster positions to find common columns"""
|
||||
if not positions:
|
||||
return []
|
||||
|
||||
sorted_pos = sorted(positions)
|
||||
clusters = [[sorted_pos[0]]]
|
||||
|
||||
for pos in sorted_pos[1:]:
|
||||
# Check if position belongs to current cluster
|
||||
if pos - clusters[-1][-1] < threshold:
|
||||
clusters[-1].append(pos)
|
||||
else:
|
||||
clusters.append([pos])
|
||||
|
||||
# Return average position of each cluster
|
||||
return [sum(cluster) / len(cluster) for cluster in clusters]
|
||||
|
||||
def _extract_images(self,
                    page: fitz.Page,
                    page_num: int,
                    document_id: str,
                    counter: int,
                    output_dir: Optional[Path]) -> List[DocumentElement]:
    """Extract embedded raster images from a page.

    Each image becomes an IMAGE DocumentElement positioned at its first
    placement rectangle on the page. When ``output_dir`` is given, the
    image is also written to disk as PNG and the saved path is recorded
    in the element content. Failures on individual images are logged
    and skipped so one bad image cannot abort the whole page.

    Args:
        page: Source page.
        page_num: 0-based page index (used in element ids / filenames).
        document_id: Id prefix for saved image filenames.
        counter: Running element counter for unique element ids.
        output_dir: Directory to save images into, or None to skip saving.

    Returns:
        List of image DocumentElements (possibly empty).
    """
    elements = []
    image_list = page.get_images()

    for img_idx, img in enumerate(image_list):
        try:
            xref = img[0]

            # An image may be placed multiple times on a page; use the
            # first placement for the bounding box. Skip unplaced images.
            img_rects = page.get_image_rects(xref)
            if not img_rects:
                continue

            rect = img_rects[0]
            bbox = BoundingBox(
                x0=rect.x0,
                y0=rect.y0,
                x1=rect.x1,
                y1=rect.y1
            )

            pix = fitz.Pixmap(page.parent, xref)
            image_data = {
                "width": pix.width,
                "height": pix.height,
                "colorspace": pix.colorspace.name if pix.colorspace else "unknown",
                "xref": xref
            }

            if output_dir:
                output_dir.mkdir(parents=True, exist_ok=True)
                image_filename = f"{document_id}_p{page_num}_img{img_idx}.png"
                image_path = output_dir / image_filename
                # BUG FIX: PNG output only supports Gray/RGB pixmaps.
                # CMYK (or other >3-component) images must be converted
                # to RGB first, otherwise Pixmap.save() raises.
                if pix.colorspace and pix.colorspace.n > 3:
                    rgb_pix = fitz.Pixmap(fitz.csRGB, pix)
                    rgb_pix.save(str(image_path))
                    rgb_pix = None  # free the converted copy
                else:
                    pix.save(str(image_path))
                image_data["saved_path"] = str(image_path)
                logger.debug(f"Saved image to {image_path}")

            element = DocumentElement(
                element_id=f"image_{page_num}_{counter + img_idx}",
                type=ElementType.IMAGE,
                content=image_data,
                bbox=bbox,
                confidence=1.0,
                metadata={
                    "image_index": img_idx,
                    "xref": xref
                }
            )
            elements.append(element)

            pix = None  # drop the reference so the pixmap buffer can be freed

        except Exception as e:
            logger.error(f"Error extracting image {img_idx}: {e}")

    return elements
397
backend/app/services/document_type_detector.py
Normal file
397
backend/app/services/document_type_detector.py
Normal file
@@ -0,0 +1,397 @@
|
||||
"""
|
||||
Document Type Detector Service
|
||||
|
||||
Intelligently determines the optimal processing track for documents based on
|
||||
file type, content analysis, and editability checks.
|
||||
"""
|
||||
|
||||
import os
|
||||
import logging
|
||||
import magic
|
||||
import fitz # PyMuPDF
|
||||
from pathlib import Path
|
||||
from typing import Dict, Optional, Tuple, List
|
||||
from enum import Enum
|
||||
import statistics
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class DocumentType(str, Enum):
    """Classification of input documents for processing-track routing.

    Inherits from ``str`` so members serialize directly via their
    string values (e.g. in JSON API responses).
    """
    PDF_EDITABLE = "pdf_editable"     # PDF with extractable (born-digital) text
    PDF_SCANNED = "pdf_scanned"       # PDF whose pages are images / scans
    PDF_MIXED = "pdf_mixed"           # PDF with both text and scanned pages
    IMAGE = "image"                   # Raster image files (PNG, JPG, etc.)
    OFFICE_WORD = "office_word"       # Word documents
    OFFICE_EXCEL = "office_excel"     # Excel spreadsheets
    OFFICE_POWERPOINT = "office_ppt"  # PowerPoint presentations
    TEXT = "text"                     # Plain text files
    UNKNOWN = "unknown"               # Unrecognized format
class ProcessingTrackRecommendation:
    """Outcome of document analysis: the processing track to use,
    how confident the detector is, and why it chose that track."""

    def __init__(self,
                 track: str,
                 confidence: float,
                 reason: str,
                 document_type: DocumentType,
                 metadata: Optional[Dict] = None):
        """
        Args:
            track: Either "ocr" or "direct".
            confidence: Detection confidence in [0.0, 1.0].
            reason: Human-readable explanation of the recommendation.
            document_type: Detected document classification.
            metadata: Optional extra analysis details.
        """
        self.track = track
        self.confidence = confidence
        self.reason = reason
        self.document_type = document_type
        self.metadata = metadata or {}

    def to_dict(self) -> Dict:
        """Serialize the recommendation for API responses / logging."""
        result = {
            "recommended_track": self.track,
            "confidence": self.confidence,
            "reason": self.reason,
            "document_type": self.document_type.value,
            "metadata": self.metadata,
        }
        return result
class DocumentTypeDetector:
    """
    Service for detecting document types and recommending processing tracks.

    This service analyzes documents to determine:
    1. The document type (PDF, image, Office, etc.)
    2. Whether the document contains extractable text
    3. The recommended processing track (OCR vs Direct)
    """

    # Raster image MIME types — these always go to the OCR track.
    IMAGE_MIMES = {
        'image/png', 'image/jpeg', 'image/jpg', 'image/gif',
        'image/bmp', 'image/tiff', 'image/webp'
    }

    # Office MIME types mapped to their document classification.
    OFFICE_MIMES = {
        'application/vnd.openxmlformats-officedocument.wordprocessingml.document': DocumentType.OFFICE_WORD,
        'application/msword': DocumentType.OFFICE_WORD,
        'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': DocumentType.OFFICE_EXCEL,
        'application/vnd.ms-excel': DocumentType.OFFICE_EXCEL,
        'application/vnd.openxmlformats-officedocument.presentationml.presentation': DocumentType.OFFICE_POWERPOINT,
        'application/vnd.ms-powerpoint': DocumentType.OFFICE_POWERPOINT,
    }

    def __init__(self,
                 min_text_length: int = 100,
                 sample_pages: int = 3,
                 text_coverage_threshold: float = 0.9):
        """
        Initialize the detector.

        Args:
            min_text_length: Minimum text length to consider a page as having extractable text
            sample_pages: Number of pages to sample for PDF analysis
            text_coverage_threshold: Fraction of sampled pages with text required to classify as editable
        """
        self.min_text_length = min_text_length
        self.sample_pages = sample_pages
        self.text_coverage_threshold = text_coverage_threshold

    def detect(self, file_path: Path) -> ProcessingTrackRecommendation:
        """
        Detect document type and recommend processing track.

        Never raises: any detection failure degrades to an OCR
        recommendation with reduced confidence, so callers always get
        a usable answer.

        Args:
            file_path: Path to the document file

        Returns:
            ProcessingTrackRecommendation with track selection and metadata
        """
        if not file_path.exists():
            logger.error(f"File not found: {file_path}")
            return ProcessingTrackRecommendation(
                track="ocr",
                confidence=0.5,
                reason="File not found, defaulting to OCR",
                document_type=DocumentType.UNKNOWN
            )

        try:
            # Content-based MIME detection (doesn't trust the extension).
            mime_type = magic.from_file(str(file_path), mime=True)
            logger.info(f"Detected MIME type: {mime_type} for {file_path.name}")

            # Route to a specialized analyzer per file family.
            if mime_type == 'application/pdf':
                return self._analyze_pdf(file_path)
            elif mime_type in self.IMAGE_MIMES:
                return self._analyze_image(file_path, mime_type)
            elif mime_type in self.OFFICE_MIMES:
                return self._analyze_office(file_path, mime_type)
            elif mime_type.startswith('text/'):
                return self._analyze_text(file_path, mime_type)
            else:
                logger.warning(f"Unknown MIME type: {mime_type}")
                return ProcessingTrackRecommendation(
                    track="ocr",
                    confidence=0.5,
                    reason=f"Unknown file type ({mime_type}), defaulting to OCR",
                    document_type=DocumentType.UNKNOWN
                )

        except Exception as e:
            logger.error(f"Error detecting document type: {e}")
            return ProcessingTrackRecommendation(
                track="ocr",
                confidence=0.3,
                reason=f"Error during detection: {str(e)}",
                document_type=DocumentType.UNKNOWN
            )

    def _analyze_pdf(self, file_path: Path) -> ProcessingTrackRecommendation:
        """
        Analyze a PDF to determine if it's editable, scanned, or mixed.

        Samples up to ``sample_pages`` pages, measuring extractable text
        length and image coverage per page, then classifies by the
        fraction of sampled pages that have meaningful text.

        Args:
            file_path: Path to PDF file

        Returns:
            Processing track recommendation
        """
        try:
            doc = fitz.open(str(file_path))
            try:
                total_pages = len(doc)

                # Sample the first N pages for analysis.
                pages_to_check = min(self.sample_pages, total_pages)
                text_pages = []
                page_details = []

                for page_num in range(pages_to_check):
                    page = doc[page_num]

                    text = page.get_text()
                    text_length = len(text.strip())

                    images = page.get_images()
                    image_count = len(images)

                    # Estimate how much of the page area is covered by images.
                    page_rect = page.rect
                    page_area = page_rect.width * page_rect.height
                    image_area = 0

                    for img in images:
                        try:
                            xref = img[0]
                            img_rects = page.get_image_rects(xref)
                            for rect in img_rects:
                                image_area += rect.width * rect.height
                        except Exception:
                            # BUG FIX: was a bare "except: pass", which also
                            # swallowed KeyboardInterrupt/SystemExit. Image
                            # coverage is informational only, so skipping an
                            # unresolvable image is fine — but do it narrowly.
                            logger.debug("Could not resolve image rects on page %d", page_num + 1)

                    image_coverage = image_area / page_area if page_area > 0 else 0

                    # A page "has text" when enough characters are extractable.
                    has_text = text_length >= self.min_text_length

                    text_pages.append(has_text)
                    page_details.append({
                        "page": page_num + 1,
                        "text_length": text_length,
                        "has_text": has_text,
                        "image_count": image_count,
                        "image_coverage": image_coverage
                    })

                    logger.debug(f"Page {page_num + 1}: text_length={text_length}, "
                                 f"images={image_count}, image_coverage={image_coverage:.2%}")
            finally:
                # ROBUSTNESS FIX: close the document even when page analysis
                # raises — the original leaked the fitz document on error.
                doc.close()

            # Fraction of sampled pages with meaningful text.
            text_coverage = sum(text_pages) / len(text_pages) if text_pages else 0

            metadata = {
                "total_pages": total_pages,
                "sampled_pages": pages_to_check,
                "text_coverage": text_coverage,
                "page_details": page_details
            }

            if text_coverage >= self.text_coverage_threshold:
                # Mostly text-based PDF — direct extraction is fast and lossless.
                return ProcessingTrackRecommendation(
                    track="direct",
                    confidence=0.95,
                    reason=f"PDF has extractable text on {text_coverage:.0%} of sampled pages",
                    document_type=DocumentType.PDF_EDITABLE,
                    metadata=metadata
                )
            elif text_coverage <= 0.1:
                # Mostly scanned/image PDF — OCR is required.
                return ProcessingTrackRecommendation(
                    track="ocr",
                    confidence=0.95,
                    reason=f"PDF appears to be scanned (only {text_coverage:.0%} pages have text)",
                    document_type=DocumentType.PDF_SCANNED,
                    metadata=metadata
                )
            else:
                # Mixed content. Page-level track selection could be added
                # later; for now use OCR so scanned pages aren't missed.
                return ProcessingTrackRecommendation(
                    track="ocr",
                    confidence=0.7,
                    reason=f"PDF has mixed content ({text_coverage:.0%} text pages), using OCR for completeness",
                    document_type=DocumentType.PDF_MIXED,
                    metadata=metadata
                )

        except Exception as e:
            logger.error(f"Error analyzing PDF: {e}")
            return ProcessingTrackRecommendation(
                track="ocr",
                confidence=0.5,
                reason=f"Error analyzing PDF: {str(e)}",
                document_type=DocumentType.PDF_SCANNED
            )

    def _analyze_image(self, file_path: Path, mime_type: str) -> ProcessingTrackRecommendation:
        """
        Analyze an image file.

        Images always require OCR processing, so this is a fixed
        recommendation with file details attached as metadata.
        """
        file_size = file_path.stat().st_size
        metadata = {
            "mime_type": mime_type,
            "file_size": file_size,
            "file_extension": file_path.suffix
        }

        return ProcessingTrackRecommendation(
            track="ocr",
            confidence=1.0,
            reason="Image files require OCR processing",
            document_type=DocumentType.IMAGE,
            metadata=metadata
        )

    def _analyze_office(self, file_path: Path, mime_type: str) -> ProcessingTrackRecommendation:
        """
        Analyze an Office document.

        Currently routes all Office documents to the OCR track.
        Future enhancement: direct extraction via python-docx,
        openpyxl, python-pptx.
        """
        document_type = self.OFFICE_MIMES.get(mime_type, DocumentType.UNKNOWN)
        file_size = file_path.stat().st_size

        metadata = {
            "mime_type": mime_type,
            "file_size": file_size,
            "file_extension": file_path.suffix
        }

        return ProcessingTrackRecommendation(
            track="ocr",
            confidence=0.9,
            reason="Office documents currently processed via OCR (direct extraction planned)",
            document_type=document_type,
            metadata=metadata
        )

    def _analyze_text(self, file_path: Path, mime_type: str) -> ProcessingTrackRecommendation:
        """
        Analyze a plain text file.

        Text files can be directly processed without OCR.
        """
        file_size = file_path.stat().st_size
        metadata = {
            "mime_type": mime_type,
            "file_size": file_size,
            "file_extension": file_path.suffix
        }

        return ProcessingTrackRecommendation(
            track="direct",
            confidence=1.0,
            reason="Plain text files can be directly processed",
            document_type=DocumentType.TEXT,
            metadata=metadata
        )

    def analyze_batch(self, file_paths: List[Path]) -> Dict[str, ProcessingTrackRecommendation]:
        """
        Analyze multiple files and return per-file recommendations.

        A failure on one file does not abort the batch: that file gets
        a low-confidence OCR fallback recommendation instead.

        Args:
            file_paths: List of file paths to analyze

        Returns:
            Dictionary mapping file paths (as strings) to recommendations
        """
        results = {}

        for file_path in file_paths:
            try:
                recommendation = self.detect(file_path)
                results[str(file_path)] = recommendation
                logger.info(f"Analyzed {file_path.name}: {recommendation.track} "
                            f"(confidence: {recommendation.confidence:.2f})")
            except Exception as e:
                logger.error(f"Error analyzing {file_path}: {e}")
                results[str(file_path)] = ProcessingTrackRecommendation(
                    track="ocr",
                    confidence=0.3,
                    reason=f"Error during analysis: {str(e)}",
                    document_type=DocumentType.UNKNOWN
                )

        return results

    def get_statistics(self, recommendations: Dict[str, ProcessingTrackRecommendation]) -> Dict:
        """
        Calculate summary statistics from batch analysis results.

        Args:
            recommendations: Dictionary of file recommendations

        Returns:
            Statistics dictionary with counts by track and document type
            plus confidence aggregates; ``{"total": 0}`` for empty input.
        """
        if not recommendations:
            return {"total": 0}

        tracks = [r.track for r in recommendations.values()]
        confidences = [r.confidence for r in recommendations.values()]
        doc_types = [r.document_type.value for r in recommendations.values()]

        stats = {
            "total": len(recommendations),
            "by_track": {
                "ocr": tracks.count("ocr"),
                "direct": tracks.count("direct")
            },
            "by_document_type": {},
            "confidence": {
                "mean": statistics.mean(confidences),
                "median": statistics.median(confidences),
                "min": min(confidences),
                "max": max(confidences)
            }
        }

        # Count occurrences of each detected document type.
        for doc_type in set(doc_types):
            stats["by_document_type"][doc_type] = doc_types.count(doc_type)

        return stats
|
||||
@@ -25,6 +25,11 @@ reportlab>=4.0.0 # Layout-preserving PDF generation with precise coordinate con
|
||||
PyPDF2>=3.0.0 # Extract dimensions from source PDF files
|
||||
# Note: pandoc needs to be installed via brew (brew install pandoc)
|
||||
|
||||
# ===== Direct PDF Extraction (Dual-track Processing) =====
|
||||
PyMuPDF>=1.23.0 # Primary library for editable PDF text/structure extraction
|
||||
pdfplumber>=0.10.0 # Fallback for table extraction and validation
|
||||
python-magic-bin>=0.4.14 # Windows-compatible file type detection
|
||||
|
||||
# ===== Data Export =====
|
||||
pandas>=2.1.0
|
||||
openpyxl>=3.1.0 # Excel support
|
||||
|
||||
Reference in New Issue
Block a user