feat: implement core dual-track processing infrastructure

Added foundation for dual-track document processing: 1. UnifiedDocument Model (backend/app/models/unified_document.py) - Common output format for both OCR and direct extraction - Comprehensive element types (23+ types from PP-StructureV3) - BoundingBox, StyleInfo, TableData structures - Backward compatibility with legacy format 2. DocumentTypeDetector Service (backend/app/services/document_type_detector.py) - Intelligent document type detection using python-magic - PDF editability analysis using PyMuPDF - Processing track recommendation with confidence scores - Support for PDF, images, Office docs, and text files 3. DirectExtractionEngine Service (backend/app/services/direct_extraction_engine.py) - Fast extraction from editable PDFs using PyMuPDF - Preserves fonts, colors, and exact positioning - Native and positional table detection - Image extraction with coordinates - Hyperlink and metadata extraction 4. Dependencies - Added PyMuPDF>=1.23.0 for PDF extraction - Added pdfplumber>=0.10.0 as fallback - Added python-magic-bin>=0.4.14 for file detection Next: Integrate with OCR service for complete dual-track processing 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-18 20:17:50 +08:00
parent cd3cbea49d
commit 2d50c128f7
4 changed files with 1729 additions and 0 deletions
--- a/backend/app/services/direct_extraction_engine.py
+++ b/backend/app/services/direct_extraction_engine.py
@@ -0,0 +1,633 @@
+"""
+Direct Extraction Engine using PyMuPDF
+
+Handles direct text and structure extraction from editable PDFs without OCR.
+This provides much faster processing and perfect accuracy for documents with
+extractable text.
+"""
+
+import os
+import logging
+import fitz  # PyMuPDF
+import uuid
+from pathlib import Path
+from typing import Dict, List, Optional, Tuple, Any, Union
+from datetime import datetime
+import re
+
+from ..models.unified_document import (
+    UnifiedDocument, DocumentElement, Page, DocumentMetadata,
+    BoundingBox, StyleInfo, TableData, TableCell, Dimensions,
+    ElementType, ProcessingTrack
+)
+
+logger = logging.getLogger(__name__)
+
+
+class DirectExtractionEngine:
+    """
+    Engine for direct text extraction from editable PDFs using PyMuPDF.
+
+    This engine provides:
+    - Fast text extraction with exact positioning
+    - Font and style information preservation
+    - Table structure detection
+    - Image extraction with coordinates
+    - Hyperlink and annotation extraction
+    """
+
+    def __init__(self,
+                 enable_table_detection: bool = True,
+                 enable_image_extraction: bool = True,
+                 min_table_rows: int = 2,
+                 min_table_cols: int = 2):
+        """
+        Initialize the extraction engine.
+
+        Args:
+            enable_table_detection: Whether to detect and extract tables
+            enable_image_extraction: Whether to extract images
+            min_table_rows: Minimum rows for table detection
+            min_table_cols: Minimum columns for table detection
+        """
+        self.enable_table_detection = enable_table_detection
+        self.enable_image_extraction = enable_image_extraction
+        self.min_table_rows = min_table_rows
+        self.min_table_cols = min_table_cols
+
+    def extract(self,
+                file_path: Path,
+                output_dir: Optional[Path] = None) -> UnifiedDocument:
+        """
+        Extract content from PDF file to UnifiedDocument format.
+
+        Args:
+            file_path: Path to PDF file
+            output_dir: Optional directory to save extracted images
+
+        Returns:
+            UnifiedDocument with extracted content
+        """
+        start_time = datetime.now()
+        document_id = str(uuid.uuid4())
+
+        try:
+            doc = fitz.open(str(file_path))
+
+            # Extract document metadata
+            metadata = self._extract_metadata(file_path, doc, start_time)
+
+            # Extract pages
+            pages = []
+            for page_num in range(len(doc)):
+                logger.info(f"Extracting page {page_num + 1}/{len(doc)}")
+                page = self._extract_page(
+                    doc[page_num],
+                    page_num + 1,
+                    document_id,
+                    output_dir
+                )
+                pages.append(page)
+
+            doc.close()
+
+            # Calculate processing time
+            processing_time = (datetime.now() - start_time).total_seconds()
+            metadata.processing_time = processing_time
+
+            logger.info(f"Direct extraction completed in {processing_time:.2f}s")
+
+            return UnifiedDocument(
+                document_id=document_id,
+                metadata=metadata,
+                pages=pages
+            )
+
+        except Exception as e:
+            logger.error(f"Error during direct extraction: {e}")
+            # Return partial result with error information
+            processing_time = (datetime.now() - start_time).total_seconds()
+
+            if 'metadata' not in locals():
+                metadata = DocumentMetadata(
+                    filename=file_path.name,
+                    file_type="pdf",
+                    file_size=file_path.stat().st_size if file_path.exists() else 0,
+                    created_at=datetime.now(),
+                    processing_track=ProcessingTrack.DIRECT,
+                    processing_time=processing_time
+                )
+
+            return UnifiedDocument(
+                document_id=document_id,
+                metadata=metadata,
+                pages=pages if 'pages' in locals() else [],
+                processing_errors=[{
+                    "error": str(e),
+                    "type": type(e).__name__
+                }]
+            )
+
+    def _extract_metadata(self,
+                         file_path: Path,
+                         doc: fitz.Document,
+                         start_time: datetime) -> DocumentMetadata:
+        """Extract document metadata"""
+        pdf_metadata = doc.metadata
+
+        return DocumentMetadata(
+            filename=file_path.name,
+            file_type="pdf",
+            file_size=file_path.stat().st_size,
+            created_at=start_time,
+            processing_track=ProcessingTrack.DIRECT,
+            processing_time=0.0,  # Will be updated later
+            title=pdf_metadata.get("title"),
+            author=pdf_metadata.get("author"),
+            subject=pdf_metadata.get("subject"),
+            keywords=pdf_metadata.get("keywords", "").split(",") if pdf_metadata.get("keywords") else None,
+            producer=pdf_metadata.get("producer"),
+            creator=pdf_metadata.get("creator"),
+            creation_date=self._parse_pdf_date(pdf_metadata.get("creationDate")),
+            modification_date=self._parse_pdf_date(pdf_metadata.get("modDate"))
+        )
+
+    def _parse_pdf_date(self, date_str: str) -> Optional[datetime]:
+        """Parse PDF date string to datetime"""
+        if not date_str:
+            return None
+
+        try:
+            # PDF date format: D:YYYYMMDDHHmmSSOHH'mm
+            # Example: D:20240101120000+09'00
+            if date_str.startswith("D:"):
+                date_str = date_str[2:]
+
+            # Extract just the date/time part (first 14 characters)
+            if len(date_str) >= 14:
+                date_part = date_str[:14]
+                return datetime.strptime(date_part, "%Y%m%d%H%M%S")
+        except:
+            pass
+
+        return None
+
+    def _extract_page(self,
+                     page: fitz.Page,
+                     page_num: int,
+                     document_id: str,
+                     output_dir: Optional[Path]) -> Page:
+        """Extract content from a single page"""
+        elements = []
+        element_counter = 0
+
+        # Get page dimensions
+        rect = page.rect
+        dimensions = Dimensions(
+            width=rect.width,
+            height=rect.height,
+            dpi=72  # PDF standard DPI
+        )
+
+        # Extract text blocks with formatting
+        text_dict = page.get_text("dict")
+        for block_idx, block in enumerate(text_dict.get("blocks", [])):
+            if block.get("type") == 0:  # Text block
+                element = self._process_text_block(
+                    block, page_num, element_counter
+                )
+                if element:
+                    elements.append(element)
+                    element_counter += 1
+
+        # Extract tables (if enabled)
+        if self.enable_table_detection:
+            try:
+                # Try native table detection (PyMuPDF 1.23.0+)
+                tables = page.find_tables()
+                for table_idx, table in enumerate(tables):
+                    element = self._process_native_table(
+                        table, page_num, element_counter
+                    )
+                    if element:
+                        elements.append(element)
+                        element_counter += 1
+            except AttributeError:
+                # Fallback to positional table detection
+                logger.debug("Native table detection not available, using positional detection")
+                table_elements = self._detect_tables_by_position(page, page_num, element_counter)
+                elements.extend(table_elements)
+                element_counter += len(table_elements)
+
+        # Extract images (if enabled)
+        if self.enable_image_extraction:
+            image_elements = self._extract_images(
+                page, page_num, document_id, element_counter, output_dir
+            )
+            elements.extend(image_elements)
+            element_counter += len(image_elements)
+
+        # Extract hyperlinks
+        links = page.get_links()
+        for link_idx, link in enumerate(links):
+            # Create link annotation element if it has URI
+            if link.get("uri"):
+                from_rect = link.get("from")
+                if from_rect:
+                    element = DocumentElement(
+                        element_id=f"link_{page_num}_{element_counter}",
+                        type=ElementType.REFERENCE,
+                        content={"uri": link["uri"], "type": "hyperlink"},
+                        bbox=BoundingBox(
+                            x0=from_rect.x0,
+                            y0=from_rect.y0,
+                            x1=from_rect.x1,
+                            y1=from_rect.y1
+                        ),
+                        metadata={"link_type": "external" if link["uri"].startswith("http") else "internal"}
+                    )
+                    elements.append(element)
+                    element_counter += 1
+
+        # Extract vector graphics (as metadata)
+        drawings = page.get_drawings()
+        if drawings:
+            logger.debug(f"Page {page_num} contains {len(drawings)} vector drawing commands")
+
+        return Page(
+            page_number=page_num,
+            elements=elements,
+            dimensions=dimensions,
+            metadata={
+                "has_drawings": len(drawings) > 0,
+                "drawing_count": len(drawings),
+                "link_count": len(links)
+            }
+        )
+
+    def _process_text_block(self, block: Dict, page_num: int, counter: int) -> Optional[DocumentElement]:
+        """Process a text block into a DocumentElement"""
+        # Calculate block bounding box
+        bbox_data = block.get("bbox", [0, 0, 0, 0])
+        bbox = BoundingBox(
+            x0=bbox_data[0],
+            y0=bbox_data[1],
+            x1=bbox_data[2],
+            y1=bbox_data[3]
+        )
+
+        # Extract text content
+        text_parts = []
+        styles = []
+
+        for line in block.get("lines", []):
+            for span in line.get("spans", []):
+                text = span.get("text", "")
+                if text:
+                    text_parts.append(text)
+
+                    # Extract style information
+                    style = StyleInfo(
+                        font_name=span.get("font"),
+                        font_size=span.get("size"),
+                        font_weight="bold" if span.get("flags", 0) & 2**4 else "normal",
+                        font_style="italic" if span.get("flags", 0) & 2**1 else "normal",
+                        text_color=span.get("color")
+                    )
+                    styles.append(style)
+
+        if not text_parts:
+            return None
+
+        full_text = "".join(text_parts)
+
+        # Determine element type based on content and style
+        element_type = self._infer_element_type(full_text, styles)
+
+        # Use the most common style for the block
+        if styles:
+            block_style = styles[0]  # Could be improved with style merging
+        else:
+            block_style = None
+
+        return DocumentElement(
+            element_id=f"text_{page_num}_{counter}",
+            type=element_type,
+            content=full_text,
+            bbox=bbox,
+            style=block_style,
+            confidence=1.0  # Direct extraction has perfect confidence
+        )
+
+    def _infer_element_type(self, text: str, styles: List[StyleInfo]) -> ElementType:
+        """Infer element type based on text content and styling"""
+        text_lower = text.lower().strip()
+
+        # Check for common patterns
+        if len(text_lower) < 100 and styles:
+            # Short text with large font might be title/header
+            avg_size = sum(s.font_size or 12 for s in styles) / len(styles)
+            if avg_size > 16:
+                return ElementType.TITLE
+            elif avg_size > 14:
+                return ElementType.HEADER
+
+        # Check for list patterns
+        if re.match(r'^[\d•·▪▫◦‣⁃]\s', text_lower):
+            return ElementType.LIST_ITEM
+
+        # Check for page numbers
+        if re.match(r'^page\s+\d+|^\d+\s*$|^-\s*\d+\s*-$', text_lower):
+            return ElementType.PAGE_NUMBER
+
+        # Check for footnote patterns
+        if re.match(r'^[\[\d+\]]|^\d+\)', text_lower):
+            return ElementType.FOOTNOTE
+
+        # Default to paragraph for longer text, text for shorter
+        return ElementType.PARAGRAPH if len(text) > 150 else ElementType.TEXT
+
+    def _process_native_table(self, table, page_num: int, counter: int) -> Optional[DocumentElement]:
+        """Process a natively detected table"""
+        try:
+            # Extract table data
+            data = table.extract()
+            if not data or len(data) < self.min_table_rows:
+                return None
+
+            # Get table bounding box
+            bbox_data = table.bbox
+            bbox = BoundingBox(
+                x0=bbox_data[0],
+                y0=bbox_data[1],
+                x1=bbox_data[2],
+                y1=bbox_data[3]
+            )
+
+            # Create table cells
+            cells = []
+            for row_idx, row in enumerate(data):
+                for col_idx, cell_text in enumerate(row):
+                    if cell_text:
+                        cells.append(TableCell(
+                            row=row_idx,
+                            col=col_idx,
+                            content=str(cell_text) if cell_text else ""
+                        ))
+
+            # Create table data
+            table_data = TableData(
+                rows=len(data),
+                cols=max(len(row) for row in data) if data else 0,
+                cells=cells,
+                headers=data[0] if data else None  # Assume first row is header
+            )
+
+            return DocumentElement(
+                element_id=f"table_{page_num}_{counter}",
+                type=ElementType.TABLE,
+                content=table_data,
+                bbox=bbox,
+                confidence=1.0
+            )
+
+        except Exception as e:
+            logger.error(f"Error processing native table: {e}")
+            return None
+
+    def _detect_tables_by_position(self, page: fitz.Page, page_num: int, counter: int) -> List[DocumentElement]:
+        """Detect tables by analyzing text positioning"""
+        tables = []
+
+        # Get all words with positions
+        words = page.get_text("words")  # Returns (x0, y0, x1, y1, "word", block_no, line_no, word_no)
+
+        if not words:
+            return tables
+
+        # Group words by approximate row (y-coordinate)
+        rows = {}
+        for word in words:
+            y = round(word[1] / 5) * 5  # Round to nearest 5 points
+            if y not in rows:
+                rows[y] = []
+            rows[y].append({
+                'x0': word[0],
+                'y0': word[1],
+                'x1': word[2],
+                'y1': word[3],
+                'text': word[4],
+                'block': word[5] if len(word) > 5 else 0
+            })
+
+        # Sort rows by y-coordinate
+        sorted_rows = sorted(rows.items(), key=lambda x: x[0])
+
+        # Find potential tables (consecutive rows with multiple columns)
+        current_table_rows = []
+        tables_found = []
+
+        for y, words_in_row in sorted_rows:
+            words_in_row.sort(key=lambda w: w['x0'])
+
+            if len(words_in_row) >= self.min_table_cols:
+                # Check if this could be a table row
+                x_positions = [w['x0'] for w in words_in_row]
+
+                # Check for somewhat regular spacing
+                if self._has_regular_spacing(x_positions):
+                    current_table_rows.append((y, words_in_row))
+                else:
+                    # End current table if exists
+                    if len(current_table_rows) >= self.min_table_rows:
+                        tables_found.append(current_table_rows)
+                    current_table_rows = []
+            else:
+                # End current table if exists
+                if len(current_table_rows) >= self.min_table_rows:
+                    tables_found.append(current_table_rows)
+                current_table_rows = []
+
+        # Don't forget the last table
+        if len(current_table_rows) >= self.min_table_rows:
+            tables_found.append(current_table_rows)
+
+        # Convert detected tables to DocumentElements
+        for table_idx, table_rows in enumerate(tables_found):
+            if not table_rows:
+                continue
+
+            # Calculate table bounding box
+            all_words = []
+            for _, words in table_rows:
+                all_words.extend(words)
+
+            min_x = min(w['x0'] for w in all_words)
+            min_y = min(w['y0'] for w in all_words)
+            max_x = max(w['x1'] for w in all_words)
+            max_y = max(w['y1'] for w in all_words)
+
+            bbox = BoundingBox(x0=min_x, y0=min_y, x1=max_x, y1=max_y)
+
+            # Create table cells
+            cells = []
+            for row_idx, (y, words) in enumerate(table_rows):
+                # Group words into columns
+                columns = self._group_into_columns(words, table_rows)
+                for col_idx, col_text in enumerate(columns):
+                    if col_text:
+                        cells.append(TableCell(
+                            row=row_idx,
+                            col=col_idx,
+                            content=col_text
+                        ))
+
+            # Create table data
+            table_data = TableData(
+                rows=len(table_rows),
+                cols=max(len(self._group_into_columns(words, table_rows))
+                        for _, words in table_rows),
+                cells=cells
+            )
+
+            element = DocumentElement(
+                element_id=f"table_{page_num}_{counter + table_idx}",
+                type=ElementType.TABLE,
+                content=table_data,
+                bbox=bbox,
+                confidence=0.8,  # Lower confidence for positional detection
+                metadata={"detection_method": "positional"}
+            )
+            tables.append(element)
+
+        return tables
+
+    def _has_regular_spacing(self, x_positions: List[float], tolerance: float = 0.3) -> bool:
+        """Check if x positions have somewhat regular spacing"""
+        if len(x_positions) < 3:
+            return False
+
+        spacings = [x_positions[i+1] - x_positions[i] for i in range(len(x_positions)-1)]
+        avg_spacing = sum(spacings) / len(spacings)
+
+        # Check if spacings are within tolerance of average
+        for spacing in spacings:
+            if abs(spacing - avg_spacing) > avg_spacing * tolerance:
+                return False
+
+        return True
+
+    def _group_into_columns(self, words: List[Dict], all_rows: List) -> List[str]:
+        """Group words into columns based on x-position"""
+        if not words:
+            return []
+
+        # Find common column positions across all rows
+        all_x_positions = []
+        for _, row_words in all_rows:
+            all_x_positions.extend([w['x0'] for w in row_words])
+
+        # Cluster x-positions to find columns
+        column_positions = self._cluster_positions(all_x_positions)
+
+        # Assign words to columns
+        columns = [""] * len(column_positions)
+        for word in words:
+            # Find closest column
+            closest_col = 0
+            min_dist = float('inf')
+            for col_idx, col_x in enumerate(column_positions):
+                dist = abs(word['x0'] - col_x)
+                if dist < min_dist:
+                    min_dist = dist
+                    closest_col = col_idx
+
+            if columns[closest_col]:
+                columns[closest_col] += " " + word['text']
+            else:
+                columns[closest_col] = word['text']
+
+        return columns
+
+    def _cluster_positions(self, positions: List[float], threshold: float = 20) -> List[float]:
+        """Cluster positions to find common columns"""
+        if not positions:
+            return []
+
+        sorted_pos = sorted(positions)
+        clusters = [[sorted_pos[0]]]
+
+        for pos in sorted_pos[1:]:
+            # Check if position belongs to current cluster
+            if pos - clusters[-1][-1] < threshold:
+                clusters[-1].append(pos)
+            else:
+                clusters.append([pos])
+
+        # Return average position of each cluster
+        return [sum(cluster) / len(cluster) for cluster in clusters]
+
+    def _extract_images(self,
+                       page: fitz.Page,
+                       page_num: int,
+                       document_id: str,
+                       counter: int,
+                       output_dir: Optional[Path]) -> List[DocumentElement]:
+        """Extract images from page"""
+        elements = []
+        image_list = page.get_images()
+
+        for img_idx, img in enumerate(image_list):
+            try:
+                xref = img[0]
+
+                # Get image position(s)
+                img_rects = page.get_image_rects(xref)
+                if not img_rects:
+                    continue
+
+                rect = img_rects[0]  # Use first occurrence
+                bbox = BoundingBox(
+                    x0=rect.x0,
+                    y0=rect.y0,
+                    x1=rect.x1,
+                    y1=rect.y1
+                )
+
+                # Extract image data
+                pix = fitz.Pixmap(page.parent, xref)
+                image_data = {
+                    "width": pix.width,
+                    "height": pix.height,
+                    "colorspace": pix.colorspace.name if pix.colorspace else "unknown",
+                    "xref": xref
+                }
+
+                # Save image if output directory provided
+                if output_dir:
+                    output_dir.mkdir(parents=True, exist_ok=True)
+                    image_filename = f"{document_id}_p{page_num}_img{img_idx}.png"
+                    image_path = output_dir / image_filename
+                    pix.save(str(image_path))
+                    image_data["saved_path"] = str(image_path)
+                    logger.debug(f"Saved image to {image_path}")
+
+                element = DocumentElement(
+                    element_id=f"image_{page_num}_{counter + img_idx}",
+                    type=ElementType.IMAGE,
+                    content=image_data,
+                    bbox=bbox,
+                    confidence=1.0,
+                    metadata={
+                        "image_index": img_idx,
+                        "xref": xref
+                    }
+                )
+                elements.append(element)
+
+                pix = None  # Free memory
+
+            except Exception as e:
+                logger.error(f"Error extracting image {img_idx}: {e}")
+
+        return elements
--- a/backend/app/services/document_type_detector.py
+++ b/backend/app/services/document_type_detector.py
@@ -0,0 +1,397 @@
+"""
+Document Type Detector Service
+
+Intelligently determines the optimal processing track for documents based on
+file type, content analysis, and editability checks.
+"""
+
+import os
+import logging
+import magic
+import fitz  # PyMuPDF
+from pathlib import Path
+from typing import Dict, Optional, Tuple, List
+from enum import Enum
+import statistics
+
+logger = logging.getLogger(__name__)
+
+
+class DocumentType(str, Enum):
+    """
+    Document type classification.
+
+    Members are also ``str`` instances, so each one compares equal to its
+    string value (for example ``DocumentType.IMAGE == "image"``).
+    """
+    PDF_EDITABLE = "pdf_editable"      # PDF with extractable text
+    PDF_SCANNED = "pdf_scanned"        # PDF with images/scanned content
+    PDF_MIXED = "pdf_mixed"            # PDF with both text and scanned pages
+    IMAGE = "image"                     # Image files (PNG, JPG, etc.)
+    OFFICE_WORD = "office_word"        # Word documents
+    OFFICE_EXCEL = "office_excel"      # Excel spreadsheets
+    OFFICE_POWERPOINT = "office_ppt"   # PowerPoint presentations
+    TEXT = "text"                       # Plain text files
+    UNKNOWN = "unknown"                 # Unknown format
+
+
+class ProcessingTrackRecommendation:
+    """Processing track recommendation with confidence"""
+
+    def __init__(self,
+                 track: str,
+                 confidence: float,
+                 reason: str,
+                 document_type: DocumentType,
+                 metadata: Optional[Dict] = None):
+        self.track = track  # "ocr" or "direct"
+        self.confidence = confidence  # 0.0 to 1.0
+        self.reason = reason
+        self.document_type = document_type
+        self.metadata = metadata or {}
+
+    def to_dict(self) -> Dict:
+        return {
+            "recommended_track": self.track,
+            "confidence": self.confidence,
+            "reason": self.reason,
+            "document_type": self.document_type.value,
+            "metadata": self.metadata
+        }
+
+
+class DocumentTypeDetector:
+    """
+    Service for detecting document types and recommending processing tracks.
+
+    This service analyzes documents to determine:
+    1. The document type (PDF, image, Office, etc.)
+    2. Whether the document contains extractable text
+    3. The recommended processing track (OCR vs Direct)
+    """
+
+    # MIME type mappings
+
+    # MIME types that detect() routes to image analysis
+    IMAGE_MIMES = {
+        'image/png', 'image/jpeg', 'image/jpg', 'image/gif',
+        'image/bmp', 'image/tiff', 'image/webp'
+    }
+
+    # Office MIME type -> DocumentType; covers both the OpenXML formats
+    # (openxmlformats-officedocument.*) and the older msword / ms-excel /
+    # ms-powerpoint types
+    OFFICE_MIMES = {
+        'application/vnd.openxmlformats-officedocument.wordprocessingml.document': DocumentType.OFFICE_WORD,
+        'application/msword': DocumentType.OFFICE_WORD,
+        'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': DocumentType.OFFICE_EXCEL,
+        'application/vnd.ms-excel': DocumentType.OFFICE_EXCEL,
+        'application/vnd.openxmlformats-officedocument.presentationml.presentation': DocumentType.OFFICE_POWERPOINT,
+        'application/vnd.ms-powerpoint': DocumentType.OFFICE_POWERPOINT,
+    }
+
+    def __init__(self,
+                 min_text_length: int = 100,
+                 sample_pages: int = 3,
+                 text_coverage_threshold: float = 0.9):
+        """
+        Initialize the detector.
+
+        Args:
+            min_text_length: Minimum text length to consider a page as having extractable text
+            sample_pages: Number of pages to sample for PDF analysis
+            text_coverage_threshold: Percentage of pages with text to classify as editable
+        """
+        self.min_text_length = min_text_length
+        self.sample_pages = sample_pages
+        self.text_coverage_threshold = text_coverage_threshold
+
+    def detect(self, file_path: Path) -> ProcessingTrackRecommendation:
+        """
+        Detect document type and recommend processing track.
+
+        Args:
+            file_path: Path to the document file
+
+        Returns:
+            ProcessingTrackRecommendation with track selection and metadata
+        """
+        if not file_path.exists():
+            logger.error(f"File not found: {file_path}")
+            return ProcessingTrackRecommendation(
+                track="ocr",
+                confidence=0.5,
+                reason="File not found, defaulting to OCR",
+                document_type=DocumentType.UNKNOWN
+            )
+
+        try:
+            # Detect MIME type
+            mime_type = magic.from_file(str(file_path), mime=True)
+            logger.info(f"Detected MIME type: {mime_type} for {file_path.name}")
+
+            # Route based on file type
+            if mime_type == 'application/pdf':
+                return self._analyze_pdf(file_path)
+            elif mime_type in self.IMAGE_MIMES:
+                return self._analyze_image(file_path, mime_type)
+            elif mime_type in self.OFFICE_MIMES:
+                return self._analyze_office(file_path, mime_type)
+            elif mime_type.startswith('text/'):
+                return self._analyze_text(file_path, mime_type)
+            else:
+                logger.warning(f"Unknown MIME type: {mime_type}")
+                return ProcessingTrackRecommendation(
+                    track="ocr",
+                    confidence=0.5,
+                    reason=f"Unknown file type ({mime_type}), defaulting to OCR",
+                    document_type=DocumentType.UNKNOWN
+                )
+
+        except Exception as e:
+            logger.error(f"Error detecting document type: {e}")
+            return ProcessingTrackRecommendation(
+                track="ocr",
+                confidence=0.3,
+                reason=f"Error during detection: {str(e)}",
+                document_type=DocumentType.UNKNOWN
+            )
+
+    def _analyze_pdf(self, file_path: Path) -> ProcessingTrackRecommendation:
+        """
+        Analyze PDF to determine if it's editable or scanned.
+
+        Args:
+            file_path: Path to PDF file
+
+        Returns:
+            Processing track recommendation
+        """
+        try:
+            doc = fitz.open(str(file_path))
+            total_pages = len(doc)
+
+            # Sample pages for analysis
+            pages_to_check = min(self.sample_pages, total_pages)
+            text_pages = []
+            page_details = []
+
+            for page_num in range(pages_to_check):
+                page = doc[page_num]
+
+                # Extract text
+                text = page.get_text()
+                text_length = len(text.strip())
+
+                # Check for images
+                images = page.get_images()
+                image_count = len(images)
+
+                # Calculate page area covered by images
+                page_rect = page.rect
+                page_area = page_rect.width * page_rect.height
+                image_area = 0
+
+                for img in images:
+                    try:
+                        # Get image rectangles
+                        xref = img[0]
+                        img_rects = page.get_image_rects(xref)
+                        for rect in img_rects:
+                            image_area += rect.width * rect.height
+                    except:
+                        pass
+
+                image_coverage = image_area / page_area if page_area > 0 else 0
+
+                # Determine if page has meaningful text
+                has_text = text_length >= self.min_text_length
+
+                text_pages.append(has_text)
+                page_details.append({
+                    "page": page_num + 1,
+                    "text_length": text_length,
+                    "has_text": has_text,
+                    "image_count": image_count,
+                    "image_coverage": image_coverage
+                })
+
+                logger.debug(f"Page {page_num + 1}: text_length={text_length}, "
+                           f"images={image_count}, image_coverage={image_coverage:.2%}")
+
+            doc.close()
+
+            # Calculate text coverage
+            text_coverage = sum(text_pages) / len(text_pages) if text_pages else 0
+
+            # Determine document type and track
+            metadata = {
+                "total_pages": total_pages,
+                "sampled_pages": pages_to_check,
+                "text_coverage": text_coverage,
+                "page_details": page_details
+            }
+
+            if text_coverage >= self.text_coverage_threshold:
+                # Mostly text-based PDF
+                return ProcessingTrackRecommendation(
+                    track="direct",
+                    confidence=0.95,
+                    reason=f"PDF has extractable text on {text_coverage:.0%} of sampled pages",
+                    document_type=DocumentType.PDF_EDITABLE,
+                    metadata=metadata
+                )
+            elif text_coverage <= 0.1:
+                # Mostly scanned/image PDF
+                return ProcessingTrackRecommendation(
+                    track="ocr",
+                    confidence=0.95,
+                    reason=f"PDF appears to be scanned (only {text_coverage:.0%} pages have text)",
+                    document_type=DocumentType.PDF_SCANNED,
+                    metadata=metadata
+                )
+            else:
+                # Mixed content
+                # For mixed PDFs, we could implement page-level track selection in the future
+                # For now, use OCR to ensure we don't miss scanned content
+                return ProcessingTrackRecommendation(
+                    track="ocr",
+                    confidence=0.7,
+                    reason=f"PDF has mixed content ({text_coverage:.0%} text pages), using OCR for completeness",
+                    document_type=DocumentType.PDF_MIXED,
+                    metadata=metadata
+                )
+
+        except Exception as e:
+            logger.error(f"Error analyzing PDF: {e}")
+            return ProcessingTrackRecommendation(
+                track="ocr",
+                confidence=0.5,
+                reason=f"Error analyzing PDF: {str(e)}",
+                document_type=DocumentType.PDF_SCANNED
+            )
+
+    def _analyze_image(self, file_path: Path, mime_type: str) -> ProcessingTrackRecommendation:
+        """
+        Analyze image file.
+
+        Images always require OCR processing.
+        """
+        file_size = file_path.stat().st_size
+        metadata = {
+            "mime_type": mime_type,
+            "file_size": file_size,
+            "file_extension": file_path.suffix
+        }
+
+        return ProcessingTrackRecommendation(
+            track="ocr",
+            confidence=1.0,
+            reason="Image files require OCR processing",
+            document_type=DocumentType.IMAGE,
+            metadata=metadata
+        )
+
+    def _analyze_office(self, file_path: Path, mime_type: str) -> ProcessingTrackRecommendation:
+        """
+        Analyze Office document.
+
+        Currently routes all Office documents to OCR track.
+        Future enhancement: implement direct extraction for Office files.
+        """
+        document_type = self.OFFICE_MIMES.get(mime_type, DocumentType.UNKNOWN)
+        file_size = file_path.stat().st_size
+
+        metadata = {
+            "mime_type": mime_type,
+            "file_size": file_size,
+            "file_extension": file_path.suffix
+        }
+
+        # TODO: In future, we could implement direct extraction for Office files
+        # using python-docx, openpyxl, python-pptx
+        return ProcessingTrackRecommendation(
+            track="ocr",
+            confidence=0.9,
+            reason="Office documents currently processed via OCR (direct extraction planned)",
+            document_type=document_type,
+            metadata=metadata
+        )
+
+    def _analyze_text(self, file_path: Path, mime_type: str) -> ProcessingTrackRecommendation:
+        """
+        Analyze text file.
+
+        Plain text files can be directly processed without OCR.
+        """
+        file_size = file_path.stat().st_size
+        metadata = {
+            "mime_type": mime_type,
+            "file_size": file_size,
+            "file_extension": file_path.suffix
+        }
+
+        return ProcessingTrackRecommendation(
+            track="direct",
+            confidence=1.0,
+            reason="Plain text files can be directly processed",
+            document_type=DocumentType.TEXT,
+            metadata=metadata
+        )
+
+    def analyze_batch(self, file_paths: List[Path]) -> Dict[str, ProcessingTrackRecommendation]:
+        """
+        Analyze multiple files and return recommendations.
+
+        Args:
+            file_paths: List of file paths to analyze
+
+        Returns:
+            Dictionary mapping file paths to recommendations
+        """
+        results = {}
+
+        for file_path in file_paths:
+            try:
+                recommendation = self.detect(file_path)
+                results[str(file_path)] = recommendation
+                logger.info(f"Analyzed {file_path.name}: {recommendation.track} "
+                          f"(confidence: {recommendation.confidence:.2f})")
+            except Exception as e:
+                logger.error(f"Error analyzing {file_path}: {e}")
+                results[str(file_path)] = ProcessingTrackRecommendation(
+                    track="ocr",
+                    confidence=0.3,
+                    reason=f"Error during analysis: {str(e)}",
+                    document_type=DocumentType.UNKNOWN
+                )
+
+        return results
+
+    def get_statistics(self, recommendations: Dict[str, ProcessingTrackRecommendation]) -> Dict:
+        """
+        Calculate statistics from batch analysis results.
+
+        Args:
+            recommendations: Dictionary of file recommendations
+
+        Returns:
+            Statistics dictionary
+        """
+        if not recommendations:
+            return {"total": 0}
+
+        tracks = [r.track for r in recommendations.values()]
+        confidences = [r.confidence for r in recommendations.values()]
+        doc_types = [r.document_type.value for r in recommendations.values()]
+
+        stats = {
+            "total": len(recommendations),
+            "by_track": {
+                "ocr": tracks.count("ocr"),
+                "direct": tracks.count("direct")
+            },
+            "by_document_type": {},
+            "confidence": {
+                "mean": statistics.mean(confidences),
+                "median": statistics.median(confidences),
+                "min": min(confidences),
+                "max": max(confidences)
+            }
+        }
+
+        # Count by document type
+        for doc_type in set(doc_types):
+            stats["by_document_type"][doc_type] = doc_types.count(doc_type)
+
+        return stats