""" Processing Orchestrator - Coordinates document processing across tracks. This module provides a unified orchestration layer for document processing, separating the high-level flow control from track-specific implementations. """ import logging import time from abc import ABC, abstractmethod from dataclasses import dataclass, field from enum import Enum from pathlib import Path from typing import Any, Dict, List, Optional, Tuple, Union from app.models.unified_document import ( ProcessingTrack, UnifiedDocument, DocumentMetadata, ElementType, ) logger = logging.getLogger(__name__) # ============================================================================ # Data Classes # ============================================================================ @dataclass class ProcessingConfig: """Configuration for document processing.""" detect_layout: bool = True confidence_threshold: float = 0.5 output_dir: Optional[Path] = None lang: str = "ch" layout_model: str = "ppyolov2_r50vd_dcn_365e_publaynet" preprocessing_mode: str = "auto" preprocessing_config: Optional[Dict] = None table_detection_config: Optional[Dict] = None force_track: Optional[str] = None # "direct" or "ocr" use_dual_track: bool = True @dataclass class TrackRecommendation: """Recommendation for which processing track to use.""" track: ProcessingTrack confidence: float reason: str metrics: Dict[str, Any] = field(default_factory=dict) @dataclass class ProcessingResult: """Result of document processing.""" document: Optional[UnifiedDocument] = None legacy_result: Optional[Dict] = None track_used: ProcessingTrack = ProcessingTrack.DIRECT processing_time: float = 0.0 success: bool = True error: Optional[str] = None # ============================================================================ # Pipeline Interface # ============================================================================ class ProcessingPipeline(ABC): """Abstract base class for processing pipelines.""" @property @abstractmethod def track_type(self) -> ProcessingTrack: """Return the processing track type for this pipeline.""" pass @abstractmethod def process( self, file_path: Path, config: ProcessingConfig ) -> ProcessingResult: """ Process a document through this pipeline. Args: file_path: Path to the document config: Processing configuration Returns: ProcessingResult with the extracted document """ pass @abstractmethod def can_process(self, file_path: Path) -> bool: """ Check if this pipeline can process the given file. 


# ============================================================================
# Direct Track Pipeline
# ============================================================================

class DirectPipeline(ProcessingPipeline):
    """Pipeline for processing editable PDFs via direct text extraction."""

    def __init__(self):
        self._engine = None
        self._office_converter = None

    @property
    def track_type(self) -> ProcessingTrack:
        return ProcessingTrack.DIRECT

    @property
    def engine(self):
        """Lazy-load DirectExtractionEngine."""
        if self._engine is None:
            from app.services.direct_extraction_engine import DirectExtractionEngine
            self._engine = DirectExtractionEngine(
                enable_table_detection=True,
                enable_image_extraction=True,
                min_image_area=200.0,
                enable_whiteout_detection=True,
                enable_content_sanitization=True
            )
        return self._engine

    @property
    def office_converter(self):
        """Lazy-load OfficeConverter."""
        if self._office_converter is None:
            from app.services.office_converter import OfficeConverter
            self._office_converter = OfficeConverter()
        return self._office_converter

    def can_process(self, file_path: Path) -> bool:
        """Check if file is processable (PDF or Office document)."""
        suffix = file_path.suffix.lower()
        return suffix in ['.pdf', '.docx', '.doc', '.xlsx', '.xls', '.pptx', '.ppt']

    def process(
        self,
        file_path: Path,
        config: ProcessingConfig
    ) -> ProcessingResult:
        """Process document using direct text extraction."""
        start_time = time.time()

        try:
            logger.info(f"DirectPipeline: Processing {file_path.name}")

            # Handle Office document conversion
            actual_path = file_path
            if self._is_office_document(file_path):
                actual_path = self._convert_office_to_pdf(file_path, config.output_dir)
                if actual_path is None:
                    return ProcessingResult(
                        success=False,
                        error=f"Failed to convert Office document: {file_path.name}",
                        track_used=ProcessingTrack.DIRECT,
                        processing_time=time.time() - start_time
                    )

            # Extract document
            unified_doc = self.engine.extract(
                actual_path,
                output_dir=config.output_dir
            )

            processing_time = time.time() - start_time

            # Update metadata
            if unified_doc.metadata is None:
                unified_doc.metadata = DocumentMetadata()
            unified_doc.metadata.processing_track = ProcessingTrack.DIRECT
            unified_doc.metadata.processing_time = processing_time

            logger.info(f"DirectPipeline: Completed in {processing_time:.2f}s, "
                        f"{len(unified_doc.pages)} pages extracted")

            return ProcessingResult(
                document=unified_doc,
                track_used=ProcessingTrack.DIRECT,
                processing_time=processing_time,
                success=True
            )

        except Exception as e:
            logger.error(f"DirectPipeline: Error processing {file_path.name}: {e}")
            import traceback
            traceback.print_exc()
            return ProcessingResult(
                success=False,
                error=str(e),
                track_used=ProcessingTrack.DIRECT,
                processing_time=time.time() - start_time
            )

    def check_for_missing_images(
        self,
        file_path: Path,
        unified_doc: UnifiedDocument
    ) -> List[int]:
        """
        Check if document has pages with missing inline images.

        Args:
            file_path: Path to the PDF
            unified_doc: Extracted document (accepted for interface symmetry;
                the current check re-reads the PDF directly)

        Returns:
            List of page indices with missing images
        """
        return self.engine.check_document_for_missing_images(file_path)

    def render_missing_images(
        self,
        file_path: Path,
        unified_doc: UnifiedDocument,
        page_list: List[int],
        output_dir: Path
    ) -> UnifiedDocument:
        """
        Render inline image regions that couldn't be extracted.

        Args:
            file_path: Path to the PDF
            unified_doc: Document to update
            page_list: Pages with missing images
            output_dir: Directory for output images

        Returns:
            Updated UnifiedDocument
        """
        return self.engine.render_inline_image_regions(
            file_path, unified_doc, page_list, output_dir
        )

    def _is_office_document(self, file_path: Path) -> bool:
        """Check if file is an Office document."""
        return self.office_converter.is_office_document(file_path)

    def _convert_office_to_pdf(
        self,
        file_path: Path,
        output_dir: Optional[Path]
    ) -> Optional[Path]:
        """Convert Office document to PDF."""
        try:
            return self.office_converter.convert_to_pdf(file_path, output_dir)
        except Exception as e:
            logger.error(f"Office conversion failed: {e}")
            return None
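

# Illustrative sketch (never called by this module): driving DirectPipeline's
# missing-image repair by hand, outside the orchestrator's hybrid mode. The
# file and output paths here are hypothetical placeholders.
def _example_direct_image_repair() -> Optional[UnifiedDocument]:
    pipeline = DirectPipeline()
    config = ProcessingConfig(output_dir=Path("output"))  # hypothetical directory
    result = pipeline.process(Path("report.pdf"), config)  # hypothetical file
    if not result.success or result.document is None:
        return None
    missing = pipeline.check_for_missing_images(Path("report.pdf"), result.document)
    if missing and config.output_dir:
        # Rasterize the inline regions that direct extraction could not pull out
        return pipeline.render_missing_images(
            Path("report.pdf"), result.document, missing, config.output_dir
        )
    return result.document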


# ============================================================================
# OCR Track Pipeline
# ============================================================================

class OCRPipeline(ProcessingPipeline):
    """Pipeline for processing scanned documents via OCR."""

    def __init__(self):
        self._ocr_service = None
        self._converter = None

    @property
    def track_type(self) -> ProcessingTrack:
        return ProcessingTrack.OCR

    @property
    def ocr_service(self):
        """
        Get a reference to the OCR service.

        Note: importing the OCR service here would create a circular
        dependency, so the OCRPipeline must receive it via set_ocr_service()
        instead (dependency injection).
        """
        if self._ocr_service is None:
            raise RuntimeError(
                "OCRPipeline requires OCR service to be set via set_ocr_service()"
            )
        return self._ocr_service

    def set_ocr_service(self, service):
        """Set the OCR service for this pipeline (dependency injection)."""
        self._ocr_service = service

    @property
    def converter(self):
        """Lazy-load the OCR-to-UnifiedDocument converter."""
        if self._converter is None:
            from app.services.ocr_to_unified_converter import OCRToUnifiedConverter
            self._converter = OCRToUnifiedConverter()
        return self._converter

    def can_process(self, file_path: Path) -> bool:
        """Check if file is processable (images or PDFs)."""
        suffix = file_path.suffix.lower()
        return suffix in ['.pdf', '.png', '.jpg', '.jpeg', '.tiff', '.tif', '.bmp']

    def process(
        self,
        file_path: Path,
        config: ProcessingConfig
    ) -> ProcessingResult:
        """Process document using OCR."""
        start_time = time.time()

        try:
            logger.info(f"OCRPipeline: Processing {file_path.name}")

            # Use OCR service's traditional processing
            ocr_result = self.ocr_service.process_file_traditional(
                file_path,
                detect_layout=config.detect_layout,
                confidence_threshold=config.confidence_threshold,
                output_dir=config.output_dir,
                lang=config.lang,
                layout_model=config.layout_model,
                preprocessing_mode=config.preprocessing_mode,
                preprocessing_config=config.preprocessing_config,
                table_detection_config=config.table_detection_config
            )

            processing_time = time.time() - start_time

            # Convert to UnifiedDocument
            unified_doc = self.converter.convert(
                ocr_result,
                file_path,
                processing_time,
                config.lang
            )

            # Update metadata
            if unified_doc.metadata is None:
                unified_doc.metadata = DocumentMetadata()
            unified_doc.metadata.processing_track = ProcessingTrack.OCR
            unified_doc.metadata.processing_time = processing_time

            logger.info(f"OCRPipeline: Completed in {processing_time:.2f}s")

            return ProcessingResult(
                document=unified_doc,
                legacy_result=ocr_result,
                track_used=ProcessingTrack.OCR,
                processing_time=processing_time,
                success=True
            )

        except Exception as e:
            logger.error(f"OCRPipeline: Error processing {file_path.name}: {e}")
            import traceback
            traceback.print_exc()
            return ProcessingResult(
                success=False,
                error=str(e),
                track_used=ProcessingTrack.OCR,
                processing_time=time.time() - start_time
            )
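

# Illustrative sketch (never called by this module): OCRPipeline deliberately
# has no default service, so a caller must inject one before processing. Any
# object exposing process_file_traditional(...) with the keyword arguments
# used above would satisfy the pipeline; the input file is hypothetical.
def _example_ocr_injection(ocr_service) -> ProcessingResult:
    pipeline = OCRPipeline()
    pipeline.set_ocr_service(ocr_service)  # dependency injection, as required
    return pipeline.process(Path("scan.png"), ProcessingConfig(lang="en"))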


# ============================================================================
# Processing Orchestrator
# ============================================================================

class ProcessingOrchestrator:
    """
    Orchestrates document processing across Direct and OCR tracks.

    This class coordinates the high-level processing flow:
    1. Determines the optimal processing track
    2. Routes to the appropriate pipeline
    3. Handles hybrid mode (Direct + OCR fallback)
    4. Manages result format conversion
    """

    def __init__(self):
        self._document_detector = None
        self._direct_pipeline = DirectPipeline()
        self._ocr_pipeline = OCRPipeline()

    @property
    def document_detector(self):
        """Lazy-load DocumentTypeDetector."""
        if self._document_detector is None:
            from app.services.document_type_detector import DocumentTypeDetector
            self._document_detector = DocumentTypeDetector()
        return self._document_detector

    @property
    def direct_pipeline(self) -> DirectPipeline:
        return self._direct_pipeline

    @property
    def ocr_pipeline(self) -> OCRPipeline:
        return self._ocr_pipeline

    def set_ocr_service(self, service):
        """Set OCR service for the OCR pipeline (dependency injection)."""
        self._ocr_pipeline.set_ocr_service(service)

    def determine_processing_track(
        self,
        file_path: Path,
        force_track: Optional[str] = None
    ) -> TrackRecommendation:
        """
        Determine the optimal processing track for a document.

        Args:
            file_path: Path to the document
            force_track: Optional override ("direct" or "ocr")

        Returns:
            TrackRecommendation with track, confidence, and reason
        """
        # Handle forced track
        if force_track:
            track = ProcessingTrack.DIRECT if force_track == "direct" else ProcessingTrack.OCR
            return TrackRecommendation(
                track=track,
                confidence=1.0,
                reason=f"Forced to use {force_track} track"
            )

        # Use document detector
        try:
            recommendation = self.document_detector.detect(file_path)

            # Convert string track to ProcessingTrack enum
            track_str = recommendation.track
            if isinstance(track_str, str):
                track = ProcessingTrack.DIRECT if track_str == "direct" else ProcessingTrack.OCR
            else:
                track = track_str  # Already an enum

            return TrackRecommendation(
                track=track,
                confidence=recommendation.confidence,
                reason=recommendation.reason,
                metrics=getattr(recommendation, 'metrics', {})
            )
        except Exception as e:
            logger.warning(f"Document detection failed: {e}, defaulting to DIRECT")
            return TrackRecommendation(
                track=ProcessingTrack.DIRECT,
                confidence=0.5,
                reason=f"Detection failed ({e}), using default"
            )
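
    # Illustrative note: forcing a track bypasses DocumentTypeDetector
    # entirely and always reports confidence 1.0. Any non-"direct" string
    # (including a typo such as "OCR") silently falls through to the OCR
    # branch, so callers should validate force_track themselves. A hedged
    # example with a hypothetical file:
    #
    #     orchestrator.determine_processing_track(
    #         Path("contract.pdf"), force_track="ocr"
    #     )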

    def process(
        self,
        file_path: Path,
        config: ProcessingConfig
    ) -> ProcessingResult:
        """
        Process a document using the optimal track.

        Args:
            file_path: Path to the document
            config: Processing configuration

        Returns:
            ProcessingResult with extracted document
        """
        file_path = Path(file_path)
        start_time = time.time()

        logger.info(f"ProcessingOrchestrator: Processing {file_path.name}")

        # Determine track
        recommendation = self.determine_processing_track(
            file_path, config.force_track
        )

        logger.info(f"Track recommendation: {recommendation.track.value} "
                    f"(confidence: {recommendation.confidence:.2f}, "
                    f"reason: {recommendation.reason})")

        # Route to appropriate pipeline
        if recommendation.track == ProcessingTrack.DIRECT:
            result = self._execute_direct_with_fallback(file_path, config)
        else:
            result = self._ocr_pipeline.process(file_path, config)

        # Update total processing time
        result.processing_time = time.time() - start_time

        return result

    def _execute_direct_with_fallback(
        self,
        file_path: Path,
        config: ProcessingConfig
    ) -> ProcessingResult:
        """
        Execute direct track with hybrid fallback for missing images.

        Args:
            file_path: Path to the document
            config: Processing configuration

        Returns:
            ProcessingResult (may be HYBRID if OCR was used for images)
        """
        # Run direct extraction
        result = self._direct_pipeline.process(file_path, config)

        if not result.success or result.document is None:
            logger.warning("Direct extraction failed, falling back to OCR")
            return self._ocr_pipeline.process(file_path, config)

        # Check for missing images
        try:
            missing_pages = self._direct_pipeline.check_for_missing_images(
                file_path, result.document
            )

            if missing_pages:
                logger.info(f"Found {len(missing_pages)} pages with missing images, "
                            f"entering hybrid mode")
                return self._execute_hybrid(
                    file_path, config, result.document, missing_pages
                )
        except Exception as e:
            logger.warning(f"Missing image check failed: {e}")

        return result

    def _execute_hybrid(
        self,
        file_path: Path,
        config: ProcessingConfig,
        direct_doc: UnifiedDocument,
        missing_pages: List[int]
    ) -> ProcessingResult:
        """
        Execute hybrid mode: direct extraction plus OCR for missing images.

        Args:
            file_path: Path to the document
            config: Processing configuration
            direct_doc: Document from direct extraction
            missing_pages: Pages with missing images

        Returns:
            ProcessingResult with HYBRID track
        """
        start_time = time.time()

        try:
            # Try OCR for missing images
            ocr_result = self._ocr_pipeline.process(file_path, config)

            if ocr_result.success and ocr_result.document:
                # Merge OCR images into direct result
                images_added = self._merge_ocr_images(
                    direct_doc, ocr_result.document, missing_pages
                )
                logger.info(f"Hybrid mode: Added {images_added} images from OCR")
            else:
                # Fallback: render inline images directly
                logger.warning("OCR failed, rendering inline images as fallback")
                if config.output_dir:
                    direct_doc = self._direct_pipeline.render_missing_images(
                        file_path, direct_doc, missing_pages, config.output_dir
                    )

            # Update metadata
            if direct_doc.metadata is None:
                direct_doc.metadata = DocumentMetadata()
            direct_doc.metadata.processing_track = ProcessingTrack.HYBRID

            return ProcessingResult(
                document=direct_doc,
                track_used=ProcessingTrack.HYBRID,
                processing_time=time.time() - start_time,
                success=True
            )

        except Exception as e:
            logger.error(f"Hybrid processing failed: {e}")
            # Return direct result as-is
            return ProcessingResult(
                document=direct_doc,
                track_used=ProcessingTrack.DIRECT,
                processing_time=time.time() - start_time,
                success=True
            )

    def _merge_ocr_images(
        self,
        direct_doc: UnifiedDocument,
        ocr_doc: UnifiedDocument,
        target_pages: List[int]
    ) -> int:
        """
        Merge image elements from OCR result into direct result.
        Args:
            direct_doc: Target document
            ocr_doc: Source document with images
            target_pages: Page indices to merge images from

        Returns:
            Number of images added
        """
        images_added = 0

        for page_idx in target_pages:
            if page_idx >= len(direct_doc.pages) or page_idx >= len(ocr_doc.pages):
                continue

            direct_page = direct_doc.pages[page_idx]
            ocr_page = ocr_doc.pages[page_idx]

            # Find image elements in OCR result
            for elem in ocr_page.elements:
                if elem.type in [
                    ElementType.IMAGE, ElementType.FIGURE, ElementType.CHART,
                    ElementType.DIAGRAM, ElementType.LOGO, ElementType.STAMP
                ]:
                    # Generate unique element ID
                    elem.element_id = f"ocr_img_{page_idx}_{images_added}"
                    direct_page.elements.append(elem)
                    images_added += 1

        return images_added
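

# ============================================================================
# Usage Sketch
# ============================================================================
# A minimal end-to-end example, guarded so it never runs on import. The OCR
# service construction is elided because its concrete class lives outside this
# module; "sample.pdf" and the output directory are hypothetical.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    orchestrator = ProcessingOrchestrator()
    # orchestrator.set_ocr_service(ocr_service)  # inject a real OCR service here

    result = orchestrator.process(
        Path("sample.pdf"),
        ProcessingConfig(output_dir=Path("output"), lang="en")
    )
    if result.success and result.document is not None:
        print(f"Track: {result.track_used.value}, "
              f"pages: {len(result.document.pages)}, "
              f"time: {result.processing_time:.2f}s")
    else:
        print(f"Processing failed: {result.error}")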