## Backend Changes

- **Service Layer Refactoring**:
  - Add ProcessingOrchestrator for unified document processing (see the usage sketch below)
  - Add PDFTableRenderer, extracting table rendering into a dedicated service
  - Add PDFFontManager for font management with CJK support
  - Add MemoryPolicyEngine (73% code reduction from MemoryGuard)
- **Bug Fixes**:
  - Fix Direct Track table row span calculation
  - Fix OCR Track image path handling
  - Add cell_boxes coordinate validation
  - Filter out small decorative images
  - Add covering image detection

## Frontend Changes

- **State Management**:
  - Add TaskStore for centralized task state management
  - Add localStorage persistence for recent tasks
  - Add processing state tracking
- **Type Consolidation**:
  - Merge shared types from api.ts into apiV2.ts
  - Update imports in authStore, uploadStore, ResultsTable, SettingsPage
- **Page Integration**:
  - Integrate TaskStore in ProcessingPage and TaskDetailPage
  - Update useTaskValidation hook with cache sync

## Testing

- Direct Track: edit.pdf (3 pages, 1.281s), edit3.pdf (2 pages, 0.203s)
- Cell boxes validation: 43 valid, 0 invalid
- Table merging: 12 merged cells verified

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
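A minimal usage sketch of the new orchestrator, based on the public API in the module below. The import path, the `ocr_service` object, and the file names are assumptions for illustration, not confirmed project details:

```python
from pathlib import Path

# Assumed module path; adjust to where the orchestrator actually lives.
from app.services.processing_orchestrator import (
    ProcessingConfig,
    ProcessingOrchestrator,
)

orchestrator = ProcessingOrchestrator()
# Hypothetical wiring: the concrete OCR service is built by the app layer
# and injected here to avoid a circular import (see OCRPipeline below).
orchestrator.set_ocr_service(ocr_service)  # ocr_service constructed elsewhere

config = ProcessingConfig(output_dir=Path("output"), lang="ch")
result = orchestrator.process(Path("sample.pdf"), config)

if result.success:
    print(result.track_used, f"{result.processing_time:.2f}s")
```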
646 lines · 21 KiB · Python
"""
|
|
Processing Orchestrator - Coordinates document processing across tracks.
|
|
|
|
This module provides a unified orchestration layer for document processing,
|
|
separating the high-level flow control from track-specific implementations.
|
|
"""
|
|
|
|
import logging
|
|
import time
|
|
from abc import ABC, abstractmethod
|
|
from dataclasses import dataclass, field
|
|
from enum import Enum
|
|
from pathlib import Path
|
|
from typing import Any, Dict, List, Optional, Tuple, Union
|
|
|
|
from app.models.unified_document import (
|
|
ProcessingTrack,
|
|
UnifiedDocument,
|
|
DocumentMetadata,
|
|
ElementType,
|
|
)
|
|
|
|
logger = logging.getLogger(__name__)


# ============================================================================
# Data Classes
# ============================================================================

@dataclass
class ProcessingConfig:
    """Configuration for document processing."""
    detect_layout: bool = True
    confidence_threshold: float = 0.5
    output_dir: Optional[Path] = None
    lang: str = "ch"
    layout_model: str = "ppyolov2_r50vd_dcn_365e_publaynet"
    preprocessing_mode: str = "auto"
    preprocessing_config: Optional[Dict] = None
    table_detection_config: Optional[Dict] = None
    force_track: Optional[str] = None  # "direct" or "ocr"
    use_dual_track: bool = True
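
# Example (sketch): constructing a config that forces the OCR track with a
# stricter confidence cutoff. Field names come from ProcessingConfig above;
# the specific values are illustrative, not project defaults.
#
#   config = ProcessingConfig(
#       force_track="ocr",
#       confidence_threshold=0.7,
#       output_dir=Path("output/run1"),
#   )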


@dataclass
class TrackRecommendation:
    """Recommendation for which processing track to use."""
    track: ProcessingTrack
    confidence: float
    reason: str
    metrics: Dict[str, Any] = field(default_factory=dict)


@dataclass
class ProcessingResult:
    """Result of document processing."""
    document: Optional[UnifiedDocument] = None
    legacy_result: Optional[Dict] = None
    track_used: ProcessingTrack = ProcessingTrack.DIRECT
    processing_time: float = 0.0
    success: bool = True
    error: Optional[str] = None


# ============================================================================
# Pipeline Interface
# ============================================================================

class ProcessingPipeline(ABC):
    """Abstract base class for processing pipelines."""

    @property
    @abstractmethod
    def track_type(self) -> ProcessingTrack:
        """Return the processing track type for this pipeline."""
        pass

    @abstractmethod
    def process(
        self,
        file_path: Path,
        config: ProcessingConfig
    ) -> ProcessingResult:
        """
        Process a document through this pipeline.

        Args:
            file_path: Path to the document
            config: Processing configuration

        Returns:
            ProcessingResult with the extracted document
        """
        pass

    @abstractmethod
    def can_process(self, file_path: Path) -> bool:
        """
        Check if this pipeline can process the given file.

        Args:
            file_path: Path to the document

        Returns:
            True if the pipeline can process this file type
        """
        pass
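
# Example (sketch): a minimal concrete pipeline satisfying the ABC contract.
# The class below is hypothetical and only illustrates the three required
# members; it is not part of this module.
#
#   class PlainTextPipeline(ProcessingPipeline):
#       @property
#       def track_type(self) -> ProcessingTrack:
#           return ProcessingTrack.DIRECT
#
#       def can_process(self, file_path: Path) -> bool:
#           return file_path.suffix.lower() == ".txt"
#
#       def process(self, file_path: Path, config: ProcessingConfig) -> ProcessingResult:
#           text = file_path.read_text(encoding="utf-8")
#           ...  # build a UnifiedDocument from the raw text
#           return ProcessingResult(success=True)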


# ============================================================================
# Direct Track Pipeline
# ============================================================================

class DirectPipeline(ProcessingPipeline):
    """Pipeline for processing editable PDFs via direct text extraction."""

    def __init__(self):
        self._engine = None
        self._office_converter = None

    @property
    def track_type(self) -> ProcessingTrack:
        return ProcessingTrack.DIRECT

    @property
    def engine(self):
        """Lazy-load DirectExtractionEngine."""
        if self._engine is None:
            from app.services.direct_extraction_engine import DirectExtractionEngine
            self._engine = DirectExtractionEngine(
                enable_table_detection=True,
                enable_image_extraction=True,
                min_image_area=200.0,
                enable_whiteout_detection=True,
                enable_content_sanitization=True
            )
        return self._engine

    @property
    def office_converter(self):
        """Lazy-load OfficeConverter."""
        if self._office_converter is None:
            from app.services.office_converter import OfficeConverter
            self._office_converter = OfficeConverter()
        return self._office_converter

    def can_process(self, file_path: Path) -> bool:
        """Check if file is processable (PDF or Office document)."""
        suffix = file_path.suffix.lower()
        return suffix in ['.pdf', '.docx', '.doc', '.xlsx', '.xls', '.pptx', '.ppt']

    def process(
        self,
        file_path: Path,
        config: ProcessingConfig
    ) -> ProcessingResult:
        """Process document using direct text extraction."""
        start_time = time.time()

        try:
            logger.info(f"DirectPipeline: Processing {file_path.name}")

            # Handle Office document conversion
            actual_path = file_path
            if self._is_office_document(file_path):
                actual_path = self._convert_office_to_pdf(file_path, config.output_dir)
                if actual_path is None:
                    return ProcessingResult(
                        success=False,
                        error=f"Failed to convert Office document: {file_path.name}",
                        track_used=ProcessingTrack.DIRECT,
                        processing_time=time.time() - start_time
                    )

            # Extract document
            unified_doc = self.engine.extract(
                actual_path,
                output_dir=config.output_dir
            )

            processing_time = time.time() - start_time

            # Update metadata
            if unified_doc.metadata is None:
                unified_doc.metadata = DocumentMetadata()
            unified_doc.metadata.processing_track = ProcessingTrack.DIRECT
            unified_doc.metadata.processing_time = processing_time

            logger.info(f"DirectPipeline: Completed in {processing_time:.2f}s, "
                        f"{len(unified_doc.pages)} pages extracted")

            return ProcessingResult(
                document=unified_doc,
                track_used=ProcessingTrack.DIRECT,
                processing_time=processing_time,
                success=True
            )

        except Exception as e:
            logger.error(f"DirectPipeline: Error processing {file_path.name}: {e}")
            import traceback
            traceback.print_exc()
            return ProcessingResult(
                success=False,
                error=str(e),
                track_used=ProcessingTrack.DIRECT,
                processing_time=time.time() - start_time
            )
|
|
|
|
def check_for_missing_images(
|
|
self,
|
|
file_path: Path,
|
|
unified_doc: UnifiedDocument
|
|
) -> List[int]:
|
|
"""
|
|
Check if document has pages with missing inline images.
|
|
|
|
Args:
|
|
file_path: Path to the PDF
|
|
unified_doc: Extracted document
|
|
|
|
Returns:
|
|
List of page indices with missing images
|
|
"""
|
|
return self.engine.check_document_for_missing_images(file_path)
|
|
|
|
def render_missing_images(
|
|
self,
|
|
file_path: Path,
|
|
unified_doc: UnifiedDocument,
|
|
page_list: List[int],
|
|
output_dir: Path
|
|
) -> UnifiedDocument:
|
|
"""
|
|
Render inline image regions that couldn't be extracted.
|
|
|
|
Args:
|
|
file_path: Path to the PDF
|
|
unified_doc: Document to update
|
|
page_list: Pages with missing images
|
|
output_dir: Directory for output images
|
|
|
|
Returns:
|
|
Updated UnifiedDocument
|
|
"""
|
|
return self.engine.render_inline_image_regions(
|
|
file_path, unified_doc, page_list, output_dir
|
|
)

    def _is_office_document(self, file_path: Path) -> bool:
        """Check if file is an Office document."""
        return self.office_converter.is_office_document(file_path)

    def _convert_office_to_pdf(
        self,
        file_path: Path,
        output_dir: Optional[Path]
    ) -> Optional[Path]:
        """Convert Office document to PDF."""
        try:
            return self.office_converter.convert_to_pdf(file_path, output_dir)
        except Exception as e:
            logger.error(f"Office conversion failed: {e}")
            return None


# ============================================================================
# OCR Track Pipeline
# ============================================================================

class OCRPipeline(ProcessingPipeline):
    """Pipeline for processing scanned documents via OCR."""

    def __init__(self):
        self._ocr_service = None
        self._converter = None

    @property
    def track_type(self) -> ProcessingTrack:
        return ProcessingTrack.OCR

    @property
    def ocr_service(self):
        """
        Get a reference to the OCR service.

        Note: importing the concrete OCR service here would create a circular
        dependency, so the pipeline must instead receive the service via
        set_ocr_service() (dependency injection).
        """
        if self._ocr_service is None:
            raise RuntimeError(
                "OCRPipeline requires OCR service to be set via set_ocr_service()"
            )
        return self._ocr_service

    def set_ocr_service(self, service):
        """Set the OCR service for this pipeline (dependency injection)."""
        self._ocr_service = service
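
    # Wiring sketch (hypothetical caller, following the contract above): the
    # application layer builds its OCR service and injects it before any call
    # to process():
    #
    #   pipeline = OCRPipeline()
    #   pipeline.set_ocr_service(ocr_service)  # ocr_service built elsewhere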

    @property
    def converter(self):
        """Lazy-load OCR to Unified converter."""
        if self._converter is None:
            from app.services.ocr_to_unified_converter import OCRToUnifiedConverter
            self._converter = OCRToUnifiedConverter()
        return self._converter

    def can_process(self, file_path: Path) -> bool:
        """Check if file is processable (images or PDFs)."""
        suffix = file_path.suffix.lower()
        return suffix in ['.pdf', '.png', '.jpg', '.jpeg', '.tiff', '.tif', '.bmp']

    def process(
        self,
        file_path: Path,
        config: ProcessingConfig
    ) -> ProcessingResult:
        """Process document using OCR."""
        start_time = time.time()

        try:
            logger.info(f"OCRPipeline: Processing {file_path.name}")

            # Use OCR service's traditional processing
            ocr_result = self.ocr_service.process_file_traditional(
                file_path,
                detect_layout=config.detect_layout,
                confidence_threshold=config.confidence_threshold,
                output_dir=config.output_dir,
                lang=config.lang,
                layout_model=config.layout_model,
                preprocessing_mode=config.preprocessing_mode,
                preprocessing_config=config.preprocessing_config,
                table_detection_config=config.table_detection_config
            )

            processing_time = time.time() - start_time

            # Convert to UnifiedDocument
            unified_doc = self.converter.convert(
                ocr_result,
                file_path,
                processing_time,
                config.lang
            )

            # Update metadata
            if unified_doc.metadata is None:
                unified_doc.metadata = DocumentMetadata()
            unified_doc.metadata.processing_track = ProcessingTrack.OCR
            unified_doc.metadata.processing_time = processing_time

            logger.info(f"OCRPipeline: Completed in {processing_time:.2f}s")

            return ProcessingResult(
                document=unified_doc,
                legacy_result=ocr_result,
                track_used=ProcessingTrack.OCR,
                processing_time=processing_time,
                success=True
            )

        except Exception as e:
            logger.error(f"OCRPipeline: Error processing {file_path.name}: {e}")
            import traceback
            traceback.print_exc()
            return ProcessingResult(
                success=False,
                error=str(e),
                track_used=ProcessingTrack.OCR,
                processing_time=time.time() - start_time
            )


# ============================================================================
# Processing Orchestrator
# ============================================================================

class ProcessingOrchestrator:
    """
    Orchestrates document processing across Direct and OCR tracks.

    This class coordinates the high-level processing flow:
    1. Determines the optimal processing track
    2. Routes to the appropriate pipeline
    3. Handles hybrid mode (Direct + OCR fallback)
    4. Manages result format conversion
    """

    def __init__(self):
        self._document_detector = None
        self._direct_pipeline = DirectPipeline()
        self._ocr_pipeline = OCRPipeline()

    @property
    def document_detector(self):
        """Lazy-load DocumentTypeDetector."""
        if self._document_detector is None:
            from app.services.document_type_detector import DocumentTypeDetector
            self._document_detector = DocumentTypeDetector()
        return self._document_detector

    @property
    def direct_pipeline(self) -> DirectPipeline:
        return self._direct_pipeline

    @property
    def ocr_pipeline(self) -> OCRPipeline:
        return self._ocr_pipeline

    def set_ocr_service(self, service):
        """Set OCR service for the OCR pipeline (dependency injection)."""
        self._ocr_pipeline.set_ocr_service(service)

    def determine_processing_track(
        self,
        file_path: Path,
        force_track: Optional[str] = None
    ) -> TrackRecommendation:
        """
        Determine the optimal processing track for a document.

        Args:
            file_path: Path to the document
            force_track: Optional override ("direct" or "ocr")

        Returns:
            TrackRecommendation with track, confidence, and reason
        """
        # Handle forced track
        if force_track:
            track = ProcessingTrack.DIRECT if force_track == "direct" else ProcessingTrack.OCR
            return TrackRecommendation(
                track=track,
                confidence=1.0,
                reason=f"Forced to use {force_track} track"
            )

        # Use document detector
        try:
            recommendation = self.document_detector.detect(file_path)
            # Convert string track to ProcessingTrack enum
            track_str = recommendation.track
            if isinstance(track_str, str):
                track = ProcessingTrack.DIRECT if track_str == "direct" else ProcessingTrack.OCR
            else:
                track = track_str  # Already an enum
            return TrackRecommendation(
                track=track,
                confidence=recommendation.confidence,
                reason=recommendation.reason,
                metrics=getattr(recommendation, 'metrics', {})
            )
        except Exception as e:
            logger.warning(f"Document detection failed: {e}, defaulting to DIRECT")
            return TrackRecommendation(
                track=ProcessingTrack.DIRECT,
                confidence=0.5,
                reason=f"Detection failed ({e}), using default"
            )
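
    # Example (sketch): inspecting a recommendation before committing to a
    # full process() call. Attribute names come from TrackRecommendation;
    # the file path and threshold are illustrative.
    #
    #   rec = orchestrator.determine_processing_track(Path("report.pdf"))
    #   if rec.track is ProcessingTrack.OCR and rec.confidence < 0.6:
    #       logger.info("Low-confidence OCR recommendation: %s", rec.reason)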

    def process(
        self,
        file_path: Path,
        config: ProcessingConfig
    ) -> ProcessingResult:
        """
        Process a document using the optimal track.

        Args:
            file_path: Path to the document
            config: Processing configuration

        Returns:
            ProcessingResult with extracted document
        """
        file_path = Path(file_path)
        start_time = time.time()

        logger.info(f"ProcessingOrchestrator: Processing {file_path.name}")

        # Determine track
        recommendation = self.determine_processing_track(
            file_path,
            config.force_track
        )

        logger.info(f"Track recommendation: {recommendation.track.value} "
                    f"(confidence: {recommendation.confidence:.2f}, "
                    f"reason: {recommendation.reason})")

        # Route to appropriate pipeline
        if recommendation.track == ProcessingTrack.DIRECT:
            result = self._execute_direct_with_fallback(file_path, config)
        else:
            result = self._ocr_pipeline.process(file_path, config)

        # Update total processing time
        result.processing_time = time.time() - start_time

        return result

    def _execute_direct_with_fallback(
        self,
        file_path: Path,
        config: ProcessingConfig
    ) -> ProcessingResult:
        """
        Execute direct track with hybrid fallback for missing images.

        Args:
            file_path: Path to the document
            config: Processing configuration

        Returns:
            ProcessingResult (may be HYBRID if OCR was used for images)
        """
        # Run direct extraction
        result = self._direct_pipeline.process(file_path, config)

        if not result.success or result.document is None:
            logger.warning("Direct extraction failed, falling back to OCR")
            return self._ocr_pipeline.process(file_path, config)

        # Check for missing images
        try:
            missing_pages = self._direct_pipeline.check_for_missing_images(
                file_path, result.document
            )

            if missing_pages:
                logger.info(f"Found {len(missing_pages)} pages with missing images, "
                            f"entering hybrid mode")
                return self._execute_hybrid(
                    file_path, config, result.document, missing_pages
                )
        except Exception as e:
            logger.warning(f"Missing image check failed: {e}")

        return result

    def _execute_hybrid(
        self,
        file_path: Path,
        config: ProcessingConfig,
        direct_doc: UnifiedDocument,
        missing_pages: List[int]
    ) -> ProcessingResult:
        """
        Execute hybrid mode: Direct extraction + OCR for missing images.

        Args:
            file_path: Path to the document
            config: Processing configuration
            direct_doc: Document from direct extraction
            missing_pages: Pages with missing images

        Returns:
            ProcessingResult with HYBRID track
        """
        start_time = time.time()

        try:
            # Try OCR for missing images
            ocr_result = self._ocr_pipeline.process(file_path, config)

            if ocr_result.success and ocr_result.document:
                # Merge OCR images into direct result
                images_added = self._merge_ocr_images(
                    direct_doc,
                    ocr_result.document,
                    missing_pages
                )
                logger.info(f"Hybrid mode: Added {images_added} images from OCR")
            else:
                # Fallback: render inline images directly
                logger.warning("OCR failed, rendering inline images as fallback")
                if config.output_dir:
                    direct_doc = self._direct_pipeline.render_missing_images(
                        file_path,
                        direct_doc,
                        missing_pages,
                        config.output_dir
                    )

            # Update metadata
            if direct_doc.metadata is None:
                direct_doc.metadata = DocumentMetadata()
            direct_doc.metadata.processing_track = ProcessingTrack.HYBRID

            return ProcessingResult(
                document=direct_doc,
                track_used=ProcessingTrack.HYBRID,
                processing_time=time.time() - start_time,
                success=True
            )

        except Exception as e:
            logger.error(f"Hybrid processing failed: {e}")
            # Return direct result as-is
            return ProcessingResult(
                document=direct_doc,
                track_used=ProcessingTrack.DIRECT,
                processing_time=time.time() - start_time,
                success=True
            )

    def _merge_ocr_images(
        self,
        direct_doc: UnifiedDocument,
        ocr_doc: UnifiedDocument,
        target_pages: List[int]
    ) -> int:
        """
        Merge image elements from OCR result into direct result.

        Args:
            direct_doc: Target document
            ocr_doc: Source document with images
            target_pages: Page indices to merge images from

        Returns:
            Number of images added
        """
        images_added = 0

        for page_idx in target_pages:
            if page_idx >= len(direct_doc.pages) or page_idx >= len(ocr_doc.pages):
                continue

            direct_page = direct_doc.pages[page_idx]
            ocr_page = ocr_doc.pages[page_idx]

            # Find image elements in OCR result
            for elem in ocr_page.elements:
                if elem.type in [
                    ElementType.IMAGE, ElementType.FIGURE,
                    ElementType.CHART, ElementType.DIAGRAM,
                    ElementType.LOGO, ElementType.STAMP
                ]:
                    # Generate unique element ID
                    elem.element_id = f"ocr_img_{page_idx}_{images_added}"
                    direct_page.elements.append(elem)
                    images_added += 1

        return images_added