feat: refactor dual-track architecture (Phase 1-5)

## Backend Changes
- **Service Layer Refactoring**:
  - Add ProcessingOrchestrator for unified document processing (usage sketch below)
  - Add PDFTableRenderer for table rendering extraction
  - Add PDFFontManager for font management with CJK support
  - Add MemoryPolicyEngine (73% code reduction from MemoryGuard)
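
A minimal usage sketch of the new orchestration layer. Only names visible in the diff below are used; the module path `app.services.processing_orchestrator` and the `ocr_service` wiring are assumptions, not confirmed by this commit.

```python
from pathlib import Path

# Hypothetical module path, inferred from the app.services.* imports in the diff.
from app.services.processing_orchestrator import (
    ProcessingConfig,
    ProcessingOrchestrator,
)

orchestrator = ProcessingOrchestrator()
# The OCR pipeline requires its service to be injected before any OCR-track
# processing; `ocr_service` stands in for the app's real OCR service instance.
# orchestrator.set_ocr_service(ocr_service)

config = ProcessingConfig(
    output_dir=Path("/tmp/out"),
    force_track=None,  # let the document detector pick "direct" vs "ocr"
)
result = orchestrator.process(Path("edit.pdf"), config)
if result.success:
    print(result.track_used, f"{result.processing_time:.3f}s")
else:
    print("failed:", result.error)
```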

- **Bug Fixes**:
  - Fix Direct Track table row span calculation
  - Fix OCR Track image path handling
  - Add cell_boxes coordinate validation (see the sketch after this list)
  - Filter out small decorative images
  - Add covering image detection
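
The cell_boxes validation itself is not shown in this diff; the following is a minimal sketch of the kind of coordinate sanity check it implies, with all names and the box layout hypothetical.

```python
from typing import List, Tuple

# Hypothetical box layout: (x0, y0, x1, y1) in page coordinates.
Box = Tuple[float, float, float, float]

def validate_cell_boxes(
    cell_boxes: List[Box],
    page_width: float,
    page_height: float,
) -> Tuple[List[Box], List[Box]]:
    """Split boxes into valid/invalid by basic coordinate sanity checks."""
    valid: List[Box] = []
    invalid: List[Box] = []
    for box in cell_boxes:
        x0, y0, x1, y1 = box
        # A box is valid when it is non-degenerate and lies inside the page.
        if 0 <= x0 < x1 <= page_width and 0 <= y0 < y1 <= page_height:
            valid.append(box)
        else:
            invalid.append(box)
    return valid, invalid
```

The "43 valid, 0 invalid" figure in the Testing section is consistent with a check of this shape.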

## Frontend Changes
- **State Management**:
  - Add TaskStore for centralized task state management
  - Add localStorage persistence for recent tasks
  - Add processing state tracking

- **Type Consolidation**:
  - Merge shared types from api.ts into apiV2.ts
  - Update imports in authStore, uploadStore, ResultsTable, SettingsPage

- **Page Integration**:
  - Integrate TaskStore in ProcessingPage and TaskDetailPage
  - Update useTaskValidation hook with cache sync

## Testing
- Direct Track: edit.pdf (3 pages, 1.281s), edit3.pdf (2 pages, 0.203s)
- Cell boxes validation: 43 valid, 0 invalid
- Table merging: 12 merged cells verified

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
Author: egg
Date: 2025-12-07 07:18:27 +08:00
Parent: 8265be1741
Commit: eff9b0bcd5
19 changed files with 3637 additions and 173 deletions


@@ -0,0 +1,645 @@
"""
Processing Orchestrator - Coordinates document processing across tracks.
This module provides a unified orchestration layer for document processing,
separating the high-level flow control from track-specific implementations.
"""
import logging
import time
from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from enum import Enum
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, Union
from app.models.unified_document import (
    ProcessingTrack,
    UnifiedDocument,
    DocumentMetadata,
    ElementType,
)

logger = logging.getLogger(__name__)
# ============================================================================
# Data Classes
# ============================================================================
@dataclass
class ProcessingConfig:
    """Configuration for document processing."""
    detect_layout: bool = True
    confidence_threshold: float = 0.5
    output_dir: Optional[Path] = None
    lang: str = "ch"
    layout_model: str = "ppyolov2_r50vd_dcn_365e_publaynet"
    preprocessing_mode: str = "auto"
    preprocessing_config: Optional[Dict] = None
    table_detection_config: Optional[Dict] = None
    force_track: Optional[str] = None  # "direct" or "ocr"
    use_dual_track: bool = True


@dataclass
class TrackRecommendation:
    """Recommendation for which processing track to use."""
    track: ProcessingTrack
    confidence: float
    reason: str
    metrics: Dict[str, Any] = field(default_factory=dict)


@dataclass
class ProcessingResult:
    """Result of document processing."""
    document: Optional[UnifiedDocument] = None
    legacy_result: Optional[Dict] = None
    track_used: ProcessingTrack = ProcessingTrack.DIRECT
    processing_time: float = 0.0
    success: bool = True
    error: Optional[str] = None

# ============================================================================
# Pipeline Interface
# ============================================================================
class ProcessingPipeline(ABC):
    """Abstract base class for processing pipelines."""

    @property
    @abstractmethod
    def track_type(self) -> ProcessingTrack:
        """Return the processing track type for this pipeline."""
        pass

    @abstractmethod
    def process(
        self,
        file_path: Path,
        config: ProcessingConfig
    ) -> ProcessingResult:
        """
        Process a document through this pipeline.

        Args:
            file_path: Path to the document
            config: Processing configuration

        Returns:
            ProcessingResult with the extracted document
        """
        pass

    @abstractmethod
    def can_process(self, file_path: Path) -> bool:
        """
        Check if this pipeline can process the given file.

        Args:
            file_path: Path to the document

        Returns:
            True if the pipeline can process this file type
        """
        pass

# ============================================================================
# Direct Track Pipeline
# ============================================================================
class DirectPipeline(ProcessingPipeline):
    """Pipeline for processing editable PDFs via direct text extraction."""

    def __init__(self):
        self._engine = None
        self._office_converter = None

    @property
    def track_type(self) -> ProcessingTrack:
        return ProcessingTrack.DIRECT

    @property
    def engine(self):
        """Lazy-load DirectExtractionEngine."""
        if self._engine is None:
            from app.services.direct_extraction_engine import DirectExtractionEngine
            self._engine = DirectExtractionEngine(
                enable_table_detection=True,
                enable_image_extraction=True,
                min_image_area=200.0,
                enable_whiteout_detection=True,
                enable_content_sanitization=True
            )
        return self._engine

    @property
    def office_converter(self):
        """Lazy-load OfficeConverter."""
        if self._office_converter is None:
            from app.services.office_converter import OfficeConverter
            self._office_converter = OfficeConverter()
        return self._office_converter

    def can_process(self, file_path: Path) -> bool:
        """Check if file is processable (PDF or Office document)."""
        suffix = file_path.suffix.lower()
        return suffix in ['.pdf', '.docx', '.doc', '.xlsx', '.xls', '.pptx', '.ppt']

    def process(
        self,
        file_path: Path,
        config: ProcessingConfig
    ) -> ProcessingResult:
        """Process document using direct text extraction."""
        start_time = time.time()
        try:
            logger.info(f"DirectPipeline: Processing {file_path.name}")

            # Handle Office document conversion
            actual_path = file_path
            if self._is_office_document(file_path):
                actual_path = self._convert_office_to_pdf(file_path, config.output_dir)
                if actual_path is None:
                    return ProcessingResult(
                        success=False,
                        error=f"Failed to convert Office document: {file_path.name}",
                        track_used=ProcessingTrack.DIRECT,
                        processing_time=time.time() - start_time
                    )

            # Extract document
            unified_doc = self.engine.extract(
                actual_path,
                output_dir=config.output_dir
            )
            processing_time = time.time() - start_time

            # Update metadata
            if unified_doc.metadata is None:
                unified_doc.metadata = DocumentMetadata()
            unified_doc.metadata.processing_track = ProcessingTrack.DIRECT
            unified_doc.metadata.processing_time = processing_time

            logger.info(f"DirectPipeline: Completed in {processing_time:.2f}s, "
                        f"{len(unified_doc.pages)} pages extracted")
            return ProcessingResult(
                document=unified_doc,
                track_used=ProcessingTrack.DIRECT,
                processing_time=processing_time,
                success=True
            )
        except Exception as e:
            logger.error(f"DirectPipeline: Error processing {file_path.name}: {e}")
            import traceback
            traceback.print_exc()
            return ProcessingResult(
                success=False,
                error=str(e),
                track_used=ProcessingTrack.DIRECT,
                processing_time=time.time() - start_time
            )

    def check_for_missing_images(
        self,
        file_path: Path,
        unified_doc: UnifiedDocument
    ) -> List[int]:
        """
        Check if document has pages with missing inline images.

        Args:
            file_path: Path to the PDF
            unified_doc: Extracted document

        Returns:
            List of page indices with missing images
        """
        return self.engine.check_document_for_missing_images(file_path)

    def render_missing_images(
        self,
        file_path: Path,
        unified_doc: UnifiedDocument,
        page_list: List[int],
        output_dir: Path
    ) -> UnifiedDocument:
        """
        Render inline image regions that couldn't be extracted.

        Args:
            file_path: Path to the PDF
            unified_doc: Document to update
            page_list: Pages with missing images
            output_dir: Directory for output images

        Returns:
            Updated UnifiedDocument
        """
        return self.engine.render_inline_image_regions(
            file_path, unified_doc, page_list, output_dir
        )

    def _is_office_document(self, file_path: Path) -> bool:
        """Check if file is an Office document."""
        return self.office_converter.is_office_document(file_path)

    def _convert_office_to_pdf(
        self,
        file_path: Path,
        output_dir: Optional[Path]
    ) -> Optional[Path]:
        """Convert Office document to PDF."""
        try:
            return self.office_converter.convert_to_pdf(file_path, output_dir)
        except Exception as e:
            logger.error(f"Office conversion failed: {e}")
            return None

# ============================================================================
# OCR Track Pipeline
# ============================================================================
class OCRPipeline(ProcessingPipeline):
    """Pipeline for processing scanned documents via OCR."""

    def __init__(self):
        self._ocr_service = None
        self._converter = None

    @property
    def track_type(self) -> ProcessingTrack:
        return ProcessingTrack.OCR

    @property
    def ocr_service(self):
        """
        Get reference to OCR service.

        Note: This creates a circular dependency that needs careful handling.
        The OCRPipeline should receive the service via dependency injection.
        """
        if self._ocr_service is None:
            raise RuntimeError(
                "OCRPipeline requires OCR service to be set via set_ocr_service()"
            )
        return self._ocr_service

    def set_ocr_service(self, service):
        """Set the OCR service for this pipeline (dependency injection)."""
        self._ocr_service = service

    @property
    def converter(self):
        """Lazy-load OCR to Unified converter."""
        if self._converter is None:
            from app.services.ocr_to_unified_converter import OCRToUnifiedConverter
            self._converter = OCRToUnifiedConverter()
        return self._converter

    def can_process(self, file_path: Path) -> bool:
        """Check if file is processable (images or PDFs)."""
        suffix = file_path.suffix.lower()
        return suffix in ['.pdf', '.png', '.jpg', '.jpeg', '.tiff', '.tif', '.bmp']

    def process(
        self,
        file_path: Path,
        config: ProcessingConfig
    ) -> ProcessingResult:
        """Process document using OCR."""
        start_time = time.time()
        try:
            logger.info(f"OCRPipeline: Processing {file_path.name}")

            # Use OCR service's traditional processing
            ocr_result = self.ocr_service.process_file_traditional(
                file_path,
                detect_layout=config.detect_layout,
                confidence_threshold=config.confidence_threshold,
                output_dir=config.output_dir,
                lang=config.lang,
                layout_model=config.layout_model,
                preprocessing_mode=config.preprocessing_mode,
                preprocessing_config=config.preprocessing_config,
                table_detection_config=config.table_detection_config
            )
            processing_time = time.time() - start_time

            # Convert to UnifiedDocument
            unified_doc = self.converter.convert(
                ocr_result,
                file_path,
                processing_time,
                config.lang
            )

            # Update metadata
            if unified_doc.metadata is None:
                unified_doc.metadata = DocumentMetadata()
            unified_doc.metadata.processing_track = ProcessingTrack.OCR
            unified_doc.metadata.processing_time = processing_time

            logger.info(f"OCRPipeline: Completed in {processing_time:.2f}s")
            return ProcessingResult(
                document=unified_doc,
                legacy_result=ocr_result,
                track_used=ProcessingTrack.OCR,
                processing_time=processing_time,
                success=True
            )
        except Exception as e:
            logger.error(f"OCRPipeline: Error processing {file_path.name}: {e}")
            import traceback
            traceback.print_exc()
            return ProcessingResult(
                success=False,
                error=str(e),
                track_used=ProcessingTrack.OCR,
                processing_time=time.time() - start_time
            )

# ============================================================================
# Processing Orchestrator
# ============================================================================
class ProcessingOrchestrator:
    """
    Orchestrates document processing across Direct and OCR tracks.

    This class coordinates the high-level processing flow:
    1. Determines the optimal processing track
    2. Routes to the appropriate pipeline
    3. Handles hybrid mode (Direct + OCR fallback)
    4. Manages result format conversion
    """

    def __init__(self):
        self._document_detector = None
        self._direct_pipeline = DirectPipeline()
        self._ocr_pipeline = OCRPipeline()

    @property
    def document_detector(self):
        """Lazy-load DocumentTypeDetector."""
        if self._document_detector is None:
            from app.services.document_type_detector import DocumentTypeDetector
            self._document_detector = DocumentTypeDetector()
        return self._document_detector

    @property
    def direct_pipeline(self) -> DirectPipeline:
        return self._direct_pipeline

    @property
    def ocr_pipeline(self) -> OCRPipeline:
        return self._ocr_pipeline

    def set_ocr_service(self, service):
        """Set OCR service for the OCR pipeline (dependency injection)."""
        self._ocr_pipeline.set_ocr_service(service)

    def determine_processing_track(
        self,
        file_path: Path,
        force_track: Optional[str] = None
    ) -> TrackRecommendation:
        """
        Determine the optimal processing track for a document.

        Args:
            file_path: Path to the document
            force_track: Optional override ("direct" or "ocr")

        Returns:
            TrackRecommendation with track, confidence, and reason
        """
        # Handle forced track
        if force_track:
            track = ProcessingTrack.DIRECT if force_track == "direct" else ProcessingTrack.OCR
            return TrackRecommendation(
                track=track,
                confidence=1.0,
                reason=f"Forced to use {force_track} track"
            )

        # Use document detector
        try:
            recommendation = self.document_detector.detect(file_path)
            # Convert string track to ProcessingTrack enum
            track_str = recommendation.track
            if isinstance(track_str, str):
                track = ProcessingTrack.DIRECT if track_str == "direct" else ProcessingTrack.OCR
            else:
                track = track_str  # Already an enum
            return TrackRecommendation(
                track=track,
                confidence=recommendation.confidence,
                reason=recommendation.reason,
                metrics=getattr(recommendation, 'metrics', {})
            )
        except Exception as e:
            logger.warning(f"Document detection failed: {e}, defaulting to DIRECT")
            return TrackRecommendation(
                track=ProcessingTrack.DIRECT,
                confidence=0.5,
                reason=f"Detection failed ({e}), using default"
            )

    def process(
        self,
        file_path: Path,
        config: ProcessingConfig
    ) -> ProcessingResult:
        """
        Process a document using the optimal track.

        Args:
            file_path: Path to the document
            config: Processing configuration

        Returns:
            ProcessingResult with extracted document
        """
        file_path = Path(file_path)
        start_time = time.time()
        logger.info(f"ProcessingOrchestrator: Processing {file_path.name}")

        # Determine track
        recommendation = self.determine_processing_track(
            file_path,
            config.force_track
        )
        logger.info(f"Track recommendation: {recommendation.track.value} "
                    f"(confidence: {recommendation.confidence:.2f}, "
                    f"reason: {recommendation.reason})")

        # Route to appropriate pipeline
        if recommendation.track == ProcessingTrack.DIRECT:
            result = self._execute_direct_with_fallback(file_path, config)
        else:
            result = self._ocr_pipeline.process(file_path, config)

        # Update total processing time
        result.processing_time = time.time() - start_time
        return result

    def _execute_direct_with_fallback(
        self,
        file_path: Path,
        config: ProcessingConfig
    ) -> ProcessingResult:
        """
        Execute direct track with hybrid fallback for missing images.

        Args:
            file_path: Path to the document
            config: Processing configuration

        Returns:
            ProcessingResult (may be HYBRID if OCR was used for images)
        """
        # Run direct extraction
        result = self._direct_pipeline.process(file_path, config)
        if not result.success or result.document is None:
            logger.warning("Direct extraction failed, falling back to OCR")
            return self._ocr_pipeline.process(file_path, config)

        # Check for missing images
        try:
            missing_pages = self._direct_pipeline.check_for_missing_images(
                file_path, result.document
            )
            if missing_pages:
                logger.info(f"Found {len(missing_pages)} pages with missing images, "
                            f"entering hybrid mode")
                return self._execute_hybrid(
                    file_path, config, result.document, missing_pages
                )
        except Exception as e:
            logger.warning(f"Missing image check failed: {e}")

        return result

    def _execute_hybrid(
        self,
        file_path: Path,
        config: ProcessingConfig,
        direct_doc: UnifiedDocument,
        missing_pages: List[int]
    ) -> ProcessingResult:
        """
        Execute hybrid mode: Direct extraction + OCR for missing images.

        Args:
            file_path: Path to the document
            config: Processing configuration
            direct_doc: Document from direct extraction
            missing_pages: Pages with missing images

        Returns:
            ProcessingResult with HYBRID track
        """
        start_time = time.time()
        try:
            # Try OCR for missing images
            ocr_result = self._ocr_pipeline.process(file_path, config)
            if ocr_result.success and ocr_result.document:
                # Merge OCR images into direct result
                images_added = self._merge_ocr_images(
                    direct_doc,
                    ocr_result.document,
                    missing_pages
                )
                logger.info(f"Hybrid mode: Added {images_added} images from OCR")
            else:
                # Fallback: render inline images directly
                logger.warning("OCR failed, rendering inline images as fallback")
                if config.output_dir:
                    direct_doc = self._direct_pipeline.render_missing_images(
                        file_path,
                        direct_doc,
                        missing_pages,
                        config.output_dir
                    )

            # Update metadata
            if direct_doc.metadata is None:
                direct_doc.metadata = DocumentMetadata()
            direct_doc.metadata.processing_track = ProcessingTrack.HYBRID

            return ProcessingResult(
                document=direct_doc,
                track_used=ProcessingTrack.HYBRID,
                processing_time=time.time() - start_time,
                success=True
            )
        except Exception as e:
            logger.error(f"Hybrid processing failed: {e}")
            # Return direct result as-is
            return ProcessingResult(
                document=direct_doc,
                track_used=ProcessingTrack.DIRECT,
                processing_time=time.time() - start_time,
                success=True
            )

    def _merge_ocr_images(
        self,
        direct_doc: UnifiedDocument,
        ocr_doc: UnifiedDocument,
        target_pages: List[int]
    ) -> int:
        """
        Merge image elements from OCR result into direct result.

        Args:
            direct_doc: Target document
            ocr_doc: Source document with images
            target_pages: Page indices to merge images from

        Returns:
            Number of images added
        """
        images_added = 0
        for page_idx in target_pages:
            if page_idx >= len(direct_doc.pages) or page_idx >= len(ocr_doc.pages):
                continue
            direct_page = direct_doc.pages[page_idx]
            ocr_page = ocr_doc.pages[page_idx]
            # Find image elements in OCR result
            for elem in ocr_page.elements:
                if elem.type in [
                    ElementType.IMAGE, ElementType.FIGURE,
                    ElementType.CHART, ElementType.DIAGRAM,
                    ElementType.LOGO, ElementType.STAMP
                ]:
                    # Generate unique element ID
                    elem.element_id = f"ocr_img_{page_idx}_{images_added}"
                    direct_page.elements.append(elem)
                    images_added += 1
        return images_added
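
The ProcessingPipeline ABC above admits tracks beyond the two built-ins. The following is a minimal sketch of a conforming subclass, assuming the module's own definitions (ProcessingPipeline, ProcessingTrack, ProcessingConfig, ProcessingResult, Path, time) are in scope; the plain-text handling is purely illustrative and not part of this commit.

```python
class PlainTextPipeline(ProcessingPipeline):
    """Hypothetical pipeline illustrating the ABC contract."""

    @property
    def track_type(self) -> ProcessingTrack:
        # Reuses DIRECT; this sketch assumes no dedicated enum member exists.
        return ProcessingTrack.DIRECT

    def can_process(self, file_path: Path) -> bool:
        return file_path.suffix.lower() == ".txt"

    def process(self, file_path: Path, config: ProcessingConfig) -> ProcessingResult:
        start = time.time()
        try:
            text = file_path.read_text(encoding="utf-8")
            # A real pipeline would build a UnifiedDocument here; this sketch
            # only demonstrates the result envelope the orchestrator expects.
            return ProcessingResult(
                document=None,
                legacy_result={"text": text},
                track_used=self.track_type,
                processing_time=time.time() - start,
                success=True,
            )
        except Exception as e:
            return ProcessingResult(
                success=False,
                error=str(e),
                track_used=self.track_type,
                processing_time=time.time() - start,
            )
```

Note that the orchestrator's routing in `process()` only dispatches between the two built-in pipelines, so a custom pipeline like this would have to be invoked directly.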