feat: implement hybrid image extraction and memory management
Backend: - Add hybrid image extraction for Direct track (inline image blocks) - Add render_inline_image_regions() fallback when OCR doesn't find images - Add check_document_for_missing_images() for detecting missing images - Add memory management system (MemoryGuard, ModelManager, ServicePool) - Update pdf_generator_service to handle HYBRID processing track - Add ElementType.LOGO for logo extraction Frontend: - Fix PDF viewer re-rendering issues with memoization - Add TaskNotFound component and useTaskValidation hook - Disable StrictMode due to react-pdf incompatibility - Fix task detail and results page loading states 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -25,6 +25,7 @@ except ImportError:
|
||||
|
||||
from app.core.config import settings
|
||||
from app.services.office_converter import OfficeConverter, OfficeConverterError
|
||||
from app.services.memory_manager import get_model_manager, MemoryConfig, MemoryGuard, prediction_context
|
||||
|
||||
# Import dual-track components
|
||||
try:
|
||||
@@ -96,6 +97,26 @@ class OCRService:
|
||||
self._model_last_used = {} # Track last usage time for each model
|
||||
self._memory_warning_logged = False
|
||||
|
||||
# Initialize MemoryGuard for enhanced memory monitoring
|
||||
self._memory_guard = None
|
||||
if settings.enable_model_lifecycle_management:
|
||||
try:
|
||||
memory_config = MemoryConfig(
|
||||
warning_threshold=settings.memory_warning_threshold,
|
||||
critical_threshold=settings.memory_critical_threshold,
|
||||
emergency_threshold=settings.memory_emergency_threshold,
|
||||
model_idle_timeout_seconds=settings.pp_structure_idle_timeout_seconds,
|
||||
gpu_memory_limit_mb=settings.gpu_memory_limit_mb,
|
||||
enable_cpu_fallback=settings.enable_cpu_fallback,
|
||||
)
|
||||
self._memory_guard = MemoryGuard(memory_config)
|
||||
logger.debug("MemoryGuard initialized for OCRService")
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to initialize MemoryGuard: {e}")
|
||||
|
||||
# Track if CPU fallback was activated
|
||||
self._cpu_fallback_active = False
|
||||
|
||||
self._detect_and_configure_gpu()
|
||||
|
||||
# Log GPU optimization settings
|
||||
@@ -217,53 +238,91 @@ class OCRService:
|
||||
def _check_gpu_memory_usage(self):
    """
    Check GPU memory usage and log when it approaches the configured limits.

    When a MemoryGuard is configured its statistics are used; otherwise the
    method falls back to PaddlePaddle's allocated-memory counter compared
    against ``settings.gpu_memory_limit_mb``.

    The high-usage warning is emitted only once per service instance
    (guarded by ``_memory_warning_logged``). Emergency cleanup is deliberately
    NOT tied to that flag: it runs on every call that observes usage above
    90% while ``settings.enable_emergency_cleanup`` is set, so memory pressure
    that recurs after the first warning is still acted upon.

    Does nothing when GPU is not in use or memory optimization is disabled.
    Any exception raised while checking is logged at debug level and
    swallowed (best-effort monitoring).
    """
    if not self.use_gpu or not settings.enable_memory_optimization:
        return

    try:
        # Use MemoryGuard if available for better monitoring
        if self._memory_guard:
            stats = self._memory_guard.get_memory_stats()
            usage_summary = (
                f"{stats.gpu_used_mb:.0f}MB / {stats.gpu_total_mb:.0f}MB "
                f"({stats.gpu_used_ratio*100:.1f}%)"
            )
            is_critical = stats.gpu_used_ratio > 0.90

            # Log based on usage ratio (warning is emitted only once)
            if is_critical and not self._memory_warning_logged:
                logger.warning(f"GPU memory usage critical: {usage_summary}")
                logger.warning("Consider enabling auto_unload_unused_models or reducing batch size")
                self._memory_warning_logged = True
            elif stats.gpu_used_ratio > 0.75:
                logger.info(f"GPU memory: {usage_summary}")

            # Trigger emergency cleanup whenever usage is critical, not just
            # the first time the warning is logged
            if is_critical and settings.enable_emergency_cleanup:
                self._cleanup_unused_models()
                self._memory_guard.clear_gpu_cache()
        else:
            # Fallback to original implementation
            device_id = self.gpu_info.get('device_id', 0)
            memory_allocated = paddle.device.cuda.memory_allocated(device_id)
            memory_allocated_mb = memory_allocated / (1024**2)
            memory_limit_mb = settings.gpu_memory_limit_mb

            # Guard against a zero/negative configured limit
            utilization = (memory_allocated_mb / memory_limit_mb * 100) if memory_limit_mb > 0 else 0

            if utilization > 90 and not self._memory_warning_logged:
                logger.warning(f"GPU memory usage high: {memory_allocated_mb:.0f}MB / {memory_limit_mb}MB ({utilization:.1f}%)")
                logger.warning("Consider enabling auto_unload_unused_models or reducing batch size")
                self._memory_warning_logged = True
            elif utilization > 75:
                logger.info(f"GPU memory: {memory_allocated_mb:.0f}MB / {memory_limit_mb}MB ({utilization:.1f}%)")

    except Exception as e:
        logger.debug(f"Memory check failed: {e}")
|
||||
|
||||
def _cleanup_unused_models(self):
|
||||
"""
|
||||
Clean up unused language models to free GPU memory.
|
||||
Clean up unused models (including PP-StructureV3) to free GPU memory.
|
||||
Models idle longer than model_idle_timeout_seconds will be unloaded.
|
||||
|
||||
Note: PP-StructureV3 is NO LONGER exempted from cleanup - it will be
|
||||
unloaded based on pp_structure_idle_timeout_seconds configuration.
|
||||
"""
|
||||
if not settings.auto_unload_unused_models:
|
||||
return
|
||||
|
||||
current_time = datetime.now()
|
||||
timeout = settings.model_idle_timeout_seconds
|
||||
models_to_remove = []
|
||||
|
||||
for lang, last_used in self._model_last_used.items():
|
||||
if lang == 'structure': # Don't unload structure engine
|
||||
continue
|
||||
# Use different timeout for structure engine vs language models
|
||||
if lang == 'structure':
|
||||
timeout = settings.pp_structure_idle_timeout_seconds
|
||||
else:
|
||||
timeout = settings.model_idle_timeout_seconds
|
||||
|
||||
idle_seconds = (current_time - last_used).total_seconds()
|
||||
if idle_seconds > timeout:
|
||||
models_to_remove.append(lang)
|
||||
|
||||
for lang in models_to_remove:
|
||||
if lang in self.ocr_engines:
|
||||
logger.info(f"Unloading idle OCR engine for {lang} (idle {timeout}s)")
|
||||
del self.ocr_engines[lang]
|
||||
del self._model_last_used[lang]
|
||||
for model_key in models_to_remove:
|
||||
if model_key == 'structure':
|
||||
if self.structure_engine is not None:
|
||||
logger.info(f"Unloading idle PP-StructureV3 engine (idle {settings.pp_structure_idle_timeout_seconds}s)")
|
||||
self._unload_structure_engine()
|
||||
if model_key in self._model_last_used:
|
||||
del self._model_last_used[model_key]
|
||||
elif model_key in self.ocr_engines:
|
||||
logger.info(f"Unloading idle OCR engine for {model_key} (idle {settings.model_idle_timeout_seconds}s)")
|
||||
del self.ocr_engines[model_key]
|
||||
if model_key in self._model_last_used:
|
||||
del self._model_last_used[model_key]
|
||||
|
||||
if models_to_remove and self.use_gpu:
|
||||
# Clear CUDA cache
|
||||
@@ -273,6 +332,41 @@ class OCRService:
|
||||
except Exception as e:
|
||||
logger.debug(f"Cache clear failed: {e}")
|
||||
|
||||
def _unload_structure_engine(self):
    """
    Release the PP-StructureV3 engine and reclaim the memory it was holding.

    No-op when no structure engine is loaded. Otherwise the engine's known
    sub-components are dereferenced, the engine attribute is dropped and
    reset to None, a garbage collection pass is forced and, when running on
    GPU, the Paddle CUDA cache is emptied. Failures are logged as warnings and
    the engine reference is still reset to None.
    """
    if self.structure_engine is None:
        return

    # Sub-components that are cleared (when present) before dropping the engine
    component_names = (
        'table_engine',
        'text_detector',
        'text_recognizer',
        'layout_predictor',
    )

    try:
        # Access the engine through self each time so that no extra local
        # reference keeps it alive across the gc pass below
        for component in component_names:
            if hasattr(self.structure_engine, component):
                setattr(self.structure_engine, component, None)

        # Drop the engine itself and leave the attribute in a defined state
        del self.structure_engine
        self.structure_engine = None

        # Collect now rather than waiting for the next automatic pass
        gc.collect()

        # Hand cached GPU memory back when running on GPU
        if self.use_gpu:
            paddle.device.cuda.empty_cache()

        logger.info("PP-StructureV3 engine unloaded successfully")

    except Exception as e:
        logger.warning(f"Error unloading PP-StructureV3: {e}")
        self.structure_engine = None
|
||||
|
||||
def clear_gpu_cache(self):
|
||||
"""
|
||||
Manually clear GPU memory cache.
|
||||
@@ -519,46 +613,160 @@ class OCRService:
|
||||
logger.warning(f"GPU memory cleanup failed (non-critical): {e}")
|
||||
# Don't fail the processing if cleanup fails
|
||||
|
||||
def check_gpu_memory(self, required_mb: int = 2000, enable_fallback: bool = True) -> bool:
    """
    Check if sufficient GPU memory is available, using MemoryGuard when present.

    With a MemoryGuard configured, memory is queried through it for the
    device recorded in ``self.gpu_info['device_id']`` (default 0). Both the
    initial check and the re-check after cleanup target that same device.
    Without a MemoryGuard, free memory is read from torch when available, or
    estimated from PaddlePaddle's allocated memory and
    ``settings.gpu_memory_limit_mb``.

    If memory is insufficient, a cleanup is attempted and the check repeated.

    Args:
        required_mb: Required memory in MB (default 2000MB for OCR models)
        enable_fallback: If True and CPU fallback is enabled in settings,
            switch to CPU mode when memory is still insufficient after
            cleanup instead of returning False

    Returns:
        True if sufficient memory is available, GPU is not used, CPU fallback
        was activated, or the check itself raised (processing is allowed to
        continue). False only when memory is still insufficient after cleanup
        and CPU fallback is not permitted.
    """
    # If not using GPU, always return True
    if not self.use_gpu:
        return True

    try:
        # Use MemoryGuard if available for accurate multi-backend memory queries
        if self._memory_guard:
            device_id = self.gpu_info.get('device_id', 0)
            is_available, stats = self._memory_guard.check_memory(
                required_mb=required_mb,
                device_id=device_id
            )

            if not is_available:
                logger.warning(
                    f"GPU memory check failed: {stats.gpu_free_mb:.0f}MB free, "
                    f"{required_mb}MB required ({stats.gpu_used_ratio*100:.1f}% used)"
                )

                # Try to free memory
                logger.info("Attempting memory cleanup before retry...")
                self._cleanup_unused_models()
                self._memory_guard.clear_gpu_cache()

                # Check again on the SAME device as the first query
                is_available, stats = self._memory_guard.check_memory(
                    required_mb=required_mb,
                    device_id=device_id
                )

                if not is_available:
                    # Memory still insufficient after cleanup
                    if enable_fallback and settings.enable_cpu_fallback:
                        logger.warning(
                            f"Insufficient GPU memory ({stats.gpu_free_mb:.0f}MB) after cleanup. "
                            f"Activating CPU fallback mode."
                        )
                        self._activate_cpu_fallback()
                        return True  # Continue with CPU
                    else:
                        logger.error(
                            f"Insufficient GPU memory: {stats.gpu_free_mb:.0f}MB available, "
                            f"{required_mb}MB required"
                        )
                        return False

            logger.debug(
                f"GPU memory check passed: {stats.gpu_free_mb:.0f}MB free "
                f"({stats.gpu_used_ratio*100:.1f}% used)"
            )
            return True

        else:
            # Fallback to original implementation
            free_memory = None

            if TORCH_AVAILABLE and torch.cuda.is_available():
                free_memory = torch.cuda.mem_get_info()[0] / 1024**2
            elif paddle.device.is_compiled_with_cuda():
                # PaddlePaddle doesn't have direct API to get free memory,
                # use allocated memory to estimate
                device_id = self.gpu_info.get('device_id', 0)
                allocated = paddle.device.cuda.memory_allocated(device_id) / (1024**2)
                total = settings.gpu_memory_limit_mb
                free_memory = max(0, total - allocated)
                logger.debug(f"Estimated free GPU memory: {free_memory:.0f}MB (total: {total}MB, allocated: {allocated:.0f}MB)")

            if free_memory is not None:
                if free_memory < required_mb:
                    logger.warning(f"Low GPU memory: {free_memory:.0f}MB available, {required_mb}MB required")
                    self.cleanup_gpu_memory()

                    # Recheck
                    if TORCH_AVAILABLE and torch.cuda.is_available():
                        free_memory = torch.cuda.mem_get_info()[0] / 1024**2
                    else:
                        # Only reachable via the Paddle branch above, which
                        # defined device_id and total
                        allocated = paddle.device.cuda.memory_allocated(device_id) / (1024**2)
                        free_memory = max(0, total - allocated)

                    if free_memory < required_mb:
                        if enable_fallback and settings.enable_cpu_fallback:
                            logger.warning("Insufficient GPU memory after cleanup. Activating CPU fallback.")
                            self._activate_cpu_fallback()
                            return True
                        else:
                            logger.error(f"Insufficient GPU memory after cleanup: {free_memory:.0f}MB")
                            return False

                logger.debug(f"GPU memory check passed: {free_memory:.0f}MB available")

            return True

    except Exception as e:
        logger.warning(f"GPU memory check failed: {e}")
        return True  # Continue processing even if check fails
|
||||
|
||||
def _activate_cpu_fallback(self):
    """
    Switch this service instance to CPU processing.

    Invoked when GPU memory is insufficient. Marks the fallback as active,
    turns off ``use_gpu``, records the fallback and its reason in
    ``gpu_info``, and asks the MemoryGuard (if one exists) to clear the GPU
    cache. Calling it again while the fallback is already active does nothing.
    """
    # Nothing to do when the fallback is already in effect
    if self._cpu_fallback_active:
        return

    for banner_line in (
        "=== CPU FALLBACK MODE ACTIVATED ===",
        "GPU memory insufficient, switching to CPU processing",
        "Performance will be significantly reduced",
    ):
        logger.warning(banner_line)

    self._cpu_fallback_active = True
    self.use_gpu = False

    # Record the fallback in the GPU info so it is visible to consumers
    self.gpu_info.update({
        'cpu_fallback': True,
        'fallback_reason': 'GPU memory insufficient',
    })

    # Release cached GPU memory through the guard, when one is configured
    guard = self._memory_guard
    if guard:
        guard.clear_gpu_cache()
|
||||
|
||||
def _restore_gpu_mode(self):
    """
    Attempt to restore GPU mode after CPU fallback.

    Does nothing unless the CPU fallback is currently active, a GPU is
    available (``self.gpu_available``) and a MemoryGuard is configured.
    The guard is asked whether ``settings.structure_model_memory_mb`` is free
    on the device recorded in ``self.gpu_info['device_id']`` (default 0), the
    same device ``check_gpu_memory`` queries. If so, GPU usage is re-enabled
    and the fallback markers are removed from ``gpu_info``.
    """
    if not self._cpu_fallback_active:
        return

    if not self.gpu_available:
        return

    # Without a MemoryGuard there is no way to verify free memory here
    if not self._memory_guard:
        return

    # Check if GPU memory is now available on the configured device
    is_available, _stats = self._memory_guard.check_memory(
        required_mb=settings.structure_model_memory_mb,
        device_id=self.gpu_info.get('device_id', 0)
    )
    if is_available:
        logger.info("GPU memory available, restoring GPU mode")
        self._cpu_fallback_active = False
        self.use_gpu = True
        self.gpu_info.pop('cpu_fallback', None)
        self.gpu_info.pop('fallback_reason', None)
|
||||
|
||||
def convert_pdf_to_images(self, pdf_path: Path, output_dir: Path) -> List[Path]:
|
||||
"""
|
||||
Convert PDF to images (one per page)
|
||||
@@ -626,6 +834,24 @@ class OCRService:
|
||||
threshold = confidence_threshold if confidence_threshold is not None else self.confidence_threshold
|
||||
|
||||
try:
|
||||
# Pre-operation memory check: Try to restore GPU if in fallback and memory available
|
||||
if self._cpu_fallback_active:
|
||||
self._restore_gpu_mode()
|
||||
if not self._cpu_fallback_active:
|
||||
logger.info("GPU mode restored for processing")
|
||||
|
||||
# Initial memory check before starting any heavy processing
|
||||
# Estimate memory requirement based on image type
|
||||
estimated_memory_mb = 2500 # Conservative estimate for full OCR + layout
|
||||
if detect_layout:
|
||||
estimated_memory_mb += 500 # Additional for PP-StructureV3
|
||||
|
||||
if not self.check_gpu_memory(required_mb=estimated_memory_mb, enable_fallback=True):
|
||||
logger.warning(
|
||||
f"Pre-operation memory check failed ({estimated_memory_mb}MB required). "
|
||||
f"Processing will attempt to proceed but may encounter issues."
|
||||
)
|
||||
|
||||
# Check if file is Office document
|
||||
if self.office_converter.is_office_document(image_path):
|
||||
logger.info(f"Detected Office document: {image_path.name}, converting to PDF")
|
||||
@@ -748,9 +974,12 @@ class OCRService:
|
||||
# Get OCR engine (for non-PDF images)
|
||||
ocr_engine = self.get_ocr_engine(lang)
|
||||
|
||||
# Check GPU memory before OCR processing
|
||||
if not self.check_gpu_memory(required_mb=1500):
|
||||
logger.warning("Insufficient GPU memory for OCR, attempting to proceed anyway")
|
||||
# Secondary memory check before OCR processing
|
||||
if not self.check_gpu_memory(required_mb=1500, enable_fallback=True):
|
||||
logger.warning(
|
||||
f"OCR memory check: insufficient GPU memory (1500MB required). "
|
||||
f"Mode: {'CPU fallback' if self._cpu_fallback_active else 'GPU (low memory)'}"
|
||||
)
|
||||
|
||||
# Get the actual image dimensions that OCR will use
|
||||
from PIL import Image
|
||||
@@ -950,6 +1179,18 @@ class OCRService:
|
||||
Tuple of (layout_data, images_metadata)
|
||||
"""
|
||||
try:
|
||||
# Pre-operation memory check for layout analysis
|
||||
if self._cpu_fallback_active:
|
||||
self._restore_gpu_mode()
|
||||
if not self._cpu_fallback_active:
|
||||
logger.info("GPU mode restored for layout analysis")
|
||||
|
||||
if not self.check_gpu_memory(required_mb=2000, enable_fallback=True):
|
||||
logger.warning(
|
||||
f"Layout analysis pre-check: insufficient GPU memory (2000MB required). "
|
||||
f"Mode: {'CPU fallback' if self._cpu_fallback_active else 'GPU'}"
|
||||
)
|
||||
|
||||
structure_engine = self._ensure_structure_engine(pp_structure_params)
|
||||
|
||||
# Try enhanced processing first
|
||||
@@ -998,11 +1239,21 @@ class OCRService:
|
||||
# Standard processing (original implementation)
|
||||
logger.info(f"Running standard layout analysis on {image_path.name}")
|
||||
|
||||
# Check GPU memory before processing
|
||||
if not self.check_gpu_memory(required_mb=2000):
|
||||
logger.warning("Insufficient GPU memory for PP-StructureV3, attempting to proceed anyway")
|
||||
# Memory check before PP-StructureV3 processing
|
||||
if not self.check_gpu_memory(required_mb=2000, enable_fallback=True):
|
||||
logger.warning(
|
||||
f"PP-StructureV3 memory check: insufficient GPU memory (2000MB required). "
|
||||
f"Mode: {'CPU fallback' if self._cpu_fallback_active else 'GPU (low memory)'}"
|
||||
)
|
||||
|
||||
results = structure_engine.predict(str(image_path))
|
||||
# Use prediction semaphore to control concurrent predictions
|
||||
# This prevents OOM errors from multiple simultaneous PP-StructureV3.predict() calls
|
||||
with prediction_context(timeout=settings.service_acquire_timeout_seconds) as acquired:
|
||||
if not acquired:
|
||||
logger.error("Failed to acquire prediction slot (timeout), returning empty layout")
|
||||
return None, []
|
||||
|
||||
results = structure_engine.predict(str(image_path))
|
||||
|
||||
layout_elements = []
|
||||
images_metadata = []
|
||||
@@ -1254,6 +1505,46 @@ class OCRService:
|
||||
if temp_pdf_path:
|
||||
unified_doc.metadata.original_filename = file_path.name
|
||||
|
||||
# HYBRID MODE: Check if Direct track missed images (e.g., inline image blocks)
|
||||
# If so, use OCR to extract images and merge them into the Direct result
|
||||
pages_with_missing_images = self.direct_extraction_engine.check_document_for_missing_images(
|
||||
actual_file_path
|
||||
)
|
||||
if pages_with_missing_images:
|
||||
logger.info(f"Hybrid mode: Direct track missing images on pages {pages_with_missing_images}, using OCR to extract images")
|
||||
try:
|
||||
# Run OCR on the file to extract images
|
||||
ocr_result = self.process_file_traditional(
|
||||
actual_file_path, lang, detect_layout=True,
|
||||
confidence_threshold=confidence_threshold,
|
||||
output_dir=output_dir, pp_structure_params=pp_structure_params
|
||||
)
|
||||
|
||||
# Convert OCR result to extract images
|
||||
ocr_unified = self.ocr_to_unified_converter.convert(
|
||||
ocr_result, actual_file_path, 0.0, lang
|
||||
)
|
||||
|
||||
# Merge OCR-extracted images into Direct track result
|
||||
images_added = self._merge_ocr_images_into_direct(
|
||||
unified_doc, ocr_unified, pages_with_missing_images
|
||||
)
|
||||
if images_added > 0:
|
||||
logger.info(f"Hybrid mode: Added {images_added} images from OCR to Direct track result")
|
||||
unified_doc.metadata.processing_track = ProcessingTrack.HYBRID
|
||||
else:
|
||||
# Fallback: OCR didn't find images either, render inline image blocks directly
|
||||
logger.info("Hybrid mode: OCR didn't find images, falling back to inline image rendering")
|
||||
images_added = self.direct_extraction_engine.render_inline_image_regions(
|
||||
actual_file_path, unified_doc, pages_with_missing_images, output_dir
|
||||
)
|
||||
if images_added > 0:
|
||||
logger.info(f"Hybrid mode: Rendered {images_added} inline image regions")
|
||||
unified_doc.metadata.processing_track = ProcessingTrack.HYBRID
|
||||
except Exception as e:
|
||||
logger.warning(f"Hybrid mode image extraction failed: {e}")
|
||||
# Continue with Direct track result without images
|
||||
|
||||
# Use OCR track (either by recommendation or fallback)
|
||||
if recommendation.track == "ocr":
|
||||
# Use OCR for scanned documents, images, etc.
|
||||
@@ -1269,17 +1560,19 @@ class OCRService:
|
||||
)
|
||||
unified_doc.document_id = document_id
|
||||
|
||||
# Update processing track metadata
|
||||
unified_doc.metadata.processing_track = (
|
||||
ProcessingTrack.DIRECT if recommendation.track == "direct"
|
||||
else ProcessingTrack.OCR
|
||||
)
|
||||
# Update processing track metadata (only if not already set to HYBRID)
|
||||
if unified_doc.metadata.processing_track != ProcessingTrack.HYBRID:
|
||||
unified_doc.metadata.processing_track = (
|
||||
ProcessingTrack.DIRECT if recommendation.track == "direct"
|
||||
else ProcessingTrack.OCR
|
||||
)
|
||||
|
||||
# Calculate total processing time
|
||||
processing_time = (datetime.now() - start_time).total_seconds()
|
||||
unified_doc.metadata.processing_time = processing_time
|
||||
|
||||
logger.info(f"Document processing completed in {processing_time:.2f}s using {recommendation.track} track")
|
||||
actual_track = unified_doc.metadata.processing_track.value
|
||||
logger.info(f"Document processing completed in {processing_time:.2f}s using {actual_track} track")
|
||||
|
||||
return unified_doc
|
||||
|
||||
@@ -1290,6 +1583,75 @@ class OCRService:
|
||||
file_path, lang, detect_layout, confidence_threshold, output_dir, pp_structure_params
|
||||
)
|
||||
|
||||
def _merge_ocr_images_into_direct(
    self,
    direct_doc: 'UnifiedDocument',
    ocr_doc: 'UnifiedDocument',
    pages_with_missing_images: List[int]
) -> int:
    """
    Merge OCR-extracted images into Direct track result.

    This is used in hybrid mode when Direct track couldn't extract certain
    images (like logos composed of inline image blocks).

    For every requested page present in both documents, each OCR element
    whose type is FIGURE, IMAGE or LOGO is re-identified as
    ``hybrid_<original id>`` and appended to the matching Direct page. The
    element objects are moved as-is (not copied), so ``ocr_doc`` elements are
    mutated. Repeated page numbers in ``pages_with_missing_images`` are
    processed once. Errors are logged and swallowed; ``total_images`` in the
    Direct metadata is updated for whatever was added, even if the merge was
    interrupted part-way.

    Args:
        direct_doc: UnifiedDocument from Direct track
        ocr_doc: UnifiedDocument from OCR track
        pages_with_missing_images: List of page numbers that need images;
            matched against each page's ``page_number``

    Returns:
        Number of images added
    """
    images_added = 0

    try:
        # Get image element types to look for
        image_types = {ElementType.FIGURE, ElementType.IMAGE, ElementType.LOGO}

        # Index pages once; setdefault keeps the FIRST page for a duplicated
        # page number, matching what a linear scan with break would find
        direct_pages = {}
        for page in direct_doc.pages:
            direct_pages.setdefault(page.page_number, page)

        ocr_pages = {}
        for page in ocr_doc.pages:
            ocr_pages.setdefault(page.page_number, page)

        # dict.fromkeys de-duplicates while preserving order, so a page listed
        # twice does not get the same elements appended twice
        for page_num in dict.fromkeys(pages_with_missing_images):
            direct_page = direct_pages.get(page_num)
            ocr_page = ocr_pages.get(page_num)
            if direct_page is None or ocr_page is None:
                continue

            # Extract image elements from OCR page
            for element in ocr_page.elements:
                if element.type not in image_types:
                    continue

                # Assign new element ID to avoid conflicts
                new_element_id = f"hybrid_{element.element_id}"
                element.element_id = new_element_id

                # Add to direct page
                direct_page.elements.append(element)
                images_added += 1
                logger.debug(f"Added image element {new_element_id} to page {page_num}")

    except Exception as e:
        logger.error(f"Error merging OCR images into Direct track: {e}")

    # Update image count in direct_doc metadata. Done outside the merge loop's
    # try so a partial merge still leaves the count consistent with the
    # elements that were actually appended.
    if images_added > 0:
        try:
            current_images = direct_doc.metadata.total_images or 0
            direct_doc.metadata.total_images = current_images + images_added
        except Exception as e:
            logger.error(f"Error merging OCR images into Direct track: {e}")

    return images_added
|
||||
|
||||
def process_file_traditional(
|
||||
self,
|
||||
file_path: Path,
|
||||
@@ -1441,13 +1803,16 @@ class OCRService:
|
||||
UnifiedDocument if dual-track is enabled and use_dual_track=True,
|
||||
Dict with legacy format otherwise
|
||||
"""
|
||||
if use_dual_track and self.dual_track_enabled:
|
||||
# Use dual-track processing
|
||||
# Use dual-track processing if:
|
||||
# 1. use_dual_track is True (auto-detection), OR
|
||||
# 2. force_track is specified (explicit track selection)
|
||||
if (use_dual_track or force_track) and self.dual_track_enabled:
|
||||
# Use dual-track processing (or forced track)
|
||||
return self.process_with_dual_track(
|
||||
file_path, lang, detect_layout, confidence_threshold, output_dir, force_track, pp_structure_params
|
||||
)
|
||||
else:
|
||||
# Use traditional OCR processing
|
||||
# Use traditional OCR processing (no force_track support)
|
||||
return self.process_file_traditional(
|
||||
file_path, lang, detect_layout, confidence_threshold, output_dir, pp_structure_params
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user