feat: implement hybrid image extraction and memory management

Backend:
- Add hybrid image extraction for Direct track (inline image blocks)
- Add render_inline_image_regions() fallback when OCR doesn't find images
- Add check_document_for_missing_images() for detecting missing images
- Add memory management system (MemoryGuard, ModelManager, ServicePool)
- Update pdf_generator_service to handle HYBRID processing track
- Add ElementType.LOGO for logo extraction

Frontend:
- Fix PDF viewer re-rendering issues with memoization
- Add TaskNotFound component and useTaskValidation hook
- Disable StrictMode due to react-pdf incompatibility
- Fix task detail and results page loading states

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
egg
2025-11-26 10:56:22 +08:00
parent ba8ddf2b68
commit 1afdb822c3
26 changed files with 8273 additions and 366 deletions

View File

@@ -25,6 +25,7 @@ except ImportError:
from app.core.config import settings
from app.services.office_converter import OfficeConverter, OfficeConverterError
from app.services.memory_manager import get_model_manager, MemoryConfig, MemoryGuard, prediction_context
# Import dual-track components
try:
@@ -96,6 +97,26 @@ class OCRService:
self._model_last_used = {} # Track last usage time for each model
self._memory_warning_logged = False
# Initialize MemoryGuard for enhanced memory monitoring
self._memory_guard = None
if settings.enable_model_lifecycle_management:
try:
memory_config = MemoryConfig(
warning_threshold=settings.memory_warning_threshold,
critical_threshold=settings.memory_critical_threshold,
emergency_threshold=settings.memory_emergency_threshold,
model_idle_timeout_seconds=settings.pp_structure_idle_timeout_seconds,
gpu_memory_limit_mb=settings.gpu_memory_limit_mb,
enable_cpu_fallback=settings.enable_cpu_fallback,
)
self._memory_guard = MemoryGuard(memory_config)
logger.debug("MemoryGuard initialized for OCRService")
except Exception as e:
logger.warning(f"Failed to initialize MemoryGuard: {e}")
# Track if CPU fallback was activated
self._cpu_fallback_active = False
self._detect_and_configure_gpu()
# Log GPU optimization settings
@@ -217,53 +238,91 @@ class OCRService:
def _check_gpu_memory_usage(self):
"""
Check GPU memory usage and log warnings if approaching limits.
Implements memory optimization for RTX 4060 8GB.
Uses MemoryGuard for enhanced monitoring with multiple backends.
"""
if not self.use_gpu or not settings.enable_memory_optimization:
return
try:
device_id = self.gpu_info.get('device_id', 0)
memory_allocated = paddle.device.cuda.memory_allocated(device_id)
memory_allocated_mb = memory_allocated / (1024**2)
memory_limit_mb = settings.gpu_memory_limit_mb
# Use MemoryGuard if available for better monitoring
if self._memory_guard:
stats = self._memory_guard.get_memory_stats()
utilization = (memory_allocated_mb / memory_limit_mb * 100) if memory_limit_mb > 0 else 0
# Log based on usage ratio
if stats.gpu_used_ratio > 0.90 and not self._memory_warning_logged:
logger.warning(
f"GPU memory usage critical: {stats.gpu_used_mb:.0f}MB / {stats.gpu_total_mb:.0f}MB "
f"({stats.gpu_used_ratio*100:.1f}%)"
)
logger.warning("Consider enabling auto_unload_unused_models or reducing batch size")
self._memory_warning_logged = True
if utilization > 90 and not self._memory_warning_logged:
logger.warning(f"GPU memory usage high: {memory_allocated_mb:.0f}MB / {memory_limit_mb}MB ({utilization:.1f}%)")
logger.warning("Consider enabling auto_unload_unused_models or reducing batch size")
self._memory_warning_logged = True
elif utilization > 75:
logger.info(f"GPU memory: {memory_allocated_mb:.0f}MB / {memory_limit_mb}MB ({utilization:.1f}%)")
# Trigger emergency cleanup if enabled
if settings.enable_emergency_cleanup:
self._cleanup_unused_models()
self._memory_guard.clear_gpu_cache()
elif stats.gpu_used_ratio > 0.75:
logger.info(
f"GPU memory: {stats.gpu_used_mb:.0f}MB / {stats.gpu_total_mb:.0f}MB "
f"({stats.gpu_used_ratio*100:.1f}%)"
)
else:
# Fallback to original implementation
device_id = self.gpu_info.get('device_id', 0)
memory_allocated = paddle.device.cuda.memory_allocated(device_id)
memory_allocated_mb = memory_allocated / (1024**2)
memory_limit_mb = settings.gpu_memory_limit_mb
utilization = (memory_allocated_mb / memory_limit_mb * 100) if memory_limit_mb > 0 else 0
if utilization > 90 and not self._memory_warning_logged:
logger.warning(f"GPU memory usage high: {memory_allocated_mb:.0f}MB / {memory_limit_mb}MB ({utilization:.1f}%)")
logger.warning("Consider enabling auto_unload_unused_models or reducing batch size")
self._memory_warning_logged = True
elif utilization > 75:
logger.info(f"GPU memory: {memory_allocated_mb:.0f}MB / {memory_limit_mb}MB ({utilization:.1f}%)")
except Exception as e:
logger.debug(f"Memory check failed: {e}")
def _cleanup_unused_models(self):
"""
Clean up unused language models to free GPU memory.
Clean up unused models (including PP-StructureV3) to free GPU memory.
Models idle longer than model_idle_timeout_seconds will be unloaded.
Note: PP-StructureV3 is NO LONGER exempted from cleanup - it will be
unloaded based on pp_structure_idle_timeout_seconds configuration.
"""
if not settings.auto_unload_unused_models:
return
current_time = datetime.now()
timeout = settings.model_idle_timeout_seconds
models_to_remove = []
for lang, last_used in self._model_last_used.items():
if lang == 'structure': # Don't unload structure engine
continue
# Use different timeout for structure engine vs language models
if lang == 'structure':
timeout = settings.pp_structure_idle_timeout_seconds
else:
timeout = settings.model_idle_timeout_seconds
idle_seconds = (current_time - last_used).total_seconds()
if idle_seconds > timeout:
models_to_remove.append(lang)
for lang in models_to_remove:
if lang in self.ocr_engines:
logger.info(f"Unloading idle OCR engine for {lang} (idle {timeout}s)")
del self.ocr_engines[lang]
del self._model_last_used[lang]
for model_key in models_to_remove:
if model_key == 'structure':
if self.structure_engine is not None:
logger.info(f"Unloading idle PP-StructureV3 engine (idle {settings.pp_structure_idle_timeout_seconds}s)")
self._unload_structure_engine()
if model_key in self._model_last_used:
del self._model_last_used[model_key]
elif model_key in self.ocr_engines:
logger.info(f"Unloading idle OCR engine for {model_key} (idle {settings.model_idle_timeout_seconds}s)")
del self.ocr_engines[model_key]
if model_key in self._model_last_used:
del self._model_last_used[model_key]
if models_to_remove and self.use_gpu:
# Clear CUDA cache
@@ -273,6 +332,41 @@ class OCRService:
except Exception as e:
logger.debug(f"Cache clear failed: {e}")
def _unload_structure_engine(self):
    """
    Drop the PP-StructureV3 engine held by this service and release GPU memory.

    Does nothing when no structure engine is loaded. Any error raised while
    tearing the engine down is logged as a warning, and the engine reference
    is still reset to None so that the service never keeps a half-unloaded
    engine around.
    """
    if self.structure_engine is None:
        return

    # Sub-components that are detached (when present) before the engine
    # itself is dropped.
    component_names = (
        'table_engine',
        'text_detector',
        'text_recognizer',
        'layout_predictor',
    )

    try:
        # NOTE: self.structure_engine is accessed directly on purpose; binding
        # it to a local would keep the engine alive across gc.collect().
        for component_name in component_names:
            if hasattr(self.structure_engine, component_name):
                setattr(self.structure_engine, component_name, None)

        # Remove the attribute, then re-create it as None so later
        # "is None" checks keep working.
        del self.structure_engine
        self.structure_engine = None

        # Collect the now-unreferenced engine before clearing the GPU cache.
        gc.collect()

        if self.use_gpu:
            paddle.device.cuda.empty_cache()

        logger.info("PP-StructureV3 engine unloaded successfully")
    except Exception as e:
        logger.warning(f"Error unloading PP-StructureV3: {e}")
        self.structure_engine = None
def clear_gpu_cache(self):
"""
Manually clear GPU memory cache.
@@ -519,46 +613,160 @@ class OCRService:
logger.warning(f"GPU memory cleanup failed (non-critical): {e}")
# Don't fail the processing if cleanup fails
def check_gpu_memory(self, required_mb: int = 2000) -> bool:
def check_gpu_memory(self, required_mb: int = 2000, enable_fallback: bool = True) -> bool:
"""
Check if sufficient GPU memory is available.
Check if sufficient GPU memory is available using MemoryGuard.
This method now uses MemoryGuard for accurate memory queries across
multiple backends (pynvml, torch, paddle) instead of returning True
blindly for PaddlePaddle-only environments.
Args:
required_mb: Required memory in MB (default 2000MB for OCR models)
enable_fallback: If True and CPU fallback is enabled, switch to CPU mode
when memory is insufficient instead of returning False
Returns:
True if sufficient memory is available or GPU is not used
True if sufficient memory is available, GPU is not used, or CPU fallback activated
"""
try:
# Check GPU memory using torch if available, otherwise use PaddlePaddle
free_memory = None
# If not using GPU, always return True
if not self.use_gpu:
return True
if TORCH_AVAILABLE and torch.cuda.is_available():
free_memory = torch.cuda.mem_get_info()[0] / 1024**2
elif paddle.device.is_compiled_with_cuda():
# PaddlePaddle doesn't have direct API to get free memory,
# so we rely on cleanup and continue
logger.debug("Using PaddlePaddle GPU, memory info not directly available")
try:
# Use MemoryGuard if available for accurate multi-backend memory queries
if self._memory_guard:
is_available, stats = self._memory_guard.check_memory(
required_mb=required_mb,
device_id=self.gpu_info.get('device_id', 0)
)
if not is_available:
logger.warning(
f"GPU memory check failed: {stats.gpu_free_mb:.0f}MB free, "
f"{required_mb}MB required ({stats.gpu_used_ratio*100:.1f}% used)"
)
# Try to free memory
logger.info("Attempting memory cleanup before retry...")
self._cleanup_unused_models()
self._memory_guard.clear_gpu_cache()
# Check again
is_available, stats = self._memory_guard.check_memory(required_mb=required_mb)
if not is_available:
# Memory still insufficient after cleanup
if enable_fallback and settings.enable_cpu_fallback:
logger.warning(
f"Insufficient GPU memory ({stats.gpu_free_mb:.0f}MB) after cleanup. "
f"Activating CPU fallback mode."
)
self._activate_cpu_fallback()
return True # Continue with CPU
else:
logger.error(
f"Insufficient GPU memory: {stats.gpu_free_mb:.0f}MB available, "
f"{required_mb}MB required"
)
return False
logger.debug(
f"GPU memory check passed: {stats.gpu_free_mb:.0f}MB free "
f"({stats.gpu_used_ratio*100:.1f}% used)"
)
return True
if free_memory is not None:
if free_memory < required_mb:
logger.warning(f"Low GPU memory: {free_memory:.0f}MB available, {required_mb}MB required")
# Try to free memory
self.cleanup_gpu_memory()
# Check again
if TORCH_AVAILABLE and torch.cuda.is_available():
free_memory = torch.cuda.mem_get_info()[0] / 1024**2
if free_memory < required_mb:
logger.error(f"Insufficient GPU memory after cleanup: {free_memory:.0f}MB")
return False
logger.debug(f"GPU memory check passed: {free_memory:.0f}MB available")
else:
# Fallback to original implementation
free_memory = None
if TORCH_AVAILABLE and torch.cuda.is_available():
free_memory = torch.cuda.mem_get_info()[0] / 1024**2
elif paddle.device.is_compiled_with_cuda():
# PaddlePaddle doesn't have direct API to get free memory,
# use allocated memory to estimate
device_id = self.gpu_info.get('device_id', 0)
allocated = paddle.device.cuda.memory_allocated(device_id) / (1024**2)
total = settings.gpu_memory_limit_mb
free_memory = max(0, total - allocated)
logger.debug(f"Estimated free GPU memory: {free_memory:.0f}MB (total: {total}MB, allocated: {allocated:.0f}MB)")
if free_memory is not None:
if free_memory < required_mb:
logger.warning(f"Low GPU memory: {free_memory:.0f}MB available, {required_mb}MB required")
self.cleanup_gpu_memory()
# Recheck
if TORCH_AVAILABLE and torch.cuda.is_available():
free_memory = torch.cuda.mem_get_info()[0] / 1024**2
else:
allocated = paddle.device.cuda.memory_allocated(device_id) / (1024**2)
free_memory = max(0, total - allocated)
if free_memory < required_mb:
if enable_fallback and settings.enable_cpu_fallback:
logger.warning(f"Insufficient GPU memory after cleanup. Activating CPU fallback.")
self._activate_cpu_fallback()
return True
else:
logger.error(f"Insufficient GPU memory after cleanup: {free_memory:.0f}MB")
return False
logger.debug(f"GPU memory check passed: {free_memory:.0f}MB available")
return True
return True
except Exception as e:
logger.warning(f"GPU memory check failed: {e}")
return True # Continue processing even if check fails
def _activate_cpu_fallback(self):
    """
    Switch this service instance from GPU to CPU processing.

    Intended for the case where GPU memory is insufficient. The call is a
    no-op when fallback mode is already active. Otherwise it logs a warning
    banner, sets the fallback flag, disables GPU usage, records the fallback
    in gpu_info and, when a MemoryGuard is present, asks it to clear the GPU
    cache.
    """
    if self._cpu_fallback_active:
        # Already in CPU mode
        return

    banner = (
        "=== CPU FALLBACK MODE ACTIVATED ===",
        "GPU memory insufficient, switching to CPU processing",
        "Performance will be significantly reduced",
    )
    for message in banner:
        logger.warning(message)

    self._cpu_fallback_active = True
    self.use_gpu = False

    # Record the fallback in gpu_info; _restore_gpu_mode() removes these keys.
    self.gpu_info.update({
        'cpu_fallback': True,
        'fallback_reason': 'GPU memory insufficient',
    })

    # Clear GPU cache to free memory
    if self._memory_guard:
        self._memory_guard.clear_gpu_cache()
def _restore_gpu_mode(self):
    """
    Attempt to switch back to GPU processing after a CPU fallback.

    The method does nothing unless all of the following hold:
    CPU fallback is currently active, ``self.gpu_available`` is truthy, and a
    MemoryGuard instance exists. When those hold, the guard is asked whether
    ``settings.structure_model_memory_mb`` is available on the GPU this
    service is configured for; if so, the fallback flag is cleared, GPU usage
    is re-enabled and the fallback markers are removed from ``gpu_info``.

    The memory query passes the service's own ``device_id`` (defaulting to 0),
    the same way check_gpu_memory() does, so the decision is based on the
    device this service actually uses rather than on the guard's default.
    """
    if not self._cpu_fallback_active:
        return

    if not self.gpu_available:
        return

    # Without a MemoryGuard there is no memory query to base a restore on,
    # so the service stays in CPU mode.
    if not self._memory_guard:
        return

    # Check if GPU memory is now available on the configured device
    is_available, _stats = self._memory_guard.check_memory(
        required_mb=settings.structure_model_memory_mb,
        device_id=self.gpu_info.get('device_id', 0)
    )
    if not is_available:
        return

    logger.info("GPU memory available, restoring GPU mode")
    self._cpu_fallback_active = False
    self.use_gpu = True
    # Remove the markers written by _activate_cpu_fallback(); pop() with a
    # default tolerates keys that are already absent.
    self.gpu_info.pop('cpu_fallback', None)
    self.gpu_info.pop('fallback_reason', None)
def convert_pdf_to_images(self, pdf_path: Path, output_dir: Path) -> List[Path]:
"""
Convert PDF to images (one per page)
@@ -626,6 +834,24 @@ class OCRService:
threshold = confidence_threshold if confidence_threshold is not None else self.confidence_threshold
try:
# Pre-operation memory check: Try to restore GPU if in fallback and memory available
if self._cpu_fallback_active:
self._restore_gpu_mode()
if not self._cpu_fallback_active:
logger.info("GPU mode restored for processing")
# Initial memory check before starting any heavy processing
# Estimate memory requirement based on image type
estimated_memory_mb = 2500 # Conservative estimate for full OCR + layout
if detect_layout:
estimated_memory_mb += 500 # Additional for PP-StructureV3
if not self.check_gpu_memory(required_mb=estimated_memory_mb, enable_fallback=True):
logger.warning(
f"Pre-operation memory check failed ({estimated_memory_mb}MB required). "
f"Processing will attempt to proceed but may encounter issues."
)
# Check if file is Office document
if self.office_converter.is_office_document(image_path):
logger.info(f"Detected Office document: {image_path.name}, converting to PDF")
@@ -748,9 +974,12 @@ class OCRService:
# Get OCR engine (for non-PDF images)
ocr_engine = self.get_ocr_engine(lang)
# Check GPU memory before OCR processing
if not self.check_gpu_memory(required_mb=1500):
logger.warning("Insufficient GPU memory for OCR, attempting to proceed anyway")
# Secondary memory check before OCR processing
if not self.check_gpu_memory(required_mb=1500, enable_fallback=True):
logger.warning(
f"OCR memory check: insufficient GPU memory (1500MB required). "
f"Mode: {'CPU fallback' if self._cpu_fallback_active else 'GPU (low memory)'}"
)
# Get the actual image dimensions that OCR will use
from PIL import Image
@@ -950,6 +1179,18 @@ class OCRService:
Tuple of (layout_data, images_metadata)
"""
try:
# Pre-operation memory check for layout analysis
if self._cpu_fallback_active:
self._restore_gpu_mode()
if not self._cpu_fallback_active:
logger.info("GPU mode restored for layout analysis")
if not self.check_gpu_memory(required_mb=2000, enable_fallback=True):
logger.warning(
f"Layout analysis pre-check: insufficient GPU memory (2000MB required). "
f"Mode: {'CPU fallback' if self._cpu_fallback_active else 'GPU'}"
)
structure_engine = self._ensure_structure_engine(pp_structure_params)
# Try enhanced processing first
@@ -998,11 +1239,21 @@ class OCRService:
# Standard processing (original implementation)
logger.info(f"Running standard layout analysis on {image_path.name}")
# Check GPU memory before processing
if not self.check_gpu_memory(required_mb=2000):
logger.warning("Insufficient GPU memory for PP-StructureV3, attempting to proceed anyway")
# Memory check before PP-StructureV3 processing
if not self.check_gpu_memory(required_mb=2000, enable_fallback=True):
logger.warning(
f"PP-StructureV3 memory check: insufficient GPU memory (2000MB required). "
f"Mode: {'CPU fallback' if self._cpu_fallback_active else 'GPU (low memory)'}"
)
results = structure_engine.predict(str(image_path))
# Use prediction semaphore to control concurrent predictions
# This prevents OOM errors from multiple simultaneous PP-StructureV3.predict() calls
with prediction_context(timeout=settings.service_acquire_timeout_seconds) as acquired:
if not acquired:
logger.error("Failed to acquire prediction slot (timeout), returning empty layout")
return None, []
results = structure_engine.predict(str(image_path))
layout_elements = []
images_metadata = []
@@ -1254,6 +1505,46 @@ class OCRService:
if temp_pdf_path:
unified_doc.metadata.original_filename = file_path.name
# HYBRID MODE: Check if Direct track missed images (e.g., inline image blocks)
# If so, use OCR to extract images and merge them into the Direct result
pages_with_missing_images = self.direct_extraction_engine.check_document_for_missing_images(
actual_file_path
)
if pages_with_missing_images:
logger.info(f"Hybrid mode: Direct track missing images on pages {pages_with_missing_images}, using OCR to extract images")
try:
# Run OCR on the file to extract images
ocr_result = self.process_file_traditional(
actual_file_path, lang, detect_layout=True,
confidence_threshold=confidence_threshold,
output_dir=output_dir, pp_structure_params=pp_structure_params
)
# Convert OCR result to extract images
ocr_unified = self.ocr_to_unified_converter.convert(
ocr_result, actual_file_path, 0.0, lang
)
# Merge OCR-extracted images into Direct track result
images_added = self._merge_ocr_images_into_direct(
unified_doc, ocr_unified, pages_with_missing_images
)
if images_added > 0:
logger.info(f"Hybrid mode: Added {images_added} images from OCR to Direct track result")
unified_doc.metadata.processing_track = ProcessingTrack.HYBRID
else:
# Fallback: OCR didn't find images either, render inline image blocks directly
logger.info("Hybrid mode: OCR didn't find images, falling back to inline image rendering")
images_added = self.direct_extraction_engine.render_inline_image_regions(
actual_file_path, unified_doc, pages_with_missing_images, output_dir
)
if images_added > 0:
logger.info(f"Hybrid mode: Rendered {images_added} inline image regions")
unified_doc.metadata.processing_track = ProcessingTrack.HYBRID
except Exception as e:
logger.warning(f"Hybrid mode image extraction failed: {e}")
# Continue with Direct track result without images
# Use OCR track (either by recommendation or fallback)
if recommendation.track == "ocr":
# Use OCR for scanned documents, images, etc.
@@ -1269,17 +1560,19 @@ class OCRService:
)
unified_doc.document_id = document_id
# Update processing track metadata
unified_doc.metadata.processing_track = (
ProcessingTrack.DIRECT if recommendation.track == "direct"
else ProcessingTrack.OCR
)
# Update processing track metadata (only if not already set to HYBRID)
if unified_doc.metadata.processing_track != ProcessingTrack.HYBRID:
unified_doc.metadata.processing_track = (
ProcessingTrack.DIRECT if recommendation.track == "direct"
else ProcessingTrack.OCR
)
# Calculate total processing time
processing_time = (datetime.now() - start_time).total_seconds()
unified_doc.metadata.processing_time = processing_time
logger.info(f"Document processing completed in {processing_time:.2f}s using {recommendation.track} track")
actual_track = unified_doc.metadata.processing_track.value
logger.info(f"Document processing completed in {processing_time:.2f}s using {actual_track} track")
return unified_doc
@@ -1290,6 +1583,75 @@ class OCRService:
file_path, lang, detect_layout, confidence_threshold, output_dir, pp_structure_params
)
def _merge_ocr_images_into_direct(
    self,
    direct_doc: 'UnifiedDocument',
    ocr_doc: 'UnifiedDocument',
    pages_with_missing_images: List[int]
) -> int:
    """
    Copy image-like elements from the OCR result into the Direct track result.

    Used in hybrid mode when the Direct track could not extract certain
    images (for example logos composed of inline image blocks).

    For every requested page number that exists in both documents, each OCR
    element whose type is FIGURE, IMAGE or LOGO is appended to the matching
    Direct page. The element object itself is moved over (not copied) and its
    element_id is rewritten with a "hybrid_" prefix to avoid ID conflicts.
    When at least one element was added, direct_doc.metadata.total_images is
    increased accordingly.

    Args:
        direct_doc: UnifiedDocument from Direct track (modified in place)
        ocr_doc: UnifiedDocument from OCR track (its matching elements get
            new element IDs)
        pages_with_missing_images: Page numbers (1-indexed) that need images

    Returns:
        Number of images added. Errors are logged rather than raised, so the
        count reflects whatever was added before a failure occurred.
    """
    images_added = 0

    try:
        # Element types that count as images for merging purposes
        wanted_types = {ElementType.FIGURE, ElementType.IMAGE, ElementType.LOGO}

        for page_num in pages_with_missing_images:
            # First page with this number in the Direct result (merge target)
            target_page = next(
                (candidate for candidate in direct_doc.pages
                 if candidate.page_number == page_num),
                None
            )
            if not target_page:
                continue

            # First page with this number in the OCR result (image source)
            source_page = next(
                (candidate for candidate in ocr_doc.pages
                 if candidate.page_number == page_num),
                None
            )
            if not source_page:
                continue

            for element in source_page.elements:
                if element.type not in wanted_types:
                    continue

                # Re-key the element so it cannot collide with Direct IDs
                new_element_id = f"hybrid_{element.element_id}"
                element.element_id = new_element_id

                target_page.elements.append(element)
                images_added += 1
                logger.debug(f"Added image element {new_element_id} to page {page_num}")

        # Keep the document-level image counter in sync with the merge
        if images_added > 0:
            previous_total = direct_doc.metadata.total_images or 0
            direct_doc.metadata.total_images = previous_total + images_added

    except Exception as e:
        logger.error(f"Error merging OCR images into Direct track: {e}")

    return images_added
def process_file_traditional(
self,
file_path: Path,
@@ -1441,13 +1803,16 @@ class OCRService:
UnifiedDocument if dual-track is enabled and use_dual_track=True,
Dict with legacy format otherwise
"""
if use_dual_track and self.dual_track_enabled:
# Use dual-track processing
# Use dual-track processing if:
# 1. use_dual_track is True (auto-detection), OR
# 2. force_track is specified (explicit track selection)
if (use_dual_track or force_track) and self.dual_track_enabled:
# Use dual-track processing (or forced track)
return self.process_with_dual_track(
file_path, lang, detect_layout, confidence_threshold, output_dir, force_track, pp_structure_params
)
else:
# Use traditional OCR processing
# Use traditional OCR processing (no force_track support)
return self.process_file_traditional(
file_path, lang, detect_layout, confidence_threshold, output_dir, pp_structure_params
)