feat: implement hybrid image extraction and memory management

Backend:
- Add hybrid image extraction for Direct track (inline image blocks)
- Add render_inline_image_regions() fallback when OCR doesn't find images
- Add check_document_for_missing_images() for detecting missing images
- Add memory management system (MemoryGuard, ModelManager, ServicePool)
- Update pdf_generator_service to handle HYBRID processing track
- Add ElementType.LOGO for logo extraction

Frontend:
- Fix PDF viewer re-rendering issues by adding memoization
- Add TaskNotFound component and useTaskValidation hook
- Disable StrictMode due to react-pdf incompatibility
- Fix task detail and results page loading states

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
egg
2025-11-26 10:56:22 +08:00
parent ba8ddf2b68
commit 1afdb822c3
26 changed files with 8273 additions and 366 deletions

View File

@@ -39,6 +39,7 @@ from app.schemas.task import (
from app.services.task_service import task_service
from app.services.file_access_service import file_access_service
from app.services.ocr_service import OCRService
from app.services.service_pool import get_service_pool, PoolConfig
# Import dual-track components
try:
@@ -47,6 +48,13 @@ try:
except ImportError:
DUAL_TRACK_AVAILABLE = False
# Service pool availability
SERVICE_POOL_AVAILABLE = True
try:
from app.services.memory_manager import get_model_manager
except ImportError:
SERVICE_POOL_AVAILABLE = False
logger = logging.getLogger(__name__)
router = APIRouter(prefix="/api/v2/tasks", tags=["Tasks"])
@@ -63,7 +71,10 @@ def process_task_ocr(
pp_structure_params: Optional[dict] = None
):
"""
Background task to process OCR for a task with dual-track support
Background task to process OCR for a task with dual-track support.
Uses OCRServicePool to acquire a shared service instance instead of
creating a new one, preventing GPU memory proliferation.
Args:
task_id: Task UUID string
@@ -80,6 +91,7 @@ def process_task_ocr(
db = SessionLocal()
start_time = datetime.now()
pooled_service = None
try:
logger.info(f"Starting OCR processing for task {task_id}, file: {filename}")
@@ -91,16 +103,39 @@ def process_task_ocr(
logger.error(f"Task {task_id} not found in database")
return
# Initialize OCR service
ocr_service = OCRService()
# Acquire OCR service from pool (or create new if pool disabled)
ocr_service = None
if settings.enable_service_pool and SERVICE_POOL_AVAILABLE:
try:
service_pool = get_service_pool()
pooled_service = service_pool.acquire(
device="GPU:0",
timeout=settings.service_acquire_timeout_seconds,
task_id=task_id
)
if pooled_service:
ocr_service = pooled_service.service
logger.info(f"Acquired OCR service from pool for task {task_id}")
else:
logger.warning(f"Timeout acquiring service from pool, creating new instance")
except Exception as e:
logger.warning(f"Failed to acquire service from pool: {e}, creating new instance")
# Fallback: create new instance if pool acquisition failed
if ocr_service is None:
logger.info("Creating new OCRService instance (pool disabled or unavailable)")
ocr_service = OCRService()
# Create result directory before OCR processing (needed for saving extracted images)
result_dir = Path(settings.result_dir) / task_id
result_dir.mkdir(parents=True, exist_ok=True)
# Process the file with OCR (use dual-track if available)
if use_dual_track and hasattr(ocr_service, 'process'):
# Use new dual-track processing
# Process the file with OCR
# Use dual-track processing if:
# 1. use_dual_track is True (auto-detection)
# 2. OR force_track is specified (explicit track selection)
if (use_dual_track or force_track) and hasattr(ocr_service, 'process'):
# Use new dual-track processing (or forced track)
ocr_result = ocr_service.process(
file_path=Path(file_path),
lang=language,
@@ -111,7 +146,7 @@ def process_task_ocr(
pp_structure_params=pp_structure_params
)
else:
# Fall back to traditional processing
# Fall back to traditional processing (no force_track support)
ocr_result = ocr_service.process_image(
image_path=Path(file_path),
lang=language,
@@ -131,6 +166,16 @@ def process_task_ocr(
source_file_path=Path(file_path)
)
# Release service back to pool (success case)
if pooled_service:
try:
service_pool = get_service_pool()
service_pool.release(pooled_service, error=None)
logger.info(f"Released OCR service back to pool for task {task_id}")
pooled_service = None # Prevent double release in finally
except Exception as e:
logger.warning(f"Failed to release service to pool: {e}")
# Close the old session and create a fresh one to avoid a MySQL timeout
# (long OCR processing may cause the connection to become stale)
db.close()
@@ -158,6 +203,15 @@ def process_task_ocr(
except Exception as e:
logger.exception(f"OCR processing failed for task {task_id}")
# Release service back to pool with error
if pooled_service:
try:
service_pool = get_service_pool()
service_pool.release(pooled_service, error=e)
pooled_service = None
except Exception as release_error:
logger.warning(f"Failed to release service to pool: {release_error}")
# Update task status to failed (direct database update)
try:
task = db.query(Task).filter(Task.id == task_db_id).first()
@@ -170,6 +224,13 @@ def process_task_ocr(
logger.error(f"Failed to update task status: {update_error}")
finally:
# Ensure service is released in case of any missed release
if pooled_service:
try:
service_pool = get_service_pool()
service_pool.release(pooled_service, error=None)
except Exception:
pass
db.close()
@@ -330,7 +391,13 @@ async def get_task(
with open(result_path) as f:
result_data = json.load(f)
metadata = result_data.get("metadata", {})
processing_track = metadata.get("processing_track")
track_str = metadata.get("processing_track")
# Convert string to enum to avoid Pydantic serialization warning
if track_str:
try:
processing_track = ProcessingTrackEnum(track_str)
except ValueError:
processing_track = None
except Exception:
pass # Silently ignore errors reading the result file