## Backend Changes

- **Service Layer Refactoring**:
  - Add ProcessingOrchestrator for unified document processing
  - Add PDFTableRenderer for table rendering extraction
  - Add PDFFontManager for font management with CJK support
  - Add MemoryPolicyEngine (73% code reduction from MemoryGuard)
- **Bug Fixes**:
  - Fix Direct Track table row span calculation
  - Fix OCR Track image path handling
  - Add cell_boxes coordinate validation
  - Filter out small decorative images
  - Add covering image detection

## Frontend Changes

- **State Management**:
  - Add TaskStore for centralized task state management
  - Add localStorage persistence for recent tasks
  - Add processing state tracking
- **Type Consolidation**:
  - Merge shared types from api.ts to apiV2.ts
  - Update imports in authStore, uploadStore, ResultsTable, SettingsPage
- **Page Integration**:
  - Integrate TaskStore in ProcessingPage and TaskDetailPage
  - Update useTaskValidation hook with cache sync

## Testing

- Direct Track: edit.pdf (3 pages, 1.281s), edit3.pdf (2 pages, 0.203s)
- Cell boxes validation: 43 valid, 0 invalid
- Table merging: 12 merged cells verified

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>

"""
Tool_OCR - OCR Service Pool
Manages a pool of OCRService instances to prevent duplicate model loading
and control concurrent GPU operations.
"""

import asyncio
import logging
import threading
import time
from contextlib import contextmanager
from dataclasses import dataclass, field
from enum import Enum
from typing import Any, Dict, List, Optional, TYPE_CHECKING

from app.services.memory_manager import get_model_manager, MemoryConfig
from app.services.memory_policy_engine import get_memory_policy_engine

if TYPE_CHECKING:
    from app.services.ocr_service import OCRService

logger = logging.getLogger(__name__)


class ServiceState(Enum):
    """State of a pooled service"""
    AVAILABLE = "available"
    IN_USE = "in_use"
    UNHEALTHY = "unhealthy"
    INITIALIZING = "initializing"


@dataclass
class PooledService:
    """Wrapper for a pooled OCRService instance"""
    service: Any  # OCRService
    device: str
    state: ServiceState = ServiceState.AVAILABLE
    created_at: float = field(default_factory=time.time)
    last_used: float = field(default_factory=time.time)
    use_count: int = 0
    error_count: int = 0
    current_task_id: Optional[str] = None


class PoolConfig:
    """Configuration for the service pool"""

    def __init__(
        self,
        max_services_per_device: int = 1,
        max_total_services: int = 2,
        acquire_timeout_seconds: float = 300.0,
        max_queue_size: int = 50,
        health_check_interval_seconds: int = 60,
        max_consecutive_errors: int = 3,
        service_idle_timeout_seconds: int = 600,
        enable_auto_scaling: bool = False,
    ):
        self.max_services_per_device = max_services_per_device
        self.max_total_services = max_total_services
        self.acquire_timeout_seconds = acquire_timeout_seconds
        self.max_queue_size = max_queue_size
        self.health_check_interval_seconds = health_check_interval_seconds
        self.max_consecutive_errors = max_consecutive_errors
        self.service_idle_timeout_seconds = service_idle_timeout_seconds
        self.enable_auto_scaling = enable_auto_scaling
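
# Illustrative only: a deployment that wants two concurrent OCR workers per GPU
# could pass a custom PoolConfig when the pool is first created (values below
# are hypothetical, not project defaults):
#
#     pool = get_service_pool(PoolConfig(
#         max_services_per_device=2,
#         max_total_services=4,
#         acquire_timeout_seconds=60.0,
#     ))
#
# Because the pool is a singleton, the config passed to get_service_pool() is
# only honored on the first call; later calls return the existing instance.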


class OCRServicePool:
    """
    Pool of OCRService instances with concurrency control.

    Features:
    - Per-device instance management (one service per GPU)
    - Queue-based task distribution
    - Semaphore-based concurrency limits
    - Health monitoring
    - Automatic service recovery
    """

    _instance = None
    _lock = threading.Lock()

    def __new__(cls, *args, **kwargs):
        """Singleton pattern"""
        with cls._lock:
            if cls._instance is None:
                cls._instance = super().__new__(cls)
                cls._instance._initialized = False
            return cls._instance

    def __init__(self, config: Optional[PoolConfig] = None):
        if self._initialized:
            return

        self.config = config or PoolConfig()
        self.services: Dict[str, List[PooledService]] = {}
        self.semaphores: Dict[str, threading.Semaphore] = {}
        self.queues: Dict[str, List] = {}
        self._pool_lock = threading.RLock()
        self._condition = threading.Condition(self._pool_lock)

        # Metrics
        self._metrics = {
            "total_acquisitions": 0,
            "total_releases": 0,
            "total_timeouts": 0,
            "total_errors": 0,
            "queue_waits": 0,
        }

        # Initialize default device pool
        self._initialize_device("GPU:0")

        self._initialized = True
        logger.info("OCRServicePool initialized")

    def _initialize_device(self, device: str):
        """Initialize pool resources for a device"""
        with self._pool_lock:
            if device not in self.services:
                self.services[device] = []
                self.semaphores[device] = threading.Semaphore(
                    self.config.max_services_per_device
                )
                self.queues[device] = []
                logger.info(f"Initialized pool for device {device}")

    def _create_service(self, device: str) -> PooledService:
        """
        Create a new OCRService instance for the pool.

        Args:
            device: Device identifier (e.g., "GPU:0", "CPU")

        Returns:
            PooledService wrapper
        """
        # Import here to avoid circular imports
        from app.services.ocr_service import OCRService

        logger.info(f"Creating new OCRService for device {device}")
        start_time = time.time()

        # Create service instance
        service = OCRService()

        creation_time = time.time() - start_time
        logger.info(f"OCRService created in {creation_time:.2f}s for device {device}")

        return PooledService(
            service=service,
            device=device,
            state=ServiceState.AVAILABLE
        )

    def acquire(
        self,
        device: str = "GPU:0",
        timeout: Optional[float] = None,
        task_id: Optional[str] = None
    ) -> Optional[PooledService]:
        """
        Acquire an OCRService from the pool.

        Args:
            device: Preferred device (e.g., "GPU:0")
            timeout: Maximum time to wait for a service
            task_id: Optional task ID for tracking

        Returns:
            PooledService if available, None if timeout
        """
        timeout = timeout or self.config.acquire_timeout_seconds
        self._initialize_device(device)

        start_time = time.time()
        deadline = start_time + timeout

        with self._condition:
            while True:
                # Try to get an available service
                service = self._try_acquire_service(device, task_id)
                if service:
                    self._metrics["total_acquisitions"] += 1
                    return service

                # Check if we can create a new service
                if self._can_create_service(device):
                    try:
                        pooled = self._create_service(device)
                        pooled.state = ServiceState.IN_USE
                        pooled.current_task_id = task_id
                        pooled.use_count += 1
                        self.services[device].append(pooled)
                        self._metrics["total_acquisitions"] += 1
                        logger.info(f"Created and acquired new service for {device}")
                        return pooled
                    except Exception as e:
                        logger.error(f"Failed to create service for {device}: {e}")
                        self._metrics["total_errors"] += 1

                # Wait for a service to become available
                remaining = deadline - time.time()
                if remaining <= 0:
                    self._metrics["total_timeouts"] += 1
                    logger.warning(f"Timeout waiting for service on {device}")
                    return None

                self._metrics["queue_waits"] += 1
                logger.debug(f"Waiting for service on {device} (timeout: {remaining:.1f}s)")
                self._condition.wait(timeout=min(remaining, 1.0))

    def _try_acquire_service(self, device: str, task_id: Optional[str]) -> Optional[PooledService]:
        """Try to acquire an available service without waiting"""
        for pooled in self.services.get(device, []):
            if pooled.state == ServiceState.AVAILABLE:
                pooled.state = ServiceState.IN_USE
                pooled.last_used = time.time()
                pooled.use_count += 1
                pooled.current_task_id = task_id
                logger.debug(f"Acquired existing service for {device} (use #{pooled.use_count})")
                return pooled
        return None

    def _can_create_service(self, device: str) -> bool:
        """Check if a new service can be created"""
        device_count = len(self.services.get(device, []))
        total_count = sum(len(services) for services in self.services.values())

        return (
            device_count < self.config.max_services_per_device and
            total_count < self.config.max_total_services
        )

    def release(self, pooled: PooledService, error: Optional[Exception] = None):
        """
        Release a service back to the pool.

        Args:
            pooled: The pooled service to release
            error: Optional error that occurred during use
        """
        with self._condition:
            if error:
                pooled.error_count += 1
                self._metrics["total_errors"] += 1
                logger.warning(f"Service released with error: {error}")

                # Mark unhealthy if too many errors
                if pooled.error_count >= self.config.max_consecutive_errors:
                    pooled.state = ServiceState.UNHEALTHY
                    logger.error(f"Service marked unhealthy after {pooled.error_count} errors")
                else:
                    pooled.state = ServiceState.AVAILABLE
            else:
                pooled.error_count = 0  # Reset error count on success
                pooled.state = ServiceState.AVAILABLE

            pooled.last_used = time.time()
            pooled.current_task_id = None
            self._metrics["total_releases"] += 1

            # Clean up GPU memory after release
            try:
                # Prefer new MemoryPolicyEngine
                engine = get_memory_policy_engine()
                engine.clear_cache()
            except Exception:
                # Fallback to legacy model_manager
                try:
                    model_manager = get_model_manager()
                    model_manager.memory_guard.clear_gpu_cache()
                except Exception as e:
                    logger.debug(f"Cache clear after release failed: {e}")

            # Notify waiting threads
            self._condition.notify_all()

            logger.debug(f"Service released for device {pooled.device}")

    @contextmanager
    def acquire_context(
        self,
        device: str = "GPU:0",
        timeout: Optional[float] = None,
        task_id: Optional[str] = None
    ):
        """
        Context manager for acquiring and releasing a service.

        Usage:
            with pool.acquire_context("GPU:0") as pooled:
                result = pooled.service.process(...)
        """
        pooled = None
        error = None
        try:
            pooled = self.acquire(device, timeout, task_id)
            if pooled is None:
                raise TimeoutError(f"Failed to acquire service for {device}")
            yield pooled
        except Exception as e:
            error = e
            raise
        finally:
            if pooled:
                self.release(pooled, error)
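
    # Illustrative only: a hypothetical caller (e.g., a processing worker) could
    # combine acquire_context() with explicit timeout handling and use the pool
    # statistics for diagnostics when no service becomes available:
    #
    #     pool = get_service_pool()
    #     try:
    #         with pool.acquire_context("GPU:0", timeout=30.0, task_id=task_id) as pooled:
    #             result = pooled.service.process(...)
    #     except TimeoutError:
    #         logger.warning("No OCR service available: %s", pool.get_pool_stats())
    #
    # acquire_context() re-raises any exception from the body and passes it to
    # release(), so repeated failures eventually mark the service UNHEALTHY.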

    def get_service(self, device: str = "GPU:0") -> Optional["OCRService"]:
        """
        Get a service directly (for backward compatibility).

        This acquires a service and returns the underlying OCRService.
        The caller is responsible for releasing the service when done;
        prefer acquire_context(), which releases automatically.

        Args:
            device: Device identifier

        Returns:
            OCRService instance or None
        """
        pooled = self.acquire(device)
        if pooled:
            return pooled.service
        return None

    def get_pool_stats(self) -> Dict:
        """Get current pool statistics"""
        with self._pool_lock:
            stats = {
                "devices": {},
                "metrics": self._metrics.copy(),
                "total_services": 0,
                "available_services": 0,
                "in_use_services": 0,
            }

            for device, services in self.services.items():
                available = sum(1 for s in services if s.state == ServiceState.AVAILABLE)
                in_use = sum(1 for s in services if s.state == ServiceState.IN_USE)
                unhealthy = sum(1 for s in services if s.state == ServiceState.UNHEALTHY)

                stats["devices"][device] = {
                    "total": len(services),
                    "available": available,
                    "in_use": in_use,
                    "unhealthy": unhealthy,
                    "max_allowed": self.config.max_services_per_device,
                }

                stats["total_services"] += len(services)
                stats["available_services"] += available
                stats["in_use_services"] += in_use

            return stats
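
    # For reference, get_pool_stats() returns a plain dict shaped roughly like
    # the following (numbers here are made up for illustration):
    #
    #     {
    #         "devices": {"GPU:0": {"total": 1, "available": 0, "in_use": 1,
    #                               "unhealthy": 0, "max_allowed": 1}},
    #         "metrics": {"total_acquisitions": 12, "total_releases": 11,
    #                     "total_timeouts": 0, "total_errors": 1, "queue_waits": 3},
    #         "total_services": 1,
    #         "available_services": 0,
    #         "in_use_services": 1,
    #     }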

    def health_check(self) -> Dict:
        """
        Perform health check on all pooled services.

        Returns:
            Health check results
        """
        results = {
            "healthy": True,
            "services": [],
            "timestamp": time.time()
        }

        with self._pool_lock:
            for device, services in self.services.items():
                for idx, pooled in enumerate(services):
                    service_health = {
                        "device": device,
                        "index": idx,
                        "state": pooled.state.value,
                        "error_count": pooled.error_count,
                        "use_count": pooled.use_count,
                        "idle_seconds": time.time() - pooled.last_used,
                    }

                    # Check if service is responsive
                    if pooled.state == ServiceState.AVAILABLE:
                        try:
                            # Simple check - verify service has required attributes
                            has_process = hasattr(pooled.service, 'process')
                            has_gpu_status = hasattr(pooled.service, 'get_gpu_status')
                            service_health["responsive"] = has_process and has_gpu_status
                        except Exception as e:
                            service_health["responsive"] = False
                            service_health["error"] = str(e)
                            results["healthy"] = False
                    else:
                        service_health["responsive"] = pooled.state != ServiceState.UNHEALTHY

                        if pooled.state == ServiceState.UNHEALTHY:
                            results["healthy"] = False

                    results["services"].append(service_health)

        return results
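
    # Illustrative only: a hypothetical background maintenance loop could pair
    # health_check() with recover_unhealthy(), reusing the configured
    # health_check_interval_seconds as its sleep interval:
    #
    #     pool = get_service_pool()
    #     while True:
    #         report = pool.health_check()
    #         if not report["healthy"]:
    #             pool.recover_unhealthy()
    #         time.sleep(pool.config.health_check_interval_seconds)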

    def recover_unhealthy(self):
        """
        Attempt to recover unhealthy services.
        """
        with self._pool_lock:
            for device, services in self.services.items():
                # Iterate over a copy: the list is mutated during recovery below
                for idx, pooled in enumerate(list(services)):
                    if pooled.state == ServiceState.UNHEALTHY:
                        logger.info(f"Attempting to recover unhealthy service {device}:{idx}")
                        try:
                            # Remove old service
                            services.remove(pooled)

                            # Create new service
                            new_pooled = self._create_service(device)
                            services.append(new_pooled)
                            logger.info(f"Successfully recovered service {device}:{idx}")

                        except Exception as e:
                            logger.error(f"Failed to recover service {device}:{idx}: {e}")

    def shutdown(self):
        """
        Shutdown the pool and cleanup all services.
        """
        logger.info("OCRServicePool shutdown started")

        with self._pool_lock:
            for device, services in self.services.items():
                for pooled in services:
                    try:
                        # Clean up service resources
                        if hasattr(pooled.service, 'cleanup_gpu_memory'):
                            pooled.service.cleanup_gpu_memory()
                    except Exception as e:
                        logger.warning(f"Error cleaning up service: {e}")

            # Clear all pools
            self.services.clear()
            self.semaphores.clear()
            self.queues.clear()

        logger.info("OCRServicePool shutdown completed")


# Global singleton instance
_service_pool: Optional[OCRServicePool] = None


def get_service_pool(config: Optional[PoolConfig] = None) -> OCRServicePool:
    """
    Get the global OCRServicePool instance.

    Args:
        config: Optional configuration (only used on first call)

    Returns:
        OCRServicePool singleton instance
    """
    global _service_pool
    if _service_pool is None:
        _service_pool = OCRServicePool(config)
    return _service_pool


def shutdown_service_pool():
    """Shutdown the global service pool"""
    global _service_pool
    if _service_pool is not None:
        _service_pool.shutdown()
        _service_pool = None
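

# Illustrative only: the typical lifecycle from application code (for example,
# a hypothetical worker or application shutdown hook) might look like:
#
#     pool = get_service_pool()                      # create or reuse the singleton
#     with pool.acquire_context("GPU:0") as pooled:  # borrow a service
#         result = pooled.service.process(...)       # run OCR work
#     ...
#     shutdown_service_pool()                        # release all services on exit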