first

backend/app/services/background_tasks.py (new file, 394 lines)

@@ -0,0 +1,394 @@
"""
|
||||
Tool_OCR - Background Tasks Service
|
||||
Handles async processing, cleanup, and scheduled tasks
|
||||
"""
|
||||
|
||||
import logging
|
||||
import asyncio
|
||||
import time
|
||||
from datetime import datetime, timedelta
|
||||
from pathlib import Path
|
||||
from typing import Optional, Callable, Any
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from app.core.database import SessionLocal
|
||||
from app.models.ocr import OCRBatch, OCRFile, OCRResult, BatchStatus, FileStatus
|
||||
from app.services.ocr_service import OCRService
|
||||
from app.services.file_manager import FileManager
|
||||
from app.services.pdf_generator import PDFGenerator
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class BackgroundTaskManager:
    """
    Manages background tasks including retry logic, cleanup, and scheduled jobs
    """

    def __init__(
        self,
        max_retries: int = 3,
        retry_delay: int = 5,
        cleanup_interval: int = 3600,  # 1 hour
        file_retention_hours: int = 24
    ):
        self.max_retries = max_retries
        self.retry_delay = retry_delay
        self.cleanup_interval = cleanup_interval
        self.file_retention_hours = file_retention_hours
        self.ocr_service = OCRService()
        self.file_manager = FileManager()
        self.pdf_generator = PDFGenerator()
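
    # Construction sketch: the defaults above give 3 retries with a 5 s delay,
    # hourly cleanup, and 24 h file retention. A deployment could tune these,
    # e.g. (hypothetical values):
    #
    #     task_manager = BackgroundTaskManager(max_retries=5, file_retention_hours=72)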

    async def execute_with_retry(
        self,
        func: Callable,
        *args,
        max_retries: Optional[int] = None,
        retry_delay: Optional[int] = None,
        **kwargs
    ) -> Any:
        """
        Execute a function with retry logic

        Args:
            func: Function to execute
            args: Positional arguments for func
            max_retries: Maximum retry attempts (overrides default)
            retry_delay: Delay between retries in seconds (overrides default)
            kwargs: Keyword arguments for func

        Returns:
            Function result

        Raises:
            Exception: If all retries are exhausted
        """
        # Explicit `is None` checks so that a caller-supplied 0 is respected
        # rather than silently replaced by the defaults.
        max_retries = self.max_retries if max_retries is None else max_retries
        retry_delay = self.retry_delay if retry_delay is None else retry_delay

        last_exception = None
        for attempt in range(max_retries + 1):
            try:
                if asyncio.iscoroutinefunction(func):
                    return await func(*args, **kwargs)
                else:
                    return func(*args, **kwargs)
            except Exception as e:
                last_exception = e
                if attempt < max_retries:
                    logger.warning(
                        f"Attempt {attempt + 1}/{max_retries + 1} failed for {func.__name__}: {e}. "
                        f"Retrying in {retry_delay}s..."
                    )
                    await asyncio.sleep(retry_delay)
                else:
                    logger.error(
                        f"All {max_retries + 1} attempts failed for {func.__name__}: {e}"
                    )

        raise last_exception
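
    # Usage sketch (hypothetical caller; `flaky_network_call` is illustrative):
    #
    #     result = await task_manager.execute_with_retry(
    #         flaky_network_call, url, max_retries=5, retry_delay=2
    #     )
    #
    # Note that synchronous callables run directly on the event loop thread,
    # so long-running sync work would block other tasks while it executes.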

    def process_single_file_with_retry(
        self,
        ocr_file: OCRFile,
        batch_id: int,
        lang: str,
        detect_layout: bool,
        db: Session
    ) -> bool:
        """
        Process a single file with retry logic

        Args:
            ocr_file: OCRFile instance
            batch_id: Batch ID
            lang: Language code
            detect_layout: Whether to detect layout
            db: Database session

        Returns:
            bool: True if successful, False otherwise
        """
        for attempt in range(self.max_retries + 1):
            try:
                # Update file status
                ocr_file.status = FileStatus.PROCESSING
                ocr_file.started_at = datetime.utcnow()
                ocr_file.retry_count = attempt
                db.commit()

                # Get file paths
                file_path = Path(ocr_file.file_path)
                paths = self.file_manager.get_file_paths(batch_id, ocr_file.id)

                # Process OCR
                result = self.ocr_service.process_image(
                    file_path,
                    lang=lang,
                    detect_layout=detect_layout
                )

                # Check if processing was successful
                if result['status'] != 'success':
                    raise Exception(result.get('error_message', 'Unknown error during OCR processing'))

                # Save results
                json_path, markdown_path = self.ocr_service.save_results(
                    result=result,
                    output_dir=paths["output_dir"],
                    file_id=str(ocr_file.id)
                )

                # Extract data from result
                text_regions = result.get('text_regions', [])
                layout_data = result.get('layout_data')
                images_metadata = result.get('images_metadata', [])

                # Average confidence as reported by the OCR service
                avg_confidence = result.get('average_confidence')

                # Create OCR result record
                ocr_result = OCRResult(
                    file_id=ocr_file.id,
                    markdown_path=str(markdown_path) if markdown_path else None,
                    json_path=str(json_path) if json_path else None,
                    images_dir=None,  # Images dir not used in current implementation
                    detected_language=lang,
                    total_text_regions=len(text_regions),
                    average_confidence=avg_confidence,
                    layout_data=layout_data,
                    images_metadata=images_metadata
                )
                db.add(ocr_result)

                # Update file status
                ocr_file.status = FileStatus.COMPLETED
                ocr_file.completed_at = datetime.utcnow()
                ocr_file.processing_time = (ocr_file.completed_at - ocr_file.started_at).total_seconds()

                db.commit()

                logger.info(f"Successfully processed file {ocr_file.id} ({ocr_file.original_filename})")
                return True

            except Exception as e:
                logger.error(f"Attempt {attempt + 1}/{self.max_retries + 1} failed for file {ocr_file.id}: {e}")

                if attempt < self.max_retries:
                    # Wait before retry (blocking sleep; this method is synchronous)
                    time.sleep(self.retry_delay)
                else:
                    # Final failure
                    ocr_file.status = FileStatus.FAILED
                    ocr_file.error_message = f"Failed after {self.max_retries + 1} attempts: {str(e)}"
                    ocr_file.completed_at = datetime.utcnow()
                    ocr_file.retry_count = attempt
                    db.commit()
                    return False

        return False  # Unreachable in practice; kept as a defensive fallback
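
    # Invocation sketch (hypothetical; assumes a pending OCRFile row exists):
    #
    #     db = SessionLocal()
    #     try:
    #         ocr_file = db.query(OCRFile).filter(OCRFile.id == file_id).first()
    #         ok = task_manager.process_single_file_with_retry(
    #             ocr_file, batch_id=ocr_file.batch_id, lang="en",
    #             detect_layout=True, db=db,
    #         )
    #     finally:
    #         db.close()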

    async def cleanup_expired_files(self, db: Session):
        """
        Clean up files and batches older than retention period

        Args:
            db: Database session
        """
        try:
            cutoff_time = datetime.utcnow() - timedelta(hours=self.file_retention_hours)

            # Find expired batches
            expired_batches = db.query(OCRBatch).filter(
                OCRBatch.created_at < cutoff_time,
                OCRBatch.status.in_([BatchStatus.COMPLETED, BatchStatus.FAILED, BatchStatus.PARTIAL])
            ).all()

            logger.info(f"Found {len(expired_batches)} expired batches to clean up")

            for batch in expired_batches:
                try:
                    # Get batch directory
                    batch_dir = self.file_manager.base_upload_dir / "batches" / str(batch.id)

                    # Delete physical files
                    if batch_dir.exists():
                        shutil.rmtree(batch_dir)
                        logger.info(f"Deleted batch directory: {batch_dir}")

                    # Delete database records:
                    # delete results first (foreign key constraint)
                    db.query(OCRResult).filter(
                        OCRResult.file_id.in_(
                            db.query(OCRFile.id).filter(OCRFile.batch_id == batch.id)
                        )
                    ).delete(synchronize_session=False)

                    # Delete files
                    db.query(OCRFile).filter(OCRFile.batch_id == batch.id).delete()

                    # Delete batch
                    db.delete(batch)
                    db.commit()

                    logger.info(f"Cleaned up expired batch {batch.id}")

                except Exception as e:
                    logger.error(f"Error cleaning up batch {batch.id}: {e}")
                    db.rollback()

        except Exception as e:
            logger.error(f"Error in cleanup_expired_files: {e}")
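
    # One-off invocation sketch (hypothetical, e.g. from a maintenance script):
    #
    #     db = SessionLocal()
    #     try:
    #         asyncio.run(task_manager.cleanup_expired_files(db))
    #     finally:
    #         db.close()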

    async def generate_pdf_background(
        self,
        result_id: int,
        output_path: str,
        css_template: str = "default",
        db: Optional[Session] = None
    ):
        """
        Generate PDF in background with retry logic

        Args:
            result_id: OCR result ID
            output_path: Output PDF path
            css_template: CSS template name
            db: Database session (a new one is created and closed if omitted)
        """
        should_close_db = False
        if db is None:
            db = SessionLocal()
            should_close_db = True

        try:
            # Get result
            result = db.query(OCRResult).filter(OCRResult.id == result_id).first()
            if not result:
                logger.error(f"Result {result_id} not found")
                return

            # Generate PDF with retry
            await self.execute_with_retry(
                self.pdf_generator.generate_pdf,
                markdown_path=result.markdown_path,
                output_path=output_path,
                css_template=css_template,
                max_retries=2,
                retry_delay=3
            )

            logger.info(f"Successfully generated PDF for result {result_id}: {output_path}")

        except Exception as e:
            logger.error(f"Failed to generate PDF for result {result_id}: {e}")
        finally:
            if should_close_db:
                db.close()
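
    # Scheduling sketch (hypothetical FastAPI route; the project's actual
    # routes are not part of this commit):
    #
    #     @router.post("/results/{result_id}/pdf")
    #     async def export_pdf(result_id: int, background_tasks: BackgroundTasks):
    #         output_path = f"/tmp/{result_id}.pdf"
    #         background_tasks.add_task(
    #             task_manager.generate_pdf_background, result_id, output_path
    #         )
    #         return {"status": "scheduled"}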

    async def start_cleanup_scheduler(self):
        """
        Start periodic cleanup scheduler

        Runs cleanup task at specified intervals
        """
        logger.info(f"Starting cleanup scheduler (interval: {self.cleanup_interval}s, retention: {self.file_retention_hours}h)")

        while True:
            db = None
            try:
                db = SessionLocal()
                await self.cleanup_expired_files(db)
            except Exception as e:
                logger.error(f"Error in cleanup scheduler: {e}")
            finally:
                # Always release the session, even if cleanup raised
                if db is not None:
                    db.close()

            # Wait for next interval
            await asyncio.sleep(self.cleanup_interval)
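
    # Startup wiring sketch (hypothetical; assumes the app exposes a startup
    # hook such as FastAPI's startup event):
    #
    #     @app.on_event("startup")
    #     async def start_background_tasks():
    #         asyncio.create_task(task_manager.start_cleanup_scheduler())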


# Global task manager instance
task_manager = BackgroundTaskManager()


def process_batch_files_with_retry(
    batch_id: int,
    lang: str,
    detect_layout: bool,
    db: Session
):
    """
    Process all files in a batch with retry logic

    Args:
        batch_id: Batch ID
        lang: Language code
        detect_layout: Whether to detect layout
        db: Database session
    """
    try:
        # Get batch
        batch = db.query(OCRBatch).filter(OCRBatch.id == batch_id).first()
        if not batch:
            logger.error(f"Batch {batch_id} not found")
            return

        # Update batch status
        batch.status = BatchStatus.PROCESSING
        batch.started_at = datetime.utcnow()
        db.commit()

        # Get pending files
        files = db.query(OCRFile).filter(
            OCRFile.batch_id == batch_id,
            OCRFile.status == FileStatus.PENDING
        ).all()

        logger.info(f"Processing {len(files)} files in batch {batch_id} with retry logic")

        # Process each file with retry
        for ocr_file in files:
            success = task_manager.process_single_file_with_retry(
                ocr_file=ocr_file,
                batch_id=batch_id,
                lang=lang,
                detect_layout=detect_layout,
                db=db
            )

            # Update batch progress
            if success:
                batch.completed_files += 1
            else:
                batch.failed_files += 1

            db.commit()

        # Update batch final status
        if batch.failed_files == 0:
            batch.status = BatchStatus.COMPLETED
        elif batch.completed_files > 0:
            batch.status = BatchStatus.PARTIAL
        else:
            batch.status = BatchStatus.FAILED

        batch.completed_at = datetime.utcnow()
        db.commit()

        logger.info(
            f"Batch {batch_id} processing complete: "
            f"{batch.completed_files} succeeded, {batch.failed_files} failed"
        )

    except Exception as e:
        logger.error(f"Fatal error processing batch {batch_id}: {e}")
        try:
            batch = db.query(OCRBatch).filter(OCRBatch.id == batch_id).first()
            if batch:
                batch.status = BatchStatus.FAILED
                batch.completed_at = datetime.utcnow()
                db.commit()
        except Exception as commit_error:
            logger.error(f"Error updating batch status: {commit_error}")