""" Tool_OCR - Background Tasks Service Handles async processing, cleanup, and scheduled tasks """ import logging import asyncio import time from datetime import datetime, timedelta from pathlib import Path from typing import Optional, Callable, Any from sqlalchemy.orm import Session from app.core.database import SessionLocal from app.models.ocr import OCRBatch, OCRFile, OCRResult, BatchStatus, FileStatus from app.services.ocr_service import OCRService from app.services.file_manager import FileManager from app.services.pdf_generator import PDFGenerator logger = logging.getLogger(__name__) class BackgroundTaskManager: """ Manages background tasks including retry logic, cleanup, and scheduled jobs """ def __init__( self, max_retries: int = 3, retry_delay: int = 5, cleanup_interval: int = 3600, # 1 hour file_retention_hours: int = 24 ): self.max_retries = max_retries self.retry_delay = retry_delay self.cleanup_interval = cleanup_interval self.file_retention_hours = file_retention_hours self.ocr_service = OCRService() self.file_manager = FileManager() self.pdf_generator = PDFGenerator() async def execute_with_retry( self, func: Callable, *args, max_retries: Optional[int] = None, retry_delay: Optional[int] = None, **kwargs ) -> Any: """ Execute a function with retry logic Args: func: Function to execute args: Positional arguments for func max_retries: Maximum retry attempts (overrides default) retry_delay: Delay between retries in seconds (overrides default) kwargs: Keyword arguments for func Returns: Function result Raises: Exception: If all retries are exhausted """ max_retries = max_retries or self.max_retries retry_delay = retry_delay or self.retry_delay last_exception = None for attempt in range(max_retries + 1): try: if asyncio.iscoroutinefunction(func): return await func(*args, **kwargs) else: return func(*args, **kwargs) except Exception as e: last_exception = e if attempt < max_retries: logger.warning( f"Attempt {attempt + 1}/{max_retries + 1} failed for {func.__name__}: {e}. " f"Retrying in {retry_delay}s..." 

    def process_single_file_with_retry(
        self,
        ocr_file: OCRFile,
        batch_id: int,
        lang: str,
        detect_layout: bool,
        db: Session
    ) -> bool:
        """
        Process a single file with retry logic.

        Args:
            ocr_file: OCRFile instance
            batch_id: Batch ID
            lang: Language code
            detect_layout: Whether to detect layout
            db: Database session

        Returns:
            bool: True if successful, False otherwise
        """
        for attempt in range(self.max_retries + 1):
            try:
                # Update file status
                ocr_file.status = FileStatus.PROCESSING
                ocr_file.started_at = datetime.utcnow()
                ocr_file.retry_count = attempt
                db.commit()

                # Get file paths
                file_path = Path(ocr_file.file_path)
                paths = self.file_manager.get_file_paths(batch_id, ocr_file.id)

                # Process OCR
                result = self.ocr_service.process_image(
                    file_path,
                    lang=lang,
                    detect_layout=detect_layout
                )

                # Check if processing was successful
                if result['status'] != 'success':
                    raise Exception(result.get('error_message', 'Unknown error during OCR processing'))

                # Save results
                json_path, markdown_path = self.ocr_service.save_results(
                    result=result,
                    output_dir=paths["output_dir"],
                    file_id=str(ocr_file.id)
                )

                # Extract data from result
                text_regions = result.get('text_regions', [])
                layout_data = result.get('layout_data')
                images_metadata = result.get('images_metadata', [])

                # Average confidence as reported by the OCR service
                avg_confidence = result.get('average_confidence')

                # Create OCR result record
                ocr_result = OCRResult(
                    file_id=ocr_file.id,
                    markdown_path=str(markdown_path) if markdown_path else None,
                    json_path=str(json_path) if json_path else None,
                    images_dir=None,  # Images dir not used in current implementation
                    detected_language=lang,
                    total_text_regions=len(text_regions),
                    average_confidence=avg_confidence,
                    layout_data=layout_data,
                    images_metadata=images_metadata
                )
                db.add(ocr_result)

                # Update file status
                ocr_file.status = FileStatus.COMPLETED
                ocr_file.completed_at = datetime.utcnow()
                ocr_file.processing_time = (
                    ocr_file.completed_at - ocr_file.started_at
                ).total_seconds()
                db.commit()

                logger.info(f"Successfully processed file {ocr_file.id} ({ocr_file.original_filename})")
                return True

            except Exception as e:
                # Discard any uncommitted changes (e.g. a half-added OCRResult)
                # before recording the failure
                db.rollback()
                logger.error(f"Attempt {attempt + 1}/{self.max_retries + 1} failed for file {ocr_file.id}: {e}")

                if attempt < self.max_retries:
                    # Wait before retry
                    time.sleep(self.retry_delay)
                else:
                    # Final failure
                    ocr_file.status = FileStatus.FAILED
                    ocr_file.error_message = f"Failed after {self.max_retries + 1} attempts: {str(e)}"
                    ocr_file.completed_at = datetime.utcnow()
                    ocr_file.retry_count = attempt
                    db.commit()
                    return False

        return False
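
    # Note: each attempt re-commits PROCESSING status and retry_count before
    # running OCR, so clients polling the batch can observe live retry
    # progress. A minimal direct call (illustrative; assumes a pending OCRFile
    # row with primary key file_id already exists) might look like:
    #
    #     with SessionLocal() as session:
    #         pending = session.get(OCRFile, file_id)
    #         ok = task_manager.process_single_file_with_retry(
    #             ocr_file=pending, batch_id=pending.batch_id,
    #             lang="en", detect_layout=True, db=session,
    #         )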

    async def cleanup_expired_files(self, db: Session):
        """
        Clean up files and batches older than the retention period.

        Args:
            db: Database session
        """
        try:
            cutoff_time = datetime.utcnow() - timedelta(hours=self.file_retention_hours)

            # Find expired batches that have finished processing
            expired_batches = db.query(OCRBatch).filter(
                OCRBatch.created_at < cutoff_time,
                OCRBatch.status.in_([BatchStatus.COMPLETED, BatchStatus.FAILED, BatchStatus.PARTIAL])
            ).all()

            logger.info(f"Found {len(expired_batches)} expired batches to clean up")

            for batch in expired_batches:
                try:
                    # Get batch directory
                    batch_dir = self.file_manager.base_upload_dir / "batches" / str(batch.id)

                    # Delete physical files
                    if batch_dir.exists():
                        shutil.rmtree(batch_dir)
                        logger.info(f"Deleted batch directory: {batch_dir}")

                    # Delete database records: results first (foreign key constraint)
                    db.query(OCRResult).filter(
                        OCRResult.file_id.in_(
                            db.query(OCRFile.id).filter(OCRFile.batch_id == batch.id)
                        )
                    ).delete(synchronize_session=False)

                    # Delete files
                    db.query(OCRFile).filter(OCRFile.batch_id == batch.id).delete()

                    # Delete batch
                    db.delete(batch)
                    db.commit()

                    logger.info(f"Cleaned up expired batch {batch.id}")

                except Exception as e:
                    logger.error(f"Error cleaning up batch {batch.id}: {e}")
                    db.rollback()

        except Exception as e:
            logger.error(f"Error in cleanup_expired_files: {e}")

    async def generate_pdf_background(
        self,
        result_id: int,
        output_path: str,
        css_template: str = "default",
        db: Optional[Session] = None
    ):
        """
        Generate a PDF in the background with retry logic.

        Args:
            result_id: OCR result ID
            output_path: Output PDF path
            css_template: CSS template name
            db: Database session (a new one is created and closed if omitted)
        """
        should_close_db = False
        if db is None:
            db = SessionLocal()
            should_close_db = True

        try:
            # Get result
            result = db.query(OCRResult).filter(OCRResult.id == result_id).first()
            if not result:
                logger.error(f"Result {result_id} not found")
                return

            # Generate PDF with retry
            await self.execute_with_retry(
                self.pdf_generator.generate_pdf,
                markdown_path=result.markdown_path,
                output_path=output_path,
                css_template=css_template,
                max_retries=2,
                retry_delay=3
            )

            logger.info(f"Successfully generated PDF for result {result_id}: {output_path}")

        except Exception as e:
            logger.error(f"Failed to generate PDF for result {result_id}: {e}")
        finally:
            if should_close_db:
                db.close()

    async def start_cleanup_scheduler(self):
        """
        Start the periodic cleanup scheduler.

        Runs the cleanup task at the configured interval.
        """
        logger.info(
            f"Starting cleanup scheduler "
            f"(interval: {self.cleanup_interval}s, retention: {self.file_retention_hours}h)"
        )

        while True:
            db = SessionLocal()
            try:
                await self.cleanup_expired_files(db)
            except Exception as e:
                logger.error(f"Error in cleanup scheduler: {e}")
            finally:
                # Always release the session, even if cleanup raised
                db.close()

            # Wait for next interval
            await asyncio.sleep(self.cleanup_interval)


# Global task manager instance
task_manager = BackgroundTaskManager()
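
# Sketch of wiring the scheduler into application startup (illustrative;
# assumes a FastAPI app object, which this module does not import):
#
#     @app.on_event("startup")
#     async def _start_background_tasks():
#         asyncio.create_task(task_manager.start_cleanup_scheduler())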


def process_batch_files_with_retry(
    batch_id: int,
    lang: str,
    detect_layout: bool,
    db: Session
):
    """
    Process all files in a batch with retry logic.

    Args:
        batch_id: Batch ID
        lang: Language code
        detect_layout: Whether to detect layout
        db: Database session
    """
    try:
        # Get batch
        batch = db.query(OCRBatch).filter(OCRBatch.id == batch_id).first()
        if not batch:
            logger.error(f"Batch {batch_id} not found")
            return

        # Update batch status
        batch.status = BatchStatus.PROCESSING
        batch.started_at = datetime.utcnow()
        db.commit()

        # Get pending files
        files = db.query(OCRFile).filter(
            OCRFile.batch_id == batch_id,
            OCRFile.status == FileStatus.PENDING
        ).all()

        logger.info(f"Processing {len(files)} files in batch {batch_id} with retry logic")

        # Process each file with retry
        for ocr_file in files:
            success = task_manager.process_single_file_with_retry(
                ocr_file=ocr_file,
                batch_id=batch_id,
                lang=lang,
                detect_layout=detect_layout,
                db=db
            )

            # Update batch progress
            if success:
                batch.completed_files += 1
            else:
                batch.failed_files += 1
            db.commit()

        # Update batch final status
        if batch.failed_files == 0:
            batch.status = BatchStatus.COMPLETED
        elif batch.completed_files > 0:
            batch.status = BatchStatus.PARTIAL
        else:
            batch.status = BatchStatus.FAILED

        batch.completed_at = datetime.utcnow()
        db.commit()

        logger.info(
            f"Batch {batch_id} processing complete: "
            f"{batch.completed_files} succeeded, {batch.failed_files} failed"
        )

    except Exception as e:
        logger.error(f"Fatal error processing batch {batch_id}: {e}")
        try:
            # Reset the session before recording the failure
            db.rollback()
            batch = db.query(OCRBatch).filter(OCRBatch.id == batch_id).first()
            if batch:
                batch.status = BatchStatus.FAILED
                batch.completed_at = datetime.utcnow()
                db.commit()
        except Exception as commit_error:
            logger.error(f"Error updating batch status: {commit_error}")
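

# Sketch of dispatching a batch from an API handler (illustrative; assumes
# FastAPI's BackgroundTasks, and opens a dedicated session because a
# request-scoped session may already be closed when the task runs):
#
#     def _run_batch(batch_id: int) -> None:
#         session = SessionLocal()
#         try:
#             process_batch_files_with_retry(
#                 batch_id=batch_id, lang="en", detect_layout=True, db=session,
#             )
#         finally:
#             session.close()
#
#     @app.post("/batches/{batch_id}/process")
#     async def start_batch(batch_id: int, background_tasks: BackgroundTasks):
#         background_tasks.add_task(_run_batch, batch_id)
#         return {"status": "queued"}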