"""
Tool_OCR - Cleanup Service

Handles file cleanup while preserving database records for statistics.
"""
import os
import shutil
import logging
from typing import Dict, List, Tuple
from datetime import datetime

from sqlalchemy.orm import Session
from sqlalchemy import and_, case, func

from app.models.task import Task, TaskFile, TaskStatus
from app.core.config import settings

logger = logging.getLogger(__name__)


class CleanupService:
    """Service for cleaning up files while preserving database records.

    Only files on disk are removed; Task / TaskFile rows are kept (with
    ``Task.file_deleted`` flipped to True) so historical statistics survive.
    """

    def cleanup_user_files(
        self,
        db: Session,
        user_id: int,
        max_files_to_keep: int = 50
    ) -> Dict:
        """
        Clean up old files for a user, keeping only the newest N tasks' files.
        Database records are preserved for statistics.

        Args:
            db: Database session
            user_id: User ID
            max_files_to_keep: Number of newest tasks to keep files for

        Returns:
            Dict with cleanup statistics (tasks cleaned, files deleted,
            bytes freed, tasks still holding files)
        """
        # Completed tasks that still hold files on disk; soft-deleted tasks
        # are handled elsewhere and skipped here.
        tasks_with_files = (
            db.query(Task)
            .filter(
                and_(
                    Task.user_id == user_id,
                    Task.status == TaskStatus.COMPLETED,
                    Task.file_deleted.is_(False),
                    Task.deleted_at.is_(None)  # don't process already soft-deleted
                )
            )
            .order_by(Task.created_at.desc())
            .all()
        )

        # Newest N tasks keep their files; everything older is cleaned.
        tasks_to_clean = tasks_with_files[max_files_to_keep:]

        files_deleted = 0
        bytes_freed = 0
        tasks_cleaned = 0

        for task in tasks_to_clean:
            task_bytes, task_files = self._delete_task_files(task)
            # Only mark the row when something was actually removed, so a
            # task whose files are already gone can be retried later.
            if task_files > 0:
                task.file_deleted = True
                # NOTE(review): utcnow() is naive/deprecated in 3.12+; the DB
                # column presumably stores naive UTC — confirm before changing.
                task.updated_at = datetime.utcnow()
                files_deleted += task_files
                bytes_freed += task_bytes
                tasks_cleaned += 1

        if tasks_cleaned > 0:
            db.commit()
            logger.info(
                f"Cleaned up {files_deleted} files ({bytes_freed} bytes) "
                f"from {tasks_cleaned} tasks for user {user_id}"
            )

        return {
            "user_id": user_id,
            "tasks_cleaned": tasks_cleaned,
            "files_deleted": files_deleted,
            "bytes_freed": bytes_freed,
            "tasks_with_files_remaining": min(len(tasks_with_files), max_files_to_keep)
        }

    def cleanup_all_users(
        self,
        db: Session,
        max_files_per_user: int = 50
    ) -> Dict:
        """
        Run cleanup for all users.

        Args:
            db: Database session
            max_files_per_user: Number of newest tasks to keep files for per user

        Returns:
            Dict with overall cleanup statistics
        """
        # Distinct users that still have undeleted files; rows come back as
        # one-element tuples from the single-column query.
        user_ids = (
            db.query(Task.user_id)
            .filter(Task.file_deleted.is_(False))
            .distinct()
            .all()
        )

        total_tasks_cleaned = 0
        total_files_deleted = 0
        total_bytes_freed = 0
        users_processed = 0

        for (user_id,) in user_ids:
            result = self.cleanup_user_files(db, user_id, max_files_per_user)
            total_tasks_cleaned += result["tasks_cleaned"]
            total_files_deleted += result["files_deleted"]
            total_bytes_freed += result["bytes_freed"]
            users_processed += 1

        logger.info(
            f"Cleanup completed: {users_processed} users, "
            f"{total_tasks_cleaned} tasks, {total_files_deleted} files, "
            f"{total_bytes_freed} bytes freed"
        )

        return {
            "users_processed": users_processed,
            "total_tasks_cleaned": total_tasks_cleaned,
            "total_files_deleted": total_files_deleted,
            "total_bytes_freed": total_bytes_freed,
            "timestamp": datetime.utcnow().isoformat()
        }

    def _delete_task_files(self, task: Task) -> Tuple[int, int]:
        """
        Delete actual files for a task from disk.

        Best-effort: individual deletion failures are logged and skipped so
        one bad path doesn't abort the whole cleanup run.

        Args:
            task: Task object

        Returns:
            Tuple of (bytes_deleted, files_deleted); the result directory
            counts as a single "file" in the second counter.
        """
        bytes_deleted = 0
        files_deleted = 0

        # Delete result directory (size measured before removal).
        result_dir = os.path.join(settings.result_dir, task.task_id)
        if os.path.exists(result_dir):
            try:
                dir_size = self._get_dir_size(result_dir)
                shutil.rmtree(result_dir)
                bytes_deleted += dir_size
                files_deleted += 1
                logger.debug(f"Deleted result directory: {result_dir}")
            except Exception as e:
                logger.error(f"Failed to delete result directory {result_dir}: {e}")

        # Delete uploaded files from task_files
        for task_file in task.files:
            if task_file.stored_path and os.path.exists(task_file.stored_path):
                try:
                    file_size = os.path.getsize(task_file.stored_path)
                    os.remove(task_file.stored_path)
                    bytes_deleted += file_size
                    files_deleted += 1
                    logger.debug(f"Deleted uploaded file: {task_file.stored_path}")
                except Exception as e:
                    logger.error(f"Failed to delete file {task_file.stored_path}: {e}")

        return bytes_deleted, files_deleted

    def _get_dir_size(self, path: str) -> int:
        """Get total size of a directory in bytes.

        Best-effort: unreadable entries/directories contribute 0 rather than
        raising (cleanup must not fail on permission errors or races).
        """
        total = 0
        try:
            for entry in os.scandir(path):
                if entry.is_file():
                    total += entry.stat().st_size
                elif entry.is_dir():
                    total += self._get_dir_size(entry.path)
        except OSError:
            # Narrowed from bare Exception: filesystem access only raises
            # OSError subclasses; anything else would be a real bug.
            pass
        return total

    def get_storage_stats(self, db: Session) -> Dict:
        """
        Get storage statistics for admin dashboard.

        Args:
            db: Database session

        Returns:
            Dict with storage statistics (task counts, disk usage,
            per-user breakdown)
        """
        # Count tasks by file_deleted status
        total_tasks = db.query(Task).count()
        tasks_with_files = db.query(Task).filter(Task.file_deleted.is_(False)).count()
        tasks_files_deleted = db.query(Task).filter(Task.file_deleted.is_(True)).count()
        soft_deleted_tasks = db.query(Task).filter(Task.deleted_at.isnot(None)).count()

        # Per-user statistics. BUGFIX: func.if_ emitted the literal "if_(...)"
        # in SQL (MySQL-only at best, invalid elsewhere); case() renders
        # portable CASE WHEN ... THEN 1 ELSE 0 END on every backend.
        user_stats = (
            db.query(
                Task.user_id,
                func.count(Task.id).label("total_tasks"),
                func.sum(
                    case((Task.file_deleted.is_(False), 1), else_=0)
                ).label("tasks_with_files"),
                func.sum(
                    case((Task.deleted_at.isnot(None), 1), else_=0)
                ).label("deleted_tasks")
            )
            .group_by(Task.user_id)
            .all()
        )

        # Calculate actual disk usage
        uploads_size = self._get_dir_size(settings.upload_dir)
        results_size = self._get_dir_size(settings.result_dir)

        return {
            "total_tasks": total_tasks,
            "tasks_with_files": tasks_with_files,
            "tasks_files_deleted": tasks_files_deleted,
            "soft_deleted_tasks": soft_deleted_tasks,
            "disk_usage": {
                "uploads_bytes": uploads_size,
                "results_bytes": results_size,
                "total_bytes": uploads_size + results_size,
                "uploads_mb": round(uploads_size / (1024 * 1024), 2),
                "results_mb": round(results_size / (1024 * 1024), 2),
                "total_mb": round((uploads_size + results_size) / (1024 * 1024), 2)
            },
            "per_user": [
                {
                    "user_id": stat.user_id,
                    "total_tasks": stat.total_tasks,
                    # SUM over an empty/NULL group can yield None — coerce to 0.
                    "tasks_with_files": int(stat.tasks_with_files or 0),
                    "deleted_tasks": int(stat.deleted_tasks or 0)
                }
                for stat in user_stats
            ]
        }


# Global service instance
cleanup_service = CleanupService()