- Add soft delete (deleted_at column) to preserve task records for statistics - Implement cleanup service to delete old files while keeping DB records - Add automatic cleanup scheduler (configurable interval, default 24h) - Add admin endpoints: storage stats, cleanup trigger, scheduler status - Update task service with admin views (include deleted/files_deleted) - Add frontend storage management UI in admin dashboard - Add i18n translations for storage management 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
247 lines
8.1 KiB
Python
247 lines
8.1 KiB
Python
"""
|
|
Tool_OCR - Cleanup Service
|
|
Handles file cleanup while preserving database records for statistics
|
|
"""
|
|
|
|
import logging
import os
import shutil
from datetime import datetime
from typing import Dict, List, Tuple

from sqlalchemy import and_, case, func
from sqlalchemy.orm import Session

from app.core.config import settings
from app.models.task import Task, TaskFile, TaskStatus
|
|
|
|
# Module-level logger following the stdlib convention of naming it after the module.
logger = logging.getLogger(__name__)
|
class CleanupService:
    """Service for cleaning up files while preserving database records.

    Disk files belonging to old completed tasks are removed once a user
    exceeds their retention quota, but the ``Task`` rows themselves are
    kept (flagged via ``file_deleted``) so usage statistics stay intact.
    """

    def cleanup_user_files(
        self,
        db: Session,
        user_id: int,
        max_files_to_keep: int = 50,
    ) -> Dict:
        """
        Clean up old files for a user, keeping only the newest N tasks' files.

        Database records are preserved for statistics.

        Args:
            db: Database session
            user_id: User ID
            max_files_to_keep: Number of newest tasks to keep files for

        Returns:
            Dict with cleanup statistics
        """
        # Completed tasks that still have files on disk, newest first so the
        # slice below keeps exactly the most recent ``max_files_to_keep``.
        tasks_with_files = (
            db.query(Task)
            .filter(
                and_(
                    Task.user_id == user_id,
                    Task.status == TaskStatus.COMPLETED,
                    Task.file_deleted == False,  # noqa: E712 -- SQLAlchemy expression
                    Task.deleted_at.is_(None),  # don't process already soft-deleted
                )
            )
            .order_by(Task.created_at.desc())
            .all()
        )

        # Keep newest N tasks, clean files from the older ones.
        tasks_to_clean = tasks_with_files[max_files_to_keep:]

        files_deleted = 0
        bytes_freed = 0
        tasks_cleaned = 0

        for task in tasks_to_clean:
            task_bytes, task_files = self._delete_task_files(task)
            if task_files > 0:
                # Flag the record instead of deleting it so stats survive.
                task.file_deleted = True
                # NOTE(review): utcnow() is naive (and deprecated in 3.12);
                # kept as-is because switching to timezone-aware datetimes
                # would change what is stored -- confirm before migrating.
                task.updated_at = datetime.utcnow()
                files_deleted += task_files
                bytes_freed += task_bytes
                tasks_cleaned += 1

        if tasks_cleaned > 0:
            db.commit()
            # Lazy %-style args so formatting is skipped when INFO is off.
            logger.info(
                "Cleaned up %s files (%s bytes) from %s tasks for user %s",
                files_deleted,
                bytes_freed,
                tasks_cleaned,
                user_id,
            )

        return {
            "user_id": user_id,
            "tasks_cleaned": tasks_cleaned,
            "files_deleted": files_deleted,
            "bytes_freed": bytes_freed,
            "tasks_with_files_remaining": min(len(tasks_with_files), max_files_to_keep),
        }

    def cleanup_all_users(
        self,
        db: Session,
        max_files_per_user: int = 50,
    ) -> Dict:
        """
        Run cleanup for all users.

        Args:
            db: Database session
            max_files_per_user: Number of newest tasks to keep files for per user

        Returns:
            Dict with overall cleanup statistics
        """
        # Every user that still owns at least one task with files on disk.
        user_ids = (
            db.query(Task.user_id)
            .filter(Task.file_deleted == False)  # noqa: E712
            .distinct()
            .all()
        )

        total_tasks_cleaned = 0
        total_files_deleted = 0
        total_bytes_freed = 0
        users_processed = 0

        # Query rows are 1-tuples; unpack directly in the loop target.
        for (user_id,) in user_ids:
            result = self.cleanup_user_files(db, user_id, max_files_per_user)
            total_tasks_cleaned += result["tasks_cleaned"]
            total_files_deleted += result["files_deleted"]
            total_bytes_freed += result["bytes_freed"]
            users_processed += 1

        logger.info(
            "Cleanup completed: %s users, %s tasks, %s files, %s bytes freed",
            users_processed,
            total_tasks_cleaned,
            total_files_deleted,
            total_bytes_freed,
        )

        return {
            "users_processed": users_processed,
            "total_tasks_cleaned": total_tasks_cleaned,
            "total_files_deleted": total_files_deleted,
            "total_bytes_freed": total_bytes_freed,
            "timestamp": datetime.utcnow().isoformat(),
        }

    def _delete_task_files(self, task: Task) -> Tuple[int, int]:
        """
        Delete actual files for a task from disk.

        Best-effort: individual deletion failures are logged and skipped so a
        single bad path cannot abort the whole cleanup run.

        Args:
            task: Task object

        Returns:
            Tuple of (bytes_deleted, files_deleted).  Note: a removed result
            directory counts as one "file" regardless of its contents.
        """
        bytes_deleted = 0
        files_deleted = 0

        # Delete the task's result directory (processing output), if present.
        result_dir = os.path.join(settings.result_dir, task.task_id)
        if os.path.exists(result_dir):
            try:
                # Measure before removal so freed bytes can be reported.
                dir_size = self._get_dir_size(result_dir)
                shutil.rmtree(result_dir)
                bytes_deleted += dir_size
                files_deleted += 1
                logger.debug("Deleted result directory: %s", result_dir)
            except OSError as e:
                # Narrowed from Exception: rmtree raises OSError subclasses.
                logger.error("Failed to delete result directory %s: %s", result_dir, e)

        # Delete the originally uploaded files recorded in task_files.
        for task_file in task.files:
            if task_file.stored_path and os.path.exists(task_file.stored_path):
                try:
                    file_size = os.path.getsize(task_file.stored_path)
                    os.remove(task_file.stored_path)
                    bytes_deleted += file_size
                    files_deleted += 1
                    logger.debug("Deleted uploaded file: %s", task_file.stored_path)
                except OSError as e:
                    logger.error("Failed to delete file %s: %s", task_file.stored_path, e)

        return bytes_deleted, files_deleted

    def _get_dir_size(self, path: str) -> int:
        """Return the total size of a directory tree in bytes.

        Best-effort: entries that vanish or become unreadable mid-scan are
        skipped (cleanup may be deleting files concurrently), and a missing
        or unreadable root yields 0.  Only OSError is swallowed -- the
        previous blanket ``except Exception`` could hide real bugs.
        """
        total = 0
        try:
            for entry in os.scandir(path):
                try:
                    if entry.is_file():
                        total += entry.stat().st_size
                    elif entry.is_dir():
                        total += self._get_dir_size(entry.path)
                except OSError:
                    # Entry disappeared or stat failed; skip just this entry.
                    continue
        except OSError:
            # Root missing/unreadable: report 0 rather than fail the caller.
            pass
        return total

    def get_storage_stats(self, db: Session) -> Dict:
        """
        Get storage statistics for admin dashboard.

        Args:
            db: Database session

        Returns:
            Dict with storage statistics
        """
        # Count tasks by file_deleted / soft-delete status.
        total_tasks = db.query(Task).count()
        tasks_with_files = db.query(Task).filter(Task.file_deleted == False).count()  # noqa: E712
        tasks_files_deleted = db.query(Task).filter(Task.file_deleted == True).count()  # noqa: E712
        soft_deleted_tasks = db.query(Task).filter(Task.deleted_at.isnot(None)).count()

        # Per-user statistics.  SUM(CASE ...) is portable ANSI SQL; the
        # previous func.if_() form only renders valid SQL on MySQL/MariaDB.
        user_stats = (
            db.query(
                Task.user_id,
                func.count(Task.id).label("total_tasks"),
                func.sum(
                    case((Task.file_deleted == False, 1), else_=0)  # noqa: E712
                ).label("tasks_with_files"),
                func.sum(
                    case((Task.deleted_at.isnot(None), 1), else_=0)
                ).label("deleted_tasks"),
            )
            .group_by(Task.user_id)
            .all()
        )

        # Actual disk usage (may lag the DB counters while a cleanup runs).
        uploads_size = self._get_dir_size(settings.upload_dir)
        results_size = self._get_dir_size(settings.result_dir)

        return {
            "total_tasks": total_tasks,
            "tasks_with_files": tasks_with_files,
            "tasks_files_deleted": tasks_files_deleted,
            "soft_deleted_tasks": soft_deleted_tasks,
            "disk_usage": {
                "uploads_bytes": uploads_size,
                "results_bytes": results_size,
                "total_bytes": uploads_size + results_size,
                "uploads_mb": round(uploads_size / (1024 * 1024), 2),
                "results_mb": round(results_size / (1024 * 1024), 2),
                "total_mb": round((uploads_size + results_size) / (1024 * 1024), 2),
            },
            "per_user": [
                {
                    "user_id": stat.user_id,
                    "total_tasks": stat.total_tasks,
                    # SUM() yields NULL (None) for some backends; coerce to int.
                    "tasks_with_files": int(stat.tasks_with_files or 0),
                    "deleted_tasks": int(stat.deleted_tasks or 0),
                }
                for stat in user_stats
            ],
        }
|
|
|
|
|
|
# Global service instance: module-level singleton shared by API routes and
# the scheduler (the service holds no per-request state, so this is safe).
cleanup_service = CleanupService()
|