feat: add storage cleanup mechanism with soft delete and auto scheduler
- Add soft delete (deleted_at column) to preserve task records for statistics
- Implement cleanup service to delete old files while keeping DB records
- Add automatic cleanup scheduler (configurable interval, default 24h)
- Add admin endpoints: storage stats, cleanup trigger, scheduler status
- Update task service with admin views (include deleted/files_deleted)
- Add frontend storage management UI in admin dashboard
- Add i18n translations for storage management

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
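The soft-delete flow itself is handled in the task service rather than in the file below; a minimal sketch of the idea, where the helper name and session handling are assumptions and only the `Task.deleted_at` column comes from this commit:

```python
# Hypothetical helper -- illustrates the soft-delete idea only; not part of this diff.
from datetime import datetime
from sqlalchemy.orm import Session
from app.models.task import Task

def soft_delete_task(db: Session, task_id: str) -> None:
    """Mark a task as deleted without removing the row, so statistics survive."""
    task = db.query(Task).filter(Task.task_id == task_id).first()
    if task and task.deleted_at is None:
        task.deleted_at = datetime.utcnow()  # column added by this commit
        db.commit()
```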
backend/app/services/cleanup_service.py — 246 lines (new file)
@@ -0,0 +1,246 @@
"""
Tool_OCR - Cleanup Service
Handles file cleanup while preserving database records for statistics
"""

import os
import shutil
import logging
from typing import Dict, List, Tuple
from datetime import datetime
from sqlalchemy.orm import Session
from sqlalchemy import and_, func

from app.models.task import Task, TaskFile, TaskStatus
from app.core.config import settings

logger = logging.getLogger(__name__)


class CleanupService:
    """Service for cleaning up files while preserving database records"""

    def cleanup_user_files(
        self,
        db: Session,
        user_id: int,
        max_files_to_keep: int = 50
    ) -> Dict:
        """
        Clean up old files for a user, keeping only the newest N tasks' files.
        Database records are preserved for statistics.

        Args:
            db: Database session
            user_id: User ID
            max_files_to_keep: Number of newest tasks to keep files for

        Returns:
            Dict with cleanup statistics
        """
        # Get all completed tasks with files (not yet deleted)
        tasks_with_files = (
            db.query(Task)
            .filter(
                and_(
                    Task.user_id == user_id,
                    Task.status == TaskStatus.COMPLETED,
                    Task.file_deleted == False,
                    Task.deleted_at.is_(None)  # Don't process already soft-deleted
                )
            )
            .order_by(Task.created_at.desc())
            .all()
        )

        # Keep newest N tasks, clean files from older ones
        tasks_to_clean = tasks_with_files[max_files_to_keep:]

        files_deleted = 0
        bytes_freed = 0
        tasks_cleaned = 0

        for task in tasks_to_clean:
            task_bytes, task_files = self._delete_task_files(task)
            if task_files > 0:
                task.file_deleted = True
                task.updated_at = datetime.utcnow()
                files_deleted += task_files
                bytes_freed += task_bytes
                tasks_cleaned += 1

        if tasks_cleaned > 0:
            db.commit()
            logger.info(
                f"Cleaned up {files_deleted} files ({bytes_freed} bytes) "
                f"from {tasks_cleaned} tasks for user {user_id}"
            )

        return {
            "user_id": user_id,
            "tasks_cleaned": tasks_cleaned,
            "files_deleted": files_deleted,
            "bytes_freed": bytes_freed,
            "tasks_with_files_remaining": min(len(tasks_with_files), max_files_to_keep)
        }

    def cleanup_all_users(
        self,
        db: Session,
        max_files_per_user: int = 50
    ) -> Dict:
        """
        Run cleanup for all users.

        Args:
            db: Database session
            max_files_per_user: Number of newest tasks to keep files for per user

        Returns:
            Dict with overall cleanup statistics
        """
        # Get all distinct user IDs with tasks
        user_ids = (
            db.query(Task.user_id)
            .filter(Task.file_deleted == False)
            .distinct()
            .all()
        )

        total_tasks_cleaned = 0
        total_files_deleted = 0
        total_bytes_freed = 0
        users_processed = 0

        for (user_id,) in user_ids:
            result = self.cleanup_user_files(db, user_id, max_files_per_user)
            total_tasks_cleaned += result["tasks_cleaned"]
            total_files_deleted += result["files_deleted"]
            total_bytes_freed += result["bytes_freed"]
            users_processed += 1

        logger.info(
            f"Cleanup completed: {users_processed} users, "
            f"{total_tasks_cleaned} tasks, {total_files_deleted} files, "
            f"{total_bytes_freed} bytes freed"
        )

        return {
            "users_processed": users_processed,
            "total_tasks_cleaned": total_tasks_cleaned,
            "total_files_deleted": total_files_deleted,
            "total_bytes_freed": total_bytes_freed,
            "timestamp": datetime.utcnow().isoformat()
        }

    def _delete_task_files(self, task: Task) -> Tuple[int, int]:
        """
        Delete actual files for a task from disk.

        Args:
            task: Task object

        Returns:
            Tuple of (bytes_deleted, files_deleted)
        """
        bytes_deleted = 0
        files_deleted = 0

        # Delete result directory
        result_dir = os.path.join(settings.result_dir, task.task_id)
        if os.path.exists(result_dir):
            try:
                dir_size = self._get_dir_size(result_dir)
                shutil.rmtree(result_dir)
                bytes_deleted += dir_size
                files_deleted += 1
                logger.debug(f"Deleted result directory: {result_dir}")
            except Exception as e:
                logger.error(f"Failed to delete result directory {result_dir}: {e}")

        # Delete uploaded files from task_files
        for task_file in task.files:
            if task_file.stored_path and os.path.exists(task_file.stored_path):
                try:
                    file_size = os.path.getsize(task_file.stored_path)
                    os.remove(task_file.stored_path)
                    bytes_deleted += file_size
                    files_deleted += 1
                    logger.debug(f"Deleted uploaded file: {task_file.stored_path}")
                except Exception as e:
                    logger.error(f"Failed to delete file {task_file.stored_path}: {e}")

        return bytes_deleted, files_deleted

    def _get_dir_size(self, path: str) -> int:
        """Get total size of a directory in bytes."""
        total = 0
        try:
            for entry in os.scandir(path):
                if entry.is_file():
                    total += entry.stat().st_size
                elif entry.is_dir():
                    total += self._get_dir_size(entry.path)
        except Exception:
            pass
        return total

    def get_storage_stats(self, db: Session) -> Dict:
        """
        Get storage statistics for admin dashboard.

        Args:
            db: Database session

        Returns:
            Dict with storage statistics
        """
        # Count tasks by file_deleted status
        total_tasks = db.query(Task).count()
        tasks_with_files = db.query(Task).filter(Task.file_deleted == False).count()
        tasks_files_deleted = db.query(Task).filter(Task.file_deleted == True).count()
        soft_deleted_tasks = db.query(Task).filter(Task.deleted_at.isnot(None)).count()

        # Get per-user statistics
        user_stats = (
            db.query(
                Task.user_id,
                func.count(Task.id).label("total_tasks"),
                func.sum(func.if_(Task.file_deleted == False, 1, 0)).label("tasks_with_files"),
                func.sum(func.if_(Task.deleted_at.isnot(None), 1, 0)).label("deleted_tasks")
            )
            .group_by(Task.user_id)
            .all()
        )

        # Calculate actual disk usage
        uploads_size = self._get_dir_size(settings.upload_dir)
        results_size = self._get_dir_size(settings.result_dir)

        return {
            "total_tasks": total_tasks,
            "tasks_with_files": tasks_with_files,
            "tasks_files_deleted": tasks_files_deleted,
            "soft_deleted_tasks": soft_deleted_tasks,
            "disk_usage": {
                "uploads_bytes": uploads_size,
                "results_bytes": results_size,
                "total_bytes": uploads_size + results_size,
                "uploads_mb": round(uploads_size / (1024 * 1024), 2),
                "results_mb": round(results_size / (1024 * 1024), 2),
                "total_mb": round((uploads_size + results_size) / (1024 * 1024), 2)
            },
            "per_user": [
                {
                    "user_id": stat.user_id,
                    "total_tasks": stat.total_tasks,
                    "tasks_with_files": int(stat.tasks_with_files or 0),
                    "deleted_tasks": int(stat.deleted_tasks or 0)
                }
                for stat in user_stats
            ]
        }


# Global service instance
cleanup_service = CleanupService()
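The automatic scheduler mentioned in the commit message is not part of this file. A rough sketch of how it might drive the service, where the module path, `SessionLocal` factory, and the `cleanup_interval_hours` / `cleanup_max_files_per_user` settings names are assumptions rather than code from this diff:

```python
# Sketch only: periodic cleanup loop. All names outside cleanup_service are assumed.
import asyncio
import logging

from app.core.config import settings
from app.core.database import SessionLocal  # assumed session factory
from app.services.cleanup_service import cleanup_service

logger = logging.getLogger(__name__)


async def run_cleanup_scheduler() -> None:
    """Run cleanup for all users on a fixed interval (default 24h)."""
    interval_s = getattr(settings, "cleanup_interval_hours", 24) * 3600
    while True:
        db = SessionLocal()
        try:
            stats = cleanup_service.cleanup_all_users(
                db,
                max_files_per_user=getattr(settings, "cleanup_max_files_per_user", 50),
            )
            logger.info("Scheduled cleanup finished: %s", stats)
        except Exception:
            logger.exception("Scheduled cleanup failed")
        finally:
            db.close()
        await asyncio.sleep(interval_s)
```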
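The admin endpoints (storage stats, cleanup trigger) are likewise outside this hunk. Assuming a FastAPI backend (suggested by the `app/` layout but not confirmed here), they could wrap the service roughly as below; `get_db`, `require_admin`, and the route paths are illustrative only:

```python
# Illustrative routes only; paths, dependencies, and response shapes are assumptions.
from fastapi import APIRouter, Depends
from sqlalchemy.orm import Session

from app.core.database import get_db       # assumed DB-session dependency
from app.api.deps import require_admin     # assumed admin guard
from app.services.cleanup_service import cleanup_service

router = APIRouter(prefix="/admin/storage", tags=["admin"],
                   dependencies=[Depends(require_admin)])


@router.get("/stats")
def storage_stats(db: Session = Depends(get_db)):
    """Return disk usage and per-user task counts for the admin dashboard."""
    return cleanup_service.get_storage_stats(db)


@router.post("/cleanup")
def trigger_cleanup(max_files_per_user: int = 50, db: Session = Depends(get_db)):
    """Manually trigger cleanup across all users."""
    return cleanup_service.cleanup_all_users(db, max_files_per_user=max_files_per_user)
```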