Files
OCR/backend/app/services/cleanup_service.py
egg 73112db055 feat: add storage cleanup mechanism with soft delete and auto scheduler
- Add soft delete (deleted_at column) to preserve task records for statistics
- Implement cleanup service to delete old files while keeping DB records
- Add automatic cleanup scheduler (configurable interval, default 24h)
- Add admin endpoints: storage stats, cleanup trigger, scheduler status
- Update task service with admin views (include deleted/files_deleted)
- Add frontend storage management UI in admin dashboard
- Add i18n translations for storage management

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-14 12:41:01 +08:00

247 lines
8.1 KiB
Python

"""
Tool_OCR - Cleanup Service
Handles file cleanup while preserving database records for statistics
"""
import logging
import os
import shutil
from datetime import datetime
from typing import Dict, List, Tuple

from sqlalchemy import and_, case, func
from sqlalchemy.orm import Session

from app.core.config import settings
from app.models.task import Task, TaskFile, TaskStatus
logger = logging.getLogger(__name__)
class CleanupService:
    """Service for cleaning up on-disk files while preserving database records.

    Task rows are kept for statistics after their files are removed:
    ``Task.file_deleted`` records that the payload is gone, and a non-NULL
    ``Task.deleted_at`` marks a soft-deleted task that cleanup skips.
    """

    def cleanup_user_files(
        self,
        db: Session,
        user_id: int,
        max_files_to_keep: int = 50
    ) -> Dict:
        """
        Clean up old files for a user, keeping only the newest N tasks' files.

        Database records are preserved for statistics; cleaned tasks are
        flagged with ``file_deleted = True`` and their ``updated_at`` bumped.

        Args:
            db: Database session
            user_id: User ID
            max_files_to_keep: Number of newest tasks to keep files for

        Returns:
            Dict with cleanup statistics (tasks_cleaned, files_deleted,
            bytes_freed, tasks_with_files_remaining)
        """
        # Completed tasks whose files are still on disk, newest first.
        # Soft-deleted tasks (deleted_at set) are intentionally skipped.
        tasks_with_files = (
            db.query(Task)
            .filter(
                and_(
                    Task.user_id == user_id,
                    Task.status == TaskStatus.COMPLETED,
                    Task.file_deleted == False,  # noqa: E712 — SQLAlchemy needs ==
                    Task.deleted_at.is_(None)
                )
            )
            .order_by(Task.created_at.desc())
            .all()
        )

        # Everything beyond the newest N loses its files.
        tasks_to_clean = tasks_with_files[max_files_to_keep:]

        files_deleted = 0
        bytes_freed = 0
        tasks_cleaned = 0
        for task in tasks_to_clean:
            task_bytes, task_files = self._delete_task_files(task)
            # Only flag the task if something was actually removed, so a
            # partial/failed deletion is retried on the next run.
            if task_files > 0:
                task.file_deleted = True
                # NOTE(review): naive UTC; assumed to match how the rest of
                # the app writes updated_at — confirm before switching to
                # timezone-aware datetimes.
                task.updated_at = datetime.utcnow()
                files_deleted += task_files
                bytes_freed += task_bytes
                tasks_cleaned += 1

        if tasks_cleaned > 0:
            db.commit()
            logger.info(
                "Cleaned up %d files (%d bytes) from %d tasks for user %d",
                files_deleted, bytes_freed, tasks_cleaned, user_id
            )

        return {
            "user_id": user_id,
            "tasks_cleaned": tasks_cleaned,
            "files_deleted": files_deleted,
            "bytes_freed": bytes_freed,
            "tasks_with_files_remaining": min(len(tasks_with_files), max_files_to_keep)
        }

    def cleanup_all_users(
        self,
        db: Session,
        max_files_per_user: int = 50
    ) -> Dict:
        """
        Run cleanup for all users.

        Args:
            db: Database session
            max_files_per_user: Number of newest tasks to keep files for per user

        Returns:
            Dict with overall cleanup statistics and an ISO timestamp
        """
        # Distinct user IDs that still have undeleted files; per-user
        # filtering (status, soft delete) happens in cleanup_user_files.
        user_ids = (
            db.query(Task.user_id)
            .filter(Task.file_deleted == False)  # noqa: E712
            .distinct()
            .all()
        )

        total_tasks_cleaned = 0
        total_files_deleted = 0
        total_bytes_freed = 0
        users_processed = 0
        for (user_id,) in user_ids:
            result = self.cleanup_user_files(db, user_id, max_files_per_user)
            total_tasks_cleaned += result["tasks_cleaned"]
            total_files_deleted += result["files_deleted"]
            total_bytes_freed += result["bytes_freed"]
            users_processed += 1

        logger.info(
            "Cleanup completed: %d users, %d tasks, %d files, %d bytes freed",
            users_processed, total_tasks_cleaned, total_files_deleted,
            total_bytes_freed
        )
        return {
            "users_processed": users_processed,
            "total_tasks_cleaned": total_tasks_cleaned,
            "total_files_deleted": total_files_deleted,
            "total_bytes_freed": total_bytes_freed,
            "timestamp": datetime.utcnow().isoformat()
        }

    def _delete_task_files(self, task: Task) -> Tuple[int, int]:
        """
        Delete actual files for a task from disk (best effort).

        Individual failures are logged and skipped so one bad path does not
        abort the rest of the task's cleanup.

        Args:
            task: Task object

        Returns:
            Tuple of (bytes_deleted, files_deleted); the result directory
            counts as a single entry in files_deleted.
        """
        bytes_deleted = 0
        files_deleted = 0

        # Remove the per-task result directory, measuring it first so the
        # freed size can be reported.
        result_dir = os.path.join(settings.result_dir, task.task_id)
        if os.path.exists(result_dir):
            try:
                dir_size = self._get_dir_size(result_dir)
                shutil.rmtree(result_dir)
                bytes_deleted += dir_size
                files_deleted += 1
                logger.debug("Deleted result directory: %s", result_dir)
            except Exception as e:
                logger.error(f"Failed to delete result directory {result_dir}: {e}")

        # Remove the originally uploaded files recorded in task_files.
        for task_file in task.files:
            if task_file.stored_path and os.path.exists(task_file.stored_path):
                try:
                    file_size = os.path.getsize(task_file.stored_path)
                    os.remove(task_file.stored_path)
                    bytes_deleted += file_size
                    files_deleted += 1
                    logger.debug("Deleted uploaded file: %s", task_file.stored_path)
                except Exception as e:
                    logger.error(f"Failed to delete file {task_file.stored_path}: {e}")

        return bytes_deleted, files_deleted

    def _get_dir_size(self, path: str) -> int:
        """Get total size of a directory in bytes (recursive, best effort)."""
        total = 0
        try:
            for entry in os.scandir(path):
                if entry.is_file():
                    total += entry.stat().st_size
                elif entry.is_dir():
                    total += self._get_dir_size(entry.path)
        except OSError as e:
            # Best effort: an unreadable/vanished directory yields a partial
            # total instead of aborting the scan.
            logger.debug("Could not measure directory %s: %s", path, e)
        return total

    def get_storage_stats(self, db: Session) -> Dict:
        """
        Get storage statistics for admin dashboard.

        Args:
            db: Database session

        Returns:
            Dict with task counts, disk usage, and per-user statistics
        """
        # Global task counts broken down by file/soft-delete state.
        total_tasks = db.query(Task).count()
        tasks_with_files = db.query(Task).filter(Task.file_deleted == False).count()  # noqa: E712
        tasks_files_deleted = db.query(Task).filter(Task.file_deleted == True).count()  # noqa: E712
        soft_deleted_tasks = db.query(Task).filter(Task.deleted_at.isnot(None)).count()

        # Per-user counts. Conditional counting uses CASE (portable across
        # dialects) rather than func.if_, which emits MySQL-only IF().
        user_stats = (
            db.query(
                Task.user_id,
                func.count(Task.id).label("total_tasks"),
                func.sum(
                    case((Task.file_deleted == False, 1), else_=0)  # noqa: E712
                ).label("tasks_with_files"),
                func.sum(
                    case((Task.deleted_at.isnot(None), 1), else_=0)
                ).label("deleted_tasks")
            )
            .group_by(Task.user_id)
            .all()
        )

        # Actual on-disk usage of the upload and result trees.
        uploads_size = self._get_dir_size(settings.upload_dir)
        results_size = self._get_dir_size(settings.result_dir)

        return {
            "total_tasks": total_tasks,
            "tasks_with_files": tasks_with_files,
            "tasks_files_deleted": tasks_files_deleted,
            "soft_deleted_tasks": soft_deleted_tasks,
            "disk_usage": {
                "uploads_bytes": uploads_size,
                "results_bytes": results_size,
                "total_bytes": uploads_size + results_size,
                "uploads_mb": round(uploads_size / (1024 * 1024), 2),
                "results_mb": round(results_size / (1024 * 1024), 2),
                "total_mb": round((uploads_size + results_size) / (1024 * 1024), 2)
            },
            "per_user": [
                {
                    "user_id": stat.user_id,
                    "total_tasks": stat.total_tasks,
                    # SUM over an empty/NULL group can be None — coerce to 0.
                    "tasks_with_files": int(stat.tasks_with_files or 0),
                    "deleted_tasks": int(stat.deleted_tasks or 0)
                }
                for stat in user_stats
            ]
        }
# Module-level singleton shared by the scheduler and the admin endpoints;
# the service is stateless, so a single instance is safe to share.
cleanup_service = CleanupService()