feat: add storage cleanup mechanism with soft delete and auto scheduler

- Add soft delete (deleted_at column) to preserve task records for statistics
- Implement cleanup service to delete old files while keeping DB records
- Add automatic cleanup scheduler (configurable interval, default 24h)
- Add admin endpoints: storage stats, cleanup trigger, scheduler status
- Update task service with admin views (include deleted/files_deleted)
- Add frontend storage management UI in admin dashboard
- Add i18n translations for storage management

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
Author: egg
Date: 2025-12-14 12:41:01 +08:00
Parent: 81a0a3ab0f
Commit: 73112db055
23 changed files with 1359 additions and 634 deletions
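The new modules below read their knobs from app.core.config.settings; the settings definitions themselves are not in this excerpt. A minimal sketch of the fields the code assumes, in pydantic-settings style (field names are taken from the usages below; the directory paths and most defaults are guesses, only the 24h interval comes from the commit message):

# Hypothetical sketch of the config fields used by the cleanup code.
# Only the attribute names are confirmed by this diff; paths and most
# defaults are assumptions.
from pydantic_settings import BaseSettings

class Settings(BaseSettings):
    cleanup_enabled: bool = True        # master switch for the scheduler
    cleanup_interval_hours: int = 24    # "default 24h" per the commit message
    max_files_per_user: int = 50        # newest N tasks whose files are kept
    upload_dir: str = "data/uploads"    # assumed upload location
    result_dir: str = "data/results"    # assumed results location

settings = Settings()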


@@ -0,0 +1,173 @@
"""
Tool_OCR - Cleanup Scheduler
Background scheduler for periodic file cleanup
"""
import asyncio
import logging
from datetime import datetime, timedelta
from typing import Optional
from sqlalchemy.orm import Session
from app.core.config import settings
from app.core.database import SessionLocal
from app.services.cleanup_service import cleanup_service
logger = logging.getLogger(__name__)
class CleanupScheduler:
"""
Background scheduler for periodic file cleanup.
Uses asyncio for non-blocking background execution.
"""
def __init__(self):
self._task: Optional[asyncio.Task] = None
self._running: bool = False
self._last_run: Optional[datetime] = None
self._next_run: Optional[datetime] = None
self._last_result: Optional[dict] = None
@property
def is_running(self) -> bool:
"""Check if scheduler is running"""
return self._running and self._task is not None and not self._task.done()
@property
def status(self) -> dict:
"""Get scheduler status"""
return {
"enabled": settings.cleanup_enabled,
"running": self.is_running,
"interval_hours": settings.cleanup_interval_hours,
"max_files_per_user": settings.max_files_per_user,
"last_run": self._last_run.isoformat() if self._last_run else None,
"next_run": self._next_run.isoformat() if self._next_run else None,
"last_result": self._last_result
}
async def start(self):
"""Start the cleanup scheduler"""
if not settings.cleanup_enabled:
logger.info("Cleanup scheduler is disabled in configuration")
return
if self.is_running:
logger.warning("Cleanup scheduler is already running")
return
self._running = True
self._task = asyncio.create_task(self._run_loop())
logger.info(
f"Cleanup scheduler started (interval: {settings.cleanup_interval_hours}h, "
f"max_files_per_user: {settings.max_files_per_user})"
)
async def stop(self):
"""Stop the cleanup scheduler"""
self._running = False
if self._task is not None:
self._task.cancel()
try:
await self._task
except asyncio.CancelledError:
pass
self._task = None
logger.info("Cleanup scheduler stopped")
async def _run_loop(self):
"""Main scheduler loop"""
interval_seconds = settings.cleanup_interval_hours * 3600
while self._running:
try:
                # Run cleanup
                await self._execute_cleanup()

                # Schedule the next run one full interval from now
                self._next_run = datetime.utcnow() + timedelta(
                    hours=settings.cleanup_interval_hours
                )
# Wait for next interval
logger.debug(f"Cleanup scheduler sleeping for {interval_seconds} seconds")
await asyncio.sleep(interval_seconds)
except asyncio.CancelledError:
logger.info("Cleanup scheduler loop cancelled")
break
except Exception as e:
logger.exception(f"Error in cleanup scheduler loop: {e}")
# Wait a bit before retrying to avoid tight error loops
await asyncio.sleep(60)
async def _execute_cleanup(self):
"""Execute the cleanup task"""
logger.info("Starting scheduled cleanup...")
self._last_run = datetime.utcnow()
# Run cleanup in thread pool to avoid blocking
        loop = asyncio.get_running_loop()
result = await loop.run_in_executor(None, self._run_cleanup_sync)
self._last_result = result
logger.info(
f"Scheduled cleanup completed: {result.get('total_files_deleted', 0)} files deleted, "
f"{result.get('total_bytes_freed', 0)} bytes freed"
)
def _run_cleanup_sync(self) -> dict:
"""Synchronous cleanup execution (runs in thread pool)"""
db: Session = SessionLocal()
try:
result = cleanup_service.cleanup_all_users(
db=db,
max_files_per_user=settings.max_files_per_user
)
return result
except Exception as e:
logger.exception(f"Cleanup execution failed: {e}")
return {
"error": str(e),
"timestamp": datetime.utcnow().isoformat()
}
finally:
db.close()
async def run_now(self) -> dict:
"""Trigger immediate cleanup (outside of scheduled interval)"""
logger.info("Manual cleanup triggered")
await self._execute_cleanup()
return self._last_result or {}
# Global scheduler instance
_scheduler: Optional[CleanupScheduler] = None
def get_cleanup_scheduler() -> CleanupScheduler:
"""Get the global cleanup scheduler instance"""
global _scheduler
if _scheduler is None:
_scheduler = CleanupScheduler()
return _scheduler
async def start_cleanup_scheduler():
"""Start the global cleanup scheduler"""
scheduler = get_cleanup_scheduler()
await scheduler.start()
async def stop_cleanup_scheduler():
"""Stop the global cleanup scheduler"""
scheduler = get_cleanup_scheduler()
await scheduler.stop()
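Not shown in this commit excerpt is where the scheduler gets started. Assuming the backend is FastAPI (consistent with the admin endpoints and the SessionLocal pattern), one plausible wiring is a lifespan handler; the module path app.services.cleanup_scheduler is inferred from the sibling cleanup_service import and is an assumption:

# Hypothetical wiring sketch, not part of this diff: tie scheduler start/stop
# to the application lifecycle via FastAPI's lifespan context manager.
from contextlib import asynccontextmanager

from fastapi import FastAPI

from app.services.cleanup_scheduler import (
    start_cleanup_scheduler,
    stop_cleanup_scheduler,
)

@asynccontextmanager
async def lifespan(app: FastAPI):
    await start_cleanup_scheduler()  # no-op when cleanup_enabled is False
    yield
    await stop_cleanup_scheduler()

app = FastAPI(lifespan=lifespan)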


@@ -0,0 +1,246 @@
"""
Tool_OCR - Cleanup Service
Handles file cleanup while preserving database records for statistics
"""
import os
import shutil
import logging
from typing import Dict, List, Tuple
from datetime import datetime
from sqlalchemy.orm import Session
from sqlalchemy import and_, func
from app.models.task import Task, TaskFile, TaskStatus
from app.core.config import settings
logger = logging.getLogger(__name__)
class CleanupService:
"""Service for cleaning up files while preserving database records"""
def cleanup_user_files(
self,
db: Session,
user_id: int,
max_files_to_keep: int = 50
) -> Dict:
"""
Clean up old files for a user, keeping only the newest N tasks' files.
Database records are preserved for statistics.
Args:
db: Database session
user_id: User ID
max_files_to_keep: Number of newest tasks to keep files for
Returns:
Dict with cleanup statistics
"""
# Get all completed tasks with files (not yet deleted)
tasks_with_files = (
db.query(Task)
.filter(
and_(
Task.user_id == user_id,
Task.status == TaskStatus.COMPLETED,
Task.file_deleted == False,
Task.deleted_at.is_(None) # Don't process already soft-deleted
)
)
.order_by(Task.created_at.desc())
.all()
)
# Keep newest N tasks, clean files from older ones
tasks_to_clean = tasks_with_files[max_files_to_keep:]
files_deleted = 0
bytes_freed = 0
tasks_cleaned = 0
for task in tasks_to_clean:
task_bytes, task_files = self._delete_task_files(task)
if task_files > 0:
task.file_deleted = True
task.updated_at = datetime.utcnow()
files_deleted += task_files
bytes_freed += task_bytes
tasks_cleaned += 1
if tasks_cleaned > 0:
db.commit()
logger.info(
f"Cleaned up {files_deleted} files ({bytes_freed} bytes) "
f"from {tasks_cleaned} tasks for user {user_id}"
)
return {
"user_id": user_id,
"tasks_cleaned": tasks_cleaned,
"files_deleted": files_deleted,
"bytes_freed": bytes_freed,
"tasks_with_files_remaining": min(len(tasks_with_files), max_files_to_keep)
}
def cleanup_all_users(
self,
db: Session,
max_files_per_user: int = 50
) -> Dict:
"""
Run cleanup for all users.
Args:
db: Database session
max_files_per_user: Number of newest tasks to keep files for per user
Returns:
Dict with overall cleanup statistics
"""
# Get all distinct user IDs with tasks
user_ids = (
db.query(Task.user_id)
.filter(Task.file_deleted == False)
.distinct()
.all()
)
total_tasks_cleaned = 0
total_files_deleted = 0
total_bytes_freed = 0
users_processed = 0
for (user_id,) in user_ids:
result = self.cleanup_user_files(db, user_id, max_files_per_user)
total_tasks_cleaned += result["tasks_cleaned"]
total_files_deleted += result["files_deleted"]
total_bytes_freed += result["bytes_freed"]
users_processed += 1
logger.info(
f"Cleanup completed: {users_processed} users, "
f"{total_tasks_cleaned} tasks, {total_files_deleted} files, "
f"{total_bytes_freed} bytes freed"
)
return {
"users_processed": users_processed,
"total_tasks_cleaned": total_tasks_cleaned,
"total_files_deleted": total_files_deleted,
"total_bytes_freed": total_bytes_freed,
"timestamp": datetime.utcnow().isoformat()
}
def _delete_task_files(self, task: Task) -> Tuple[int, int]:
"""
Delete actual files for a task from disk.
Args:
task: Task object
Returns:
Tuple of (bytes_deleted, files_deleted)
"""
bytes_deleted = 0
files_deleted = 0
# Delete result directory
result_dir = os.path.join(settings.result_dir, task.task_id)
if os.path.exists(result_dir):
try:
dir_size = self._get_dir_size(result_dir)
shutil.rmtree(result_dir)
bytes_deleted += dir_size
files_deleted += 1
logger.debug(f"Deleted result directory: {result_dir}")
except Exception as e:
logger.error(f"Failed to delete result directory {result_dir}: {e}")
# Delete uploaded files from task_files
for task_file in task.files:
if task_file.stored_path and os.path.exists(task_file.stored_path):
try:
file_size = os.path.getsize(task_file.stored_path)
os.remove(task_file.stored_path)
bytes_deleted += file_size
files_deleted += 1
logger.debug(f"Deleted uploaded file: {task_file.stored_path}")
except Exception as e:
logger.error(f"Failed to delete file {task_file.stored_path}: {e}")
return bytes_deleted, files_deleted
def _get_dir_size(self, path: str) -> int:
"""Get total size of a directory in bytes."""
total = 0
try:
for entry in os.scandir(path):
if entry.is_file():
total += entry.stat().st_size
elif entry.is_dir():
total += self._get_dir_size(entry.path)
        except OSError as e:
            logger.debug(f"Skipping unreadable path under {path}: {e}")
return total
def get_storage_stats(self, db: Session) -> Dict:
"""
Get storage statistics for admin dashboard.
Args:
db: Database session
Returns:
Dict with storage statistics
"""
# Count tasks by file_deleted status
total_tasks = db.query(Task).count()
tasks_with_files = db.query(Task).filter(Task.file_deleted == False).count()
tasks_files_deleted = db.query(Task).filter(Task.file_deleted == True).count()
soft_deleted_tasks = db.query(Task).filter(Task.deleted_at.isnot(None)).count()
# Get per-user statistics
user_stats = (
db.query(
Task.user_id,
func.count(Task.id).label("total_tasks"),
func.sum(func.if_(Task.file_deleted == False, 1, 0)).label("tasks_with_files"),
func.sum(func.if_(Task.deleted_at.isnot(None), 1, 0)).label("deleted_tasks")
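                # NOTE: func.if_ renders as SQL IF(...) (SQLAlchemy's func
                # strips the trailing underscore), which is MySQL-specific.
                # A portable alternative: func.sum(case((condition, 1), else_=0)).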
)
.group_by(Task.user_id)
.all()
)
# Calculate actual disk usage
uploads_size = self._get_dir_size(settings.upload_dir)
results_size = self._get_dir_size(settings.result_dir)
return {
"total_tasks": total_tasks,
"tasks_with_files": tasks_with_files,
"tasks_files_deleted": tasks_files_deleted,
"soft_deleted_tasks": soft_deleted_tasks,
"disk_usage": {
"uploads_bytes": uploads_size,
"results_bytes": results_size,
"total_bytes": uploads_size + results_size,
"uploads_mb": round(uploads_size / (1024 * 1024), 2),
"results_mb": round(results_size / (1024 * 1024), 2),
"total_mb": round((uploads_size + results_size) / (1024 * 1024), 2)
},
"per_user": [
{
"user_id": stat.user_id,
"total_tasks": stat.total_tasks,
"tasks_with_files": int(stat.tasks_with_files or 0),
"deleted_tasks": int(stat.deleted_tasks or 0)
}
for stat in user_stats
]
}
# Global service instance
cleanup_service = CleanupService()
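The admin endpoints named in the commit message (storage stats, cleanup trigger, scheduler status) are not included in this excerpt. A sketch of how they might delegate to the service and scheduler above; the route prefix, the get_db dependency, and the absence of an auth guard are all assumptions:

# Hypothetical admin router sketch; in the real app these routes would be
# protected by an admin-only dependency, omitted here for brevity.
from fastapi import APIRouter, Depends
from sqlalchemy.orm import Session

from app.core.database import SessionLocal
from app.services.cleanup_scheduler import get_cleanup_scheduler
from app.services.cleanup_service import cleanup_service

router = APIRouter(prefix="/admin/storage", tags=["admin"])

def get_db():
    db = SessionLocal()
    try:
        yield db
    finally:
        db.close()

@router.get("/stats")
def storage_stats(db: Session = Depends(get_db)):
    return cleanup_service.get_storage_stats(db)

@router.post("/cleanup")
async def trigger_cleanup():
    # Runs outside the schedule and returns the cleanup summary dict
    return await get_cleanup_scheduler().run_now()

@router.get("/scheduler")
def scheduler_status():
    return get_cleanup_scheduler().status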


@@ -65,7 +65,7 @@ class TaskService:
return task
def get_task_by_id(
-        self, db: Session, task_id: str, user_id: int
+        self, db: Session, task_id: str, user_id: int, include_deleted: bool = False
) -> Optional[Task]:
"""
Get task by ID with user isolation
@@ -74,16 +74,20 @@ class TaskService:
db: Database session
task_id: Task ID (UUID)
user_id: User ID (for isolation)
include_deleted: If True, include soft-deleted tasks
Returns:
Task object or None if not found/unauthorized
"""
-        task = (
-            db.query(Task)
-            .filter(and_(Task.task_id == task_id, Task.user_id == user_id))
-            .first()
-        )
-        return task
+        query = db.query(Task).filter(
+            and_(Task.task_id == task_id, Task.user_id == user_id)
+        )
+
+        # Filter out soft-deleted tasks by default
+        if not include_deleted:
+            query = query.filter(Task.deleted_at.is_(None))
+
+        return query.first()
def get_user_tasks(
self,
@@ -97,6 +101,7 @@ class TaskService:
limit: int = 50,
order_by: str = "created_at",
order_desc: bool = True,
include_deleted: bool = False,
) -> Tuple[List[Task], int]:
"""
Get user's tasks with pagination and filtering
@@ -112,6 +117,7 @@ class TaskService:
limit: Pagination limit
order_by: Sort field (created_at, updated_at, completed_at)
order_desc: Sort descending
include_deleted: If True, include soft-deleted tasks
Returns:
Tuple of (tasks list, total count)
@@ -119,6 +125,10 @@ class TaskService:
# Base query with user isolation
query = db.query(Task).filter(Task.user_id == user_id)
# Filter out soft-deleted tasks by default
if not include_deleted:
query = query.filter(Task.deleted_at.is_(None))
# Apply status filter
if status:
query = query.filter(Task.status == status)
@@ -244,7 +254,9 @@ class TaskService:
self, db: Session, task_id: str, user_id: int
) -> bool:
"""
-        Delete task with user isolation
+        Soft delete task with user isolation.
+        Sets deleted_at timestamp instead of removing record.
+        Database records are preserved for statistics tracking.
Args:
db: Database session
@@ -252,17 +264,18 @@ class TaskService:
user_id: User ID (for isolation)
Returns:
-            True if deleted, False if not found/unauthorized
+            True if soft deleted, False if not found/unauthorized
"""
task = self.get_task_by_id(db, task_id, user_id)
if not task:
return False
-        # Cascade delete will handle task_files
-        db.delete(task)
+        # Soft delete: set deleted_at timestamp
+        task.deleted_at = datetime.utcnow()
+        task.updated_at = datetime.utcnow()
        db.commit()

-        logger.info(f"Deleted task {task_id} for user {user_id}")
+        logger.info(f"Soft deleted task {task_id} for user {user_id}")
return True
def _cleanup_old_tasks(
@@ -389,6 +402,82 @@ class TaskService:
"failed": failed,
}
def get_all_tasks_admin(
self,
db: Session,
user_id: Optional[int] = None,
status: Optional[TaskStatus] = None,
include_deleted: bool = True,
include_files_deleted: bool = True,
skip: int = 0,
limit: int = 50,
order_by: str = "created_at",
order_desc: bool = True,
) -> Tuple[List[Task], int]:
"""
Get all tasks for admin view (no user isolation).
Includes soft-deleted tasks by default.
Args:
db: Database session
user_id: Filter by user ID (optional)
status: Filter by status (optional)
include_deleted: Include soft-deleted tasks (default True)
include_files_deleted: Include tasks with deleted files (default True)
skip: Pagination offset
limit: Pagination limit
order_by: Sort field
order_desc: Sort descending
Returns:
Tuple of (tasks list, total count)
"""
query = db.query(Task)
# Optional user filter
if user_id is not None:
query = query.filter(Task.user_id == user_id)
# Filter soft-deleted if requested
if not include_deleted:
query = query.filter(Task.deleted_at.is_(None))
# Filter file-deleted if requested
if not include_files_deleted:
query = query.filter(Task.file_deleted == False)
# Apply status filter
if status:
query = query.filter(Task.status == status)
# Get total count
total = query.count()
# Apply sorting
sort_column = getattr(Task, order_by, Task.created_at)
if order_desc:
query = query.order_by(desc(sort_column))
else:
query = query.order_by(sort_column)
# Apply pagination
tasks = query.offset(skip).limit(limit).all()
return tasks, total
def get_task_by_id_admin(self, db: Session, task_id: str) -> Optional[Task]:
"""
Get task by ID for admin (no user isolation, includes deleted).
Args:
db: Database session
task_id: Task ID (UUID)
Returns:
Task object or None if not found
"""
return db.query(Task).filter(Task.task_id == task_id).first()
# Global service instance
task_service = TaskService()
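For reference, the soft-delete logic above leans on two Task columns added elsewhere in this commit (the model and migration hunks are among the 23 changed files but not shown here). A sketch of the relevant part of app.models.task, with surrounding columns reduced to a plausible minimum:

# Hypothetical excerpt of the Task model; only the two cleanup-related
# columns are confirmed by the code above, the rest is illustrative.
from sqlalchemy import Boolean, Column, DateTime, Integer, String
from sqlalchemy.orm import declarative_base

Base = declarative_base()

class Task(Base):
    __tablename__ = "tasks"

    id = Column(Integer, primary_key=True)
    task_id = Column(String(36), unique=True, index=True)  # UUID exposed to clients
    user_id = Column(Integer, index=True)

    file_deleted = Column(Boolean, default=False, nullable=False)  # files purged from disk, row kept for stats
    deleted_at = Column(DateTime, nullable=True)                   # soft-delete marker; NULL means visible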