Files
OCR/backend/app/services/task_service.py
egg 73112db055 feat: add storage cleanup mechanism with soft delete and auto scheduler
- Add soft delete (deleted_at column) to preserve task records for statistics
- Implement cleanup service to delete old files while keeping DB records
- Add automatic cleanup scheduler (configurable interval, default 24h)
- Add admin endpoints: storage stats, cleanup trigger, scheduler status
- Update task service with admin views (include deleted/files_deleted)
- Add frontend storage management UI in admin dashboard
- Add i18n translations for storage management

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-14 12:41:01 +08:00

484 lines
14 KiB
Python

"""
Tool_OCR - Task Management Service
Handles OCR task CRUD operations with user isolation
"""
from typing import List, Optional, Tuple
from sqlalchemy.orm import Session
from sqlalchemy import and_, or_, desc
from datetime import datetime, timedelta
import uuid
import logging
from app.models.task import Task, TaskFile, TaskStatus
from app.core.config import settings
logger = logging.getLogger(__name__)
class TaskService:
    """Service for OCR task management with per-user isolation.

    Every user-facing method takes a ``user_id`` and only touches rows owned
    by that user. The ``*_admin`` methods skip that restriction. Timestamps
    are written with ``datetime.utcnow()`` (naive UTC values).
    """

    # Escape character for LIKE patterns built from user input.
    _LIKE_ESCAPE = "/"

    @staticmethod
    def _escape_like(value: str, escape: str = "/") -> str:
        """
        Escape LIKE wildcards so *value* is matched literally

        Args:
            value: Raw search text
            escape: Escape character that will be passed to LIKE ... ESCAPE

        Returns:
            value with the escape character, '%' and '_' each prefixed by
            the escape character
        """
        # The escape character itself must be doubled first, otherwise the
        # prefixes added for '%' and '_' would be escaped a second time.
        return (
            value.replace(escape, escape + escape)
            .replace("%", escape + "%")
            .replace("_", escape + "_")
        )

    @staticmethod
    def _resolve_sort_column(order_by: str):
        """
        Map a caller-supplied sort field name to a Task column

        Only names that are keys of the mapped table's columns are accepted.
        Anything else (unknown names, methods, relationships, dunder
        attributes) falls back to Task.created_at instead of being handed to
        ORDER BY.

        Args:
            order_by: Requested sort field name

        Returns:
            Task attribute to sort by
        """
        # ``.keys()`` yields plain strings, so the membership test is safe
        # for any input value.
        if isinstance(order_by, str) and order_by in Task.__table__.columns.keys():
            return getattr(Task, order_by, Task.created_at)
        return Task.created_at

    def create_task(
        self,
        db: Session,
        user_id: int,
        filename: Optional[str] = None,
        file_type: Optional[str] = None,
    ) -> Task:
        """
        Create a new task for a user

        Args:
            db: Database session
            user_id: User ID (for isolation)
            filename: Original filename
            file_type: File MIME type

        Returns:
            Created Task object
        """
        # Generate unique task ID
        task_id = str(uuid.uuid4())

        # Check user's task limit (a non-positive setting disables the check)
        if settings.max_tasks_per_user > 0:
            # NOTE(review): this count has no deleted_at filter, so
            # soft-deleted rows count toward the limit - confirm intended.
            user_task_count = db.query(Task).filter(Task.user_id == user_id).count()
            if user_task_count >= settings.max_tasks_per_user:
                # Auto-delete oldest completed tasks to make room. The new
                # task is created regardless of how many rows were removed.
                self._cleanup_old_tasks(db, user_id, limit=10)

        # Create task
        task = Task(
            user_id=user_id,
            task_id=task_id,
            filename=filename,
            file_type=file_type,
            status=TaskStatus.PENDING,
        )
        db.add(task)
        db.commit()
        db.refresh(task)

        logger.info(f"Created task {task_id} for user {user_id}")
        return task

    def get_task_by_id(
        self, db: Session, task_id: str, user_id: int, include_deleted: bool = False
    ) -> Optional[Task]:
        """
        Get task by ID with user isolation

        Args:
            db: Database session
            task_id: Task ID (UUID)
            user_id: User ID (for isolation)
            include_deleted: If True, include soft-deleted tasks

        Returns:
            Task object or None if not found/unauthorized
        """
        query = db.query(Task).filter(
            and_(Task.task_id == task_id, Task.user_id == user_id)
        )

        # Filter out soft-deleted tasks by default
        if not include_deleted:
            query = query.filter(Task.deleted_at.is_(None))

        return query.first()

    def get_user_tasks(
        self,
        db: Session,
        user_id: int,
        status: Optional[TaskStatus] = None,
        filename_search: Optional[str] = None,
        date_from: Optional[datetime] = None,
        date_to: Optional[datetime] = None,
        skip: int = 0,
        limit: int = 50,
        order_by: str = "created_at",
        order_desc: bool = True,
        include_deleted: bool = False,
    ) -> Tuple[List[Task], int]:
        """
        Get user's tasks with pagination and filtering

        Args:
            db: Database session
            user_id: User ID (for isolation)
            status: Filter by status (optional)
            filename_search: Search by filename (case-insensitive literal
                substring match, optional)
            date_from: Filter tasks created from this date (optional)
            date_to: Filter tasks created until this date (optional)
            skip: Pagination offset
            limit: Pagination limit
            order_by: Sort field (created_at, updated_at, completed_at, or
                another Task column); unknown names fall back to created_at
            order_desc: Sort descending
            include_deleted: If True, include soft-deleted tasks

        Returns:
            Tuple of (tasks list, total count); the count is taken before
            pagination is applied
        """
        # Base query with user isolation
        query = db.query(Task).filter(Task.user_id == user_id)

        # Filter out soft-deleted tasks by default
        if not include_deleted:
            query = query.filter(Task.deleted_at.is_(None))

        # Apply status filter
        if status:
            query = query.filter(Task.status == status)

        # Apply filename search (case-insensitive partial match). The input
        # is escaped so '%' and '_' typed by the user match literally.
        if filename_search:
            escaped = self._escape_like(filename_search, self._LIKE_ESCAPE)
            query = query.filter(
                Task.filename.ilike(f"%{escaped}%", escape=self._LIKE_ESCAPE)
            )

        # Apply date range filter
        if date_from:
            query = query.filter(Task.created_at >= date_from)
        if date_to:
            # Add one day to include the entire end date
            date_to_end = date_to + timedelta(days=1)
            query = query.filter(Task.created_at < date_to_end)

        # Get total count
        total = query.count()

        # Apply sorting (the field name is validated against Task's columns)
        sort_column = self._resolve_sort_column(order_by)
        if order_desc:
            query = query.order_by(desc(sort_column))
        else:
            query = query.order_by(sort_column)

        # Apply pagination
        tasks = query.offset(skip).limit(limit).all()
        return tasks, total

    def update_task_status(
        self,
        db: Session,
        task_id: str,
        user_id: int,
        status: TaskStatus,
        error_message: Optional[str] = None,
        processing_time_ms: Optional[int] = None,
    ) -> Optional[Task]:
        """
        Update task status with user isolation

        Args:
            db: Database session
            task_id: Task ID (UUID)
            user_id: User ID (for isolation)
            status: New status
            error_message: Error message if failed
            processing_time_ms: Processing time in milliseconds

        Returns:
            Updated Task object or None if not found/unauthorized
        """
        task = self.get_task_by_id(db, task_id, user_id)
        if not task:
            logger.warning(
                f"Task {task_id} not found for user {user_id} during status update"
            )
            return None

        # One timestamp for the whole update so updated_at and completed_at
        # are identical when both are written.
        now = datetime.utcnow()
        task.status = status
        task.updated_at = now
        if status == TaskStatus.COMPLETED:
            task.completed_at = now
        if error_message:
            task.error_message = error_message
        # Explicit None check: 0 ms is a valid processing time.
        if processing_time_ms is not None:
            task.processing_time_ms = processing_time_ms

        db.commit()
        db.refresh(task)

        logger.info(f"Updated task {task_id} status to {status.value}")
        return task

    def update_task_results(
        self,
        db: Session,
        task_id: str,
        user_id: int,
        result_json_path: Optional[str] = None,
        result_markdown_path: Optional[str] = None,
        result_pdf_path: Optional[str] = None,
    ) -> Optional[Task]:
        """
        Update task result file paths

        Only paths that are passed with a non-empty value are written;
        the others are left as they are.

        Args:
            db: Database session
            task_id: Task ID (UUID)
            user_id: User ID (for isolation)
            result_json_path: Path to JSON result
            result_markdown_path: Path to Markdown result
            result_pdf_path: Path to searchable PDF

        Returns:
            Updated Task object or None if not found/unauthorized
        """
        task = self.get_task_by_id(db, task_id, user_id)
        if not task:
            return None

        if result_json_path:
            task.result_json_path = result_json_path
        if result_markdown_path:
            task.result_markdown_path = result_markdown_path
        if result_pdf_path:
            task.result_pdf_path = result_pdf_path
        task.updated_at = datetime.utcnow()

        db.commit()
        db.refresh(task)

        logger.info(f"Updated task {task_id} result paths")
        return task

    def delete_task(
        self, db: Session, task_id: str, user_id: int
    ) -> bool:
        """
        Soft delete task with user isolation.
        Sets deleted_at timestamp instead of removing record.
        Database records are preserved for statistics tracking.

        Args:
            db: Database session
            task_id: Task ID (UUID)
            user_id: User ID (for isolation)

        Returns:
            True if soft deleted, False if not found/unauthorized
            (an already soft-deleted task is reported as not found)
        """
        task = self.get_task_by_id(db, task_id, user_id)
        if not task:
            return False

        # Soft delete: set deleted_at timestamp. A single timestamp keeps
        # deleted_at and updated_at identical.
        now = datetime.utcnow()
        task.deleted_at = now
        task.updated_at = now
        db.commit()

        logger.info(f"Soft deleted task {task_id} for user {user_id}")
        return True

    def _cleanup_old_tasks(
        self, db: Session, user_id: int, limit: int = 10
    ) -> int:
        """
        Clean up old completed tasks for a user

        NOTE(review): rows are removed with db.delete(), i.e. permanently,
        unlike delete_task which only sets deleted_at - confirm intended.

        Args:
            db: Database session
            user_id: User ID
            limit: Number of tasks to delete

        Returns:
            Number of tasks deleted
        """
        # Find oldest completed tasks
        old_tasks = (
            db.query(Task)
            .filter(
                and_(
                    Task.user_id == user_id,
                    Task.status == TaskStatus.COMPLETED,
                )
            )
            .order_by(Task.completed_at)
            .limit(limit)
            .all()
        )

        for task in old_tasks:
            db.delete(task)

        count = len(old_tasks)
        # Commit and log only when something was actually removed.
        if count > 0:
            db.commit()
            logger.info(f"Cleaned up {count} old tasks for user {user_id}")
        return count

    def auto_cleanup_expired_tasks(self, db: Session) -> int:
        """
        Auto-cleanup tasks older than TASK_RETENTION_DAYS

        Affects completed tasks of all users whose completed_at is older
        than the retention window. A non-positive retention setting disables
        the cleanup.

        Args:
            db: Database session

        Returns:
            Number of tasks deleted
        """
        if settings.task_retention_days <= 0:
            return 0

        cutoff_date = datetime.utcnow() - timedelta(days=settings.task_retention_days)

        # Find expired tasks
        expired_tasks = (
            db.query(Task)
            .filter(
                and_(
                    Task.status == TaskStatus.COMPLETED,
                    Task.completed_at < cutoff_date,
                )
            )
            .all()
        )

        for task in expired_tasks:
            # NOTE(review): the flag is set on a row that is deleted in the
            # same transaction - confirm whether the row was meant to be kept.
            task.file_deleted = True
            # TODO: Delete actual files from disk
            db.delete(task)

        count = len(expired_tasks)
        if count > 0:
            db.commit()
            logger.info(f"Auto-cleaned up {count} expired tasks")
        return count

    def get_user_stats(self, db: Session, user_id: int) -> dict:
        """
        Get statistics for a user's tasks

        No deleted_at filter is applied, so soft-deleted tasks are counted.

        Args:
            db: Database session
            user_id: User ID

        Returns:
            Dictionary with task statistics (keys: total, pending,
            processing, completed, failed)
        """
        user_tasks = db.query(Task).filter(Task.user_id == user_id)

        def count_with(status: TaskStatus) -> int:
            # Each call derives a new query; user_tasks itself is unchanged.
            return user_tasks.filter(Task.status == status).count()

        return {
            "total": user_tasks.count(),
            "pending": count_with(TaskStatus.PENDING),
            "processing": count_with(TaskStatus.PROCESSING),
            "completed": count_with(TaskStatus.COMPLETED),
            "failed": count_with(TaskStatus.FAILED),
        }

    def get_all_tasks_admin(
        self,
        db: Session,
        user_id: Optional[int] = None,
        status: Optional[TaskStatus] = None,
        include_deleted: bool = True,
        include_files_deleted: bool = True,
        skip: int = 0,
        limit: int = 50,
        order_by: str = "created_at",
        order_desc: bool = True,
    ) -> Tuple[List[Task], int]:
        """
        Get all tasks for admin view (no user isolation).
        Includes soft-deleted tasks by default.

        Args:
            db: Database session
            user_id: Filter by user ID (optional)
            status: Filter by status (optional)
            include_deleted: Include soft-deleted tasks (default True)
            include_files_deleted: Include tasks with deleted files (default True)
            skip: Pagination offset
            limit: Pagination limit
            order_by: Sort field (a Task column name); unknown names fall
                back to created_at
            order_desc: Sort descending

        Returns:
            Tuple of (tasks list, total count); the count is taken before
            pagination is applied
        """
        query = db.query(Task)

        # Optional user filter
        if user_id is not None:
            query = query.filter(Task.user_id == user_id)

        # Filter soft-deleted if requested
        if not include_deleted:
            query = query.filter(Task.deleted_at.is_(None))

        # Filter file-deleted if requested. '== False' builds a SQL
        # comparison here; it is not a Python truth test.
        if not include_files_deleted:
            query = query.filter(Task.file_deleted == False)  # noqa: E712

        # Apply status filter
        if status:
            query = query.filter(Task.status == status)

        # Get total count
        total = query.count()

        # Apply sorting (the field name is validated against Task's columns)
        sort_column = self._resolve_sort_column(order_by)
        if order_desc:
            query = query.order_by(desc(sort_column))
        else:
            query = query.order_by(sort_column)

        # Apply pagination
        tasks = query.offset(skip).limit(limit).all()
        return tasks, total

    def get_task_by_id_admin(self, db: Session, task_id: str) -> Optional[Task]:
        """
        Get task by ID for admin (no user isolation, includes deleted).

        Args:
            db: Database session
            task_id: Task ID (UUID)

        Returns:
            Task object or None if not found
        """
        return db.query(Task).filter(Task.task_id == task_id).first()
# Global service instance: a single module-level TaskService created at
# import time. The class defines no __init__ in this file.
task_service = TaskService()