Files
OCR/backend/app/services/cleanup_scheduler.py
commit 7233e9cb7b fix: logging, warnings, and soft-delete consistency
- Fix duplicate logging in multi-worker mode with file lock for cleanup scheduler
- Add Pydantic V2 model_config to suppress protected_namespaces warning
- Suppress PaddlePaddle ccache warnings
- Fix admin.py using non-existent User.username (now uses email)
- Fix get_user_stats to exclude soft-deleted tasks from statistics
- Fix create_task to exclude soft-deleted tasks from user limit check
- Change LOG_LEVEL default to INFO

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-14 15:40:31 +08:00

227 lines
7.0 KiB
Python

"""
Tool_OCR - Cleanup Scheduler
Background scheduler for periodic file cleanup
"""
import asyncio
import fcntl
import logging
import os
from datetime import datetime, timedelta
from pathlib import Path
from typing import Optional

from sqlalchemy.orm import Session

from app.core.config import settings
from app.core.database import SessionLocal
from app.services.cleanup_service import cleanup_service
logger = logging.getLogger(__name__)

# Lock file path for multi-worker coordination; lives next to the log file.
_LOCK_FILE = Path(settings.log_file).parent / ".cleanup_scheduler.lock"
# Open file object holding the flock; None when this worker has no lock.
_lock_fd = None


def _try_acquire_lock() -> bool:
    """
    Try to acquire an exclusive advisory lock for the scheduler.

    Only one worker process should run the scheduler: the first worker to
    flock the lock file wins and the rest back off.

    Returns:
        True if the lock was acquired, False if another process holds it
        (or the lock file could not be created/locked).
    """
    global _lock_fd
    try:
        _LOCK_FILE.parent.mkdir(parents=True, exist_ok=True)
        _lock_fd = open(_LOCK_FILE, 'w')
        # LOCK_NB makes this non-blocking: raises OSError if already held.
        fcntl.flock(_lock_fd.fileno(), fcntl.LOCK_EX | fcntl.LOCK_NB)
        # Record our PID for debugging; 'w' mode already truncated the file.
        _lock_fd.write(str(os.getpid()))
        _lock_fd.flush()
        return True
    except OSError:
        # IOError has been an alias of OSError since Python 3.3, so a single
        # OSError catch covers both open() and flock() failures.
        if _lock_fd:
            _lock_fd.close()
        _lock_fd = None
        return False
def _release_lock():
    """Release the scheduler lock file and drop the cached descriptor."""
    global _lock_fd
    if not _lock_fd:
        return
    try:
        fcntl.flock(_lock_fd.fileno(), fcntl.LOCK_UN)
        _lock_fd.close()
    except Exception:
        # Best-effort release: the lock dies with the process anyway.
        pass
    _lock_fd = None
class CleanupScheduler:
    """
    Background scheduler for periodic file cleanup.

    Uses asyncio for non-blocking background execution.
    Uses a file lock to ensure only one instance runs across multiple workers.
    """

    def __init__(self):
        # Handle to the background loop task (None when not started).
        self._task: Optional[asyncio.Task] = None
        self._running: bool = False
        self._last_run: Optional[datetime] = None
        self._next_run: Optional[datetime] = None
        self._last_result: Optional[dict] = None
        # True only in the worker that won the inter-process file lock.
        self._has_lock: bool = False

    @property
    def is_running(self) -> bool:
        """Whether the scheduler loop task is currently alive."""
        return self._running and self._task is not None and not self._task.done()

    @property
    def status(self) -> dict:
        """Snapshot of scheduler configuration and runtime state."""
        return {
            "enabled": settings.cleanup_enabled,
            "running": self.is_running,
            "interval_hours": settings.cleanup_interval_hours,
            "max_files_per_user": settings.max_files_per_user,
            "last_run": self._last_run.isoformat() if self._last_run else None,
            "next_run": self._next_run.isoformat() if self._next_run else None,
            "last_result": self._last_result
        }

    async def start(self):
        """Start the cleanup scheduler (no-op if disabled or already running)."""
        if not settings.cleanup_enabled:
            logger.info("Cleanup scheduler is disabled in configuration")
            return
        if self.is_running:
            logger.debug("Cleanup scheduler is already running")
            return
        # Try to acquire lock - only one worker should run the scheduler
        if not _try_acquire_lock():
            logger.debug("Another worker is running the cleanup scheduler, skipping")
            return
        self._has_lock = True
        self._running = True
        self._task = asyncio.create_task(self._run_loop())
        logger.info(
            f"Cleanup scheduler started (interval: {settings.cleanup_interval_hours}h, "
            f"max_files_per_user: {settings.max_files_per_user})"
        )

    async def stop(self):
        """Stop the cleanup scheduler, cancelling the loop and releasing the lock."""
        self._running = False
        if self._task is not None:
            self._task.cancel()
            try:
                await self._task
            except asyncio.CancelledError:
                pass
            self._task = None
        # Release the lock if we had it
        if self._has_lock:
            _release_lock()
            self._has_lock = False
        logger.info("Cleanup scheduler stopped")

    async def _run_loop(self):
        """Main scheduler loop: run cleanup, then sleep for the configured interval."""
        interval_seconds = settings.cleanup_interval_hours * 3600
        while self._running:
            try:
                # Run cleanup
                await self._execute_cleanup()
                # BUGFIX: the previous hour-arithmetic
                # (replace(hour=(hour + interval) % 24)) never advanced the
                # date and raised/misbehaved for intervals >= 24h; timedelta
                # handles day/month rollover correctly.
                self._next_run = datetime.utcnow() + timedelta(seconds=interval_seconds)
                # Wait for next interval
                logger.debug(f"Cleanup scheduler sleeping for {interval_seconds} seconds")
                await asyncio.sleep(interval_seconds)
            except asyncio.CancelledError:
                logger.info("Cleanup scheduler loop cancelled")
                break
            except Exception as e:
                logger.exception(f"Error in cleanup scheduler loop: {e}")
                # Wait a bit before retrying to avoid tight error loops
                await asyncio.sleep(60)

    async def _execute_cleanup(self):
        """Execute one cleanup pass and record its result."""
        logger.info("Starting scheduled cleanup...")
        self._last_run = datetime.utcnow()
        # Run cleanup in thread pool to avoid blocking the event loop.
        # get_running_loop() replaces the deprecated get_event_loop() inside
        # a coroutine.
        loop = asyncio.get_running_loop()
        result = await loop.run_in_executor(None, self._run_cleanup_sync)
        self._last_result = result
        logger.info(
            f"Scheduled cleanup completed: {result.get('total_files_deleted', 0)} files deleted, "
            f"{result.get('total_bytes_freed', 0)} bytes freed"
        )

    def _run_cleanup_sync(self) -> dict:
        """Synchronous cleanup execution (runs in a thread pool worker)."""
        db: Session = SessionLocal()
        try:
            result = cleanup_service.cleanup_all_users(
                db=db,
                max_files_per_user=settings.max_files_per_user
            )
            return result
        except Exception as e:
            logger.exception(f"Cleanup execution failed: {e}")
            # Report the failure in the result rather than crashing the loop.
            return {
                "error": str(e),
                "timestamp": datetime.utcnow().isoformat()
            }
        finally:
            db.close()

    async def run_now(self) -> dict:
        """Trigger immediate cleanup (outside of scheduled interval)."""
        logger.info("Manual cleanup triggered")
        await self._execute_cleanup()
        return self._last_result or {}
# Process-wide scheduler singleton, created lazily on first access.
_scheduler: Optional[CleanupScheduler] = None


def get_cleanup_scheduler() -> CleanupScheduler:
    """Return the global cleanup scheduler, creating it on first use."""
    global _scheduler
    if _scheduler is not None:
        return _scheduler
    _scheduler = CleanupScheduler()
    return _scheduler
async def start_cleanup_scheduler():
    """Start the global cleanup scheduler (convenience wrapper for app startup)."""
    await get_cleanup_scheduler().start()
async def stop_cleanup_scheduler():
    """Stop the global cleanup scheduler (convenience wrapper for app shutdown)."""
    await get_cleanup_scheduler().stop()