refactor: complete V1 to V2 migration and remove legacy architecture

Remove all V1 architecture components and promote V2 to primary:
- Delete all paddle_ocr_* table models (export, ocr, translation, user)
- Delete legacy routers (auth, export, ocr, translation)
- Delete legacy schemas and services
- Promote user_v2.py to user.py as primary user model
- Update all imports and dependencies to use V2 models only
- Update main.py version to 2.0.0

Database changes:
- Fix SQLAlchemy reserved word: rename audit_log.metadata to extra_data
- Add migration to drop all paddle_ocr_* tables
- Update alembic env to only import V2 models

Frontend fixes:
- Fix Select component exports in TaskHistoryPage.tsx
- Update to use simplified Select API with options prop
- Fix AxiosInstance TypeScript import syntax

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
egg
2025-11-14 21:27:39 +08:00
parent ad2b832fb6
commit fd98018ddd
34 changed files with 554 additions and 3787 deletions

View File

@@ -1,7 +1,7 @@
"""
Tool_OCR - API Routers
Tool_OCR - API Routers (V2)
"""
from app.routers import auth, ocr, export, translation
from app.routers import auth, tasks, admin
__all__ = ["auth", "ocr", "export", "translation"]
__all__ = ["auth", "tasks", "admin"]

View File

@@ -10,8 +10,8 @@ from datetime import datetime
from fastapi import APIRouter, Depends, HTTPException, status, Query
from sqlalchemy.orm import Session
from app.core.deps import get_db, get_current_admin_user_v2
from app.models.user_v2 import User
from app.core.deps import get_db, get_current_admin_user
from app.models.user import User
from app.services.admin_service import admin_service
from app.services.audit_service import audit_service
@@ -23,7 +23,7 @@ router = APIRouter(prefix="/api/v2/admin", tags=["Admin"])
@router.get("/stats", summary="Get system statistics")
async def get_system_stats(
db: Session = Depends(get_db),
admin_user: User = Depends(get_current_admin_user_v2)
admin_user: User = Depends(get_current_admin_user)
):
"""
Get overall system statistics
@@ -47,7 +47,7 @@ async def list_users(
page: int = Query(1, ge=1),
page_size: int = Query(50, ge=1, le=100),
db: Session = Depends(get_db),
admin_user: User = Depends(get_current_admin_user_v2)
admin_user: User = Depends(get_current_admin_user)
):
"""
Get list of all users with statistics
@@ -79,7 +79,7 @@ async def get_top_users(
metric: str = Query("tasks", regex="^(tasks|completed_tasks)$"),
limit: int = Query(10, ge=1, le=50),
db: Session = Depends(get_db),
admin_user: User = Depends(get_current_admin_user_v2)
admin_user: User = Depends(get_current_admin_user)
):
"""
Get top users by metric
@@ -115,7 +115,7 @@ async def get_audit_logs(
page: int = Query(1, ge=1),
page_size: int = Query(100, ge=1, le=500),
db: Session = Depends(get_db),
admin_user: User = Depends(get_current_admin_user_v2)
admin_user: User = Depends(get_current_admin_user)
):
"""
Get audit logs with filtering
@@ -169,7 +169,7 @@ async def get_user_activity_summary(
user_id: int,
days: int = Query(30, ge=1, le=365),
db: Session = Depends(get_db),
admin_user: User = Depends(get_current_admin_user_v2)
admin_user: User = Depends(get_current_admin_user)
):
"""
Get user activity summary for the last N days

View File

@@ -1,70 +1,347 @@
"""
Tool_OCR - Authentication Router
JWT login endpoint
Tool_OCR - External Authentication Router (V2)
Handles authentication via external Microsoft Azure AD API
"""
from datetime import timedelta
from datetime import datetime, timedelta
import logging
from typing import Optional
from fastapi import APIRouter, Depends, HTTPException, status
from fastapi import APIRouter, Depends, HTTPException, status, Request
from sqlalchemy.orm import Session
from app.core.config import settings
from app.core.deps import get_db
from app.core.security import verify_password, create_access_token
from app.core.deps import get_db, get_current_user
from app.core.security import create_access_token
from app.models.user import User
from app.schemas.auth import LoginRequest, Token
from app.models.session import Session as UserSession
from app.schemas.auth import LoginRequest, Token, UserResponse
from app.services.external_auth_service import external_auth_service
logger = logging.getLogger(__name__)
router = APIRouter(prefix="/api/v1/auth", tags=["Authentication"])
router = APIRouter(prefix="/api/v2/auth", tags=["Authentication V2"])
@router.post("/login", response_model=Token, summary="User login")
def get_client_ip(request: Request) -> str:
    """Extract the client IP address from the request.

    Resolution order:

    1. first entry of the ``X-Forwarded-For`` header (set by proxies),
    2. the ``X-Real-IP`` header,
    3. the direct peer address (``request.client.host``),
    4. the literal ``"unknown"`` when none of the above is available.

    Blank values are skipped, so a malformed header such as ``", 10.0.0.1"``
    no longer yields an empty string.

    NOTE(review): both headers are supplied by the caller unless a trusted
    proxy overwrites them - confirm the deployment before relying on this
    value for anything security-sensitive.
    """
    forwarded = request.headers.get("X-Forwarded-For")
    if forwarded:
        first_hop = forwarded.split(",")[0].strip()
        if first_hop:
            return first_hop

    real_ip = (request.headers.get("X-Real-IP") or "").strip()
    if real_ip:
        return real_ip

    # Fallback to the direct client; `request.client` may be missing.
    return request.client.host if request.client else "unknown"
def get_user_agent(request: Request) -> str:
    """Return the request's ``User-Agent`` header, capped at 500 characters.

    Falls back to ``"unknown"`` when the header is absent.
    """
    agent = request.headers.get("User-Agent", "unknown")
    return agent[:500]
@router.post("/login", response_model=Token, summary="External API login")
async def login(
login_data: LoginRequest,
request: Request,
db: Session = Depends(get_db)
):
"""
User login with username and password
User login via external Microsoft Azure AD API
Returns JWT access token for authentication
Returns JWT access token and stores session information
- **username**: User's username
- **username**: User's email address
- **password**: User's password
"""
# Query user by username
user = db.query(User).filter(User.username == login_data.username).first()
# Call external authentication API
success, auth_response, error_msg = await external_auth_service.authenticate_user(
username=login_data.username,
password=login_data.password
)
# Verify user exists and password is correct
if not user or not verify_password(login_data.password, user.password_hash):
logger.warning(f"Failed login attempt for username: {login_data.username}")
if not success or not auth_response:
logger.warning(
f"External auth failed for user {login_data.username}: {error_msg}"
)
raise HTTPException(
status_code=status.HTTP_401_UNAUTHORIZED,
detail="Incorrect username or password",
detail=error_msg or "Authentication failed",
headers={"WWW-Authenticate": "Bearer"},
)
# Check if user is active
if not user.is_active:
logger.warning(f"Inactive user login attempt: {login_data.username}")
raise HTTPException(
status_code=status.HTTP_403_FORBIDDEN,
detail="User account is inactive"
)
# Extract user info from external API response
user_info = auth_response.user_info
email = user_info.email
display_name = user_info.name
# Create access token
access_token_expires = timedelta(minutes=settings.access_token_expire_minutes)
access_token = create_access_token(
data={"sub": str(user.id), "username": user.username},
expires_delta=access_token_expires
# Find or create user in database
user = db.query(User).filter(User.email == email).first()
if not user:
# Create new user
user = User(
email=email,
display_name=display_name,
is_active=True,
last_login=datetime.utcnow()
)
db.add(user)
db.commit()
db.refresh(user)
logger.info(f"Created new user: {email} (ID: {user.id})")
else:
# Update existing user
user.display_name = display_name
user.last_login = datetime.utcnow()
# Check if user is active
if not user.is_active:
logger.warning(f"Inactive user login attempt: {email}")
raise HTTPException(
status_code=status.HTTP_403_FORBIDDEN,
detail="User account is inactive"
)
db.commit()
db.refresh(user)
logger.info(f"Updated existing user: {email} (ID: {user.id})")
# Parse token expiration
try:
expires_at = datetime.fromisoformat(auth_response.expires_at.replace('Z', '+00:00'))
issued_at = datetime.fromisoformat(auth_response.issued_at.replace('Z', '+00:00'))
except Exception as e:
logger.error(f"Failed to parse token timestamps: {e}")
expires_at = datetime.utcnow() + timedelta(seconds=auth_response.expires_in)
issued_at = datetime.utcnow()
# Create session in database
# TODO: Implement token encryption before storing
session = UserSession(
user_id=user.id,
access_token=auth_response.access_token, # Should be encrypted
id_token=auth_response.id_token, # Should be encrypted
token_type=auth_response.token_type,
expires_at=expires_at,
issued_at=issued_at,
ip_address=get_client_ip(request),
user_agent=get_user_agent(request)
)
db.add(session)
db.commit()
db.refresh(session)
logger.info(
f"Created session {session.id} for user {user.email} "
f"(expires: {expires_at})"
)
logger.info(f"Successful login: {user.username} (ID: {user.id})")
# Create internal JWT token for API access
# This token contains user ID and session ID
internal_token_expires = timedelta(minutes=settings.access_token_expire_minutes)
internal_access_token = create_access_token(
data={
"sub": str(user.id),
"email": user.email,
"session_id": session.id
},
expires_delta=internal_token_expires
)
return {
"access_token": access_token,
"access_token": internal_access_token,
"token_type": "bearer",
"expires_in": settings.access_token_expire_minutes * 60 # Convert to seconds
"expires_in": int(internal_token_expires.total_seconds()),
"user": {
"id": user.id,
"email": user.email,
"display_name": user.display_name
}
}
@router.post("/logout", summary="User logout")
async def logout(
    session_id: Optional[int] = None,
    db: Session = Depends(get_db),
    current_user: User = Depends(get_current_user)
):
    """
    User logout - invalidates session

    - **session_id**: Session ID to logout (optional, logs out all if not provided)

    Only sessions owned by the authenticated user are considered. Responds
    with 404 when a specific ``session_id`` is given but no such session
    belongs to the user.
    """
    # Compare against None explicitly: a falsy ID such as 0 is still a request
    # for one specific session and must not be widened to "log out everywhere".
    if session_id is not None:
        session = db.query(UserSession).filter(
            UserSession.id == session_id,
            UserSession.user_id == current_user.id
        ).first()

        if not session:
            raise HTTPException(
                status_code=status.HTTP_404_NOT_FOUND,
                detail="Session not found"
            )

        db.delete(session)
        db.commit()
        logger.info(f"Logged out session {session_id} for user {current_user.email}")
        return {"message": "Logged out successfully"}

    # No session specified: remove every session belonging to the user.
    sessions = db.query(UserSession).filter(
        UserSession.user_id == current_user.id
    ).all()

    count = len(sessions)
    for session in sessions:
        db.delete(session)
    db.commit()

    logger.info(f"Logged out all {count} sessions for user {current_user.email}")
    return {"message": f"Logged out {count} sessions"}
@router.get("/me", response_model=UserResponse, summary="Get current user")
async def get_me(
    current_user: User = Depends(get_current_user)
):
    """
    Return the profile of the authenticated user

    The user is resolved by the ``get_current_user`` dependency; this handler
    only copies the public fields into the response payload.
    """
    exposed_fields = (
        "id",
        "email",
        "display_name",
        "created_at",
        "last_login",
        "is_active",
    )
    return {field: getattr(current_user, field) for field in exposed_fields}
@router.get("/sessions", summary="List user sessions")
async def list_sessions(
    db: Session = Depends(get_db),
    current_user: User = Depends(get_current_user)
):
    """
    List the current user's stored sessions, newest first

    No expiry filter is applied by the query; each entry carries
    ``is_expired`` and ``time_until_expiry`` so the caller can tell live
    sessions from stale ones. Stored tokens are not included in the output.
    """
    owned_sessions = (
        db.query(UserSession)
        .filter(UserSession.user_id == current_user.id)
        .order_by(UserSession.created_at.desc())
        .all()
    )

    payload = []
    for entry in owned_sessions:
        payload.append({
            "id": entry.id,
            "token_type": entry.token_type,
            "expires_at": entry.expires_at,
            "issued_at": entry.issued_at,
            "ip_address": entry.ip_address,
            "user_agent": entry.user_agent,
            "created_at": entry.created_at,
            "last_accessed_at": entry.last_accessed_at,
            "is_expired": entry.is_expired,
            "time_until_expiry": entry.time_until_expiry
        })

    return {"sessions": payload}
@router.post("/refresh", response_model=Token, summary="Refresh access token")
async def refresh_token(
    request: Request,
    db: Session = Depends(get_db),
    current_user: User = Depends(get_current_user)
):
    """
    Refresh access token before expiration

    Issues a fresh internal JWT bound to the user's most recently created
    session. Since the external API doesn't provide refresh tokens, the
    external token itself is never renewed here: when it is close to expiry a
    warning is logged and the internal token is re-issued anyway.

    Responds with 401 when the user has no stored session and with 500 on any
    unexpected failure.
    """
    # `request` is not read; it stays in the signature so the endpoint's
    # interface is unchanged.
    try:
        # Find user's most recent session
        session = db.query(UserSession).filter(
            UserSession.user_id == current_user.id
        ).order_by(UserSession.created_at.desc()).first()

        if not session:
            raise HTTPException(
                status_code=status.HTTP_401_UNAUTHORIZED,
                detail="No active session found"
            )

        # Both outcomes issue the same token and response; only the log differs.
        expiring_soon = external_auth_service.is_token_expiring_soon(session.expires_at)
        if expiring_soon:
            # External token expiring soon - would need re-authentication.
            logger.warning(
                f"External token expiring soon for user {current_user.email}. "
                "User should re-authenticate."
            )

        internal_token_expires = timedelta(minutes=settings.access_token_expire_minutes)
        internal_access_token = create_access_token(
            data={
                "sub": str(current_user.id),
                "email": current_user.email,
                "session_id": session.id
            },
            expires_delta=internal_token_expires
        )

        if not expiring_soon:
            logger.info(f"Refreshed internal token for user {current_user.email}")

        return {
            "access_token": internal_access_token,
            "token_type": "bearer",
            "expires_in": int(internal_token_expires.total_seconds()),
            "user": {
                "id": current_user.id,
                "email": current_user.email,
                "display_name": current_user.display_name
            }
        }

    except HTTPException:
        # Deliberate HTTP errors (the 401 above) pass through untouched.
        raise
    except Exception as e:
        logger.exception(f"Token refresh failed for user {current_user.id}")
        # NOTE(review): the detail echoes the internal exception text to the
        # client - confirm that is acceptable to expose.
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Token refresh failed: {str(e)}"
        ) from e

View File

@@ -1,347 +0,0 @@
"""
Tool_OCR - External Authentication Router (V2)
Handles authentication via external Microsoft Azure AD API
"""
from datetime import datetime, timedelta
import logging
from typing import Optional
from fastapi import APIRouter, Depends, HTTPException, status, Request
from sqlalchemy.orm import Session
from app.core.config import settings
from app.core.deps import get_db, get_current_user_v2
from app.core.security import create_access_token
from app.models.user_v2 import User
from app.models.session import Session as UserSession
from app.schemas.auth import LoginRequest, Token, UserResponse
from app.services.external_auth_service import external_auth_service
logger = logging.getLogger(__name__)
router = APIRouter(prefix="/api/v2/auth", tags=["Authentication V2"])
def get_client_ip(request: Request) -> str:
    """Extract the client IP address from the request.

    Resolution order:

    1. first entry of the ``X-Forwarded-For`` header (set by proxies),
    2. the ``X-Real-IP`` header,
    3. the direct peer address (``request.client.host``),
    4. the literal ``"unknown"`` when none of the above is available.

    Blank values are skipped, so a malformed header such as ``", 10.0.0.1"``
    no longer yields an empty string.

    NOTE(review): both headers are supplied by the caller unless a trusted
    proxy overwrites them - confirm the deployment before relying on this
    value for anything security-sensitive.
    """
    forwarded = request.headers.get("X-Forwarded-For")
    if forwarded:
        first_hop = forwarded.split(",")[0].strip()
        if first_hop:
            return first_hop

    real_ip = (request.headers.get("X-Real-IP") or "").strip()
    if real_ip:
        return real_ip

    # Fallback to the direct client; `request.client` may be missing.
    return request.client.host if request.client else "unknown"
def get_user_agent(request: Request) -> str:
    """Return the ``User-Agent`` header truncated to at most 500 characters.

    ``"unknown"`` is used when the request carries no such header.
    """
    raw_agent = request.headers.get("User-Agent", "unknown")
    truncated = raw_agent[:500]
    return truncated
@router.post("/login", response_model=Token, summary="External API login")
async def login(
    login_data: LoginRequest,
    request: Request,
    db: Session = Depends(get_db)
):
    """
    User login via external Microsoft Azure AD API

    Returns JWT access token and stores session information

    - **username**: User's email address
    - **password**: User's password

    Flow: authenticate against the external service, find or create the local
    user by email, persist a session row holding the external tokens, then
    issue an internal JWT carrying the user ID and session ID.

    Responds with 401 when external authentication fails and with 403 when
    the matching local account is inactive.
    """
    # Call external authentication API
    success, auth_response, error_msg = await external_auth_service.authenticate_user(
        username=login_data.username,
        password=login_data.password
    )

    if not success or not auth_response:
        logger.warning(
            f"External auth failed for user {login_data.username}: {error_msg}"
        )
        raise HTTPException(
            status_code=status.HTTP_401_UNAUTHORIZED,
            detail=error_msg or "Authentication failed",
            headers={"WWW-Authenticate": "Bearer"},
        )

    # Extract user info from external API response
    user_info = auth_response.user_info
    email = user_info.email
    display_name = user_info.name

    # Find or create user in database
    user = db.query(User).filter(User.email == email).first()

    if not user:
        user = User(
            email=email,
            display_name=display_name,
            is_active=True,
            last_login=datetime.utcnow()
        )
        db.add(user)
        db.commit()
        db.refresh(user)
        logger.info(f"Created new user: {email} (ID: {user.id})")
    else:
        # Reject inactive accounts before touching the row, so the 403 path
        # leaves no pending changes on the database session.
        if not user.is_active:
            logger.warning(f"Inactive user login attempt: {email}")
            raise HTTPException(
                status_code=status.HTTP_403_FORBIDDEN,
                detail="User account is inactive"
            )

        user.display_name = display_name
        user.last_login = datetime.utcnow()
        db.commit()
        db.refresh(user)
        logger.info(f"Updated existing user: {email} (ID: {user.id})")

    # Parse token expiration
    try:
        expires_at = datetime.fromisoformat(auth_response.expires_at.replace('Z', '+00:00'))
        issued_at = datetime.fromisoformat(auth_response.issued_at.replace('Z', '+00:00'))
    except (AttributeError, TypeError, ValueError) as e:
        # Missing, non-string or malformed timestamps: derive them from
        # `expires_in` instead of failing the login.
        # NOTE(review): the parsed values above are timezone-aware while this
        # fallback is naive UTC - confirm the session columns accept both.
        logger.error(f"Failed to parse token timestamps: {e}")
        expires_at = datetime.utcnow() + timedelta(seconds=auth_response.expires_in)
        issued_at = datetime.utcnow()

    # Create session in database
    # TODO: Implement token encryption before storing
    session = UserSession(
        user_id=user.id,
        access_token=auth_response.access_token,  # Should be encrypted
        id_token=auth_response.id_token,  # Should be encrypted
        token_type=auth_response.token_type,
        expires_at=expires_at,
        issued_at=issued_at,
        ip_address=get_client_ip(request),
        user_agent=get_user_agent(request)
    )
    db.add(session)
    db.commit()
    db.refresh(session)

    logger.info(
        f"Created session {session.id} for user {user.email} "
        f"(expires: {expires_at})"
    )

    # Create internal JWT token for API access
    # This token contains user ID and session ID
    internal_token_expires = timedelta(minutes=settings.access_token_expire_minutes)
    internal_access_token = create_access_token(
        data={
            "sub": str(user.id),
            "email": user.email,
            "session_id": session.id
        },
        expires_delta=internal_token_expires
    )

    return {
        "access_token": internal_access_token,
        "token_type": "bearer",
        "expires_in": int(internal_token_expires.total_seconds()),
        "user": {
            "id": user.id,
            "email": user.email,
            "display_name": user.display_name
        }
    }
@router.post("/logout", summary="User logout")
async def logout(
    session_id: Optional[int] = None,
    db: Session = Depends(get_db),
    current_user: User = Depends(get_current_user_v2)
):
    """
    User logout - invalidates session

    - **session_id**: Session ID to logout (optional, logs out all if not provided)

    Only sessions owned by the authenticated user are considered. Responds
    with 404 when a specific ``session_id`` is given but no such session
    belongs to the user.
    """
    # Compare against None explicitly: a falsy ID such as 0 is still a request
    # for one specific session and must not be widened to "log out everywhere".
    if session_id is not None:
        session = db.query(UserSession).filter(
            UserSession.id == session_id,
            UserSession.user_id == current_user.id
        ).first()

        if not session:
            raise HTTPException(
                status_code=status.HTTP_404_NOT_FOUND,
                detail="Session not found"
            )

        db.delete(session)
        db.commit()
        logger.info(f"Logged out session {session_id} for user {current_user.email}")
        return {"message": "Logged out successfully"}

    # No session specified: remove every session belonging to the user.
    sessions = db.query(UserSession).filter(
        UserSession.user_id == current_user.id
    ).all()

    count = len(sessions)
    for session in sessions:
        db.delete(session)
    db.commit()

    logger.info(f"Logged out all {count} sessions for user {current_user.email}")
    return {"message": f"Logged out {count} sessions"}
@router.get("/me", response_model=UserResponse, summary="Get current user")
async def get_me(
    current_user: User = Depends(get_current_user_v2)
):
    """
    Return the profile of the authenticated user

    The user is resolved by the ``get_current_user_v2`` dependency; this
    handler only copies the public fields into the response payload.
    """
    exposed_fields = (
        "id",
        "email",
        "display_name",
        "created_at",
        "last_login",
        "is_active",
    )
    return {field: getattr(current_user, field) for field in exposed_fields}
@router.get("/sessions", summary="List user sessions")
async def list_sessions(
    db: Session = Depends(get_db),
    current_user: User = Depends(get_current_user_v2)
):
    """
    List the current user's stored sessions, newest first

    No expiry filter is applied by the query; each entry carries
    ``is_expired`` and ``time_until_expiry`` so the caller can tell live
    sessions from stale ones. Stored tokens are not included in the output.
    """
    owned_sessions = (
        db.query(UserSession)
        .filter(UserSession.user_id == current_user.id)
        .order_by(UserSession.created_at.desc())
        .all()
    )

    payload = []
    for entry in owned_sessions:
        payload.append({
            "id": entry.id,
            "token_type": entry.token_type,
            "expires_at": entry.expires_at,
            "issued_at": entry.issued_at,
            "ip_address": entry.ip_address,
            "user_agent": entry.user_agent,
            "created_at": entry.created_at,
            "last_accessed_at": entry.last_accessed_at,
            "is_expired": entry.is_expired,
            "time_until_expiry": entry.time_until_expiry
        })

    return {"sessions": payload}
@router.post("/refresh", response_model=Token, summary="Refresh access token")
async def refresh_token(
    request: Request,
    db: Session = Depends(get_db),
    current_user: User = Depends(get_current_user_v2)
):
    """
    Refresh access token before expiration

    Issues a fresh internal JWT bound to the user's most recently created
    session. Since the external API doesn't provide refresh tokens, the
    external token itself is never renewed here: when it is close to expiry a
    warning is logged and the internal token is re-issued anyway.

    Responds with 401 when the user has no stored session and with 500 on any
    unexpected failure.
    """
    # `request` is not read; it stays in the signature so the endpoint's
    # interface is unchanged.
    try:
        # Find user's most recent session
        session = db.query(UserSession).filter(
            UserSession.user_id == current_user.id
        ).order_by(UserSession.created_at.desc()).first()

        if not session:
            raise HTTPException(
                status_code=status.HTTP_401_UNAUTHORIZED,
                detail="No active session found"
            )

        # Both outcomes issue the same token and response; only the log differs.
        expiring_soon = external_auth_service.is_token_expiring_soon(session.expires_at)
        if expiring_soon:
            # External token expiring soon - would need re-authentication.
            logger.warning(
                f"External token expiring soon for user {current_user.email}. "
                "User should re-authenticate."
            )

        internal_token_expires = timedelta(minutes=settings.access_token_expire_minutes)
        internal_access_token = create_access_token(
            data={
                "sub": str(current_user.id),
                "email": current_user.email,
                "session_id": session.id
            },
            expires_delta=internal_token_expires
        )

        if not expiring_soon:
            logger.info(f"Refreshed internal token for user {current_user.email}")

        return {
            "access_token": internal_access_token,
            "token_type": "bearer",
            "expires_in": int(internal_token_expires.total_seconds()),
            "user": {
                "id": current_user.id,
                "email": current_user.email,
                "display_name": current_user.display_name
            }
        }

    except HTTPException:
        # Deliberate HTTP errors (the 401 above) pass through untouched.
        raise
    except Exception as e:
        logger.exception(f"Token refresh failed for user {current_user.id}")
        # NOTE(review): the detail echoes the internal exception text to the
        # client - confirm that is acceptable to expose.
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Token refresh failed: {str(e)}"
        ) from e

View File

@@ -1,338 +0,0 @@
"""
Tool_OCR - Export Router
Export results in multiple formats
"""
import logging
from typing import List
from pathlib import Path
from fastapi import APIRouter, Depends, HTTPException, status
from fastapi.responses import FileResponse
from sqlalchemy.orm import Session
from app.core.deps import get_db, get_current_active_user
from app.models.user import User
from app.models.ocr import OCRBatch, OCRFile, OCRResult, FileStatus
from app.models.export import ExportRule
from app.schemas.export import (
ExportRequest,
ExportRuleCreate,
ExportRuleUpdate,
ExportRuleResponse,
CSSTemplateResponse,
)
from app.services.export_service import ExportService, ExportError
from app.services.pdf_generator import PDFGenerator
logger = logging.getLogger(__name__)
router = APIRouter(prefix="/api/v1/export", tags=["Export"])
# Initialize services
export_service = ExportService()
pdf_generator = PDFGenerator()
@router.post("", summary="Export OCR results")
async def export_results(
    request: ExportRequest,
    db: Session = Depends(get_db),
    current_user: User = Depends(get_current_active_user)
):
    """
    Export OCR results in specified format

    Formats handled by this endpoint: txt, json, excel, markdown, zip.
    Any other value (including pdf, which has no branch here) is rejected
    with 400.

    - **batch_id**: Batch ID to export
    - **format**: Export format (txt, json, excel, markdown, zip)
    - **rule_id**: Optional export rule ID to apply filters
    - **css_template**: CSS template for PDF export (not read by this endpoint)
    - **include_formats**: Formats to include in ZIP export

    Responds with 404 when the batch is not owned by the caller, has no
    completed results, or the export rule cannot be applied; 400 for an
    unsupported format; 500 when the export itself fails.
    """
    # Verify batch ownership
    batch = db.query(OCRBatch).filter(
        OCRBatch.id == request.batch_id,
        OCRBatch.user_id == current_user.id
    ).first()

    if not batch:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail="Batch not found"
        )

    # Get completed results
    results = db.query(OCRResult).join(OCRFile).filter(
        OCRFile.batch_id == request.batch_id,
        OCRFile.status == FileStatus.COMPLETED
    ).all()

    if not results:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail="No completed results found for this batch"
        )

    # Apply export rule if specified
    if request.rule_id:
        try:
            results = export_service.apply_export_rule(db, results, request.rule_id)
        except ExportError as e:
            raise HTTPException(
                status_code=status.HTTP_404_NOT_FOUND,
                detail=str(e)
            ) from e

    try:
        # Generate export based on format
        export_dir = Path(f"uploads/batches/{batch.id}/exports")
        export_dir.mkdir(parents=True, exist_ok=True)

        if request.format == "txt":
            output_path = export_dir / f"batch_{batch.id}_export.txt"
            export_service.export_to_txt(results, output_path)
        elif request.format == "json":
            output_path = export_dir / f"batch_{batch.id}_export.json"
            export_service.export_to_json(results, output_path)
        elif request.format == "excel":
            output_path = export_dir / f"batch_{batch.id}_export.xlsx"
            export_service.export_to_excel(results, output_path)
        elif request.format == "markdown":
            output_path = export_dir / f"batch_{batch.id}_export.md"
            export_service.export_to_markdown(results, output_path, combine=True)
        elif request.format == "zip":
            output_path = export_dir / f"batch_{batch.id}_export.zip"
            include_formats = request.include_formats or ["markdown", "json"]
            export_service.export_batch_to_zip(db, batch.id, output_path, include_formats)
        else:
            raise HTTPException(
                status_code=status.HTTP_400_BAD_REQUEST,
                detail=f"Unsupported export format: {request.format}"
            )

        logger.info(f"Exported batch {batch.id} to {request.format} format: {output_path}")

        # Return file for download
        return FileResponse(
            path=str(output_path),
            filename=output_path.name,
            media_type="application/octet-stream"
        )

    except HTTPException:
        # HTTPException is an Exception subclass: without this clause the 400
        # raised above would be swallowed by the generic handler below and
        # reported to the client as a 500 "Export failed".
        raise
    except ExportError as e:
        logger.error(f"Export error: {e}")
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=str(e)
        ) from e
    except Exception as e:
        # logger.exception keeps the traceback that logger.error discarded.
        logger.exception(f"Unexpected export error: {e}")
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail="Export failed"
        ) from e
@router.get("/pdf/{file_id}", summary="Generate PDF for single file")
async def generate_pdf(
    file_id: int,
    css_template: str = "default",
    db: Session = Depends(get_db),
    current_user: User = Depends(get_current_active_user)
):
    """
    Render one OCR'd file as a layout-preserved PDF and return it for download

    - **file_id**: File ID
    - **css_template**: CSS template (default, academic, business)

    Responds with 404 when the file is not in a batch owned by the caller or
    has no OCR result, and with 500 when PDF generation raises ExportError.
    """
    # Ownership is enforced through the join against the caller's batches.
    owned_file = (
        db.query(OCRFile)
        .join(OCRBatch)
        .filter(OCRFile.id == file_id, OCRBatch.user_id == current_user.id)
        .first()
    )
    if not owned_file:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail="File not found"
        )

    ocr_result = db.query(OCRResult).filter(OCRResult.file_id == file_id).first()
    if not ocr_result:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail="OCR result not found"
        )

    try:
        target_dir = Path(f"uploads/batches/{owned_file.batch_id}/exports")
        target_dir.mkdir(parents=True, exist_ok=True)
        pdf_path = target_dir / f"file_{file_id}_export.pdf"

        export_service.export_to_pdf(
            result=ocr_result,
            output_path=pdf_path,
            css_template=css_template,
            metadata={"title": owned_file.original_filename}
        )
        logger.info(f"Generated PDF for file {file_id}: {pdf_path}")

        # The download is named after the uploaded file, not the on-disk export.
        download_name = f"{Path(owned_file.original_filename).stem}.pdf"
        return FileResponse(
            path=str(pdf_path),
            filename=download_name,
            media_type="application/pdf"
        )
    except ExportError as e:
        logger.error(f"PDF generation error: {e}")
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=str(e)
        )
@router.get("/rules", response_model=List[ExportRuleResponse], summary="List export rules")
async def list_export_rules(
    db: Session = Depends(get_db),
    current_user: User = Depends(get_current_active_user)
):
    """
    Return every export rule saved by the current user
    """
    owned_rules = db.query(ExportRule).filter(ExportRule.user_id == current_user.id)
    return owned_rules.all()
@router.post("/rules", response_model=ExportRuleResponse, summary="Create export rule")
async def create_export_rule(
    rule: ExportRuleCreate,
    db: Session = Depends(get_db),
    current_user: User = Depends(get_current_active_user)
):
    """
    Save a custom export configuration for later reuse

    - **rule_name**: Rule name
    - **description**: Optional description
    - **config_json**: Rule configuration (filters, formatting, export_options)
    - **css_template**: Optional custom CSS for PDF export

    The rule is stored under the current user and returned after the commit.
    """
    rule_fields = {
        "user_id": current_user.id,
        "rule_name": rule.rule_name,
        "description": rule.description,
        "config_json": rule.config_json,
        "css_template": rule.css_template,
    }
    created = ExportRule(**rule_fields)

    db.add(created)
    db.commit()
    db.refresh(created)

    logger.info(f"Created export rule {created.id} for user {current_user.id}")
    return created
@router.put("/rules/{rule_id}", response_model=ExportRuleResponse, summary="Update export rule")
async def update_export_rule(
    rule_id: int,
    rule: ExportRuleUpdate,
    db: Session = Depends(get_db),
    current_user: User = Depends(get_current_active_user)
):
    """
    Apply a partial update to one of the current user's export rules

    - **rule_id**: Rule ID to update
    - **rule_name**: Optional new rule name
    - **description**: Optional new description
    - **config_json**: Optional new configuration
    - **css_template**: Optional new CSS template

    Only fields present in the request body are written. Responds with 404
    when the rule does not exist or belongs to another user.
    """
    existing = (
        db.query(ExportRule)
        .filter(ExportRule.id == rule_id, ExportRule.user_id == current_user.id)
        .first()
    )
    if not existing:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail="Export rule not found"
        )

    # exclude_unset keeps omitted fields from being overwritten.
    for attr_name, new_value in rule.dict(exclude_unset=True).items():
        setattr(existing, attr_name, new_value)

    db.commit()
    db.refresh(existing)

    logger.info(f"Updated export rule {rule_id}")
    return existing
@router.delete("/rules/{rule_id}", summary="Delete export rule")
async def delete_export_rule(
    rule_id: int,
    db: Session = Depends(get_db),
    current_user: User = Depends(get_current_active_user)
):
    """
    Remove one of the current user's export rules

    - **rule_id**: Rule ID to delete

    Responds with 404 when the rule does not exist or belongs to another user.
    """
    doomed = (
        db.query(ExportRule)
        .filter(ExportRule.id == rule_id, ExportRule.user_id == current_user.id)
        .first()
    )
    if not doomed:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail="Export rule not found"
        )

    db.delete(doomed)
    db.commit()

    logger.info(f"Deleted export rule {rule_id}")
    return {"message": "Export rule deleted successfully"}
@router.get("/css-templates", response_model=List[CSSTemplateResponse], summary="List CSS templates")
async def list_css_templates():
    """
    List the CSS templates available for PDF generation

    Each entry pairs a template name with its description, as reported by
    ``pdf_generator.get_available_templates()``.
    """
    listing = []
    for template_name, template_desc in pdf_generator.get_available_templates().items():
        listing.append({"name": template_name, "description": template_desc})
    return listing

View File

@@ -1,244 +0,0 @@
"""
Tool_OCR - OCR Router
File upload, OCR processing, and status endpoints
"""
import logging
from typing import List
from pathlib import Path
from fastapi import APIRouter, Depends, HTTPException, status, UploadFile, File, BackgroundTasks
from sqlalchemy.orm import Session
from app.core.deps import get_db, get_current_active_user
from app.models.user import User
from app.models.ocr import OCRBatch, OCRFile, OCRResult, BatchStatus, FileStatus
from app.schemas.ocr import (
OCRBatchResponse,
BatchStatusResponse,
FileStatusResponse,
OCRResultDetailResponse,
UploadBatchResponse,
ProcessRequest,
ProcessResponse,
)
from app.services.file_manager import FileManager, FileManagementError
from app.services.ocr_service import OCRService
from app.services.background_tasks import process_batch_files_with_retry
# Module-level logger, named after this module's import path
logger = logging.getLogger(__name__)
# Every route declared on this router is served under the /api/v1 prefix
# and grouped under the "OCR" tag in the generated API docs
router = APIRouter(prefix="/api/v1", tags=["OCR"])
# Initialize services
# One instance of each is created at import time; file_manager is used by
# the upload endpoint to create batches and attach files.
# NOTE(review): ocr_service is not referenced by any handler visible in this
# module — confirm whether it is still needed here.
file_manager = FileManager()
ocr_service = OCRService()
@router.post("/upload", response_model=UploadBatchResponse, summary="Upload files for OCR")
async def upload_files(
    files: List[UploadFile] = File(..., description="Files to upload (PNG, JPG, PDF)"),
    batch_name: str = None,
    db: Session = Depends(get_db),
    current_user: User = Depends(get_current_active_user)
):
    """
    Upload files for OCR processing

    Creates a new batch owned by the current user and uploads the files to it.

    - **files**: List of files to upload (PNG, JPG, JPEG, PDF)
    - **batch_name**: Optional name for the batch

    Returns the new batch ID together with the uploaded file records.

    Raises:
        HTTPException 400: no files were provided, or the file manager
            rejected the upload (FileManagementError).
        HTTPException 500: any other unexpected failure during upload.
    """
    if not files:
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail="No files provided"
        )

    try:
        # Create batch
        batch = file_manager.create_batch(db, current_user.id, batch_name)

        # Upload files
        uploaded_files = file_manager.add_files_to_batch(db, batch.id, files)
        logger.info(f"Uploaded {len(uploaded_files)} files to batch {batch.id} for user {current_user.id}")

        # Refresh batch to get updated counts
        db.refresh(batch)

        # Return response matching frontend expectations
        return {
            "batch_id": batch.id,
            "files": uploaded_files
        }
    except FileManagementError as e:
        # Expected, user-correctable failure: the message is safe to surface.
        logger.error(f"File upload error: {e}")
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail=str(e)
        ) from e
    except Exception as e:
        # logger.exception records the traceback; without it an unexpected
        # failure is reduced to a one-line message that cannot be diagnosed.
        # The client still only sees a generic 500.
        logger.exception(f"Unexpected error during upload: {e}")
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail="Failed to upload files"
        ) from e
# NOTE: process_batch_files function moved to app.services.background_tasks
# Now using process_batch_files_with_retry with retry logic
@router.post("/ocr/process", response_model=ProcessResponse, summary="Trigger OCR processing")
async def process_ocr(
    request: ProcessRequest,
    background_tasks: BackgroundTasks,
    db: Session = Depends(get_db),
    current_user: User = Depends(get_current_active_user)
):
    """
    Kick off OCR processing for a batch

    Schedules background processing of the batch's files and returns
    immediately.

    - **batch_id**: Batch ID to process
    - **lang**: Language code (ch, en, japan, korean)
    - **detect_layout**: Enable layout detection

    Responds with 404 when the batch does not exist for the current user and
    with 400 when the batch is no longer pending.
    """
    # Only batches owned by the caller are eligible
    owned_batch = (
        db.query(OCRBatch)
        .filter(
            OCRBatch.id == request.batch_id,
            OCRBatch.user_id == current_user.id,
        )
        .first()
    )
    if not owned_batch:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail="Batch not found",
        )

    already_started = owned_batch.status != BatchStatus.PENDING
    if already_started:
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail=f"Batch is already {owned_batch.status.value}",
        )

    # The background task receives its own session rather than the
    # request-scoped one injected into this handler.
    task_kwargs = {
        "batch_id": owned_batch.id,
        "lang": request.lang,
        "detect_layout": request.detect_layout,
        "db": SessionLocal(),
    }
    background_tasks.add_task(process_batch_files_with_retry, **task_kwargs)

    logger.info(f"Started OCR processing for batch {owned_batch.id}")

    return {
        "message": "OCR processing started",
        "batch_id": owned_batch.id,
        "total_files": owned_batch.total_files,
        "status": "processing",
    }
@router.get("/batch/{batch_id}/status", response_model=BatchStatusResponse, summary="Get batch status")
async def get_batch_status(
    batch_id: int,
    db: Session = Depends(get_db),
    current_user: User = Depends(get_current_active_user)
):
    """
    Report the state of a batch

    Returns the batch record together with every file that belongs to it.

    - **batch_id**: Batch ID

    Responds with 404 when the batch does not exist for the current user.
    """
    # Restrict the lookup to the caller's own batches
    ownership_filter = (
        OCRBatch.id == batch_id,
        OCRBatch.user_id == current_user.id,
    )
    owned_batch = db.query(OCRBatch).filter(*ownership_filter).first()
    if not owned_batch:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail="Batch not found",
        )

    batch_files = db.query(OCRFile).filter(OCRFile.batch_id == batch_id).all()
    return {"batch": owned_batch, "files": batch_files}
@router.get("/ocr/result/{file_id}", response_model=OCRResultDetailResponse, summary="Get OCR result")
async def get_ocr_result(
    file_id: int,
    db: Session = Depends(get_db),
    current_user: User = Depends(get_current_active_user)
):
    """
    Fetch the OCR result for a single file

    Returns a flattened view of the file record and its OCR result for the
    frontend preview. Result-derived fields are null when no result row
    exists for the file.

    - **file_id**: File ID

    Responds with 404 when the file does not belong to one of the current
    user's batches.
    """
    # Joining through the batch enforces ownership of the file
    owned_file = (
        db.query(OCRFile)
        .join(OCRBatch)
        .filter(
            OCRFile.id == file_id,
            OCRBatch.user_id == current_user.id,
        )
        .first()
    )
    if not owned_file:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail="File not found",
        )

    ocr_result = db.query(OCRResult).filter(OCRResult.file_id == file_id).first()

    markdown_text = None
    result_summary = None
    confidence = None
    if ocr_result:
        if ocr_result.markdown_path:
            md_path = Path(ocr_result.markdown_path)
            if md_path.exists():
                # Best effort: an unreadable markdown file is logged and the
                # response simply carries no markdown content.
                try:
                    markdown_text = md_path.read_text(encoding='utf-8')
                except Exception as e:
                    logger.warning(f"Failed to read markdown file {ocr_result.markdown_path}: {e}")

        result_summary = {
            "total_text_regions": ocr_result.total_text_regions,
            "average_confidence": ocr_result.average_confidence,
            "detected_language": ocr_result.detected_language,
            "layout_data": ocr_result.layout_data,
            "images_metadata": ocr_result.images_metadata,
        }
        confidence = ocr_result.average_confidence

    # Flattened structure matching frontend expectations
    return {
        "file_id": owned_file.id,
        "filename": owned_file.filename,
        "status": owned_file.status.value,
        "markdown_content": markdown_text,
        "json_data": result_summary,
        "confidence": confidence,
        "processing_time": owned_file.processing_time,
    }
# Import SessionLocal for background tasks
from app.core.database import SessionLocal

View File

@@ -10,8 +10,8 @@ from fastapi import APIRouter, Depends, HTTPException, status, Query
from fastapi.responses import FileResponse
from sqlalchemy.orm import Session
from app.core.deps import get_db, get_current_user_v2
from app.models.user_v2 import User
from app.core.deps import get_db, get_current_user
from app.models.user import User
from app.models.task import TaskStatus
from app.schemas.task import (
TaskCreate,
@@ -34,7 +34,7 @@ router = APIRouter(prefix="/api/v2/tasks", tags=["Tasks"])
async def create_task(
task_data: TaskCreate,
db: Session = Depends(get_db),
current_user: User = Depends(get_current_user_v2)
current_user: User = Depends(get_current_user)
):
"""
Create a new OCR task
@@ -72,7 +72,7 @@ async def list_tasks(
order_by: str = Query("created_at"),
order_desc: bool = Query(True),
db: Session = Depends(get_db),
current_user: User = Depends(get_current_user_v2)
current_user: User = Depends(get_current_user)
):
"""
List user's tasks with pagination and filtering
@@ -134,7 +134,7 @@ async def list_tasks(
@router.get("/stats", response_model=TaskStatsResponse)
async def get_task_stats(
db: Session = Depends(get_db),
current_user: User = Depends(get_current_user_v2)
current_user: User = Depends(get_current_user)
):
"""
Get task statistics for current user
@@ -157,7 +157,7 @@ async def get_task_stats(
async def get_task(
task_id: str,
db: Session = Depends(get_db),
current_user: User = Depends(get_current_user_v2)
current_user: User = Depends(get_current_user)
):
"""
Get task details by ID
@@ -184,7 +184,7 @@ async def update_task(
task_id: str,
task_update: TaskUpdate,
db: Session = Depends(get_db),
current_user: User = Depends(get_current_user_v2)
current_user: User = Depends(get_current_user)
):
"""
Update task status and results
@@ -253,7 +253,7 @@ async def update_task(
async def delete_task(
task_id: str,
db: Session = Depends(get_db),
current_user: User = Depends(get_current_user_v2)
current_user: User = Depends(get_current_user)
):
"""
Delete a task
@@ -280,7 +280,7 @@ async def delete_task(
async def download_json(
task_id: str,
db: Session = Depends(get_db),
current_user: User = Depends(get_current_user_v2)
current_user: User = Depends(get_current_user)
):
"""
Download task result as JSON file
@@ -327,7 +327,7 @@ async def download_json(
async def download_markdown(
task_id: str,
db: Session = Depends(get_db),
current_user: User = Depends(get_current_user_v2)
current_user: User = Depends(get_current_user)
):
"""
Download task result as Markdown file
@@ -374,7 +374,7 @@ async def download_markdown(
async def download_pdf(
task_id: str,
db: Session = Depends(get_db),
current_user: User = Depends(get_current_user_v2)
current_user: User = Depends(get_current_user)
):
"""
Download task result as searchable PDF file
@@ -421,7 +421,7 @@ async def download_pdf(
async def start_task(
task_id: str,
db: Session = Depends(get_db),
current_user: User = Depends(get_current_user_v2)
current_user: User = Depends(get_current_user)
):
"""
Start processing a pending task
@@ -459,7 +459,7 @@ async def start_task(
async def cancel_task(
task_id: str,
db: Session = Depends(get_db),
current_user: User = Depends(get_current_user_v2)
current_user: User = Depends(get_current_user)
):
"""
Cancel a pending or processing task
@@ -513,7 +513,7 @@ async def cancel_task(
async def retry_task(
task_id: str,
db: Session = Depends(get_db),
current_user: User = Depends(get_current_user_v2)
current_user: User = Depends(get_current_user)
):
"""
Retry a failed task

View File

@@ -1,189 +0,0 @@
"""
Tool_OCR - Translation Router (RESERVED)
Stub endpoints for future translation feature
"""
import logging
from typing import List
from fastapi import APIRouter, Depends, HTTPException, status
from sqlalchemy.orm import Session
from app.core.deps import get_db, get_current_active_user
from app.models.user import User
from app.schemas.translation import (
TranslationRequest,
TranslationResponse,
TranslationFeatureStatus,
LanguageInfo,
)
from app.services.translation_service import StubTranslationService
# Module-level logger, named after this module's import path
logger = logging.getLogger(__name__)
# Every route declared on this router is served under /api/v1/translate and
# grouped under the "Translation (RESERVED)" tag in the generated API docs
router = APIRouter(prefix="/api/v1/translate", tags=["Translation (RESERVED)"])
@router.get("/status", response_model=TranslationFeatureStatus, summary="Get translation feature status")
async def get_translation_status():
    """
    Report the status of the translation feature

    Returns the current implementation status and roadmap as provided by the
    stub translation service. Translation is a RESERVED feature planned for
    Phase 5.

    **Status**: RESERVED - Not yet implemented
    **Phase**: Phase 5 (Post-production)
    **Priority**: Implemented after production deployment and user feedback
    """
    feature_status = StubTranslationService.get_feature_status()
    return feature_status
@router.get("/languages", response_model=List[LanguageInfo], summary="Get supported languages")
async def get_supported_languages():
    """
    List the languages planned for translation support

    Returns the languages the stub translation service reports as planned
    for when the translation feature is implemented.

    **Status**: RESERVED - Planning phase
    """
    planned_languages = StubTranslationService.get_supported_languages()
    return planned_languages
@router.post("/document", response_model=TranslationResponse, summary="Translate document (RESERVED)")
async def translate_document(
    request: TranslationRequest,
    db: Session = Depends(get_db),
    current_user: User = Depends(get_current_active_user)
):
    """
    Translate an OCR document (RESERVED - NOT IMPLEMENTED)

    Placeholder for future translation functionality. Always responds with
    501 Not Implemented; the error detail echoes the key request fields and
    the roadmap.

    **Expected Functionality** (when implemented):
    - Translate markdown documents while preserving structure
    - Support multiple translation engines (offline, ERNIE, Google, DeepL)
    - Maintain layout and formatting
    - Handle technical terminology

    **Planned Features**:
    - Offline translation (Argos Translate)
    - Cloud API integration (ERNIE, Google, DeepL)
    - Batch translation support
    - Translation memory
    - Glossary support

    **Current Status**: RESERVED for Phase 5 implementation

    ---

    **Request Parameters** (planned):
    - **file_id**: ID of OCR result file to translate
    - **source_lang**: Source language code (zh, en, ja, ko)
    - **target_lang**: Target language code (zh, en, ja, ko)
    - **engine_type**: Translation engine (offline, ernie, google, deepl)
    - **preserve_structure**: Whether to preserve markdown structure
    - **engine_config**: Engine-specific configuration

    **Response** (planned):
    - **task_id**: Translation task ID for tracking progress
    - **status**: Translation status
    - **translated_file_path**: Path to translated file (when completed)
    """
    logger.info(f"Translation request received from user {current_user.id} (stub endpoint)")

    roadmap = {
        "phase": "Phase 5",
        "priority": "Implemented after production deployment",
        "planned_features": [
            "Offline translation (Argos Translate)",
            "Cloud API integration (ERNIE, Google, DeepL)",
            "Structure-preserving markdown translation",
            "Batch translation support",
        ],
    }
    # Echo back what the caller asked for so the 501 is self-explanatory
    echoed_request = {
        "file_id": request.file_id,
        "source_lang": request.source_lang,
        "target_lang": request.target_lang,
        "engine_type": request.engine_type,
    }
    not_implemented_detail = {
        "error": "Translation feature not implemented",
        "message": "This feature is reserved for future development (Phase 5)",
        "status": "RESERVED",
        "roadmap": roadmap,
        "request_received": echoed_request,
        "action": "Please check back in a future release or contact support for updates",
    }

    raise HTTPException(
        status_code=status.HTTP_501_NOT_IMPLEMENTED,
        detail=not_implemented_detail,
    )
@router.get("/task/{task_id}", summary="Get translation task status (RESERVED)")
async def get_translation_task_status(
    task_id: int,
    db: Session = Depends(get_db),
    current_user: User = Depends(get_current_active_user)
):
    """
    Look up a translation task's status (RESERVED - NOT IMPLEMENTED)

    Placeholder for translation progress tracking. Always responds with
    501 Not Implemented and echoes the requested task ID.

    **Planned Functionality**:
    - Real-time translation progress
    - Status updates (pending, processing, completed, failed)
    - Estimated completion time
    - Error reporting

    **Current Status**: RESERVED for Phase 5 implementation
    """
    logger.info(f"Translation status check for task {task_id} from user {current_user.id} (stub endpoint)")

    not_implemented_detail = {
        "error": "Translation feature not implemented",
        "message": "Translation task tracking is reserved for Phase 5",
        "task_id": task_id,
        "status": "RESERVED",
    }
    raise HTTPException(
        status_code=status.HTTP_501_NOT_IMPLEMENTED,
        detail=not_implemented_detail,
    )
@router.delete("/task/{task_id}", summary="Cancel translation task (RESERVED)")
async def cancel_translation_task(
    task_id: int,
    db: Session = Depends(get_db),
    current_user: User = Depends(get_current_active_user)
):
    """
    Cancel a running translation task (RESERVED - NOT IMPLEMENTED)

    Placeholder for translation task cancellation. Always responds with
    501 Not Implemented.

    **Planned Functionality**:
    - Cancel in-progress translations
    - Clean up temporary files
    - Refund credits (if applicable)

    **Current Status**: RESERVED for Phase 5 implementation
    """
    logger.info(f"Translation cancellation request for task {task_id} from user {current_user.id} (stub endpoint)")

    not_implemented_detail = {
        "error": "Translation feature not implemented",
        "message": "This feature is reserved for Phase 5",
        "status": "RESERVED",
    }
    raise HTTPException(
        status_code=status.HTTP_501_NOT_IMPLEMENTED,
        detail=not_implemented_detail,
    )