beabigegg
2025-11-12 22:53:17 +08:00
commit da700721fa
130 changed files with 23393 additions and 0 deletions

5
backend/app/__init__.py Normal file
View File

@@ -0,0 +1,5 @@
"""
Tool_OCR Backend Application
"""
__version__ = "0.1.0"

126
backend/app/core/config.py Normal file
View File

@@ -0,0 +1,126 @@
"""
Tool_OCR - Configuration Management
Loads environment variables and provides centralized configuration
"""
from typing import List
from pydantic_settings import BaseSettings
from pydantic import Field
from pathlib import Path
class Settings(BaseSettings):
"""Application settings loaded from environment variables"""
# ===== Database Configuration =====
mysql_host: str = Field(default="mysql.theaken.com")
mysql_port: int = Field(default=33306)
mysql_user: str = Field(default="A060")
mysql_password: str = Field(default="")
mysql_database: str = Field(default="db_A060")
@property
def database_url(self) -> str:
"""Construct SQLAlchemy database URL"""
return (
f"mysql+pymysql://{self.mysql_user}:{self.mysql_password}"
f"@{self.mysql_host}:{self.mysql_port}/{self.mysql_database}"
)
# ===== Application Configuration =====
backend_port: int = Field(default=12010)
frontend_port: int = Field(default=12011)
secret_key: str = Field(default="your-secret-key-change-this")
algorithm: str = Field(default="HS256")
access_token_expire_minutes: int = Field(default=1440) # 24 hours
# ===== OCR Configuration =====
paddleocr_model_dir: str = Field(default="./models/paddleocr")
ocr_languages: str = Field(default="ch,en,japan,korean")
ocr_confidence_threshold: float = Field(default=0.5)
max_ocr_workers: int = Field(default=4)
@property
def ocr_languages_list(self) -> List[str]:
"""Get OCR languages as list"""
return [lang.strip() for lang in self.ocr_languages.split(",")]
# ===== File Upload Configuration =====
max_upload_size: int = Field(default=52428800) # 50MB
allowed_extensions: str = Field(default="png,jpg,jpeg,pdf,bmp,tiff,doc,docx,ppt,pptx")
upload_dir: str = Field(default="./uploads")
temp_dir: str = Field(default="./uploads/temp")
processed_dir: str = Field(default="./uploads/processed")
images_dir: str = Field(default="./uploads/images")
@property
def allowed_extensions_list(self) -> List[str]:
"""Get allowed extensions as list"""
return [ext.strip() for ext in self.allowed_extensions.split(",")]
# ===== Export Configuration =====
storage_dir: str = Field(default="./storage")
markdown_dir: str = Field(default="./storage/markdown")
json_dir: str = Field(default="./storage/json")
exports_dir: str = Field(default="./storage/exports")
# ===== PDF Generation Configuration =====
pandoc_path: str = Field(default="/opt/homebrew/bin/pandoc")
font_dir: str = Field(default="/System/Library/Fonts")
pdf_page_size: str = Field(default="A4")
pdf_margin_top: int = Field(default=20)
pdf_margin_bottom: int = Field(default=20)
pdf_margin_left: int = Field(default=20)
pdf_margin_right: int = Field(default=20)
# ===== Translation Configuration (Reserved) =====
enable_translation: bool = Field(default=False)
translation_engine: str = Field(default="offline")
argostranslate_models_dir: str = Field(default="./models/argostranslate")
# ===== Background Tasks Configuration =====
task_queue_type: str = Field(default="memory")
redis_url: str = Field(default="redis://localhost:6379/0")
# ===== CORS Configuration =====
cors_origins: str = Field(default="http://localhost:12011,http://127.0.0.1:12011")
@property
def cors_origins_list(self) -> List[str]:
"""Get CORS origins as list"""
return [origin.strip() for origin in self.cors_origins.split(",")]
# ===== Logging Configuration =====
log_level: str = Field(default="INFO")
log_file: str = Field(default="./logs/app.log")
class Config:
# Look for .env in project root (one level up from backend/)
env_file = str(Path(__file__).resolve().parent.parent.parent.parent / ".env")
env_file_encoding = "utf-8"
case_sensitive = False
def ensure_directories(self):
"""Create all necessary directories if they don't exist"""
dirs = [
self.upload_dir,
self.temp_dir,
self.processed_dir,
self.images_dir,
self.storage_dir,
self.markdown_dir,
self.json_dir,
self.exports_dir,
self.paddleocr_model_dir,
Path(self.log_file).parent,
]
if self.enable_translation and self.translation_engine == "offline":
dirs.append(self.argostranslate_models_dir)
for dir_path in dirs:
Path(dir_path).mkdir(parents=True, exist_ok=True)
# Global settings instance
settings = Settings()
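A minimal usage sketch (illustrative, not part of the commit): settings resolve from the process environment or the project-root .env, with variable names matched case-insensitively.

# Hypothetical .env entries (names mirror the Settings fields, case-insensitive):
#   MYSQL_PASSWORD=example-password
#   SECRET_KEY=a-long-random-string
#   CORS_ORIGINS=http://localhost:12011
from app.core.config import settings

settings.ensure_directories()       # create upload/storage/model/log directories
print(settings.database_url)        # mysql+pymysql://A060:...@mysql.theaken.com:33306/db_A060
print(settings.ocr_languages_list)  # ['ch', 'en', 'japan', 'korean']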

41
backend/app/core/database.py Normal file
View File

@@ -0,0 +1,41 @@
"""
Tool_OCR - Database Connection Management
Synchronous SQLAlchemy engine, session factory, and declarative base
"""
from sqlalchemy import create_engine
from sqlalchemy.orm import declarative_base, sessionmaker
from app.core.config import settings
# Create database engine
engine = create_engine(
settings.database_url,
pool_pre_ping=True, # Enable connection health checks
pool_size=10,
max_overflow=20,
echo=False, # Set to True for SQL query logging
)
# Create session factory
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
# Base class for all models
Base = declarative_base()
# Dependency to get database session
def get_db():
"""
Database session dependency for FastAPI endpoints
Usage:
@app.get("/endpoint")
def endpoint(db: Session = Depends(get_db)):
# Use db session here
"""
db = SessionLocal()
try:
yield db
finally:
db.close()

138
backend/app/core/deps.py Normal file
View File

@@ -0,0 +1,138 @@
"""
Tool_OCR - FastAPI Dependencies
Authentication and database session dependencies
"""
from typing import Generator, Optional
import logging
from fastapi import Depends, HTTPException, status
from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
from sqlalchemy.orm import Session
from app.core.database import SessionLocal
from app.core.security import decode_access_token
from app.models.user import User
logger = logging.getLogger(__name__)
# HTTP Bearer token security scheme
security = HTTPBearer()
def get_db() -> Generator:
"""
Database session dependency
Yields:
Session: SQLAlchemy database session
"""
db = SessionLocal()
try:
yield db
finally:
db.close()
def get_current_user(
credentials: HTTPAuthorizationCredentials = Depends(security),
db: Session = Depends(get_db)
) -> User:
"""
Get current authenticated user from JWT token
Args:
credentials: HTTP Bearer credentials
db: Database session
Returns:
User: Current user object
Raises:
HTTPException: If token is invalid or user not found
"""
credentials_exception = HTTPException(
status_code=status.HTTP_401_UNAUTHORIZED,
detail="Could not validate credentials",
headers={"WWW-Authenticate": "Bearer"},
)
# Extract token
token = credentials.credentials
# Decode token
payload = decode_access_token(token)
if payload is None:
raise credentials_exception
# Extract user ID from token (convert from string to int)
user_id_str: Optional[str] = payload.get("sub")
if user_id_str is None:
raise credentials_exception
try:
user_id: int = int(user_id_str)
except (ValueError, TypeError):
raise credentials_exception
# Query user from database
user = db.query(User).filter(User.id == user_id).first()
if user is None:
raise credentials_exception
# Check if user is active
if not user.is_active:
raise HTTPException(
status_code=status.HTTP_403_FORBIDDEN,
detail="Inactive user"
)
return user
def get_current_active_user(
current_user: User = Depends(get_current_user)
) -> User:
"""
Get current active user
Args:
current_user: Current user from get_current_user
Returns:
User: Current active user
Raises:
HTTPException: If user is inactive
"""
if not current_user.is_active:
raise HTTPException(
status_code=status.HTTP_403_FORBIDDEN,
detail="Inactive user"
)
return current_user
def get_current_admin_user(
current_user: User = Depends(get_current_user)
) -> User:
"""
Get current admin user
Args:
current_user: Current user from get_current_user
Returns:
User: Current admin user
Raises:
HTTPException: If user is not admin
"""
if not current_user.is_admin:
raise HTTPException(
status_code=status.HTTP_403_FORBIDDEN,
detail="Not enough privileges"
)
return current_user
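A sketch of wiring these dependencies into a route (illustrative; the admin endpoint path is an assumption, not part of this commit):

from fastapi import APIRouter, Depends
from sqlalchemy.orm import Session

from app.core.deps import get_db, get_current_admin_user
from app.models.user import User

router = APIRouter()

@router.get("/api/v1/admin/users")  # hypothetical admin-only endpoint
def list_users(
    db: Session = Depends(get_db),
    admin: User = Depends(get_current_admin_user),  # 401 on bad token, 403 if not admin
):
    return [{"id": u.id, "username": u.username} for u in db.query(User).all()]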

89
backend/app/core/security.py Normal file
View File

@@ -0,0 +1,89 @@
"""
Tool_OCR - Security Utilities
JWT token generation and password hashing
"""
from datetime import datetime, timedelta
from typing import Optional
import logging
from jose import JWTError, jwt
from passlib.context import CryptContext
from app.core.config import settings
logger = logging.getLogger(__name__)
# Password hashing context
pwd_context = CryptContext(schemes=["bcrypt"], deprecated="auto")
def verify_password(plain_password: str, hashed_password: str) -> bool:
"""
Verify a password against a hash
Args:
plain_password: Plain text password
hashed_password: Hashed password from database
Returns:
bool: True if password matches, False otherwise
"""
return pwd_context.verify(plain_password, hashed_password)
def get_password_hash(password: str) -> str:
"""
Hash a password
Args:
password: Plain text password
Returns:
str: Hashed password
"""
return pwd_context.hash(password)
def create_access_token(data: dict, expires_delta: Optional[timedelta] = None) -> str:
"""
Create JWT access token
Args:
data: Data to encode in token (typically {"sub": user_id})
expires_delta: Optional expiration time delta
Returns:
str: Encoded JWT token
"""
to_encode = data.copy()
if expires_delta:
expire = datetime.utcnow() + expires_delta
else:
expire = datetime.utcnow() + timedelta(minutes=settings.access_token_expire_minutes)
to_encode.update({"exp": expire})
encoded_jwt = jwt.encode(to_encode, settings.secret_key, algorithm=settings.algorithm)
return encoded_jwt
def decode_access_token(token: str) -> Optional[dict]:
"""
Decode and verify JWT access token
Args:
token: JWT token string
Returns:
dict: Decoded token payload, or None if invalid
"""
try:
payload = jwt.decode(token, settings.secret_key, algorithms=[settings.algorithm])
return payload
except JWTError as e:
logger.warning(f"JWT decode error: {e}")
return None
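A round-trip sketch of the helpers above (illustrative values only):

from datetime import timedelta

from app.core.security import (
    create_access_token,
    decode_access_token,
    get_password_hash,
    verify_password,
)

hashed = get_password_hash("secret123")      # bcrypt hash suitable for storage
assert verify_password("secret123", hashed)  # True on a matching password

token = create_access_token({"sub": "1"}, expires_delta=timedelta(minutes=15))
payload = decode_access_token(token)         # {'sub': '1', 'exp': ...} or None
assert payload is not None and payload["sub"] == "1"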

124
backend/app/main.py Normal file
View File

@@ -0,0 +1,124 @@
"""
Tool_OCR - FastAPI Application Entry Point
Main application setup with CORS, routes, and startup/shutdown events
"""
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from contextlib import asynccontextmanager
import logging
import asyncio
from pathlib import Path
from app.core.config import settings
from app.services.background_tasks import task_manager
# Ensure log directory exists before configuring logging
Path(settings.log_file).parent.mkdir(parents=True, exist_ok=True)
# Configure logging
logging.basicConfig(
level=getattr(logging, settings.log_level),
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
handlers=[
logging.FileHandler(settings.log_file),
logging.StreamHandler(),
],
)
logger = logging.getLogger(__name__)
@asynccontextmanager
async def lifespan(app: FastAPI):
"""Application lifespan events"""
# Startup
logger.info("Starting Tool_OCR application...")
# Ensure all directories exist
settings.ensure_directories()
logger.info("All directories created/verified")
# Start cleanup scheduler as background task
cleanup_task = asyncio.create_task(task_manager.start_cleanup_scheduler())
logger.info("Started cleanup scheduler for expired files")
# TODO: Initialize database connection pool
# TODO: Load PaddleOCR models
logger.info("Application startup complete")
yield
# Shutdown
logger.info("Shutting down Tool_OCR application...")
# Cancel cleanup task
cleanup_task.cancel()
try:
await cleanup_task
except asyncio.CancelledError:
logger.info("Cleanup scheduler stopped")
# TODO: Close database connections
# Create FastAPI application
app = FastAPI(
title="Tool_OCR",
description="OCR Batch Processing System with Structure Extraction",
version="0.1.0",
lifespan=lifespan,
)
# Configure CORS
app.add_middleware(
CORSMiddleware,
allow_origins=settings.cors_origins_list,
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)
# Health check endpoint
@app.get("/health")
async def health_check():
"""Health check endpoint"""
return {
"status": "healthy",
"service": "Tool_OCR",
"version": "0.1.0",
}
# Root endpoint
@app.get("/")
async def root():
"""Root endpoint with API information"""
return {
"message": "Tool_OCR API",
"version": "0.1.0",
"docs_url": "/docs",
"health_check": "/health",
}
# Include API routers
from app.routers import auth, ocr, export, translation
app.include_router(auth.router)
app.include_router(ocr.router)
app.include_router(export.router)
app.include_router(translation.router) # RESERVED for Phase 5
if __name__ == "__main__":
import uvicorn
uvicorn.run(
"app.main:app",
host="0.0.0.0",
port=settings.backend_port,
reload=True,
log_level=settings.log_level.lower(),
)
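A quick smoke-test sketch against a running instance (illustrative; assumes the httpx package and the default backend port 12010):

import httpx

resp = httpx.get("http://localhost:12010/health")
assert resp.json()["status"] == "healthy"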

17
backend/app/models/__init__.py Normal file
View File

@@ -0,0 +1,17 @@
"""
Tool_OCR - Database Models
"""
from app.models.user import User
from app.models.ocr import OCRBatch, OCRFile, OCRResult
from app.models.export import ExportRule
from app.models.translation import TranslationConfig
__all__ = [
"User",
"OCRBatch",
"OCRFile",
"OCRResult",
"ExportRule",
"TranslationConfig",
]

55
backend/app/models/export.py Normal file
View File

@@ -0,0 +1,55 @@
"""
Tool_OCR - Export Rule Model
User-defined export rules and formatting configurations
"""
from sqlalchemy import Column, Integer, String, DateTime, Text, ForeignKey, JSON
from sqlalchemy.orm import relationship
from datetime import datetime
from app.core.database import Base
class ExportRule(Base):
"""Export rule configuration for customized output formatting"""
__tablename__ = "paddle_ocr_export_rules"
id = Column(Integer, primary_key=True, index=True)
user_id = Column(Integer, ForeignKey("paddle_ocr_users.id", ondelete="CASCADE"), nullable=False, index=True)
rule_name = Column(String(100), nullable=False)
description = Column(Text, nullable=True)
# Rule configuration stored as JSON
# {
# "filters": {
# "confidence_threshold": 0.8,
# "filename_pattern": "invoice_*",
# "language": "ch"
# },
# "formatting": {
# "add_line_numbers": true,
# "sort_by_position": true,
# "group_by_filename": false
# },
# "export_options": {
# "include_metadata": true,
# "include_confidence": true,
# "include_bounding_boxes": false
# }
# }
config_json = Column(JSON, nullable=False)
# CSS template for PDF export (optional)
# Can reference predefined templates: "default", "academic", "business", "report"
# Or store custom CSS
css_template = Column(Text, nullable=True)
created_at = Column(DateTime, default=datetime.utcnow, nullable=False)
updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow, nullable=False)
# Relationships
user = relationship("User", back_populates="export_rules")
def __repr__(self):
return f"<ExportRule(id={self.id}, name='{self.rule_name}', user_id={self.user_id})>"

122
backend/app/models/ocr.py Normal file
View File

@@ -0,0 +1,122 @@
"""
Tool_OCR - OCR Models
Database models for OCR batches, files, and results
"""
from sqlalchemy import Column, Integer, String, DateTime, Float, Text, ForeignKey, Enum, JSON
from sqlalchemy.orm import relationship
from datetime import datetime
import enum
from app.core.database import Base
class BatchStatus(str, enum.Enum):
"""Batch processing status"""
PENDING = "pending"
PROCESSING = "processing"
COMPLETED = "completed"
PARTIAL = "partial" # Some files failed
FAILED = "failed"
class FileStatus(str, enum.Enum):
"""Individual file processing status"""
PENDING = "pending"
PROCESSING = "processing"
COMPLETED = "completed"
FAILED = "failed"
class OCRBatch(Base):
"""OCR batch processing tracking"""
__tablename__ = "paddle_ocr_batches"
id = Column(Integer, primary_key=True, index=True)
user_id = Column(Integer, ForeignKey("paddle_ocr_users.id", ondelete="CASCADE"), nullable=False, index=True)
batch_name = Column(String(255), nullable=True)
status = Column(Enum(BatchStatus), default=BatchStatus.PENDING, nullable=False, index=True)
total_files = Column(Integer, default=0, nullable=False)
completed_files = Column(Integer, default=0, nullable=False)
failed_files = Column(Integer, default=0, nullable=False)
created_at = Column(DateTime, default=datetime.utcnow, nullable=False, index=True)
started_at = Column(DateTime, nullable=True)
completed_at = Column(DateTime, nullable=True)
# Relationships
user = relationship("User", back_populates="ocr_batches")
files = relationship("OCRFile", back_populates="batch", cascade="all, delete-orphan")
@property
def progress_percentage(self) -> float:
"""Calculate progress percentage"""
if self.total_files == 0:
return 0.0
return (self.completed_files / self.total_files) * 100
def __repr__(self):
return f"<OCRBatch(id={self.id}, status='{self.status}', progress={self.progress_percentage:.1f}%)>"
class OCRFile(Base):
"""Individual file in an OCR batch"""
__tablename__ = "paddle_ocr_files"
id = Column(Integer, primary_key=True, index=True)
batch_id = Column(Integer, ForeignKey("paddle_ocr_batches.id", ondelete="CASCADE"), nullable=False, index=True)
filename = Column(String(255), nullable=False)
original_filename = Column(String(255), nullable=False)
file_path = Column(String(512), nullable=False)
file_size = Column(Integer, nullable=False) # Size in bytes
file_format = Column(String(20), nullable=False) # png, jpg, pdf, etc.
status = Column(Enum(FileStatus), default=FileStatus.PENDING, nullable=False, index=True)
error_message = Column(Text, nullable=True)
retry_count = Column(Integer, default=0, nullable=False) # Number of retry attempts
created_at = Column(DateTime, default=datetime.utcnow, nullable=False)
started_at = Column(DateTime, nullable=True)
completed_at = Column(DateTime, nullable=True)
processing_time = Column(Float, nullable=True) # Processing time in seconds
# Relationships
batch = relationship("OCRBatch", back_populates="files")
result = relationship("OCRResult", back_populates="file", uselist=False, cascade="all, delete-orphan")
def __repr__(self):
return f"<OCRFile(id={self.id}, filename='{self.filename}', status='{self.status}')>"
class OCRResult(Base):
"""OCR processing result with structure and images"""
__tablename__ = "paddle_ocr_results"
id = Column(Integer, primary_key=True, index=True)
file_id = Column(Integer, ForeignKey("paddle_ocr_files.id", ondelete="CASCADE"), unique=True, nullable=False, index=True)
# Output file paths
markdown_path = Column(String(512), nullable=True) # Path to Markdown file
json_path = Column(String(512), nullable=True) # Path to JSON file
images_dir = Column(String(512), nullable=True) # Directory containing extracted images
# OCR metadata
detected_language = Column(String(20), nullable=True) # ch, en, japan, korean
total_text_regions = Column(Integer, default=0, nullable=False)
average_confidence = Column(Float, nullable=True)
# Layout structure data (stored as JSON)
# Contains: layout elements (title, paragraph, table, image, formula), reading order, bounding boxes
layout_data = Column(JSON, nullable=True)
# Extracted images metadata (stored as JSON)
# Contains: list of {image_path, bbox, element_type}
images_metadata = Column(JSON, nullable=True)
created_at = Column(DateTime, default=datetime.utcnow, nullable=False)
# Relationships
file = relationship("OCRFile", back_populates="result")
def __repr__(self):
return f"<OCRResult(id={self.id}, file_id={self.file_id}, language='{self.detected_language}')>"

43
backend/app/models/translation.py Normal file
View File

@@ -0,0 +1,43 @@
"""
Tool_OCR - Translation Config Model (RESERVED)
Reserved for future translation feature implementation
"""
from sqlalchemy import Column, Integer, String, DateTime, ForeignKey, JSON
from sqlalchemy.orm import relationship
from datetime import datetime
from app.core.database import Base
class TranslationConfig(Base):
"""
Translation configuration (RESERVED for future implementation)
This table is created but not actively used until translation feature is implemented.
"""
__tablename__ = "paddle_ocr_translation_configs"
id = Column(Integer, primary_key=True, index=True)
user_id = Column(Integer, ForeignKey("paddle_ocr_users.id", ondelete="CASCADE"), nullable=False, index=True)
source_lang = Column(String(20), nullable=False) # ch, en, japan, korean, etc.
target_lang = Column(String(20), nullable=False) # en, ch, japan, korean, etc.
# Translation engine type: "offline" (argostranslate), "ernie", "google", "deepl"
engine_type = Column(String(50), nullable=False, default="offline")
# Engine-specific configuration stored as JSON
# For offline (argostranslate): {"model_path": "/path/to/model"}
# For API-based: {"api_key": "xxx", "endpoint": "https://..."}
engine_config = Column(JSON, nullable=True)
created_at = Column(DateTime, default=datetime.utcnow, nullable=False)
updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow, nullable=False)
# Relationships
user = relationship("User", back_populates="translation_configs")
def __repr__(self):
return f"<TranslationConfig(id={self.id}, {self.source_lang}->{self.target_lang}, engine='{self.engine_type}')>"

34
backend/app/models/user.py Normal file
View File

@@ -0,0 +1,34 @@
"""
Tool_OCR - User Model
User authentication and management
"""
from sqlalchemy import Column, Integer, String, DateTime, Boolean
from sqlalchemy.orm import relationship
from datetime import datetime
from app.core.database import Base
class User(Base):
"""User model for JWT authentication"""
__tablename__ = "paddle_ocr_users"
id = Column(Integer, primary_key=True, index=True)
username = Column(String(50), unique=True, nullable=False, index=True)
email = Column(String(100), unique=True, nullable=False, index=True)
password_hash = Column(String(255), nullable=False)
full_name = Column(String(100), nullable=True)
is_active = Column(Boolean, default=True, nullable=False)
is_admin = Column(Boolean, default=False, nullable=False)
created_at = Column(DateTime, default=datetime.utcnow, nullable=False)
updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow, nullable=False)
# Relationships
ocr_batches = relationship("OCRBatch", back_populates="user", cascade="all, delete-orphan")
export_rules = relationship("ExportRule", back_populates="user", cascade="all, delete-orphan")
translation_configs = relationship("TranslationConfig", back_populates="user", cascade="all, delete-orphan")
def __repr__(self):
return f"<User(id={self.id}, username='{self.username}', email='{self.email}')>"

7
backend/app/routers/__init__.py Normal file
View File

@@ -0,0 +1,7 @@
"""
Tool_OCR - API Routers
"""
from app.routers import auth, ocr, export, translation
__all__ = ["auth", "ocr", "export", "translation"]

70
backend/app/routers/auth.py Normal file
View File

@@ -0,0 +1,70 @@
"""
Tool_OCR - Authentication Router
JWT login endpoint
"""
from datetime import timedelta
import logging
from fastapi import APIRouter, Depends, HTTPException, status
from sqlalchemy.orm import Session
from app.core.config import settings
from app.core.deps import get_db
from app.core.security import verify_password, create_access_token
from app.models.user import User
from app.schemas.auth import LoginRequest, Token
logger = logging.getLogger(__name__)
router = APIRouter(prefix="/api/v1/auth", tags=["Authentication"])
@router.post("/login", response_model=Token, summary="User login")
async def login(
login_data: LoginRequest,
db: Session = Depends(get_db)
):
"""
User login with username and password
Returns JWT access token for authentication
- **username**: User's username
- **password**: User's password
"""
# Query user by username
user = db.query(User).filter(User.username == login_data.username).first()
# Verify user exists and password is correct
if not user or not verify_password(login_data.password, user.password_hash):
logger.warning(f"Failed login attempt for username: {login_data.username}")
raise HTTPException(
status_code=status.HTTP_401_UNAUTHORIZED,
detail="Incorrect username or password",
headers={"WWW-Authenticate": "Bearer"},
)
# Check if user is active
if not user.is_active:
logger.warning(f"Inactive user login attempt: {login_data.username}")
raise HTTPException(
status_code=status.HTTP_403_FORBIDDEN,
detail="User account is inactive"
)
# Create access token
access_token_expires = timedelta(minutes=settings.access_token_expire_minutes)
access_token = create_access_token(
data={"sub": str(user.id), "username": user.username},
expires_delta=access_token_expires
)
logger.info(f"Successful login: {user.username} (ID: {user.id})")
return {
"access_token": access_token,
"token_type": "bearer",
"expires_in": settings.access_token_expire_minutes * 60 # Convert to seconds
}
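A client-side sketch of the login flow (illustrative; assumes the httpx package and a seeded user):

import httpx

resp = httpx.post(
    "http://localhost:12010/api/v1/auth/login",
    json={"username": "admin", "password": "change-me"},  # hypothetical credentials
)
resp.raise_for_status()
token = resp.json()["access_token"]

# Subsequent requests carry the bearer token:
headers = {"Authorization": f"Bearer {token}"}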

338
backend/app/routers/export.py Normal file
View File

@@ -0,0 +1,338 @@
"""
Tool_OCR - Export Router
Export results in multiple formats
"""
import logging
from typing import List
from pathlib import Path
from fastapi import APIRouter, Depends, HTTPException, status
from fastapi.responses import FileResponse
from sqlalchemy.orm import Session
from app.core.deps import get_db, get_current_active_user
from app.models.user import User
from app.models.ocr import OCRBatch, OCRFile, OCRResult, FileStatus
from app.models.export import ExportRule
from app.schemas.export import (
ExportRequest,
ExportRuleCreate,
ExportRuleUpdate,
ExportRuleResponse,
CSSTemplateResponse,
)
from app.services.export_service import ExportService, ExportError
from app.services.pdf_generator import PDFGenerator
logger = logging.getLogger(__name__)
router = APIRouter(prefix="/api/v1/export", tags=["Export"])
# Initialize services
export_service = ExportService()
pdf_generator = PDFGenerator()
@router.post("", summary="Export OCR results")
async def export_results(
request: ExportRequest,
db: Session = Depends(get_db),
current_user: User = Depends(get_current_active_user)
):
"""
Export OCR results in specified format
    Supports batch export formats: txt, json, excel, markdown, zip
    (layout-preserved PDF for a single file is served by GET /api/v1/export/pdf/{file_id})
    - **batch_id**: Batch ID to export
    - **format**: Export format (txt, json, excel, markdown, zip)
- **rule_id**: Optional export rule ID to apply filters
- **css_template**: CSS template for PDF export (default, academic, business)
- **include_formats**: Formats to include in ZIP export
"""
# Verify batch ownership
batch = db.query(OCRBatch).filter(
OCRBatch.id == request.batch_id,
OCRBatch.user_id == current_user.id
).first()
if not batch:
raise HTTPException(
status_code=status.HTTP_404_NOT_FOUND,
detail="Batch not found"
)
# Get completed results
results = db.query(OCRResult).join(OCRFile).filter(
OCRFile.batch_id == request.batch_id,
OCRFile.status == FileStatus.COMPLETED
).all()
if not results:
raise HTTPException(
status_code=status.HTTP_404_NOT_FOUND,
detail="No completed results found for this batch"
)
# Apply export rule if specified
if request.rule_id:
try:
results = export_service.apply_export_rule(db, results, request.rule_id)
except ExportError as e:
raise HTTPException(
status_code=status.HTTP_404_NOT_FOUND,
detail=str(e)
)
try:
# Generate export based on format
export_dir = Path(f"uploads/batches/{batch.id}/exports")
export_dir.mkdir(parents=True, exist_ok=True)
if request.format == "txt":
output_path = export_dir / f"batch_{batch.id}_export.txt"
export_service.export_to_txt(results, output_path)
elif request.format == "json":
output_path = export_dir / f"batch_{batch.id}_export.json"
export_service.export_to_json(results, output_path)
elif request.format == "excel":
output_path = export_dir / f"batch_{batch.id}_export.xlsx"
export_service.export_to_excel(results, output_path)
elif request.format == "markdown":
output_path = export_dir / f"batch_{batch.id}_export.md"
export_service.export_to_markdown(results, output_path, combine=True)
elif request.format == "zip":
output_path = export_dir / f"batch_{batch.id}_export.zip"
include_formats = request.include_formats or ["markdown", "json"]
export_service.export_batch_to_zip(db, batch.id, output_path, include_formats)
else:
raise HTTPException(
status_code=status.HTTP_400_BAD_REQUEST,
detail=f"Unsupported export format: {request.format}"
)
logger.info(f"Exported batch {batch.id} to {request.format} format: {output_path}")
# Return file for download
return FileResponse(
path=str(output_path),
filename=output_path.name,
media_type="application/octet-stream"
)
except ExportError as e:
logger.error(f"Export error: {e}")
raise HTTPException(
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
detail=str(e)
)
except Exception as e:
logger.error(f"Unexpected export error: {e}")
raise HTTPException(
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
detail="Export failed"
)
@router.get("/pdf/{file_id}", summary="Generate PDF for single file")
async def generate_pdf(
file_id: int,
css_template: str = "default",
db: Session = Depends(get_db),
current_user: User = Depends(get_current_active_user)
):
"""
Generate layout-preserved PDF for a single file
- **file_id**: File ID
- **css_template**: CSS template (default, academic, business)
"""
# Get file and verify ownership
ocr_file = db.query(OCRFile).join(OCRBatch).filter(
OCRFile.id == file_id,
OCRBatch.user_id == current_user.id
).first()
if not ocr_file:
raise HTTPException(
status_code=status.HTTP_404_NOT_FOUND,
detail="File not found"
)
# Get result
result = db.query(OCRResult).filter(OCRResult.file_id == file_id).first()
if not result:
raise HTTPException(
status_code=status.HTTP_404_NOT_FOUND,
detail="OCR result not found"
)
try:
# Generate PDF
export_dir = Path(f"uploads/batches/{ocr_file.batch_id}/exports")
export_dir.mkdir(parents=True, exist_ok=True)
output_path = export_dir / f"file_{file_id}_export.pdf"
export_service.export_to_pdf(
result=result,
output_path=output_path,
css_template=css_template,
metadata={"title": ocr_file.original_filename}
)
logger.info(f"Generated PDF for file {file_id}: {output_path}")
return FileResponse(
path=str(output_path),
filename=f"{Path(ocr_file.original_filename).stem}.pdf",
media_type="application/pdf"
)
except ExportError as e:
logger.error(f"PDF generation error: {e}")
raise HTTPException(
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
detail=str(e)
)
@router.get("/rules", response_model=List[ExportRuleResponse], summary="List export rules")
async def list_export_rules(
db: Session = Depends(get_db),
current_user: User = Depends(get_current_active_user)
):
"""
List all export rules for current user
Returns list of saved export rules
"""
rules = db.query(ExportRule).filter(ExportRule.user_id == current_user.id).all()
return rules
@router.post("/rules", response_model=ExportRuleResponse, summary="Create export rule")
async def create_export_rule(
rule: ExportRuleCreate,
db: Session = Depends(get_db),
current_user: User = Depends(get_current_active_user)
):
"""
Create new export rule
Saves custom export configuration for reuse
- **rule_name**: Rule name
- **description**: Optional description
- **config_json**: Rule configuration (filters, formatting, export_options)
- **css_template**: Optional custom CSS for PDF export
"""
# Create rule
new_rule = ExportRule(
user_id=current_user.id,
rule_name=rule.rule_name,
description=rule.description,
config_json=rule.config_json,
css_template=rule.css_template
)
db.add(new_rule)
db.commit()
db.refresh(new_rule)
logger.info(f"Created export rule {new_rule.id} for user {current_user.id}")
return new_rule
@router.put("/rules/{rule_id}", response_model=ExportRuleResponse, summary="Update export rule")
async def update_export_rule(
rule_id: int,
rule: ExportRuleUpdate,
db: Session = Depends(get_db),
current_user: User = Depends(get_current_active_user)
):
"""
Update existing export rule
- **rule_id**: Rule ID to update
- **rule_name**: Optional new rule name
- **description**: Optional new description
- **config_json**: Optional new configuration
- **css_template**: Optional new CSS template
"""
# Get rule and verify ownership
db_rule = db.query(ExportRule).filter(
ExportRule.id == rule_id,
ExportRule.user_id == current_user.id
).first()
if not db_rule:
raise HTTPException(
status_code=status.HTTP_404_NOT_FOUND,
detail="Export rule not found"
)
# Update fields
update_data = rule.dict(exclude_unset=True)
for field, value in update_data.items():
setattr(db_rule, field, value)
db.commit()
db.refresh(db_rule)
logger.info(f"Updated export rule {rule_id}")
return db_rule
@router.delete("/rules/{rule_id}", summary="Delete export rule")
async def delete_export_rule(
rule_id: int,
db: Session = Depends(get_db),
current_user: User = Depends(get_current_active_user)
):
"""
Delete export rule
- **rule_id**: Rule ID to delete
"""
# Get rule and verify ownership
db_rule = db.query(ExportRule).filter(
ExportRule.id == rule_id,
ExportRule.user_id == current_user.id
).first()
if not db_rule:
raise HTTPException(
status_code=status.HTTP_404_NOT_FOUND,
detail="Export rule not found"
)
db.delete(db_rule)
db.commit()
logger.info(f"Deleted export rule {rule_id}")
return {"message": "Export rule deleted successfully"}
@router.get("/css-templates", response_model=List[CSSTemplateResponse], summary="List CSS templates")
async def list_css_templates():
"""
List available CSS templates for PDF generation
Returns list of predefined CSS templates with descriptions
"""
templates = pdf_generator.get_available_templates()
return [
{"name": name, "description": desc}
for name, desc in templates.items()
]
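A batch-export call sketch (illustrative; the token would come from the login flow shown earlier):

import httpx

token = "eyJ..."  # obtained from POST /api/v1/auth/login
resp = httpx.post(
    "http://localhost:12010/api/v1/export",
    headers={"Authorization": f"Bearer {token}"},
    json={"batch_id": 1, "format": "markdown"},
)
resp.raise_for_status()
with open("batch_1_export.md", "wb") as f:
    f.write(resp.content)  # the endpoint streams the file back via FileResponse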

244
backend/app/routers/ocr.py Normal file
View File

@@ -0,0 +1,244 @@
"""
Tool_OCR - OCR Router
File upload, OCR processing, and status endpoints
"""
import logging
from typing import List, Optional
from pathlib import Path
from fastapi import APIRouter, Depends, HTTPException, status, UploadFile, File, BackgroundTasks
from sqlalchemy.orm import Session
from app.core.database import SessionLocal  # session factory for background tasks
from app.core.deps import get_db, get_current_active_user
from app.models.user import User
from app.models.ocr import OCRBatch, OCRFile, OCRResult, BatchStatus, FileStatus
from app.schemas.ocr import (
OCRBatchResponse,
BatchStatusResponse,
FileStatusResponse,
OCRResultDetailResponse,
UploadBatchResponse,
ProcessRequest,
ProcessResponse,
)
from app.services.file_manager import FileManager, FileManagementError
from app.services.ocr_service import OCRService
from app.services.background_tasks import process_batch_files_with_retry
logger = logging.getLogger(__name__)
router = APIRouter(prefix="/api/v1", tags=["OCR"])
# Initialize services
file_manager = FileManager()
ocr_service = OCRService()
@router.post("/upload", response_model=UploadBatchResponse, summary="Upload files for OCR")
async def upload_files(
files: List[UploadFile] = File(..., description="Files to upload (PNG, JPG, PDF)"),
    batch_name: Optional[str] = None,
db: Session = Depends(get_db),
current_user: User = Depends(get_current_active_user)
):
"""
Upload files for OCR processing
Creates a new batch and uploads files to it
- **files**: List of files to upload (PNG, JPG, JPEG, PDF)
- **batch_name**: Optional name for the batch
"""
if not files:
raise HTTPException(
status_code=status.HTTP_400_BAD_REQUEST,
detail="No files provided"
)
try:
# Create batch
batch = file_manager.create_batch(db, current_user.id, batch_name)
# Upload files
uploaded_files = file_manager.add_files_to_batch(db, batch.id, files)
logger.info(f"Uploaded {len(uploaded_files)} files to batch {batch.id} for user {current_user.id}")
# Refresh batch to get updated counts
db.refresh(batch)
# Return response matching frontend expectations
return {
"batch_id": batch.id,
"files": uploaded_files
}
except FileManagementError as e:
logger.error(f"File upload error: {e}")
raise HTTPException(
status_code=status.HTTP_400_BAD_REQUEST,
detail=str(e)
)
except Exception as e:
logger.error(f"Unexpected error during upload: {e}")
raise HTTPException(
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
detail="Failed to upload files"
)
# NOTE: process_batch_files function moved to app.services.background_tasks
# Now using process_batch_files_with_retry with retry logic
@router.post("/ocr/process", response_model=ProcessResponse, summary="Trigger OCR processing")
async def process_ocr(
request: ProcessRequest,
background_tasks: BackgroundTasks,
db: Session = Depends(get_db),
current_user: User = Depends(get_current_active_user)
):
"""
Trigger OCR processing for a batch
Starts background processing of all files in the batch
- **batch_id**: Batch ID to process
- **lang**: Language code (ch, en, japan, korean)
- **detect_layout**: Enable layout detection
"""
# Verify batch ownership
batch = db.query(OCRBatch).filter(
OCRBatch.id == request.batch_id,
OCRBatch.user_id == current_user.id
).first()
if not batch:
raise HTTPException(
status_code=status.HTTP_404_NOT_FOUND,
detail="Batch not found"
)
if batch.status != BatchStatus.PENDING:
raise HTTPException(
status_code=status.HTTP_400_BAD_REQUEST,
detail=f"Batch is already {batch.status.value}"
)
# Start background processing with retry logic
background_tasks.add_task(
process_batch_files_with_retry,
batch_id=batch.id,
lang=request.lang,
detect_layout=request.detect_layout,
db=SessionLocal() # Create new session for background task
)
logger.info(f"Started OCR processing for batch {batch.id}")
return {
"message": "OCR processing started",
"batch_id": batch.id,
"total_files": batch.total_files,
"status": "processing"
}
@router.get("/batch/{batch_id}/status", response_model=BatchStatusResponse, summary="Get batch status")
async def get_batch_status(
batch_id: int,
db: Session = Depends(get_db),
current_user: User = Depends(get_current_active_user)
):
"""
Get batch processing status
Returns batch information and all files in the batch
- **batch_id**: Batch ID
"""
# Verify batch ownership
batch = db.query(OCRBatch).filter(
OCRBatch.id == batch_id,
OCRBatch.user_id == current_user.id
).first()
if not batch:
raise HTTPException(
status_code=status.HTTP_404_NOT_FOUND,
detail="Batch not found"
)
# Get all files in batch
files = db.query(OCRFile).filter(OCRFile.batch_id == batch_id).all()
return {
"batch": batch,
"files": files
}
@router.get("/ocr/result/{file_id}", response_model=OCRResultDetailResponse, summary="Get OCR result")
async def get_ocr_result(
file_id: int,
db: Session = Depends(get_db),
current_user: User = Depends(get_current_active_user)
):
"""
Get OCR result for a file
Returns flattened file and OCR result information for frontend preview
- **file_id**: File ID
"""
# Get file
ocr_file = db.query(OCRFile).join(OCRBatch).filter(
OCRFile.id == file_id,
OCRBatch.user_id == current_user.id
).first()
if not ocr_file:
raise HTTPException(
status_code=status.HTTP_404_NOT_FOUND,
detail="File not found"
)
# Get result if exists
result = db.query(OCRResult).filter(OCRResult.file_id == file_id).first()
# Read markdown content if result exists
markdown_content = None
if result and result.markdown_path:
markdown_file = Path(result.markdown_path)
if markdown_file.exists():
try:
markdown_content = markdown_file.read_text(encoding='utf-8')
except Exception as e:
logger.warning(f"Failed to read markdown file {result.markdown_path}: {e}")
# Build JSON data from result if available
json_data = None
if result:
json_data = {
"total_text_regions": result.total_text_regions,
"average_confidence": result.average_confidence,
"detected_language": result.detected_language,
"layout_data": result.layout_data,
"images_metadata": result.images_metadata,
}
# Return flattened structure matching frontend expectations
return {
"file_id": ocr_file.id,
"filename": ocr_file.filename,
"status": ocr_file.status.value,
"markdown_content": markdown_content,
"json_data": json_data,
"confidence": result.average_confidence if result else None,
"processing_time": ocr_file.processing_time,
}
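An end-to-end upload → process → poll sketch for these endpoints (illustrative):

import time

import httpx

base = "http://localhost:12010/api/v1"
auth = {"Authorization": "Bearer eyJ..."}  # token from /auth/login

with open("document.png", "rb") as f:  # hypothetical input file
    up = httpx.post(f"{base}/upload", headers=auth,
                    files=[("files", ("document.png", f, "image/png"))])
batch_id = up.json()["batch_id"]

httpx.post(f"{base}/ocr/process", headers=auth,
           json={"batch_id": batch_id, "lang": "ch", "detect_layout": True})

while True:  # poll until the batch reaches a terminal status
    status = httpx.get(f"{base}/batch/{batch_id}/status", headers=auth).json()
    if status["batch"]["status"] in ("completed", "partial", "failed"):
        break
    time.sleep(2)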

189
backend/app/routers/translation.py Normal file
View File

@@ -0,0 +1,189 @@
"""
Tool_OCR - Translation Router (RESERVED)
Stub endpoints for future translation feature
"""
import logging
from typing import List
from fastapi import APIRouter, Depends, HTTPException, status
from sqlalchemy.orm import Session
from app.core.deps import get_db, get_current_active_user
from app.models.user import User
from app.schemas.translation import (
TranslationRequest,
TranslationResponse,
TranslationFeatureStatus,
LanguageInfo,
)
from app.services.translation_service import StubTranslationService
logger = logging.getLogger(__name__)
router = APIRouter(prefix="/api/v1/translate", tags=["Translation (RESERVED)"])
@router.get("/status", response_model=TranslationFeatureStatus, summary="Get translation feature status")
async def get_translation_status():
"""
Get translation feature status
Returns current implementation status and roadmap for translation feature.
This is a RESERVED feature that will be implemented in Phase 5.
**Status**: RESERVED - Not yet implemented
**Phase**: Phase 5 (Post-production)
**Priority**: Implemented after production deployment and user feedback
"""
return StubTranslationService.get_feature_status()
@router.get("/languages", response_model=List[LanguageInfo], summary="Get supported languages")
async def get_supported_languages():
"""
Get list of languages planned for translation support
Returns list of languages that will be supported when translation
feature is implemented.
**Status**: RESERVED - Planning phase
"""
return StubTranslationService.get_supported_languages()
@router.post("/document", response_model=TranslationResponse, summary="Translate document (RESERVED)")
async def translate_document(
request: TranslationRequest,
db: Session = Depends(get_db),
current_user: User = Depends(get_current_active_user)
):
"""
Translate OCR document (RESERVED - NOT IMPLEMENTED)
This endpoint is reserved for future translation functionality.
Returns 501 Not Implemented status.
**Expected Functionality** (when implemented):
- Translate markdown documents while preserving structure
- Support multiple translation engines (offline, ERNIE, Google, DeepL)
- Maintain layout and formatting
- Handle technical terminology
**Planned Features**:
- Offline translation (Argos Translate)
- Cloud API integration (ERNIE, Google, DeepL)
- Batch translation support
- Translation memory
- Glossary support
**Current Status**: RESERVED for Phase 5 implementation
---
**Request Parameters** (planned):
- **file_id**: ID of OCR result file to translate
- **source_lang**: Source language code (zh, en, ja, ko)
- **target_lang**: Target language code (zh, en, ja, ko)
- **engine_type**: Translation engine (offline, ernie, google, deepl)
- **preserve_structure**: Whether to preserve markdown structure
- **engine_config**: Engine-specific configuration
**Response** (planned):
- **task_id**: Translation task ID for tracking progress
- **status**: Translation status
- **translated_file_path**: Path to translated file (when completed)
"""
logger.info(f"Translation request received from user {current_user.id} (stub endpoint)")
# Return 501 Not Implemented with informative message
raise HTTPException(
status_code=status.HTTP_501_NOT_IMPLEMENTED,
detail={
"error": "Translation feature not implemented",
"message": "This feature is reserved for future development (Phase 5)",
"status": "RESERVED",
"roadmap": {
"phase": "Phase 5",
"priority": "Implemented after production deployment",
"planned_features": [
"Offline translation (Argos Translate)",
"Cloud API integration (ERNIE, Google, DeepL)",
"Structure-preserving markdown translation",
"Batch translation support"
]
},
"request_received": {
"file_id": request.file_id,
"source_lang": request.source_lang,
"target_lang": request.target_lang,
"engine_type": request.engine_type
},
"action": "Please check back in a future release or contact support for updates"
}
)
@router.get("/task/{task_id}", summary="Get translation task status (RESERVED)")
async def get_translation_task_status(
task_id: int,
db: Session = Depends(get_db),
current_user: User = Depends(get_current_active_user)
):
"""
Get translation task status (RESERVED - NOT IMPLEMENTED)
This endpoint would track translation task progress.
Returns 501 Not Implemented status.
**Planned Functionality**:
- Real-time translation progress
- Status updates (pending, processing, completed, failed)
- Estimated completion time
- Error reporting
**Current Status**: RESERVED for Phase 5 implementation
"""
logger.info(f"Translation status check for task {task_id} from user {current_user.id} (stub endpoint)")
raise HTTPException(
status_code=status.HTTP_501_NOT_IMPLEMENTED,
detail={
"error": "Translation feature not implemented",
"message": "Translation task tracking is reserved for Phase 5",
"task_id": task_id,
"status": "RESERVED"
}
)
@router.delete("/task/{task_id}", summary="Cancel translation task (RESERVED)")
async def cancel_translation_task(
task_id: int,
db: Session = Depends(get_db),
current_user: User = Depends(get_current_active_user)
):
"""
Cancel ongoing translation task (RESERVED - NOT IMPLEMENTED)
This endpoint would allow cancellation of translation tasks.
Returns 501 Not Implemented status.
**Planned Functionality**:
- Cancel in-progress translations
- Clean up temporary files
- Refund credits (if applicable)
**Current Status**: RESERVED for Phase 5 implementation
"""
logger.info(f"Translation cancellation request for task {task_id} from user {current_user.id} (stub endpoint)")
raise HTTPException(
status_code=status.HTTP_501_NOT_IMPLEMENTED,
detail={
"error": "Translation feature not implemented",
"message": "This feature is reserved for Phase 5",
"status": "RESERVED"
}
)

59
backend/app/schemas/__init__.py Normal file
View File

@@ -0,0 +1,59 @@
"""
Tool_OCR - API Schemas
Pydantic models for request/response validation
"""
from app.schemas.auth import Token, TokenData, LoginRequest
from app.schemas.user import UserBase, UserCreate, UserResponse
from app.schemas.ocr import (
OCRBatchResponse,
OCRFileResponse,
OCRResultResponse,
BatchStatusResponse,
FileStatusResponse,
ProcessRequest,
ProcessResponse,
)
from app.schemas.export import (
ExportRequest,
ExportRuleCreate,
ExportRuleUpdate,
ExportRuleResponse,
CSSTemplateResponse,
)
from app.schemas.translation import (
TranslationRequest,
TranslationResponse,
TranslationFeatureStatus,
LanguageInfo,
)
__all__ = [
# Auth
"Token",
"TokenData",
"LoginRequest",
# User
"UserBase",
"UserCreate",
"UserResponse",
# OCR
"OCRBatchResponse",
"OCRFileResponse",
"OCRResultResponse",
"BatchStatusResponse",
"FileStatusResponse",
"ProcessRequest",
"ProcessResponse",
# Export
"ExportRequest",
"ExportRuleCreate",
"ExportRuleUpdate",
"ExportRuleResponse",
"CSSTemplateResponse",
# Translation (RESERVED)
"TranslationRequest",
"TranslationResponse",
"TranslationFeatureStatus",
"LanguageInfo",
]

42
backend/app/schemas/auth.py Normal file
View File

@@ -0,0 +1,42 @@
"""
Tool_OCR - Authentication Schemas
"""
from typing import Optional
from pydantic import BaseModel, Field
class LoginRequest(BaseModel):
"""Login request schema"""
username: str = Field(..., min_length=3, max_length=50, description="Username")
password: str = Field(..., min_length=6, description="Password")
class Config:
json_schema_extra = {
"example": {
"username": "admin",
"password": "password123"
}
}
class Token(BaseModel):
"""JWT token response schema"""
access_token: str = Field(..., description="JWT access token")
token_type: str = Field(default="bearer", description="Token type")
expires_in: int = Field(..., description="Token expiration time in seconds")
class Config:
json_schema_extra = {
"example": {
"access_token": "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9...",
"token_type": "bearer",
"expires_in": 3600
}
}
class TokenData(BaseModel):
"""Token payload data"""
user_id: Optional[int] = None
username: Optional[str] = None

104
backend/app/schemas/export.py Normal file
View File

@@ -0,0 +1,104 @@
"""
Tool_OCR - Export Schemas
"""
from datetime import datetime
from typing import Optional, Dict, Any, List
from pydantic import BaseModel, Field
class ExportOptions(BaseModel):
"""Export options schema"""
confidence_threshold: Optional[float] = Field(None, description="Minimum confidence threshold")
include_metadata: Optional[bool] = Field(True, description="Include metadata in export")
filename_pattern: Optional[str] = Field(None, description="Filename pattern for export")
css_template: Optional[str] = Field(None, description="CSS template for PDF export")
class ExportRequest(BaseModel):
"""Export request schema"""
batch_id: int = Field(..., description="Batch ID to export")
    format: str = Field(..., description="Export format (txt, json, excel, markdown, zip)")
rule_id: Optional[int] = Field(None, description="Optional export rule ID to apply")
css_template: Optional[str] = Field("default", description="CSS template for PDF export")
include_formats: Optional[List[str]] = Field(None, description="Formats to include in ZIP export")
options: Optional[ExportOptions] = Field(None, description="Additional export options")
class Config:
json_schema_extra = {
"example": {
"batch_id": 1,
"format": "pdf",
"rule_id": None,
"css_template": "default",
"include_formats": ["markdown", "json"],
"options": {
"confidence_threshold": 0.8,
"include_metadata": True
}
}
}
class ExportRuleCreate(BaseModel):
"""Export rule creation schema"""
rule_name: str = Field(..., max_length=100, description="Rule name")
description: Optional[str] = Field(None, description="Rule description")
config_json: Dict[str, Any] = Field(..., description="Rule configuration as JSON")
css_template: Optional[str] = Field(None, description="Custom CSS template")
class Config:
json_schema_extra = {
"example": {
"rule_name": "High Confidence Only",
"description": "Export only results with confidence > 0.8",
"config_json": {
"filters": {
"confidence_threshold": 0.8
},
"formatting": {
"add_line_numbers": True
}
},
"css_template": None
}
}
class ExportRuleUpdate(BaseModel):
"""Export rule update schema"""
rule_name: Optional[str] = Field(None, max_length=100)
description: Optional[str] = None
config_json: Optional[Dict[str, Any]] = None
css_template: Optional[str] = None
class ExportRuleResponse(BaseModel):
"""Export rule response schema"""
id: int
user_id: int
rule_name: str
description: Optional[str] = None
config_json: Dict[str, Any]
css_template: Optional[str] = None
created_at: datetime
updated_at: datetime
class Config:
from_attributes = True
class CSSTemplateResponse(BaseModel):
"""CSS template response schema"""
name: str = Field(..., description="Template name")
description: str = Field(..., description="Template description")
filename: str = Field(..., description="Template filename")
class Config:
json_schema_extra = {
"example": {
"name": "default",
"description": "通用排版模板,適合大多數文檔",
"filename": "default.css"
}
}

151
backend/app/schemas/ocr.py Normal file
View File

@@ -0,0 +1,151 @@
"""
Tool_OCR - OCR Schemas
"""
from datetime import datetime
from typing import Optional, Dict, List, Any
from pydantic import BaseModel, Field
from app.models.ocr import BatchStatus, FileStatus
class OCRFileResponse(BaseModel):
"""OCR file response schema"""
id: int
batch_id: int
filename: str
original_filename: str
file_size: int
file_format: str
status: FileStatus
error: Optional[str] = Field(None, validation_alias='error_message') # Map from error_message to error
created_at: datetime
processing_time: Optional[float] = None
class Config:
from_attributes = True
populate_by_name = True
class OCRResultResponse(BaseModel):
"""OCR result response schema"""
id: int
file_id: int
markdown_path: Optional[str] = None
markdown_content: Optional[str] = None # Added for frontend preview
json_path: Optional[str] = None
images_dir: Optional[str] = None
detected_language: Optional[str] = None
total_text_regions: int
average_confidence: Optional[float] = None
layout_data: Optional[Dict[str, Any]] = None
images_metadata: Optional[List[Dict[str, Any]]] = None
created_at: datetime
class Config:
from_attributes = True
class OCRBatchResponse(BaseModel):
"""OCR batch response schema"""
id: int
user_id: int
batch_name: Optional[str] = None
status: BatchStatus
total_files: int
completed_files: int
failed_files: int
progress_percentage: float
created_at: datetime
started_at: Optional[datetime] = None
completed_at: Optional[datetime] = None
class Config:
from_attributes = True
class BatchStatusResponse(BaseModel):
"""Batch status with file details"""
batch: OCRBatchResponse
files: List[OCRFileResponse]
class FileStatusResponse(BaseModel):
"""File status with result details"""
file: OCRFileResponse
result: Optional[OCRResultResponse] = None
class OCRResultDetailResponse(BaseModel):
"""OCR result detail response for frontend preview - flattened structure"""
file_id: int
filename: str
status: str
markdown_content: Optional[str] = None
json_data: Optional[Dict[str, Any]] = None
confidence: Optional[float] = None
processing_time: Optional[float] = None
class Config:
from_attributes = True
class UploadBatchResponse(BaseModel):
"""Upload response schema matching frontend expectations"""
batch_id: int = Field(..., description="Batch ID")
files: List[OCRFileResponse] = Field(..., description="Uploaded files")
class Config:
json_schema_extra = {
"example": {
"batch_id": 1,
"files": [
{
"id": 1,
"batch_id": 1,
"filename": "doc_1.png",
"original_filename": "document.png",
"file_size": 1024000,
"file_format": "png",
"status": "pending",
"error_message": None,
"created_at": "2025-01-01T00:00:00",
"processing_time": None
}
]
}
}
class ProcessRequest(BaseModel):
"""OCR process request schema"""
batch_id: int = Field(..., description="Batch ID to process")
lang: str = Field(default="ch", description="Language code (ch, en, japan, korean)")
detect_layout: bool = Field(default=True, description="Enable layout detection")
class Config:
json_schema_extra = {
"example": {
"batch_id": 1,
"lang": "ch",
"detect_layout": True
}
}
class ProcessResponse(BaseModel):
"""OCR process response schema"""
message: str
batch_id: int
total_files: int
status: str
class Config:
json_schema_extra = {
"example": {
"message": "OCR processing started",
"batch_id": 1,
"total_files": 5,
"status": "processing"
}
}

124
backend/app/schemas/translation.py Normal file
View File

@@ -0,0 +1,124 @@
"""
Tool_OCR - Translation Schemas (RESERVED)
Request/response models for translation endpoints
"""
from typing import Optional, Dict, List, Any
from pydantic import BaseModel, Field
class TranslationRequest(BaseModel):
"""
Translation request schema (RESERVED)
Expected format for document translation requests
"""
file_id: int = Field(..., description="File ID to translate")
source_lang: str = Field(..., description="Source language code (zh, en, ja, ko)")
target_lang: str = Field(..., description="Target language code (zh, en, ja, ko)")
engine_type: Optional[str] = Field("offline", description="Translation engine (offline, ernie, google, deepl)")
preserve_structure: bool = Field(True, description="Preserve markdown structure")
engine_config: Optional[Dict[str, Any]] = Field(None, description="Engine-specific configuration")
class Config:
json_schema_extra = {
"example": {
"file_id": 1,
"source_lang": "zh",
"target_lang": "en",
"engine_type": "offline",
"preserve_structure": True,
"engine_config": {}
}
}
class TranslationResponse(BaseModel):
"""
Translation response schema (RESERVED)
Expected format for translation results
"""
task_id: int = Field(..., description="Translation task ID")
file_id: int
source_lang: str
target_lang: str
engine_type: str
status: str = Field(..., description="Translation status (pending, processing, completed, failed)")
translated_file_path: Optional[str] = Field(None, description="Path to translated markdown file")
progress: float = Field(0.0, description="Translation progress (0.0-1.0)")
error_message: Optional[str] = None
class Config:
json_schema_extra = {
"example": {
"task_id": 1,
"file_id": 1,
"source_lang": "zh",
"target_lang": "en",
"engine_type": "offline",
"status": "processing",
"translated_file_path": None,
"progress": 0.5,
"error_message": None
}
}
class TranslationStatusResponse(BaseModel):
"""Translation task status response (RESERVED)"""
task_id: int
status: str
progress: float
created_at: str
completed_at: Optional[str] = None
error_message: Optional[str] = None
class TranslationConfigRequest(BaseModel):
"""Translation configuration request (RESERVED)"""
source_lang: str = Field(..., max_length=20)
target_lang: str = Field(..., max_length=20)
engine_type: str = Field(..., max_length=50)
engine_config: Optional[Dict[str, Any]] = None
class Config:
json_schema_extra = {
"example": {
"source_lang": "zh",
"target_lang": "en",
"engine_type": "offline",
"engine_config": {
"model_path": "/path/to/model"
}
}
}
class TranslationConfigResponse(BaseModel):
"""Translation configuration response (RESERVED)"""
id: int
user_id: int
source_lang: str
target_lang: str
engine_type: str
engine_config: Optional[Dict[str, Any]] = None
created_at: str
updated_at: str
class TranslationFeatureStatus(BaseModel):
"""Translation feature status response"""
available: bool = Field(..., description="Whether translation is available")
status: str = Field(..., description="Feature status (reserved, planned, implemented)")
message: str = Field(..., description="Status message")
supported_engines: List[str] = Field(default_factory=list, description="Currently supported engines")
planned_engines: List[Dict[str, str]] = Field(default_factory=list, description="Planned engines")
roadmap: Dict[str, Any] = Field(default_factory=dict, description="Implementation roadmap")
class LanguageInfo(BaseModel):
"""Language information"""
code: str = Field(..., description="Language code (ISO 639-1)")
name: str = Field(..., description="Language name")
status: str = Field(..., description="Support status (planned, supported)")
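# --- Usage sketch (illustrative, not part of this module) ---
# Shows how a client payload would be validated against TranslationRequest.
# The field values are hypothetical, and model_dump() assumes Pydantic v2
# (consistent with the json_schema_extra style used above).
if __name__ == "__main__":
    req = TranslationRequest(
        file_id=1,
        source_lang="zh",
        target_lang="en",
        engine_type="offline",
    )
    # preserve_structure defaults to True; engine_config stays None
    print(req.model_dump())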

View File

@@ -0,0 +1,53 @@
"""
Tool_OCR - User Schemas
"""
from datetime import datetime
from typing import Optional
from pydantic import BaseModel, EmailStr, Field
class UserBase(BaseModel):
"""Base user schema"""
username: str = Field(..., min_length=3, max_length=50)
email: EmailStr
full_name: Optional[str] = Field(None, max_length=100)
class UserCreate(UserBase):
"""User creation schema"""
password: str = Field(..., min_length=6, description="Password (min 6 characters)")
class Config:
json_schema_extra = {
"example": {
"username": "johndoe",
"email": "john@example.com",
"full_name": "John Doe",
"password": "secret123"
}
}
class UserResponse(UserBase):
"""User response schema"""
id: int
is_active: bool
is_admin: bool
created_at: datetime
updated_at: datetime
class Config:
from_attributes = True
json_schema_extra = {
"example": {
"id": 1,
"username": "johndoe",
"email": "john@example.com",
"full_name": "John Doe",
"is_active": True,
"is_admin": False,
"created_at": "2025-01-01T00:00:00",
"updated_at": "2025-01-01T00:00:00"
}
}
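# --- Usage sketch (illustrative, not part of this module) ---
# Validates a registration payload; an invalid password (under 6 characters)
# would raise ValidationError. Values are hypothetical.
if __name__ == "__main__":
    from pydantic import ValidationError

    try:
        user = UserCreate(
            username="johndoe",
            email="john@example.com",
            full_name="John Doe",
            password="secret123",
        )
        print(user.model_dump(exclude={"password"}))
    except ValidationError as exc:
        print(exc.errors())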

View File

@@ -0,0 +1,3 @@
"""
Tool_OCR - Services Package
"""

View File

@@ -0,0 +1,394 @@
"""
Tool_OCR - Background Tasks Service
Handles async processing, cleanup, and scheduled tasks
"""
import logging
import asyncio
import time
from datetime import datetime, timedelta
from pathlib import Path
from typing import Optional, Callable, Any
from sqlalchemy.orm import Session
from app.core.database import SessionLocal
from app.models.ocr import OCRBatch, OCRFile, OCRResult, BatchStatus, FileStatus
from app.services.ocr_service import OCRService
from app.services.file_manager import FileManager
from app.services.pdf_generator import PDFGenerator
logger = logging.getLogger(__name__)
class BackgroundTaskManager:
"""
Manages background tasks including retry logic, cleanup, and scheduled jobs
"""
def __init__(
self,
max_retries: int = 3,
retry_delay: int = 5,
cleanup_interval: int = 3600, # 1 hour
file_retention_hours: int = 24
):
self.max_retries = max_retries
self.retry_delay = retry_delay
self.cleanup_interval = cleanup_interval
self.file_retention_hours = file_retention_hours
self.ocr_service = OCRService()
self.file_manager = FileManager()
self.pdf_generator = PDFGenerator()
async def execute_with_retry(
self,
func: Callable,
*args,
max_retries: Optional[int] = None,
retry_delay: Optional[int] = None,
**kwargs
) -> Any:
"""
Execute a function with retry logic
Args:
func: Function to execute
args: Positional arguments for func
max_retries: Maximum retry attempts (overrides default)
retry_delay: Delay between retries in seconds (overrides default)
kwargs: Keyword arguments for func
Returns:
Function result
Raises:
Exception: If all retries are exhausted
"""
max_retries = max_retries or self.max_retries
retry_delay = retry_delay or self.retry_delay
last_exception = None
for attempt in range(max_retries + 1):
try:
if asyncio.iscoroutinefunction(func):
return await func(*args, **kwargs)
else:
return func(*args, **kwargs)
except Exception as e:
last_exception = e
if attempt < max_retries:
logger.warning(
f"Attempt {attempt + 1}/{max_retries + 1} failed for {func.__name__}: {e}. "
f"Retrying in {retry_delay}s..."
)
await asyncio.sleep(retry_delay)
else:
logger.error(
f"All {max_retries + 1} attempts failed for {func.__name__}: {e}"
)
raise last_exception
def process_single_file_with_retry(
self,
ocr_file: OCRFile,
batch_id: int,
lang: str,
detect_layout: bool,
db: Session
) -> bool:
"""
Process a single file with retry logic
Args:
ocr_file: OCRFile instance
batch_id: Batch ID
lang: Language code
detect_layout: Whether to detect layout
db: Database session
Returns:
bool: True if successful, False otherwise
"""
for attempt in range(self.max_retries + 1):
try:
# Update file status
ocr_file.status = FileStatus.PROCESSING
ocr_file.started_at = datetime.utcnow()
ocr_file.retry_count = attempt
db.commit()
# Get file paths
file_path = Path(ocr_file.file_path)
paths = self.file_manager.get_file_paths(batch_id, ocr_file.id)
# Process OCR
result = self.ocr_service.process_image(
file_path,
lang=lang,
detect_layout=detect_layout
)
# Check if processing was successful
if result['status'] != 'success':
raise Exception(result.get('error_message', 'Unknown error during OCR processing'))
# Save results
json_path, markdown_path = self.ocr_service.save_results(
result=result,
output_dir=paths["output_dir"],
file_id=str(ocr_file.id)
)
# Extract data from result
text_regions = result.get('text_regions', [])
layout_data = result.get('layout_data')
images_metadata = result.get('images_metadata', [])
# Use the average confidence already computed by the OCR service
avg_confidence = result.get('average_confidence')
# Create OCR result record
ocr_result = OCRResult(
file_id=ocr_file.id,
markdown_path=str(markdown_path) if markdown_path else None,
json_path=str(json_path) if json_path else None,
images_dir=None, # Images dir not used in current implementation
detected_language=lang,
total_text_regions=len(text_regions),
average_confidence=avg_confidence,
layout_data=layout_data,
images_metadata=images_metadata
)
db.add(ocr_result)
# Update file status
ocr_file.status = FileStatus.COMPLETED
ocr_file.completed_at = datetime.utcnow()
ocr_file.processing_time = (ocr_file.completed_at - ocr_file.started_at).total_seconds()
db.commit()
logger.info(f"Successfully processed file {ocr_file.id} ({ocr_file.original_filename})")
return True
except Exception as e:
logger.error(f"Attempt {attempt + 1}/{self.max_retries + 1} failed for file {ocr_file.id}: {e}")
if attempt < self.max_retries:
# Wait before retry
time.sleep(self.retry_delay)
else:
# Final failure
ocr_file.status = FileStatus.FAILED
ocr_file.error_message = f"Failed after {self.max_retries + 1} attempts: {str(e)}"
ocr_file.completed_at = datetime.utcnow()
ocr_file.retry_count = attempt
db.commit()
return False
return False
async def cleanup_expired_files(self, db: Session):
"""
Clean up files and batches older than retention period
Args:
db: Database session
"""
try:
cutoff_time = datetime.utcnow() - timedelta(hours=self.file_retention_hours)
# Find expired batches
expired_batches = db.query(OCRBatch).filter(
OCRBatch.created_at < cutoff_time,
OCRBatch.status.in_([BatchStatus.COMPLETED, BatchStatus.FAILED, BatchStatus.PARTIAL])
).all()
logger.info(f"Found {len(expired_batches)} expired batches to clean up")
for batch in expired_batches:
try:
# Get batch directory
batch_dir = self.file_manager.base_upload_dir / "batches" / str(batch.id)
# Delete physical files
if batch_dir.exists():
import shutil
shutil.rmtree(batch_dir)
logger.info(f"Deleted batch directory: {batch_dir}")
# Delete database records
# Delete results first (foreign key constraint)
db.query(OCRResult).filter(
OCRResult.file_id.in_(
db.query(OCRFile.id).filter(OCRFile.batch_id == batch.id)
)
).delete(synchronize_session=False)
# Delete files
db.query(OCRFile).filter(OCRFile.batch_id == batch.id).delete()
# Delete batch
db.delete(batch)
db.commit()
logger.info(f"Cleaned up expired batch {batch.id}")
except Exception as e:
logger.error(f"Error cleaning up batch {batch.id}: {e}")
db.rollback()
except Exception as e:
logger.error(f"Error in cleanup_expired_files: {e}")
async def generate_pdf_background(
self,
result_id: int,
output_path: str,
css_template: str = "default",
db: Session = None
):
"""
Generate PDF in background with retry logic
Args:
result_id: OCR result ID
output_path: Output PDF path
css_template: CSS template name
db: Database session
"""
should_close_db = False
if db is None:
db = SessionLocal()
should_close_db = True
try:
# Get result
result = db.query(OCRResult).filter(OCRResult.id == result_id).first()
if not result:
logger.error(f"Result {result_id} not found")
return
# Generate PDF with retry
await self.execute_with_retry(
self.pdf_generator.generate_pdf,
markdown_path=result.markdown_path,
output_path=output_path,
css_template=css_template,
max_retries=2,
retry_delay=3
)
logger.info(f"Successfully generated PDF for result {result_id}: {output_path}")
except Exception as e:
logger.error(f"Failed to generate PDF for result {result_id}: {e}")
finally:
if should_close_db:
db.close()
async def start_cleanup_scheduler(self):
"""
Start periodic cleanup scheduler
Runs cleanup task at specified intervals
"""
logger.info(f"Starting cleanup scheduler (interval: {self.cleanup_interval}s, retention: {self.file_retention_hours}h)")
while True:
db = None
try:
db = SessionLocal()
await self.cleanup_expired_files(db)
except Exception as e:
logger.error(f"Error in cleanup scheduler: {e}")
finally:
# Always release the session, even if cleanup raised
if db is not None:
db.close()
# Wait for next interval
await asyncio.sleep(self.cleanup_interval)
# Global task manager instance
task_manager = BackgroundTaskManager()
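# --- Startup wiring sketch (illustrative) ---
# One way to run the periodic cleanup loop for the app's whole lifetime.
# The FastAPI lifespan hook below is an assumption about app wiring, not
# code from this repository.
if __name__ == "__main__":
    from contextlib import asynccontextmanager
    from fastapi import FastAPI

    @asynccontextmanager
    async def lifespan(app: FastAPI):
        cleanup = asyncio.create_task(task_manager.start_cleanup_scheduler())
        yield
        cleanup.cancel()

    app = FastAPI(lifespan=lifespan)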
def process_batch_files_with_retry(
batch_id: int,
lang: str,
detect_layout: bool,
db: Session
):
"""
Process all files in a batch with retry logic
Args:
batch_id: Batch ID
lang: Language code
detect_layout: Whether to detect layout
db: Database session
"""
try:
# Get batch
batch = db.query(OCRBatch).filter(OCRBatch.id == batch_id).first()
if not batch:
logger.error(f"Batch {batch_id} not found")
return
# Update batch status
batch.status = BatchStatus.PROCESSING
batch.started_at = datetime.utcnow()
db.commit()
# Get pending files
files = db.query(OCRFile).filter(
OCRFile.batch_id == batch_id,
OCRFile.status == FileStatus.PENDING
).all()
logger.info(f"Processing {len(files)} files in batch {batch_id} with retry logic")
# Process each file with retry
for ocr_file in files:
success = task_manager.process_single_file_with_retry(
ocr_file=ocr_file,
batch_id=batch_id,
lang=lang,
detect_layout=detect_layout,
db=db
)
# Update batch progress
if success:
batch.completed_files += 1
else:
batch.failed_files += 1
db.commit()
# Update batch final status
if batch.failed_files == 0:
batch.status = BatchStatus.COMPLETED
elif batch.completed_files > 0:
batch.status = BatchStatus.PARTIAL
else:
batch.status = BatchStatus.FAILED
batch.completed_at = datetime.utcnow()
db.commit()
logger.info(
f"Batch {batch_id} processing complete: "
f"{batch.completed_files} succeeded, {batch.failed_files} failed"
)
except Exception as e:
logger.error(f"Fatal error processing batch {batch_id}: {e}")
try:
batch = db.query(OCRBatch).filter(OCRBatch.id == batch_id).first()
if batch:
batch.status = BatchStatus.FAILED
batch.completed_at = datetime.utcnow()
db.commit()
except Exception as commit_error:
logger.error(f"Error updating batch status: {commit_error}")

View File

@@ -0,0 +1,512 @@
"""
Tool_OCR - Export Service
Handles OCR result export in multiple formats with filtering and formatting rules
"""
import json
import logging
import zipfile
from pathlib import Path
from typing import List, Dict, Optional, Any
from datetime import datetime
import pandas as pd
from sqlalchemy.orm import Session
from app.core.config import settings
from app.models.ocr import OCRBatch, OCRFile, OCRResult, FileStatus
from app.models.export import ExportRule
from app.services.pdf_generator import PDFGenerator, PDFGenerationError
logger = logging.getLogger(__name__)
class ExportError(Exception):
"""Exception raised for export errors"""
pass
class ExportService:
"""
Export service for OCR results
Supported formats:
- TXT: Plain text export
- JSON: Full metadata export
- Excel: Tabular data export
- Markdown: Direct Markdown export
- PDF: Layout-preserved PDF export
- ZIP: Batch export archive
"""
def __init__(self):
"""Initialize export service"""
self.pdf_generator = PDFGenerator()
def apply_filters(
self,
results: List[OCRResult],
filters: Dict[str, Any]
) -> List[OCRResult]:
"""
Apply filters to OCR results
Args:
results: List of OCR results
filters: Filter configuration
- confidence_threshold: Minimum confidence (0.0-1.0)
- filename_pattern: Glob pattern for filename matching
- language: Filter by detected language
Returns:
List[OCRResult]: Filtered results
"""
filtered = results
# Confidence threshold filter
if "confidence_threshold" in filters:
threshold = filters["confidence_threshold"]
filtered = [r for r in filtered if r.average_confidence and r.average_confidence >= threshold]
# Filename pattern filter (using simple substring match)
if "filename_pattern" in filters:
pattern = filters["filename_pattern"].lower()
filtered = [
r for r in filtered
if pattern in r.file.original_filename.lower()
]
# Language filter
if "language" in filters:
lang = filters["language"]
filtered = [r for r in filtered if r.detected_language == lang]
return filtered
def export_to_txt(
self,
results: List[OCRResult],
output_path: Path,
formatting: Optional[Dict] = None
) -> Path:
"""
Export results to plain text file
Args:
results: List of OCR results
output_path: Output file path
formatting: Formatting options
- add_line_numbers: Add line numbers
- group_by_filename: Group text by source file
- include_metadata: Add file metadata headers
Returns:
Path: Output file path
Raises:
ExportError: If export fails
"""
try:
formatting = formatting or {}
output_lines = []
for idx, result in enumerate(results, 1):
# Read Markdown file
if not result.markdown_path or not Path(result.markdown_path).exists():
logger.warning(f"Markdown file not found for result {result.id}")
continue
markdown_content = Path(result.markdown_path).read_text(encoding="utf-8")
# Add metadata header if requested
if formatting.get("include_metadata", False):
output_lines.append(f"=" * 80)
output_lines.append(f"文件: {result.file.original_filename}")
output_lines.append(f"語言: {result.detected_language or '未知'}")
output_lines.append(f"信心度: {result.average_confidence:.2%}" if result.average_confidence else "信心度: N/A")
output_lines.append(f"=" * 80)
output_lines.append("")
# Add content with optional line numbers
if formatting.get("add_line_numbers", False):
for line_num, line in enumerate(markdown_content.split('\n'), 1):
output_lines.append(f"{line_num:4d} | {line}")
else:
output_lines.append(markdown_content)
# Add separator between files if grouping
if formatting.get("group_by_filename", False) and idx < len(results):
output_lines.append("\n" + "-" * 80 + "\n")
# Write to file
output_path.parent.mkdir(parents=True, exist_ok=True)
output_path.write_text("\n".join(output_lines), encoding="utf-8")
logger.info(f"Exported {len(results)} results to TXT: {output_path}")
return output_path
except Exception as e:
raise ExportError(f"TXT export failed: {str(e)}")
def export_to_json(
self,
results: List[OCRResult],
output_path: Path,
include_layout: bool = True,
include_images: bool = True
) -> Path:
"""
Export results to JSON file with full metadata
Args:
results: List of OCR results
output_path: Output file path
include_layout: Include layout data
include_images: Include images metadata
Returns:
Path: Output file path
Raises:
ExportError: If export fails
"""
try:
export_data = {
"export_time": datetime.utcnow().isoformat(),
"total_files": len(results),
"results": []
}
for result in results:
result_data = {
"file_id": result.file.id,
"filename": result.file.original_filename,
"file_format": result.file.file_format,
"file_size": result.file.file_size,
"processing_time": result.file.processing_time,
"detected_language": result.detected_language,
"total_text_regions": result.total_text_regions,
"average_confidence": result.average_confidence,
"markdown_path": result.markdown_path,
}
# Include layout data if requested
if include_layout and result.layout_data:
result_data["layout_data"] = result.layout_data
# Include images metadata if requested
if include_images and result.images_metadata:
result_data["images_metadata"] = result.images_metadata
export_data["results"].append(result_data)
# Write to file
output_path.parent.mkdir(parents=True, exist_ok=True)
output_path.write_text(
json.dumps(export_data, ensure_ascii=False, indent=2),
encoding="utf-8"
)
logger.info(f"Exported {len(results)} results to JSON: {output_path}")
return output_path
except Exception as e:
raise ExportError(f"JSON export failed: {str(e)}")
def export_to_excel(
self,
results: List[OCRResult],
output_path: Path,
include_confidence: bool = True,
include_processing_time: bool = True
) -> Path:
"""
Export results to Excel file
Args:
results: List of OCR results
output_path: Output file path
include_confidence: Include confidence scores
include_processing_time: Include processing time
Returns:
Path: Output file path
Raises:
ExportError: If export fails
"""
try:
rows = []
for result in results:
# Read Markdown content
text_content = ""
if result.markdown_path and Path(result.markdown_path).exists():
text_content = Path(result.markdown_path).read_text(encoding="utf-8")
row = {
"文件名": result.file.original_filename,
"格式": result.file.file_format,
"大小(字節)": result.file.file_size,
"語言": result.detected_language or "未知",
"文本區域數": result.total_text_regions,
"提取內容": text_content[:1000] + "..." if len(text_content) > 1000 else text_content,
}
if include_confidence:
row["平均信心度"] = f"{result.average_confidence:.2%}" if result.average_confidence else "N/A"
if include_processing_time:
row["處理時間(秒)"] = f"{result.file.processing_time:.2f}" if result.file.processing_time else "N/A"
rows.append(row)
# Create DataFrame and export
df = pd.DataFrame(rows)
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_excel(output_path, index=False, engine='openpyxl')
logger.info(f"Exported {len(results)} results to Excel: {output_path}")
return output_path
except Exception as e:
raise ExportError(f"Excel export failed: {str(e)}")
def export_to_markdown(
self,
results: List[OCRResult],
output_path: Path,
combine: bool = True
) -> Path:
"""
Export results to Markdown file(s)
Args:
results: List of OCR results
output_path: Output file path (or directory if not combining)
combine: Combine all results into one file
Returns:
Path: Output file/directory path
Raises:
ExportError: If export fails
"""
try:
if combine:
# Combine all Markdown files into one
combined_content = []
for result in results:
if not result.markdown_path or not Path(result.markdown_path).exists():
continue
markdown_content = Path(result.markdown_path).read_text(encoding="utf-8")
# Add header
combined_content.append(f"# {result.file.original_filename}\n")
combined_content.append(markdown_content)
combined_content.append("\n---\n") # Separator
output_path.parent.mkdir(parents=True, exist_ok=True)
output_path.write_text("\n".join(combined_content), encoding="utf-8")
logger.info(f"Exported {len(results)} results to combined Markdown: {output_path}")
return output_path
else:
# Export each result to separate file
output_path.mkdir(parents=True, exist_ok=True)
for result in results:
if not result.markdown_path or not Path(result.markdown_path).exists():
continue
# Copy Markdown file to output directory
src_path = Path(result.markdown_path)
dst_path = output_path / f"{result.file.original_filename}.md"
dst_path.write_text(src_path.read_text(encoding="utf-8"), encoding="utf-8")
logger.info(f"Exported {len(results)} results to separate Markdown files: {output_path}")
return output_path
except Exception as e:
raise ExportError(f"Markdown export failed: {str(e)}")
def export_to_pdf(
self,
result: OCRResult,
output_path: Path,
css_template: str = "default",
metadata: Optional[Dict] = None
) -> Path:
"""
Export single result to PDF with layout preservation
Args:
result: OCR result
output_path: Output PDF path
css_template: CSS template name or custom CSS
metadata: Optional PDF metadata
Returns:
Path: Output PDF path
Raises:
ExportError: If export fails
"""
try:
if not result.markdown_path or not Path(result.markdown_path).exists():
raise ExportError(f"Markdown file not found for result {result.id}")
markdown_path = Path(result.markdown_path)
# Prepare metadata
pdf_metadata = metadata or {}
if "title" not in pdf_metadata:
pdf_metadata["title"] = result.file.original_filename
# Generate PDF
self.pdf_generator.generate_pdf(
markdown_path=markdown_path,
output_path=output_path,
css_template=css_template,
metadata=pdf_metadata
)
logger.info(f"Exported result {result.id} to PDF: {output_path}")
return output_path
except PDFGenerationError as e:
raise ExportError(f"PDF generation failed: {str(e)}")
except Exception as e:
raise ExportError(f"PDF export failed: {str(e)}")
def export_batch_to_zip(
self,
db: Session,
batch_id: int,
output_path: Path,
include_formats: Optional[List[str]] = None
) -> Path:
"""
Export entire batch to ZIP archive
Args:
db: Database session
batch_id: Batch ID
output_path: Output ZIP path
include_formats: List of formats to include (markdown, json, txt, excel, pdf)
Returns:
Path: Output ZIP path
Raises:
ExportError: If export fails
"""
try:
include_formats = include_formats or ["markdown", "json"]
# Get batch and results
batch = db.query(OCRBatch).filter(OCRBatch.id == batch_id).first()
if not batch:
raise ExportError(f"Batch {batch_id} not found")
results = db.query(OCRResult).join(OCRFile).filter(
OCRFile.batch_id == batch_id,
OCRFile.status == FileStatus.COMPLETED
).all()
if not results:
raise ExportError(f"No completed results found for batch {batch_id}")
# Create temporary export directory
temp_dir = output_path.parent / f"temp_export_{batch_id}"
temp_dir.mkdir(parents=True, exist_ok=True)
try:
# Export in requested formats
if "markdown" in include_formats:
md_dir = temp_dir / "markdown"
self.export_to_markdown(results, md_dir, combine=False)
if "json" in include_formats:
json_path = temp_dir / "batch_results.json"
self.export_to_json(results, json_path)
if "txt" in include_formats:
txt_path = temp_dir / "batch_results.txt"
self.export_to_txt(results, txt_path)
if "excel" in include_formats:
excel_path = temp_dir / "batch_results.xlsx"
self.export_to_excel(results, excel_path)
# Create ZIP archive
output_path.parent.mkdir(parents=True, exist_ok=True)
with zipfile.ZipFile(output_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
for file_path in temp_dir.rglob('*'):
if file_path.is_file():
arcname = file_path.relative_to(temp_dir)
zipf.write(file_path, arcname)
logger.info(f"Exported batch {batch_id} to ZIP: {output_path}")
return output_path
finally:
# Clean up temporary directory
import shutil
shutil.rmtree(temp_dir, ignore_errors=True)
except Exception as e:
raise ExportError(f"Batch ZIP export failed: {str(e)}")
def apply_export_rule(
self,
db: Session,
results: List[OCRResult],
rule_id: int
) -> List[OCRResult]:
"""
Apply export rule to filter and format results
Args:
db: Database session
results: List of OCR results
rule_id: Export rule ID
Returns:
List[OCRResult]: Filtered results
Raises:
ExportError: If rule not found
"""
rule = db.query(ExportRule).filter(ExportRule.id == rule_id).first()
if not rule:
raise ExportError(f"Export rule {rule_id} not found")
config = rule.config_json
# Apply filters
if "filters" in config:
results = self.apply_filters(results, config["filters"])
# Note: Formatting options are applied in individual export methods
return results
def get_export_formats(self) -> Dict[str, str]:
"""
Get available export formats
Returns:
Dict mapping format codes to descriptions
"""
return {
"txt": "純文本格式 (.txt)",
"json": "JSON 格式 - 包含完整元數據 (.json)",
"excel": "Excel 表格格式 (.xlsx)",
"markdown": "Markdown 格式 (.md)",
"pdf": "版面保留 PDF 格式 (.pdf)",
"zip": "批次打包格式 (.zip)",
}
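# --- Usage sketch (illustrative; batch ID and paths are hypothetical) ---
# Filters a batch's results by confidence, writes a combined Markdown file,
# then packs the batch into a ZIP with several formats.
if __name__ == "__main__":
    from app.core.database import SessionLocal

    db = SessionLocal()
    try:
        service = ExportService()
        results = db.query(OCRResult).join(OCRFile).filter(OCRFile.batch_id == 1).all()
        results = service.apply_filters(results, {"confidence_threshold": 0.8})
        service.export_to_markdown(results, Path("./storage/exports/batch_1.md"), combine=True)
        service.export_batch_to_zip(
            db,
            batch_id=1,
            output_path=Path("./storage/exports/batch_1.zip"),
            include_formats=["markdown", "json", "excel"],
        )
    finally:
        db.close()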

View File

@@ -0,0 +1,420 @@
"""
Tool_OCR - File Management Service
Handles file uploads, storage, validation, and cleanup
"""
import logging
import shutil
import uuid
from pathlib import Path
from typing import List, Tuple, Optional
from datetime import datetime, timedelta
from fastapi import UploadFile
from sqlalchemy.orm import Session
from app.core.config import settings
from app.models.ocr import OCRBatch, OCRFile, FileStatus
from app.services.preprocessor import DocumentPreprocessor
logger = logging.getLogger(__name__)
class FileManagementError(Exception):
"""Exception raised for file management errors"""
pass
class FileManager:
"""
File management service for upload, storage, and cleanup
Directory structure:
uploads/
├── batches/
│ └── {batch_id}/
│ ├── inputs/ # Original uploaded files
│ ├── outputs/ # OCR results
│ │ ├── markdown/ # Markdown files
│ │ ├── json/ # JSON files
│ │ └── images/ # Extracted images
│ └── exports/ # Export files (PDF, Excel, etc.)
"""
def __init__(self):
"""Initialize file manager"""
self.preprocessor = DocumentPreprocessor()
self.base_upload_dir = Path(settings.upload_dir)
self.base_upload_dir.mkdir(parents=True, exist_ok=True)
def create_batch_directory(self, batch_id: int) -> Path:
"""
Create directory structure for a batch
Args:
batch_id: Batch ID
Returns:
Path: Batch directory path
"""
batch_dir = self.base_upload_dir / "batches" / str(batch_id)
# Create subdirectories
(batch_dir / "inputs").mkdir(parents=True, exist_ok=True)
(batch_dir / "outputs" / "markdown").mkdir(parents=True, exist_ok=True)
(batch_dir / "outputs" / "json").mkdir(parents=True, exist_ok=True)
(batch_dir / "outputs" / "images").mkdir(parents=True, exist_ok=True)
(batch_dir / "exports").mkdir(parents=True, exist_ok=True)
logger.info(f"Created batch directory: {batch_dir}")
return batch_dir
def get_batch_directory(self, batch_id: int) -> Path:
"""
Get batch directory path
Args:
batch_id: Batch ID
Returns:
Path: Batch directory path
"""
return self.base_upload_dir / "batches" / str(batch_id)
def validate_upload(self, file: UploadFile) -> Tuple[bool, Optional[str]]:
"""
Validate uploaded file before saving
Args:
file: Uploaded file
Returns:
Tuple of (is_valid, error_message)
"""
# Check filename
if not file.filename:
return False, "文件名不能為空"
# Check file size (read content size)
file.file.seek(0, 2) # Seek to end
file_size = file.file.tell()
file.file.seek(0) # Reset to beginning
if file_size == 0:
return False, "文件為空"
if file_size > settings.max_upload_size:
max_mb = settings.max_upload_size / (1024 * 1024)
return False, f"文件大小超過限制 ({max_mb:.0f}MB)"
# Check file extension against the configured whitelist
file_ext = Path(file.filename).suffix.lower()
allowed_extensions = {f".{ext}" for ext in settings.allowed_extensions_list}
if file_ext not in allowed_extensions:
return False, f"不支持的文件格式 ({file_ext}),僅支持: {', '.join(sorted(allowed_extensions))}"
return True, None
def save_upload(
self,
file: UploadFile,
batch_id: int,
validate: bool = True
) -> Tuple[Path, str]:
"""
Save uploaded file to batch directory
Args:
file: Uploaded file
batch_id: Batch ID
validate: Whether to validate file
Returns:
Tuple of (file_path, original_filename)
Raises:
FileManagementError: If file validation or saving fails
"""
# Validate if requested
if validate:
is_valid, error_msg = self.validate_upload(file)
if not is_valid:
raise FileManagementError(error_msg)
# Generate unique filename to avoid conflicts
original_filename = file.filename
file_ext = Path(original_filename).suffix
unique_filename = f"{uuid.uuid4()}{file_ext}"
# Get batch input directory
batch_dir = self.get_batch_directory(batch_id)
input_dir = batch_dir / "inputs"
input_dir.mkdir(parents=True, exist_ok=True)
# Save file
file_path = input_dir / unique_filename
try:
with file_path.open("wb") as buffer:
shutil.copyfileobj(file.file, buffer)
logger.info(f"Saved upload: {file_path} (original: {original_filename})")
return file_path, original_filename
except Exception as e:
# Clean up partial file if exists
file_path.unlink(missing_ok=True)
raise FileManagementError(f"保存文件失敗: {str(e)}")
def validate_saved_file(self, file_path: Path) -> Tuple[bool, Optional[str], Optional[str]]:
"""
Validate saved file using preprocessor
Args:
file_path: Path to saved file
Returns:
Tuple of (is_valid, error_message, detected_format)
"""
return self.preprocessor.validate_file(file_path)
def create_batch(
self,
db: Session,
user_id: int,
batch_name: Optional[str] = None
) -> OCRBatch:
"""
Create new OCR batch
Args:
db: Database session
user_id: User ID
batch_name: Optional batch name
Returns:
OCRBatch: Created batch object
"""
# Create batch record
batch = OCRBatch(
user_id=user_id,
batch_name=batch_name or f"Batch_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
)
db.add(batch)
db.commit()
db.refresh(batch)
# Create directory structure
self.create_batch_directory(batch.id)
logger.info(f"Created batch: {batch.id} for user {user_id}")
return batch
def add_file_to_batch(
self,
db: Session,
batch_id: int,
file: UploadFile
) -> OCRFile:
"""
Add file to batch and save to disk
Args:
db: Database session
batch_id: Batch ID
file: Uploaded file
Returns:
OCRFile: Created file record
Raises:
FileManagementError: If file operations fail
"""
# Save file to disk
file_path, original_filename = self.save_upload(file, batch_id)
# Validate saved file (returns is_valid, error_message, detected_format)
is_valid, error_msg, detected_format = self.validate_saved_file(file_path)
# Create file record
ocr_file = OCRFile(
batch_id=batch_id,
filename=file_path.name,
original_filename=original_filename,
file_path=str(file_path),
file_size=file_path.stat().st_size,
file_format=detected_format or Path(original_filename).suffix.lower().lstrip('.'),
status=FileStatus.PENDING if is_valid else FileStatus.FAILED,
error_message=error_msg if not is_valid else None
)
db.add(ocr_file)
# Update batch total_files count
batch = db.query(OCRBatch).filter(OCRBatch.id == batch_id).first()
if batch:
batch.total_files += 1
if not is_valid:
batch.failed_files += 1
db.commit()
db.refresh(ocr_file)
logger.info(f"Added file to batch {batch_id}: {ocr_file.id} (status: {ocr_file.status})")
return ocr_file
def add_files_to_batch(
self,
db: Session,
batch_id: int,
files: List[UploadFile]
) -> List[OCRFile]:
"""
Add multiple files to batch
Args:
db: Database session
batch_id: Batch ID
files: List of uploaded files
Returns:
List[OCRFile]: List of created file records
"""
ocr_files = []
for file in files:
try:
ocr_file = self.add_file_to_batch(db, batch_id, file)
ocr_files.append(ocr_file)
except FileManagementError as e:
logger.error(f"Failed to add file {file.filename} to batch {batch_id}: {e}")
# Continue with other files
continue
return ocr_files
def get_file_paths(self, batch_id: int, file_id: int) -> dict:
"""
Get all paths for a file in a batch
Args:
batch_id: Batch ID
file_id: File ID
Returns:
Dict containing all relevant paths
"""
batch_dir = self.get_batch_directory(batch_id)
return {
"input_dir": batch_dir / "inputs",
"output_dir": batch_dir / "outputs",
"markdown_dir": batch_dir / "outputs" / "markdown",
"json_dir": batch_dir / "outputs" / "json",
"images_dir": batch_dir / "outputs" / "images" / str(file_id),
"export_dir": batch_dir / "exports",
}
def cleanup_expired_batches(self, db: Session, retention_hours: int = 24) -> int:
"""
Clean up expired batch files
Args:
db: Database session
retention_hours: Number of hours to retain files
Returns:
int: Number of batches cleaned up
"""
cutoff_time = datetime.utcnow() - timedelta(hours=retention_hours)
# Find expired batches
expired_batches = db.query(OCRBatch).filter(
OCRBatch.created_at < cutoff_time
).all()
cleaned_count = 0
for batch in expired_batches:
try:
# Delete batch directory
batch_dir = self.get_batch_directory(batch.id)
if batch_dir.exists():
shutil.rmtree(batch_dir)
logger.info(f"Deleted batch directory: {batch_dir}")
# Delete database records (cascade will handle related records)
db.delete(batch)
cleaned_count += 1
except Exception as e:
logger.error(f"Failed to cleanup batch {batch.id}: {e}")
continue
if cleaned_count > 0:
db.commit()
logger.info(f"Cleaned up {cleaned_count} expired batches")
return cleaned_count
def verify_file_ownership(
self,
db: Session,
user_id: int,
batch_id: int
) -> bool:
"""
Verify user owns the batch
Args:
db: Database session
user_id: User ID
batch_id: Batch ID
Returns:
bool: True if user owns batch, False otherwise
"""
batch = db.query(OCRBatch).filter(
OCRBatch.id == batch_id,
OCRBatch.user_id == user_id
).first()
return batch is not None
def get_batch_statistics(self, db: Session, batch_id: int) -> dict:
"""
Get statistics for a batch
Args:
db: Database session
batch_id: Batch ID
Returns:
Dict containing batch statistics
"""
batch = db.query(OCRBatch).filter(OCRBatch.id == batch_id).first()
if not batch:
return {}
# Calculate total file size
total_size = sum(f.file_size for f in batch.files)
# Calculate processing time
processing_time = None
if batch.completed_at and batch.started_at:
processing_time = (batch.completed_at - batch.started_at).total_seconds()
return {
"batch_id": batch.id,
"batch_name": batch.batch_name,
"status": batch.status,
"total_files": batch.total_files,
"completed_files": batch.completed_files,
"failed_files": batch.failed_files,
"pending_files": batch.total_files - batch.completed_files - batch.failed_files,
"progress_percentage": batch.progress_percentage,
"total_file_size": total_size,
"total_file_size_mb": round(total_size / (1024 * 1024), 2),
"created_at": batch.created_at.isoformat(),
"started_at": batch.started_at.isoformat() if batch.started_at else None,
"completed_at": batch.completed_at.isoformat() if batch.completed_at else None,
"processing_time": processing_time,
}
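# --- Usage sketch (illustrative) ---
# Typical upload flow: create a batch, attach uploaded files, inspect stats.
# `uploads` stands in for the UploadFile list a FastAPI route would receive.
if __name__ == "__main__":
    from app.core.database import SessionLocal

    db = SessionLocal()
    try:
        manager = FileManager()
        batch = manager.create_batch(db, user_id=1, batch_name="demo")
        uploads: List[UploadFile] = []  # populated by the route in real use
        manager.add_files_to_batch(db, batch.id, uploads)
        print(manager.get_batch_statistics(db, batch.id))
    finally:
        db.close()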

View File

@@ -0,0 +1,516 @@
"""
Tool_OCR - Core OCR Service
PaddleOCR-VL integration for text and structure extraction
"""
import json
import logging
from pathlib import Path
from typing import Dict, List, Optional, Tuple
from datetime import datetime
import uuid
from paddleocr import PaddleOCR, PPStructureV3
from PIL import Image
from pdf2image import convert_from_path
from app.core.config import settings
from app.services.office_converter import OfficeConverter, OfficeConverterError
logger = logging.getLogger(__name__)
class OCRService:
"""
Core OCR service using PaddleOCR-VL
Handles text recognition and document structure analysis
"""
def __init__(self):
"""Initialize PaddleOCR and PPStructure engines"""
self.ocr_languages = settings.ocr_languages_list
self.confidence_threshold = settings.ocr_confidence_threshold
# Initialize PaddleOCR engine (will be lazy-loaded per language)
self.ocr_engines = {}
# Initialize PP-Structure for layout analysis
self.structure_engine = None
# Initialize Office document converter
self.office_converter = OfficeConverter()
logger.info("OCR Service initialized")
def get_ocr_engine(self, lang: str = 'ch') -> PaddleOCR:
"""
Get or create OCR engine for specified language
Args:
lang: Language code (ch, en, japan, korean, etc.)
Returns:
PaddleOCR engine instance
"""
if lang not in self.ocr_engines:
logger.info(f"Initializing PaddleOCR engine for language: {lang}")
self.ocr_engines[lang] = PaddleOCR(
use_angle_cls=True,
lang=lang,
# Note: show_log and use_gpu parameters removed in PaddleOCR 3.x
)
logger.info(f"PaddleOCR engine ready for {lang}")
return self.ocr_engines[lang]
def get_structure_engine(self) -> PPStructureV3:
"""
Get or create PP-Structure engine for layout analysis
Returns:
PPStructureV3 engine instance
"""
if self.structure_engine is None:
logger.info("Initializing PP-StructureV3 engine")
self.structure_engine = PPStructureV3(
use_doc_orientation_classify=False,
use_doc_unwarping=False,
use_textline_orientation=False,
use_table_recognition=True,
use_formula_recognition=True,
layout_threshold=0.5,
)
logger.info("PP-StructureV3 engine ready")
return self.structure_engine
def convert_pdf_to_images(self, pdf_path: Path, output_dir: Path) -> List[Path]:
"""
Convert PDF to images (one per page)
Args:
pdf_path: Path to PDF file
output_dir: Directory to save converted images
Returns:
List of paths to converted images
"""
try:
output_dir.mkdir(parents=True, exist_ok=True)
logger.info(f"Converting PDF {pdf_path.name} to images")
# Convert PDF to images (300 DPI for good quality)
images = convert_from_path(
str(pdf_path),
dpi=300,
fmt='png'
)
image_paths = []
for i, image in enumerate(images):
# Save each page as PNG
image_path = output_dir / f"{pdf_path.stem}_page_{i+1}.png"
image.save(str(image_path), 'PNG')
image_paths.append(image_path)
logger.info(f"Saved page {i+1} to {image_path.name}")
logger.info(f"Converted {len(image_paths)} pages from PDF")
return image_paths
except Exception as e:
logger.error(f"PDF conversion error: {str(e)}")
raise
def process_image(
self,
image_path: Path,
lang: str = 'ch',
detect_layout: bool = True,
confidence_threshold: Optional[float] = None
) -> Dict:
"""
Process single image with OCR and layout analysis
Args:
image_path: Path to image file
lang: Language for OCR
detect_layout: Whether to perform layout analysis
confidence_threshold: Minimum confidence threshold (uses default if None)
Returns:
Dictionary with OCR results and metadata
"""
start_time = datetime.now()
threshold = confidence_threshold if confidence_threshold is not None else self.confidence_threshold
try:
# Check if file is Office document
if self.office_converter.is_office_document(image_path):
logger.info(f"Detected Office document: {image_path.name}, converting to PDF")
try:
# Convert Office document to PDF
pdf_path = self.office_converter.convert_to_pdf(image_path)
logger.info(f"Office document converted to PDF: {pdf_path.name}")
# Process the PDF (will be handled by PDF processing logic below)
image_path = pdf_path
except OfficeConverterError as e:
logger.error(f"Office conversion failed: {str(e)}")
raise
# Check if file is PDF
is_pdf = image_path.suffix.lower() == '.pdf'
if is_pdf:
# Convert PDF to images
logger.info(f"Detected PDF file: {image_path.name}, converting to images")
pdf_images_dir = image_path.parent / f"{image_path.stem}_pages"
image_paths = self.convert_pdf_to_images(image_path, pdf_images_dir)
# Process all pages
all_text_regions = []
total_confidence_sum = 0.0
total_valid_regions = 0
all_layout_data = []
all_images_metadata = []
for page_num, page_image_path in enumerate(image_paths, 1):
logger.info(f"Processing PDF page {page_num}/{len(image_paths)}")
# Process each page
page_result = self.process_image(
page_image_path,
lang=lang,
detect_layout=detect_layout,
confidence_threshold=confidence_threshold
)
# Accumulate results
if page_result['status'] == 'success':
# Add page number to each text region
for region in page_result['text_regions']:
region['page'] = page_num
all_text_regions.append(region)
total_confidence_sum += page_result['average_confidence'] * page_result['total_text_regions']
total_valid_regions += page_result['total_text_regions']
# Accumulate layout data
if page_result.get('layout_data'):
all_layout_data.append(page_result['layout_data'])
# Accumulate images metadata
if page_result.get('images_metadata'):
all_images_metadata.extend(page_result['images_metadata'])
# Calculate overall average confidence
avg_confidence = total_confidence_sum / total_valid_regions if total_valid_regions > 0 else 0.0
# Combine layout data from all pages
combined_layout = None
if all_layout_data:
combined_elements = []
for layout in all_layout_data:
if layout.get('elements'):
combined_elements.extend(layout['elements'])
if combined_elements:
combined_layout = {
'elements': combined_elements,
'total_elements': len(combined_elements),
'reading_order': list(range(len(combined_elements))),
}
# Generate combined markdown
markdown_content = self.generate_markdown(all_text_regions, combined_layout)
# Calculate processing time
processing_time = (datetime.now() - start_time).total_seconds()
logger.info(
f"PDF processing completed: {image_path.name} - "
f"{len(image_paths)} pages, "
f"{len(all_text_regions)} regions, "
f"{avg_confidence:.2f} avg confidence, "
f"{processing_time:.2f}s"
)
return {
'status': 'success',
'file_name': image_path.name,
'language': lang,
'text_regions': all_text_regions,
'total_text_regions': len(all_text_regions),
'average_confidence': avg_confidence,
'layout_data': combined_layout,
'images_metadata': all_images_metadata,
'markdown_content': markdown_content,
'processing_time': processing_time,
'timestamp': datetime.utcnow().isoformat(),
'total_pages': len(image_paths),
}
# Get OCR engine (for non-PDF images)
ocr_engine = self.get_ocr_engine(lang)
# Perform OCR
logger.info(f"Processing image: {image_path.name}")
# Note: In PaddleOCR 3.x, use_angle_cls is set during initialization, not in ocr() call
ocr_results = ocr_engine.ocr(str(image_path))
# Parse OCR results (PaddleOCR 3.x format)
text_regions = []
total_confidence = 0.0
valid_regions = 0
if ocr_results and isinstance(ocr_results, (list, tuple)) and len(ocr_results) > 0:
# PaddleOCR 3.x returns a list of dictionaries (one per page)
for page_result in ocr_results:
if isinstance(page_result, dict):
# New format: {'rec_texts': [...], 'rec_scores': [...], 'rec_polys': [...]}
texts = page_result.get('rec_texts', [])
scores = page_result.get('rec_scores', [])
polys = page_result.get('rec_polys', [])
# Process each recognized text
for idx, text in enumerate(texts):
# Get corresponding score and bbox
confidence = scores[idx] if idx < len(scores) else 1.0
bbox = polys[idx] if idx < len(polys) else []
# Convert numpy array bbox to list for JSON serialization
if hasattr(bbox, 'tolist'):
bbox = bbox.tolist()
# Filter by confidence threshold
if confidence >= threshold:
text_regions.append({
'text': text,
'bbox': bbox,
'confidence': float(confidence),
})
total_confidence += confidence
valid_regions += 1
avg_confidence = total_confidence / valid_regions if valid_regions > 0 else 0.0
logger.info(f"Parsed {len(text_regions)} text regions with avg confidence {avg_confidence:.3f}")
# Layout analysis (if requested)
layout_data = None
images_metadata = []
if detect_layout:
layout_data, images_metadata = self.analyze_layout(image_path)
# Generate Markdown
markdown_content = self.generate_markdown(text_regions, layout_data)
# Calculate processing time
processing_time = (datetime.now() - start_time).total_seconds()
result = {
'status': 'success',
'file_name': image_path.name,
'language': lang,
'text_regions': text_regions,
'total_text_regions': len(text_regions),
'average_confidence': avg_confidence,
'layout_data': layout_data,
'images_metadata': images_metadata,
'markdown_content': markdown_content,
'processing_time': processing_time,
'timestamp': datetime.utcnow().isoformat(),
}
logger.info(
f"OCR completed: {image_path.name} - "
f"{len(text_regions)} regions, "
f"{avg_confidence:.2f} avg confidence, "
f"{processing_time:.2f}s"
)
return result
except Exception as e:
import traceback
error_trace = traceback.format_exc()
logger.error(f"OCR processing error for {image_path.name}: {str(e)}\n{error_trace}")
return {
'status': 'error',
'file_name': image_path.name,
'error_message': str(e),
'processing_time': (datetime.now() - start_time).total_seconds(),
}
def analyze_layout(self, image_path: Path) -> Tuple[Optional[Dict], List[Dict]]:
"""
Analyze document layout using PP-StructureV3
Args:
image_path: Path to image file
Returns:
Tuple of (layout_data, images_metadata)
"""
try:
structure_engine = self.get_structure_engine()
# Perform structure analysis using predict() method (PaddleOCR 3.x API)
logger.info(f"Running layout analysis on {image_path.name}")
results = structure_engine.predict(str(image_path))
layout_elements = []
images_metadata = []
# Process each page result (for images, usually just one page)
for page_idx, page_result in enumerate(results):
# Get markdown dictionary from result object
if hasattr(page_result, 'markdown'):
markdown_dict = page_result.markdown
logger.info(f"Page {page_idx} markdown keys: {markdown_dict.keys() if isinstance(markdown_dict, dict) else type(markdown_dict)}")
# Extract layout information from markdown structure
if isinstance(markdown_dict, dict):
# Get markdown texts (HTML format with tables and structure)
markdown_texts = markdown_dict.get('markdown_texts', '')
markdown_images = markdown_dict.get('markdown_images', {})
# Create a layout element for the structured content
if markdown_texts:
# Inspect the HTML content to distinguish tables from plain text
has_table = '<table' in markdown_texts.lower()
element = {
'element_id': len(layout_elements),
'type': 'table' if has_table else 'text',
'content': markdown_texts,
'page': page_idx,
'bbox': [], # PP-StructureV3 doesn't provide individual bbox in this format
}
layout_elements.append(element)
# Add image metadata
for img_idx, (img_path, _img_obj) in enumerate(markdown_images.items()):
images_metadata.append({
'element_id': len(layout_elements) + img_idx,
'image_path': img_path,
'type': 'image',
'page': page_idx,
'bbox': [],
})
if layout_elements:
layout_data = {
'elements': layout_elements,
'total_elements': len(layout_elements),
'reading_order': list(range(len(layout_elements))),
}
logger.info(f"Detected {len(layout_elements)} layout elements")
return layout_data, images_metadata
else:
logger.warning("No layout elements detected")
return None, []
except Exception as e:
import traceback
error_trace = traceback.format_exc()
logger.error(f"Layout analysis error: {str(e)}\n{error_trace}")
return None, []
def generate_markdown(
self,
text_regions: List[Dict],
layout_data: Optional[Dict] = None
) -> str:
"""
Generate Markdown from OCR results
Args:
text_regions: List of text regions with bbox and text
layout_data: Optional layout structure information
Returns:
Markdown formatted string
"""
markdown_lines = []
if layout_data and layout_data.get('elements'):
# Generate structured Markdown based on layout
for element in layout_data['elements']:
element_type = element.get('type', 'text')
content = element.get('content', '')
if element_type == 'title':
markdown_lines.append(f"# {content}\n")
elif element_type == 'table':
# Table in HTML format
markdown_lines.append(content)
markdown_lines.append("")
elif element_type == 'figure':
element_id = element.get('element_id')
markdown_lines.append(f"![Figure {element_id}](./images/img_{element_id}.jpg)\n")
else:
markdown_lines.append(f"{content}\n")
else:
# Simple Markdown from text regions only
# Sort by vertical position (top to bottom)
def get_y_coord(region):
"""Safely extract Y coordinate from bbox"""
bbox = region.get('bbox', [])
if isinstance(bbox, (list, tuple)) and len(bbox) > 0:
if isinstance(bbox[0], (list, tuple)) and len(bbox[0]) > 1:
return bbox[0][1] # [[x1,y1], [x2,y2], ...] format
elif len(bbox) > 1:
return bbox[1] # [x1, y1, x2, y2, ...] format
return 0 # Default to 0 if can't extract
sorted_regions = sorted(text_regions, key=get_y_coord)
for region in sorted_regions:
text = region['text']
markdown_lines.append(text)
return "\n".join(markdown_lines)
def save_results(
self,
result: Dict,
output_dir: Path,
file_id: str
) -> Tuple[Optional[Path], Optional[Path]]:
"""
Save OCR results to JSON and Markdown files
Args:
result: OCR result dictionary
output_dir: Output directory
file_id: Unique file identifier
Returns:
Tuple of (json_path, markdown_path)
"""
try:
output_dir.mkdir(parents=True, exist_ok=True)
# Save JSON
json_path = output_dir / f"{file_id}_result.json"
with open(json_path, 'w', encoding='utf-8') as f:
json.dump(result, f, ensure_ascii=False, indent=2)
# Save Markdown
markdown_path = output_dir / f"{file_id}_output.md"
markdown_content = result.get('markdown_content', '')
with open(markdown_path, 'w', encoding='utf-8') as f:
f.write(markdown_content)
logger.info(f"Results saved: {json_path.name}, {markdown_path.name}")
return json_path, markdown_path
except Exception as e:
logger.error(f"Error saving results: {str(e)}")
return None, None
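# --- Usage sketch (illustrative; paths are hypothetical) ---
# Runs OCR on one image and persists the JSON/Markdown outputs.
if __name__ == "__main__":
    service = OCRService()
    result = service.process_image(Path("./uploads/sample.png"), lang="ch", detect_layout=True)
    if result["status"] == "success":
        json_path, md_path = service.save_results(result, Path("./uploads/outputs"), file_id="demo")
        print(json_path, md_path)
    else:
        print("OCR failed:", result.get("error_message"))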

View File

@@ -0,0 +1,210 @@
"""
Tool_OCR - Office Document Converter Service
Convert Office documents (DOC/DOCX/PPT/PPTX) to PDF for OCR processing
"""
import logging
import subprocess
from pathlib import Path
from typing import Optional
import tempfile
import shutil
logger = logging.getLogger(__name__)
class OfficeConverterError(Exception):
"""Exception raised for Office conversion errors"""
pass
class OfficeConverter:
"""Convert Office documents to PDF for OCR processing"""
# Supported Office formats
OFFICE_FORMATS = {
'.doc': 'application/msword',
'.docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
'.ppt': 'application/vnd.ms-powerpoint',
'.pptx': 'application/vnd.openxmlformats-officedocument.presentationml.presentation'
}
def __init__(self, libreoffice_path: str = "/Applications/LibreOffice.app/Contents/MacOS/soffice"):
"""
Initialize Office converter
Args:
libreoffice_path: Path to LibreOffice executable
"""
self.libreoffice_path = libreoffice_path
self._verify_libreoffice()
def _verify_libreoffice(self):
"""Verify LibreOffice is installed and accessible"""
if not Path(self.libreoffice_path).exists():
# Try alternative path for Homebrew installation
alt_path = shutil.which("soffice")
if alt_path:
self.libreoffice_path = alt_path
logger.info(f"Using LibreOffice at: {alt_path}")
else:
raise OfficeConverterError(
"LibreOffice not found. Please install LibreOffice: brew install libreoffice"
)
def is_office_document(self, file_path: Path) -> bool:
"""
Check if file is an Office document
Args:
file_path: Path to file
Returns:
True if file is an Office document
"""
return file_path.suffix.lower() in self.OFFICE_FORMATS
def convert_to_pdf(self, office_path: Path, output_dir: Optional[Path] = None) -> Path:
"""
Convert Office document to PDF
Args:
office_path: Path to Office document
output_dir: Optional output directory (uses temp dir if not specified)
Returns:
Path to converted PDF file
Raises:
OfficeConverterError: If conversion fails
"""
if not office_path.exists():
raise OfficeConverterError(f"Office file not found: {office_path}")
if not self.is_office_document(office_path):
raise OfficeConverterError(
f"Unsupported format: {office_path.suffix}. "
f"Supported formats: {', '.join(self.OFFICE_FORMATS.keys())}"
)
# Determine output directory
if output_dir is None:
output_dir = office_path.parent
else:
output_dir.mkdir(parents=True, exist_ok=True)
# Expected output PDF path
pdf_filename = office_path.stem + '.pdf'
output_pdf_path = output_dir / pdf_filename
# Remove existing PDF if present
if output_pdf_path.exists():
output_pdf_path.unlink()
logger.info(f"Converting {office_path.name} to PDF using LibreOffice")
try:
# Use LibreOffice headless mode for conversion
# --headless: Run without GUI
# --convert-to pdf: Convert to PDF format
# --outdir: Output directory
cmd = [
self.libreoffice_path,
'--headless',
'--convert-to', 'pdf',
'--outdir', str(output_dir),
str(office_path)
]
logger.debug(f"Running command: {' '.join(cmd)}")
result = subprocess.run(
cmd,
capture_output=True,
text=True,
timeout=60 # 60 second timeout
)
if result.returncode != 0:
error_msg = result.stderr or result.stdout
raise OfficeConverterError(
f"LibreOffice conversion failed: {error_msg}"
)
# Verify PDF was created
if not output_pdf_path.exists():
raise OfficeConverterError(
f"PDF file not created at expected location: {output_pdf_path}"
)
logger.info(f"Successfully converted to PDF: {output_pdf_path.name}")
return output_pdf_path
except subprocess.TimeoutExpired:
raise OfficeConverterError(
f"Conversion timeout (60s) for file: {office_path.name}"
)
except Exception as e:
if isinstance(e, OfficeConverterError):
raise
raise OfficeConverterError(f"Conversion error: {str(e)}")
def convert_docx_to_pdf(self, docx_path: Path, output_dir: Optional[Path] = None) -> Path:
"""
Convert DOCX to PDF
Args:
docx_path: Path to DOCX file
output_dir: Optional output directory
Returns:
Path to converted PDF
"""
if docx_path.suffix.lower() != '.docx':
raise OfficeConverterError(f"Expected .docx file, got: {docx_path.suffix}")
return self.convert_to_pdf(docx_path, output_dir)
def convert_doc_to_pdf(self, doc_path: Path, output_dir: Optional[Path] = None) -> Path:
"""
Convert legacy DOC to PDF
Args:
doc_path: Path to DOC file
output_dir: Optional output directory
Returns:
Path to converted PDF
"""
if doc_path.suffix.lower() != '.doc':
raise OfficeConverterError(f"Expected .doc file, got: {doc_path.suffix}")
return self.convert_to_pdf(doc_path, output_dir)
def convert_pptx_to_pdf(self, pptx_path: Path, output_dir: Optional[Path] = None) -> Path:
"""
Convert PPTX to PDF
Args:
pptx_path: Path to PPTX file
output_dir: Optional output directory
Returns:
Path to converted PDF
"""
if pptx_path.suffix.lower() != '.pptx':
raise OfficeConverterError(f"Expected .pptx file, got: {pptx_path.suffix}")
return self.convert_to_pdf(pptx_path, output_dir)
def convert_ppt_to_pdf(self, ppt_path: Path, output_dir: Optional[Path] = None) -> Path:
"""
Convert legacy PPT to PDF
Args:
ppt_path: Path to PPT file
output_dir: Optional output directory
Returns:
Path to converted PDF
"""
if ppt_path.suffix.lower() != '.ppt':
raise OfficeConverterError(f"Expected .ppt file, got: {ppt_path.suffix}")
return self.convert_to_pdf(ppt_path, output_dir)
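# --- Usage sketch (illustrative; the .docx path is hypothetical) ---
# Converts a Word document to PDF next to the source file. Constructing
# OfficeConverter raises OfficeConverterError if LibreOffice is not installed.
if __name__ == "__main__":
    converter = OfficeConverter()
    source = Path("./uploads/report.docx")
    if converter.is_office_document(source):
        pdf = converter.convert_to_pdf(source)
        print(f"Converted to {pdf}")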

View File

@@ -0,0 +1,507 @@
"""
Tool_OCR - PDF Generator Service
Converts Markdown to layout-preserved PDFs using Pandoc + WeasyPrint
"""
import logging
import subprocess
from pathlib import Path
from typing import Optional, Dict
from datetime import datetime
from weasyprint import HTML, CSS
from markdown import markdown
from app.core.config import settings
logger = logging.getLogger(__name__)
class PDFGenerationError(Exception):
"""Exception raised when PDF generation fails"""
pass
class PDFGenerator:
"""
PDF generation service with layout preservation
Supports two generation methods:
1. Pandoc (preferred): Markdown → HTML → PDF via pandoc command
2. WeasyPrint (fallback): Direct Python-based HTML → PDF conversion
"""
# Default CSS template for layout preservation
DEFAULT_CSS = """
@page {
size: A4;
margin: 2cm;
}
body {
font-family: "Noto Sans CJK SC", "Noto Sans CJK TC", "Microsoft YaHei", "SimSun", sans-serif;
font-size: 11pt;
line-height: 1.6;
color: #333;
}
h1 {
font-size: 24pt;
font-weight: bold;
margin-top: 0;
margin-bottom: 12pt;
color: #000;
page-break-after: avoid;
}
h2 {
font-size: 18pt;
font-weight: bold;
margin-top: 18pt;
margin-bottom: 10pt;
color: #000;
page-break-after: avoid;
}
h3 {
font-size: 14pt;
font-weight: bold;
margin-top: 14pt;
margin-bottom: 8pt;
color: #000;
page-break-after: avoid;
}
p {
margin: 0 0 10pt 0;
text-align: justify;
}
table {
width: 100%;
border-collapse: collapse;
margin: 12pt 0;
page-break-inside: avoid;
}
table th {
background-color: #f0f0f0;
border: 1px solid #ccc;
padding: 8pt;
text-align: left;
font-weight: bold;
}
table td {
border: 1px solid #ccc;
padding: 8pt;
text-align: left;
}
code {
font-family: "Courier New", monospace;
font-size: 10pt;
background-color: #f5f5f5;
padding: 2pt 4pt;
border-radius: 3px;
}
pre {
background-color: #f5f5f5;
border: 1px solid #ddd;
border-radius: 5px;
padding: 10pt;
overflow-x: auto;
page-break-inside: avoid;
}
pre code {
background-color: transparent;
padding: 0;
}
img {
max-width: 100%;
height: auto;
display: block;
margin: 12pt auto;
page-break-inside: avoid;
}
blockquote {
border-left: 4px solid #ddd;
padding-left: 12pt;
margin: 12pt 0;
color: #666;
font-style: italic;
}
ul, ol {
margin: 10pt 0;
padding-left: 20pt;
}
li {
margin: 5pt 0;
}
hr {
border: none;
border-top: 1px solid #ccc;
margin: 20pt 0;
}
.page-break {
page-break-after: always;
}
"""
# Academic paper template
ACADEMIC_CSS = """
@page {
size: A4;
margin: 2.5cm;
}
body {
font-family: "Times New Roman", "Noto Serif CJK SC", serif;
font-size: 12pt;
line-height: 1.8;
color: #000;
}
h1 {
font-size: 20pt;
text-align: center;
margin-bottom: 24pt;
page-break-after: avoid;
}
h2 {
font-size: 16pt;
margin-top: 20pt;
margin-bottom: 12pt;
page-break-after: avoid;
}
h3 {
font-size: 14pt;
margin-top: 16pt;
margin-bottom: 10pt;
page-break-after: avoid;
}
p {
text-indent: 2em;
text-align: justify;
margin: 0 0 12pt 0;
}
table {
width: 100%;
border-collapse: collapse;
margin: 16pt auto;
page-break-inside: avoid;
}
table caption {
font-weight: bold;
margin-bottom: 8pt;
}
"""
# Business report template
BUSINESS_CSS = """
@page {
size: A4;
margin: 2cm 2.5cm;
}
body {
font-family: "Arial", "Noto Sans CJK SC", sans-serif;
font-size: 11pt;
line-height: 1.5;
color: #333;
}
h1 {
font-size: 22pt;
color: #0066cc;
border-bottom: 3px solid #0066cc;
padding-bottom: 8pt;
margin-bottom: 20pt;
page-break-after: avoid;
}
h2 {
font-size: 16pt;
color: #0066cc;
margin-top: 20pt;
margin-bottom: 12pt;
page-break-after: avoid;
}
table {
width: 100%;
border-collapse: collapse;
margin: 16pt 0;
}
table th {
background-color: #0066cc;
color: white;
padding: 10pt;
font-weight: bold;
}
table td {
border: 1px solid #ddd;
padding: 10pt;
}
table tr:nth-child(even) {
background-color: #f9f9f9;
}
"""
def __init__(self):
"""Initialize PDF generator"""
self.css_templates = {
"default": self.DEFAULT_CSS,
"academic": self.ACADEMIC_CSS,
"business": self.BUSINESS_CSS,
}
def check_pandoc_available(self) -> bool:
"""
Check if Pandoc is installed and available
Returns:
bool: True if pandoc is available, False otherwise
"""
try:
result = subprocess.run(
["pandoc", "--version"],
capture_output=True,
text=True,
timeout=5
)
return result.returncode == 0
except (subprocess.TimeoutExpired, FileNotFoundError):
logger.warning("Pandoc not found or timed out")
return False
def generate_pdf_pandoc(
self,
markdown_path: Path,
output_path: Path,
css_template: str = "default",
metadata: Optional[Dict] = None
) -> Path:
"""
Generate PDF using Pandoc (preferred method)
Args:
markdown_path: Path to input Markdown file
output_path: Path to output PDF file
css_template: CSS template name or custom CSS string
metadata: Optional metadata dict (title, author, date)
Returns:
Path: Path to generated PDF file
Raises:
PDFGenerationError: If PDF generation fails
"""
css_file: Optional[Path] = None
try:
# Create temporary CSS file; an unknown template name is treated as a custom CSS string
css_content = self.css_templates.get(css_template, css_template)
css_file = output_path.parent / f"temp_{datetime.now().timestamp()}.css"
css_file.write_text(css_content, encoding="utf-8")
# Build pandoc command
pandoc_cmd = [
"pandoc",
str(markdown_path),
"-o", str(output_path),
"--pdf-engine=weasyprint",
"--css", str(css_file),
"--standalone",
"--from=markdown+tables+fenced_code_blocks+footnotes",
]
# Add metadata if provided
if metadata:
if metadata.get("title"):
pandoc_cmd.extend(["--metadata", f"title={metadata['title']}"])
if metadata.get("author"):
pandoc_cmd.extend(["--metadata", f"author={metadata['author']}"])
if metadata.get("date"):
pandoc_cmd.extend(["--metadata", f"date={metadata['date']}"])
# Execute pandoc
logger.info(f"Executing pandoc: {' '.join(pandoc_cmd)}")
result = subprocess.run(
pandoc_cmd,
capture_output=True,
text=True,
timeout=60  # 60 second timeout for large documents
)
if result.returncode != 0:
error_msg = f"Pandoc failed: {result.stderr}"
logger.error(error_msg)
raise PDFGenerationError(error_msg)
if not output_path.exists():
raise PDFGenerationError(f"PDF file not created: {output_path}")
logger.info(f"PDF generated successfully via Pandoc: {output_path}")
return output_path
except subprocess.TimeoutExpired:
raise PDFGenerationError("Pandoc execution timed out")
except PDFGenerationError:
raise
except Exception as e:
raise PDFGenerationError(f"Pandoc PDF generation failed: {str(e)}")
finally:
# Remove the temporary CSS file regardless of outcome; the old code
# could hit a NameError if the failure happened before css_file existed
if css_file is not None:
css_file.unlink(missing_ok=True)
def generate_pdf_weasyprint(
self,
markdown_path: Path,
output_path: Path,
css_template: str = "default",
metadata: Optional[Dict] = None
) -> Path:
"""
Generate PDF using WeasyPrint directly (fallback method)
Args:
markdown_path: Path to input Markdown file
output_path: Path to output PDF file
css_template: CSS template name or custom CSS string
metadata: Optional metadata dict (title, author, date)
Returns:
Path: Path to generated PDF file
Raises:
PDFGenerationError: If PDF generation fails
"""
try:
# Read Markdown content
markdown_content = markdown_path.read_text(encoding="utf-8")
# Convert Markdown to HTML
html_content = markdown(
markdown_content,
extensions=[
'tables',
'fenced_code',
'codehilite',
'nl2br',
'sane_lists',
]
)
# Wrap HTML with proper structure
title = metadata.get("title", markdown_path.stem) if metadata else markdown_path.stem
full_html = f"""
<!DOCTYPE html>
<html lang="zh-CN">
<head>
<meta charset="UTF-8">
<title>{title}</title>
</head>
<body>
{html_content}
</body>
</html>
"""
# Get CSS content
css_content = self.css_templates.get(css_template, css_template)
# Generate PDF
logger.info(f"Generating PDF via WeasyPrint: {output_path}")
html = HTML(string=full_html, base_url=str(markdown_path.parent))
css = CSS(string=css_content)
html.write_pdf(str(output_path), stylesheets=[css])
if not output_path.exists():
raise PDFGenerationError(f"PDF file not created: {output_path}")
logger.info(f"PDF generated successfully via WeasyPrint: {output_path}")
return output_path
except PDFGenerationError:
raise
except Exception as e:
raise PDFGenerationError(f"WeasyPrint PDF generation failed: {str(e)}")
def generate_pdf(
self,
markdown_path: Path,
output_path: Path,
css_template: str = "default",
metadata: Optional[Dict] = None,
prefer_pandoc: bool = True
) -> Path:
"""
Generate PDF from Markdown with automatic fallback
Args:
markdown_path: Path to input Markdown file
output_path: Path to output PDF file
css_template: CSS template name ("default", "academic", "business") or custom CSS
metadata: Optional metadata dict (title, author, date)
prefer_pandoc: Use Pandoc if available, fallback to WeasyPrint
Returns:
Path: Path to generated PDF file
Raises:
PDFGenerationError: If both methods fail
"""
if not markdown_path.exists():
raise PDFGenerationError(f"Markdown file not found: {markdown_path}")
# Ensure output directory exists
output_path.parent.mkdir(parents=True, exist_ok=True)
# Try Pandoc first if preferred and available
if prefer_pandoc and self.check_pandoc_available():
try:
return self.generate_pdf_pandoc(markdown_path, output_path, css_template, metadata)
except PDFGenerationError as e:
logger.warning(f"Pandoc failed, falling back to WeasyPrint: {e}")
# Fall through to WeasyPrint
# Use WeasyPrint (fallback or direct)
return self.generate_pdf_weasyprint(markdown_path, output_path, css_template, metadata)
def get_available_templates(self) -> Dict[str, str]:
"""
Get list of available CSS templates
Returns:
Dict mapping template names to descriptions
"""
return {
"default": "General-purpose layout template, suitable for most documents",
"academic": "Academic paper template, suitable for research reports",
"business": "Business report template, suitable for corporate documents",
}
def save_custom_template(self, template_name: str, css_content: str) -> None:
"""
Save a custom CSS template
Args:
template_name: Template name
css_content: CSS content
"""
self.css_templates[template_name] = css_content
logger.info(f"Custom CSS template saved: {template_name}")

View File

@@ -0,0 +1,230 @@
"""
Tool_OCR - Document Preprocessor Service
Handles file validation, format detection, and preprocessing
"""
import magic
from pathlib import Path
from typing import Tuple, Optional
import logging
from PIL import Image
import cv2
import numpy as np
from app.core.config import settings
logger = logging.getLogger(__name__)
class DocumentPreprocessor:
"""
Document preprocessing service for format standardization
Validates and prepares documents for OCR processing
"""
SUPPORTED_IMAGE_FORMATS = ['png', 'jpg', 'jpeg', 'bmp', 'tiff', 'tif']
SUPPORTED_PDF_FORMAT = ['pdf']
ALL_SUPPORTED_FORMATS = SUPPORTED_IMAGE_FORMATS + SUPPORTED_PDF_FORMAT
def __init__(self):
self.allowed_extensions = settings.allowed_extensions_list
self.max_file_size = settings.max_upload_size
logger.info(f"DocumentPreprocessor initialized with allowed_extensions: {self.allowed_extensions}")
def validate_file(self, file_path: Path) -> Tuple[bool, Optional[str], Optional[str]]:
"""
Validate file format, size, and integrity
Args:
file_path: Path to the file to validate
Returns:
Tuple of (is_valid, file_format, error_message)
"""
try:
# Check file exists
if not file_path.exists():
return False, None, f"File not found: {file_path}"
# Check file size
file_size = file_path.stat().st_size
if file_size > self.max_file_size:
max_mb = self.max_file_size / (1024 * 1024)
actual_mb = file_size / (1024 * 1024)
return False, None, f"File too large: {actual_mb:.2f}MB (max {max_mb:.2f}MB)"
# Detect file format using magic numbers
mime = magic.Magic(mime=True)
mime_type = mime.from_file(str(file_path))
# Map MIME type to format
file_format = self._mime_to_format(mime_type)
if not file_format:
return False, None, f"Unsupported file type: {mime_type}"
# Check if format is in allowed extensions
if file_format not in self.allowed_extensions:
return False, None, f"File format '{file_format}' not allowed"
# Validate file integrity
is_valid, error = self._validate_integrity(file_path, file_format)
if not is_valid:
return False, file_format, f"File corrupted: {error}"
logger.info(f"File validated successfully: {file_path.name} ({file_format})")
return True, file_format, None
except Exception as e:
logger.error(f"File validation error: {str(e)}")
return False, None, f"Validation error: {str(e)}"
def _mime_to_format(self, mime_type: str) -> Optional[str]:
"""Convert MIME type to file format"""
mime_map = {
'image/png': 'png',
'image/jpeg': 'jpg',
'image/jpg': 'jpg',
'image/bmp': 'bmp',
'image/tiff': 'tiff',
'image/x-tiff': 'tiff',
'application/pdf': 'pdf',
'application/msword': 'doc',
'application/vnd.openxmlformats-officedocument.wordprocessingml.document': 'docx',
'application/vnd.ms-powerpoint': 'ppt',
'application/vnd.openxmlformats-officedocument.presentationml.presentation': 'pptx',
}
return mime_map.get(mime_type)
def _validate_integrity(self, file_path: Path, file_format: str) -> Tuple[bool, Optional[str]]:
"""
Validate file integrity by attempting to open it
Args:
file_path: Path to file
file_format: Detected file format
Returns:
Tuple of (is_valid, error_message)
"""
try:
if file_format in self.SUPPORTED_IMAGE_FORMATS:
# Try to open image
with Image.open(file_path) as img:
img.verify() # Verify image integrity
# Reopen for a real decode (an image must be reopened after verify())
with Image.open(file_path) as img:
img.load()  # Force full decode to detect corruption
return True, None
elif file_format == 'pdf':
# Basic PDF validation - check file starts with PDF signature
with open(file_path, 'rb') as f:
header = f.read(5)
if header != b'%PDF-':
return False, "Invalid PDF header"
return True, None
elif file_format in ['doc', 'docx', 'ppt', 'pptx']:
# Office documents - basic validation (check file size and can be opened)
# Modern Office formats (docx, pptx) are ZIP-based
if file_format in ['docx', 'pptx']:
import zipfile
try:
with zipfile.ZipFile(file_path, 'r') as zf:
# Check if it has the required Office structure
if file_format == 'docx' and 'word/document.xml' not in zf.namelist():
return False, "Invalid DOCX structure"
elif file_format == 'pptx' and 'ppt/presentation.xml' not in zf.namelist():
return False, "Invalid PPTX structure"
except zipfile.BadZipFile:
return False, "Invalid Office file (corrupt ZIP)"
# Legacy formats (doc, ppt) are accepted here without deeper inspection
return True, None
else:
return False, f"Unknown format: {file_format}"
except Exception as e:
return False, str(e)
def preprocess_image(
self,
image_path: Path,
enhance: bool = True,
output_path: Optional[Path] = None
) -> Tuple[bool, Optional[Path], Optional[str]]:
"""
Preprocess image to improve OCR accuracy
Args:
image_path: Path to input image
enhance: Whether to apply enhancement
output_path: Optional output path (defaults to temp directory)
Returns:
Tuple of (success, processed_image_path, error_message)
"""
try:
# Read image
img = cv2.imread(str(image_path))
if img is None:
return False, None, "Failed to read image"
if not enhance:
# No preprocessing, return original
return True, image_path, None
# Convert to grayscale
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
# Apply adaptive thresholding to handle varying lighting
processed = cv2.adaptiveThreshold(
gray,
255,
cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
cv2.THRESH_BINARY,
11,
2
)
# Denoise
processed = cv2.fastNlMeansDenoising(processed, None, 10, 7, 21)
# Determine output path
if output_path is None:
output_path = Path(settings.processed_dir) / f"processed_{image_path.name}"
# Save processed image; cv2.imwrite reports failure via its return
# value rather than raising, so ensure the directory exists and check it
output_path.parent.mkdir(parents=True, exist_ok=True)
if not cv2.imwrite(str(output_path), processed):
return False, None, f"Failed to write processed image: {output_path}"
logger.info(f"Image preprocessed: {image_path.name} -> {output_path.name}")
return True, output_path, None
except Exception as e:
logger.error(f"Image preprocessing error: {str(e)}")
return False, None, f"Preprocessing error: {str(e)}"
def get_file_info(self, file_path: Path) -> dict:
"""
Get comprehensive file information
Args:
file_path: Path to file
Returns:
Dictionary with file information
"""
stat = file_path.stat()
mime = magic.Magic(mime=True)
mime_type = mime.from_file(str(file_path))
return {
'name': file_path.name,
'path': str(file_path),
'size': stat.st_size,
'size_mb': stat.st_size / (1024 * 1024),
'mime_type': mime_type,
'format': self._mime_to_format(mime_type),
'created_at': stat.st_ctime,
'modified_at': stat.st_mtime,
}
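# Usage sketch (illustrative only; the file path is an assumption for this
# example):
#
#   preprocessor = DocumentPreprocessor()
#   is_valid, file_format, error = preprocessor.validate_file(Path("uploads/scan.png"))
#   if is_valid and file_format in DocumentPreprocessor.SUPPORTED_IMAGE_FORMATS:
#       ok, processed_path, err = preprocessor.preprocess_image(Path("uploads/scan.png"))
#       # processed_path points at a binarized, denoised copy suited to OCR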

View File

@@ -0,0 +1,282 @@
"""
Tool_OCR - Translation Service (RESERVED)
Abstract interface and stub implementation for future translation feature
"""
from abc import ABC, abstractmethod
from typing import Any, Dict, Optional, List
from enum import Enum
import logging
logger = logging.getLogger(__name__)
class TranslationEngine(str, Enum):
"""Supported translation engines"""
OFFLINE = "offline" # Argos Translate (offline)
ERNIE = "ernie" # Baidu ERNIE API
GOOGLE = "google" # Google Translate API
DEEPL = "deepl" # DeepL API
class LanguageCode(str, Enum):
"""Supported language codes"""
CHINESE = "zh"
ENGLISH = "en"
JAPANESE = "ja"
KOREAN = "ko"
FRENCH = "fr"
GERMAN = "de"
SPANISH = "es"
class TranslationServiceInterface(ABC):
"""
Abstract interface for translation services
This interface defines the contract for all translation engine implementations.
Future implementations should inherit from this class.
"""
@abstractmethod
def translate_text(
self,
text: str,
source_lang: str,
target_lang: str,
**kwargs
) -> str:
"""
Translate a single text string
Args:
text: Text to translate
source_lang: Source language code
target_lang: Target language code
**kwargs: Engine-specific parameters
Returns:
str: Translated text
"""
pass
@abstractmethod
def translate_document(
self,
markdown_content: str,
source_lang: str,
target_lang: str,
preserve_structure: bool = True,
**kwargs
) -> Dict[str, Any]:
"""
Translate a Markdown document while preserving structure
Args:
markdown_content: Markdown content to translate
source_lang: Source language code
target_lang: Target language code
preserve_structure: Whether to preserve markdown structure
**kwargs: Engine-specific parameters
Returns:
Dict containing:
- translated_content: Translated markdown
- metadata: Translation metadata (engine, time, etc.)
"""
pass
@abstractmethod
def batch_translate(
self,
texts: List[str],
source_lang: str,
target_lang: str,
**kwargs
) -> List[str]:
"""
Translate multiple texts in batch
Args:
texts: List of texts to translate
source_lang: Source language code
target_lang: Target language code
**kwargs: Engine-specific parameters
Returns:
List[str]: List of translated texts
"""
pass
@abstractmethod
def get_supported_languages(self) -> List[str]:
"""
Get list of supported language codes for this engine
Returns:
List[str]: List of supported language codes
"""
pass
@abstractmethod
def validate_config(self) -> bool:
"""
Validate engine configuration (API keys, model files, etc.)
Returns:
bool: True if configuration is valid
"""
pass
class TranslationEngineFactory:
"""
Factory for creating translation engine instances
RESERVED: This is a placeholder for future implementation.
When translation feature is implemented, this factory will instantiate
the appropriate translation engine based on configuration.
"""
@staticmethod
def create_engine(
engine_type: TranslationEngine,
config: Optional[Dict] = None
) -> TranslationServiceInterface:
"""
Create a translation engine instance
Args:
engine_type: Type of translation engine
config: Engine-specific configuration
Returns:
TranslationServiceInterface: Translation engine instance
Raises:
NotImplementedError: Always raised (stub implementation)
"""
raise NotImplementedError(
"Translation feature is not yet implemented. "
"This is a reserved placeholder for future development."
)
@staticmethod
def get_available_engines() -> List[str]:
"""
Get list of available translation engines
Returns:
List[str]: List of engine types (currently empty)
"""
return []
@staticmethod
def is_engine_available(engine_type: TranslationEngine) -> bool:
"""
Check if a specific engine is available
Args:
engine_type: Engine type to check
Returns:
bool: Always False (stub implementation)
"""
return False
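# Intended future usage (illustrative; create_engine raises
# NotImplementedError until the feature ships, and the config keys shown
# here are assumptions):
#
#   engine = TranslationEngineFactory.create_engine(
#       TranslationEngine.OFFLINE,
#       config={"models_dir": "./models/argostranslate"},
#   )
#   translated = engine.translate_text("Hello", "en", "zh")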
class StubTranslationService:
"""
Stub translation service for API endpoints
This service provides placeholder responses for translation endpoints
until the feature is fully implemented.
"""
@staticmethod
def get_feature_status() -> Dict[str, Any]:
"""
Get translation feature status
Returns:
Dict with feature status information
"""
return {
"available": False,
"status": "reserved",
"message": "Translation feature is reserved for future implementation",
"supported_engines": [],
"planned_engines": [
{
"type": "offline",
"name": "Argos Translate",
"description": "Offline neural translation",
"status": "planned"
},
{
"type": "ernie",
"name": "Baidu ERNIE",
"description": "Baidu AI translation API",
"status": "planned"
},
{
"type": "google",
"name": "Google Translate",
"description": "Google Cloud Translation API",
"status": "planned"
},
{
"type": "deepl",
"name": "DeepL",
"description": "DeepL translation API",
"status": "planned"
}
],
"roadmap": {
"phase": "Phase 5",
"priority": "low",
"implementation_after": "Production deployment and user feedback"
}
}
@staticmethod
def get_supported_languages() -> List[Dict[str, str]]:
"""
Get list of languages planned for translation support
Returns:
List of language info dicts
"""
return [
{"code": "zh", "name": "Chinese (Simplified)", "status": "planned"},
{"code": "en", "name": "English", "status": "planned"},
{"code": "ja", "name": "Japanese", "status": "planned"},
{"code": "ko", "name": "Korean", "status": "planned"},
{"code": "fr", "name": "French", "status": "planned"},
{"code": "de", "name": "German", "status": "planned"},
{"code": "es", "name": "Spanish", "status": "planned"},
]
# Example placeholder for future engine implementations:
#
# class ArgosTranslationEngine(TranslationServiceInterface):
#     """Offline translation using Argos Translate"""
#     def __init__(self, model_path: str):
#         self.model_path = model_path
#         # Initialize Argos models
#
#     def translate_text(self, text, source_lang, target_lang, **kwargs):
#         # Implementation here
#         pass
#
# class ERNIETranslationEngine(TranslationServiceInterface):
#     """Baidu ERNIE API translation"""
#     def __init__(self, api_key: str, api_secret: str):
#         self.api_key = api_key
#         self.api_secret = api_secret
#
#     def translate_text(self, text, source_lang, target_lang, **kwargs):
#         # Implementation here
#         pass
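#
# A minimal sketch of translate_text for the Argos engine, assuming
# argostranslate's high-level translate() helper (an assumption here;
# verify the signature against the installed argostranslate version):
#
# import argostranslate.translate
#
# class ArgosTranslationEngineSketch(TranslationServiceInterface):
#     def translate_text(self, text, source_lang, target_lang, **kwargs):
#         # Delegates to the offline Argos models installed on disk
#         return argostranslate.translate.translate(text, source_lang, target_lang)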