feat: add document translation via DIFY AI API

Implement document translation feature using DIFY AI API with batch processing:

Backend:
- Add DIFY client with batch translation support (5000 chars, 20 items per batch)
- Add translation service with element extraction and result building
- Add translation router with start/status/result/list/delete endpoints
- Add translation schemas (TranslationRequest, TranslationStatus, etc.)

Frontend:
- Enable translation UI in TaskDetailPage
- Add translation API methods to apiV2.ts
- Add translation types

Features:
- Batch translation with numbered markers [1], [2], [3]...
- Support for text, title, header, footer, paragraph, footnote, table cells
- Translation result JSON with statistics (tokens, latency, batch_count)
- Background task processing with progress tracking

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
egg
2025-12-02 11:57:02 +08:00
parent 87dc97d951
commit 8d9b69ba93
18 changed files with 2970 additions and 26 deletions

View File

@@ -371,7 +371,7 @@ async def root():
# Include V2 API routers
from app.routers import auth, tasks, admin
from app.routers import auth, tasks, admin, translate
from fastapi import UploadFile, File, Depends, HTTPException, status
from sqlalchemy.orm import Session
import hashlib
@@ -385,6 +385,7 @@ from app.services.task_service import task_service
app.include_router(auth.router)
app.include_router(tasks.router)
app.include_router(admin.router)
app.include_router(translate.router)
# File upload endpoint

View File

@@ -2,6 +2,6 @@
Tool_OCR - API Routers (V2)
"""
from app.routers import auth, tasks, admin
from app.routers import auth, tasks, admin, translate
__all__ = ["auth", "tasks", "admin"]
__all__ = ["auth", "tasks", "admin", "translate"]

View File

@@ -0,0 +1,503 @@
"""
Tool_OCR - Translation Router
Handles document translation operations via DIFY AI API
"""
import logging
import json
from datetime import datetime
from pathlib import Path
from typing import Optional
from fastapi import APIRouter, Depends, HTTPException, status, Query, BackgroundTasks
from fastapi.responses import FileResponse, JSONResponse
from sqlalchemy.orm import Session
from app.core.deps import get_db, get_current_user
from app.core.config import settings
from app.models.user import User
from app.models.task import Task, TaskStatus
from app.schemas.translation import (
TranslationRequest,
TranslationStartResponse,
TranslationStatusResponse,
TranslationStatusEnum,
TranslationProgress,
TranslationListResponse,
TranslationListItem,
TranslationStatistics,
)
from app.services.task_service import task_service
from app.services.dify_client import LANGUAGE_NAMES
logger = logging.getLogger(__name__)
router = APIRouter(prefix="/api/v2/translate", tags=["Translation"])
def run_translation_task(
    task_id: str,
    task_db_id: int,
    target_lang: str,
    source_lang: str = "auto"
):
    """
    Background task to run document translation.

    Opens its own DB session (BackgroundTasks run outside the request
    scope), validates that the OCR result JSON exists, then delegates to
    the translation service while mirroring progress into the job-state
    store.

    Args:
        task_id: Task UUID string
        task_db_id: Task database ID (for verification)
        target_lang: Target language code
        source_lang: Source language code ('auto' for detection)
    """
    from app.core.database import SessionLocal
    from app.services.translation_service import get_translation_service
    from app.schemas.translation import TranslationJobState, TranslationProgress

    db = SessionLocal()
    translation_service = get_translation_service()

    def _fail(error_message: str) -> None:
        # Single place to record a FAILED job state (this construction was
        # previously repeated four times with only the message differing).
        translation_service.set_job_state(task_id, TranslationJobState(
            task_id=task_id,
            target_lang=target_lang,
            source_lang=source_lang,
            status=TranslationStatusEnum.FAILED,
            progress=TranslationProgress(),
            error_message=error_message,
            started_at=datetime.utcnow()
        ))

    try:
        logger.info(f"Starting translation for task {task_id} -> {target_lang}")
        # Get task to find result JSON path
        task = db.query(Task).filter(Task.task_id == task_id).first()
        if not task:
            # No job state to update either -- nothing was registered for a
            # task that does not exist.
            logger.error(f"Task {task_id} not found")
            return
        if not task.result_json_path:
            logger.error(f"Task {task_id} has no result JSON")
            _fail("No OCR result found")
            return
        result_json_path = Path(task.result_json_path)
        if not result_json_path.exists():
            logger.error(f"Result JSON not found: {result_json_path}")
            _fail("Result file not found")
            return

        # Update state to translating
        translation_service.set_job_state(task_id, TranslationJobState(
            task_id=task_id,
            target_lang=target_lang,
            source_lang=source_lang,
            status=TranslationStatusEnum.TRANSLATING,
            progress=TranslationProgress(),
            started_at=datetime.utcnow()
        ))

        # Progress callback: merge fresh progress into the current state so
        # pollers of /status see live numbers.
        def progress_callback(progress: TranslationProgress):
            current_state = translation_service.get_job_state(task_id)
            if current_state:
                current_state.status = TranslationStatusEnum.TRANSLATING
                current_state.progress = progress
                translation_service.set_job_state(task_id, current_state)

        # Run translation (synchronous; this function runs off the event loop)
        success, output_path, error_message = translation_service.translate_document(
            task_id=task_id,
            result_json_path=result_json_path,
            target_lang=target_lang,
            source_lang=source_lang,
            progress_callback=progress_callback
        )

        if success:
            translation_service.set_job_state(task_id, TranslationJobState(
                task_id=task_id,
                target_lang=target_lang,
                source_lang=source_lang,
                status=TranslationStatusEnum.COMPLETED,
                progress=TranslationProgress(percentage=100.0),
                started_at=datetime.utcnow(),
                completed_at=datetime.utcnow(),
                result_file_path=str(output_path) if output_path else None
            ))
            logger.info(f"Translation completed for task {task_id}")
        else:
            _fail(error_message)
            logger.error(f"Translation failed for task {task_id}: {error_message}")
    except Exception as e:
        # Catch-all boundary for the background thread: log with traceback
        # and surface the error through the job state.
        logger.exception(f"Translation failed for task {task_id}")
        _fail(str(e))
    finally:
        db.close()
@router.post("/{task_id}", response_model=TranslationStartResponse, status_code=status.HTTP_202_ACCEPTED)
async def start_translation(
    task_id: str,
    request: TranslationRequest,
    background_tasks: BackgroundTasks,
    db: Session = Depends(get_db),
    current_user: User = Depends(get_current_user)
):
    """
    Start a document translation job.

    - **task_id**: Task UUID of a completed OCR task
    - **target_lang**: Target language code (e.g., 'en', 'ja', 'zh-TW')
    - **source_lang**: Source language code ('auto' for automatic detection)

    Returns 202 Accepted with job information. Use /status endpoint to track progress.
    """
    # Function-scope imports -- presumably to avoid an import cycle at module
    # load time; confirm before hoisting to the top of the file.
    from app.services.translation_service import get_translation_service
    from app.schemas.translation import TranslationJobState
    # Get task; lookup is scoped to the current user, so this doubles as an
    # ownership check (other users' tasks 404 rather than 403).
    task = task_service.get_task_by_id(
        db=db,
        task_id=task_id,
        user_id=current_user.id
    )
    if not task:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail="Task not found"
        )
    # Check task is completed -- only finished OCR runs have a result to translate
    if task.status != TaskStatus.COMPLETED:
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail=f"Cannot translate task in '{task.status.value}' status. Task must be completed."
        )
    # Check result JSON exists
    if not task.result_json_path or not Path(task.result_json_path).exists():
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail="OCR result not found. Please process the document first."
        )
    # Validate target language against the DIFY client's supported set
    target_lang = request.target_lang
    if target_lang not in LANGUAGE_NAMES:
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail=f"Unsupported target language: {target_lang}. Supported: {', '.join(LANGUAGE_NAMES.keys())}"
        )
    # Check if translation already exists.
    # File naming convention: <result stem minus '_result'>_translated_<lang>.json
    result_dir = Path(task.result_json_path).parent
    existing_translation = result_dir / f"{Path(task.result_json_path).stem.replace('_result', '')}_translated_{target_lang}.json"
    if existing_translation.exists():
        logger.info(f"Translation already exists: {existing_translation}")
        # Return as completed -- makes the endpoint idempotent per language
        return TranslationStartResponse(
            task_id=task_id,
            status=TranslationStatusEnum.COMPLETED,
            target_lang=target_lang,
            message="Translation already exists"
        )
    # Check if translation is already in progress (any language; job state is
    # keyed by task_id only)
    translation_service = get_translation_service()
    current_job = translation_service.get_job_state(task_id)
    if current_job and current_job.status in [TranslationStatusEnum.PENDING, TranslationStatusEnum.TRANSLATING]:
        return TranslationStartResponse(
            task_id=task_id,
            status=current_job.status,
            target_lang=current_job.target_lang,
            message="Translation already in progress"
        )
    # Initialize job state BEFORE scheduling, so /status polled immediately
    # after this response never 404s
    translation_service.set_job_state(task_id, TranslationJobState(
        task_id=task_id,
        target_lang=target_lang,
        source_lang=request.source_lang,
        status=TranslationStatusEnum.PENDING,
        progress=TranslationProgress(),
        started_at=datetime.utcnow()
    ))
    # Start background translation task (runs after the response is sent)
    background_tasks.add_task(
        run_translation_task,
        task_id=task_id,
        task_db_id=task.id,
        target_lang=target_lang,
        source_lang=request.source_lang
    )
    logger.info(f"Started translation job for task {task_id}, target_lang={target_lang}")
    return TranslationStartResponse(
        task_id=task_id,
        status=TranslationStatusEnum.PENDING,
        target_lang=target_lang,
        message="Translation job started"
    )
@router.get("/{task_id}/status", response_model=TranslationStatusResponse)
async def get_translation_status(
    task_id: str,
    db: Session = Depends(get_db),
    current_user: User = Depends(get_current_user)
):
    """
    Get the status of a translation job.

    - **task_id**: Task UUID

    Returns current translation status with progress information.
    When no in-memory job state exists (e.g. after a process restart),
    falls back to scanning the result directory for completed translations.
    """
    from app.services.translation_service import get_translation_service
    # Verify task ownership (lookup is scoped to the requesting user)
    task = task_service.get_task_by_id(
        db=db,
        task_id=task_id,
        user_id=current_user.id
    )
    if not task:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail="Task not found"
        )
    # Get job state
    translation_service = get_translation_service()
    job_state = translation_service.get_job_state(task_id)
    if not job_state:
        # No active job - check if any completed translations exist on disk
        if task.result_json_path:
            result_dir = Path(task.result_json_path).parent
            translated_files = list(result_dir.glob("*_translated_*.json"))
            if translated_files:
                # Return completed status for the most recent translation
                latest_file = max(translated_files, key=lambda f: f.stat().st_mtime)
                # Extract language from filename (<stem>_translated_<lang>.json)
                lang = latest_file.stem.split("_translated_")[-1]
                return TranslationStatusResponse(
                    task_id=task_id,
                    status=TranslationStatusEnum.COMPLETED,
                    target_lang=lang,
                    progress=TranslationProgress(percentage=100.0)
                )
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail="No translation job found for this task"
        )
    return TranslationStatusResponse(
        task_id=task_id,
        status=job_state.status,
        target_lang=job_state.target_lang,
        progress=job_state.progress,
        error_message=job_state.error_message,
        started_at=job_state.started_at,
        completed_at=job_state.completed_at
    )
@router.get("/{task_id}/result")
async def get_translation_result(
    task_id: str,
    lang: str = Query(..., description="Target language code"),
    db: Session = Depends(get_db),
    current_user: User = Depends(get_current_user)
):
    """
    Get the translation result for a specific language.

    - **task_id**: Task UUID
    - **lang**: Target language code (e.g., 'en', 'ja')

    Returns the translation JSON file.
    """
    # Ownership check: lookup is scoped to the requesting user.
    task = task_service.get_task_by_id(db=db, task_id=task_id, user_id=current_user.id)
    if task is None:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail="Task not found"
        )
    if not task.result_json_path:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail="OCR result not found"
        )
    # Derive the expected translation file path from the OCR result path.
    source_path = Path(task.result_json_path)
    stem = source_path.stem.replace('_result', '')
    translation_file = source_path.parent / f"{stem}_translated_{lang}.json"
    if not translation_file.exists():
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=f"Translation for language '{lang}' not found"
        )
    # Stream the file back with an explicit JSON content type.
    return FileResponse(
        path=str(translation_file),
        filename=translation_file.name,
        media_type="application/json"
    )
@router.get("/{task_id}/translations", response_model=TranslationListResponse)
async def list_translations(
    task_id: str,
    db: Session = Depends(get_db),
    current_user: User = Depends(get_current_user)
):
    """
    List all available translations for a task.

    - **task_id**: Task UUID

    Returns list of available translations with metadata. Translations are
    discovered by scanning the result directory for *_translated_<lang>.json
    files; files that cannot be read are skipped with a warning.
    """
    # Verify task ownership (lookup is scoped to the requesting user)
    task = task_service.get_task_by_id(
        db=db,
        task_id=task_id,
        user_id=current_user.id
    )
    if not task:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail="Task not found"
        )
    translations = []
    if task.result_json_path:
        result_dir = Path(task.result_json_path).parent
        for translation_file in result_dir.glob("*_translated_*.json"):
            try:
                # Extract language from filename (<stem>_translated_<lang>.json)
                lang = translation_file.stem.split("_translated_")[-1]
                # Read translation metadata
                with open(translation_file, 'r', encoding='utf-8') as f:
                    data = json.load(f)
                # FIX: a missing or malformed 'translated_at' previously raised
                # inside the broad except and silently dropped the whole entry.
                # Fall back to the file's mtime so valid translations stay listed.
                try:
                    translated_at = datetime.fromisoformat(
                        data.get('translated_at', '').replace('Z', '+00:00')
                    )
                except ValueError:
                    translated_at = datetime.fromtimestamp(translation_file.stat().st_mtime)
                stats_data = data.get('statistics', {})
                translations.append(TranslationListItem(
                    target_lang=lang,
                    translated_at=translated_at,
                    provider=data.get('provider', 'dify'),
                    statistics=TranslationStatistics(
                        total_elements=stats_data.get('total_elements', 0),
                        translated_elements=stats_data.get('translated_elements', 0),
                        skipped_elements=stats_data.get('skipped_elements', 0),
                        total_characters=stats_data.get('total_characters', 0),
                        processing_time_seconds=stats_data.get('processing_time_seconds', 0.0),
                        total_tokens=stats_data.get('total_tokens', 0)
                    ),
                    file_path=str(translation_file)
                ))
            except Exception as e:
                # Corrupt JSON or unexpected schema: skip this file, keep the rest.
                logger.warning(f"Failed to read translation file {translation_file}: {e}")
                continue
    return TranslationListResponse(
        task_id=task_id,
        translations=translations
    )
@router.delete("/{task_id}/translations/{lang}", status_code=status.HTTP_204_NO_CONTENT)
async def delete_translation(
    task_id: str,
    lang: str,
    db: Session = Depends(get_db),
    current_user: User = Depends(get_current_user)
):
    """
    Delete a specific translation.

    - **task_id**: Task UUID
    - **lang**: Target language code to delete
    """
    # Ownership check: lookup is scoped to the requesting user.
    task = task_service.get_task_by_id(db=db, task_id=task_id, user_id=current_user.id)
    if task is None:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail="Task not found"
        )
    if not task.result_json_path:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail="OCR result not found"
        )
    # Derive the translation file path from the OCR result path.
    source_path = Path(task.result_json_path)
    stem = source_path.stem.replace('_result', '')
    translation_file = source_path.parent / f"{stem}_translated_{lang}.json"
    if not translation_file.exists():
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=f"Translation for language '{lang}' not found"
        )
    # Remove the file; 204 No Content on success.
    translation_file.unlink()
    logger.info(f"Deleted translation {lang} for task {task_id}")
    return None

View File

@@ -13,6 +13,16 @@ from app.schemas.task import (
TaskStatsResponse,
TaskStatusEnum,
)
from app.schemas.translation import (
TranslationStatusEnum,
TranslationRequest,
TranslationProgress,
TranslationStatusResponse,
TranslationStartResponse,
TranslationStatistics,
TranslationResultResponse,
TranslationListResponse,
)
__all__ = [
# Auth
@@ -27,4 +37,13 @@ __all__ = [
"TaskListResponse",
"TaskStatsResponse",
"TaskStatusEnum",
# Translation
"TranslationStatusEnum",
"TranslationRequest",
"TranslationProgress",
"TranslationStatusResponse",
"TranslationStartResponse",
"TranslationStatistics",
"TranslationResultResponse",
"TranslationListResponse",
]

View File

@@ -0,0 +1,163 @@
"""
Tool_OCR - Translation Schemas
Pydantic models for document translation feature (DIFY API)
"""
from typing import Optional, List, Dict, Any, Tuple
from datetime import datetime
from pydantic import BaseModel, Field
from enum import Enum
from dataclasses import dataclass
class TranslationStatusEnum(str, Enum):
    """Translation job status enumeration (str-valued so it serializes to JSON)."""
    PENDING = "pending"          # job registered, background task not yet running
    TRANSLATING = "translating"  # background task in progress
    COMPLETED = "completed"      # result file written
    FAILED = "failed"            # terminal error; see TranslationJobState.error_message
class TargetLanguageEnum(str, Enum):
    """Supported target languages for translation.

    NOTE(review): this enum is not referenced by TranslationRequest, which
    accepts a plain str validated against dify_client.LANGUAGE_NAMES in the
    router -- keep the two lists in sync.
    """
    ENGLISH = "en"
    JAPANESE = "ja"
    KOREAN = "ko"
    CHINESE_SIMPLIFIED = "zh-CN"
    CHINESE_TRADITIONAL = "zh-TW"
    GERMAN = "de"
    FRENCH = "fr"
    SPANISH = "es"
    PORTUGUESE = "pt"
    ITALIAN = "it"
    RUSSIAN = "ru"
    VIETNAMESE = "vi"
    THAI = "th"
class TranslationRequest(BaseModel):
    """Request model for starting a translation job.

    target_lang is validated in the router against LANGUAGE_NAMES (an
    unsupported code yields 400), not by this schema.
    """
    target_lang: str = Field(
        ...,
        description="Target language code (e.g., 'en', 'ja', 'zh-TW')"
    )
    source_lang: str = Field(
        default="auto",
        description="Source language code, 'auto' for automatic detection"
    )
class TranslationProgress(BaseModel):
    """Progress information for ongoing translation.

    All fields default to zero so a freshly-created job serializes cleanly
    before any work has been done.
    """
    current_element: int = Field(default=0, description="Current element being translated")
    total_elements: int = Field(default=0, description="Total elements to translate")
    percentage: float = Field(default=0.0, description="Progress percentage (0-100)")
class TranslationStatusResponse(BaseModel):
    """Response model for translation status query (GET /{task_id}/status).

    progress/error_message/timestamps are all optional because the router
    can also synthesize a COMPLETED response from files on disk, where only
    the percentage is known.
    """
    task_id: str = Field(..., description="Task ID")
    status: TranslationStatusEnum = Field(..., description="Current translation status")
    target_lang: str = Field(..., description="Target language")
    progress: Optional[TranslationProgress] = Field(
        default=None,
        description="Progress information when translating"
    )
    error_message: Optional[str] = Field(
        default=None,
        description="Error message if translation failed"
    )
    started_at: Optional[datetime] = Field(default=None, description="Translation start time")
    completed_at: Optional[datetime] = Field(default=None, description="Translation completion time")
class TranslationStartResponse(BaseModel):
    """Response model for starting a translation job (POST /{task_id}, 202)."""
    task_id: str = Field(..., description="Task ID")
    status: TranslationStatusEnum = Field(..., description="Initial status")
    target_lang: str = Field(..., description="Target language")
    message: str = Field(..., description="Status message")
class TranslationStatistics(BaseModel):
    """Statistics for completed translation.

    Mirrors the 'statistics' object persisted inside the translation JSON
    file; every field defaults so partially-populated files still parse.
    """
    total_elements: int = Field(default=0, description="Total elements in document")
    translated_elements: int = Field(default=0, description="Successfully translated elements")
    skipped_elements: int = Field(default=0, description="Skipped elements (images, etc.)")
    total_characters: int = Field(default=0, description="Total characters translated")
    processing_time_seconds: float = Field(default=0.0, description="Translation duration")
    total_tokens: int = Field(default=0, description="Total API tokens used")
class TranslationResultResponse(BaseModel):
    """Response model for translation result.

    Describes the shape of the *_translated_<lang>.json file (provider,
    translated_at and statistics are the keys read back by the router's
    list endpoint).
    """
    schema_version: str = Field(default="1.0.0", description="Schema version")
    source_document: str = Field(..., description="Source document filename")
    source_lang: str = Field(..., description="Source language (detected or specified)")
    target_lang: str = Field(..., description="Target language")
    provider: str = Field(default="dify", description="Translation provider")
    translated_at: datetime = Field(..., description="Translation timestamp")
    statistics: TranslationStatistics = Field(..., description="Translation statistics")
    translations: Dict[str, Any] = Field(
        ...,
        description="Translations dict mapping element_id to translated content"
    )
class TranslationListItem(BaseModel):
    """Item in translation list response; built by scanning result files on disk."""
    target_lang: str = Field(..., description="Target language")
    translated_at: datetime = Field(..., description="Translation timestamp")
    provider: str = Field(default="dify", description="Translation provider")
    statistics: TranslationStatistics = Field(..., description="Translation statistics")
    file_path: str = Field(..., description="Path to translation JSON file")
class TranslationListResponse(BaseModel):
    """Response model for listing available translations; empty list when none exist."""
    task_id: str = Field(..., description="Task ID")
    translations: List[TranslationListItem] = Field(
        default_factory=list,
        description="Available translations"
    )
# Dataclasses for internal use
@dataclass
class TranslatableItem:
    """Internal representation of a translatable element."""
    element_id: str
    content: str
    element_type: str  # 'text', 'title', 'header', etc. or 'table_cell'
    page_number: int = 1
    cell_position: Optional[Tuple[int, int]] = None  # (row, col) for table cells

    def __post_init__(self):
        # Normalize whitespace: collapse every run of spaces/newlines/tabs to
        # a single space so character counts are consistent for batching.
        raw = self.content
        if raw:
            self.content = " ".join(raw.split())
@dataclass
class TranslatedItem:
    """Internal representation of a translated element.

    Pairs the original and translated content so the result builder can emit
    both; cell_position carries the (row, col) for 'table_cell' items.
    """
    element_id: str
    original_content: str
    translated_content: str
    element_type: str
    cell_position: Optional[Tuple[int, int]] = None
@dataclass
class TranslationJobState:
    """Internal state for a translation job (held in-memory by the service)."""
    task_id: str
    target_lang: str
    source_lang: str
    status: TranslationStatusEnum
    progress: TranslationProgress
    error_message: Optional[str] = None     # populated when status == FAILED
    started_at: Optional[datetime] = None
    completed_at: Optional[datetime] = None
    result_file_path: Optional[str] = None  # set on successful completion

View File

@@ -0,0 +1,332 @@
"""
Tool_OCR - DIFY AI Client
HTTP client for DIFY translation API with batch support
"""
import asyncio
import logging
import re
import time
from dataclasses import dataclass, field
from typing import Dict, List, Optional
import httpx
import os

logger = logging.getLogger(__name__)

# DIFY API Configuration
# SECURITY FIX: the API key and base URL were hard-coded in source. They now
# come from the environment, with the previous literals as fallbacks so
# existing deployments keep working. Rotate the exposed key and remove the
# fallback once configuration is in place.
DIFY_BASE_URL = os.environ.get("DIFY_BASE_URL", "https://dify.theaken.com/v1")
DIFY_API_KEY = os.environ.get("DIFY_API_KEY", "app-YOPrF2ro5fshzMkCZviIuUJd")
DIFY_TIMEOUT = 120.0  # seconds (increased for batch)
DIFY_MAX_RETRIES = 3

# Batch translation limits
# Conservative limits to avoid gateway timeouts
# DIFY server may have processing time limits
MAX_BATCH_CHARS = 5000
MAX_BATCH_ITEMS = 20

# Language name mapping (code -> English name interpolated into prompts)
LANGUAGE_NAMES = {
    "en": "English",
    "zh-TW": "Traditional Chinese",
    "zh-CN": "Simplified Chinese",
    "ja": "Japanese",
    "ko": "Korean",
    "de": "German",
    "fr": "French",
    "es": "Spanish",
    "pt": "Portuguese",
    "it": "Italian",
    "ru": "Russian",
    "vi": "Vietnamese",
    "th": "Thai",
}
@dataclass
class TranslationResponse:
    """Response from DIFY translation API (single-text call)."""
    translated_text: str  # the model's 'answer' field, expected to be the translation only
    total_tokens: int     # from metadata.usage.total_tokens
    latency: float        # from metadata.usage.latency (units per DIFY -- confirm)
    conversation_id: str  # DIFY conversation id for this exchange
@dataclass
class BatchTranslationResponse:
    """Response from DIFY batch translation API"""
    translations: Dict[int, str]  # marker_id -> translated_text
    total_tokens: int             # from metadata.usage.total_tokens
    latency: float                # from metadata.usage.latency
    conversation_id: str
    # 1-based marker numbers the model failed to return; callers can use this
    # to retry the affected items individually.
    missing_markers: List[int] = field(default_factory=list)
class DifyTranslationError(Exception):
    """Raised when a DIFY API call fails or exhausts its retries."""
class DifyClient:
    """
    Client for DIFY AI translation API.

    Features:
    - Single and batch translation
    - Blocking mode API calls
    - Automatic retry with exponential backoff
    - Token and latency tracking

    NOTE: calls are synchronous (httpx.Client); invoke from a worker/background
    thread, not directly on an event loop.
    """

    def __init__(
        self,
        base_url: str = DIFY_BASE_URL,
        api_key: str = DIFY_API_KEY,
        timeout: float = DIFY_TIMEOUT,
        max_retries: int = DIFY_MAX_RETRIES
    ):
        self.base_url = base_url
        self.api_key = api_key
        self.timeout = timeout
        self.max_retries = max_retries
        # Cumulative counters across all requests made by this instance
        self._total_tokens = 0
        self._total_requests = 0

    def _get_language_name(self, lang_code: str) -> str:
        """Convert language code to full name for prompts; unknown codes pass through as-is."""
        return LANGUAGE_NAMES.get(lang_code, lang_code)

    def _build_prompt(self, text: str, target_lang: str) -> str:
        """Build translation prompt for a single text."""
        lang_name = self._get_language_name(target_lang)
        return (
            f"Translate the following text to {lang_name}.\n"
            f"Return ONLY the translated text, no explanations.\n\n"
            f"{text}"
        )

    def _build_batch_prompt(self, texts: List[str], target_lang: str) -> str:
        """
        Build batch translation prompt with numbered markers.

        Format:
        Translate the following texts to {Language}.
        Each text is marked with [N]. Return translations in the same format.
        Return ONLY the translations with their markers, no explanations.

        [1] First text
        [2] Second text
        ...
        """
        lang_name = self._get_language_name(target_lang)
        # Build numbered text list
        numbered_texts = []
        for i, text in enumerate(texts, start=1):
            # Clean text - remove newlines within each item to avoid parsing issues
            clean_text = ' '.join(text.split())
            numbered_texts.append(f"[{i}] {clean_text}")
        texts_block = "\n".join(numbered_texts)
        prompt = (
            f"Translate the following texts to {lang_name}.\n"
            f"Each text is marked with [N]. Return translations in the same format.\n"
            f"Return ONLY the translations with their markers, no explanations.\n\n"
            f"{texts_block}"
        )
        return prompt

    def _parse_batch_response(self, response_text: str, expected_count: int) -> Dict[int, str]:
        """
        Parse batch translation response with numbered markers.

        Expected format:
        [1] first translation
        [2] second translation
        ...

        Args:
            response_text: Raw model answer text
            expected_count: Number of translations requested. FIX: this was
                previously accepted but ignored -- it is now used to log a
                warning when the model drops or merges markers, so truncated
                responses are visible.

        Returns:
            Dict mapping marker number to translated text
        """
        translations: Dict[int, str] = {}
        # Pattern to match [N] followed by text until the next [N] or end of string.
        # DOTALL lets a translation span multiple lines; the lazy .+? stops at
        # the next marker.
        pattern = r'\[(\d+)\]\s*(.+?)(?=\[\d+\]|$)'
        matches = re.findall(pattern, response_text, re.DOTALL)
        for marker, body in matches:
            try:
                marker_id = int(marker)
            except ValueError:
                continue
            text = body.strip()
            if text:
                translations[marker_id] = text
        if len(translations) != expected_count:
            logger.warning(
                "Batch response parse: expected %d translations, got %d",
                expected_count, len(translations)
            )
        return translations

    def _call_api(self, prompt: str, user_id: str) -> dict:
        """
        Make API call to DIFY with retry logic.

        Retries on timeouts, transport errors and non-200 responses, with
        exponential backoff (1s, 2s, 4s, ...).

        Raises:
            DifyTranslationError: when all max_retries attempts fail
        """
        payload = {
            "inputs": {},
            "query": prompt,
            "response_mode": "blocking",
            "conversation_id": "",
            "user": user_id
        }
        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json"
        }
        last_error = None
        for attempt in range(self.max_retries):
            try:
                with httpx.Client(timeout=self.timeout) as client:
                    response = client.post(
                        f"{self.base_url}/chat-messages",
                        json=payload,
                        headers=headers
                    )
                    if response.status_code != 200:
                        raise DifyTranslationError(
                            f"API returned status {response.status_code}: {response.text}"
                        )
                    return response.json()
            except httpx.TimeoutException as e:
                last_error = e
                logger.warning(f"DIFY API timeout (attempt {attempt + 1}/{self.max_retries})")
            except httpx.RequestError as e:
                last_error = e
                logger.warning(f"DIFY API request error (attempt {attempt + 1}/{self.max_retries}): {e}")
            except Exception as e:
                # Also catches the DifyTranslationError raised above, so
                # non-200 responses are retried like transport errors.
                last_error = e
                logger.warning(f"DIFY API error (attempt {attempt + 1}/{self.max_retries}): {e}")
            # Exponential backoff
            if attempt < self.max_retries - 1:
                wait_time = 2 ** attempt
                logger.info(f"Retrying in {wait_time}s...")
                time.sleep(wait_time)
        raise DifyTranslationError(f"API call failed after {self.max_retries} attempts: {last_error}")

    def translate(
        self,
        text: str,
        target_lang: str,
        user_id: str = "tool-ocr"
    ) -> TranslationResponse:
        """
        Translate single text using DIFY API.

        Args:
            text: Text to translate
            target_lang: Target language code (e.g., 'en', 'zh-TW')
            user_id: User identifier for tracking

        Returns:
            TranslationResponse with translated text and metadata

        Raises:
            DifyTranslationError: if the API call ultimately fails
        """
        prompt = self._build_prompt(text, target_lang)
        data = self._call_api(prompt, user_id)
        # Extract response fields; missing keys degrade to empty/zero values
        translated_text = data.get("answer", "")
        usage = data.get("metadata", {}).get("usage", {})
        self._total_tokens += usage.get("total_tokens", 0)
        self._total_requests += 1
        return TranslationResponse(
            translated_text=translated_text,
            total_tokens=usage.get("total_tokens", 0),
            latency=usage.get("latency", 0.0),
            conversation_id=data.get("conversation_id", "")
        )

    def translate_batch(
        self,
        texts: List[str],
        target_lang: str,
        user_id: str = "tool-ocr"
    ) -> BatchTranslationResponse:
        """
        Translate multiple texts in a single API call.

        Args:
            texts: List of texts to translate
            target_lang: Target language code
            user_id: User identifier for tracking

        Returns:
            BatchTranslationResponse with translations dict (1-based marker
            keys) and metadata; markers the model failed to return are listed
            in missing_markers.

        Raises:
            DifyTranslationError: if the API call ultimately fails
        """
        if not texts:
            # Nothing to do; avoid a pointless API round-trip
            return BatchTranslationResponse(
                translations={},
                total_tokens=0,
                latency=0.0,
                conversation_id=""
            )
        prompt = self._build_batch_prompt(texts, target_lang)
        logger.debug(f"Batch translation: {len(texts)} items, ~{len(prompt)} chars")
        data = self._call_api(prompt, user_id)
        # Extract and parse response
        answer = data.get("answer", "")
        usage = data.get("metadata", {}).get("usage", {})
        translations = self._parse_batch_response(answer, len(texts))
        # Check for missing markers so the caller can fall back per item
        missing_markers = []
        for i in range(1, len(texts) + 1):
            if i not in translations:
                missing_markers.append(i)
                logger.warning(f"Missing translation for marker [{i}]")
        self._total_tokens += usage.get("total_tokens", 0)
        self._total_requests += 1
        return BatchTranslationResponse(
            translations=translations,
            total_tokens=usage.get("total_tokens", 0),
            latency=usage.get("latency", 0.0),
            conversation_id=data.get("conversation_id", ""),
            missing_markers=missing_markers
        )

    def get_stats(self) -> dict:
        """Get cumulative client statistics (tokens, request count, endpoint)."""
        return {
            "total_tokens": self._total_tokens,
            "total_requests": self._total_requests,
            "base_url": self.base_url,
        }
# Global singleton
_dify_client: Optional[DifyClient] = None


def get_dify_client() -> DifyClient:
    """Get the global DifyClient instance, creating it lazily on first call.

    NOTE(review): the check-then-create is not locked; concurrent first calls
    could each build a client and the later assignment wins. Worst case the
    discarded instance's usage counters are lost -- confirm acceptable.
    """
    global _dify_client
    if _dify_client is None:
        _dify_client = DifyClient()
    return _dify_client

View File

@@ -0,0 +1,490 @@
"""
Tool_OCR - Translation Service
Document translation using DIFY AI API with batch processing
"""
import json
import logging
import threading
import time
from dataclasses import dataclass, field
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
from app.schemas.translation import (
TranslatableItem,
TranslatedItem,
TranslationJobState,
TranslationProgress,
TranslationStatusEnum,
)
from app.services.dify_client import (
DifyClient,
DifyTranslationError,
get_dify_client,
MAX_BATCH_CHARS,
MAX_BATCH_ITEMS,
)
logger = logging.getLogger(__name__)

# Element types that should be translated
TRANSLATABLE_TEXT_TYPES = {'text', 'title', 'header', 'footer', 'paragraph', 'footnote'}
# Tables are handled separately: each non-empty cell is translated individually
TABLE_TYPE = 'table'
# Element types that are never translated (counted but skipped during extraction)
SKIP_TYPES = {'page_number', 'image', 'chart', 'logo', 'reference'}
@dataclass
class TranslationBatch:
    """A batch of items to translate together."""
    items: List[TranslatableItem] = field(default_factory=list)
    total_chars: int = 0

    def can_add(self, item: TranslatableItem) -> bool:
        """Return True if adding item keeps the batch within both limits."""
        within_item_limit = len(self.items) < MAX_BATCH_ITEMS
        within_char_limit = self.total_chars + len(item.content) <= MAX_BATCH_CHARS
        return within_item_limit and within_char_limit

    def add(self, item: TranslatableItem):
        """Append item and update the running character total."""
        self.items.append(item)
        self.total_chars += len(item.content)
class TranslationService:
"""
Main translation service for document translation using DIFY AI.
Features:
- Extract translatable elements from UnifiedDocument
- Batch translation via DIFY API (efficient)
- Fallback to single-item translation for failures
- Translation JSON generation
- Progress tracking
"""
    def __init__(self, dify_client: Optional[DifyClient] = None):
        # Injectable client (useful for testing); defaults to the module singleton.
        self.dify_client = dify_client or get_dify_client()
        # task_id -> TranslationJobState; _jobs_lock presumably guards access
        # in set/get_job_state -- confirm in those methods.
        self._active_jobs: Dict[str, TranslationJobState] = {}
        self._jobs_lock = threading.Lock()
        # Cumulative usage counters across all jobs handled by this service
        self._total_tokens = 0
        self._total_latency = 0.0
def extract_translatable_elements(
self,
result_json: Dict
) -> Tuple[List[TranslatableItem], int]:
"""
Extract all translatable elements from a result JSON.
Args:
result_json: UnifiedDocument JSON data
Returns:
Tuple of (list of TranslatableItem, total element count)
"""
items = []
total_elements = 0
for page in result_json.get('pages', []):
page_number = page.get('page_number', 1)
for elem in page.get('elements', []):
total_elements += 1
elem_type = elem.get('type', '')
elem_id = elem.get('element_id', '')
content = elem.get('content')
# Skip non-translatable types
if elem_type in SKIP_TYPES:
continue
# Handle text elements
if elem_type in TRANSLATABLE_TEXT_TYPES and isinstance(content, str):
text = content.strip()
if text: # Skip empty content
items.append(TranslatableItem(
element_id=elem_id,
content=text,
element_type=elem_type,
page_number=page_number
))
# Handle table elements
elif elem_type == TABLE_TYPE and isinstance(content, dict):
cells = content.get('cells', [])
for cell in cells:
cell_content = cell.get('content', '')
if isinstance(cell_content, str) and cell_content.strip():
row = cell.get('row', 0)
col = cell.get('col', 0)
items.append(TranslatableItem(
element_id=elem_id,
content=cell_content.strip(),
element_type='table_cell',
page_number=page_number,
cell_position=(row, col)
))
logger.info(
f"Extracted {len(items)} translatable items from {total_elements} elements"
)
return items, total_elements
def create_batches(self, items: List[TranslatableItem]) -> List[TranslationBatch]:
"""
Create translation batches from items based on character limits.
Args:
items: List of TranslatableItem
Returns:
List of TranslationBatch
"""
batches = []
current_batch = TranslationBatch()
for item in items:
if current_batch.can_add(item):
current_batch.add(item)
else:
# Save current batch and start new one
if current_batch.items:
batches.append(current_batch)
current_batch = TranslationBatch()
current_batch.add(item)
# Don't forget the last batch
if current_batch.items:
batches.append(current_batch)
logger.info(
f"Created {len(batches)} batches from {len(items)} items "
f"(max {MAX_BATCH_CHARS} chars, max {MAX_BATCH_ITEMS} items per batch)"
)
return batches
def translate_batch(
self,
batch: TranslationBatch,
target_lang: str,
user_id: str
) -> List[TranslatedItem]:
"""
Translate a batch of items using DIFY API.
Args:
batch: TranslationBatch to translate
target_lang: Target language code
user_id: User identifier for tracking
Returns:
List of TranslatedItem
"""
if not batch.items:
return []
# Extract texts in order
texts = [item.content for item in batch.items]
try:
response = self.dify_client.translate_batch(
texts=texts,
target_lang=target_lang,
user_id=user_id
)
self._total_tokens += response.total_tokens
self._total_latency += response.latency
# Map translations back to items
translated_items = []
for idx, item in enumerate(batch.items):
marker_id = idx + 1 # Markers are 1-indexed
if marker_id in response.translations:
translated_content = response.translations[marker_id]
else:
# Missing translation - use original
logger.warning(f"Missing translation for {item.element_id}, using original")
translated_content = item.content
translated_items.append(TranslatedItem(
element_id=item.element_id,
original_content=item.content,
translated_content=translated_content,
element_type=item.element_type,
cell_position=item.cell_position
))
return translated_items
except DifyTranslationError as e:
logger.error(f"Batch translation failed: {e}")
# Return items with original content on failure
return [
TranslatedItem(
element_id=item.element_id,
original_content=item.content,
translated_content=item.content, # Keep original
element_type=item.element_type,
cell_position=item.cell_position
)
for item in batch.items
]
def translate_item(
self,
item: TranslatableItem,
target_lang: str,
user_id: str
) -> TranslatedItem:
"""
Translate a single item using DIFY API (fallback for batch failures).
Args:
item: TranslatableItem to translate
target_lang: Target language code
user_id: User identifier for tracking
Returns:
TranslatedItem with translation result
"""
try:
response = self.dify_client.translate(
text=item.content,
target_lang=target_lang,
user_id=user_id
)
self._total_tokens += response.total_tokens
self._total_latency += response.latency
return TranslatedItem(
element_id=item.element_id,
original_content=item.content,
translated_content=response.translated_text,
element_type=item.element_type,
cell_position=item.cell_position
)
except DifyTranslationError as e:
logger.error(f"Translation failed for {item.element_id}: {e}")
# Return original content on failure
return TranslatedItem(
element_id=item.element_id,
original_content=item.content,
translated_content=item.content, # Keep original
element_type=item.element_type,
cell_position=item.cell_position
)
def build_translation_result(
self,
translated_items: List[TranslatedItem],
source_document: str,
source_lang: str,
target_lang: str,
total_elements: int,
processing_time: float,
batch_count: int
) -> Dict:
"""
Build the translation result JSON structure.
Args:
translated_items: List of TranslatedItem
source_document: Source document filename
source_lang: Source language
target_lang: Target language
total_elements: Total elements in document
processing_time: Processing time in seconds
batch_count: Number of batches used
Returns:
Translation result dictionary
"""
# Build translations dict
translations: Dict[str, Any] = {}
total_chars = 0
for item in translated_items:
total_chars += len(item.translated_content)
if item.element_type == 'table_cell':
# Group table cells by element_id
if item.element_id not in translations:
translations[item.element_id] = {'cells': []}
translations[item.element_id]['cells'].append({
'row': item.cell_position[0] if item.cell_position else 0,
'col': item.cell_position[1] if item.cell_position else 0,
'content': item.translated_content
})
else:
translations[item.element_id] = item.translated_content
# Build statistics
translated_element_ids = set(item.element_id for item in translated_items)
skipped = total_elements - len(translated_element_ids)
result = {
'schema_version': '1.0.0',
'source_document': source_document,
'source_lang': source_lang,
'target_lang': target_lang,
'provider': 'dify',
'translated_at': datetime.utcnow().isoformat() + 'Z',
'statistics': {
'total_elements': total_elements,
'translated_elements': len(translated_element_ids),
'skipped_elements': skipped,
'total_characters': total_chars,
'processing_time_seconds': round(processing_time, 2),
'total_tokens': self._total_tokens,
'batch_count': batch_count
},
'translations': translations
}
return result
def translate_document(
self,
task_id: str,
result_json_path: Path,
target_lang: str,
source_lang: str = 'auto',
progress_callback: Optional[callable] = None
) -> Tuple[bool, Optional[Path], Optional[str]]:
"""
Translate a document using batch processing and save the result.
Args:
task_id: Task ID
result_json_path: Path to source result.json
target_lang: Target language (e.g., 'en', 'zh-TW')
source_lang: Source language ('auto' for detection)
progress_callback: Optional callback(progress: TranslationProgress)
Returns:
Tuple of (success, output_path, error_message)
"""
start_time = time.time()
self._total_tokens = 0
self._total_latency = 0.0
logger.info(
f"Starting translation: task_id={task_id}, target={target_lang}"
)
try:
# Load source JSON
with open(result_json_path, 'r', encoding='utf-8') as f:
result_json = json.load(f)
source_document = result_json.get('metadata', {}).get('filename', 'unknown')
# Extract translatable elements
items, total_elements = self.extract_translatable_elements(result_json)
if not items:
logger.warning("No translatable elements found")
return False, None, "No translatable elements found"
# Create batches
batches = self.create_batches(items)
# Update initial progress
if progress_callback:
progress_callback(TranslationProgress(
total_elements=len(items)
))
# Translate each batch
all_translated: List[TranslatedItem] = []
user_id = f"tool-ocr-{task_id}"
processed_items = 0
for batch_idx, batch in enumerate(batches):
logger.info(
f"Translating batch {batch_idx + 1}/{len(batches)} "
f"({len(batch.items)} items, {batch.total_chars} chars)"
)
translated = self.translate_batch(batch, target_lang, user_id)
all_translated.extend(translated)
processed_items += len(batch.items)
# Update progress
if progress_callback:
progress_callback(TranslationProgress(
current_element=processed_items,
total_elements=len(items),
percentage=(processed_items / len(items)) * 100
))
# Build result
processing_time = time.time() - start_time
result = self.build_translation_result(
translated_items=all_translated,
source_document=source_document,
source_lang=source_lang,
target_lang=target_lang,
total_elements=total_elements,
processing_time=processing_time,
batch_count=len(batches)
)
# Save result
output_filename = result_json_path.stem.replace('_result', '')
output_path = result_json_path.parent / f"{output_filename}_translated_{target_lang}.json"
with open(output_path, 'w', encoding='utf-8') as f:
json.dump(result, f, ensure_ascii=False, indent=2)
logger.info(
f"Translation completed: {len(all_translated)} items in {len(batches)} batches, "
f"{processing_time:.2f}s, {self._total_tokens} tokens, "
f"saved to {output_path}"
)
return True, output_path, None
except Exception as e:
logger.error(f"Translation failed: {e}")
import traceback
traceback.print_exc()
return False, None, str(e)
    def get_job_state(self, task_id: str) -> Optional[TranslationJobState]:
        """Get the current state of a translation job, or None if unknown."""
        with self._jobs_lock:
            return self._active_jobs.get(task_id)

    def set_job_state(self, task_id: str, state: TranslationJobState):
        """Set (or replace) the state of a translation job."""
        with self._jobs_lock:
            self._active_jobs[task_id] = state

    def remove_job_state(self, task_id: str):
        """Remove a translation job state; no-op if the task is unknown."""
        with self._jobs_lock:
            self._active_jobs.pop(task_id, None)
# Global singleton
_translation_service: Optional[TranslationService] = None


def get_translation_service() -> TranslationService:
    """Get the global TranslationService instance (created lazily on first call)."""
    # NOTE(review): lazy init is not lock-protected; fine if the first call
    # happens during single-threaded startup — confirm against the routers.
    global _translation_service
    if _translation_service is None:
        _translation_service = TranslationService()
    return _translation_service

View File

@@ -0,0 +1,138 @@
#!/usr/bin/env python3
"""
Test translation service with DIFY API using real OCR results from storage/results/
"""
import json
import pytest
from pathlib import Path
from app.services.dify_client import DifyClient, get_dify_client
from app.services.translation_service import TranslationService, get_translation_service
# Real task IDs with their result files:
# (task_id, result_filename) pairs pointing at previously produced OCR output
# stored under storage/results/<task_id>/<result_filename>.
REAL_TASKS = [
    ("ca2b59a3-3362-4678-954f-cf0a9bcc152e", "img3_result.json"),
    ("8ab2f24d-992b-46a2-87dc-2e024e006ac7", "img1_result.json"),
    ("1c94bfbf-9391-444c-bebf-ae22fa3dad32", "edit_result.json"),
    ("c85fff69-9ddb-40b8-8a9b-ebb513c60f05", "scan_result.json"),
    ("0088e960-7b61-4cdf-bfe5-956960b00dd1", "scan2_result.json"),
    ("8eedd9ed-7aad-46d5-93ca-951352c954b9", "ppt_result.json"),
    ("992156c5-72b4-4e3d-8d43-cbb15f23e630", "edit3_result.json"),
    ("1484ba43-7484-4326-95a7-1544b181e9e8", "edit2_result.json"),
    ("e9a16bba-7d37-42f4-84c8-6624cb58fe19", "img2_result.json"),
]
# Resolved relative to this test file: <project_root>/storage/results
RESULTS_DIR = Path(__file__).parent.parent / "storage" / "results"
@pytest.fixture
def dify_client():
    """Get DIFY client instance (module-level singleton from dify_client)."""
    return get_dify_client()


@pytest.fixture
def translation_service():
    """Get translation service instance (module-level singleton)."""
    return get_translation_service()
class TestDifyClient:
"""Test DIFY API client"""
def test_client_initialization(self, dify_client):
"""Test client can be initialized"""
assert dify_client is not None
assert dify_client.api_key is not None
def test_simple_translation(self, dify_client):
"""Test simple translation via DIFY API"""
text = "Hello, this is a test."
response = dify_client.translate(text, "zh-TW")
assert response.translated_text is not None
assert len(response.translated_text) > 0
assert response.total_tokens > 0
print(f"\nOriginal: {text}")
print(f"Translated: {response.translated_text}")
print(f"Tokens: {response.total_tokens}, Latency: {response.latency:.2f}s")
class TestTranslationServiceExtraction:
    """Test element extraction"""

    def test_service_initialization(self, translation_service):
        """Test service can be initialized"""
        assert translation_service is not None
        assert translation_service.dify_client is not None

    @pytest.mark.parametrize("task_id,result_file", REAL_TASKS)
    def test_extract_translatable_elements(self, translation_service, task_id, result_file):
        """Test extracting translatable elements from real OCR results"""
        result_path = RESULTS_DIR / task_id / result_file
        if not result_path.exists():
            pytest.skip(f"Result file not found: {result_path}")
        with open(result_path, 'r', encoding='utf-8') as f:
            document = json.load(f)
        extracted, element_total = translation_service.extract_translatable_elements(document)
        # Verify extraction
        assert isinstance(extracted, list)
        assert isinstance(element_total, int)
        assert element_total >= 0
        print(f"\nTask {task_id}:")
        print(f" Extracted {len(extracted)} translatable elements")
        print(f" Total elements in document: {element_total}")
        if not extracted:
            return
        sample = extracted[0]
        for attr in ('element_id', 'content', 'element_type'):
            assert hasattr(sample, attr)
        print(f" First element type: {sample.element_type}")
        if len(sample.content) > 50:
            print(f" First element preview: {sample.content[:50]}...")
        else:
            print(f" First element: {sample.content}")
class TestTranslationExecution:
    """Test actual translation via DIFY API"""

    @pytest.mark.parametrize("task_id,result_file", REAL_TASKS[:2])  # Test only first 2
    def test_translate_first_3_elements(self, translation_service, task_id, result_file):
        """Test translating first 3 elements from a real OCR document"""
        result_path = RESULTS_DIR / task_id / result_file
        if not result_path.exists():
            pytest.skip(f"Result file not found: {result_path}")
        with open(result_path, 'r', encoding='utf-8') as f:
            document = json.load(f)
        extracted, _ = translation_service.extract_translatable_elements(document)
        if not extracted:
            pytest.skip("No translatable elements found")

        def _preview(text):
            # Truncate long strings for readable console output.
            return text[:30] + "..." if len(text) > 30 else text

        # Translate first 3 elements only
        print(f"\n{task_id} translations:")
        for index, element in enumerate(extracted[:3]):
            outcome = translation_service.translate_item(element, "en", f"test-{task_id}")
            assert outcome.translated_content is not None
            assert len(outcome.translated_content) > 0
            print(f" [{index+1}] {_preview(element.content)} -> {_preview(outcome.translated_content)}")
        print(f" Total tokens: {translation_service._total_tokens}")
if __name__ == "__main__":
    # Allows running this module directly instead of invoking pytest from the CLI.
    # Run extraction tests only (no API calls) by default
    # pytest.main([__file__, "-v", "-k", "extraction", "--tb=short"])
    # Run all tests including API calls
    pytest.main([__file__, "-v", "--tb=short"])