# Commit summary:
# - Add OCR Track support for reflow PDF generation using raw_ocr_regions.json
# - Add OCR Track translation extraction from raw_ocr_regions instead of elements
# - Add raw_ocr_translations output format for OCR Track documents
# - Add exclusion zone filtering to remove text overlapping with images
# - Update API validation to accept both translations and raw_ocr_translations
# - Add page_number field to TranslatedItem for proper tracking
#
# 🤖 Generated with [Claude Code](https://claude.com/claude-code)
#
# Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
165 lines
5.9 KiB
Python
165 lines
5.9 KiB
Python
"""
Tool_OCR - Translation Schemas

Pydantic models for document translation feature (DIFY API)
"""
|
|
|
|
from typing import Optional, List, Dict, Any, Tuple
|
|
from datetime import datetime
|
|
from pydantic import BaseModel, Field
|
|
from enum import Enum
|
|
from dataclasses import dataclass
|
|
|
|
|
|
class TranslationStatusEnum(str, Enum):
    """Translation job status enumeration"""

    # Mixing in ``str`` makes every member compare equal to its plain
    # lowercase string value (e.g. ``TranslationStatusEnum.PENDING == "pending"``).
    PENDING = "pending"
    TRANSLATING = "translating"
    COMPLETED = "completed"
    FAILED = "failed"
|
|
|
|
|
|
class TargetLanguageEnum(str, Enum):
    """Supported target languages for translation."""

    # Each value is the language code string itself; the ``str`` mixin lets
    # a member be compared directly against such a code.
    ENGLISH = "en"
    JAPANESE = "ja"
    KOREAN = "ko"
    CHINESE_SIMPLIFIED = "zh-CN"
    CHINESE_TRADITIONAL = "zh-TW"
    GERMAN = "de"
    FRENCH = "fr"
    SPANISH = "es"
    PORTUGUESE = "pt"
    ITALIAN = "it"
    RUSSIAN = "ru"
    VIETNAMESE = "vi"
    THAI = "th"
|
|
|
|
|
|
class TranslationRequest(BaseModel):
    """Request model for starting a translation job"""

    # Required (``...`` means no default): the caller must always supply it.
    # NOTE(review): typed as a free-form ``str`` rather than
    # ``TargetLanguageEnum``; whether unsupported codes are rejected
    # elsewhere cannot be told from this model alone.
    target_lang: str = Field(
        ...,
        description="Target language code (e.g., 'en', 'ja', 'zh-TW')"
    )
    # Optional: falls back to "auto" when the caller omits it.
    source_lang: str = Field(
        default="auto",
        description="Source language code, 'auto' for automatic detection"
    )
|
|
|
|
|
|
class TranslationProgress(BaseModel):
    """Progress information for ongoing translation"""

    # All three counters default to zero, so a freshly created instance
    # represents "nothing processed yet".
    # NOTE(review): ``percentage`` is described as 0-100 but no range
    # constraint is declared on the field; the model does not derive it
    # from the two counters either.
    current_element: int = Field(default=0, description="Current element being translated")
    total_elements: int = Field(default=0, description="Total elements to translate")
    percentage: float = Field(default=0.0, description="Progress percentage (0-100)")
|
|
|
|
|
|
class TranslationStatusResponse(BaseModel):
    """Response model for translation status query"""

    # Required identification / state fields.
    task_id: str = Field(..., description="Task ID")
    status: TranslationStatusEnum = Field(..., description="Current translation status")
    target_lang: str = Field(..., description="Target language")

    # Optional details; each one defaults to ``None`` when not provided.
    progress: Optional[TranslationProgress] = Field(
        default=None,
        description="Progress information when translating"
    )
    error_message: Optional[str] = Field(
        default=None,
        description="Error message if translation failed"
    )
    started_at: Optional[datetime] = Field(default=None, description="Translation start time")
    completed_at: Optional[datetime] = Field(default=None, description="Translation completion time")
|
|
|
|
|
|
class TranslationStartResponse(BaseModel):
    """Response model for starting a translation job"""

    # Every field is required (no defaults are declared).
    task_id: str = Field(..., description="Task ID")
    status: TranslationStatusEnum = Field(..., description="Initial status")
    target_lang: str = Field(..., description="Target language")
    message: str = Field(..., description="Status message")
|
|
|
|
|
|
class TranslationStatistics(BaseModel):
    """Statistics for completed translation"""

    # Every counter defaults to zero, so the model can be constructed
    # without arguments and filled in afterwards.
    total_elements: int = Field(default=0, description="Total elements in document")
    translated_elements: int = Field(default=0, description="Successfully translated elements")
    skipped_elements: int = Field(default=0, description="Skipped elements (images, etc.)")
    total_characters: int = Field(default=0, description="Total characters translated")
    processing_time_seconds: float = Field(default=0.0, description="Translation duration")
    total_tokens: int = Field(default=0, description="Total API tokens used")
|
|
|
|
|
|
class TranslationResultResponse(BaseModel):
    """Response model for translation result"""

    # Only ``schema_version`` and ``provider`` carry defaults; every other
    # field must be supplied by the caller.
    schema_version: str = Field(default="1.0.0", description="Schema version")
    source_document: str = Field(..., description="Source document filename")
    source_lang: str = Field(..., description="Source language (detected or specified)")
    target_lang: str = Field(..., description="Target language")
    provider: str = Field(default="dify", description="Translation provider")
    translated_at: datetime = Field(..., description="Translation timestamp")
    statistics: TranslationStatistics = Field(..., description="Translation statistics")
    # NOTE(review): ``translations`` is required here and the values are
    # untyped (``Any``); confirm whether results that carry only
    # ``raw_ocr_translations`` are expected to validate against this model.
    translations: Dict[str, Any] = Field(
        ...,
        description="Translations dict mapping element_id to translated content"
    )
|
|
|
|
|
|
class TranslationListItem(BaseModel):
    """Item in translation list response"""

    # ``provider`` defaults to "dify"; the remaining fields are required.
    target_lang: str = Field(..., description="Target language")
    translated_at: datetime = Field(..., description="Translation timestamp")
    provider: str = Field(default="dify", description="Translation provider")
    statistics: TranslationStatistics = Field(..., description="Translation statistics")
    file_path: str = Field(..., description="Path to translation JSON file")
|
|
|
|
|
|
class TranslationListResponse(BaseModel):
    """Response model for listing available translations"""

    task_id: str = Field(..., description="Task ID")
    # ``default_factory=list`` gives each instance its own fresh empty list
    # instead of sharing one mutable default.
    translations: List[TranslationListItem] = Field(
        default_factory=list,
        description="Available translations"
    )
|
|
|
|
|
|
# Dataclasses for internal use
|
|
|
|
@dataclass
class TranslatableItem:
    """Internal representation of a translatable element"""

    element_id: str
    content: str
    element_type: str  # 'text', 'title', 'header', etc. or 'table_cell'
    page_number: int = 1
    cell_position: Optional[Tuple[int, int]] = None  # (row, col) for table cells

    def __post_init__(self) -> None:
        # Clean content - remove excessive whitespace.
        # ``split()`` with no argument breaks on any run of whitespace
        # (spaces, tabs, newlines) and drops leading/trailing whitespace, so
        # re-joining with a single space normalises the text. Falsy content
        # (empty string or None) is left untouched.
        if self.content:
            self.content = ' '.join(self.content.split())
|
|
|
|
|
|
@dataclass
class TranslatedItem:
    """Internal representation of a translated element"""

    element_id: str
    original_content: str
    translated_content: str
    element_type: str
    page_number: int = 1  # defaults to the first page when not given
    cell_position: Optional[Tuple[int, int]] = None  # unset unless provided
|
|
|
|
|
|
@dataclass
class TranslationJobState:
    """Internal state for a translation job"""

    # Required at construction time.
    task_id: str
    target_lang: str
    source_lang: str
    status: TranslationStatusEnum
    progress: TranslationProgress

    # Optional bookkeeping; every field starts as ``None``.
    error_message: Optional[str] = None
    started_at: Optional[datetime] = None
    completed_at: Optional[datetime] = None
    result_file_path: Optional[str] = None
|