Files
Document_Translator/app/services/translation_service.py
2025-09-02 10:31:35 +08:00

424 lines
16 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
翻譯服務
Author: PANJIT IT Team
Created: 2024-01-28
Modified: 2024-01-28
"""
import hashlib
import time
from pathlib import Path
from typing import List, Dict, Any, Optional
from app.utils.logger import get_logger
from app.utils.exceptions import TranslationError, FileProcessingError
from app.services.dify_client import DifyClient
from app.models.cache import TranslationCache
from app.models.job import TranslationJob
from app.utils.helpers import generate_filename, create_job_directory
logger = get_logger(__name__)
class DocumentParser:
"""文件解析器基類"""
def __init__(self, file_path: str):
self.file_path = Path(file_path)
if not self.file_path.exists():
raise FileProcessingError(f"檔案不存在: {file_path}")
def extract_text_segments(self) -> List[str]:
"""提取文字片段"""
raise NotImplementedError
def generate_translated_document(self, translations: Dict[str, List[str]],
target_language: str, output_dir: Path) -> str:
"""生成翻譯後的文件"""
raise NotImplementedError
class DocxParser(DocumentParser):
"""DOCX 文件解析器"""
def extract_text_segments(self) -> List[str]:
"""提取 DOCX 文件的文字片段"""
try:
import docx
from docx.table import _Cell
doc = docx.Document(str(self.file_path))
text_segments = []
# 提取段落文字
for paragraph in doc.paragraphs:
text = paragraph.text.strip()
if text and len(text) > 3: # 過濾太短的文字
text_segments.append(text)
# 提取表格文字
for table in doc.tables:
for row in table.rows:
for cell in row.cells:
text = cell.text.strip()
if text and len(text) > 3:
text_segments.append(text)
logger.info(f"Extracted {len(text_segments)} text segments from DOCX")
return text_segments
except Exception as e:
logger.error(f"Failed to extract text from DOCX: {str(e)}")
raise FileProcessingError(f"DOCX 文件解析失敗: {str(e)}")
def generate_translated_document(self, translations: Dict[str, List[str]],
target_language: str, output_dir: Path) -> str:
"""生成翻譯後的 DOCX 文件"""
try:
import docx
from docx.shared import Pt
# 開啟原始文件
doc = docx.Document(str(self.file_path))
# 取得對應的翻譯
translated_texts = translations.get(target_language, [])
text_index = 0
# 處理段落
for paragraph in doc.paragraphs:
if paragraph.text.strip() and len(paragraph.text.strip()) > 3:
if text_index < len(translated_texts):
# 保留原文,添加翻譯
original_text = paragraph.text
translated_text = translated_texts[text_index]
# 清空段落
paragraph.clear()
# 添加原文
run = paragraph.add_run(original_text)
# 添加翻譯(新行,較小字體)
paragraph.add_run('\n')
trans_run = paragraph.add_run(translated_text)
trans_run.font.size = Pt(10)
trans_run.italic = True
text_index += 1
# 處理表格(簡化版本)
for table in doc.tables:
for row in table.rows:
for cell in row.cells:
if cell.text.strip() and len(cell.text.strip()) > 3:
if text_index < len(translated_texts):
original_text = cell.text
translated_text = translated_texts[text_index]
# 清空儲存格
cell.text = f"{original_text}\n{translated_text}"
text_index += 1
# 生成輸出檔名
output_filename = generate_filename(
self.file_path.name,
'translated',
'translated',
target_language
)
output_path = output_dir / output_filename
# 儲存文件
doc.save(str(output_path))
logger.info(f"Generated translated DOCX: {output_path}")
return str(output_path)
except Exception as e:
logger.error(f"Failed to generate translated DOCX: {str(e)}")
raise FileProcessingError(f"生成翻譯 DOCX 失敗: {str(e)}")
class PdfParser(DocumentParser):
"""PDF 文件解析器(只讀)"""
def extract_text_segments(self) -> List[str]:
"""提取 PDF 文件的文字片段"""
try:
from PyPDF2 import PdfReader
reader = PdfReader(str(self.file_path))
text_segments = []
for page in reader.pages:
text = page.extract_text()
# 簡單的句子分割
sentences = text.split('.')
for sentence in sentences:
sentence = sentence.strip()
if sentence and len(sentence) > 10:
text_segments.append(sentence)
logger.info(f"Extracted {len(text_segments)} text segments from PDF")
return text_segments
except Exception as e:
logger.error(f"Failed to extract text from PDF: {str(e)}")
raise FileProcessingError(f"PDF 文件解析失敗: {str(e)}")
def generate_translated_document(self, translations: Dict[str, List[str]],
target_language: str, output_dir: Path) -> str:
"""生成翻譯文字檔PDF 不支援直接編輯)"""
try:
translated_texts = translations.get(target_language, [])
# 生成純文字檔案
output_filename = f"{self.file_path.stem}_{target_language}_translated.txt"
output_path = output_dir / output_filename
with open(output_path, 'w', encoding='utf-8') as f:
f.write(f"翻譯結果 - {target_language}\n")
f.write("=" * 50 + "\n\n")
for i, text in enumerate(translated_texts):
f.write(f"{i+1}. {text}\n\n")
logger.info(f"Generated translated text file: {output_path}")
return str(output_path)
except Exception as e:
logger.error(f"Failed to generate translated text file: {str(e)}")
raise FileProcessingError(f"生成翻譯文字檔失敗: {str(e)}")
class TranslationService:
"""翻譯服務"""
def __init__(self):
self.dify_client = DifyClient()
# 文件解析器映射
self.parsers = {
'.docx': DocxParser,
'.doc': DocxParser, # 假設可以用 docx 處理
'.pdf': PdfParser,
# 其他格式可以稍後添加
}
def get_document_parser(self, file_path: str) -> DocumentParser:
"""取得文件解析器"""
file_ext = Path(file_path).suffix.lower()
parser_class = self.parsers.get(file_ext)
if not parser_class:
raise FileProcessingError(f"不支援的檔案格式: {file_ext}")
return parser_class(file_path)
def split_text_into_sentences(self, text: str, language: str = 'auto') -> List[str]:
"""將文字分割成句子"""
# 這裡可以使用更智能的句子分割
# 暫時使用簡單的分割方式
sentences = []
# 基本的句子分割符號
separators = ['. ', '', '', '', '!', '?']
current_text = text
for sep in separators:
parts = current_text.split(sep)
if len(parts) > 1:
sentences.extend([part.strip() + sep.rstrip() for part in parts[:-1] if part.strip()])
current_text = parts[-1]
# 添加最後一部分
if current_text.strip():
sentences.append(current_text.strip())
# 過濾太短的句子
sentences = [s for s in sentences if len(s.strip()) > 5]
return sentences
def translate_text_with_cache(self, text: str, source_language: str,
target_language: str, user_id: int = None,
job_id: int = None) -> str:
"""帶快取的文字翻譯"""
# 檢查快取
cached_translation = TranslationCache.get_translation(
text, source_language, target_language
)
if cached_translation:
logger.debug(f"Cache hit for translation: {text[:50]}...")
return cached_translation
# 呼叫 Dify API
try:
result = self.dify_client.translate_text(
text=text,
source_language=source_language,
target_language=target_language,
user_id=user_id,
job_id=job_id
)
translated_text = result['translated_text']
# 儲存到快取
TranslationCache.save_translation(
text, source_language, target_language, translated_text
)
return translated_text
except Exception as e:
logger.error(f"Translation failed for text: {text[:50]}... Error: {str(e)}")
raise TranslationError(f"翻譯失敗: {str(e)}")
def translate_document(self, job_uuid: str) -> Dict[str, Any]:
"""翻譯文件(主要入口點)"""
try:
# 取得任務資訊
job = TranslationJob.query.filter_by(job_uuid=job_uuid).first()
if not job:
raise TranslationError(f"找不到任務: {job_uuid}")
logger.info(f"Starting document translation: {job_uuid}")
# 更新任務狀態
job.update_status('PROCESSING', progress=0)
# 取得文件解析器
parser = self.get_document_parser(job.file_path)
# 提取文字片段
logger.info("Extracting text segments from document")
text_segments = parser.extract_text_segments()
if not text_segments:
raise TranslationError("文件中未找到可翻譯的文字")
# 分割成句子
logger.info("Splitting text into sentences")
all_sentences = []
for segment in text_segments:
sentences = self.split_text_into_sentences(segment, job.source_language)
all_sentences.extend(sentences)
# 去重複
unique_sentences = list(dict.fromkeys(all_sentences)) # 保持順序的去重
logger.info(f"Found {len(unique_sentences)} unique sentences to translate")
# 批次翻譯
translation_results = {}
total_sentences = len(unique_sentences)
for target_language in job.target_languages:
logger.info(f"Translating to {target_language}")
translated_sentences = []
for i, sentence in enumerate(unique_sentences):
try:
translated = self.translate_text_with_cache(
text=sentence,
source_language=job.source_language,
target_language=target_language,
user_id=job.user_id,
job_id=job.id
)
translated_sentences.append(translated)
# 更新進度
progress = (i + 1) / total_sentences * 100 / len(job.target_languages)
current_lang_index = job.target_languages.index(target_language)
total_progress = (current_lang_index * 100 + progress) / len(job.target_languages)
job.update_status('PROCESSING', progress=total_progress)
# 短暫延遲避免過快請求
time.sleep(0.1)
except Exception as e:
logger.error(f"Failed to translate sentence: {sentence[:50]}... Error: {str(e)}")
# 翻譯失敗時保留原文
translated_sentences.append(f"[翻譯失敗] {sentence}")
translation_results[target_language] = translated_sentences
# 生成翻譯文件
logger.info("Generating translated documents")
output_dir = Path(job.file_path).parent
output_files = {}
for target_language, translations in translation_results.items():
try:
# 重建翻譯映射
translation_mapping = {target_language: translations}
output_file = parser.generate_translated_document(
translations=translation_mapping,
target_language=target_language,
output_dir=output_dir
)
output_files[target_language] = output_file
# 記錄翻譯檔案到資料庫
file_size = Path(output_file).stat().st_size
job.add_translated_file(
language_code=target_language,
filename=Path(output_file).name,
file_path=output_file,
file_size=file_size
)
except Exception as e:
logger.error(f"Failed to generate translated document for {target_language}: {str(e)}")
raise TranslationError(f"生成 {target_language} 翻譯文件失敗: {str(e)}")
# 計算總成本(從 API 使用統計中取得)
total_cost = self._calculate_job_cost(job.id)
# 更新任務狀態為完成
job.update_status('COMPLETED', progress=100)
job.total_cost = total_cost
job.total_tokens = len(unique_sentences) # 簡化的 token 計算
from app import db
db.session.commit()
logger.info(f"Document translation completed: {job_uuid}")
return {
'success': True,
'job_uuid': job_uuid,
'output_files': output_files,
'total_sentences': len(unique_sentences),
'total_cost': float(total_cost),
'target_languages': job.target_languages
}
except TranslationError:
raise
except Exception as e:
logger.error(f"Document translation failed: {job_uuid}. Error: {str(e)}")
raise TranslationError(f"文件翻譯失敗: {str(e)}")
def _calculate_job_cost(self, job_id: int) -> float:
"""計算任務總成本"""
from app import db
from sqlalchemy import func
total_cost = db.session.query(
func.sum(APIUsageStats.cost)
).filter_by(job_id=job_id).scalar()
return float(total_cost) if total_cost else 0.0