#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Translation service.

Author: PANJIT IT Team
Created: 2024-01-28
Modified: 2024-01-28
"""

import time
from pathlib import Path
from typing import List, Dict, Any, Optional

from app.utils.logger import get_logger
from app.utils.exceptions import TranslationError, FileProcessingError
from app.services.dify_client import DifyClient
from app.models.cache import TranslationCache
from app.models.job import TranslationJob
from app.utils.helpers import generate_filename

logger = get_logger(__name__)


class DocumentParser:
    """Base class for document parsers."""

    def __init__(self, file_path: str):
        self.file_path = Path(file_path)
        if not self.file_path.exists():
            raise FileProcessingError(f"File not found: {file_path}")

    def extract_text_segments(self) -> List[str]:
        """Extract translatable text segments."""
        raise NotImplementedError

    def generate_translated_document(self, translations: Dict[str, List[str]],
                                     target_language: str, output_dir: Path) -> str:
        """Generate the translated output document."""
        raise NotImplementedError


class DocxParser(DocumentParser):
    """Parser for DOCX files."""

    def extract_text_segments(self) -> List[str]:
        """Extract text segments from a DOCX file."""
        try:
            import docx

            doc = docx.Document(str(self.file_path))
            text_segments = []

            # Extract paragraph text
            for paragraph in doc.paragraphs:
                text = paragraph.text.strip()
                if text and len(text) > 3:  # skip very short fragments
                    text_segments.append(text)

            # Extract table text
            for table in doc.tables:
                for row in table.rows:
                    for cell in row.cells:
                        text = cell.text.strip()
                        if text and len(text) > 3:
                            text_segments.append(text)

            logger.info(f"Extracted {len(text_segments)} text segments from DOCX")
            return text_segments

        except Exception as e:
            logger.error(f"Failed to extract text from DOCX: {str(e)}")
            raise FileProcessingError(f"DOCX parsing failed: {str(e)}")

    def generate_translated_document(self, translations: Dict[str, List[str]],
                                     target_language: str, output_dir: Path) -> str:
        """Generate a translated DOCX file."""
        try:
            import docx
            from docx.shared import Pt

            # Open the original document
            doc = docx.Document(str(self.file_path))

            # Look up the translations for the requested language.
            # NOTE: this assumes one translated entry per extracted segment,
            # in the same order extract_text_segments() produced them.
            translated_texts = translations.get(target_language, [])
            text_index = 0

            # Paragraphs
            for paragraph in doc.paragraphs:
                if paragraph.text.strip() and len(paragraph.text.strip()) > 3:
                    if text_index < len(translated_texts):
                        # Keep the original text and append the translation
                        original_text = paragraph.text
                        translated_text = translated_texts[text_index]

                        # Clear the paragraph, then re-add the original text
                        paragraph.clear()
                        run = paragraph.add_run(original_text)

                        # Proper line break (a literal '\n' in a run is not
                        # rendered as a new line by Word), then the
                        # translation in smaller italics
                        run.add_break()
                        trans_run = paragraph.add_run(translated_text)
                        trans_run.font.size = Pt(10)
                        trans_run.italic = True

                        text_index += 1

            # Tables (simplified: overwrite cell text with original + translation)
            for table in doc.tables:
                for row in table.rows:
                    for cell in row.cells:
                        if cell.text.strip() and len(cell.text.strip()) > 3:
                            if text_index < len(translated_texts):
                                original_text = cell.text
                                translated_text = translated_texts[text_index]
                                cell.text = f"{original_text}\n{translated_text}"
                                text_index += 1

            # Build the output filename
            output_filename = generate_filename(
                self.file_path.name, 'translated', 'translated', target_language
            )
            output_path = output_dir / output_filename

            # Save the document
            doc.save(str(output_path))

            logger.info(f"Generated translated DOCX: {output_path}")
            return str(output_path)

        except Exception as e:
            logger.error(f"Failed to generate translated DOCX: {str(e)}")
            raise FileProcessingError(f"Failed to generate translated DOCX: {str(e)}")
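
# --- Illustrative usage (a sketch, not executed) -----------------------------
# How a parser is expected to be driven end to end; the path and the
# translations dict below are hypothetical examples, not real fixtures:
#
#   parser = DocxParser("uploads/report.docx")
#   segments = parser.extract_text_segments()
#   output = parser.generate_translated_document(
#       translations={"zh-TW": ["...one translated entry per segment..."]},
#       target_language="zh-TW",
#       output_dir=Path("uploads/output"),
#   )
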
class PdfParser(DocumentParser):
    """Parser for PDF files (read-only)."""

    def extract_text_segments(self) -> List[str]:
        """Extract text segments from a PDF file."""
        try:
            from PyPDF2 import PdfReader

            reader = PdfReader(str(self.file_path))
            text_segments = []

            for page in reader.pages:
                text = page.extract_text()
                # Naive sentence split on full stops
                sentences = text.split('.')
                for sentence in sentences:
                    sentence = sentence.strip()
                    if sentence and len(sentence) > 10:
                        text_segments.append(sentence)

            logger.info(f"Extracted {len(text_segments)} text segments from PDF")
            return text_segments

        except Exception as e:
            logger.error(f"Failed to extract text from PDF: {str(e)}")
            raise FileProcessingError(f"PDF parsing failed: {str(e)}")

    def generate_translated_document(self, translations: Dict[str, List[str]],
                                     target_language: str, output_dir: Path) -> str:
        """Generate a plain-text result file (PDFs cannot be edited in place)."""
        try:
            translated_texts = translations.get(target_language, [])

            # Write a plain-text result file
            output_filename = f"{self.file_path.stem}_{target_language}_translated.txt"
            output_path = output_dir / output_filename

            with open(output_path, 'w', encoding='utf-8') as f:
                f.write(f"Translation results - {target_language}\n")
                f.write("=" * 50 + "\n\n")
                for i, text in enumerate(translated_texts):
                    f.write(f"{i + 1}. {text}\n\n")

            logger.info(f"Generated translated text file: {output_path}")
            return str(output_path)

        except Exception as e:
            logger.error(f"Failed to generate translated text file: {str(e)}")
            raise FileProcessingError(f"Failed to generate translated text file: {str(e)}")


class TranslationService:
    """Translation service."""

    def __init__(self):
        self.dify_client = DifyClient()

        # File-extension to parser mapping
        self.parsers = {
            '.docx': DocxParser,
            '.doc': DocxParser,   # optimistic: python-docx cannot open legacy .doc files
            '.pdf': PdfParser,
            # other formats can be added later
        }

    def get_document_parser(self, file_path: str) -> DocumentParser:
        """Return a parser instance for the given file."""
        file_ext = Path(file_path).suffix.lower()
        parser_class = self.parsers.get(file_ext)

        if not parser_class:
            raise FileProcessingError(f"Unsupported file format: {file_ext}")

        return parser_class(file_path)

    def split_text_into_sentences(self, text: str, language: str = 'auto') -> List[str]:
        """Split text into sentences.

        A smarter, language-aware splitter could be plugged in here;
        for now this uses simple separator-based splitting.
        """
        sentences = []

        # Basic sentence separators (Western and CJK punctuation)
        separators = ['. ', '。', '!', '?', '!', '?']

        current_text = text
        for sep in separators:
            parts = current_text.split(sep)
            if len(parts) > 1:
                sentences.extend([part.strip() + sep.rstrip()
                                  for part in parts[:-1] if part.strip()])
                current_text = parts[-1]

        # Keep whatever remains after the last separator
        if current_text.strip():
            sentences.append(current_text.strip())

        # Drop fragments too short to be worth translating
        sentences = [s for s in sentences if len(s.strip()) > 5]

        return sentences

    def translate_text_with_cache(self, text: str, source_language: str,
                                  target_language: str, user_id: Optional[int] = None,
                                  job_id: Optional[int] = None) -> str:
        """Translate a piece of text, consulting the cache first."""
        # Check the cache
        cached_translation = TranslationCache.get_translation(
            text, source_language, target_language
        )

        if cached_translation:
            logger.debug(f"Cache hit for translation: {text[:50]}...")
            return cached_translation

        # Call the Dify API
        try:
            result = self.dify_client.translate_text(
                text=text,
                source_language=source_language,
                target_language=target_language,
                user_id=user_id,
                job_id=job_id
            )

            translated_text = result['translated_text']

            # Store the result in the cache
            TranslationCache.save_translation(
                text, source_language, target_language, translated_text
            )

            return translated_text

        except Exception as e:
            logger.error(f"Translation failed for text: {text[:50]}... Error: {str(e)}")
            raise TranslationError(f"Translation failed: {str(e)}")
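
    # A rough sketch of the expected behaviour of the two helpers above
    # (sample strings are illustrative, not test fixtures):
    #
    #   svc = TranslationService()
    #   svc.split_text_into_sentences("Hi there. How are you? Fine.")
    #   # -> ["Hi there.", "How are you?"]  ("Fine." is dropped: 5 chars, not > 5)
    #
    #   svc.translate_text_with_cache("Hello world again", "en", "zh-TW")
    #   # The first call goes through DifyClient and populates TranslationCache;
    #   # a later call with the same (text, source, target) triple is served
    #   # from the cache without another API request.
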
Error: {str(e)}") raise TranslationError(f"翻譯失敗: {str(e)}") def translate_document(self, job_uuid: str) -> Dict[str, Any]: """翻譯文件(主要入口點)""" try: # 取得任務資訊 job = TranslationJob.query.filter_by(job_uuid=job_uuid).first() if not job: raise TranslationError(f"找不到任務: {job_uuid}") logger.info(f"Starting document translation: {job_uuid}") # 更新任務狀態 job.update_status('PROCESSING', progress=0) # 取得文件解析器 parser = self.get_document_parser(job.file_path) # 提取文字片段 logger.info("Extracting text segments from document") text_segments = parser.extract_text_segments() if not text_segments: raise TranslationError("文件中未找到可翻譯的文字") # 分割成句子 logger.info("Splitting text into sentences") all_sentences = [] for segment in text_segments: sentences = self.split_text_into_sentences(segment, job.source_language) all_sentences.extend(sentences) # 去重複 unique_sentences = list(dict.fromkeys(all_sentences)) # 保持順序的去重 logger.info(f"Found {len(unique_sentences)} unique sentences to translate") # 批次翻譯 translation_results = {} total_sentences = len(unique_sentences) for target_language in job.target_languages: logger.info(f"Translating to {target_language}") translated_sentences = [] for i, sentence in enumerate(unique_sentences): try: translated = self.translate_text_with_cache( text=sentence, source_language=job.source_language, target_language=target_language, user_id=job.user_id, job_id=job.id ) translated_sentences.append(translated) # 更新進度 progress = (i + 1) / total_sentences * 100 / len(job.target_languages) current_lang_index = job.target_languages.index(target_language) total_progress = (current_lang_index * 100 + progress) / len(job.target_languages) job.update_status('PROCESSING', progress=total_progress) # 短暫延遲避免過快請求 time.sleep(0.1) except Exception as e: logger.error(f"Failed to translate sentence: {sentence[:50]}... Error: {str(e)}") # 翻譯失敗時保留原文 translated_sentences.append(f"[翻譯失敗] {sentence}") translation_results[target_language] = translated_sentences # 生成翻譯文件 logger.info("Generating translated documents") output_dir = Path(job.file_path).parent output_files = {} for target_language, translations in translation_results.items(): try: # 重建翻譯映射 translation_mapping = {target_language: translations} output_file = parser.generate_translated_document( translations=translation_mapping, target_language=target_language, output_dir=output_dir ) output_files[target_language] = output_file # 記錄翻譯檔案到資料庫 file_size = Path(output_file).stat().st_size job.add_translated_file( language_code=target_language, filename=Path(output_file).name, file_path=output_file, file_size=file_size ) except Exception as e: logger.error(f"Failed to generate translated document for {target_language}: {str(e)}") raise TranslationError(f"生成 {target_language} 翻譯文件失敗: {str(e)}") # 計算總成本(從 API 使用統計中取得) total_cost = self._calculate_job_cost(job.id) # 更新任務狀態為完成 job.update_status('COMPLETED', progress=100) job.total_cost = total_cost job.total_tokens = len(unique_sentences) # 簡化的 token 計算 from app import db db.session.commit() logger.info(f"Document translation completed: {job_uuid}") return { 'success': True, 'job_uuid': job_uuid, 'output_files': output_files, 'total_sentences': len(unique_sentences), 'total_cost': float(total_cost), 'target_languages': job.target_languages } except TranslationError: raise except Exception as e: logger.error(f"Document translation failed: {job_uuid}. 
Error: {str(e)}") raise TranslationError(f"文件翻譯失敗: {str(e)}") def _calculate_job_cost(self, job_id: int) -> float: """計算任務總成本""" from app import db from sqlalchemy import func total_cost = db.session.query( func.sum(APIUsageStats.cost) ).filter_by(job_id=job_id).scalar() return float(total_cost) if total_cost else 0.0