#!/usr/bin/env python3 # -*- coding: utf-8 -*- """ 翻譯服務 Author: PANJIT IT Team Created: 2024-01-28 Modified: 2024-01-28 """ import hashlib import time from pathlib import Path from typing import List, Dict, Any, Optional, Tuple from app.utils.logger import get_logger from app.utils.exceptions import TranslationError, FileProcessingError from app.services.dify_client import DifyClient from app.services.document_processor import DocumentProcessor, Segment from app.models.cache import TranslationCache from app.models.job import TranslationJob from app.utils.helpers import generate_filename, create_job_directory from app import db logger = get_logger(__name__) class DocumentParser: """文件解析器基類""" def __init__(self, file_path: str): self.file_path = Path(file_path) if not self.file_path.exists(): raise FileProcessingError(f"檔案不存在: {file_path}") def extract_text_segments(self) -> List[str]: """提取文字片段""" raise NotImplementedError def generate_translated_document(self, translations: Dict[str, List[str]], target_language: str, output_dir: Path) -> str: """生成翻譯後的文件""" raise NotImplementedError class DocxParser(DocumentParser): """DOCX 文件解析器 - 使用增強的 DocumentProcessor""" def __init__(self, file_path: str): super().__init__(file_path) self.processor = DocumentProcessor() def extract_text_segments(self) -> List[str]: """提取 DOCX 文件的文字片段 - 使用增強邏輯""" try: # 使用新的文檔處理器提取段落 segments = self.processor.extract_docx_segments(str(self.file_path)) # 轉換為文字列表 text_segments = [] for seg in segments: if seg.text.strip() and len(seg.text.strip()) > 3: text_segments.append(seg.text) logger.info(f"Enhanced extraction: {len(text_segments)} text segments from DOCX") return text_segments except Exception as e: logger.error(f"Failed to extract text from DOCX: {str(e)}") raise FileProcessingError(f"DOCX 文件解析失敗: {str(e)}") def extract_segments_with_context(self) -> List[Segment]: """提取帶上下文的段落資訊""" return self.processor.extract_docx_segments(str(self.file_path)) def generate_translated_document(self, translations: Dict[str, List[str]], target_language: str, output_dir: Path) -> str: """生成翻譯後的 DOCX 文件 - 使用增強的翻譯插入邏輯(從快取讀取)""" try: from sqlalchemy import text as sql_text from app import db # 生成輸出檔名 output_filename = generate_filename( self.file_path.name, 'translated', 'translated', target_language ) output_path = output_dir / output_filename # 提取段落資訊 segments = self.extract_segments_with_context() # 建立翻譯映射 - 從快取讀取而非使用傳入的translations參數 translation_map = {} logger.info(f"Building translation map for {len(segments)} segments in language {target_language}") for seg in segments: # 從翻譯快取中查詢每個段落的翻譯 result = db.session.execute(sql_text(""" SELECT translated_text FROM dt_translation_cache WHERE source_text = :text AND target_language = :lang ORDER BY created_at DESC LIMIT 1 """), {'text': seg.text, 'lang': target_language}) row = result.fetchone() if row and row[0]: translation_map[(target_language, seg.text)] = row[0] logger.debug(f"Found translation for: {seg.text[:50]}...") else: logger.warning(f"No translation found for: {seg.text[:50]}...") logger.info(f"Translation map built with {len(translation_map)} mappings") # 使用增強的翻譯插入邏輯 ok_count, skip_count = self.processor.insert_docx_translations( str(self.file_path), segments, translation_map, [target_language], str(output_path) ) logger.info(f"Enhanced translation: Generated {output_path} with {ok_count} insertions, {skip_count} skips") return str(output_path) except Exception as e: logger.error(f"Failed to generate translated DOCX: {str(e)}") raise FileProcessingError(f"生成翻譯 DOCX 失敗: {str(e)}") class DocParser(DocumentParser): """DOC 文件解析器 - 需要先轉換為 DOCX""" def extract_text_segments(self) -> List[str]: """提取 DOC 文件的文字片段 - 先轉換為 DOCX 再處理""" try: # 檢查是否有 Word COM 支援 import tempfile import os try: import win32com.client as win32 import pythoncom _WIN32COM_AVAILABLE = True except ImportError: _WIN32COM_AVAILABLE = False if not _WIN32COM_AVAILABLE: raise FileProcessingError("DOC 格式需要 Word COM 支援,請先手動轉換為 DOCX 格式或安裝 Microsoft Office") # 創建臨時 DOCX 文件 temp_docx = None try: with tempfile.NamedTemporaryFile(suffix='.docx', delete=False) as tmp: temp_docx = tmp.name # 使用 Word COM 轉換 DOC 到 DOCX (格式 16) self._word_convert(str(self.file_path), temp_docx, 16) # 使用 DOCX 解析器處理轉換後的文件 docx_parser = DocxParser(temp_docx) segments = docx_parser.extract_text_segments() logger.info(f"Converted DOC to DOCX and extracted {len(segments)} segments") return segments finally: # 清理臨時文件 if temp_docx and os.path.exists(temp_docx): try: os.remove(temp_docx) except Exception: pass except Exception as e: logger.error(f"Failed to extract text from DOC file: {str(e)}") raise FileProcessingError(f"DOC 文件解析失敗: {str(e)}") def _word_convert(self, input_path: str, output_path: str, target_format: int): """使用 Word COM 轉換文件格式(移植自參考檔案)""" try: import win32com.client as win32 import pythoncom pythoncom.CoInitialize() try: word = win32.Dispatch("Word.Application") word.Visible = False doc = word.Documents.Open(os.path.abspath(input_path)) doc.SaveAs2(os.path.abspath(output_path), FileFormat=target_format) doc.Close(False) finally: word.Quit() pythoncom.CoUninitialize() except Exception as e: raise FileProcessingError(f"Word COM 轉換失敗: {str(e)}") def generate_translated_document(self, translations: Dict[str, List[str]], target_language: str, output_dir: Path) -> str: """生成翻譯後的 DOC 文件 - 先轉為 DOCX 處理後輸出為 DOCX""" try: import tempfile import os # 先轉換為 DOCX,然後使用 DOCX 處理邏輯 temp_docx = None try: with tempfile.NamedTemporaryFile(suffix='.docx', delete=False) as tmp: temp_docx = tmp.name # 轉換 DOC 到 DOCX self._word_convert(str(self.file_path), temp_docx, 16) # 使用 DOCX 解析器生成翻譯文檔 docx_parser = DocxParser(temp_docx) # 注意:最終輸出為 DOCX 格式,因為 DOC 格式較難直接處理 output_filename = f"{self.file_path.stem}_{target_language}_translated.docx" output_path = output_dir / output_filename result_path = docx_parser.generate_translated_document(translations, target_language, output_dir) logger.info(f"Generated translated DOC file (as DOCX): {result_path}") return result_path finally: # 清理臨時文件 if temp_docx and os.path.exists(temp_docx): try: os.remove(temp_docx) except Exception: pass except Exception as e: logger.error(f"Failed to generate translated DOC file: {str(e)}") raise FileProcessingError(f"DOC 翻譯檔生成失敗: {str(e)}") class ExcelParser(DocumentParser): """Excel 文件解析器(XLSX/XLS)- 移植自參考檔案""" def extract_text_segments(self) -> List[str]: """提取 Excel 文件的文字片段""" try: import openpyxl from openpyxl.utils.exceptions import InvalidFileException # 載入工作簿(移植自參考檔案邏輯) try: wb = openpyxl.load_workbook(str(self.file_path), data_only=False) wb_vals = openpyxl.load_workbook(str(self.file_path), data_only=True) except InvalidFileException: if self.file_path.suffix.lower() == '.xls': raise FileProcessingError("XLS 格式需要先轉換為 XLSX 格式") raise except Exception: wb_vals = None # 提取文字段落(完全按照參考檔案的邏輯) segs = [] for ws in wb.worksheets: ws_vals = wb_vals[ws.title] if wb_vals and ws.title in wb_vals.sheetnames else None max_row, max_col = ws.max_row, ws.max_column for r in range(1, max_row + 1): for c in range(1, max_col + 1): src_text = self._get_display_text_for_translation(ws, ws_vals, r, c) if not src_text: continue if not self._should_translate(src_text, 'auto'): continue segs.append(src_text) # 去重保持順序 unique_segments = [] seen = set() for seg in segs: if seg not in seen: unique_segments.append(seg) seen.add(seg) logger.info(f"Extracted {len(unique_segments)} unique text segments from Excel file") return unique_segments except Exception as e: logger.error(f"Failed to extract text from Excel file: {str(e)}") raise FileProcessingError(f"Excel 文件解析失敗: {str(e)}") def _get_display_text_for_translation(self, ws, ws_vals, r: int, c: int) -> Optional[str]: """取得儲存格用於翻譯的顯示文字(完全移植自參考檔案)""" val = ws.cell(row=r, column=c).value if isinstance(val, str) and val.startswith("="): if ws_vals is not None: shown = ws_vals.cell(row=r, column=c).value return shown if isinstance(shown, str) and shown.strip() else None return None if isinstance(val, str) and val.strip(): return val if ws_vals is not None: shown = ws_vals.cell(row=r, column=c).value if isinstance(shown, str) and shown.strip(): return shown return None def _should_translate(self, text: str, src_lang: str) -> bool: """判斷文字是否需要翻譯(只要有字就翻譯)""" text = text.strip() # 只要有字就翻譯 - 最小長度設為1 if len(text) < 1: return False # Skip pure numbers, dates, etc. import re if re.match(r'^[\d\s\.\-\:\/]+$', text): return False # For auto-detect, translate if has CJK or meaningful text if src_lang.lower() in ('auto', 'auto-detect'): return self._has_cjk(text) or len(text) > 5 return True def _has_cjk(self, text: str) -> bool: """檢查是否包含中日韓文字(移植自參考檔案)""" for char in text: if '\u4e00' <= char <= '\u9fff' or \ '\u3400' <= char <= '\u4dbf' or \ '\u20000' <= char <= '\u2a6df' or \ '\u3040' <= char <= '\u309f' or \ '\u30a0' <= char <= '\u30ff' or \ '\uac00' <= char <= '\ud7af': return True return False def generate_translated_document(self, translations: Dict[str, List[str]], target_language: str, output_dir: Path) -> str: """生成翻譯後的 Excel 文件(使用翻譯快取確保正確映射)""" try: import openpyxl from openpyxl.styles import Alignment from openpyxl.comments import Comment from sqlalchemy import text as sql_text from app import db # 載入原始工作簿 wb = openpyxl.load_workbook(str(self.file_path), data_only=False) try: wb_vals = openpyxl.load_workbook(str(self.file_path), data_only=True) except Exception: wb_vals = None # 建立翻譯映射 - 改用翻譯快取查詢,確保正確對應 original_segments = self.extract_text_segments() tmap = {} logger.info(f"Building translation map for {len(original_segments)} segments in language {target_language}") for original_text in original_segments: # 從翻譯快取中查詢每個原文的翻譯 # 使用聯合查詢,優先使用最早的翻譯記錄(原始DIFY翻譯) normalized_text = original_text.replace('\n', ' ').replace('\r', ' ').strip() result = db.session.execute(sql_text(""" SELECT translated_text, created_at, 'exact' as match_type FROM dt_translation_cache WHERE source_text = :exact_text AND target_language = :lang UNION ALL SELECT translated_text, created_at, 'normalized' as match_type FROM dt_translation_cache WHERE REPLACE(REPLACE(TRIM(source_text), '\n', ' '), '\r', ' ') = :norm_text AND target_language = :lang AND source_text != :exact_text ORDER BY created_at ASC LIMIT 1 """), {'exact_text': original_text, 'norm_text': normalized_text, 'lang': target_language}) row = result.fetchone() if row and row[0]: tmap[original_text] = row[0] logger.debug(f"Cache hit for Excel: {original_text[:30]}... -> {row[0][:30]}...") else: logger.warning(f"No translation found in cache for: {original_text[:50]}...") logger.info(f"Translation map built with {len(tmap)} mappings from cache") # 處理每個工作表(加入詳細調試日誌) translation_count = 0 skip_count = 0 for ws in wb.worksheets: logger.info(f"Processing worksheet: {ws.title}") ws_vals = wb_vals[ws.title] if wb_vals and ws.title in wb_vals.sheetnames else None max_row, max_col = ws.max_row, ws.max_column for r in range(1, max_row + 1): for c in range(1, max_col + 1): cell_name = f"{openpyxl.utils.get_column_letter(c)}{r}" src_text = self._get_display_text_for_translation(ws, ws_vals, r, c) if not src_text: continue # 檢查是否需要翻譯 should_translate = self._should_translate(src_text, 'auto') if not should_translate: logger.debug(f"Skip {cell_name}: '{src_text[:30]}...' (should not translate)") skip_count += 1 continue # 檢查翻譯映射 if src_text not in tmap: logger.warning(f"No translation mapping for {cell_name}: '{src_text[:30]}...'") skip_count += 1 continue val = ws.cell(row=r, column=c).value is_formula = isinstance(val, str) and val.startswith("=") translated_text = tmap[src_text] cell = ws.cell(row=r, column=c) if is_formula: # 公式儲存格:添加註解 txt_comment = f"翻譯: {translated_text}" exist = cell.comment if not exist or exist.text.strip() != txt_comment: cell.comment = Comment(txt_comment, "translator") logger.debug(f"Added comment to {cell_name}: {translated_text[:30]}...") translation_count += 1 else: # 一般儲存格:單語言檔案只保留翻譯文,不包含原文 # 檢查是否已經是預期的格式 current_text = str(cell.value) if cell.value else "" if current_text.strip() == translated_text.strip(): logger.debug(f"Skip {cell_name}: already translated") continue cell.value = translated_text # 只保留翻譯文 logger.info(f"Translated {cell_name}: '{src_text[:20]}...' -> '{translated_text[:20]}...'") translation_count += 1 # 設定自動換行(移植自參考檔案) try: if cell.alignment: cell.alignment = Alignment( horizontal=cell.alignment.horizontal, vertical=cell.alignment.vertical, wrap_text=True ) else: cell.alignment = Alignment(wrap_text=True) except Exception: cell.alignment = Alignment(wrap_text=True) # 儲存翻譯後的檔案 output_filename = f"{self.file_path.stem}_{target_language}_translated.xlsx" output_path = output_dir / output_filename wb.save(str(output_path)) logger.info(f"Excel translation completed: {translation_count} translations, {skip_count} skips") logger.info(f"Generated translated Excel file: {output_path}") return str(output_path) except Exception as e: logger.error(f"Failed to generate translated Excel file: {str(e)}") raise FileProcessingError(f"Excel 翻譯檔生成失敗: {str(e)}") class PdfParser(DocumentParser): """PDF 文件解析器 - 支持扫描PDF的OCR处理""" def extract_text_segments(self, user_id: int = None, job_id: int = None) -> List[str]: """提取 PDF 文件的文字片段 - 支持扫描PDF的智能处理""" try: from app.services.enhanced_pdf_parser import EnhancedPdfParser # 使用增强的PDF解析器 enhanced_parser = EnhancedPdfParser(str(self.file_path)) text_segments = enhanced_parser.extract_text_segments(user_id, job_id) logger.info(f"Enhanced PDF extraction: {len(text_segments)} text segments") return text_segments except Exception as e: logger.error(f"Enhanced PDF extraction failed, falling back to basic extraction: {str(e)}") # 回退到基本文字提取 try: from PyPDF2 import PdfReader reader = PdfReader(str(self.file_path)) text_segments = [] for page in reader.pages: text = page.extract_text() # 簡單的句子分割 sentences = text.split('.') for sentence in sentences: sentence = sentence.strip() if sentence and len(sentence) > 10: text_segments.append(sentence) logger.info(f"Basic PDF extraction: {len(text_segments)} text segments") return text_segments except Exception as e2: logger.error(f"Basic PDF extraction also failed: {str(e2)}") raise FileProcessingError(f"PDF 文件解析失敗: {str(e2)}") def generate_translated_document(self, translations: Dict[str, List[str]], target_language: str, output_dir: Path) -> str: """生成翻譯文字檔(PDF 不支援直接編輯)""" try: from app.services.enhanced_pdf_parser import EnhancedPdfParser # 使用增强解析器生成翻译文档 enhanced_parser = EnhancedPdfParser(str(self.file_path)) return enhanced_parser.generate_translated_document(translations, target_language, output_dir) except Exception as e: # 回退到基本生成方式 logger.warning(f"Enhanced PDF generation failed, using basic method: {str(e)}") translated_texts = translations.get(target_language, []) # 生成純文字檔案 output_filename = f"{self.file_path.stem}_{target_language}_translated.txt" output_path = output_dir / output_filename with open(output_path, 'w', encoding='utf-8') as f: f.write(f"翻譯結果 - {target_language}\n") f.write("=" * 50 + "\n\n") for i, text in enumerate(translated_texts): f.write(f"{i+1}. {text}\n\n") logger.info(f"Generated translated text file: {output_path}") return str(output_path) class PptxParser(DocumentParser): """PowerPoint 文件解析器""" def extract_text_segments(self) -> List[str]: """提取 PPTX 文件的文字片段(包含表格)""" try: import pptx prs = pptx.Presentation(str(self.file_path)) text_segments = [] for slide_idx, slide in enumerate(prs.slides, 1): for shape_idx, shape in enumerate(slide.shapes, 1): shape_processed = False # 處理文字框 - 優先處理,因為大多數文字都在這裡 if getattr(shape, "has_text_frame", False): text_frame = shape.text_frame text = self._extract_text_from_frame(text_frame) if text.strip(): text_segments.append(text) logger.debug(f"Extracted text frame from slide {slide_idx}, shape {shape_idx}: {text[:50]}...") shape_processed = True # 處理表格 if getattr(shape, "has_table", False): table_texts = self._extract_text_from_table(shape.table, slide_idx, shape_idx) text_segments.extend(table_texts) if table_texts: shape_processed = True # 處理圖表 (Charts) if getattr(shape, "has_chart", False): chart_texts = self._extract_text_from_chart(shape.chart, slide_idx, shape_idx) text_segments.extend(chart_texts) if chart_texts: shape_processed = True # 處理群組形狀 (Grouped Shapes) - 支援深度嵌套 if hasattr(shape, 'shapes'): group_texts = self._extract_text_from_group(shape.shapes, slide_idx, shape_idx, depth=0) text_segments.extend(group_texts) if group_texts: shape_processed = True # 處理 GraphicFrame (可能包含 SmartArt 等) if getattr(shape, "has_smart_art", False): smartart_texts = self._extract_text_from_smartart(shape, slide_idx, shape_idx) text_segments.extend(smartart_texts) if smartart_texts: shape_processed = True # 處理基本形狀內的文字 - 作為備用方案,避免重複提取 if not shape_processed and hasattr(shape, 'text') and shape.text.strip(): text_segments.append(shape.text) logger.debug(f"Extracted shape text from slide {slide_idx}, shape {shape_idx}: {shape.text[:50]}...") shape_processed = True # 如果以上都沒有處理到,檢查是否有其他可能的文字內容 if not shape_processed: # 嘗試更深層的文字提取 fallback_texts = self._extract_fallback_text(shape, slide_idx, shape_idx) text_segments.extend(fallback_texts) logger.info(f"PowerPoint extraction: {len(text_segments)} text segments from PPTX (including tables)") # 診斷特定關鍵字 - 增強版 target_keywords = [ "檢驗盤剔線作業時缺少線塌防護設計", "治工具未標準化管理", "彈匣裝載料片間距不足", "彈匣未評估防震防傾倒風險", "搬運台車選用錯誤" ] logger.info("=== 關鍵字診斷開始 ===") for keyword in target_keywords: # 完全匹配 exact_matches = [seg for seg in text_segments if keyword == seg.strip()] # 包含匹配 contains_matches = [seg for seg in text_segments if keyword in seg] # 模糊匹配(去掉空白和換行符) normalized_keyword = keyword.replace(' ', '').replace('\n', '').replace('\r', '') fuzzy_matches = [seg for seg in text_segments if normalized_keyword in seg.replace(' ', '').replace('\n', '').replace('\r', '')] if exact_matches: logger.info(f"✅ 完全匹配關鍵字: '{keyword}' 在 {len(exact_matches)} 個文字片段中") for i, seg in enumerate(exact_matches): logger.info(f" 完全匹配{i+1}: '{seg}'") elif contains_matches: logger.info(f"🔍 包含關鍵字: '{keyword}' 在 {len(contains_matches)} 個文字片段中") for i, seg in enumerate(contains_matches): logger.info(f" 包含匹配{i+1}: '{seg}'") elif fuzzy_matches: logger.info(f"🎯 模糊匹配關鍵字: '{keyword}' 在 {len(fuzzy_matches)} 個文字片段中") for i, seg in enumerate(fuzzy_matches): logger.info(f" 模糊匹配{i+1}: '{seg}'") # 顯示標準化後的比較 normalized_seg = seg.replace(' ', '').replace('\n', '').replace('\r', '') logger.info(f" 標準化後: 關鍵字='{normalized_keyword}' vs 片段='{normalized_seg}'") else: logger.warning(f"❌ 未找到關鍵字: '{keyword}'") # 檢查是否有類似的文字 similar_segments = [] for seg in text_segments: # 計算相似度(簡單的關鍵詞匹配) keyword_chars = set(keyword) seg_chars = set(seg) intersection = keyword_chars.intersection(seg_chars) if len(intersection) >= min(5, len(keyword_chars) * 0.5): similar_segments.append(seg) if similar_segments: logger.info(f"💡 可能相似的片段 ({len(similar_segments)} 個):") for i, seg in enumerate(similar_segments[:3]): # 只顯示前3個 logger.info(f" 相似{i+1}: '{seg}'") logger.info("=== 關鍵字診斷結束 ===") return text_segments except Exception as e: logger.error(f"Failed to extract text from PPTX: {str(e)}") raise FileProcessingError(f"PPTX 文件解析失敗: {str(e)}") def _extract_text_from_frame(self, text_frame) -> str: """從文字框中提取文字內容,包含標準化處理""" if not text_frame or not hasattr(text_frame, 'paragraphs'): return "" # 收集所有段落文字 paragraphs = [] for para in text_frame.paragraphs: para_text = para.text if para_text and para_text.strip(): paragraphs.append(para_text.strip()) if not paragraphs: return "" # 合併段落 text = "\n".join(paragraphs) # 標準化文字處理 import re # 1. 標準化換行符 text = text.replace('\r\n', '\n').replace('\r', '\n') # 2. 移除末尾的換行符(但保留中間的) text = text.rstrip('\n') # 3. 標準化多重空白(但保留單個換行符) text = re.sub(r'[ \t]+', ' ', text) # 4. 移除段落間多餘空行 text = re.sub(r'\n\s*\n', '\n', text) return text def _extract_text_from_table(self, table, slide_idx: int, shape_idx: int) -> List[str]: """從表格中提取文字內容""" table_texts = [] try: for row_idx, row in enumerate(table.rows): for col_idx, cell in enumerate(row.cells): cell_text = cell.text_frame.text.strip() if cell_text: table_texts.append(cell_text) logger.debug(f"Extracted table cell text from slide {slide_idx}, shape {shape_idx}, " f"row {row_idx+1}, col {col_idx+1}: {cell_text[:50]}...") logger.info(f"Extracted {len(table_texts)} cells from table on slide {slide_idx}") except Exception as e: logger.error(f"Failed to extract text from table on slide {slide_idx}: {str(e)}") return table_texts def _extract_text_from_chart(self, chart, slide_idx: int, shape_idx: int) -> List[str]: """從圖表中提取文字內容""" chart_texts = [] try: # 嘗試提取圖表標題 if hasattr(chart, 'chart_title') and chart.chart_title.has_text_frame: title_text = chart.chart_title.text_frame.text.strip() if title_text: chart_texts.append(title_text) logger.debug(f"Extracted chart title from slide {slide_idx}: {title_text[:50]}...") # 嘗試提取其他圖表元素的文字(受限於 python-pptx 支援) # 注意:python-pptx 對圖表的支援有限,無法直接存取軸標籤等 logger.info(f"Extracted {len(chart_texts)} text elements from chart on slide {slide_idx}") except Exception as e: logger.error(f"Failed to extract text from chart on slide {slide_idx}: {str(e)}") return chart_texts def _extract_text_from_group(self, shapes, slide_idx: int, shape_idx: int, depth: int = 0) -> List[str]: """從群組形狀中提取文字內容 - 支援深度嵌套群組""" group_texts = [] max_depth = 10 # 防止無限遞歸 if depth > max_depth: logger.warning(f"Group nesting depth exceeded {max_depth} on slide {slide_idx}, skipping deeper levels") return group_texts try: for sub_shape_idx, sub_shape in enumerate(shapes): shape_processed = False # 1. 優先處理嵌套群組(遞歸處理) if hasattr(sub_shape, 'shapes') and hasattr(sub_shape, 'shape_type'): try: # 這是一個嵌套的群組 nested_texts = self._extract_text_from_group(sub_shape.shapes, slide_idx, f"{shape_idx}.{sub_shape_idx}", depth + 1) group_texts.extend(nested_texts) if nested_texts: shape_processed = True logger.debug(f"Extracted {len(nested_texts)} texts from nested group " f"at slide {slide_idx}, depth {depth + 1}") except Exception as e: logger.debug(f"Failed to process nested group at slide {slide_idx}, " f"depth {depth + 1}: {str(e)}") # 2. 處理文字框 if getattr(sub_shape, "has_text_frame", False): text = self._extract_text_from_frame(sub_shape.text_frame) if text.strip(): group_texts.append(text) logger.debug(f"Extracted group text from slide {slide_idx}, group {shape_idx}, " f"sub-shape {sub_shape_idx} (depth {depth}): {text[:50]}...") shape_processed = True # 3. 處理群組內的表格 if getattr(sub_shape, "has_table", False): sub_table_texts = self._extract_text_from_table(sub_shape.table, slide_idx, f"{shape_idx}.{sub_shape_idx}") group_texts.extend(sub_table_texts) if sub_table_texts: shape_processed = True # 4. 處理群組內的圖表 if getattr(sub_shape, "has_chart", False): chart_texts = self._extract_text_from_chart(sub_shape.chart, slide_idx, f"{shape_idx}.{sub_shape_idx}") group_texts.extend(chart_texts) if chart_texts: shape_processed = True # 5. 處理基本形狀文字(作為最後的備選方案) if not shape_processed and hasattr(sub_shape, 'text') and sub_shape.text.strip(): group_texts.append(sub_shape.text) logger.debug(f"Extracted group shape text from slide {slide_idx} " f"(depth {depth}): {sub_shape.text[:50]}...") shape_processed = True # 6. 如果仍未處理,使用備用文字提取 if not shape_processed: fallback_texts = self._extract_fallback_text(sub_shape, slide_idx, f"{shape_idx}.{sub_shape_idx}") group_texts.extend(fallback_texts) logger.info(f"Extracted {len(group_texts)} text elements from grouped shapes " f"on slide {slide_idx} (depth {depth})") except Exception as e: logger.error(f"Failed to extract text from grouped shapes on slide {slide_idx} " f"(depth {depth}): {str(e)}") return group_texts def _extract_text_from_smartart(self, shape, slide_idx: int, shape_idx: int) -> List[str]: """從 SmartArt 中提取文字內容 - 有限支援""" smartart_texts = [] try: # python-pptx 對 SmartArt 支援有限,嘗試透過 XML 提取 # 這是一個基本實現,可能無法涵蓋所有 SmartArt 類型 logger.warning(f"SmartArt detected on slide {slide_idx}, shape {shape_idx} - limited support available") logger.info("Consider using alternative libraries like Spire.Presentation for full SmartArt support") # 暫時回傳空列表,避免錯誤 # 在未來版本中可以考慮整合 Spire.Presentation 或其他支援 SmartArt 的庫 except Exception as e: logger.error(f"Failed to extract text from SmartArt on slide {slide_idx}: {str(e)}") return smartart_texts def _extract_fallback_text(self, shape, slide_idx: int, shape_idx: int) -> List[str]: """備用文字提取方法,處理可能遺漏的文字內容,包括深層嵌套結構""" fallback_texts = [] try: # 檢查形狀類型和屬性 shape_type = getattr(shape, 'shape_type', None) logger.debug(f"Fallback extraction for slide {slide_idx}, shape {shape_idx}, type: {shape_type}") # 嘗試透過不同的方式取得文字 # 方法 1: 直接檢查 text 屬性(即使之前沒處理到) if hasattr(shape, 'text'): text = getattr(shape, 'text', '') if text and text.strip(): fallback_texts.append(text) logger.debug(f"Fallback: Found direct text - {text[:50]}...") # 方法 2: 檢查是否有 text_frame 但之前沒有正確處理 try: if hasattr(shape, 'text_frame'): text_frame = shape.text_frame if text_frame and hasattr(text_frame, 'text'): text = text_frame.text if text and text.strip(): fallback_texts.append(text) logger.debug(f"Fallback: Found text_frame text - {text[:50]}...") except: pass # 方法 2.5: 深度檢查 text_frame 內的段落結構 try: if hasattr(shape, 'text_frame') and shape.text_frame: text_frame = shape.text_frame if hasattr(text_frame, 'paragraphs'): for para_idx, paragraph in enumerate(text_frame.paragraphs): if hasattr(paragraph, 'runs'): for run_idx, run in enumerate(paragraph.runs): if hasattr(run, 'text') and run.text.strip(): fallback_texts.append(run.text) logger.debug(f"Fallback: Found run text {para_idx}.{run_idx} - {run.text[:30]}...") except Exception as e: logger.debug(f"Failed to extract paragraph runs: {str(e)}") # 方法 2.6: 如果形狀有嵌套的 shapes,遞歸處理 if hasattr(shape, 'shapes') and shape.shapes: try: nested_texts = self._extract_text_from_group(shape.shapes, slide_idx, f"fallback_{shape_idx}", depth=0) fallback_texts.extend(nested_texts) if nested_texts: logger.debug(f"Fallback: Found {len(nested_texts)} texts from nested shapes") except Exception as e: logger.debug(f"Failed to extract from nested shapes: {str(e)}") # 方法 3: 檢查特殊屬性 special_attrs = ['textFrame', 'text_frame', '_element'] for attr in special_attrs: try: if hasattr(shape, attr): obj = getattr(shape, attr) if hasattr(obj, 'text') and obj.text and obj.text.strip(): fallback_texts.append(obj.text) logger.debug(f"Fallback: Found {attr} text - {obj.text[:30]}...") except: continue # 方法 3: 如果是 GraphicFrame,嘗試更深入的提取 if hasattr(shape, 'element'): try: # 透過 XML 元素搜尋文字節點 element = shape.element # 搜尋 XML 中的文字內容 text_elements = [] # 搜尋 標籤(文字內容) for t_elem in element.iter(): if t_elem.tag.endswith('}t'): # 匹配 a:t 標籤 if t_elem.text and t_elem.text.strip(): text_elements.append(t_elem.text.strip()) # 去重並添加 for text in set(text_elements): if text not in [existing_text for existing_text in fallback_texts]: fallback_texts.append(text) logger.debug(f"Fallback: Found XML text - {text[:50]}...") except Exception as xml_e: logger.debug(f"XML extraction failed for shape {shape_idx}: {str(xml_e)}") if fallback_texts: logger.info(f"Fallback extraction found {len(fallback_texts)} additional text elements on slide {slide_idx}, shape {shape_idx}") else: logger.debug(f"No additional text found in fallback for slide {slide_idx}, shape {shape_idx}") except Exception as e: logger.error(f"Fallback text extraction failed for slide {slide_idx}, shape {shape_idx}: {str(e)}") return fallback_texts def _normalize_text(self, text: str) -> str: """標準化文字用於比較""" import re return re.sub(r"\s+", " ", (text or "").strip()).lower() def _check_existing_translations(self, text_frame, translations: List[str]) -> bool: """檢查翻譯是否已經存在於文字框末尾""" if len(text_frame.paragraphs) < len(translations): return False # 檢查末尾的段落是否與翻譯匹配 tail_paragraphs = text_frame.paragraphs[-len(translations):] for para, expected in zip(tail_paragraphs, translations): if self._normalize_text(para.text) != self._normalize_text(expected): return False # 檢查是否為斜體格式(我們添加的翻譯標記) if any((r.font.italic is not True) and (r.text or "").strip() for r in para.runs): return False return True def _append_translation(self, text_frame, text_block: str): """在文字框末尾添加翻譯文字""" try: from pptx.util import Pt as PPTPt para = text_frame.add_paragraph() para.text = text_block # 設定格式:斜體、字體大小 for run in para.runs: run.font.italic = True run.font.size = PPTPt(12) except Exception as e: logger.error(f"Failed to append translation to text frame: {str(e)}") raise def generate_translated_document(self, translations: Dict[str, List[str]], target_language: str, output_dir: Path) -> str: """生成翻譯後的 PPTX 文件""" try: import pptx from sqlalchemy import text as sql_text from app import db # 載入 PowerPoint 文件 prs = pptx.Presentation(str(self.file_path)) # 生成輸出檔名 output_filename = generate_filename( self.file_path.name, 'translated', 'translated', target_language ) output_path = output_dir / output_filename # 收集所有文字框 text_frames = [] for slide in prs.slides: for shape in slide.shapes: if getattr(shape, "has_text_frame", False): text = self._extract_text_from_frame(shape.text_frame) if text.strip(): text_frames.append((shape.text_frame, text)) # 建立翻譯映射 - 從快取讀取 translation_map = {} logger.info(f"Building translation map for {len(text_frames)} text frames in language {target_language}") for text_frame, text in text_frames: # 從翻譯快取中查詢翻譯 result = db.session.execute(sql_text(""" SELECT translated_text FROM dt_translation_cache WHERE source_text = :text AND target_language = :lang ORDER BY created_at DESC LIMIT 1 """), {'text': text, 'lang': target_language}) row = result.fetchone() if row and row[0]: translation_map[text] = row[0] logger.debug(f"Found translation for PowerPoint text: {text[:50]}...") else: logger.warning(f"No translation found for PowerPoint text: {text[:50]}...") logger.info(f"Translation map built with {len(translation_map)} mappings") # 插入翻譯 ok_count = skip_count = 0 for text_frame, original_text in text_frames: if original_text not in translation_map: skip_count += 1 logger.debug(f"Skip PowerPoint frame: no translation for {original_text[:30]}...") continue translated_text = translation_map[original_text] translations_to_add = [translated_text] # 單一語言模式 # 檢查是否已存在翻譯 if self._check_existing_translations(text_frame, translations_to_add): skip_count += 1 logger.debug(f"Skip PowerPoint frame: translation already exists for {original_text[:30]}...") continue # 添加翻譯 for translation in translations_to_add: self._append_translation(text_frame, translation) ok_count += 1 logger.debug(f"Added translation to PowerPoint frame: {original_text[:30]}...") # 儲存文件 prs.save(str(output_path)) logger.info(f"PowerPoint translation completed: {ok_count} insertions, {skip_count} skips") logger.info(f"Generated translated PowerPoint file: {output_path}") return str(output_path) except Exception as e: logger.error(f"Failed to generate translated PPTX file: {str(e)}") raise FileProcessingError(f"PPTX 翻譯檔生成失敗: {str(e)}") def insert_pptx_translations(self, translation_map: Dict[Tuple[str, str], str], target_languages: List[str], output_path: str) -> Tuple[int, int]: """插入翻譯到 PowerPoint 文件 - 單語言模式(僅翻譯文)""" try: import pptx from shutil import copyfile # 複製原始文件 copyfile(str(self.file_path), output_path) # 載入 PowerPoint 文件 prs = pptx.Presentation(output_path) ok_count = skip_count = 0 for slide_idx, slide in enumerate(prs.slides, 1): for shape_idx, shape in enumerate(slide.shapes, 1): # 使用與提取邏輯相同的處理順序(並行處理) # 處理文字框 if getattr(shape, "has_text_frame", False): text = self._extract_text_from_frame(shape.text_frame) if text.strip(): ok, skip = self._insert_single_language_translation( shape.text_frame, text, translation_map, target_languages[0] ) ok_count += ok skip_count += skip # 處理表格 if getattr(shape, "has_table", False): table_ok, table_skip = self._insert_table_translations( shape.table, translation_map, target_languages[0] ) ok_count += table_ok skip_count += table_skip # 處理圖表(並行處理) if getattr(shape, "has_chart", False): chart_ok, chart_skip = self._insert_chart_translations( shape.chart, translation_map, target_languages[0] ) ok_count += chart_ok skip_count += chart_skip # 處理群組形狀(並行處理,支援深度嵌套) if hasattr(shape, 'shapes'): group_ok, group_skip = self._insert_group_translations( shape.shapes, translation_map, target_languages[0], slide_idx, shape_idx ) ok_count += group_ok skip_count += group_skip # 處理基本形狀文字(並行處理) if hasattr(shape, 'text') and shape.text.strip(): if (target_languages[0], shape.text) in translation_map: translated_text = translation_map[(target_languages[0], shape.text)] shape.text = translated_text ok_count += 1 logger.debug(f"Inserted basic shape translation on slide {slide_idx}: {shape.text[:30]}...") else: skip_count += 1 # 儲存文件 prs.save(output_path) logger.info(f"Saved PowerPoint file with {ok_count} translations, {skip_count} skips") return ok_count, skip_count except Exception as e: logger.error(f"Failed to insert PowerPoint translations: {str(e)}") raise FileProcessingError(f"PowerPoint 翻譯插入失敗: {str(e)}") def insert_pptx_combined_translations(self, translation_map: Dict[Tuple[str, str], str], target_languages: List[str], output_path: str) -> Tuple[int, int]: """插入翻譯到 PowerPoint 文件 - 組合模式(原文+所有譯文)""" try: import pptx from shutil import copyfile # 複製原始文件 copyfile(str(self.file_path), output_path) # 載入 PowerPoint 文件 prs = pptx.Presentation(output_path) ok_count = skip_count = 0 for slide in prs.slides: for shape in slide.shapes: # 處理文字框 if getattr(shape, "has_text_frame", False): text = self._extract_text_from_frame(shape.text_frame) if text.strip(): ok, skip = self._insert_combined_language_translation( shape.text_frame, text, translation_map, target_languages ) ok_count += ok skip_count += skip # 處理表格 elif getattr(shape, "has_table", False): table_ok, table_skip = self._insert_combined_table_translations( shape.table, translation_map, target_languages ) ok_count += table_ok skip_count += table_skip # 處理圖表 elif getattr(shape, "has_chart", False): chart_ok, chart_skip = self._insert_combined_chart_translations( shape.chart, translation_map, target_languages ) ok_count += chart_ok skip_count += chart_skip # 處理群組形狀 elif hasattr(shape, 'shapes'): group_ok, group_skip = self._insert_combined_group_translations( shape.shapes, translation_map, target_languages ) ok_count += group_ok skip_count += group_skip # 處理基本形狀文字 elif hasattr(shape, 'text') and shape.text.strip(): # 收集所有語言的翻譯 translations = [] for lang in target_languages: if (lang, shape.text) in translation_map: translations.append(translation_map[(lang, shape.text)]) else: translations.append(f"【翻譯缺失|{lang}】") if translations: # 組合原文和所有翻譯 combined_text = shape.text + '\n' + '\n'.join(translations) shape.text = combined_text ok_count += 1 else: skip_count += 1 # 儲存文件 prs.save(output_path) logger.info(f"Saved combined PowerPoint file with {ok_count} translations, {skip_count} skips") return ok_count, skip_count except Exception as e: logger.error(f"Failed to insert combined PowerPoint translations: {str(e)}") raise FileProcessingError(f"PowerPoint 組合翻譯插入失敗: {str(e)}") def _insert_single_language_translation(self, text_frame, original_text: str, translation_map: Dict[Tuple[str, str], str], target_language: str) -> Tuple[int, int]: """插入單語言翻譯到文字框""" if (target_language, original_text) not in translation_map: return 0, 1 translated_text = translation_map[(target_language, original_text)] # 檢查是否已存在翻譯 if self._check_existing_translations(text_frame, [translated_text]): return 0, 1 # 清除現有內容,只保留翻譯 text_frame.clear() para = text_frame.add_paragraph() para.text = translated_text # 設定格式 for run in para.runs: run.font.italic = True try: from pptx.util import Pt as PPTPt run.font.size = PPTPt(12) except: pass return 1, 0 def _insert_combined_language_translation(self, text_frame, original_text: str, translation_map: Dict[Tuple[str, str], str], target_languages: List[str]) -> Tuple[int, int]: """插入組合語言翻譯到文字框(原文+所有譯文)""" translations = [] for lang in target_languages: if (lang, original_text) in translation_map: translations.append(translation_map[(lang, original_text)]) else: translations.append(f"【翻譯缺失|{lang}】") if not any(trans for trans in translations if not trans.startswith("【翻譯缺失")): return 0, 1 # 檢查是否已存在翻譯 combined_translations = [original_text] + translations if self._check_existing_translations(text_frame, combined_translations): return 0, 1 # 添加所有翻譯 for translation in translations: self._append_translation(text_frame, translation) return 1, 0 def _insert_table_translations(self, table, translation_map: Dict[Tuple[str, str], str], target_language: str) -> Tuple[int, int]: """插入翻譯到表格 - 單語言模式""" ok_count = skip_count = 0 for row in table.rows: for cell in row.cells: cell_text = cell.text_frame.text.strip() if not cell_text: continue if (target_language, cell_text) in translation_map: translated_text = translation_map[(target_language, cell_text)] # 替換儲存格內容為翻譯文 cell.text_frame.clear() para = cell.text_frame.add_paragraph() para.text = translated_text # 設定格式 for run in para.runs: run.font.italic = True try: from pptx.util import Pt as PPTPt run.font.size = PPTPt(10) except: pass ok_count += 1 else: skip_count += 1 return ok_count, skip_count def _insert_combined_table_translations(self, table, translation_map: Dict[Tuple[str, str], str], target_languages: List[str]) -> Tuple[int, int]: """插入翻譯到表格 - 組合模式""" ok_count = skip_count = 0 for row in table.rows: for cell in row.cells: cell_text = cell.text_frame.text.strip() if not cell_text: continue # 收集所有語言的翻譯 translations = [] for lang in target_languages: if (lang, cell_text) in translation_map: translations.append(translation_map[(lang, cell_text)]) else: translations.append(f"【翻譯缺失|{lang}】") if translations: # 組合原文和所有翻譯 combined_text = cell_text + '\n' + '\n'.join(translations) # 替換儲存格內容 cell.text_frame.clear() para = cell.text_frame.add_paragraph() para.text = combined_text # 設定格式 for run in para.runs: try: from pptx.util import Pt as PPTPt run.font.size = PPTPt(9) except: pass ok_count += 1 else: skip_count += 1 return ok_count, skip_count def _insert_chart_translations(self, chart, translation_map: Dict[Tuple[str, str], str], target_language: str) -> Tuple[int, int]: """插入翻譯到圖表 - 有限支援""" ok_count = skip_count = 0 try: # 處理圖表標題 if hasattr(chart, 'chart_title') and chart.chart_title.has_text_frame: title_text = chart.chart_title.text_frame.text.strip() if title_text and (target_language, title_text) in translation_map: translated_title = translation_map[(target_language, title_text)] chart.chart_title.text_frame.text = translated_title ok_count += 1 logger.debug(f"Translated chart title: {title_text[:30]} -> {translated_title[:30]}") else: skip_count += 1 # 注意:python-pptx 對圖表軸標籤等的支援非常有限 logger.info(f"Chart translation: {ok_count} successful, {skip_count} skipped (limited support)") except Exception as e: logger.error(f"Failed to insert chart translations: {str(e)}") skip_count += 1 return ok_count, skip_count def _insert_group_translations(self, shapes, translation_map: Dict[Tuple[str, str], str], target_language: str, slide_idx: int = 0, shape_idx: int = 0, depth: int = 0) -> Tuple[int, int]: """插入翻譯到群組形狀 - 支援深度嵌套,與提取邏輯保持一致""" ok_count = skip_count = 0 max_depth = 10 # 防止無限遞歸 if depth > max_depth: logger.warning(f"Group nesting depth exceeded {max_depth} on slide {slide_idx}, skipping deeper levels") return ok_count, skip_count try: for sub_shape_idx, sub_shape in enumerate(shapes): shape_processed = False # 1. 優先處理嵌套群組(遞歸處理) if hasattr(sub_shape, 'shapes') and hasattr(sub_shape, 'shape_type'): try: nested_ok, nested_skip = self._insert_group_translations( sub_shape.shapes, translation_map, target_language, slide_idx, f"{shape_idx}.{sub_shape_idx}", depth + 1 ) ok_count += nested_ok skip_count += nested_skip if nested_ok > 0: shape_processed = True logger.debug(f"Inserted {nested_ok} nested group translations at depth {depth + 1}") except Exception as e: logger.debug(f"Failed to process nested group at depth {depth + 1}: {str(e)}") # 2. 處理群組內的文字框(並行處理) if getattr(sub_shape, "has_text_frame", False): text = self._extract_text_from_frame(sub_shape.text_frame) if text.strip(): if (target_language, text) in translation_map: translated_text = translation_map[(target_language, text)] # 使用更安全的文字替換方法 try: # 清除並重新設置文字 sub_shape.text_frame.clear() para = sub_shape.text_frame.add_paragraph() para.text = translated_text ok_count += 1 shape_processed = True logger.debug(f"Inserted group text frame translation: {text[:30]}... -> {translated_text[:30]}...") except Exception as e: logger.warning(f"Failed to replace text frame content: {str(e)}") skip_count += 1 else: skip_count += 1 # 3. 處理群組內的表格(並行處理) if getattr(sub_shape, "has_table", False): table_ok, table_skip = self._insert_table_translations( sub_shape.table, translation_map, target_language ) ok_count += table_ok skip_count += table_skip if table_ok > 0: shape_processed = True # 4. 處理群組內的圖表(並行處理) if getattr(sub_shape, "has_chart", False): chart_ok, chart_skip = self._insert_chart_translations( sub_shape.chart, translation_map, target_language ) ok_count += chart_ok skip_count += chart_skip if chart_ok > 0: shape_processed = True # 5. 處理基本形狀文字(作為備選方案) if not shape_processed and hasattr(sub_shape, 'text') and sub_shape.text.strip(): if (target_language, sub_shape.text) in translation_map: translated_text = translation_map[(target_language, sub_shape.text)] sub_shape.text = translated_text ok_count += 1 logger.debug(f"Inserted basic group shape translation: {sub_shape.text[:30]}...") shape_processed = True else: skip_count += 1 logger.debug(f"Group translation at depth {depth}: {ok_count} successful, {skip_count} skipped") except Exception as e: logger.error(f"Failed to insert group translations at depth {depth}: {str(e)}") return ok_count, skip_count def _insert_combined_chart_translations(self, chart, translation_map: Dict[Tuple[str, str], str], target_languages: List[str]) -> Tuple[int, int]: """插入組合翻譯到圖表 - 有限支援""" ok_count = skip_count = 0 try: # 處理圖表標題 if hasattr(chart, 'chart_title') and chart.chart_title.has_text_frame: title_text = chart.chart_title.text_frame.text.strip() if title_text: # 收集所有語言的翻譯 translations = [] for lang in target_languages: if (lang, title_text) in translation_map: translations.append(translation_map[(lang, title_text)]) else: translations.append(f"【翻譯缺失|{lang}】") if any(trans for trans in translations if not trans.startswith("【翻譯缺失")): # 組合原文和所有翻譯 combined_text = title_text + '\n' + '\n'.join(translations) chart.chart_title.text_frame.text = combined_text ok_count += 1 else: skip_count += 1 else: skip_count += 1 # 注意:python-pptx 對圖表軸標籤等的支援非常有限 logger.info(f"Combined chart translation: {ok_count} successful, {skip_count} skipped (limited support)") except Exception as e: logger.error(f"Failed to insert combined chart translations: {str(e)}") skip_count += 1 return ok_count, skip_count def _insert_combined_group_translations(self, shapes, translation_map: Dict[Tuple[str, str], str], target_languages: List[str]) -> Tuple[int, int]: """插入組合翻譯到群組形狀""" ok_count = skip_count = 0 try: for sub_shape in shapes: # 處理群組內的文字框 if getattr(sub_shape, "has_text_frame", False): text = self._extract_text_from_frame(sub_shape.text_frame) if text.strip(): # 收集所有語言的翻譯 translations = [] for lang in target_languages: if (lang, text) in translation_map: translations.append(translation_map[(lang, text)]) else: translations.append(f"【翻譯缺失|{lang}】") if any(trans for trans in translations if not trans.startswith("【翻譯缺失")): # 添加所有翻譯 for translation in translations: self._append_translation(sub_shape.text_frame, translation) ok_count += 1 else: skip_count += 1 else: skip_count += 1 # 處理群組內的表格 elif getattr(sub_shape, "has_table", False): table_ok, table_skip = self._insert_combined_table_translations( sub_shape.table, translation_map, target_languages ) ok_count += table_ok skip_count += table_skip # 處理群組內的基本形狀文字 elif hasattr(sub_shape, 'text') and sub_shape.text.strip(): # 收集所有語言的翻譯 translations = [] for lang in target_languages: if (lang, sub_shape.text) in translation_map: translations.append(translation_map[(lang, sub_shape.text)]) else: translations.append(f"【翻譯缺失|{lang}】") if translations: # 組合原文和所有翻譯 combined_text = sub_shape.text + '\n' + '\n'.join(translations) sub_shape.text = combined_text ok_count += 1 else: skip_count += 1 except Exception as e: logger.error(f"Failed to insert combined group translations: {str(e)}") return ok_count, skip_count class TranslationService: """翻譯服務""" def __init__(self): self.dify_client = DifyClient() self.document_processor = DocumentProcessor() # 文件解析器映射 self.parsers = { '.docx': DocxParser, '.doc': DocParser, # 需要先轉換為 DOCX '.pptx': PptxParser, # PowerPoint 簡報支援 '.xlsx': ExcelParser, '.xls': ExcelParser, # Excel 處理器會自動處理 XLS 轉換 '.pdf': PdfParser, # 其他格式可以稍後添加 } def get_document_parser(self, file_path: str) -> DocumentParser: """取得文件解析器""" file_ext = Path(file_path).suffix.lower() parser_class = self.parsers.get(file_ext) if not parser_class: raise FileProcessingError(f"不支援的檔案格式: {file_ext}") return parser_class(file_path) def split_text_into_sentences(self, text: str, language: str = 'auto') -> List[str]: """將文字分割成句子 - 使用增強的分句邏輯""" return self.document_processor.split_text_into_sentences(text, language) def translate_excel_cell(self, text: str, source_language: str, target_language: str, user_id: int = None, job_id: int = None, conversation_id: str = None) -> Dict[str, Any]: """ Excel儲存格翻譯 - 整個儲存格作為一個單位翻譯,不進行切片 返回 dict 包含 translated_text 和 conversation_id """ if not text or not text.strip(): return {"translated_text": "", "conversation_id": conversation_id} # 檢查快取 - 整個儲存格內容 cached_translation = TranslationCache.get_translation(text, source_language, target_language) if cached_translation: logger.debug(f"Excel cell cache hit: {text[:30]}...") return {"translated_text": cached_translation, "conversation_id": conversation_id} # 直接翻譯整個儲存格內容,不進行任何切片 try: result = self.dify_client.translate_text( text=text, source_language=source_language, target_language=target_language, user_id=user_id, job_id=job_id, conversation_id=conversation_id # 傳遞 conversation_id ) translated_text = result['translated_text'] # 儲存整個儲存格的翻譯到快取 TranslationCache.save_translation( text, source_language, target_language, translated_text ) return result # 返回包含 conversation_id 的完整結果 except Exception as e: logger.error(f"Failed to translate Excel cell: {text[:30]}... Error: {str(e)}") # 翻譯失敗時返回失敗標記 return f"【翻譯失敗|{target_language}】{text}" def translate_word_table_cell(self, text: str, source_language: str, target_language: str, user_id: int = None, job_id: int = None) -> str: """ Word表格儲存格翻譯 - 整個儲存格內容作為一個單位翻譯,不進行段落切片 """ if not text or not text.strip(): return "" # 檢查快取 - 整個儲存格內容 cached_translation = TranslationCache.get_translation(text, source_language, target_language) if cached_translation: logger.debug(f"Word table cell cache hit: {text[:30]}...") return cached_translation # 直接翻譯整個儲存格內容,不進行任何段落切片 try: result = self.dify_client.translate_text( text=text, source_language=source_language, target_language=target_language, user_id=user_id, job_id=job_id ) translated_text = result['translated_text'] # 儲存整個儲存格的翻譯到快取 TranslationCache.save_translation( text, source_language, target_language, translated_text ) return translated_text except Exception as e: logger.error(f"Failed to translate Word table cell: {text[:30]}... Error: {str(e)}") return f"【翻譯失敗|{target_language}】{text}" def translate_segment_with_sentences(self, text: str, source_language: str, target_language: str, user_id: int = None, job_id: int = None, conversation_id: str = None) -> Dict[str, Any]: """ 按段落翻譯,模仿成功版本的 translate_block_sentencewise 邏輯 對多行文字進行逐行、逐句翻譯,並重新組合成完整段落 僅用於Word文檔,Excel請使用 translate_excel_cell """ if not text or not text.strip(): return "" # 檢查快取 - 先檢查整個段落的快取 cached_whole = TranslationCache.get_translation(text, source_language, target_language) if cached_whole: logger.debug(f"Whole paragraph cache hit: {text[:30]}...") return cached_whole # 按行處理 out_lines = [] all_successful = True current_conversation_id = conversation_id for raw_line in text.split('\n'): if not raw_line.strip(): out_lines.append("") continue # 分句處理 sentences = self.document_processor.split_text_into_sentences(raw_line, source_language) if not sentences: sentences = [raw_line] translated_parts = [] for sentence in sentences: sentence = sentence.strip() if not sentence: continue # 檢查句子級快取 cached_sentence = TranslationCache.get_translation(sentence, source_language, target_language) if cached_sentence: translated_parts.append(cached_sentence) continue # 呼叫 Dify API 翻譯句子 try: result = self.dify_client.translate_text( text=sentence, source_language=source_language, target_language=target_language, user_id=user_id, job_id=job_id, conversation_id=current_conversation_id ) translated_sentence = result['translated_text'] # 更新對話ID以保持上下文連續性 if result.get('conversation_id'): current_conversation_id = result['conversation_id'] # 儲存句子級快取 TranslationCache.save_translation( sentence, source_language, target_language, translated_sentence ) translated_parts.append(translated_sentence) except Exception as e: logger.error(f"Failed to translate sentence: {sentence[:30]}... Error: {str(e)}") translated_parts.append(f"【翻譯失敗|{target_language}】{sentence}") all_successful = False # 重新組合句子為一行 out_lines.append(" ".join(translated_parts)) # 重新組合所有行 final_result = "\n".join(out_lines) # 如果全部成功,儲存整個段落的快取 if all_successful: TranslationCache.save_translation(text, source_language, target_language, final_result) return { 'translated_text': final_result, 'conversation_id': current_conversation_id } def translate_text_with_cache(self, text: str, source_language: str, target_language: str, user_id: int = None, job_id: int = None, conversation_id: str = None) -> Dict[str, Any]: """帶快取的文字翻譯""" # 檢查快取 cached_translation = TranslationCache.get_translation( text, source_language, target_language ) if cached_translation: logger.debug(f"Cache hit for translation: {text[:50]}...") return { 'translated_text': cached_translation, 'conversation_id': conversation_id, # 保持原有的conversation_id 'from_cache': True } # 呼叫 Dify API try: result = self.dify_client.translate_text( text=text, source_language=source_language, target_language=target_language, user_id=user_id, job_id=job_id, conversation_id=conversation_id ) translated_text = result['translated_text'] new_conversation_id = result.get('conversation_id') # 儲存到快取 TranslationCache.save_translation( text, source_language, target_language, translated_text ) return { 'translated_text': translated_text, 'conversation_id': new_conversation_id, 'from_cache': False } except Exception as e: logger.error(f"Translation failed for text: {text[:50]}... Error: {str(e)}") raise TranslationError(f"翻譯失敗: {str(e)}") def translate_document(self, job_uuid: str) -> Dict[str, Any]: """翻譯文件(主要入口點)- 使用增強的文檔處理邏輯""" try: # 取得任務資訊 job = TranslationJob.query.filter_by(job_uuid=job_uuid).first() if not job: raise TranslationError(f"找不到任務: {job_uuid}") logger.info(f"Starting enhanced document translation: {job_uuid}") # 更新任務狀態 job.update_status('PROCESSING', progress=0) # 使用增強的文檔處理器直接提取段落 file_ext = Path(job.file_path).suffix.lower() if file_ext in ['.docx', '.doc']: # 使用增強的 DOCX 處理邏輯 segments = self.document_processor.extract_docx_segments(job.file_path) logger.info(f"Enhanced extraction: Found {len(segments)} segments to translate") if not segments: raise TranslationError("文件中未找到可翻譯的文字段落") # 使用成功版本的翻譯邏輯 - 直接按段落翻譯,不做複雜分割 translatable_segments = [] for seg in segments: if self.document_processor.should_translate_text(seg.text, job.source_language): translatable_segments.append(seg) logger.info(f"Found {len(translatable_segments)} segments to translate") # 批次翻譯 - 直接按原始段落翻譯 translation_map = {} # 格式: (target_language, source_text) -> translated_text total_segments = len(translatable_segments) for target_language in job.target_languages: logger.info(f"Translating to {target_language}") # 每個目標語言使用獨立的對話ID以保持該語言的翻譯一致性 current_conversation_id = None for i, seg in enumerate(translatable_segments): try: # 根據段落類型選擇適當的翻譯方法 if seg.kind == "table_cell": # 表格儲存格使用整個儲存格為單位的翻譯方法 translated = self.translate_word_table_cell( text=seg.text, source_language=job.source_language, target_language=target_language, user_id=job.user_id, job_id=job.id ) else: # 一般段落使用原有的句子切片方法 translation_result = self.translate_segment_with_sentences( text=seg.text, source_language=job.source_language, target_language=target_language, user_id=job.user_id, job_id=job.id, conversation_id=current_conversation_id ) translated = translation_result['translated_text'] # 更新當前對話ID以保持上下文連續性 if translation_result.get('conversation_id'): current_conversation_id = translation_result['conversation_id'] # 直接以原始段落文字為鍵儲存翻譯結果 translation_map[(target_language, seg.text)] = translated # 更新進度 progress = (i + 1) / total_segments * 100 / len(job.target_languages) current_lang_index = job.target_languages.index(target_language) total_progress = (current_lang_index * 100 + progress) / len(job.target_languages) job.update_status('PROCESSING', progress=total_progress) # 短暫延遲避免過快請求 time.sleep(0.1) except Exception as e: logger.error(f"Failed to translate segment: {seg.text[:50]}... Error: {str(e)}") # 翻譯失敗時保留原文 translation_map[(target_language, seg.text)] = f"[翻譯失敗] {seg.text}" # 保存該語言的對話ID到任務記錄中(用於後續重試等場景) if current_conversation_id and not job.conversation_id: job.conversation_id = current_conversation_id db.session.commit() logger.info(f"Saved conversation_id {current_conversation_id} for job {job.job_uuid}") # 生成翻譯文件 logger.info("Generating translated documents with enhanced insertion") output_dir = Path(job.file_path).parent output_files = {} for target_language in job.target_languages: try: # 生成輸出檔名 output_filename = generate_filename( Path(job.file_path).name, 'translated', 'translated', target_language ) output_path = output_dir / output_filename # 使用增強的翻譯插入邏輯 ok_count, skip_count = self.document_processor.insert_docx_translations( job.file_path, segments, translation_map, [target_language], str(output_path) ) output_files[target_language] = str(output_path) # 記錄翻譯檔案到資料庫 file_size = Path(output_path).stat().st_size job.add_translated_file( language_code=target_language, filename=Path(output_path).name, file_path=str(output_path), file_size=file_size ) logger.info(f"Generated {target_language}: {ok_count} insertions, {skip_count} skips") except Exception as e: logger.error(f"Failed to generate translated document for {target_language}: {str(e)}") raise TranslationError(f"生成 {target_language} 翻譯文件失敗: {str(e)}") # 生成組合多語言檔案 - 包含所有翻譯在一個文件中 if len(job.target_languages) > 1: try: # 生成組合檔案的檔名 combined_filename = generate_filename( Path(job.file_path).name, 'translated', 'combined', 'multilang' ) combined_output_path = output_dir / combined_filename # 使用新的組合翻譯插入方法 combined_ok_count, combined_skip_count = self.document_processor.insert_docx_combined_translations( job.file_path, segments, translation_map, job.target_languages, str(combined_output_path) ) output_files['combined'] = str(combined_output_path) # 記錄組合翻譯檔案到資料庫 file_size = Path(combined_output_path).stat().st_size job.add_translated_file( language_code='combined', filename=Path(combined_output_path).name, file_path=str(combined_output_path), file_size=file_size ) logger.info(f"Generated combined multi-language file: {combined_ok_count} insertions, {combined_skip_count} skips") except Exception as e: logger.error(f"Failed to generate combined multi-language document: {str(e)}") # 不要因為組合檔案失敗而讓整個任務失敗,只記錄警告 logger.warning("Combined multi-language file generation failed, but individual files were successful") elif file_ext in ['.xlsx', '.xls']: # Excel 文件使用儲存格為單位的翻譯邏輯 logger.info(f"Using cell-based processing for Excel files") parser = self.get_document_parser(job.file_path) # 提取儲存格文字內容(不進行句子切片) cell_segments = parser.extract_text_segments() if not cell_segments: raise TranslationError("Excel 文件中未找到可翻譯的文字") logger.info(f"Found {len(cell_segments)} cell segments to translate") # 批次翻譯 - 使用儲存格為單位的翻譯方法 translation_results = {} total_segments = len(cell_segments) for target_language in job.target_languages: logger.info(f"Translating Excel cells to {target_language}") translated_cells = [] current_conversation_id = job.conversation_id # 維持上下文連貫性 for i, cell_text in enumerate(cell_segments): try: # 使用新的儲存格翻譯方法(整個儲存格作為單位) translated = self.translate_excel_cell( text=cell_text, source_language=job.source_language, target_language=target_language, user_id=job.user_id, job_id=job.id, conversation_id=current_conversation_id # 傳遞 conversation_id ) # 提取翻譯文字(translate_excel_cell 現在返回 dict) translated_text = translated["translated_text"] if isinstance(translated, dict) else translated translated_cells.append(translated_text) # 更新 conversation_id 以維持連續對話上下文 if isinstance(translated, dict) and translated.get("conversation_id"): current_conversation_id = translated["conversation_id"] # 更新進度 progress = (i + 1) / total_segments * 100 / len(job.target_languages) current_lang_index = job.target_languages.index(target_language) total_progress = (current_lang_index * 100 + progress) / len(job.target_languages) job.update_status('PROCESSING', progress=total_progress) time.sleep(0.1) except Exception as e: logger.error(f"Failed to translate Excel cell: {cell_text[:50]}... Error: {str(e)}") translated_cells.append(f"[翻譯失敗] {cell_text}") translation_results[target_language] = translated_cells # 生成翻譯文件 output_dir = Path(job.file_path).parent output_files = {} for target_language, translations in translation_results.items(): translation_mapping = {target_language: translations} output_file = parser.generate_translated_document( translations=translation_mapping, target_language=target_language, output_dir=output_dir ) output_files[target_language] = output_file file_size = Path(output_file).stat().st_size job.add_translated_file( language_code=target_language, filename=Path(output_file).name, file_path=output_file, file_size=file_size ) # 生成組合多語言Excel檔案 if len(job.target_languages) > 1: try: # 生成組合檔案的檔名 combined_filename = generate_filename( Path(job.file_path).name, 'translated', 'combined', 'multilang' ) combined_output_path = output_dir / combined_filename # 為Excel組合檔案建立翻譯映射 combined_translation_mapping = {} for lang in job.target_languages: combined_translation_mapping[lang] = translation_results[lang] # 使用修改過的generate_combined_excel_document方法 combined_output_file = self._generate_combined_excel_document( parser, combined_translation_mapping, job.target_languages, combined_output_path ) output_files['combined'] = combined_output_file # 記錄組合翻譯檔案到資料庫 file_size = Path(combined_output_file).stat().st_size job.add_translated_file( language_code='combined', filename=Path(combined_output_file).name, file_path=combined_output_file, file_size=file_size ) logger.info(f"Generated combined multi-language Excel file") except Exception as e: logger.error(f"Failed to generate combined multi-language Excel document: {str(e)}") logger.warning("Combined multi-language Excel file generation failed, but individual files were successful") elif file_ext == '.pptx': # PowerPoint 文件使用增強的處理邏輯,仿照 DOCX 處理方式 logger.info(f"Using enhanced PowerPoint processing for {job_uuid}") parser = self.get_document_parser(job.file_path) # 提取文字段落和表格內容 text_segments = parser.extract_text_segments() if not text_segments: raise TranslationError("PowerPoint 文件中未找到可翻譯的文字") logger.info(f"Found {len(text_segments)} PowerPoint text segments to translate") # 批次翻譯 - 建立翻譯映射 translation_map = {} # 格式: (target_language, source_text) -> translated_text total_segments = len(text_segments) for target_language in job.target_languages: logger.info(f"Translating PowerPoint segments to {target_language}") translated_segments = [] current_conversation_id = job.conversation_id # 維持上下文連貫性 for i, segment_text in enumerate(text_segments): try: # 對於 PowerPoint 文字框和表格,使用段落級別的翻譯 translated = self.translate_segment_with_sentences( text=segment_text, source_language=job.source_language, target_language=target_language, user_id=job.user_id, job_id=job.id, conversation_id=current_conversation_id # 傳遞 conversation_id ) # 使用與 DOCX 相同的格式儲存翻譯結果 translation_map[(target_language, segment_text)] = translated # 更新 conversation_id 以維持連續對話上下文 if isinstance(translated, dict) and translated.get("conversation_id"): current_conversation_id = translated["conversation_id"] # 更新進度 progress = (i + 1) / total_segments * 100 / len(job.target_languages) current_lang_index = job.target_languages.index(target_language) total_progress = (current_lang_index * 100 + progress) / len(job.target_languages) job.update_status('PROCESSING', progress=total_progress) time.sleep(0.1) except Exception as e: logger.error(f"Failed to translate PowerPoint segment: {segment_text[:50]}... Error: {str(e)}") # 翻譯失敗時保留原文 translation_map[(target_language, segment_text)] = f"[翻譯失敗] {segment_text}" # 生成翻譯文件 - 仿照 DOCX 的方式 logger.info("Generating translated PowerPoint documents with enhanced insertion") output_dir = Path(job.file_path).parent output_files = {} # 生成單語言文件 for target_language in job.target_languages: try: # 生成輸出檔名 output_filename = generate_filename( Path(job.file_path).name, 'translated', 'translated', target_language ) output_path = output_dir / output_filename # 使用增強的翻譯插入邏輯 ok_count, skip_count = parser.insert_pptx_translations( translation_map, [target_language], str(output_path) ) output_files[target_language] = str(output_path) # 記錄翻譯檔案到資料庫 file_size = Path(output_path).stat().st_size job.add_translated_file( language_code=target_language, filename=Path(output_path).name, file_path=str(output_path), file_size=file_size ) logger.info(f"Generated {target_language}: {ok_count} insertions, {skip_count} skips") except Exception as e: logger.error(f"Failed to generate translated PowerPoint document for {target_language}: {str(e)}") raise TranslationError(f"生成 {target_language} PowerPoint 翻譯文件失敗: {str(e)}") # 生成組合多語言檔案 - 包含所有翻譯在一個文件中 if len(job.target_languages) > 1: try: # 生成組合檔案的檔名 combined_filename = generate_filename( Path(job.file_path).name, 'translated', 'combined', 'multilang' ) combined_output_path = output_dir / combined_filename # 使用組合翻譯插入方法 combined_ok_count, combined_skip_count = parser.insert_pptx_combined_translations( translation_map, job.target_languages, str(combined_output_path) ) output_files['combined'] = str(combined_output_path) # 記錄組合翻譯檔案到資料庫 file_size = Path(combined_output_path).stat().st_size job.add_translated_file( language_code='combined', filename=Path(combined_output_path).name, file_path=str(combined_output_path), file_size=file_size ) logger.info(f"Generated combined multi-language PowerPoint file: {combined_ok_count} insertions, {combined_skip_count} skips") except Exception as e: logger.error(f"Failed to generate combined multi-language PowerPoint document: {str(e)}") # 不要因為組合檔案失敗而讓整個任務失敗,只記錄警告 logger.warning("Combined multi-language PowerPoint file generation failed, but individual files were successful") elif file_ext == '.pdf': # PDF 文件使用增強的OCR處理邏輯(避免重複OCR) logger.info(f"Using enhanced PDF processing for {job_uuid}") from app.services.enhanced_pdf_parser import EnhancedPdfParser enhanced_parser = EnhancedPdfParser(job.file_path) # 提取文字片段(會使用OCR快取避免重複處理) text_segments = enhanced_parser.extract_text_segments(user_id=job.user_id, job_id=job.id) if not text_segments: raise TranslationError("PDF文件中未找到可翻譯的文字") logger.info(f"Found {len(text_segments)} PDF text segments to translate") # 批次翻譯PDF文字段落 translation_results = {} total_segments = len(text_segments) for target_language in job.target_languages: logger.info(f"Translating PDF segments to {target_language}") translated_segments = [] current_conversation_id = job.conversation_id # 維持上下文連貫性 for i, segment_text in enumerate(text_segments): try: # 對於PDF段落,使用段落級別的翻譯(保留段落結構) translated = self.translate_segment_with_sentences( text=segment_text, source_language=job.source_language, target_language=target_language, user_id=job.user_id, job_id=job.id, conversation_id=current_conversation_id # 傳遞 conversation_id ) # 提取翻譯文字(translate_segment_with_sentences 返回 dict) translated_text = translated['translated_text'] if isinstance(translated, dict) else translated translated_segments.append(translated_text) # 更新 conversation_id 以維持連續對話上下文 if isinstance(translated, dict) and translated.get('conversation_id'): current_conversation_id = translated['conversation_id'] # 更新進度 progress = (i + 1) / total_segments * 100 / len(job.target_languages) current_lang_index = job.target_languages.index(target_language) total_progress = (current_lang_index * 100 + progress) / len(job.target_languages) job.update_status('PROCESSING', progress=total_progress) time.sleep(0.1) except Exception as e: logger.error(f"Failed to translate PDF segment: {segment_text[:50]}... Error: {str(e)}") translated_segments.append(f"[翻譯失敗] {segment_text}") translation_results[target_language] = translated_segments # 生成翻譯Word文件 logger.info("Generating translated Word documents from PDF") output_dir = Path(job.file_path).parent output_files = {} for target_language, translations in translation_results.items(): try: # 使用增強PDF解析器生成Word文檔 output_file = enhanced_parser.generate_translated_document( translations={target_language: translations}, target_language=target_language, output_dir=output_dir ) output_files[target_language] = output_file # 記錄翻譯檔案到資料庫 file_size = Path(output_file).stat().st_size job.add_translated_file( language_code=target_language, filename=Path(output_file).name, file_path=output_file, file_size=file_size ) logger.info(f"Generated PDF translation for {target_language}: {output_file}") except Exception as e: logger.error(f"Failed to generate PDF translated document for {target_language}: {str(e)}") raise TranslationError(f"生成PDF {target_language} 翻譯文件失敗: {str(e)}") # 生成組合多語言文檔 - 譯文1/譯文2格式(當有多個目標語言時) if len(job.target_languages) > 1: try: logger.info("Generating combined multi-language PDF document") combined_output_file = enhanced_parser.generate_combined_translated_document( all_translations=translation_results, target_languages=job.target_languages, output_dir=output_dir ) output_files['combined'] = combined_output_file # 記錄組合翻譯檔案到資料庫 file_size = Path(combined_output_file).stat().st_size job.add_translated_file( language_code='combined', filename=Path(combined_output_file).name, file_path=combined_output_file, file_size=file_size ) logger.info(f"Generated combined multi-language PDF file: {combined_output_file}") except Exception as e: logger.error(f"Failed to generate combined multi-language PDF document: {str(e)}") # 不要因為組合檔案失敗而讓整個任務失敗,只記錄警告 logger.warning("Combined multi-language PDF file generation failed, but individual files were successful") else: # 對於其他文件格式,使用原有邏輯 logger.info(f"Using legacy sentence-based processing for {file_ext} files") parser = self.get_document_parser(job.file_path) # 提取文字片段 - 对PDF传递user_id和job_id以支持OCR if file_ext == '.pdf': text_segments = parser.extract_text_segments(user_id=job.user_id, job_id=job.id) else: text_segments = parser.extract_text_segments() if not text_segments: raise TranslationError("文件中未找到可翻譯的文字") # 分割成句子 all_sentences = [] for segment in text_segments: sentences = self.split_text_into_sentences(segment, job.source_language) all_sentences.extend(sentences) # 去重複 unique_sentences = list(dict.fromkeys(all_sentences)) logger.info(f"Found {len(unique_sentences)} unique sentences to translate") # 批次翻譯 translation_results = {} total_sentences = len(unique_sentences) for target_language in job.target_languages: logger.info(f"Translating to {target_language}") translated_sentences = [] current_conversation_id = job.conversation_id # 維持上下文連貫性 for i, sentence in enumerate(unique_sentences): try: translation_result = self.translate_text_with_cache( text=sentence, source_language=job.source_language, target_language=target_language, user_id=job.user_id, job_id=job.id, conversation_id=current_conversation_id # 傳遞 conversation_id ) translated_sentences.append(translation_result['translated_text']) # 更新 conversation_id 以維持連續對話上下文 if translation_result.get("conversation_id"): current_conversation_id = translation_result["conversation_id"] # 更新進度 progress = (i + 1) / total_sentences * 100 / len(job.target_languages) current_lang_index = job.target_languages.index(target_language) total_progress = (current_lang_index * 100 + progress) / len(job.target_languages) job.update_status('PROCESSING', progress=total_progress) time.sleep(0.1) except Exception as e: logger.error(f"Failed to translate sentence: {sentence[:50]}... Error: {str(e)}") translated_sentences.append(f"[翻譯失敗] {sentence}") translation_results[target_language] = translated_sentences # 生成翻譯文件 output_dir = Path(job.file_path).parent output_files = {} for target_language, translations in translation_results.items(): translation_mapping = {target_language: translations} output_file = parser.generate_translated_document( translations=translation_mapping, target_language=target_language, output_dir=output_dir ) output_files[target_language] = output_file file_size = Path(output_file).stat().st_size job.add_translated_file( language_code=target_language, filename=Path(output_file).name, file_path=output_file, file_size=file_size ) # 計算總成本 total_cost = self._calculate_job_cost(job.id) # 更新任務狀態為完成 job.update_status('COMPLETED', progress=100) job.total_cost = total_cost # 計算實際使用的 token 數(從 API 使用統計中獲取) from sqlalchemy import func from app.models.stats import APIUsageStats from app import db actual_tokens = db.session.query( func.sum(APIUsageStats.total_tokens) ).filter_by(job_id=job.id).scalar() job.total_tokens = int(actual_tokens) if actual_tokens else 0 db.session.commit() logger.info(f"Enhanced document translation completed: {job_uuid}") return { 'success': True, 'job_uuid': job_uuid, 'output_files': output_files, 'total_sentences': len(texts_to_translate) if 'texts_to_translate' in locals() else len(unique_sentences) if 'unique_sentences' in locals() else 0, 'total_cost': float(total_cost), 'target_languages': job.target_languages } except TranslationError: raise except Exception as e: logger.error(f"Enhanced document translation failed: {job_uuid}. Error: {str(e)}") raise TranslationError(f"文件翻譯失敗: {str(e)}") def _calculate_job_cost(self, job_id: int) -> float: """計算任務總成本""" from app import db from sqlalchemy import func from app.models.stats import APIUsageStats total_cost = db.session.query( func.sum(APIUsageStats.cost) ).filter_by(job_id=job_id).scalar() return float(total_cost) if total_cost else 0.0 def _generate_combined_excel_document(self, parser, translation_mapping: Dict[str, List[str]], target_languages: List[str], output_path: Path) -> str: """生成包含所有翻譯語言的組合Excel檔案""" try: import openpyxl from openpyxl.styles import Alignment, Font from sqlalchemy import text as sql_text from app import db # 載入原始工作簿 wb = openpyxl.load_workbook(str(parser.file_path), data_only=False) try: wb_vals = openpyxl.load_workbook(str(parser.file_path), data_only=True) except Exception: wb_vals = None # 取得原始文字段落以建立翻譯映射 original_segments = parser.extract_text_segments() combined_tmap = {} logger.info(f"Building combined translation map for {len(original_segments)} segments") for original_text in original_segments: # 從翻譯快取中查詢所有語言的翻譯 for target_lang in target_languages: result = db.session.execute(sql_text(""" SELECT translated_text FROM dt_translation_cache WHERE source_text = :text AND target_language = :lang ORDER BY created_at ASC LIMIT 1 """), {'text': original_text, 'lang': target_lang}) row = result.fetchone() if row and row[0]: combined_tmap[(target_lang, original_text)] = row[0] logger.info(f"Built combined translation map with {len(combined_tmap)} mappings") # 處理每個工作表,插入組合翻譯 for ws in wb.worksheets: logger.info(f"Processing combined worksheet: {ws.title}") ws_vals = wb_vals[ws.title] if wb_vals and ws.title in wb_vals.sheetnames else None max_row, max_col = ws.max_row, ws.max_column for r in range(1, max_row + 1): for c in range(1, max_col + 1): cell = ws.cell(row=r, column=c) src_text = parser._get_display_text_for_translation(ws, ws_vals, r, c) if not src_text or not parser._should_translate(src_text, 'auto'): continue # 收集所有語言的翻譯 translations = [] for target_lang in target_languages: if (target_lang, src_text) in combined_tmap: translations.append(combined_tmap[(target_lang, src_text)]) else: translations.append(f"【翻譯缺失|{target_lang}】") # 組合翻譯文字:原文\n英文\n越南文 if translations: combined_text = src_text + '\n' + '\n'.join(translations) # 設置儲存格值 cell.value = combined_text cell.alignment = Alignment(wrap_text=True, vertical='top') cell.font = Font(size=10) # 儲存組合檔案 wb.save(str(output_path)) logger.info(f"Generated combined Excel file: {output_path}") return str(output_path) except Exception as e: logger.error(f"Failed to generate combined Excel document: {str(e)}") raise FileProcessingError(f"組合 Excel 檔案生成失敗: {str(e)}") def _generate_combined_pptx_document(self, parser, translation_results: Dict[str, List[str]], target_languages: List[str], output_path: Path) -> str: """生成包含所有翻譯語言的組合PowerPoint檔案""" try: import pptx from sqlalchemy import text as sql_text from app import db # 載入原始 PowerPoint 文件 prs = pptx.Presentation(str(parser.file_path)) # 收集所有文字框和原始文字 text_frames_data = [] for slide in prs.slides: for shape in slide.shapes: if getattr(shape, "has_text_frame", False): text = parser._extract_text_from_frame(shape.text_frame) if text.strip(): text_frames_data.append((shape.text_frame, text)) # 建立組合翻譯映射 - 從快取讀取所有語言的翻譯 combined_translation_map = {} logger.info(f"Building combined PowerPoint translation map for {len(text_frames_data)} text frames") for text_frame, original_text in text_frames_data: # 從翻譯快取中查詢所有語言的翻譯 for target_lang in target_languages: result = db.session.execute(sql_text(""" SELECT translated_text FROM dt_translation_cache WHERE source_text = :text AND target_language = :lang ORDER BY created_at ASC LIMIT 1 """), {'text': original_text, 'lang': target_lang}) row = result.fetchone() if row and row[0]: combined_translation_map[(target_lang, original_text)] = row[0] logger.info(f"Built combined PowerPoint translation map with {len(combined_translation_map)} mappings") # 處理每個文字框,插入組合翻譯 ok_count = skip_count = 0 for text_frame, original_text in text_frames_data: # 收集所有語言的翻譯 translations = [] for target_lang in target_languages: if (target_lang, original_text) in combined_translation_map: translations.append(combined_translation_map[(target_lang, original_text)]) else: translations.append(f"【翻譯缺失|{target_lang}】") # 檢查是否已存在翻譯 if parser._check_existing_translations(text_frame, translations): skip_count += 1 continue # 添加所有語言的翻譯 for translation in translations: parser._append_translation(text_frame, translation) ok_count += 1 # 儲存組合檔案 prs.save(str(output_path)) logger.info(f"Generated combined PowerPoint file: {output_path} with {ok_count} frames, {skip_count} skips") return str(output_path) except Exception as e: logger.error(f"Failed to generate combined PowerPoint document: {str(e)}") raise FileProcessingError(f"組合 PowerPoint 檔案生成失敗: {str(e)}")