Files
Document_translator/app/services/translation_service.py
2025-10-02 17:13:24 +08:00

2634 lines
123 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
翻譯服務
Author: PANJIT IT Team
Created: 2024-01-28
Modified: 2024-01-28
"""
import hashlib
import time
from pathlib import Path
from typing import List, Dict, Any, Optional, Tuple
from app.utils.logger import get_logger
from app.utils.exceptions import TranslationError, FileProcessingError
from app.services.dify_client import DifyClient
from app.services.document_processor import DocumentProcessor, Segment
from app.models.cache import TranslationCache
from app.models.job import TranslationJob
from app.utils.helpers import generate_filename, create_job_directory
from app import db
logger = get_logger(__name__)
class DocumentParser:
    """Abstract base class for format-specific document parsers.

    Validates that the source file exists at construction time and defines
    the interface every concrete parser (DOCX, DOC, Excel, PDF, PPTX)
    must implement.
    """

    def __init__(self, file_path: str):
        # Fail fast when the source file is missing.
        path = Path(file_path)
        if not path.exists():
            raise FileProcessingError(f"檔案不存在: {file_path}")
        self.file_path = path

    def extract_text_segments(self) -> List[str]:
        """Return every translatable text fragment found in the document."""
        raise NotImplementedError

    def generate_translated_document(self, translations: Dict[str, List[str]],
                                     target_language: str, output_dir: Path) -> str:
        """Write a translated copy of the document and return its file path."""
        raise NotImplementedError
class DocxParser(DocumentParser):
    """DOCX document parser backed by the enhanced DocumentProcessor."""

    def __init__(self, file_path: str):
        super().__init__(file_path)
        # Shared processor that handles segment extraction and translation insertion.
        self.processor = DocumentProcessor()

    def extract_text_segments(self) -> List[str]:
        """Extract translatable text segments from the DOCX file.

        Uses the enhanced DocumentProcessor and keeps only segments whose
        stripped text is longer than 3 characters.

        Returns:
            Segment texts in document order.

        Raises:
            FileProcessingError: if the DOCX file cannot be parsed.
        """
        try:
            # Extract structured segments via the shared document processor.
            segments = self.processor.extract_docx_segments(str(self.file_path))
            # Flatten to plain strings, dropping very short fragments.
            text_segments = []
            for seg in segments:
                if seg.text.strip() and len(seg.text.strip()) > 3:
                    text_segments.append(seg.text)
            logger.info(f"Enhanced extraction: {len(text_segments)} text segments from DOCX")
            return text_segments
        except Exception as e:
            logger.error(f"Failed to extract text from DOCX: {str(e)}")
            raise FileProcessingError(f"DOCX 文件解析失敗: {str(e)}")

    def extract_segments_with_context(self) -> List[Segment]:
        """Return the raw Segment objects (with positional context) for this file."""
        return self.processor.extract_docx_segments(str(self.file_path))

    def generate_translated_document(self, translations: Dict[str, List[str]],
                                     target_language: str, output_dir: Path) -> str:
        """Generate a translated DOCX file.

        NOTE: translations are looked up from the dt_translation_cache table,
        not from the ``translations`` argument (kept for interface
        compatibility with the other parsers).

        Args:
            translations: unused here; see note above.
            target_language: language whose cached translations are inserted.
            output_dir: directory for the generated file.

        Returns:
            Path (as str) of the generated document.

        Raises:
            FileProcessingError: if generation fails.
        """
        try:
            from sqlalchemy import text as sql_text
            from app import db
            # Build the output file name.
            output_filename = generate_filename(
                self.file_path.name,
                'translated',
                'translated',
                target_language
            )
            output_path = output_dir / output_filename
            # Re-extract segments so insertion positions match the source file.
            segments = self.extract_segments_with_context()
            # Build the translation map from the cache, keyed by (language, source text).
            translation_map = {}
            logger.info(f"Building translation map for {len(segments)} segments in language {target_language}")
            for seg in segments:
                # Latest cached translation wins (ORDER BY created_at DESC).
                result = db.session.execute(sql_text("""
                    SELECT translated_text
                    FROM dt_translation_cache
                    WHERE source_text = :text AND target_language = :lang
                    ORDER BY created_at DESC
                    LIMIT 1
                """), {'text': seg.text, 'lang': target_language})
                row = result.fetchone()
                if row and row[0]:
                    translation_map[(target_language, seg.text)] = row[0]
                    logger.debug(f"Found translation for: {seg.text[:50]}...")
                else:
                    logger.warning(f"No translation found for: {seg.text[:50]}...")
            logger.info(f"Translation map built with {len(translation_map)} mappings")
            # Delegate the actual insertion to the enhanced processor.
            ok_count, skip_count = self.processor.insert_docx_translations(
                str(self.file_path),
                segments,
                translation_map,
                [target_language],
                str(output_path)
            )
            logger.info(f"Enhanced translation: Generated {output_path} with {ok_count} insertions, {skip_count} skips")
            return str(output_path)
        except Exception as e:
            logger.error(f"Failed to generate translated DOCX: {str(e)}")
            raise FileProcessingError(f"生成翻譯 DOCX 失敗: {str(e)}")
class DocParser(DocumentParser):
    """Legacy .doc parser: converts to DOCX via Word COM, then delegates to DocxParser."""

    def extract_text_segments(self) -> List[str]:
        """Extract text segments from a .doc file.

        The file is converted to a temporary .docx with Word COM automation
        and then parsed with DocxParser.

        Raises:
            FileProcessingError: if Word COM is unavailable or parsing fails.
        """
        try:
            import tempfile
            import os
            # Word COM (pywin32) is required for the DOC -> DOCX conversion.
            try:
                import win32com.client as win32  # noqa: F401 - availability probe
                import pythoncom  # noqa: F401 - availability probe
                _WIN32COM_AVAILABLE = True
            except ImportError:
                _WIN32COM_AVAILABLE = False
            if not _WIN32COM_AVAILABLE:
                raise FileProcessingError("DOC 格式需要 Word COM 支援,請先手動轉換為 DOCX 格式或安裝 Microsoft Office")
            temp_docx = None
            try:
                with tempfile.NamedTemporaryFile(suffix='.docx', delete=False) as tmp:
                    temp_docx = tmp.name
                # FileFormat=16 is wdFormatXMLDocument (.docx).
                self._word_convert(str(self.file_path), temp_docx, 16)
                # Parse the converted copy with the DOCX pipeline.
                docx_parser = DocxParser(temp_docx)
                segments = docx_parser.extract_text_segments()
                logger.info(f"Converted DOC to DOCX and extracted {len(segments)} segments")
                return segments
            finally:
                # Best-effort removal of the temporary DOCX.
                if temp_docx and os.path.exists(temp_docx):
                    try:
                        os.remove(temp_docx)
                    except Exception:
                        pass
        except Exception as e:
            logger.error(f"Failed to extract text from DOC file: {str(e)}")
            raise FileProcessingError(f"DOC 文件解析失敗: {str(e)}")

    def _word_convert(self, input_path: str, output_path: str, target_format: int):
        """Convert a document through the Word COM automation interface.

        Args:
            input_path: source document path.
            output_path: destination path.
            target_format: Word SaveAs2 FileFormat code (16 = .docx).

        Raises:
            FileProcessingError: wraps any COM failure.
        """
        try:
            # BUGFIX: `os` was not in scope in this method (only imported in
            # the callers), so os.path.abspath raised NameError.
            import os
            import win32com.client as win32
            import pythoncom
            pythoncom.CoInitialize()
            # BUGFIX: initialize `word` so a failed Dispatch does not hit an
            # unbound name inside the finally block.
            word = None
            try:
                word = win32.Dispatch("Word.Application")
                word.Visible = False
                doc = word.Documents.Open(os.path.abspath(input_path))
                doc.SaveAs2(os.path.abspath(output_path), FileFormat=target_format)
                doc.Close(False)
            finally:
                try:
                    if word is not None:
                        word.Quit()
                finally:
                    # Balance CoInitialize even if Quit() raises.
                    pythoncom.CoUninitialize()
        except Exception as e:
            raise FileProcessingError(f"Word COM 轉換失敗: {str(e)}")

    def generate_translated_document(self, translations: Dict[str, List[str]],
                                     target_language: str, output_dir: Path) -> str:
        """Generate a translated document for a .doc source.

        The source is converted to a temporary DOCX and the DOCX pipeline is
        used, so the final output is a .docx file whose name is chosen by
        DocxParser.generate_translated_document. (Dead locals that computed
        an unused output name here were removed.)
        """
        try:
            import tempfile
            import os
            temp_docx = None
            try:
                with tempfile.NamedTemporaryFile(suffix='.docx', delete=False) as tmp:
                    temp_docx = tmp.name
                # Convert DOC -> DOCX, then run the DOCX translation pipeline.
                self._word_convert(str(self.file_path), temp_docx, 16)
                docx_parser = DocxParser(temp_docx)
                result_path = docx_parser.generate_translated_document(translations, target_language, output_dir)
                logger.info(f"Generated translated DOC file (as DOCX): {result_path}")
                return result_path
            finally:
                # Best-effort removal of the temporary DOCX.
                if temp_docx and os.path.exists(temp_docx):
                    try:
                        os.remove(temp_docx)
                    except Exception:
                        pass
        except Exception as e:
            logger.error(f"Failed to generate translated DOC file: {str(e)}")
            raise FileProcessingError(f"DOC 翻譯檔生成失敗: {str(e)}")
class ExcelParser(DocumentParser):
    """Excel (XLSX/XLS) parser, ported from the reference implementation."""

    def extract_text_segments(self) -> List[str]:
        """Extract unique translatable cell texts from the workbook.

        Loads the workbook twice: once raw (formulas preserved) and once with
        computed values (``data_only=True``) so formula cells can contribute
        their displayed text.

        Returns:
            De-duplicated cell texts in scan order.

        Raises:
            FileProcessingError: if the workbook cannot be opened (XLS files
                must be converted to XLSX first).
        """
        try:
            import openpyxl
            from openpyxl.utils.exceptions import InvalidFileException
            # BUGFIX: the two loads were in one try whose generic
            # `except Exception` only set wb_vals=None — if the FIRST load
            # failed generically, `wb` stayed unbound and a NameError
            # surfaced later. Load them separately: the raw load must
            # succeed, the values view is optional.
            try:
                wb = openpyxl.load_workbook(str(self.file_path), data_only=False)
            except InvalidFileException:
                if self.file_path.suffix.lower() == '.xls':
                    raise FileProcessingError("XLS 格式需要先轉換為 XLSX 格式")
                raise
            try:
                wb_vals = openpyxl.load_workbook(str(self.file_path), data_only=True)
            except Exception:
                wb_vals = None
            # Scan every cell and keep texts that qualify for translation.
            segs = []
            for ws in wb.worksheets:
                ws_vals = wb_vals[ws.title] if wb_vals and ws.title in wb_vals.sheetnames else None
                max_row, max_col = ws.max_row, ws.max_column
                for r in range(1, max_row + 1):
                    for c in range(1, max_col + 1):
                        src_text = self._get_display_text_for_translation(ws, ws_vals, r, c)
                        if not src_text:
                            continue
                        if not self._should_translate(src_text, 'auto'):
                            continue
                        segs.append(src_text)
            # De-duplicate while preserving first-seen order.
            unique_segments = []
            seen = set()
            for seg in segs:
                if seg not in seen:
                    unique_segments.append(seg)
                    seen.add(seg)
            logger.info(f"Extracted {len(unique_segments)} unique text segments from Excel file")
            return unique_segments
        except Exception as e:
            logger.error(f"Failed to extract text from Excel file: {str(e)}")
            raise FileProcessingError(f"Excel 文件解析失敗: {str(e)}")

    def _get_display_text_for_translation(self, ws, ws_vals, r: int, c: int) -> Optional[str]:
        """Return the text a cell displays, for translation purposes.

        Formula cells contribute their computed value (from the data_only
        workbook view) when it is a non-empty string; plain string cells
        contribute their value directly.
        """
        val = ws.cell(row=r, column=c).value
        if isinstance(val, str) and val.startswith("="):
            # Formula: use the computed value if available.
            if ws_vals is not None:
                shown = ws_vals.cell(row=r, column=c).value
                return shown if isinstance(shown, str) and shown.strip() else None
            return None
        if isinstance(val, str) and val.strip():
            return val
        if ws_vals is not None:
            # Non-string raw value: fall back to the computed view.
            shown = ws_vals.cell(row=r, column=c).value
            if isinstance(shown, str) and shown.strip():
                return shown
        return None

    def _should_translate(self, text: str, src_lang: str) -> bool:
        """Decide whether *text* should be sent for translation.

        Any non-empty text qualifies, except pure number/date/punctuation
        strings; in auto-detect mode the text must contain CJK characters or
        be longer than 5 characters.
        """
        text = text.strip()
        # Translate anything with content (minimum length 1).
        if len(text) < 1:
            return False
        # Skip pure numbers, dates, etc.
        import re
        if re.match(r'^[\d\s\.\-\:\/]+$', text):
            return False
        # For auto-detect, translate if it has CJK or meaningful length.
        if src_lang.lower() in ('auto', 'auto-detect'):
            return self._has_cjk(text) or len(text) > 5
        return True

    def _has_cjk(self, text: str) -> bool:
        """Return True if *text* contains CJK (Chinese/Japanese/Korean) characters.

        BUGFIX: the original compared against '\\u20000' / '\\u2a6df', but
        ``\\u`` consumes exactly 4 hex digits, so those were TWO-character
        strings ('\\u2000' + '0', '\\u2a6d' + 'f') and the Extension-B test
        matched the wrong range. Compare code points numerically instead.
        """
        for char in text:
            cp = ord(char)
            if (0x4E00 <= cp <= 0x9FFF          # CJK Unified Ideographs
                    or 0x3400 <= cp <= 0x4DBF   # CJK Extension A
                    or 0x20000 <= cp <= 0x2A6DF # CJK Extension B (supplementary plane)
                    or 0x3040 <= cp <= 0x309F   # Hiragana
                    or 0x30A0 <= cp <= 0x30FF   # Katakana
                    or 0xAC00 <= cp <= 0xD7AF): # Hangul syllables
                return True
        return False

    def generate_translated_document(self, translations: Dict[str, List[str]],
                                     target_language: str, output_dir: Path) -> str:
        """Generate a translated XLSX file.

        Translations come from the dt_translation_cache table (exact match
        first, whitespace-normalized match as fallback); the ``translations``
        argument is kept only for interface compatibility. Formula cells get
        the translation as a cell comment; plain cells are replaced with the
        translated text.

        Returns:
            Path (as str) of the generated workbook.

        Raises:
            FileProcessingError: if generation fails.
        """
        try:
            import openpyxl
            from openpyxl.styles import Alignment
            from openpyxl.comments import Comment
            from sqlalchemy import text as sql_text
            from app import db
            # Load the raw workbook (formulas intact) and, best effort, the
            # computed-values view.
            wb = openpyxl.load_workbook(str(self.file_path), data_only=False)
            try:
                wb_vals = openpyxl.load_workbook(str(self.file_path), data_only=True)
            except Exception:
                wb_vals = None
            # Build the translation map from the cache so source/translation
            # pairing is guaranteed to be correct.
            original_segments = self.extract_text_segments()
            tmap = {}
            logger.info(f"Building translation map for {len(original_segments)} segments in language {target_language}")
            for original_text in original_segments:
                # Exact match first, then a whitespace-normalized match; the
                # earliest cache entry (original Dify translation) wins.
                normalized_text = original_text.replace('\n', ' ').replace('\r', ' ').strip()
                result = db.session.execute(sql_text("""
                    SELECT translated_text, created_at, 'exact' as match_type
                    FROM dt_translation_cache
                    WHERE source_text = :exact_text AND target_language = :lang
                    UNION ALL
                    SELECT translated_text, created_at, 'normalized' as match_type
                    FROM dt_translation_cache
                    WHERE REPLACE(REPLACE(TRIM(source_text), '\n', ' '), '\r', ' ') = :norm_text
                    AND target_language = :lang
                    AND source_text != :exact_text
                    ORDER BY created_at ASC
                    LIMIT 1
                """), {'exact_text': original_text, 'norm_text': normalized_text, 'lang': target_language})
                row = result.fetchone()
                if row and row[0]:
                    tmap[original_text] = row[0]
                    logger.debug(f"Cache hit for Excel: {original_text[:30]}... -> {row[0][:30]}...")
                else:
                    logger.warning(f"No translation found in cache for: {original_text[:50]}...")
            logger.info(f"Translation map built with {len(tmap)} mappings from cache")
            # Walk every cell of every sheet and apply translations.
            translation_count = 0
            skip_count = 0
            for ws in wb.worksheets:
                logger.info(f"Processing worksheet: {ws.title}")
                ws_vals = wb_vals[ws.title] if wb_vals and ws.title in wb_vals.sheetnames else None
                max_row, max_col = ws.max_row, ws.max_column
                for r in range(1, max_row + 1):
                    for c in range(1, max_col + 1):
                        cell_name = f"{openpyxl.utils.get_column_letter(c)}{r}"
                        src_text = self._get_display_text_for_translation(ws, ws_vals, r, c)
                        if not src_text:
                            continue
                        # Skip cells that do not qualify for translation.
                        should_translate = self._should_translate(src_text, 'auto')
                        if not should_translate:
                            logger.debug(f"Skip {cell_name}: '{src_text[:30]}...' (should not translate)")
                            skip_count += 1
                            continue
                        # Skip cells with no cached translation.
                        if src_text not in tmap:
                            logger.warning(f"No translation mapping for {cell_name}: '{src_text[:30]}...'")
                            skip_count += 1
                            continue
                        val = ws.cell(row=r, column=c).value
                        is_formula = isinstance(val, str) and val.startswith("=")
                        translated_text = tmap[src_text]
                        cell = ws.cell(row=r, column=c)
                        if is_formula:
                            # Formula cell: attach the translation as a comment
                            # so the formula itself is preserved.
                            txt_comment = f"翻譯: {translated_text}"
                            exist = cell.comment
                            if not exist or exist.text.strip() != txt_comment:
                                cell.comment = Comment(txt_comment, "translator")
                                logger.debug(f"Added comment to {cell_name}: {translated_text[:30]}...")
                            translation_count += 1
                        else:
                            # Plain cell: single-language output keeps only the
                            # translated text (no original).
                            current_text = str(cell.value) if cell.value else ""
                            if current_text.strip() == translated_text.strip():
                                logger.debug(f"Skip {cell_name}: already translated")
                                continue
                            cell.value = translated_text
                            logger.info(f"Translated {cell_name}: '{src_text[:20]}...' -> '{translated_text[:20]}...'")
                            translation_count += 1
                        # Enable wrapping so longer translations stay visible.
                        try:
                            if cell.alignment:
                                cell.alignment = Alignment(
                                    horizontal=cell.alignment.horizontal,
                                    vertical=cell.alignment.vertical,
                                    wrap_text=True
                                )
                            else:
                                cell.alignment = Alignment(wrap_text=True)
                        except Exception:
                            cell.alignment = Alignment(wrap_text=True)
            # Save the translated workbook.
            output_filename = f"{self.file_path.stem}_{target_language}_translated.xlsx"
            output_path = output_dir / output_filename
            wb.save(str(output_path))
            logger.info(f"Excel translation completed: {translation_count} translations, {skip_count} skips")
            logger.info(f"Generated translated Excel file: {output_path}")
            return str(output_path)
        except Exception as e:
            logger.error(f"Failed to generate translated Excel file: {str(e)}")
            raise FileProcessingError(f"Excel 翻譯檔生成失敗: {str(e)}")
class PdfParser(DocumentParser):
    """PDF parser: OCR-capable enhanced extraction with a PyPDF2 fallback."""

    def extract_text_segments(self, user_id: Optional[int] = None, job_id: Optional[int] = None) -> List[str]:
        """Extract text segments from the PDF.

        Tries the enhanced parser first (handles scanned PDFs via OCR) and
        falls back to plain PyPDF2 text extraction with naive sentence
        splitting.

        Args:
            user_id: optional user id forwarded to the enhanced parser.
                (Annotation fixed from the PEP 484-invalid implicit-Optional
                ``int = None``.)
            job_id: optional job id forwarded to the enhanced parser.

        Raises:
            FileProcessingError: if both extraction strategies fail.
        """
        try:
            from app.services.enhanced_pdf_parser import EnhancedPdfParser
            # Enhanced path: supports scanned PDFs through OCR.
            enhanced_parser = EnhancedPdfParser(str(self.file_path))
            text_segments = enhanced_parser.extract_text_segments(user_id, job_id)
            logger.info(f"Enhanced PDF extraction: {len(text_segments)} text segments")
            return text_segments
        except Exception as e:
            logger.error(f"Enhanced PDF extraction failed, falling back to basic extraction: {str(e)}")
            # Fallback: plain text extraction, split on periods.
            try:
                from PyPDF2 import PdfReader
                reader = PdfReader(str(self.file_path))
                text_segments = []
                for page in reader.pages:
                    text = page.extract_text()
                    # Naive sentence split; keep only substantial pieces.
                    sentences = text.split('.')
                    for sentence in sentences:
                        sentence = sentence.strip()
                        if sentence and len(sentence) > 10:
                            text_segments.append(sentence)
                logger.info(f"Basic PDF extraction: {len(text_segments)} text segments")
                return text_segments
            except Exception as e2:
                logger.error(f"Basic PDF extraction also failed: {str(e2)}")
                raise FileProcessingError(f"PDF 文件解析失敗: {str(e2)}")

    def generate_translated_document(self, translations: Dict[str, List[str]],
                                     target_language: str, output_dir: Path) -> str:
        """Generate the translated output for a PDF.

        PDFs cannot be edited in place: the enhanced parser decides the
        output format; on failure a numbered plain-text listing of the
        translations is written instead.
        """
        try:
            from app.services.enhanced_pdf_parser import EnhancedPdfParser
            enhanced_parser = EnhancedPdfParser(str(self.file_path))
            return enhanced_parser.generate_translated_document(translations, target_language, output_dir)
        except Exception as e:
            # Fallback: write translations to a plain-text file.
            logger.warning(f"Enhanced PDF generation failed, using basic method: {str(e)}")
            translated_texts = translations.get(target_language, [])
            output_filename = f"{self.file_path.stem}_{target_language}_translated.txt"
            output_path = output_dir / output_filename
            with open(output_path, 'w', encoding='utf-8') as f:
                f.write(f"翻譯結果 - {target_language}\n")
                f.write("=" * 50 + "\n\n")
                for i, text in enumerate(translated_texts):
                    f.write(f"{i+1}. {text}\n\n")
            logger.info(f"Generated translated text file: {output_path}")
            return str(output_path)
class PptxParser(DocumentParser):
"""PowerPoint 文件解析器"""
def extract_text_segments(self) -> List[str]:
    """Extract text segments from the PPTX file (including tables, charts,
    grouped shapes, SmartArt and fallback shape text).

    After extraction, a diagnostic pass logs whether a hard-coded list of
    known keywords was captured (exact / substring / fuzzy matching) — a
    debugging aid for missing-text reports.

    Returns:
        Extracted text blocks in slide/shape order.

    Raises:
        FileProcessingError: if the presentation cannot be parsed.
    """
    try:
        import pptx
        prs = pptx.Presentation(str(self.file_path))
        text_segments = []
        for slide_idx, slide in enumerate(prs.slides, 1):
            for shape_idx, shape in enumerate(slide.shapes, 1):
                shape_processed = False
                # Text frames first — most text lives here.
                if getattr(shape, "has_text_frame", False):
                    text_frame = shape.text_frame
                    text = self._extract_text_from_frame(text_frame)
                    if text.strip():
                        text_segments.append(text)
                        logger.debug(f"Extracted text frame from slide {slide_idx}, shape {shape_idx}: {text[:50]}...")
                        shape_processed = True
                # Tables.
                if getattr(shape, "has_table", False):
                    table_texts = self._extract_text_from_table(shape.table, slide_idx, shape_idx)
                    text_segments.extend(table_texts)
                    if table_texts:
                        shape_processed = True
                # Charts.
                if getattr(shape, "has_chart", False):
                    chart_texts = self._extract_text_from_chart(shape.chart, slide_idx, shape_idx)
                    text_segments.extend(chart_texts)
                    if chart_texts:
                        shape_processed = True
                # Grouped shapes (supports deep nesting).
                if hasattr(shape, 'shapes'):
                    group_texts = self._extract_text_from_group(shape.shapes, slide_idx, shape_idx, depth=0)
                    text_segments.extend(group_texts)
                    if group_texts:
                        shape_processed = True
                # GraphicFrame content such as SmartArt (limited support).
                if getattr(shape, "has_smart_art", False):
                    smartart_texts = self._extract_text_from_smartart(shape, slide_idx, shape_idx)
                    text_segments.extend(smartart_texts)
                    if smartart_texts:
                        shape_processed = True
                # Plain shape text as a backup, avoiding duplicate extraction.
                if not shape_processed and hasattr(shape, 'text') and shape.text.strip():
                    text_segments.append(shape.text)
                    logger.debug(f"Extracted shape text from slide {slide_idx}, shape {shape_idx}: {shape.text[:50]}...")
                    shape_processed = True
                # Last resort: deep fallback extraction.
                if not shape_processed:
                    fallback_texts = self._extract_fallback_text(shape, slide_idx, shape_idx)
                    text_segments.extend(fallback_texts)
        logger.info(f"PowerPoint extraction: {len(text_segments)} text segments from PPTX (including tables)")
        # Diagnostic pass for specific known keywords (debugging aid).
        target_keywords = [
            "檢驗盤剔線作業時缺少線塌防護設計",
            "治工具未標準化管理",
            "彈匣裝載料片間距不足",
            "彈匣未評估防震防傾倒風險",
            "搬運台車選用錯誤"
        ]
        logger.info("=== 關鍵字診斷開始 ===")
        for keyword in target_keywords:
            # Exact match.
            exact_matches = [seg for seg in text_segments if keyword == seg.strip()]
            # Substring match.
            contains_matches = [seg for seg in text_segments if keyword in seg]
            # Fuzzy match: ignore spaces and line breaks on both sides.
            normalized_keyword = keyword.replace(' ', '').replace('\n', '').replace('\r', '')
            fuzzy_matches = [seg for seg in text_segments
                             if normalized_keyword in seg.replace(' ', '').replace('\n', '').replace('\r', '')]
            if exact_matches:
                logger.info(f"✅ 完全匹配關鍵字: '{keyword}'{len(exact_matches)} 個文字片段中")
                for i, seg in enumerate(exact_matches):
                    logger.info(f" 完全匹配{i+1}: '{seg}'")
            elif contains_matches:
                logger.info(f"🔍 包含關鍵字: '{keyword}'{len(contains_matches)} 個文字片段中")
                for i, seg in enumerate(contains_matches):
                    logger.info(f" 包含匹配{i+1}: '{seg}'")
            elif fuzzy_matches:
                logger.info(f"🎯 模糊匹配關鍵字: '{keyword}'{len(fuzzy_matches)} 個文字片段中")
                for i, seg in enumerate(fuzzy_matches):
                    logger.info(f" 模糊匹配{i+1}: '{seg}'")
                    # Show the normalized forms being compared.
                    normalized_seg = seg.replace(' ', '').replace('\n', '').replace('\r', '')
                    logger.info(f" 標準化後: 關鍵字='{normalized_keyword}' vs 片段='{normalized_seg}'")
            else:
                logger.warning(f"❌ 未找到關鍵字: '{keyword}'")
                # Look for loosely similar segments via character overlap.
                similar_segments = []
                for seg in text_segments:
                    # Crude similarity: shared-character count.
                    keyword_chars = set(keyword)
                    seg_chars = set(seg)
                    intersection = keyword_chars.intersection(seg_chars)
                    if len(intersection) >= min(5, len(keyword_chars) * 0.5):
                        similar_segments.append(seg)
                if similar_segments:
                    logger.info(f"💡 可能相似的片段 ({len(similar_segments)} 個):")
                    for i, seg in enumerate(similar_segments[:3]):  # show only the first 3
                        logger.info(f" 相似{i+1}: '{seg}'")
        logger.info("=== 關鍵字診斷結束 ===")
        return text_segments
    except Exception as e:
        logger.error(f"Failed to extract text from PPTX: {str(e)}")
        raise FileProcessingError(f"PPTX 文件解析失敗: {str(e)}")
def _extract_text_from_frame(self, text_frame) -> str:
    """Collect paragraph text from a text frame and normalize its whitespace.

    Returns an empty string when the frame is missing, has no paragraphs
    attribute, or contains only blank paragraphs.
    """
    if not text_frame or not hasattr(text_frame, 'paragraphs'):
        return ""
    import re
    # Keep only non-blank paragraphs, stripped of surrounding whitespace.
    kept = [p.text.strip() for p in text_frame.paragraphs if p.text and p.text.strip()]
    if not kept:
        return ""
    merged = "\n".join(kept)
    # Normalize line endings, drop trailing newlines (interior ones survive).
    merged = merged.replace('\r\n', '\n').replace('\r', '\n').rstrip('\n')
    # Collapse runs of spaces/tabs, then squeeze blank lines between paragraphs.
    merged = re.sub(r'[ \t]+', ' ', merged)
    return re.sub(r'\n\s*\n', '\n', merged)
def _extract_text_from_table(self, table, slide_idx: int, shape_idx: int) -> List[str]:
    """Collect the non-empty cell texts of a table, row by row.

    Extraction errors are logged and swallowed so one broken table does not
    abort the whole slide scan.
    """
    collected = []
    try:
        for row_no, row in enumerate(table.rows):
            for col_no, cell in enumerate(row.cells):
                content = cell.text_frame.text.strip()
                if not content:
                    continue
                collected.append(content)
                logger.debug(f"Extracted table cell text from slide {slide_idx}, shape {shape_idx}, "
                             f"row {row_no+1}, col {col_no+1}: {content[:50]}...")
        logger.info(f"Extracted {len(collected)} cells from table on slide {slide_idx}")
    except Exception as e:
        logger.error(f"Failed to extract text from table on slide {slide_idx}: {str(e)}")
    return collected
def _extract_text_from_chart(self, chart, slide_idx: int, shape_idx: int) -> List[str]:
    """Pull whatever text python-pptx exposes from a chart (title only).

    python-pptx offers no access to axis or series labels, so the chart
    title is the only text element we can harvest here.
    """
    texts = []
    try:
        # Chart title, when present and backed by a text frame.
        if hasattr(chart, 'chart_title') and chart.chart_title.has_text_frame:
            title = chart.chart_title.text_frame.text.strip()
            if title:
                texts.append(title)
                logger.debug(f"Extracted chart title from slide {slide_idx}: {title[:50]}...")
        logger.info(f"Extracted {len(texts)} text elements from chart on slide {slide_idx}")
    except Exception as e:
        logger.error(f"Failed to extract text from chart on slide {slide_idx}: {str(e)}")
    return texts
def _extract_text_from_group(self, shapes, slide_idx: int, shape_idx: Any, depth: int = 0) -> List[str]:
    """Extract text from a group of shapes, recursing into nested groups.

    Args:
        shapes: iterable of shapes inside the group.
        slide_idx: 1-based slide number (for logging).
        shape_idx: index path of the group — an int at the top level, a
            dotted string (e.g. "3.1") for nested levels.
        depth: current recursion depth; traversal stops past max_depth.

    Returns:
        All text fragments found in the group, in traversal order.
    """
    group_texts = []
    max_depth = 10  # guard against pathological / cyclic nesting
    if depth > max_depth:
        logger.warning(f"Group nesting depth exceeded {max_depth} on slide {slide_idx}, skipping deeper levels")
        return group_texts
    try:
        for sub_shape_idx, sub_shape in enumerate(shapes):
            shape_processed = False
            # 1. Nested groups first (recursive descent).
            if hasattr(sub_shape, 'shapes') and hasattr(sub_shape, 'shape_type'):
                try:
                    # This sub-shape is itself a group — recurse.
                    nested_texts = self._extract_text_from_group(sub_shape.shapes, slide_idx,
                                                                 f"{shape_idx}.{sub_shape_idx}", depth + 1)
                    group_texts.extend(nested_texts)
                    if nested_texts:
                        shape_processed = True
                        logger.debug(f"Extracted {len(nested_texts)} texts from nested group "
                                     f"at slide {slide_idx}, depth {depth + 1}")
                except Exception as e:
                    logger.debug(f"Failed to process nested group at slide {slide_idx}, "
                                 f"depth {depth + 1}: {str(e)}")
            # 2. Text frames.
            if getattr(sub_shape, "has_text_frame", False):
                text = self._extract_text_from_frame(sub_shape.text_frame)
                if text.strip():
                    group_texts.append(text)
                    logger.debug(f"Extracted group text from slide {slide_idx}, group {shape_idx}, "
                                 f"sub-shape {sub_shape_idx} (depth {depth}): {text[:50]}...")
                    shape_processed = True
            # 3. Tables inside the group.
            if getattr(sub_shape, "has_table", False):
                sub_table_texts = self._extract_text_from_table(sub_shape.table, slide_idx,
                                                                f"{shape_idx}.{sub_shape_idx}")
                group_texts.extend(sub_table_texts)
                if sub_table_texts:
                    shape_processed = True
            # 4. Charts inside the group.
            if getattr(sub_shape, "has_chart", False):
                chart_texts = self._extract_text_from_chart(sub_shape.chart, slide_idx,
                                                            f"{shape_idx}.{sub_shape_idx}")
                group_texts.extend(chart_texts)
                if chart_texts:
                    shape_processed = True
            # 5. Plain shape text as the last regular option.
            if not shape_processed and hasattr(sub_shape, 'text') and sub_shape.text.strip():
                group_texts.append(sub_shape.text)
                logger.debug(f"Extracted group shape text from slide {slide_idx} "
                             f"(depth {depth}): {sub_shape.text[:50]}...")
                shape_processed = True
            # 6. Fallback extraction when nothing else matched.
            if not shape_processed:
                fallback_texts = self._extract_fallback_text(sub_shape, slide_idx,
                                                             f"{shape_idx}.{sub_shape_idx}")
                group_texts.extend(fallback_texts)
        logger.info(f"Extracted {len(group_texts)} text elements from grouped shapes "
                    f"on slide {slide_idx} (depth {depth})")
    except Exception as e:
        logger.error(f"Failed to extract text from grouped shapes on slide {slide_idx} "
                     f"(depth {depth}): {str(e)}")
    return group_texts
def _extract_text_from_smartart(self, shape, slide_idx: int, shape_idx: int) -> List[str]:
    """Placeholder for SmartArt text extraction (python-pptx support is limited).

    Currently only logs that SmartArt was encountered and returns an empty
    list; a library with real SmartArt support would be needed to extract
    its text.
    """
    results: List[str] = []
    try:
        # python-pptx cannot read SmartArt content; record its presence only.
        logger.warning(f"SmartArt detected on slide {slide_idx}, shape {shape_idx} - limited support available")
        logger.info("Consider using alternative libraries like Spire.Presentation for full SmartArt support")
        # Intentionally return an empty list instead of failing.
    except Exception as e:
        logger.error(f"Failed to extract text from SmartArt on slide {slide_idx}: {str(e)}")
    return results
def _extract_fallback_text(self, shape, slide_idx: int, shape_idx: Any) -> List[str]:
    """Last-resort text extraction for shapes the regular branches missed.

    Probes several access paths in turn: the plain ``text`` attribute, the
    text frame (aggregate text and individual runs), nested sub-shapes,
    special attributes, and finally the raw XML ``<a:t>`` text nodes.
    Duplicates may occur between the aggregate and per-run passes.

    Args:
        shape: the shape to probe.
        slide_idx: 1-based slide number (for logging).
        shape_idx: index (or dotted index-path string) of the shape.

    Returns:
        Any additional text fragments found (possibly empty).
    """
    fallback_texts = []
    try:
        # Log the shape type to aid debugging of unhandled shapes.
        shape_type = getattr(shape, 'shape_type', None)
        logger.debug(f"Fallback extraction for slide {slide_idx}, shape {shape_idx}, type: {shape_type}")
        # Method 1: the direct `text` attribute (even if earlier passes skipped it).
        if hasattr(shape, 'text'):
            text = getattr(shape, 'text', '')
            if text and text.strip():
                fallback_texts.append(text)
                logger.debug(f"Fallback: Found direct text - {text[:50]}...")
        # Method 2: the text_frame's aggregate text.
        try:
            if hasattr(shape, 'text_frame'):
                text_frame = shape.text_frame
                if text_frame and hasattr(text_frame, 'text'):
                    text = text_frame.text
                    if text and text.strip():
                        fallback_texts.append(text)
                        logger.debug(f"Fallback: Found text_frame text - {text[:50]}...")
        except:
            pass
        # Method 2.5: walk the text_frame's paragraph/run structure.
        try:
            if hasattr(shape, 'text_frame') and shape.text_frame:
                text_frame = shape.text_frame
                if hasattr(text_frame, 'paragraphs'):
                    for para_idx, paragraph in enumerate(text_frame.paragraphs):
                        if hasattr(paragraph, 'runs'):
                            for run_idx, run in enumerate(paragraph.runs):
                                if hasattr(run, 'text') and run.text.strip():
                                    fallback_texts.append(run.text)
                                    logger.debug(f"Fallback: Found run text {para_idx}.{run_idx} - {run.text[:30]}...")
        except Exception as e:
            logger.debug(f"Failed to extract paragraph runs: {str(e)}")
        # Method 2.6: recurse into nested shapes, if any.
        if hasattr(shape, 'shapes') and shape.shapes:
            try:
                nested_texts = self._extract_text_from_group(shape.shapes, slide_idx,
                                                             f"fallback_{shape_idx}", depth=0)
                fallback_texts.extend(nested_texts)
                if nested_texts:
                    logger.debug(f"Fallback: Found {len(nested_texts)} texts from nested shapes")
            except Exception as e:
                logger.debug(f"Failed to extract from nested shapes: {str(e)}")
        # Method 3: special attributes that sometimes carry text.
        special_attrs = ['textFrame', 'text_frame', '_element']
        for attr in special_attrs:
            try:
                if hasattr(shape, attr):
                    obj = getattr(shape, attr)
                    if hasattr(obj, 'text') and obj.text and obj.text.strip():
                        fallback_texts.append(obj.text)
                        logger.debug(f"Fallback: Found {attr} text - {obj.text[:30]}...")
            except:
                continue
        # Method 4: raw XML scan, e.g. for GraphicFrame contents. (The
        # original comments labelled this as a second "Method 3".)
        if hasattr(shape, 'element'):
            try:
                # Search the XML subtree for text nodes.
                element = shape.element
                text_elements = []
                # Collect text from <a:t> tags anywhere below this element.
                for t_elem in element.iter():
                    if t_elem.tag.endswith('}t'):  # matches the a:t tag
                        if t_elem.text and t_elem.text.strip():
                            text_elements.append(t_elem.text.strip())
                # De-duplicate before appending.
                for text in set(text_elements):
                    if text not in [existing_text for existing_text in fallback_texts]:
                        fallback_texts.append(text)
                        logger.debug(f"Fallback: Found XML text - {text[:50]}...")
            except Exception as xml_e:
                logger.debug(f"XML extraction failed for shape {shape_idx}: {str(xml_e)}")
        if fallback_texts:
            logger.info(f"Fallback extraction found {len(fallback_texts)} additional text elements on slide {slide_idx}, shape {shape_idx}")
        else:
            logger.debug(f"No additional text found in fallback for slide {slide_idx}, shape {shape_idx}")
    except Exception as e:
        logger.error(f"Fallback text extraction failed for slide {slide_idx}, shape {shape_idx}: {str(e)}")
    return fallback_texts
def _normalize_text(self, text: str) -> str:
    """Normalize *text* for comparison: trim, collapse whitespace runs to a
    single space, and lower-case. ``None`` is treated as the empty string."""
    import re
    collapsed = re.sub(r"\s+", " ", (text or "").strip())
    return collapsed.lower()
def _check_existing_translations(self, text_frame, translations: List[str]) -> bool:
    """Return True if the trailing paragraphs of *text_frame* already hold
    *translations*.

    A trailing paragraph counts as "ours" when its text is normalized-equal
    to the expected translation AND every non-empty run in it is italic
    (the formatting marker applied by _append_translation). This makes
    translation insertion idempotent across re-runs.
    """
    if len(text_frame.paragraphs) < len(translations):
        return False
    # Compare the trailing paragraphs against the expected translations.
    tail_paragraphs = text_frame.paragraphs[-len(translations):]
    for para, expected in zip(tail_paragraphs, translations):
        if self._normalize_text(para.text) != self._normalize_text(expected):
            return False
        # Any non-italic, non-empty run means this paragraph is original
        # content rather than a translation we inserted.
        if any((r.font.italic is not True) and (r.text or "").strip() for r in para.runs):
            return False
    return True
def _append_translation(self, text_frame, text_block: str):
    """Append *text_block* as a new paragraph at the end of the text frame,
    formatted italic at 12pt so inserted translations are recognizable
    (and detectable by _check_existing_translations)."""
    try:
        from pptx.util import Pt as PPTPt
        paragraph = text_frame.add_paragraph()
        paragraph.text = text_block
        # Mark every run of the inserted paragraph: italic, 12pt.
        for run in paragraph.runs:
            run.font.italic = True
            run.font.size = PPTPt(12)
    except Exception as e:
        logger.error(f"Failed to append translation to text frame: {str(e)}")
        raise
def generate_translated_document(self, translations: Dict[str, List[str]],
                                 target_language: str, output_dir: Path) -> str:
    """Generate a translated PPTX file.

    Translations are looked up from the dt_translation_cache table (the
    ``translations`` argument is not consulted; kept for interface
    compatibility) and appended as italic paragraphs after each original
    text frame.

    NOTE(review): only direct text frames are handled on this path; tables,
    charts and groups are covered by the insert_pptx_* methods.

    Returns:
        Path (as str) of the generated presentation.

    Raises:
        FileProcessingError: if generation fails.
    """
    try:
        import pptx
        from sqlalchemy import text as sql_text
        from app import db
        # Load the presentation.
        prs = pptx.Presentation(str(self.file_path))
        # Build the output file name.
        output_filename = generate_filename(
            self.file_path.name,
            'translated',
            'translated',
            target_language
        )
        output_path = output_dir / output_filename
        # Collect every non-empty text frame with its extracted text.
        text_frames = []
        for slide in prs.slides:
            for shape in slide.shapes:
                if getattr(shape, "has_text_frame", False):
                    text = self._extract_text_from_frame(shape.text_frame)
                    if text.strip():
                        text_frames.append((shape.text_frame, text))
        # Build the translation map from the cache (latest entry wins).
        translation_map = {}
        logger.info(f"Building translation map for {len(text_frames)} text frames in language {target_language}")
        for text_frame, text in text_frames:
            # Query the cache for this frame's text.
            result = db.session.execute(sql_text("""
                SELECT translated_text
                FROM dt_translation_cache
                WHERE source_text = :text AND target_language = :lang
                ORDER BY created_at DESC
                LIMIT 1
            """), {'text': text, 'lang': target_language})
            row = result.fetchone()
            if row and row[0]:
                translation_map[text] = row[0]
                logger.debug(f"Found translation for PowerPoint text: {text[:50]}...")
            else:
                logger.warning(f"No translation found for PowerPoint text: {text[:50]}...")
        logger.info(f"Translation map built with {len(translation_map)} mappings")
        # Append translations, skipping frames that already carry them
        # (keeps re-runs idempotent).
        ok_count = skip_count = 0
        for text_frame, original_text in text_frames:
            if original_text not in translation_map:
                skip_count += 1
                logger.debug(f"Skip PowerPoint frame: no translation for {original_text[:30]}...")
                continue
            translated_text = translation_map[original_text]
            translations_to_add = [translated_text]  # single-language mode
            # Skip if the translation already exists at the frame's tail.
            if self._check_existing_translations(text_frame, translations_to_add):
                skip_count += 1
                logger.debug(f"Skip PowerPoint frame: translation already exists for {original_text[:30]}...")
                continue
            # Append the translation paragraph(s).
            for translation in translations_to_add:
                self._append_translation(text_frame, translation)
            ok_count += 1
            logger.debug(f"Added translation to PowerPoint frame: {original_text[:30]}...")
        # Save the result.
        prs.save(str(output_path))
        logger.info(f"PowerPoint translation completed: {ok_count} insertions, {skip_count} skips")
        logger.info(f"Generated translated PowerPoint file: {output_path}")
        return str(output_path)
    except Exception as e:
        logger.error(f"Failed to generate translated PPTX file: {str(e)}")
        raise FileProcessingError(f"PPTX 翻譯檔生成失敗: {str(e)}")
def insert_pptx_translations(self, translation_map: Dict[Tuple[str, str], str],
                             target_languages: List[str], output_path: str) -> Tuple[int, int]:
    """Insert translations into a copy of the presentation (single-language
    mode: translated text only).

    Args:
        translation_map: maps (language, source_text) to translated text.
        target_languages: only the first language is used in this mode.
        output_path: where the translated .pptx copy is written.

    Returns:
        (ok_count, skip_count) insertion statistics.

    Raises:
        FileProcessingError: if the presentation cannot be processed.
    """
    try:
        import pptx
        from shutil import copyfile
        # Work on a copy so the source file stays untouched.
        copyfile(str(self.file_path), output_path)
        prs = pptx.Presentation(output_path)
        ok_count = skip_count = 0
        # Single-language mode: only the first target language is applied.
        lang = target_languages[0]
        for slide_idx, slide in enumerate(prs.slides, 1):
            for shape_idx, shape in enumerate(slide.shapes, 1):
                # Mirror the extraction logic: every container type is probed
                # independently (a shape may match several branches).
                if getattr(shape, "has_text_frame", False):
                    text = self._extract_text_from_frame(shape.text_frame)
                    if text.strip():
                        ok, skip = self._insert_single_language_translation(
                            shape.text_frame, text, translation_map, lang
                        )
                        ok_count += ok
                        skip_count += skip
                # Tables.
                if getattr(shape, "has_table", False):
                    table_ok, table_skip = self._insert_table_translations(
                        shape.table, translation_map, lang
                    )
                    ok_count += table_ok
                    skip_count += table_skip
                # Charts.
                if getattr(shape, "has_chart", False):
                    chart_ok, chart_skip = self._insert_chart_translations(
                        shape.chart, translation_map, lang
                    )
                    ok_count += chart_ok
                    skip_count += chart_skip
                # Grouped shapes (supports deep nesting).
                if hasattr(shape, 'shapes'):
                    group_ok, group_skip = self._insert_group_translations(
                        shape.shapes, translation_map, lang, slide_idx, shape_idx
                    )
                    ok_count += group_ok
                    skip_count += group_skip
                # Plain shape text.
                if hasattr(shape, 'text') and shape.text.strip():
                    source_text = shape.text
                    if (lang, source_text) in translation_map:
                        shape.text = translation_map[(lang, source_text)]
                        ok_count += 1
                        # BUGFIX: log the captured source text. Previously this
                        # logged shape.text AFTER the assignment, so the message
                        # showed the translation instead of the source text.
                        logger.debug(f"Inserted basic shape translation on slide {slide_idx}: {source_text[:30]}...")
                    else:
                        skip_count += 1
        # Save the translated copy.
        prs.save(output_path)
        logger.info(f"Saved PowerPoint file with {ok_count} translations, {skip_count} skips")
        return ok_count, skip_count
    except Exception as e:
        logger.error(f"Failed to insert PowerPoint translations: {str(e)}")
        raise FileProcessingError(f"PowerPoint 翻譯插入失敗: {str(e)}")
def insert_pptx_combined_translations(self, translation_map: Dict[Tuple[str, str], str],
                                      target_languages: List[str], output_path: str) -> Tuple[int, int]:
    """Insert translations into a PowerPoint file - combined mode (original + all translations).

    Copies the source file to ``output_path`` and appends every target
    language's translation after the original text in each text frame, table,
    chart, group and plain shape. Unlike the single-language inserter, the
    shape-kind branches here are exclusive (``elif``), so each shape is handled
    by at most one branch.

    Args:
        translation_map: Mapping of (target_language, source_text) -> translated text.
        target_languages: All target language codes to combine.
        output_path: Path of the translated copy to write.

    Returns:
        Tuple of (successful insertions, skipped items).

    Raises:
        FileProcessingError: If loading, translating or saving the file fails.
    """
    try:
        import pptx
        from shutil import copyfile
        # Work on a copy so the original upload stays untouched.
        copyfile(str(self.file_path), output_path)
        # Load the copied PowerPoint file.
        prs = pptx.Presentation(output_path)
        ok_count = skip_count = 0
        for slide in prs.slides:
            for shape in slide.shapes:
                # Text frames
                if getattr(shape, "has_text_frame", False):
                    text = self._extract_text_from_frame(shape.text_frame)
                    if text.strip():
                        ok, skip = self._insert_combined_language_translation(
                            shape.text_frame, text, translation_map, target_languages
                        )
                        ok_count += ok
                        skip_count += skip
                # Tables
                elif getattr(shape, "has_table", False):
                    table_ok, table_skip = self._insert_combined_table_translations(
                        shape.table, translation_map, target_languages
                    )
                    ok_count += table_ok
                    skip_count += table_skip
                # Charts
                elif getattr(shape, "has_chart", False):
                    chart_ok, chart_skip = self._insert_combined_chart_translations(
                        shape.chart, translation_map, target_languages
                    )
                    ok_count += chart_ok
                    skip_count += chart_skip
                # Grouped shapes
                elif hasattr(shape, 'shapes'):
                    group_ok, group_skip = self._insert_combined_group_translations(
                        shape.shapes, translation_map, target_languages
                    )
                    ok_count += group_ok
                    skip_count += group_skip
                # Plain shape text
                elif hasattr(shape, 'text') and shape.text.strip():
                    # Collect the translation (or a missing-marker) for every language.
                    translations = []
                    for lang in target_languages:
                        if (lang, shape.text) in translation_map:
                            translations.append(translation_map[(lang, shape.text)])
                        else:
                            translations.append(f"【翻譯缺失|{lang}")
                    # NOTE(review): `translations` is non-empty whenever
                    # target_languages is, even if every entry is a
                    # missing-marker, so the else branch below looks
                    # unreachable - confirm this is intended.
                    if translations:
                        # Combine the original text with all translations.
                        combined_text = shape.text + '\n' + '\n'.join(translations)
                        shape.text = combined_text
                        ok_count += 1
                    else:
                        skip_count += 1
        # Persist the modified presentation.
        prs.save(output_path)
        logger.info(f"Saved combined PowerPoint file with {ok_count} translations, {skip_count} skips")
        return ok_count, skip_count
    except Exception as e:
        logger.error(f"Failed to insert combined PowerPoint translations: {str(e)}")
        raise FileProcessingError(f"PowerPoint 組合翻譯插入失敗: {str(e)}")
def _insert_single_language_translation(self, text_frame, original_text: str,
                                        translation_map: Dict[Tuple[str, str], str],
                                        target_language: str) -> Tuple[int, int]:
    """Replace a text frame's content with its single-language translation.

    Args:
        text_frame: python-pptx text frame to rewrite.
        original_text: Source text extracted from the frame.
        translation_map: Mapping of (target_language, source_text) -> translated text.
        target_language: Language code to insert.

    Returns:
        (1, 0) when the translation was inserted; (0, 1) when skipped
        (no translation available, or it is already present in the frame).
    """
    key = (target_language, original_text)
    if key not in translation_map:
        return 0, 1
    translated_text = translation_map[key]

    # Avoid double-insertion when the frame already carries this translation.
    if self._check_existing_translations(text_frame, [translated_text]):
        return 0, 1

    # Clear the frame and keep only the translated text.
    text_frame.clear()
    para = text_frame.add_paragraph()
    para.text = translated_text

    # Mark inserted translations: italic, 12pt (sizing is best-effort only).
    for run in para.runs:
        run.font.italic = True
        try:
            from pptx.util import Pt as PPTPt
            run.font.size = PPTPt(12)
        except Exception:  # BUG FIX: narrowed from a bare except
            pass
    return 1, 0
def _insert_combined_language_translation(self, text_frame, original_text: str,
                                          translation_map: Dict[Tuple[str, str], str],
                                          target_languages: List[str]) -> Tuple[int, int]:
    """Append every target language's translation after the original text.

    Returns:
        (1, 0) when translations were appended; (0, 1) when skipped (no real
        translation exists for any language, or they are already present).
    """
    # One entry per language: the translation, or a missing-marker placeholder.
    translations = [
        translation_map.get((lang, original_text), f"【翻譯缺失|{lang}")
        for lang in target_languages
    ]
    # Require at least one non-empty, non-placeholder translation.
    if not any(t for t in translations if not t.startswith("【翻譯缺失")):
        return 0, 1
    # Skip frames that already contain the original plus these translations.
    if self._check_existing_translations(text_frame, [original_text] + translations):
        return 0, 1
    for translation in translations:
        self._append_translation(text_frame, translation)
    return 1, 0
def _insert_table_translations(self, table, translation_map: Dict[Tuple[str, str], str],
                               target_language: str) -> Tuple[int, int]:
    """Insert single-language translations into a PPT table, cell by cell.

    Cells whose stripped text has an entry in ``translation_map`` are replaced
    by the translated text (italic, 10pt best-effort); others are counted as
    skipped. Empty cells are ignored entirely.

    Returns:
        (translated cell count, skipped cell count).
    """
    # BUG FIX: import hoisted out of the per-run loop; failure tolerated below.
    try:
        from pptx.util import Pt as PPTPt
    except Exception:
        PPTPt = None

    ok_count = skip_count = 0
    for row in table.rows:
        for cell in row.cells:
            cell_text = cell.text_frame.text.strip()
            if not cell_text:
                continue
            key = (target_language, cell_text)
            if key in translation_map:
                # Replace the cell content with the translated text.
                cell.text_frame.clear()
                para = cell.text_frame.add_paragraph()
                para.text = translation_map[key]
                # Mark inserted translations: italic, 10pt best-effort.
                for run in para.runs:
                    run.font.italic = True
                    if PPTPt is not None:
                        try:
                            run.font.size = PPTPt(10)
                        except Exception:  # narrowed from a bare except
                            pass
                ok_count += 1
            else:
                skip_count += 1
    return ok_count, skip_count
def _insert_combined_table_translations(self, table, translation_map: Dict[Tuple[str, str], str],
                                        target_languages: List[str]) -> Tuple[int, int]:
    """Insert combined (original + all languages) translations into a PPT table.

    Each non-empty cell is rewritten as the original text followed by one line
    per target language; missing translations get a 【翻譯缺失|...】 marker.

    Returns:
        (rewritten cell count, skipped cell count).
    """
    # BUG FIX: import hoisted out of the per-run loop; sizing is best-effort.
    try:
        from pptx.util import Pt as PPTPt
    except Exception:
        PPTPt = None

    ok_count = skip_count = 0
    for row in table.rows:
        for cell in row.cells:
            cell_text = cell.text_frame.text.strip()
            if not cell_text:
                continue
            # Collect the translation (or missing-marker) for every language.
            translations = []
            for lang in target_languages:
                if (lang, cell_text) in translation_map:
                    translations.append(translation_map[(lang, cell_text)])
                else:
                    translations.append(f"【翻譯缺失|{lang}")
            # NOTE(review): non-empty whenever target_languages is non-empty,
            # even when every entry is a missing-marker - confirm intended.
            if translations:
                # Combine the original text with all translations.
                combined_text = cell_text + '\n' + '\n'.join(translations)
                cell.text_frame.clear()
                para = cell.text_frame.add_paragraph()
                para.text = combined_text
                # 9pt best-effort sizing for combined cells.
                for run in para.runs:
                    if PPTPt is not None:
                        try:
                            run.font.size = PPTPt(9)
                        except Exception:  # narrowed from a bare except
                            pass
                ok_count += 1
            else:
                skip_count += 1
    return ok_count, skip_count
def _insert_chart_translations(self, chart, translation_map: Dict[Tuple[str, str], str],
                               target_language: str) -> Tuple[int, int]:
    """Translate chart text - limited support (python-pptx exposes little more
    than the chart title).

    Returns:
        (successful insertion count, skipped count).
    """
    ok_count = skip_count = 0
    try:
        has_title = hasattr(chart, 'chart_title') and chart.chart_title.has_text_frame
        if has_title:
            title_text = chart.chart_title.text_frame.text.strip()
            key = (target_language, title_text)
            if title_text and key in translation_map:
                translated_title = translation_map[key]
                chart.chart_title.text_frame.text = translated_title
                ok_count += 1
                logger.debug(f"Translated chart title: {title_text[:30]} -> {translated_title[:30]}")
            else:
                skip_count += 1
        # python-pptx support for axis labels etc. is very limited.
        logger.info(f"Chart translation: {ok_count} successful, {skip_count} skipped (limited support)")
    except Exception as e:
        logger.error(f"Failed to insert chart translations: {str(e)}")
        skip_count += 1
    return ok_count, skip_count
def _insert_group_translations(self, shapes, translation_map: Dict[Tuple[str, str], str],
                               target_language: str, slide_idx: int = 0, shape_idx: int = 0, depth: int = 0) -> Tuple[int, int]:
    """Insert translations into grouped shapes, recursing into nested groups.

    Mirrors the extraction logic: nested groups, text frames, tables, charts
    and plain shape text are all processed "in parallel" (non-exclusive
    branches); ``shape_processed`` only gates the plain-text fallback (5).

    Args:
        shapes: Shape collection of the group.
        translation_map: Mapping of (target_language, source_text) -> translated text.
        target_language: Language code to insert.
        slide_idx: Slide number, for logging only.
        shape_idx: Shape path for logging; nested calls pass a dotted string
            (e.g. "3.1") despite the ``int`` annotation.
        depth: Current recursion depth, capped to avoid runaway recursion.

    Returns:
        (insertion count, skip count), accumulated across all nesting levels.
    """
    ok_count = skip_count = 0
    max_depth = 10  # guard against pathological / cyclic nesting
    if depth > max_depth:
        logger.warning(f"Group nesting depth exceeded {max_depth} on slide {slide_idx}, skipping deeper levels")
        return ok_count, skip_count
    try:
        for sub_shape_idx, sub_shape in enumerate(shapes):
            shape_processed = False
            # 1. Nested groups first (recursive).
            if hasattr(sub_shape, 'shapes') and hasattr(sub_shape, 'shape_type'):
                try:
                    nested_ok, nested_skip = self._insert_group_translations(
                        sub_shape.shapes, translation_map, target_language,
                        slide_idx, f"{shape_idx}.{sub_shape_idx}", depth + 1
                    )
                    ok_count += nested_ok
                    skip_count += nested_skip
                    if nested_ok > 0:
                        shape_processed = True
                        logger.debug(f"Inserted {nested_ok} nested group translations at depth {depth + 1}")
                except Exception as e:
                    logger.debug(f"Failed to process nested group at depth {depth + 1}: {str(e)}")
            # 2. Text frames inside the group (parallel).
            if getattr(sub_shape, "has_text_frame", False):
                text = self._extract_text_from_frame(sub_shape.text_frame)
                if text.strip():
                    if (target_language, text) in translation_map:
                        translated_text = translation_map[(target_language, text)]
                        # Safer replacement: clear, then re-add one paragraph.
                        try:
                            sub_shape.text_frame.clear()
                            para = sub_shape.text_frame.add_paragraph()
                            para.text = translated_text
                            ok_count += 1
                            shape_processed = True
                            logger.debug(f"Inserted group text frame translation: {text[:30]}... -> {translated_text[:30]}...")
                        except Exception as e:
                            logger.warning(f"Failed to replace text frame content: {str(e)}")
                            skip_count += 1
                    else:
                        skip_count += 1
            # 3. Tables inside the group (parallel).
            if getattr(sub_shape, "has_table", False):
                table_ok, table_skip = self._insert_table_translations(
                    sub_shape.table, translation_map, target_language
                )
                ok_count += table_ok
                skip_count += table_skip
                if table_ok > 0:
                    shape_processed = True
            # 4. Charts inside the group (parallel).
            if getattr(sub_shape, "has_chart", False):
                chart_ok, chart_skip = self._insert_chart_translations(
                    sub_shape.chart, translation_map, target_language
                )
                ok_count += chart_ok
                skip_count += chart_skip
                if chart_ok > 0:
                    shape_processed = True
            # 5. Plain shape text as a fallback (only if nothing else matched).
            if not shape_processed and hasattr(sub_shape, 'text') and sub_shape.text.strip():
                original_text = sub_shape.text
                if (target_language, original_text) in translation_map:
                    sub_shape.text = translation_map[(target_language, original_text)]
                    ok_count += 1
                    # BUG FIX: log the source text; the old code read
                    # sub_shape.text after it had been overwritten.
                    logger.debug(f"Inserted basic group shape translation: {original_text[:30]}...")
                    shape_processed = True
                else:
                    skip_count += 1
        logger.debug(f"Group translation at depth {depth}: {ok_count} successful, {skip_count} skipped")
    except Exception as e:
        logger.error(f"Failed to insert group translations at depth {depth}: {str(e)}")
    return ok_count, skip_count
def _insert_combined_chart_translations(self, chart, translation_map: Dict[Tuple[str, str], str],
                                        target_languages: List[str]) -> Tuple[int, int]:
    """Insert combined (original + all languages) chart text - limited support.

    Only the chart title is reachable through python-pptx; everything else is
    counted as skipped or left untouched.

    Returns:
        (successful insertion count, skipped count).
    """
    ok_count = skip_count = 0
    try:
        if hasattr(chart, 'chart_title') and chart.chart_title.has_text_frame:
            title_frame = chart.chart_title.text_frame
            title_text = title_frame.text.strip()
            if not title_text:
                skip_count += 1
            else:
                # One entry per language: translation or missing-marker.
                translations = [
                    translation_map.get((lang, title_text), f"【翻譯缺失|{lang}")
                    for lang in target_languages
                ]
                usable = [t for t in translations if t and not t.startswith("【翻譯缺失")]
                if usable:
                    # Original title followed by every translation.
                    title_frame.text = '\n'.join([title_text] + translations)
                    ok_count += 1
                else:
                    skip_count += 1
        # python-pptx support for axis labels etc. is very limited.
        logger.info(f"Combined chart translation: {ok_count} successful, {skip_count} skipped (limited support)")
    except Exception as e:
        logger.error(f"Failed to insert combined chart translations: {str(e)}")
        skip_count += 1
    return ok_count, skip_count
def _insert_combined_group_translations(self, shapes, translation_map: Dict[Tuple[str, str], str],
                                        target_languages: List[str]) -> Tuple[int, int]:
    """Insert combined (original + all languages) translations into a group.

    Branches are exclusive (``elif``), unlike the single-language group
    inserter; nested groups and charts inside groups are NOT handled here.

    Returns:
        (insertion count, skip count).
    """
    ok_count = skip_count = 0
    try:
        for sub_shape in shapes:
            # Text frames inside the group
            if getattr(sub_shape, "has_text_frame", False):
                text = self._extract_text_from_frame(sub_shape.text_frame)
                if text.strip():
                    # Collect the translation (or missing-marker) per language.
                    translations = []
                    for lang in target_languages:
                        if (lang, text) in translation_map:
                            translations.append(translation_map[(lang, text)])
                        else:
                            translations.append(f"【翻譯缺失|{lang}")
                    if any(trans for trans in translations if not trans.startswith("【翻譯缺失")):
                        # Append every translation after the original text.
                        for translation in translations:
                            self._append_translation(sub_shape.text_frame, translation)
                        ok_count += 1
                    else:
                        skip_count += 1
                else:
                    skip_count += 1
            # Tables inside the group
            elif getattr(sub_shape, "has_table", False):
                table_ok, table_skip = self._insert_combined_table_translations(
                    sub_shape.table, translation_map, target_languages
                )
                ok_count += table_ok
                skip_count += table_skip
            # Plain shape text inside the group
            elif hasattr(sub_shape, 'text') and sub_shape.text.strip():
                # Collect the translation (or missing-marker) per language.
                translations = []
                for lang in target_languages:
                    if (lang, sub_shape.text) in translation_map:
                        translations.append(translation_map[(lang, sub_shape.text)])
                    else:
                        translations.append(f"【翻譯缺失|{lang}")
                # NOTE(review): non-empty whenever target_languages is, even if
                # every entry is a missing-marker - the else branch below looks
                # unreachable, and this differs from the text-frame branch
                # above, which requires at least one real translation. Confirm.
                if translations:
                    # Combine the original text with all translations.
                    combined_text = sub_shape.text + '\n' + '\n'.join(translations)
                    sub_shape.text = combined_text
                    ok_count += 1
                else:
                    skip_count += 1
    except Exception as e:
        logger.error(f"Failed to insert combined group translations: {str(e)}")
    return ok_count, skip_count
class TranslationService:
"""翻譯服務"""
def __init__(self):
    """Initialize the Dify client, document processor and parser registry."""
    self.dify_client = DifyClient()
    self.document_processor = DocumentProcessor()
    # Registry mapping file extensions to document-parser classes.
    # NOTE: DocParser/PptxParser/ExcelParser/PdfParser are defined elsewhere
    # in this module.
    self.parsers = {
        '.docx': DocxParser,
        '.doc': DocParser,  # converted to DOCX first
        '.pptx': PptxParser,  # PowerPoint presentation support
        '.xlsx': ExcelParser,
        '.xls': ExcelParser,  # the Excel parser handles XLS conversion itself
        '.pdf': PdfParser,
        # other formats can be added later
    }
def get_document_parser(self, file_path: str) -> DocumentParser:
    """Instantiate the parser registered for *file_path*'s extension.

    Raises:
        FileProcessingError: When the extension has no registered parser.
    """
    suffix = Path(file_path).suffix.lower()
    parser_cls = self.parsers.get(suffix)
    if parser_cls is None:
        raise FileProcessingError(f"不支援的檔案格式: {suffix}")
    return parser_cls(file_path)
def split_text_into_sentences(self, text: str, language: str = 'auto') -> List[str]:
    """Split *text* into sentences via the enhanced DocumentProcessor logic."""
    processor = self.document_processor
    return processor.split_text_into_sentences(text, language)
def translate_excel_cell(self, text: str, source_language: str,
                         target_language: str, user_id: int = None,
                         job_id: int = None, conversation_id: str = None) -> Dict[str, Any]:
    """Translate one Excel cell as a single unit (no sentence splitting).

    Args:
        text: Full cell content.
        source_language: Source language code.
        target_language: Target language code.
        user_id: Optional user id forwarded to the Dify client.
        job_id: Optional job id forwarded to the Dify client.
        conversation_id: Optional Dify conversation id for context continuity.

    Returns:
        Dict with ``translated_text`` and ``conversation_id``. On failure the
        translated text carries a 【翻譯失敗|...】 marker instead of raising.
    """
    if not text or not text.strip():
        return {"translated_text": "", "conversation_id": conversation_id}

    # Cache lookup keyed on the whole cell content.
    cached_translation = TranslationCache.get_translation(text, source_language, target_language)
    if cached_translation:
        logger.debug(f"Excel cell cache hit: {text[:30]}...")
        return {"translated_text": cached_translation, "conversation_id": conversation_id}

    # Translate the whole cell content in one call - no slicing.
    try:
        result = self.dify_client.translate_text(
            text=text,
            source_language=source_language,
            target_language=target_language,
            user_id=user_id,
            job_id=job_id,
            conversation_id=conversation_id  # keep conversation context
        )
        translated_text = result['translated_text']
        # Cache the whole-cell translation for reuse.
        TranslationCache.save_translation(
            text, source_language, target_language, translated_text
        )
        return result  # full result, including conversation_id
    except Exception as e:
        logger.error(f"Failed to translate Excel cell: {text[:30]}... Error: {str(e)}")
        # BUG FIX: the failure path previously returned a bare string,
        # violating the documented dict contract; callers that index
        # ["translated_text"] now get the failure marker consistently.
        return {
            "translated_text": f"【翻譯失敗|{target_language}{text}",
            "conversation_id": conversation_id,
        }
def translate_word_table_cell(self, text: str, source_language: str,
                              target_language: str, user_id: int = None,
                              job_id: int = None) -> str:
    """Translate a Word table cell as one unit (no paragraph splitting).

    Returns:
        The translated string; an empty string for empty input; or a
        【翻譯失敗|...】 marker string when translation fails.
    """
    if not text or not text.strip():
        return ""

    # Whole-cell cache lookup.
    cached = TranslationCache.get_translation(text, source_language, target_language)
    if cached:
        logger.debug(f"Word table cell cache hit: {text[:30]}...")
        return cached

    # Translate the entire cell content in a single call.
    try:
        result = self.dify_client.translate_text(
            text=text,
            source_language=source_language,
            target_language=target_language,
            user_id=user_id,
            job_id=job_id
        )
        translated = result['translated_text']
        # Cache the whole-cell translation for reuse.
        TranslationCache.save_translation(text, source_language, target_language, translated)
    except Exception as e:
        logger.error(f"Failed to translate Word table cell: {text[:30]}... Error: {str(e)}")
        return f"【翻譯失敗|{target_language}{text}"
    return translated
def translate_segment_with_sentences(self, text: str, source_language: str,
                                     target_language: str, user_id: int = None,
                                     job_id: int = None, conversation_id: str = None) -> Dict[str, Any]:
    """Translate a multi-line segment line-by-line and sentence-by-sentence.

    Mirrors the proven ``translate_block_sentencewise`` logic: each line is
    split into sentences, each sentence is translated (with sentence-level
    caching), and the results are reassembled into the full paragraph.
    Word documents only; Excel should use :meth:`translate_excel_cell`.

    Args:
        text: Source paragraph (may contain newlines).
        source_language: Source language code.
        target_language: Target language code.
        user_id: Optional user id forwarded to the Dify client.
        job_id: Optional job id forwarded to the Dify client.
        conversation_id: Optional Dify conversation id for context continuity.

    Returns:
        Dict with ``translated_text`` and ``conversation_id``.
    """
    # BUG FIX: the early-exit paths previously returned bare strings, which
    # crashed callers that index the documented dict result
    # (e.g. translation_result['translated_text'] on a whole-paragraph hit).
    if not text or not text.strip():
        return {'translated_text': '', 'conversation_id': conversation_id}

    # Whole-paragraph cache hit short-circuits all per-sentence work.
    cached_whole = TranslationCache.get_translation(text, source_language, target_language)
    if cached_whole:
        logger.debug(f"Whole paragraph cache hit: {text[:30]}...")
        return {'translated_text': cached_whole, 'conversation_id': conversation_id}

    out_lines = []
    all_successful = True
    current_conversation_id = conversation_id

    # Process line by line to preserve the paragraph's internal structure.
    for raw_line in text.split('\n'):
        if not raw_line.strip():
            out_lines.append("")
            continue

        sentences = self.document_processor.split_text_into_sentences(raw_line, source_language)
        if not sentences:
            sentences = [raw_line]

        translated_parts = []
        for sentence in sentences:
            sentence = sentence.strip()
            if not sentence:
                continue

            # Sentence-level cache lookup.
            cached_sentence = TranslationCache.get_translation(sentence, source_language, target_language)
            if cached_sentence:
                translated_parts.append(cached_sentence)
                continue

            # Call the Dify API for this sentence.
            try:
                result = self.dify_client.translate_text(
                    text=sentence,
                    source_language=source_language,
                    target_language=target_language,
                    user_id=user_id,
                    job_id=job_id,
                    conversation_id=current_conversation_id
                )
                translated_sentence = result['translated_text']
                # Carry the conversation id forward for context continuity.
                if result.get('conversation_id'):
                    current_conversation_id = result['conversation_id']
                # Save the sentence-level cache entry.
                TranslationCache.save_translation(
                    sentence, source_language, target_language, translated_sentence
                )
                translated_parts.append(translated_sentence)
            except Exception as e:
                logger.error(f"Failed to translate sentence: {sentence[:30]}... Error: {str(e)}")
                translated_parts.append(f"【翻譯失敗|{target_language}{sentence}")
                all_successful = False

        # Re-join the sentences into one line.
        out_lines.append(" ".join(translated_parts))

    final_result = "\n".join(out_lines)

    # Only cache the whole paragraph when every sentence succeeded.
    if all_successful:
        TranslationCache.save_translation(text, source_language, target_language, final_result)

    return {
        'translated_text': final_result,
        'conversation_id': current_conversation_id
    }
def translate_text_with_cache(self, text: str, source_language: str,
                              target_language: str, user_id: int = None,
                              job_id: int = None, conversation_id: str = None) -> Dict[str, Any]:
    """Translate *text*, serving the result from the translation cache when possible.

    Returns:
        Dict with ``translated_text``, ``conversation_id`` and ``from_cache``.

    Raises:
        TranslationError: If the Dify API call (or cache save) fails.
    """
    # Serve straight from the cache when we already translated this text.
    cached = TranslationCache.get_translation(text, source_language, target_language)
    if cached:
        logger.debug(f"Cache hit for translation: {text[:50]}...")
        return {
            'translated_text': cached,
            'conversation_id': conversation_id,  # keep the caller's conversation id
            'from_cache': True
        }

    # Cache miss: call the Dify API and persist the result.
    try:
        result = self.dify_client.translate_text(
            text=text,
            source_language=source_language,
            target_language=target_language,
            user_id=user_id,
            job_id=job_id,
            conversation_id=conversation_id
        )
        translated = result['translated_text']
        new_conversation_id = result.get('conversation_id')
        TranslationCache.save_translation(
            text, source_language, target_language, translated
        )
    except Exception as e:
        logger.error(f"Translation failed for text: {text[:50]}... Error: {str(e)}")
        raise TranslationError(f"翻譯失敗: {str(e)}")

    return {
        'translated_text': translated,
        'conversation_id': new_conversation_id,
        'from_cache': False
    }
def translate_document(self, job_uuid: str) -> Dict[str, Any]:
"""翻譯文件(主要入口點)- 使用增強的文檔處理邏輯"""
try:
# 取得任務資訊
job = TranslationJob.query.filter_by(job_uuid=job_uuid).first()
if not job:
raise TranslationError(f"找不到任務: {job_uuid}")
logger.info(f"Starting enhanced document translation: {job_uuid}")
# 更新任務狀態
job.update_status('PROCESSING', progress=0)
# 使用增強的文檔處理器直接提取段落
file_ext = Path(job.file_path).suffix.lower()
if file_ext in ['.docx', '.doc']:
# 使用增強的 DOCX 處理邏輯
segments = self.document_processor.extract_docx_segments(job.file_path)
logger.info(f"Enhanced extraction: Found {len(segments)} segments to translate")
if not segments:
raise TranslationError("文件中未找到可翻譯的文字段落")
# 使用成功版本的翻譯邏輯 - 直接按段落翻譯,不做複雜分割
translatable_segments = []
for seg in segments:
if self.document_processor.should_translate_text(seg.text, job.source_language):
translatable_segments.append(seg)
logger.info(f"Found {len(translatable_segments)} segments to translate")
# 批次翻譯 - 直接按原始段落翻譯
translation_map = {} # 格式: (target_language, source_text) -> translated_text
total_segments = len(translatable_segments)
for target_language in job.target_languages:
logger.info(f"Translating to {target_language}")
# 每個目標語言使用獨立的對話ID以保持該語言的翻譯一致性
current_conversation_id = None
for i, seg in enumerate(translatable_segments):
try:
# 根據段落類型選擇適當的翻譯方法
if seg.kind == "table_cell":
# 表格儲存格使用整個儲存格為單位的翻譯方法
translated = self.translate_word_table_cell(
text=seg.text,
source_language=job.source_language,
target_language=target_language,
user_id=job.user_id,
job_id=job.id
)
else:
# 一般段落使用原有的句子切片方法
translation_result = self.translate_segment_with_sentences(
text=seg.text,
source_language=job.source_language,
target_language=target_language,
user_id=job.user_id,
job_id=job.id,
conversation_id=current_conversation_id
)
translated = translation_result['translated_text']
# 更新當前對話ID以保持上下文連續性
if translation_result.get('conversation_id'):
current_conversation_id = translation_result['conversation_id']
# 直接以原始段落文字為鍵儲存翻譯結果
translation_map[(target_language, seg.text)] = translated
# 更新進度
progress = (i + 1) / total_segments * 100 / len(job.target_languages)
current_lang_index = job.target_languages.index(target_language)
total_progress = (current_lang_index * 100 + progress) / len(job.target_languages)
job.update_status('PROCESSING', progress=total_progress)
# 短暫延遲避免過快請求
time.sleep(0.1)
except Exception as e:
logger.error(f"Failed to translate segment: {seg.text[:50]}... Error: {str(e)}")
# 翻譯失敗時保留原文
translation_map[(target_language, seg.text)] = f"[翻譯失敗] {seg.text}"
# 保存該語言的對話ID到任務記錄中用於後續重試等場景
if current_conversation_id and not job.conversation_id:
job.conversation_id = current_conversation_id
db.session.commit()
logger.info(f"Saved conversation_id {current_conversation_id} for job {job.job_uuid}")
# 生成翻譯文件
logger.info("Generating translated documents with enhanced insertion")
output_dir = Path(job.file_path).parent
output_files = {}
for target_language in job.target_languages:
try:
# 生成輸出檔名
output_filename = generate_filename(
Path(job.file_path).name,
'translated',
'translated',
target_language
)
output_path = output_dir / output_filename
# 使用增強的翻譯插入邏輯
ok_count, skip_count = self.document_processor.insert_docx_translations(
job.file_path,
segments,
translation_map,
[target_language],
str(output_path)
)
output_files[target_language] = str(output_path)
# 記錄翻譯檔案到資料庫
file_size = Path(output_path).stat().st_size
job.add_translated_file(
language_code=target_language,
filename=Path(output_path).name,
file_path=str(output_path),
file_size=file_size
)
logger.info(f"Generated {target_language}: {ok_count} insertions, {skip_count} skips")
except Exception as e:
logger.error(f"Failed to generate translated document for {target_language}: {str(e)}")
raise TranslationError(f"生成 {target_language} 翻譯文件失敗: {str(e)}")
# 生成組合多語言檔案 - 包含所有翻譯在一個文件中
if len(job.target_languages) > 1:
try:
# 生成組合檔案的檔名
combined_filename = generate_filename(
Path(job.file_path).name,
'translated',
'combined',
'multilang'
)
combined_output_path = output_dir / combined_filename
# 使用新的組合翻譯插入方法
combined_ok_count, combined_skip_count = self.document_processor.insert_docx_combined_translations(
job.file_path,
segments,
translation_map,
job.target_languages,
str(combined_output_path)
)
output_files['combined'] = str(combined_output_path)
# 記錄組合翻譯檔案到資料庫
file_size = Path(combined_output_path).stat().st_size
job.add_translated_file(
language_code='combined',
filename=Path(combined_output_path).name,
file_path=str(combined_output_path),
file_size=file_size
)
logger.info(f"Generated combined multi-language file: {combined_ok_count} insertions, {combined_skip_count} skips")
except Exception as e:
logger.error(f"Failed to generate combined multi-language document: {str(e)}")
# 不要因為組合檔案失敗而讓整個任務失敗,只記錄警告
logger.warning("Combined multi-language file generation failed, but individual files were successful")
elif file_ext in ['.xlsx', '.xls']:
# Excel 文件使用儲存格為單位的翻譯邏輯
logger.info(f"Using cell-based processing for Excel files")
parser = self.get_document_parser(job.file_path)
# 提取儲存格文字內容(不進行句子切片)
cell_segments = parser.extract_text_segments()
if not cell_segments:
raise TranslationError("Excel 文件中未找到可翻譯的文字")
logger.info(f"Found {len(cell_segments)} cell segments to translate")
# 批次翻譯 - 使用儲存格為單位的翻譯方法
translation_results = {}
total_segments = len(cell_segments)
for target_language in job.target_languages:
logger.info(f"Translating Excel cells to {target_language}")
translated_cells = []
current_conversation_id = job.conversation_id # 維持上下文連貫性
for i, cell_text in enumerate(cell_segments):
try:
# 使用新的儲存格翻譯方法(整個儲存格作為單位)
translated = self.translate_excel_cell(
text=cell_text,
source_language=job.source_language,
target_language=target_language,
user_id=job.user_id,
job_id=job.id,
conversation_id=current_conversation_id # 傳遞 conversation_id
)
# 提取翻譯文字translate_excel_cell 現在返回 dict
translated_text = translated["translated_text"] if isinstance(translated, dict) else translated
translated_cells.append(translated_text)
# 更新 conversation_id 以維持連續對話上下文
if isinstance(translated, dict) and translated.get("conversation_id"):
current_conversation_id = translated["conversation_id"]
# 更新進度
progress = (i + 1) / total_segments * 100 / len(job.target_languages)
current_lang_index = job.target_languages.index(target_language)
total_progress = (current_lang_index * 100 + progress) / len(job.target_languages)
job.update_status('PROCESSING', progress=total_progress)
time.sleep(0.1)
except Exception as e:
logger.error(f"Failed to translate Excel cell: {cell_text[:50]}... Error: {str(e)}")
translated_cells.append(f"[翻譯失敗] {cell_text}")
translation_results[target_language] = translated_cells
# 生成翻譯文件
output_dir = Path(job.file_path).parent
output_files = {}
for target_language, translations in translation_results.items():
translation_mapping = {target_language: translations}
output_file = parser.generate_translated_document(
translations=translation_mapping,
target_language=target_language,
output_dir=output_dir
)
output_files[target_language] = output_file
file_size = Path(output_file).stat().st_size
job.add_translated_file(
language_code=target_language,
filename=Path(output_file).name,
file_path=output_file,
file_size=file_size
)
# 生成組合多語言Excel檔案
if len(job.target_languages) > 1:
try:
# 生成組合檔案的檔名
combined_filename = generate_filename(
Path(job.file_path).name,
'translated',
'combined',
'multilang'
)
combined_output_path = output_dir / combined_filename
# 為Excel組合檔案建立翻譯映射
combined_translation_mapping = {}
for lang in job.target_languages:
combined_translation_mapping[lang] = translation_results[lang]
# 使用修改過的generate_combined_excel_document方法
combined_output_file = self._generate_combined_excel_document(
parser,
combined_translation_mapping,
job.target_languages,
combined_output_path
)
output_files['combined'] = combined_output_file
# 記錄組合翻譯檔案到資料庫
file_size = Path(combined_output_file).stat().st_size
job.add_translated_file(
language_code='combined',
filename=Path(combined_output_file).name,
file_path=combined_output_file,
file_size=file_size
)
logger.info(f"Generated combined multi-language Excel file")
except Exception as e:
logger.error(f"Failed to generate combined multi-language Excel document: {str(e)}")
logger.warning("Combined multi-language Excel file generation failed, but individual files were successful")
elif file_ext == '.pptx':
# PowerPoint 文件使用增強的處理邏輯,仿照 DOCX 處理方式
logger.info(f"Using enhanced PowerPoint processing for {job_uuid}")
parser = self.get_document_parser(job.file_path)
# 提取文字段落和表格內容
text_segments = parser.extract_text_segments()
if not text_segments:
raise TranslationError("PowerPoint 文件中未找到可翻譯的文字")
logger.info(f"Found {len(text_segments)} PowerPoint text segments to translate")
# 批次翻譯 - 建立翻譯映射
translation_map = {} # 格式: (target_language, source_text) -> translated_text
total_segments = len(text_segments)
for target_language in job.target_languages:
logger.info(f"Translating PowerPoint segments to {target_language}")
translated_segments = []
current_conversation_id = job.conversation_id # 維持上下文連貫性
for i, segment_text in enumerate(text_segments):
try:
# 對於 PowerPoint 文字框和表格,使用段落級別的翻譯
translated = self.translate_segment_with_sentences(
text=segment_text,
source_language=job.source_language,
target_language=target_language,
user_id=job.user_id,
job_id=job.id,
conversation_id=current_conversation_id # 傳遞 conversation_id
)
# 使用與 DOCX 相同的格式儲存翻譯結果
translation_map[(target_language, segment_text)] = translated
# 更新 conversation_id 以維持連續對話上下文
if isinstance(translated, dict) and translated.get("conversation_id"):
current_conversation_id = translated["conversation_id"]
# 更新進度
progress = (i + 1) / total_segments * 100 / len(job.target_languages)
current_lang_index = job.target_languages.index(target_language)
total_progress = (current_lang_index * 100 + progress) / len(job.target_languages)
job.update_status('PROCESSING', progress=total_progress)
time.sleep(0.1)
except Exception as e:
logger.error(f"Failed to translate PowerPoint segment: {segment_text[:50]}... Error: {str(e)}")
# 翻譯失敗時保留原文
translation_map[(target_language, segment_text)] = f"[翻譯失敗] {segment_text}"
# 生成翻譯文件 - 仿照 DOCX 的方式
logger.info("Generating translated PowerPoint documents with enhanced insertion")
output_dir = Path(job.file_path).parent
output_files = {}
# 生成單語言文件
for target_language in job.target_languages:
try:
# 生成輸出檔名
output_filename = generate_filename(
Path(job.file_path).name,
'translated',
'translated',
target_language
)
output_path = output_dir / output_filename
# 使用增強的翻譯插入邏輯
ok_count, skip_count = parser.insert_pptx_translations(
translation_map,
[target_language],
str(output_path)
)
output_files[target_language] = str(output_path)
# 記錄翻譯檔案到資料庫
file_size = Path(output_path).stat().st_size
job.add_translated_file(
language_code=target_language,
filename=Path(output_path).name,
file_path=str(output_path),
file_size=file_size
)
logger.info(f"Generated {target_language}: {ok_count} insertions, {skip_count} skips")
except Exception as e:
logger.error(f"Failed to generate translated PowerPoint document for {target_language}: {str(e)}")
raise TranslationError(f"生成 {target_language} PowerPoint 翻譯文件失敗: {str(e)}")
# 生成組合多語言檔案 - 包含所有翻譯在一個文件中
if len(job.target_languages) > 1:
try:
# 生成組合檔案的檔名
combined_filename = generate_filename(
Path(job.file_path).name,
'translated',
'combined',
'multilang'
)
combined_output_path = output_dir / combined_filename
# 使用組合翻譯插入方法
combined_ok_count, combined_skip_count = parser.insert_pptx_combined_translations(
translation_map,
job.target_languages,
str(combined_output_path)
)
output_files['combined'] = str(combined_output_path)
# 記錄組合翻譯檔案到資料庫
file_size = Path(combined_output_path).stat().st_size
job.add_translated_file(
language_code='combined',
filename=Path(combined_output_path).name,
file_path=str(combined_output_path),
file_size=file_size
)
logger.info(f"Generated combined multi-language PowerPoint file: {combined_ok_count} insertions, {combined_skip_count} skips")
except Exception as e:
logger.error(f"Failed to generate combined multi-language PowerPoint document: {str(e)}")
# 不要因為組合檔案失敗而讓整個任務失敗,只記錄警告
logger.warning("Combined multi-language PowerPoint file generation failed, but individual files were successful")
elif file_ext == '.pdf':
# PDF 文件使用增強的OCR處理邏輯避免重複OCR
logger.info(f"Using enhanced PDF processing for {job_uuid}")
from app.services.enhanced_pdf_parser import EnhancedPdfParser
enhanced_parser = EnhancedPdfParser(job.file_path)
# 提取文字片段會使用OCR快取避免重複處理
text_segments = enhanced_parser.extract_text_segments(user_id=job.user_id, job_id=job.id)
if not text_segments:
raise TranslationError("PDF文件中未找到可翻譯的文字")
logger.info(f"Found {len(text_segments)} PDF text segments to translate")
# 批次翻譯PDF文字段落
translation_results = {}
total_segments = len(text_segments)
for target_language in job.target_languages:
logger.info(f"Translating PDF segments to {target_language}")
translated_segments = []
current_conversation_id = job.conversation_id # 維持上下文連貫性
for i, segment_text in enumerate(text_segments):
try:
# 對於PDF段落使用段落級別的翻譯保留段落結構
translated = self.translate_segment_with_sentences(
text=segment_text,
source_language=job.source_language,
target_language=target_language,
user_id=job.user_id,
job_id=job.id,
conversation_id=current_conversation_id # 傳遞 conversation_id
)
# 提取翻譯文字translate_segment_with_sentences 返回 dict
translated_text = translated['translated_text'] if isinstance(translated, dict) else translated
translated_segments.append(translated_text)
# 更新 conversation_id 以維持連續對話上下文
if isinstance(translated, dict) and translated.get('conversation_id'):
current_conversation_id = translated['conversation_id']
# 更新進度
progress = (i + 1) / total_segments * 100 / len(job.target_languages)
current_lang_index = job.target_languages.index(target_language)
total_progress = (current_lang_index * 100 + progress) / len(job.target_languages)
job.update_status('PROCESSING', progress=total_progress)
time.sleep(0.1)
except Exception as e:
logger.error(f"Failed to translate PDF segment: {segment_text[:50]}... Error: {str(e)}")
translated_segments.append(f"[翻譯失敗] {segment_text}")
translation_results[target_language] = translated_segments
# 生成翻譯Word文件
logger.info("Generating translated Word documents from PDF")
output_dir = Path(job.file_path).parent
output_files = {}
for target_language, translations in translation_results.items():
try:
# 使用增強PDF解析器生成Word文檔
output_file = enhanced_parser.generate_translated_document(
translations={target_language: translations},
target_language=target_language,
output_dir=output_dir
)
output_files[target_language] = output_file
# 記錄翻譯檔案到資料庫
file_size = Path(output_file).stat().st_size
job.add_translated_file(
language_code=target_language,
filename=Path(output_file).name,
file_path=output_file,
file_size=file_size
)
logger.info(f"Generated PDF translation for {target_language}: {output_file}")
except Exception as e:
logger.error(f"Failed to generate PDF translated document for {target_language}: {str(e)}")
raise TranslationError(f"生成PDF {target_language} 翻譯文件失敗: {str(e)}")
# 生成組合多語言文檔 - 譯文1/譯文2格式當有多個目標語言時
if len(job.target_languages) > 1:
try:
logger.info("Generating combined multi-language PDF document")
combined_output_file = enhanced_parser.generate_combined_translated_document(
all_translations=translation_results,
target_languages=job.target_languages,
output_dir=output_dir
)
output_files['combined'] = combined_output_file
# 記錄組合翻譯檔案到資料庫
file_size = Path(combined_output_file).stat().st_size
job.add_translated_file(
language_code='combined',
filename=Path(combined_output_file).name,
file_path=combined_output_file,
file_size=file_size
)
logger.info(f"Generated combined multi-language PDF file: {combined_output_file}")
except Exception as e:
logger.error(f"Failed to generate combined multi-language PDF document: {str(e)}")
# 不要因為組合檔案失敗而讓整個任務失敗,只記錄警告
logger.warning("Combined multi-language PDF file generation failed, but individual files were successful")
else:
# 對於其他文件格式,使用原有邏輯
logger.info(f"Using legacy sentence-based processing for {file_ext} files")
parser = self.get_document_parser(job.file_path)
# 提取文字片段 - 对PDF传递user_id和job_id以支持OCR
if file_ext == '.pdf':
text_segments = parser.extract_text_segments(user_id=job.user_id, job_id=job.id)
else:
text_segments = parser.extract_text_segments()
if not text_segments:
raise TranslationError("文件中未找到可翻譯的文字")
# 分割成句子
all_sentences = []
for segment in text_segments:
sentences = self.split_text_into_sentences(segment, job.source_language)
all_sentences.extend(sentences)
# 去重複
unique_sentences = list(dict.fromkeys(all_sentences))
logger.info(f"Found {len(unique_sentences)} unique sentences to translate")
# 批次翻譯
translation_results = {}
total_sentences = len(unique_sentences)
for target_language in job.target_languages:
logger.info(f"Translating to {target_language}")
translated_sentences = []
current_conversation_id = job.conversation_id # 維持上下文連貫性
for i, sentence in enumerate(unique_sentences):
try:
translation_result = self.translate_text_with_cache(
text=sentence,
source_language=job.source_language,
target_language=target_language,
user_id=job.user_id,
job_id=job.id,
conversation_id=current_conversation_id # 傳遞 conversation_id
)
translated_sentences.append(translation_result['translated_text'])
# 更新 conversation_id 以維持連續對話上下文
if translation_result.get("conversation_id"):
current_conversation_id = translation_result["conversation_id"]
# 更新進度
progress = (i + 1) / total_sentences * 100 / len(job.target_languages)
current_lang_index = job.target_languages.index(target_language)
total_progress = (current_lang_index * 100 + progress) / len(job.target_languages)
job.update_status('PROCESSING', progress=total_progress)
time.sleep(0.1)
except Exception as e:
logger.error(f"Failed to translate sentence: {sentence[:50]}... Error: {str(e)}")
translated_sentences.append(f"[翻譯失敗] {sentence}")
translation_results[target_language] = translated_sentences
# 生成翻譯文件
output_dir = Path(job.file_path).parent
output_files = {}
for target_language, translations in translation_results.items():
translation_mapping = {target_language: translations}
output_file = parser.generate_translated_document(
translations=translation_mapping,
target_language=target_language,
output_dir=output_dir
)
output_files[target_language] = output_file
file_size = Path(output_file).stat().st_size
job.add_translated_file(
language_code=target_language,
filename=Path(output_file).name,
file_path=output_file,
file_size=file_size
)
# 計算總成本
total_cost = self._calculate_job_cost(job.id)
# 更新任務狀態為完成
job.update_status('COMPLETED', progress=100)
job.total_cost = total_cost
# 計算實際使用的 token 數(從 API 使用統計中獲取)
from sqlalchemy import func
from app.models.stats import APIUsageStats
from app import db
actual_tokens = db.session.query(
func.sum(APIUsageStats.total_tokens)
).filter_by(job_id=job.id).scalar()
job.total_tokens = int(actual_tokens) if actual_tokens else 0
db.session.commit()
logger.info(f"Enhanced document translation completed: {job_uuid}")
return {
'success': True,
'job_uuid': job_uuid,
'output_files': output_files,
'total_sentences': len(texts_to_translate) if 'texts_to_translate' in locals() else len(unique_sentences) if 'unique_sentences' in locals() else 0,
'total_cost': float(total_cost),
'target_languages': job.target_languages
}
except TranslationError:
raise
except Exception as e:
logger.error(f"Enhanced document translation failed: {job_uuid}. Error: {str(e)}")
raise TranslationError(f"文件翻譯失敗: {str(e)}")
def _calculate_job_cost(self, job_id: int) -> float:
    """Return the total API cost accumulated by one translation job.

    Sums the ``cost`` column of every APIUsageStats row recorded for the
    given job id; a job with no usage rows costs 0.0.

    Args:
        job_id: Primary key of the translation job.

    Returns:
        float: Total cost, or 0.0 when no usage has been recorded.
    """
    from app import db
    from sqlalchemy import func
    from app.models.stats import APIUsageStats

    # SUM() yields None when no rows match, hence the falsy guard below.
    cost_sum = (
        db.session.query(func.sum(APIUsageStats.cost))
        .filter_by(job_id=job_id)
        .scalar()
    )
    return float(cost_sum) if cost_sum else 0.0
def _generate_combined_excel_document(self, parser, translation_mapping: Dict[str, List[str]],
                                      target_languages: List[str], output_path: Path) -> str:
    """Generate one Excel file combining every target language's translation.

    Each translatable cell is rewritten as the original text followed by one
    line per target language, i.e. ``original\\ntranslation1\\ntranslation2``.
    Translations are looked up from the ``dt_translation_cache`` table rather
    than from ``translation_mapping``.

    Args:
        parser: Excel parser exposing ``file_path``, ``extract_text_segments``,
            ``_get_display_text_for_translation`` and ``_should_translate``.
        translation_mapping: Currently unused; kept for interface
            compatibility with the per-language generation path.
        target_languages: Language codes in the order they should appear
            under each cell's original text.
        output_path: Destination path for the combined workbook.

    Returns:
        str: The path of the saved combined workbook.

    Raises:
        FileProcessingError: If building or saving the workbook fails.
    """
    try:
        import openpyxl
        from openpyxl.styles import Alignment, Font
        from sqlalchemy import text as sql_text
        from app import db

        # Load the workbook twice: the editable copy, plus a values-only copy
        # (best effort) so formula cells can expose their displayed text.
        wb = openpyxl.load_workbook(str(parser.file_path), data_only=False)
        try:
            wb_vals = openpyxl.load_workbook(str(parser.file_path), data_only=True)
        except Exception:
            wb_vals = None

        # Build the (language, source_text) -> translated_text map from the
        # translation cache.  De-duplicate segments first (order-preserving)
        # so each distinct text is queried only once per language instead of
        # once per occurrence; the deterministic ORDER BY ... LIMIT 1 query
        # returns the same row either way.
        original_segments = parser.extract_text_segments()
        unique_segments = list(dict.fromkeys(original_segments))
        combined_tmap = {}
        logger.info(f"Building combined translation map for {len(original_segments)} segments")
        for original_text in unique_segments:
            for target_lang in target_languages:
                result = db.session.execute(sql_text("""
                    SELECT translated_text
                    FROM dt_translation_cache
                    WHERE source_text = :text AND target_language = :lang
                    ORDER BY created_at ASC
                    LIMIT 1
                """), {'text': original_text, 'lang': target_lang})
                row = result.fetchone()
                if row and row[0]:
                    combined_tmap[(target_lang, original_text)] = row[0]
        logger.info(f"Built combined translation map with {len(combined_tmap)} mappings")

        # Walk every cell of every worksheet and write the combined text.
        for ws in wb.worksheets:
            logger.info(f"Processing combined worksheet: {ws.title}")
            ws_vals = wb_vals[ws.title] if wb_vals and ws.title in wb_vals.sheetnames else None
            max_row, max_col = ws.max_row, ws.max_column
            for r in range(1, max_row + 1):
                for c in range(1, max_col + 1):
                    cell = ws.cell(row=r, column=c)
                    src_text = parser._get_display_text_for_translation(ws, ws_vals, r, c)
                    if not src_text or not parser._should_translate(src_text, 'auto'):
                        continue
                    # One line per language; cache misses are flagged inline
                    # so reviewers can spot untranslated cells.
                    translations = []
                    for target_lang in target_languages:
                        if (target_lang, src_text) in combined_tmap:
                            translations.append(combined_tmap[(target_lang, src_text)])
                        else:
                            translations.append(f"【翻譯缺失|{target_lang}】")
                    if translations:
                        combined_text = src_text + '\n' + '\n'.join(translations)
                        cell.value = combined_text
                        cell.alignment = Alignment(wrap_text=True, vertical='top')
                        cell.font = Font(size=10)

        wb.save(str(output_path))
        logger.info(f"Generated combined Excel file: {output_path}")
        return str(output_path)
    except Exception as e:
        logger.error(f"Failed to generate combined Excel document: {str(e)}")
        raise FileProcessingError(f"組合 Excel 檔案生成失敗: {str(e)}")
def _generate_combined_pptx_document(self, parser, translation_results: Dict[str, List[str]],
                                     target_languages: List[str], output_path: Path) -> str:
    """Generate one PowerPoint file combining every target language's translation.

    For each non-empty text frame, every language's translation (looked up
    from the ``dt_translation_cache`` table) is appended below the original
    text.  Frames that already contain the expected translations are skipped
    so the operation is idempotent on re-runs.

    Args:
        parser: PowerPoint parser exposing ``file_path``,
            ``_extract_text_from_frame``, ``_check_existing_translations``
            and ``_append_translation``.
        translation_results: Currently unused; kept for interface
            compatibility with the per-language generation path.
        target_languages: Language codes in the order their translations
            should be appended to each frame.
        output_path: Destination path for the combined presentation.

    Returns:
        str: The path of the saved combined presentation.

    Raises:
        FileProcessingError: If building or saving the presentation fails.
    """
    try:
        import pptx
        from sqlalchemy import text as sql_text
        from app import db

        prs = pptx.Presentation(str(parser.file_path))

        # Collect every shape's text frame together with its extracted text,
        # skipping frames that hold only whitespace.
        text_frames_data = []
        for slide in prs.slides:
            for shape in slide.shapes:
                if getattr(shape, "has_text_frame", False):
                    text = parser._extract_text_from_frame(shape.text_frame)
                    if text.strip():
                        text_frames_data.append((shape.text_frame, text))

        # Build the (language, source_text) -> translated_text map from the
        # translation cache.  Query each distinct text only once per language
        # even when several frames share identical text; the deterministic
        # ORDER BY ... LIMIT 1 query returns the same row either way.
        combined_translation_map = {}
        logger.info(f"Building combined PowerPoint translation map for {len(text_frames_data)} text frames")
        unique_texts = list(dict.fromkeys(text for _, text in text_frames_data))
        for original_text in unique_texts:
            for target_lang in target_languages:
                result = db.session.execute(sql_text("""
                    SELECT translated_text
                    FROM dt_translation_cache
                    WHERE source_text = :text AND target_language = :lang
                    ORDER BY created_at ASC
                    LIMIT 1
                """), {'text': original_text, 'lang': target_lang})
                row = result.fetchone()
                if row and row[0]:
                    combined_translation_map[(target_lang, original_text)] = row[0]
        logger.info(f"Built combined PowerPoint translation map with {len(combined_translation_map)} mappings")

        # Append the combined translations to each frame.
        ok_count = skip_count = 0
        for text_frame, original_text in text_frames_data:
            # One entry per language; cache misses are flagged inline so
            # reviewers can spot untranslated frames.
            translations = []
            for target_lang in target_languages:
                if (target_lang, original_text) in combined_translation_map:
                    translations.append(combined_translation_map[(target_lang, original_text)])
                else:
                    translations.append(f"【翻譯缺失|{target_lang}】")
            # Skip frames that already carry these translations (re-run safety).
            if parser._check_existing_translations(text_frame, translations):
                skip_count += 1
                continue
            for translation in translations:
                parser._append_translation(text_frame, translation)
            ok_count += 1

        prs.save(str(output_path))
        logger.info(f"Generated combined PowerPoint file: {output_path} with {ok_count} frames, {skip_count} skips")
        return str(output_path)
    except Exception as e:
        logger.error(f"Failed to generate combined PowerPoint document: {str(e)}")
        raise FileProcessingError(f"組合 PowerPoint 檔案生成失敗: {str(e)}")