3rd_fix download

This commit is contained in:
beabigegg
2025-09-02 16:47:16 +08:00
parent b11a8272c4
commit e6e5332705
24 changed files with 1671 additions and 167 deletions

View File

@@ -116,6 +116,294 @@ class DocxParser(DocumentParser):
raise FileProcessingError(f"生成翻譯 DOCX 失敗: {str(e)}")
class DocParser(DocumentParser):
    """Parser for legacy DOC files.

    The DOC file is first converted to a temporary DOCX file through Word COM
    automation (``win32com``), and the actual work is then delegated to
    ``DocxParser``. The translated output is therefore always a DOCX file.
    """

    # FileFormat value passed to Word's SaveAs2 for the DOC -> DOCX conversion
    # (16 is wdFormatDocumentDefault in Word's WdSaveFormat enumeration).
    _DOCX_FILE_FORMAT = 16

    def extract_text_segments(self) -> List[str]:
        """Extract the text segments of the DOC file.

        Returns:
            The segments reported by ``DocxParser`` for the converted file.

        Raises:
            FileProcessingError: if Word COM support is missing, the
                conversion fails, or the DOCX parser fails.
        """
        try:
            # Fail early with an actionable message when pywin32 is missing.
            try:
                import win32com.client  # noqa: F401
                import pythoncom  # noqa: F401
            except ImportError as import_error:
                raise FileProcessingError(
                    "DOC 格式需要 Word COM 支援,請先手動轉換為 DOCX 格式或安裝 Microsoft Office"
                ) from import_error

            temp_docx = None
            try:
                temp_docx = self._convert_to_temp_docx()
                segments = DocxParser(temp_docx).extract_text_segments()
                logger.info(f"Converted DOC to DOCX and extracted {len(segments)} segments")
                return segments
            finally:
                self._remove_temp_file(temp_docx)
        except Exception as e:
            logger.error(f"Failed to extract text from DOC file: {e}")
            raise FileProcessingError(f"DOC 文件解析失敗: {e}") from e

    def _convert_to_temp_docx(self) -> str:
        """Convert ``self.file_path`` into a new temporary DOCX file.

        Returns:
            Path of the temporary DOCX file. The caller owns the file and is
            responsible for deleting it.
        """
        import tempfile

        # Only the unique name is needed; Word overwrites the empty file.
        with tempfile.NamedTemporaryFile(suffix='.docx', delete=False) as tmp:
            temp_docx = tmp.name
        try:
            self._word_convert(str(self.file_path), temp_docx, self._DOCX_FILE_FORMAT)
        except Exception:
            # Do not leave the placeholder file behind when conversion fails.
            self._remove_temp_file(temp_docx)
            raise
        return temp_docx

    @staticmethod
    def _remove_temp_file(path: Optional[str]) -> None:
        """Best-effort removal of a temporary file; never raises."""
        import os

        if not path or not os.path.exists(path):
            return
        try:
            os.remove(path)
        except OSError as cleanup_error:
            # Cleanup is best effort: a leftover temp file must not fail the job.
            logger.warning("Could not remove temporary file %s: %s", path, cleanup_error)

    def _word_convert(self, input_path: str, output_path: str, target_format: int):
        """Convert a document to another format through Word COM automation.

        Args:
            input_path: Path of the document to open.
            output_path: Path the converted document is saved to.
            target_format: Numeric ``FileFormat`` value passed to ``SaveAs2``.

        Raises:
            FileProcessingError: if pywin32 is unavailable or any COM call fails.
        """
        try:
            # ``os`` is imported locally because this method cannot rely on a
            # module-level import being present.
            import os
            import win32com.client as win32
            import pythoncom

            pythoncom.CoInitialize()
            try:
                word = None
                try:
                    word = win32.Dispatch("Word.Application")
                    word.Visible = False
                    doc = word.Documents.Open(os.path.abspath(input_path))
                    try:
                        doc.SaveAs2(os.path.abspath(output_path), FileFormat=target_format)
                    finally:
                        # Close without saving changes even if SaveAs2 failed.
                        doc.Close(False)
                finally:
                    # ``word`` stays None when Dispatch itself failed.
                    if word is not None:
                        word.Quit()
            finally:
                # Always balance CoInitialize, whatever happened above.
                pythoncom.CoUninitialize()
        except Exception as e:
            raise FileProcessingError(f"Word COM 轉換失敗: {e}") from e

    def generate_translated_document(self, translations: Dict[str, List[str]],
                                     target_language: str, output_dir: Path) -> str:
        """Generate the translated document for a DOC file (output is DOCX).

        Args:
            translations: Mapping of target language to translated segments.
            target_language: Language whose translations are written.
            output_dir: Directory handed to the DOCX parser for the output.

        Returns:
            The output path reported by ``DocxParser``.

        Raises:
            FileProcessingError: if conversion or document generation fails.
        """
        try:
            temp_docx = None
            try:
                temp_docx = self._convert_to_temp_docx()
                # NOTE(review): the DOCX parser is given the temporary file, so
                # the output name it picks may derive from the temp file name
                # rather than this DOC file's stem — confirm against DocxParser.
                result_path = DocxParser(temp_docx).generate_translated_document(
                    translations, target_language, output_dir
                )
                logger.info(f"Generated translated DOC file (as DOCX): {result_path}")
                return result_path
            finally:
                self._remove_temp_file(temp_docx)
        except Exception as e:
            logger.error(f"Failed to generate translated DOC file: {e}")
            raise FileProcessingError(f"DOC 翻譯檔生成失敗: {e}") from e
class ExcelParser(DocumentParser):
    """Parser for Excel workbooks, based on openpyxl.

    A ``.xls`` file that openpyxl rejects as an invalid file is reported as
    needing conversion to XLSX first.
    """

    # Inclusive code point ranges treated as CJK text by ``_has_cjk``.
    _CJK_RANGES = (
        (0x4E00, 0x9FFF),    # CJK Unified Ideographs
        (0x3400, 0x4DBF),    # CJK Unified Ideographs Extension A
        (0x20000, 0x2A6DF),  # CJK Unified Ideographs Extension B
        (0x3040, 0x309F),    # Hiragana
        (0x30A0, 0x30FF),    # Katakana
        (0xAC00, 0xD7AF),    # Hangul Syllables
    )

    def extract_text_segments(self) -> List[str]:
        """Extract the unique translatable text segments of the workbook.

        Returns:
            Cell texts that pass ``_should_translate`` in auto mode, without
            duplicates, in first-seen order (sheet by sheet, row by row).

        Raises:
            FileProcessingError: if the workbook cannot be loaded or parsed.
        """
        try:
            import openpyxl
            from openpyxl.utils.exceptions import InvalidFileException

            # The formula workbook is mandatory; load it on its own so that a
            # failure surfaces with its real cause.
            try:
                wb = openpyxl.load_workbook(str(self.file_path), data_only=False)
            except InvalidFileException:
                if self.file_path.suffix.lower() == '.xls':
                    raise FileProcessingError("XLS 格式需要先轉換為 XLSX 格式")
                raise
            wb_vals = self._load_value_workbook()

            segs = [
                src_text
                for _cell, src_text in self._iter_source_cells(wb, wb_vals)
                if self._should_translate(src_text, 'auto')
            ]
            # dict.fromkeys drops duplicates while keeping first-seen order.
            unique_segments = list(dict.fromkeys(segs))

            logger.info(f"Extracted {len(unique_segments)} unique text segments from Excel file")
            return unique_segments
        except Exception as e:
            logger.error(f"Failed to extract text from Excel file: {e}")
            raise FileProcessingError(f"Excel 文件解析失敗: {e}") from e

    def _load_value_workbook(self):
        """Load the workbook with ``data_only=True``.

        Returns:
            The loaded workbook, or ``None`` when loading fails; this view is
            optional and only used to read the displayed text of formulas.
        """
        import openpyxl

        try:
            return openpyxl.load_workbook(str(self.file_path), data_only=True)
        except Exception:
            return None

    def _iter_source_cells(self, wb, wb_vals):
        """Yield ``(cell, display_text)`` for every cell of ``wb`` with text.

        Args:
            wb: Workbook loaded with ``data_only=False``.
            wb_vals: Matching workbook loaded with ``data_only=True``, or None.
        """
        for ws in wb.worksheets:
            ws_vals = wb_vals[ws.title] if wb_vals and ws.title in wb_vals.sheetnames else None
            max_row, max_col = ws.max_row, ws.max_column
            for r in range(1, max_row + 1):
                for c in range(1, max_col + 1):
                    src_text = self._get_display_text_for_translation(ws, ws_vals, r, c)
                    if src_text:
                        yield ws.cell(row=r, column=c), src_text

    def _get_display_text_for_translation(self, ws, ws_vals, r: int, c: int) -> Optional[str]:
        """Return the text of a cell that should be offered for translation.

        Args:
            ws: Worksheet from the ``data_only=False`` workbook.
            ws_vals: Same worksheet from the ``data_only=True`` workbook, or None.
            r: 1-based row index.
            c: 1-based column index.

        Returns:
            For a formula cell, the non-blank string from ``ws_vals``; for any
            other cell, its own non-blank string or else the one from
            ``ws_vals``. ``None`` when no such string exists.
        """
        val = ws.cell(row=r, column=c).value
        is_formula = isinstance(val, str) and val.startswith("=")

        # A non-formula cell holding real text is used as-is.
        if not is_formula and isinstance(val, str) and val.strip():
            return val

        # Otherwise only the value workbook can provide displayable text.
        if ws_vals is None:
            return None
        shown = ws_vals.cell(row=r, column=c).value
        if isinstance(shown, str) and shown.strip():
            return shown
        return None

    def _should_translate(self, text: str, src_lang: str) -> bool:
        """Decide whether a text is worth translating.

        Args:
            text: Candidate text; surrounding whitespace is ignored.
            src_lang: Source language; ``auto`` / ``auto-detect`` enable the
                CJK-or-longer-than-five-characters heuristic.

        Returns:
            False for texts shorter than three characters and for texts made
            only of digits, whitespace and ``. - : /``; otherwise the result
            of the auto heuristic, or True for an explicit source language.
        """
        import re

        text = text.strip()
        if len(text) < 3:
            return False

        # Skip pure numbers, dates, times and similar.
        if re.match(r'^[\d\s\.\-\:\/]+$', text):
            return False

        # In auto-detect mode, translate CJK text or anything reasonably long.
        if src_lang.lower() in ('auto', 'auto-detect'):
            return self._has_cjk(text) or len(text) > 5

        return True

    def _has_cjk(self, text: str) -> bool:
        """Return True if ``text`` contains a character in ``_CJK_RANGES``.

        Code points are compared numerically. A ``\\u`` string escape only
        takes four hex digits, so the Extension B bounds (above U+FFFF)
        cannot be written as ``'\\u20000'``-style literals.
        """
        # Look the table up on the class so the method does not depend on
        # instance state.
        ranges = ExcelParser._CJK_RANGES
        return any(
            low <= ord(char) <= high
            for char in text
            for low, high in ranges
        )

    def generate_translated_document(self, translations: Dict[str, List[str]],
                                     target_language: str, output_dir: Path) -> str:
        """Write a translated copy of the workbook as XLSX.

        Formula cells keep their formula and receive the translation as a
        cell comment; other cells are rewritten as the original text, a
        newline, then the translation, with wrap-text enabled.

        Args:
            translations: Mapping of target language to translated segments,
                positionally aligned with ``extract_text_segments()``.
            target_language: Language whose translations are written.
            output_dir: Directory the output file is saved into.

        Returns:
            Path of the saved file as a string.

        Raises:
            FileProcessingError: if loading, processing or saving fails.
        """
        try:
            import openpyxl
            from openpyxl.styles import Alignment
            from openpyxl.comments import Comment

            wb = openpyxl.load_workbook(str(self.file_path), data_only=False)
            wb_vals = self._load_value_workbook()

            # Map each source segment to its translation by position; zip
            # stops at the shorter list, so missing translations are skipped.
            translated_texts = translations.get(target_language, [])
            original_segments = self.extract_text_segments()
            tmap = dict(zip(original_segments, translated_texts))

            for cell, src_text in self._iter_source_cells(wb, wb_vals):
                if src_text not in tmap:
                    continue
                translated_text = tmap[src_text]

                val = cell.value
                if isinstance(val, str) and val.startswith("="):
                    # Formula cell: keep the formula, attach a comment.
                    txt_comment = f"翻譯: {translated_text}"
                    exist = cell.comment
                    if not exist or exist.text.strip() != txt_comment:
                        cell.comment = Comment(txt_comment, "translator")
                    continue

                # Plain cell: original text followed by its translation.
                combined = f"{src_text}\n{translated_text}"
                current_text = str(cell.value) if cell.value else ""
                if current_text.strip() == combined.strip():
                    # Already in the expected format.
                    continue
                cell.value = combined

                # Turn on wrap-text so both lines are visible, keeping the
                # existing horizontal/vertical alignment when possible.
                try:
                    if cell.alignment:
                        cell.alignment = Alignment(
                            horizontal=cell.alignment.horizontal,
                            vertical=cell.alignment.vertical,
                            wrap_text=True,
                        )
                    else:
                        cell.alignment = Alignment(wrap_text=True)
                except Exception:
                    cell.alignment = Alignment(wrap_text=True)

            output_filename = f"{self.file_path.stem}_{target_language}_translated.xlsx"
            output_path = output_dir / output_filename
            wb.save(str(output_path))

            logger.info(f"Generated translated Excel file: {output_path}")
            return str(output_path)
        except Exception as e:
            logger.error(f"Failed to generate translated Excel file: {e}")
            raise FileProcessingError(f"Excel 翻譯檔生成失敗: {e}") from e
class PdfParser(DocumentParser):
"""PDF 文件解析器(只讀)"""
@@ -179,7 +467,9 @@ class TranslationService:
# 文件解析器映射
self.parsers = {
'.docx': DocxParser,
'.doc': DocxParser, # 假設可以用 docx 處理
'.doc': DocParser, # 需要先轉換為 DOCX
'.xlsx': ExcelParser,
'.xls': ExcelParser, # Excel 處理器會自動處理 XLS 轉換
'.pdf': PdfParser,
# 其他格式可以稍後添加
}