3rd_fix download
This commit is contained in:
@@ -116,6 +116,294 @@ class DocxParser(DocumentParser):
|
||||
raise FileProcessingError(f"生成翻譯 DOCX 失敗: {str(e)}")
|
||||
|
||||
|
||||
class DocParser(DocumentParser):
    """Parser for legacy DOC files.

    DOC content is never read directly. The file is first converted to a
    temporary DOCX through Word COM automation and the actual work is then
    delegated to ``DocxParser``.
    """

    # Value passed as ``FileFormat`` to ``SaveAs2`` to obtain a DOCX file
    # (``wdFormatDocumentDefault`` in Word's ``WdSaveFormat`` enumeration).
    _DOCX_FILE_FORMAT = 16

    def extract_text_segments(self) -> List[str]:
        """Extract the text segments of the DOC file.

        The document is converted to a temporary DOCX which is parsed by
        ``DocxParser``; the temporary file is removed afterwards.

        Returns:
            The segments returned by ``DocxParser.extract_text_segments``.

        Raises:
            FileProcessingError: if Word COM support is unavailable, or the
                conversion or the DOCX parsing fails.
        """
        try:
            # Probe for the COM bindings up front so the user gets an
            # actionable message instead of a low-level import error.
            try:
                import win32com.client  # noqa: F401
                import pythoncom  # noqa: F401
                com_available = True
            except ImportError:
                com_available = False

            if not com_available:
                raise FileProcessingError("DOC 格式需要 Word COM 支援,請先手動轉換為 DOCX 格式或安裝 Microsoft Office")

            temp_docx = self._convert_to_temp_docx()
            try:
                segments = DocxParser(temp_docx).extract_text_segments()
                logger.info(f"Converted DOC to DOCX and extracted {len(segments)} segments")
                return segments
            finally:
                self._remove_temp_file(temp_docx)

        except Exception as e:
            logger.error(f"Failed to extract text from DOC file: {str(e)}")
            raise FileProcessingError(f"DOC 文件解析失敗: {str(e)}") from e

    def _convert_to_temp_docx(self) -> str:
        """Convert the source DOC to a new temporary DOCX and return its path.

        The caller owns the returned file and must delete it. If the
        conversion fails the temporary file is removed before re-raising.
        """
        import tempfile

        # Only the unique name is needed; Word writes the content itself.
        with tempfile.NamedTemporaryFile(suffix='.docx', delete=False) as tmp:
            temp_docx = tmp.name

        try:
            self._word_convert(str(self.file_path), temp_docx, self._DOCX_FILE_FORMAT)
        except Exception:
            self._remove_temp_file(temp_docx)
            raise
        return temp_docx

    @staticmethod
    def _remove_temp_file(path: str) -> None:
        """Best-effort removal of a temporary file; failures are only logged."""
        import os

        try:
            if path and os.path.exists(path):
                os.remove(path)
        except OSError as e:
            # A leftover temp file must not turn a successful run into a failure.
            logger.warning(f"Could not remove temporary file {path}: {e}")

    def _word_convert(self, input_path: str, output_path: str, target_format: int):
        """Convert a document to another format through Word COM automation.

        Args:
            input_path: Path of the document to open.
            output_path: Path the converted document is saved to.
            target_format: Word ``FileFormat`` code passed to ``SaveAs2``.

        Raises:
            FileProcessingError: if the COM bindings are missing or any step
                of the conversion fails.
        """
        import os

        try:
            import win32com.client as win32
            import pythoncom

            pythoncom.CoInitialize()
            word = None
            doc = None
            try:
                word = win32.Dispatch("Word.Application")
                word.Visible = False
                doc = word.Documents.Open(os.path.abspath(input_path))
                doc.SaveAs2(os.path.abspath(output_path), FileFormat=target_format)
            finally:
                # Release only what was actually acquired, so a failure in
                # Dispatch/Open is reported as itself and not masked by an
                # error on an unbound name.
                try:
                    if doc is not None:
                        doc.Close(False)
                finally:
                    try:
                        if word is not None:
                            word.Quit()
                    finally:
                        pythoncom.CoUninitialize()
        except Exception as e:
            raise FileProcessingError(f"Word COM 轉換失敗: {str(e)}") from e

    def generate_translated_document(self, translations: Dict[str, List[str]],
                                     target_language: str, output_dir: Path) -> str:
        """Generate the translated document for a DOC source.

        The DOC is converted to a temporary DOCX and ``DocxParser`` produces
        the translated file, so the final output is a DOCX, not a DOC.

        Args:
            translations: Mapping of target language to translated segments.
            target_language: Language whose translations are written.
            output_dir: Directory receiving the generated file.

        Returns:
            Path of the generated DOCX file.

        Raises:
            FileProcessingError: if conversion or generation fails.
        """
        import os

        try:
            temp_docx = self._convert_to_temp_docx()
            try:
                docx_parser = DocxParser(temp_docx)
                result_path = docx_parser.generate_translated_document(translations, target_language, output_dir)

                # NOTE(review): DocxParser was constructed on the temporary
                # file, so its output may be named after that file. Move it
                # to the name derived from the original DOC - confirm against
                # DocxParser.generate_translated_document.
                output_filename = f"{self.file_path.stem}_{target_language}_translated.docx"
                output_path = str(output_dir / output_filename)
                same_file = os.path.abspath(str(result_path)) == os.path.abspath(output_path)
                if not same_file and os.path.isfile(result_path):
                    os.replace(result_path, output_path)
                    result_path = output_path

                logger.info(f"Generated translated DOC file (as DOCX): {result_path}")
                return result_path
            finally:
                self._remove_temp_file(temp_docx)

        except Exception as e:
            logger.error(f"Failed to generate translated DOC file: {str(e)}")
            raise FileProcessingError(f"DOC 翻譯檔生成失敗: {str(e)}") from e
|
||||
|
||||
|
||||
class ExcelParser(DocumentParser):
    """Parser for Excel workbooks (XLSX; XLS must be converted to XLSX first)."""

    # Inclusive code-point ranges treated as CJK text.
    _CJK_RANGES = (
        (0x4E00, 0x9FFF),    # CJK Unified Ideographs
        (0x3400, 0x4DBF),    # CJK Unified Ideographs Extension A
        (0x20000, 0x2A6DF),  # CJK Unified Ideographs Extension B
        (0x3040, 0x309F),    # Hiragana
        (0x30A0, 0x30FF),    # Katakana
        (0xAC00, 0xD7AF),    # Hangul Syllables
    )

    def extract_text_segments(self) -> List[str]:
        """Extract the unique translatable cell texts of the workbook.

        Returns:
            Cell texts in first-seen order (sheet by sheet, row by row),
            without duplicates.

        Raises:
            FileProcessingError: if the workbook cannot be loaded or parsed.
        """
        try:
            wb, wb_vals = self._load_workbooks()

            segs = [
                text
                for _cell, text in self._iter_text_cells(wb, wb_vals)
                if self._should_translate(text, 'auto')
            ]

            # De-duplicate while keeping first-seen order.
            unique_segments = list(dict.fromkeys(segs))

            logger.info(f"Extracted {len(unique_segments)} unique text segments from Excel file")
            return unique_segments

        except Exception as e:
            logger.error(f"Failed to extract text from Excel file: {str(e)}")
            raise FileProcessingError(f"Excel 文件解析失敗: {str(e)}") from e

    def _load_workbooks(self):
        """Load the workbook twice: with formulas and with cached values.

        Returns:
            ``(wb, wb_vals)`` where ``wb`` keeps formulas (``data_only=False``)
            and ``wb_vals`` holds the cached values (``data_only=True``), or
            ``None`` if that second load fails.

        Raises:
            FileProcessingError: for an ``.xls`` file openpyxl rejects.
        """
        import openpyxl
        from openpyxl.utils.exceptions import InvalidFileException

        try:
            wb = openpyxl.load_workbook(str(self.file_path), data_only=False)
        except InvalidFileException:
            if self.file_path.suffix.lower() == '.xls':
                raise FileProcessingError("XLS 格式需要先轉換為 XLSX 格式")
            raise

        # The values workbook is optional: it is only used to show formula
        # results, so a failure here must not leave ``wb`` unusable.
        try:
            wb_vals = openpyxl.load_workbook(str(self.file_path), data_only=True)
        except Exception:
            wb_vals = None

        return wb, wb_vals

    def _iter_text_cells(self, wb, wb_vals):
        """Yield ``(cell, text)`` for every cell of ``wb`` with display text."""
        for ws in wb.worksheets:
            ws_vals = wb_vals[ws.title] if wb_vals and ws.title in wb_vals.sheetnames else None
            max_row, max_col = ws.max_row, ws.max_column

            for r in range(1, max_row + 1):
                for c in range(1, max_col + 1):
                    src_text = self._get_display_text_for_translation(ws, ws_vals, r, c)
                    if src_text:
                        yield ws.cell(row=r, column=c), src_text

    def _get_display_text_for_translation(self, ws, ws_vals, r: int, c: int) -> Optional[str]:
        """Return the text a cell displays, or ``None`` if it shows no text.

        Args:
            ws: Worksheet loaded with formulas.
            ws_vals: Same worksheet loaded with cached values, or ``None``.
            r: 1-based row index.
            c: 1-based column index.

        Returns:
            The literal string of a plain text cell; otherwise (formula cell,
            non-string or blank value) the cached string from ``ws_vals`` when
            it is a non-blank string; ``None`` in every other case.
        """
        raw = ws.cell(row=r, column=c).value
        if isinstance(raw, str) and not raw.startswith("=") and raw.strip():
            return raw

        if ws_vals is None:
            return None

        shown = ws_vals.cell(row=r, column=c).value
        if isinstance(shown, str) and shown.strip():
            return shown
        return None

    def _should_translate(self, text: str, src_lang: str) -> bool:
        """Decide whether a piece of text is worth translating.

        Args:
            text: Candidate text; surrounding whitespace is ignored.
            src_lang: Source language code; ``'auto'`` / ``'auto-detect'``
                enables the heuristic below.

        Returns:
            ``False`` for texts shorter than 3 characters or made only of
            digits, whitespace and ``. - : /``. In auto mode ``True`` only
            if the text contains CJK characters or is longer than 5
            characters. ``True`` otherwise.
        """
        import re

        text = text.strip()
        if len(text) < 3:
            return False

        # Skip pure numbers, dates, etc.
        if re.match(r'^[\d\s\.\-\:\/]+$', text):
            return False

        # For auto-detect, translate if it has CJK or is meaningful text.
        if src_lang.lower() in ('auto', 'auto-detect'):
            return self._has_cjk(text) or len(text) > 5

        return True

    def _has_cjk(self, text: str) -> bool:
        """Return ``True`` if ``text`` contains a character in ``_CJK_RANGES``."""
        # Compare code points: a ``\u`` escape only takes four hex digits, so
        # the supplementary-plane range cannot be written as '\u20000'.
        for char in text:
            code_point = ord(char)
            if any(low <= code_point <= high for low, high in self._CJK_RANGES):
                return True
        return False

    def generate_translated_document(self, translations: Dict[str, List[str]],
                                     target_language: str, output_dir: Path) -> str:
        """Write a copy of the workbook with translations added.

        Plain text cells become ``"<original>\\n<translation>"`` with wrap
        text enabled; formula cells keep their formula and receive the
        translation as a cell comment.

        Args:
            translations: Mapping of target language to translated segments,
                in the order returned by ``extract_text_segments``.
            target_language: Language whose translations are written.
            output_dir: Directory receiving the generated file.

        Returns:
            Path of the saved ``*_translated.xlsx`` file.

        Raises:
            FileProcessingError: if loading, processing or saving fails.
        """
        try:
            from openpyxl.styles import Alignment
            from openpyxl.comments import Comment

            wb, wb_vals = self._load_workbooks()

            # Pair each extracted segment with its translation by position;
            # segments without a translation are left untouched.
            translated_texts = translations.get(target_language, [])
            original_segments = self.extract_text_segments()
            tmap = dict(zip(original_segments, translated_texts))

            for cell, src_text in self._iter_text_cells(wb, wb_vals):
                if src_text not in tmap:
                    continue

                translated_text = tmap[src_text]
                val = cell.value
                is_formula = isinstance(val, str) and val.startswith("=")

                if is_formula:
                    # Formula cell: keep the formula, attach the translation
                    # as a comment unless the same comment is already there.
                    txt_comment = f"翻譯: {translated_text}"
                    exist = cell.comment
                    if not exist or exist.text.strip() != txt_comment:
                        cell.comment = Comment(txt_comment, "translator")
                    continue

                # Plain cell: original text followed by its translation.
                combined = f"{src_text}\n{translated_text}"

                # Nothing to do if the cell already has the expected content.
                current_text = str(cell.value) if cell.value else ""
                if current_text.strip() == combined.strip():
                    continue

                cell.value = combined

                # Enable wrap text so both lines are visible, keeping the
                # existing horizontal/vertical alignment when possible.
                try:
                    if cell.alignment:
                        cell.alignment = Alignment(
                            horizontal=cell.alignment.horizontal,
                            vertical=cell.alignment.vertical,
                            wrap_text=True
                        )
                    else:
                        cell.alignment = Alignment(wrap_text=True)
                except Exception:
                    cell.alignment = Alignment(wrap_text=True)

            output_filename = f"{self.file_path.stem}_{target_language}_translated.xlsx"
            output_path = output_dir / output_filename
            wb.save(str(output_path))

            logger.info(f"Generated translated Excel file: {output_path}")
            return str(output_path)

        except Exception as e:
            logger.error(f"Failed to generate translated Excel file: {str(e)}")
            raise FileProcessingError(f"Excel 翻譯檔生成失敗: {str(e)}") from e
|
||||
|
||||
|
||||
class PdfParser(DocumentParser):
|
||||
"""PDF 文件解析器(只讀)"""
|
||||
|
||||
@@ -179,7 +467,9 @@ class TranslationService:
|
||||
# 文件解析器映射
|
||||
self.parsers = {
|
||||
'.docx': DocxParser,
|
||||
'.doc': DocxParser, # 假設可以用 docx 處理
|
||||
'.doc': DocParser, # 需要先轉換為 DOCX
|
||||
'.xlsx': ExcelParser,
|
||||
'.xls': ExcelParser, # Excel 處理器會自動處理 XLS 轉換
|
||||
'.pdf': PdfParser,
|
||||
# 其他格式可以稍後添加
|
||||
}
|
||||
|
Reference in New Issue
Block a user