Files
Document_Translator/debug_excel_translation.py
2025-09-03 15:07:34 +08:00

184 lines
6.3 KiB
Python

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
調試Excel翻譯問題
"""
import sys
import os
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
import openpyxl
from pathlib import Path
# Force UTF-8 on stdout so the CJK diagnostics below can be printed on
# consoles whose default encoding cannot represent them.
# ``reconfigure`` only exists on ``io.TextIOWrapper`` streams; skip the call
# when stdout is missing or has been replaced by another kind of object
# (e.g. redirected into ``io.StringIO``), instead of crashing at import time.
if hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(encoding='utf-8')
def debug_excel_translation_process(excel_dir=None):
    """Print a step-by-step diagnosis of one Excel translation job.

    The function performs three steps and prints its findings to stdout:

    1. Re-runs the text extraction over every worksheet of the original
       workbook (using ``get_display_text_for_translation`` and
       ``should_translate``), printing the first 20 extracted cells and
       the de-duplicated segment count.
    2. Compares a fixed list of cells between the active sheet of the
       original workbook and the active sheet of the translated workbook,
       reporting whether the translated value contains a line break
       (reported as the two-line format) or not.
    3. Inspects cell A1 specifically: its original and translated values,
       whether it passes ``should_translate`` and whether it was extracted.

    Args:
        excel_dir: Directory containing ``original_panjit_f0b78200.xlsx``
            and ``original_panjit_f0b78200_ja_translated.xlsx``. Accepts a
            ``str`` or ``Path``. When omitted, the hard-coded upload
            directory of the job under investigation is used.

    Returns:
        None. If either workbook file is missing, a message is printed and
        the function returns before opening anything.
    """
    print("=" * 80)
    print("Excel 翻譯過程調試")
    print("=" * 80)

    # File locations
    if excel_dir is None:
        excel_dir = Path(r"C:\Users\EGG\WORK\data\user_scrip\TOOL\Document_translator_V2\uploads\f0b78200-2c5e-41a4-bac8-1536f92529e9")
    else:
        excel_dir = Path(excel_dir)
    original_file = excel_dir / "original_panjit_f0b78200.xlsx"
    translated_file = excel_dir / "original_panjit_f0b78200_ja_translated.xlsx"

    if not original_file.exists():
        print(f"原始文件不存在: {original_file}")
        return
    if not translated_file.exists():
        print(f"翻譯文件不存在: {translated_file}")
        return

    print("\n1. 分析原始文件提取過程")
    print("-" * 50)

    # Mimic ExcelParser.extract_text_segments(): one workbook with formulas
    # (data_only=False) and, when possible, one with cached values.
    wb = openpyxl.load_workbook(str(original_file), data_only=False)
    wb_vals = None
    wb_trans = None
    try:
        try:
            wb_vals = openpyxl.load_workbook(str(original_file), data_only=True)
        except Exception as exc:
            # Best effort: without the cached-values workbook, formula cells
            # yield no text. Report it so the missing segments are explainable.
            print(f"(警告) 無法以 data_only=True 載入工作簿,公式儲存格將被略過: {exc!r}")

        print(f"工作簿載入成功,共 {len(wb.worksheets)} 個工作表")

        # Extract the text segments
        segs = []
        for ws in wb.worksheets:
            print(f"\n處理工作表: {ws.title}")
            ws_vals = None
            if wb_vals is not None and ws.title in wb_vals.sheetnames:
                ws_vals = wb_vals[ws.title]
            max_row, max_col = ws.max_row, ws.max_column
            print(f"工作表大小: {max_row} x {max_col}")

            for r in range(1, max_row + 1):
                for c in range(1, max_col + 1):
                    src_text = get_display_text_for_translation(ws, ws_vals, r, c)
                    if not src_text:
                        continue
                    if not should_translate(src_text, 'auto'):
                        continue

                    cell_name = f"{openpyxl.utils.get_column_letter(c)}{r}"
                    segs.append(src_text)

                    # Show only the first 20 extracted cells; repr() keeps
                    # control/special characters visible and printable.
                    if len(segs) <= 20:
                        print(f" {cell_name}: {src_text!r}")

        print(f"\n提取結果: 共提取到 {len(segs)} 個文字片段")

        # De-duplicate while keeping first-seen order.
        unique_segments = list(dict.fromkeys(segs))
        print(f"去重後: {len(unique_segments)} 個唯一文字片段")

        print("\n2. 分析翻譯結果寫入過程")
        print("-" * 50)

        # Inspect the translated workbook
        wb_trans = openpyxl.load_workbook(str(translated_file), data_only=False)
        orig_ws = wb.active
        trans_ws = wb_trans.active

        # Compare a fixed set of cells on the active sheets
        important_cells = ['A1', 'B1', 'C1', 'D1', 'B3', 'C3', 'D3']
        for cell_name in important_cells:
            orig_val = orig_ws[cell_name].value
            trans_val = trans_ws[cell_name].value

            print(f"\n儲存格 {cell_name}:")
            print(f" 原始: {orig_val!r}")
            print(f" 翻譯: {trans_val!r}")

            # Expected layout is: source text, line break, translation.
            if isinstance(trans_val, str) and '\n' in trans_val:
                lines = trans_val.split('\n')
                print(f" 格式: 雙行格式,共 {len(lines)}")
                for line_no, line in enumerate(lines, start=1):
                    print(f"{line_no}: {line!r}")
            else:
                print(" 格式: 單行格式")

        print("\n3. 檢查 A1 儲存格特殊情況")
        print("-" * 50)

        # Special look at A1
        a1_orig = orig_ws['A1'].value
        a1_trans = trans_ws['A1'].value
        a1_text = str(a1_orig) if a1_orig else ''
        a1_extracted = a1_text in unique_segments if a1_orig else False

        print(f"A1 原始值: {a1_orig!r}")
        print(f"A1 翻譯值: {a1_trans!r}")
        print(f"A1 是否需要翻譯: {should_translate(a1_text, 'auto')}")
        print(f"A1 是否在提取列表中: {a1_extracted}")
    finally:
        # Close every workbook that was opened, even if the analysis raised.
        wb.close()
        if wb_trans is not None:
            wb_trans.close()
        if wb_vals is not None:
            wb_vals.close()
def get_display_text_for_translation(ws, ws_vals, r: int, c: int):
    """Return the text of cell (r, c) that should be sent for translation.

    Args:
        ws: Worksheet whose cells hold the stored content (formulas stay
            as ``"=..."`` strings).
        ws_vals: Matching worksheet whose cells hold cached values, or
            ``None`` when it is not available.
        r: 1-based row index.
        c: 1-based column index.

    Returns:
        The stored string when it is non-blank and not a formula;
        otherwise the cached value from ``ws_vals`` when that is a
        non-blank string; otherwise ``None``.
    """
    raw = ws.cell(row=r, column=c).value

    # Plain, non-blank text is used as-is.
    if isinstance(raw, str) and not raw.startswith("=") and raw.strip():
        return raw

    # Formulas, blanks and non-string values can only be resolved through
    # the cached-values sheet.
    if ws_vals is None:
        return None

    cached = ws_vals.cell(row=r, column=c).value
    if isinstance(cached, str) and cached.strip():
        return cached
    return None
def should_translate(text: str, src_lang: str) -> bool:
    """Decide whether a piece of cell text is worth translating.

    Args:
        text: Candidate text; surrounding whitespace is ignored.
        src_lang: Source language code. ``'auto'`` / ``'auto-detect'``
            (case-insensitive) enables the heuristic described below.

    Returns:
        ``False`` when the stripped text is shorter than 3 characters or
        consists only of digits, whitespace and ``. - : /`` (numbers,
        dates, times). For an auto-detected source language, ``True``
        only if the text contains CJK characters or is longer than 5
        characters. For any other source language, ``True``.

    NOTE(review): the length check runs before the CJK check, so one- and
    two-character CJK strings are never translated.
    """
    import re

    candidate = text.strip()

    # Too short, or nothing but numeric/date punctuation.
    if len(candidate) < 3 or re.match(r'^[\d\s\.\-\:\/]+$', candidate):
        return False

    # An explicit source language needs no further heuristics.
    if src_lang.lower() not in ('auto', 'auto-detect'):
        return True

    # Auto-detect: CJK content, or enough characters to be meaningful.
    return has_cjk(candidate) or len(candidate) > 5
# Inclusive Unicode code-point ranges that count as CJK text.
_CJK_CODEPOINT_RANGES = (
    (0x4E00, 0x9FFF),    # CJK Unified Ideographs
    (0x3400, 0x4DBF),    # CJK Unified Ideographs Extension A
    (0x20000, 0x2A6DF),  # CJK Unified Ideographs Extension B
    (0x3040, 0x309F),    # Hiragana
    (0x30A0, 0x30FF),    # Katakana
    (0xAC00, 0xD7AF),    # Hangul Syllables
)


def has_cjk(text: str) -> bool:
    """Return True if *text* contains at least one CJK character.

    A character counts as CJK when its code point falls in one of
    ``_CJK_CODEPOINT_RANGES`` (Han ideographs incl. Extensions A and B,
    Hiragana, Katakana, Hangul syllables).

    Code points are compared as integers. The previous implementation wrote
    the Extension B bounds as ``'\\u20000'`` / ``'\\u2a6df'``; ``\\u`` takes
    exactly four hex digits, so those literals were two-character strings.
    As a result every character in U+2001..U+2A6D (dashes, currency signs,
    arrows, math symbols, ...) was reported as CJK, while real Extension B
    ideographs were not.

    NOTE(review): this helper is marked as ported from the translator's own
    code; if the same escape typo exists there, it should be fixed there too.

    Args:
        text: String to inspect; may be empty.

    Returns:
        True if any character is in a CJK range, otherwise False.
    """
    for char in text:
        code_point = ord(char)
        if any(low <= code_point <= high for low, high in _CJK_CODEPOINT_RANGES):
            return True
    return False
# Script entry point: run the diagnosis only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    debug_excel_translation_process()