184 lines
6.3 KiB
Python
184 lines
6.3 KiB
Python
#!/usr/bin/env python3
|
|
# -*- coding: utf-8 -*-
|
|
"""Debug Excel translation issues."""
|
|
|
|
import sys
|
|
import os
|
|
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
|
|
|
import openpyxl
|
|
from pathlib import Path
|
|
|
|
# Set the output encoding: switch stdout to UTF-8 (this script prints CJK text).
sys.stdout.reconfigure(encoding='utf-8')
|
|
|
|
def debug_excel_translation_process(excel_dir=None):
    """Trace text extraction from a workbook and inspect its translated copy.

    Prints three report sections to stdout:

    1. Every cell of the original workbook whose text passes
       ``get_display_text_for_translation`` and ``should_translate``
       (the first 20 hits are listed), followed by the total and the
       de-duplicated segment counts.
    2. Original vs. translated value for a fixed list of cells on the
       active sheet, including whether the translated value contains a
       newline (the expected "source + newline + translation" layout).
    3. A focused check of cell A1.

    Args:
        excel_dir: Directory (``str`` or ``Path``) containing
            ``original_panjit_f0b78200.xlsx`` and
            ``original_panjit_f0b78200_ja_translated.xlsx``. When omitted,
            the upload folder that used to be hard-coded here is used.

    Returns:
        None. If either workbook file is missing, a message is printed and
        the function returns before opening anything.
    """
    print("=" * 80)
    print("Excel 翻譯過程調試")
    print("=" * 80)

    # File paths
    if excel_dir is None:
        excel_dir = Path(r"C:\Users\EGG\WORK\data\user_scrip\TOOL\Document_translator_V2\uploads\f0b78200-2c5e-41a4-bac8-1536f92529e9")
    else:
        excel_dir = Path(excel_dir)
    original_file = excel_dir / "original_panjit_f0b78200.xlsx"
    translated_file = excel_dir / "original_panjit_f0b78200_ja_translated.xlsx"

    if not original_file.exists():
        print(f"原始文件不存在: {original_file}")
        return

    if not translated_file.exists():
        print(f"翻譯文件不存在: {translated_file}")
        return

    print("\n1. 分析原始文件提取過程")
    print("-" * 50)

    # Simulate what ExcelParser.extract_text_segments() does.
    wb = openpyxl.load_workbook(str(original_file), data_only=False)
    wb_vals = None
    wb_trans = None
    try:
        # Second copy with data_only=True, used to read the values stored for
        # formula cells. Deliberately best-effort: without it, formula cells
        # are simply skipped by get_display_text_for_translation.
        try:
            wb_vals = openpyxl.load_workbook(str(original_file), data_only=True)
        except Exception:
            wb_vals = None

        print(f"工作簿載入成功,共 {len(wb.worksheets)} 個工作表")

        # Extract text segments
        segs = []

        for ws in wb.worksheets:
            print(f"\n處理工作表: {ws.title}")
            if wb_vals is not None and ws.title in wb_vals.sheetnames:
                ws_vals = wb_vals[ws.title]
            else:
                ws_vals = None
            max_row, max_col = ws.max_row, ws.max_column
            print(f"工作表大小: {max_row} x {max_col}")

            for r in range(1, max_row + 1):
                for c in range(1, max_col + 1):
                    src_text = get_display_text_for_translation(ws, ws_vals, r, c)
                    if not src_text:
                        continue
                    if not should_translate(src_text, 'auto'):
                        continue

                    cell_name = f"{openpyxl.utils.get_column_letter(c)}{r}"
                    segs.append(src_text)

                    # Only list the first 20 extracted cells in detail.
                    if len(segs) <= 20:
                        # repr() keeps control/special characters printable.
                        print(f" {cell_name}: {src_text!r}")

        print(f"\n提取結果: 共提取到 {len(segs)} 個文字片段")

        # De-duplicate while preserving first-seen order.
        unique_segments = list(dict.fromkeys(segs))

        print(f"去重後: {len(unique_segments)} 個唯一文字片段")

        print("\n2. 分析翻譯結果寫入過程")
        print("-" * 50)

        # Inspect the contents of the translated file.
        wb_trans = openpyxl.load_workbook(str(translated_file), data_only=False)
        orig_ws = wb.active
        trans_ws = wb_trans.active

        # Check how a handful of important cells were translated.
        important_cells = ['A1', 'B1', 'C1', 'D1', 'B3', 'C3', 'D3']

        for cell_name in important_cells:
            orig_val = orig_ws[cell_name].value    # original content
            trans_val = trans_ws[cell_name].value  # translated content

            print(f"\n儲存格 {cell_name}:")
            print(f" 原始: {orig_val!r}")
            print(f" 翻譯: {trans_val!r}")

            # Expected layout is "source text + newline + translation".
            if isinstance(trans_val, str) and '\n' in trans_val:
                lines = trans_val.split('\n')
                print(f" 格式: 雙行格式,共 {len(lines)} 行")
                for line_no, line in enumerate(lines, start=1):
                    print(f" 行{line_no}: {line!r}")
            else:
                print(" 格式: 單行格式")

        print("\n3. 檢查 A1 儲存格特殊情況")
        print("-" * 50)

        # Special-case inspection of cell A1.
        a1_orig = orig_ws['A1'].value
        a1_trans = trans_ws['A1'].value

        print(f"A1 原始值: {a1_orig!r}")
        print(f"A1 翻譯值: {a1_trans!r}")
        print(f"A1 是否需要翻譯: {should_translate(str(a1_orig) if a1_orig else '', 'auto')}")
        print(f"A1 是否在提取列表中: {str(a1_orig) in unique_segments if a1_orig else False}")
    finally:
        # Release the workbooks even when a step above raised.
        wb.close()
        if wb_trans is not None:
            wb_trans.close()
        if wb_vals is not None:
            wb_vals.close()
|
|
|
|
def get_display_text_for_translation(ws, ws_vals, r: int, c: int):
    """Return the text of cell (r, c) that should be sent for translation.

    Ported from the original code.

    Args:
        ws: Worksheet queried via ``ws.cell(row=..., column=...).value``.
        ws_vals: Companion worksheet read the same way and used as the
            fallback value source, or ``None`` when unavailable.
        r: 1-based row index.
        c: 1-based column index.

    Returns:
        A non-blank string, or ``None`` when the cell yields no text:
        * value is a string starting with ``"="`` -> the ``ws_vals`` value
          if that is a non-blank string, otherwise ``None``;
        * value is any other non-blank string -> that string unchanged;
        * anything else -> the ``ws_vals`` value if it is a non-blank
          string, otherwise ``None``.
    """
    def _fallback_text():
        # Non-blank string stored in the companion sheet, if there is one.
        if ws_vals is None:
            return None
        cached = ws_vals.cell(row=r, column=c).value
        if isinstance(cached, str) and cached.strip():
            return cached
        return None

    raw = ws.cell(row=r, column=c).value
    if isinstance(raw, str):
        if raw.startswith("="):
            # Formula text itself is never returned; only the fallback value.
            return _fallback_text()
        if raw.strip():
            return raw
    return _fallback_text()
|
|
|
|
def should_translate(text: str, src_lang: str) -> bool:
    """Decide whether *text* is worth translating (ported from the original code).

    Args:
        text: Candidate text; surrounding whitespace is ignored.
        src_lang: Source language code. ``'auto'`` / ``'auto-detect'``
            (case-insensitive) enables the extra auto-detect filter.

    Returns:
        ``False`` when the stripped text is shorter than 3 characters or
        consists only of digits, whitespace and ``. - : /``.
        In auto-detect mode, ``True`` only if the text contains CJK
        characters (per ``has_cjk``) or is longer than 5 characters.
        ``True`` for every other source language.
    """
    import re

    stripped = text.strip()

    too_short = len(stripped) < 3
    if too_short:
        return False

    # Skip pure numbers, dates, times and similar.
    if re.match(r'^[\d\s\.\-\:\/]+$', stripped) is not None:
        return False

    # An explicit source language needs no further filtering.
    if src_lang.lower() not in ('auto', 'auto-detect'):
        return True

    # Auto-detect: translate CJK text, or anything long enough to be meaningful.
    return has_cjk(stripped) or len(stripped) > 5
|
|
|
|
def has_cjk(text: str) -> bool:
    """Return True if *text* contains at least one CJK character.

    Ranges checked:
        * U+4E00-U+9FFF    CJK Unified Ideographs
        * U+3400-U+4DBF    CJK Unified Ideographs Extension A
        * U+20000-U+2A6DF  CJK Unified Ideographs Extension B
        * U+3040-U+309F    Hiragana
        * U+30A0-U+30FF    Katakana
        * U+AC00-U+D7AF    Hangul Syllables

    Args:
        text: String to scan; an empty string yields ``False``.
    """
    # Code points above U+FFFF need the 8-digit ``\U`` escape. The previous
    # literals ``'\u20000'`` / ``'\u2a6df'`` were two-character strings
    # (U+2000 + "0", U+2A6D + "f"), so the test matched punctuation, currency
    # signs, arrows, etc. and never matched real Extension B ideographs.
    # NOTE(review): if the production code this was ported from uses the same
    # literals, it likely has the same bug - confirm there.
    return any(
        '\u4e00' <= ch <= '\u9fff'
        or '\u3400' <= ch <= '\u4dbf'
        or '\U00020000' <= ch <= '\U0002a6df'
        or '\u3040' <= ch <= '\u309f'
        or '\u30a0' <= ch <= '\u30ff'
        or '\uac00' <= ch <= '\ud7af'
        for ch in text
    )
|
|
|
|
# Run the debug report when executed as a script.
if __name__ == "__main__":
    debug_excel_translation_process()