Document_Translator/debug_excel_translation.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
調試Excel翻譯問題
"""

import sys
import os
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))

import openpyxl
from pathlib import Path

# Set the output encoding: switch stdout to UTF-8 so the non-ASCII (CJK)
# text this script prints can be encoded regardless of the console default.
sys.stdout.reconfigure(encoding='utf-8')

def debug_excel_translation_process():
    """Trace how one Excel workbook is extracted for translation and written back.

    The routine works on a hard-coded pair of files (the original workbook and
    its translated counterpart) and prints three reports to stdout:

    1. Every text segment that the extraction step would pick up from the
       original workbook (details for the first 20 only), plus the count
       before and after de-duplication.
    2. For a fixed list of cells, the original value next to the translated
       value, and whether the translated value contains a newline
       (the expected "original + newline + translation" layout).
    3. A closer look at cell A1: its values, whether ``should_translate``
       accepts it, and whether it is among the extracted segments.

    Returns:
        None. If either file is missing a message is printed and the
        function returns early.
    """
    print("=" * 80)
    print("Excel 翻譯過程調試")
    print("=" * 80)

    # File paths (hard-coded to one specific upload directory).
    excel_dir = Path(r"C:\Users\EGG\WORK\data\user_scrip\TOOL\Document_translator_V2\uploads\f0b78200-2c5e-41a4-bac8-1536f92529e9")
    original_file = excel_dir / "original_panjit_f0b78200.xlsx"
    translated_file = excel_dir / "original_panjit_f0b78200_ja_translated.xlsx"

    if not original_file.exists():
        print(f"原始文件不存在: {original_file}")
        return

    if not translated_file.exists():
        print(f"翻譯文件不存在: {translated_file}")
        return

    print("\n1. 分析原始文件提取過程")
    print("-" * 50)

    # Every workbook that was opened successfully; all of them are closed in
    # the ``finally`` clause even when a later step raises.
    opened = []
    try:
        # Mimic the extraction step: load once with formulas and, best-effort,
        # once with cached values so formula cells can show their result.
        wb = openpyxl.load_workbook(str(original_file), data_only=False)
        opened.append(wb)
        try:
            wb_vals = openpyxl.load_workbook(str(original_file), data_only=True)
        except Exception as exc:
            # Deliberately best-effort: carry on without cached values, but
            # say so instead of failing silently.
            wb_vals = None
            print(f"無法以 data_only 模式載入工作簿，公式儲存格將被略過: {exc}")
        else:
            opened.append(wb_vals)

        print(f"工作簿載入成功，共 {len(wb.worksheets)} 個工作表")

        segs = _collect_translatable_segments(wb, wb_vals)
        print(f"\n提取結果: 共提取到 {len(segs)} 個文字片段")

        # De-duplicate while keeping first-seen order.
        unique_segments = list(dict.fromkeys(segs))
        print(f"去重後: {len(unique_segments)} 個唯一文字片段")

        print("\n2. 分析翻譯結果寫入過程")
        print("-" * 50)

        # Inspect the content of the translated file.
        wb_trans = openpyxl.load_workbook(str(translated_file), data_only=False)
        opened.append(wb_trans)

        orig_ws = wb.active
        trans_ws = wb_trans.active

        # Compare a fixed set of cells on the active sheets.
        for cell_name in ('A1', 'B1', 'C1', 'D1', 'B3', 'C3', 'D3'):
            _print_cell_comparison(orig_ws, trans_ws, cell_name)

        print("\n3. 檢查 A1 儲存格特殊情況")
        print("-" * 50)

        # A1 gets a dedicated report.
        a1_orig = orig_ws['A1'].value
        a1_trans = trans_ws['A1'].value
        a1_text = str(a1_orig) if a1_orig else ''
        a1_extracted = str(a1_orig) in unique_segments if a1_orig else False

        print(f"A1 原始值: {a1_orig!r}")
        print(f"A1 翻譯值: {a1_trans!r}")
        print(f"A1 是否需要翻譯: {should_translate(a1_text, 'auto')}")
        print(f"A1 是否在提取列表中: {a1_extracted}")
    finally:
        for book in opened:
            book.close()


def _collect_translatable_segments(wb, wb_vals):
    """Return every translatable cell text of *wb* in sheet/row/column order.

    *wb_vals* is the same workbook loaded with cached values, or ``None``.
    Progress and the first 20 extracted cells are printed along the way.
    """
    segs = []
    for ws in wb.worksheets:
        print(f"\n處理工作表: {ws.title}")
        has_vals_sheet = wb_vals is not None and ws.title in wb_vals.sheetnames
        ws_vals = wb_vals[ws.title] if has_vals_sheet else None
        max_row, max_col = ws.max_row, ws.max_column
        print(f"工作表大小: {max_row} x {max_col}")

        for r in range(1, max_row + 1):
            for c in range(1, max_col + 1):
                src_text = get_display_text_for_translation(ws, ws_vals, r, c)
                if not src_text or not should_translate(src_text, 'auto'):
                    continue

                segs.append(src_text)

                # Detailed output for the first 20 cells only; repr() keeps
                # control and special characters printable.
                if len(segs) <= 20:
                    cell_name = f"{openpyxl.utils.get_column_letter(c)}{r}"
                    print(f"  {cell_name}: {src_text!r}")
    return segs


def _print_cell_comparison(orig_ws, trans_ws, cell_name):
    """Print the original and translated value of one cell and its line layout."""
    orig_val = orig_ws[cell_name].value
    trans_val = trans_ws[cell_name].value

    print(f"\n儲存格 {cell_name}:")
    print(f"  原始: {orig_val!r}")
    print(f"  翻譯: {trans_val!r}")

    # Expected layout is "original + newline + translation".
    if isinstance(trans_val, str) and '\n' in trans_val:
        lines = trans_val.split('\n')
        print(f"  格式: 雙行格式，共 {len(lines)} 行")
        for line_no, line in enumerate(lines, start=1):
            print(f"    行{line_no}: {line!r}")
    else:
        print("  格式: 單行格式")

def get_display_text_for_translation(ws, ws_vals, r: int, c: int):
    """Return the text of cell (r, c) that should be sent for translation.

    Ported from the original program.

    Args:
        ws: Worksheet holding the raw cell contents (formulas kept as text).
        ws_vals: The same worksheet with cached values, or ``None``.
        r: 1-based row index.
        c: 1-based column index.

    Returns:
        The raw value when it is a non-blank string that is not a formula;
        otherwise the value from ``ws_vals`` when that is a non-blank string;
        otherwise ``None``.
    """
    raw = ws.cell(row=r, column=c).value
    raw_is_text = isinstance(raw, str)
    is_formula = raw_is_text and raw.startswith("=")

    # Plain, non-blank text is used as-is.
    if raw_is_text and not is_formula and raw.strip():
        return raw

    # Formulas, blanks and non-string values can only fall back to the
    # cached-value sheet; without it there is nothing to translate.
    if ws_vals is None:
        return None

    cached = ws_vals.cell(row=r, column=c).value
    if isinstance(cached, str) and cached.strip():
        return cached
    return None

def should_translate(text: str, src_lang: str) -> bool:
    """Decide whether a piece of text is worth translating.

    Ported from the original program.

    Args:
        text: Candidate text; surrounding whitespace is ignored.
        src_lang: Source language code. ``'auto'`` / ``'auto-detect'``
            (case-insensitive) enables the stricter auto-detect rule.

    Returns:
        ``False`` for text shorter than 3 characters or made up only of
        digits, whitespace and ``. - : /``. In auto-detect mode, ``True``
        only if the text contains CJK characters or is longer than
        5 characters. ``True`` in every other case.
    """
    import re

    candidate = text.strip()

    # Too short to carry translatable content.
    if len(candidate) < 3:
        return False

    # Pure numbers, dates, times and similar are skipped.
    if re.match(r'^[\d\s\.\-\:\/]+$', candidate):
        return False

    # An explicit source language needs no further filtering.
    if src_lang.lower() not in ('auto', 'auto-detect'):
        return True

    # Auto-detect: require CJK content or a reasonably long string.
    return has_cjk(candidate) or len(candidate) > 5

def has_cjk(text: str) -> bool:
    """Return True if *text* contains at least one CJK character.

    Ported from the original program, with the Extension B range corrected.

    Recognised blocks: CJK Unified Ideographs, CJK Extension A,
    CJK Extension B, Hiragana, Katakana and Hangul Syllables.

    Args:
        text: String to inspect; an empty string yields ``False``.

    Returns:
        ``True`` as soon as one character falls in any of the blocks above,
        ``False`` otherwise.
    """
    # Ranges are given as code points. Extension B lies outside the Basic
    # Multilingual Plane, so it cannot be written with a 4-digit ``\u``
    # escape: '\u20000' is the two-character string '\u2000' + '0', which
    # made the old comparison match U+2001..U+2A6D (dashes, curly quotes,
    # arrows, math symbols) instead of the intended ideographs.
    cjk_ranges = (
        (0x4E00, 0x9FFF),    # CJK Unified Ideographs
        (0x3400, 0x4DBF),    # CJK Unified Ideographs Extension A
        (0x20000, 0x2A6DF),  # CJK Unified Ideographs Extension B
        (0x3040, 0x309F),    # Hiragana
        (0x30A0, 0x30FF),    # Katakana
        (0xAC00, 0xD7AF),    # Hangul Syllables
    )
    return any(
        low <= ord(char) <= high
        for char in text
        for low, high in cjk_ranges
    )

# Run the debug routine only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    debug_excel_translation_process()