Document_Translator/debug_text_format_mismatch.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Debug the text-format mismatch between Excel-extracted text and the
source text stored in the translation cache.
"""

import sys
import os
# Put this script's own directory at the front of the import path so the
# `app` package imported below is resolved relative to this file.
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))

# Force UTF-8 on stdout; the script prints Chinese text and emoji.
sys.stdout.reconfigure(encoding='utf-8')

from pathlib import Path
from app import create_app

def _print_text_stats(text):
    """Print length, repr, newline presence and line count of *text*."""
    # The newline test is evaluated outside the f-string: a backslash inside
    # an f-string replacement field is a SyntaxError before Python 3.12, and
    # on 3.12+ '\\n' there would mean a literal backslash followed by "n"
    # rather than a newline character.
    has_newline = '\n' in text
    print(f"  長度: {len(text)}")
    print(f"  內容: {repr(text)}")
    print(f"  包含\\n: {has_newline}")
    print(f"  行數: {len(text.split(chr(10)))}")


def _print_first_difference(excel_text, cache_text):
    """Print the first position at which the two strings diverge."""
    for pos, (excel_char, cache_char) in enumerate(zip(excel_text, cache_text)):
        if excel_char != cache_char:
            print(f"  位置{pos}: Excel='{excel_char}' vs 快取='{cache_char}'")
            return
    # No differing character within the common length: one string is a strict
    # prefix of the other, so only the lengths differ.
    common_len = min(len(excel_text), len(cache_text))
    print(f"  前{common_len}個字符相同，長度不同: "
          f"Excel={len(excel_text)} vs 快取={len(cache_text)}")


def debug_text_format_mismatch():
    """Diagnose why Excel-extracted text fails to match cached source text.

    The function prints a four-part report to stdout:

    1. The first segment extracted from the original workbook that contains
       "WB inline" (length, repr, newline presence, line count).
    2. Every row of ``dt_translation_cache`` whose ``source_text`` contains
       both "WB inline" and "Sn/Au", ordered by creation time.
    3. A comparison between the extracted text and cache row 449, both raw
       and after replacing newlines with spaces.
    4. Two cache lookups for target language ``'ko'``: an exact match on the
       extracted text, and a match that ignores line breaks.

    Returns:
        None. All results are printed.
    """

    print("=" * 80)
    print("調試文字格式不匹配問題")
    print("Excel提取 vs 原始快取的文字格式")
    print("=" * 80)

    app = create_app()

    # Cache row that steps 2 and 3 single out as the original DIFY translation.
    original_row_id = 449

    with app.app_context():
        # Imported here, after the application object has been created.
        from sqlalchemy import text as sql_text
        from app import db
        from app.services.translation_service import ExcelParser

        # 1. Inspect the format of the D2 text as extracted from Excel.
        print("1. Excel提取的D2文字格式")
        print("-" * 60)

        original_file = Path(r"C:\Users\EGG\WORK\data\user_scrip\TOOL\Document_translator_V2\uploads\98158984-f335-44f5-a0b4-88fb8ccd5d78") / "original_panjit_98158984.xlsx"

        # Bound up front so the later steps can test it even when the
        # workbook is missing (previously that path raised UnboundLocalError).
        d2_extracted = None

        if original_file.exists():
            parser = ExcelParser(str(original_file))
            segments = parser.extract_text_segments()

            # First segment that contains "WB inline", if any.
            d2_extracted = next(
                (segment for segment in segments if "WB inline" in segment),
                None,
            )

            if d2_extracted:
                print("Excel提取的D2:")
                _print_text_stats(d2_extracted)
            else:
                print("❌ 沒有找到D2相關內容")
        else:
            print(f"❌ 找不到原始檔案: {original_file}")

        # 2. Inspect the format of the D2 text stored in the cache.
        print("\n2. 原始快取中的D2格式")
        print("-" * 60)

        result = db.session.execute(sql_text("""
            SELECT id, source_text, translated_text, target_language, created_at
            FROM dt_translation_cache
            WHERE source_text LIKE '%WB inline%' AND source_text LIKE '%Sn/Au%'
            ORDER BY created_at ASC
        """))

        d2_cache_records = result.fetchall()

        print(f"找到 {len(d2_cache_records)} 筆原始D2快取:")

        # Row columns: 0=id, 1=source_text, 2=translated_text,
        # 3=target_language, 4=created_at (see the SELECT list above).
        for i, record in enumerate(d2_cache_records, 1):
            print(f"\n記錄 {i} (ROW {record[0]}, {record[3]}):")
            _print_text_stats(record[1])
            print(f"  創建時間: {record[4]}")

            # Flag the row treated as the original DIFY translation.
            if record[0] == original_row_id:
                print("  🎯 這是原始DIFY韓文翻譯 (ROW 449)")

        # 3. Compare the two formats.
        print("\n3. 格式差異分析")
        print("-" * 60)

        if d2_extracted and d2_cache_records:
            original_cache = next(
                (r for r in d2_cache_records if r[0] == original_row_id),
                None,
            )

            if original_cache:
                cached_source = original_cache[1]
                excel_has_newline = '\n' in d2_extracted
                cache_has_newline = '\n' in cached_source

                print("Excel提取格式:")
                print(f"  {repr(d2_extracted)}")
                print("\n原始快取格式 (ROW 449):")
                print(f"  {repr(cached_source)}")

                print("\n格式差異:")
                print(f"  長度差異: {len(d2_extracted)} vs {len(cached_source)}")
                print(f"  Excel有\\n: {excel_has_newline}")
                print(f"  快取有\\n: {cache_has_newline}")

                # Normalise both sides (newline -> space, trim) and compare.
                excel_normalized = d2_extracted.replace('\n', ' ').strip()
                cache_normalized = cached_source.replace('\n', ' ').strip()

                print("\n標準化比較:")
                print(f"  Excel標準化: {repr(excel_normalized)}")
                print(f"  快取標準化: {repr(cache_normalized)}")
                print(f"  標準化後相等: {excel_normalized == cache_normalized}")

                # Character-level diff when normalisation did not help.
                if excel_normalized != cache_normalized:
                    print("\n字符級差異分析:")
                    _print_first_difference(excel_normalized, cache_normalized)

        # 4. Try the corrected lookup logic.
        print("\n4. 測試修正查找邏輯")
        print("-" * 60)

        if d2_extracted:
            # Original lookup: exact match on the extracted text.
            result1 = db.session.execute(sql_text("""
                SELECT id, translated_text
                FROM dt_translation_cache
                WHERE source_text = :text AND target_language = 'ko'
                ORDER BY created_at DESC
                LIMIT 1
            """), {'text': d2_extracted})

            row1 = result1.fetchone()
            print(f"原始查找 (精確匹配): {'✅ 找到' if row1 else '❌ 未找到'}")
            if row1:
                print(f"  ROW {row1[0]}: {repr(row1[1][:30])}...")

            # Normalised lookup: ignore line breaks. The Python side mirrors
            # the SQL expression below, which replaces both LF and CR with a
            # space; replacing only LF here would make CR-containing text
            # impossible to match.
            normalized_text = (
                d2_extracted.replace('\n', ' ').replace('\r', ' ').strip()
            )
            result2 = db.session.execute(sql_text("""
                SELECT id, translated_text
                FROM dt_translation_cache
                WHERE REPLACE(REPLACE(source_text, '\n', ' '), '\r', ' ') = :text AND target_language = 'ko'
                ORDER BY created_at DESC
                LIMIT 1
            """), {'text': normalized_text})

            row2 = result2.fetchone()
            print(f"標準化查找 (忽略換行): {'✅ 找到' if row2 else '❌ 未找到'}")
            if row2:
                print(f"  ROW {row2[0]}: {repr(row2[1][:30])}...")

    print("\n" + "=" * 80)
    print("文字格式不匹配調試完成！")
    print("建議: 修改翻譯映射邏輯以容忍換行符差異")
    print("=" * 80)

# Run the diagnostic only when this file is executed as a script.
if __name__ == "__main__":
    debug_text_format_mismatch()