Files
Document_Translator/debug_text_format_mismatch.py
2025-09-03 15:07:34 +08:00

161 lines
6.3 KiB
Python

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Debug the text-format mismatch between text extracted from Excel and the
source text stored in the translation cache.
"""
import sys
import os
# Put this script's own directory first on sys.path; presumably this is what
# lets the `from app import ...` below resolve when run as a script -- confirm.
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
# Force UTF-8 on stdout (the script prints non-ASCII text).
sys.stdout.reconfigure(encoding='utf-8')
from pathlib import Path
from app import create_app
# Workbook inspected by default; pass ``original_file`` to inspect another one.
_DEFAULT_ORIGINAL_FILE = (
    Path(r"C:\Users\EGG\WORK\data\user_scrip\TOOL\Document_translator_V2\uploads\98158984-f335-44f5-a0b4-88fb8ccd5d78")
    / "original_panjit_98158984.xlsx"
)
# Cache row id that the report labels as the original DIFY Korean translation.
_ORIGINAL_DIFY_ROW_ID = 449
# Named constants so no backslash has to appear inside an f-string expression
# (a SyntaxError before Python 3.12) and so a real newline is tested for.
_LF = "\n"
_CR = "\r"


def _normalize_newlines(text):
    """Return *text* with CR/LF replaced by spaces and edge spaces removed.

    Mirrors the SQL expression used by the normalised cache lookup so both
    sides of the comparison are transformed the same way.
    """
    return text.replace(_CR, " ").replace(_LF, " ").strip(" ")


def _print_text_stats(text):
    """Print length, repr, newline presence and line count of *text*."""
    print(f" 長度: {len(text)}")
    print(f" 內容: {repr(text)}")
    print(f" 包含\\n: {_LF in text}")
    print(f" 行數: {len(text.split(_LF))}")


def _print_first_difference(excel_text, cache_text):
    """Print the first position where two unequal strings diverge."""
    for pos, (excel_char, cache_char) in enumerate(zip(excel_text, cache_text)):
        if excel_char != cache_char:
            print(f" 位置{pos}: Excel='{excel_char}' vs 快取='{cache_char}'")
            return
    # No differing character in the common prefix: one string is a prefix of
    # the other, so the only difference is the length.
    common = min(len(excel_text), len(cache_text))
    print(f" 前{common}個字符相同，僅長度不同: Excel={len(excel_text)} vs 快取={len(cache_text)}")


def _preview(text, limit=30):
    """Return the repr of the first *limit* characters; tolerate ``None``."""
    return repr(text if text is None else text[:limit])


def debug_text_format_mismatch(original_file=None, original_row_id=_ORIGINAL_DIFY_ROW_ID):
    """Report how the Excel-extracted D2 text differs from its cached source text.

    The report has four parts: statistics for the segment extracted from the
    workbook, statistics for the matching rows in ``dt_translation_cache``,
    a comparison of the two (raw and newline-normalised), and a trial of an
    exact versus a newline-insensitive cache lookup. Everything is printed to
    stdout; nothing is returned.

    Args:
        original_file: Path of the workbook to inspect. Defaults to the
            upload this script was written for.
        original_row_id: Id of the cache row used as the reference record in
            the comparison. Defaults to 449.
    """
    workbook_path = _DEFAULT_ORIGINAL_FILE if original_file is None else Path(original_file)

    print("=" * 80)
    print("調試文字格式不匹配問題")
    print("Excel提取 vs 原始快取的文字格式")
    print("=" * 80)
    app = create_app()
    with app.app_context():
        from sqlalchemy import text as sql_text
        from app import db
        from app.services.translation_service import ExcelParser

        # 1. Inspect the format of the D2 text as extracted from the workbook.
        print("1. Excel提取的D2文字格式")
        print("-" * 60)
        # Bound up front so the later sections can test it even when the
        # workbook is missing.
        d2_extracted = None
        if workbook_path.exists():
            parser = ExcelParser(str(workbook_path))
            segments = parser.extract_text_segments()
            # First segment containing "WB inline" is taken as the D2 text.
            d2_extracted = next((seg for seg in segments if "WB inline" in seg), None)
            if d2_extracted:
                print("Excel提取的D2:")
                _print_text_stats(d2_extracted)
            else:
                print("❌ 沒有找到D2相關內容")
        else:
            # Previously a missing workbook produced no output at all.
            print(f"❌ 找不到原始檔案: {workbook_path}")

        # 2. Inspect the format of the D2 source text stored in the cache.
        print("\n2. 原始快取中的D2格式")
        print("-" * 60)
        result = db.session.execute(sql_text("""
            SELECT id, source_text, translated_text, target_language, created_at
            FROM dt_translation_cache
            WHERE source_text LIKE '%WB inline%' AND source_text LIKE '%Sn/Au%'
            ORDER BY created_at ASC
        """))
        d2_cache_records = result.fetchall()
        print(f"找到 {len(d2_cache_records)} 筆原始D2快取:")
        for i, record in enumerate(d2_cache_records, 1):
            print(f"\n記錄 {i} (ROW {record[0]}, {record[3]}):")
            _print_text_stats(record[1])
            print(f" 創建時間: {record[4]}")
            # Flag the row used as the reference translation.
            if record[0] == original_row_id:
                print(f" 🎯 這是原始DIFY韓文翻譯 (ROW {original_row_id})")

        # 3. Compare the two formats.
        print("\n3. 格式差異分析")
        print("-" * 60)
        if d2_extracted and d2_cache_records:
            original_cache = next(
                (r for r in d2_cache_records if r[0] == original_row_id), None
            )
            if original_cache:
                cached_text = original_cache[1]
                print("Excel提取格式:")
                print(f" {repr(d2_extracted)}")
                print(f"\n原始快取格式 (ROW {original_row_id}):")
                print(f" {repr(cached_text)}")
                print("\n格式差異:")
                print(f" 長度差異: {len(d2_extracted)} vs {len(cached_text)}")
                print(f" Excel有\\n: {_LF in d2_extracted}")
                print(f" 快取有\\n: {_LF in cached_text}")
                # Compare again after collapsing line breaks on both sides.
                excel_normalized = _normalize_newlines(d2_extracted)
                cache_normalized = _normalize_newlines(cached_text)
                print("\n標準化比較:")
                print(f" Excel標準化: {repr(excel_normalized)}")
                print(f" 快取標準化: {repr(cache_normalized)}")
                print(f" 標準化後相等: {excel_normalized == cache_normalized}")
                # Character-level difference, if any remains.
                if excel_normalized != cache_normalized:
                    print("\n字符級差異分析:")
                    _print_first_difference(excel_normalized, cache_normalized)

        # 4. Try the corrected lookup logic against the cache.
        print("\n4. 測試修正查找邏輯")
        print("-" * 60)
        if d2_extracted:
            # Exact-match lookup.
            exact_row = db.session.execute(sql_text("""
                SELECT id, translated_text
                FROM dt_translation_cache
                WHERE source_text = :text AND target_language = 'ko'
                ORDER BY created_at DESC
                LIMIT 1
            """), {'text': d2_extracted}).fetchone()
            print(f"原始查找 (精確匹配): {'✅ 找到' if exact_row else '❌ 未找到'}")
            if exact_row:
                print(f" ROW {exact_row[0]}: {_preview(exact_row[1])}...")

            # Newline-insensitive lookup. CR/LF are passed as bind parameters
            # and the column is trimmed so the SQL side applies the same
            # transformation as _normalize_newlines() does in Python.
            normalized_row = db.session.execute(sql_text("""
                SELECT id, translated_text
                FROM dt_translation_cache
                WHERE TRIM(REPLACE(REPLACE(source_text, :lf, ' '), :cr, ' ')) = :text
                  AND target_language = 'ko'
                ORDER BY created_at DESC
                LIMIT 1
            """), {
                'text': _normalize_newlines(d2_extracted),
                'lf': _LF,
                'cr': _CR,
            }).fetchone()
            print(f"標準化查找 (忽略換行): {'✅ 找到' if normalized_row else '❌ 未找到'}")
            if normalized_row:
                print(f" ROW {normalized_row[0]}: {_preview(normalized_row[1])}...")

    print("\n" + "=" * 80)
    print("文字格式不匹配調試完成!")
    print("建議: 修改翻譯映射邏輯以容忍換行符差異")
    print("=" * 80)
# Script entry point: run the diagnostic only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    debug_text_format_mismatch()