#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Debug a text-format mismatch between Excel extraction and the translation cache.

The script extracts the "D2" cell text (the segment containing "WB inline")
from a fixed workbook, lists the matching rows of ``dt_translation_cache``,
compares both representations, and finally tries an exact and a
newline-insensitive cache lookup.
"""

import sys
import os

# Make the project package importable when the script is run directly.
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))

# Set the output encoding; the messages below contain CJK text and emoji.
# Guarded because a replaced stdout object may not offer reconfigure().
if hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(encoding='utf-8')

from pathlib import Path

from app import create_app

# Workbook whose extracted text is compared against the cache.
ORIGINAL_FILE = (
    Path(r"C:\Users\EGG\WORK\data\user_scrip\TOOL\Document_translator_V2\uploads\98158984-f335-44f5-a0b4-88fb8ccd5d78")
    / "original_panjit_98158984.xlsx"
)

# Cache row id that the script treats as the original DIFY Korean translation.
ORIGINAL_DIFY_ROW_ID = 449


def _normalize_newlines(text):
    """Return *text* with CR and LF replaced by spaces and both ends stripped.

    Mirrors the ``REPLACE(REPLACE(source_text, '\\n', ' '), '\\r', ' ')``
    expression used by the normalized SQL lookup so both sides are
    transformed the same way.
    """
    return text.replace('\n', ' ').replace('\r', ' ').strip()


def _first_difference(left, right):
    """Return the first index where *left* and *right* differ.

    Only the common prefix is inspected; ``None`` means the shorter string is
    a prefix of the longer one (or the strings are equal).
    """
    return next(
        (idx for idx, (a, b) in enumerate(zip(left, right)) if a != b),
        None,
    )


def _print_text_stats(text):
    """Print length, repr, newline presence and line count of *text*."""
    # Computed outside the f-string: a backslash inside an f-string
    # expression is a syntax error before Python 3.12, and the check must
    # look for a real newline rather than a literal backslash + "n".
    has_newline = "\n" in text
    line_count = len(text.split("\n"))
    print(f" 長度: {len(text)}")
    print(f" 內容: {text!r}")
    print(f" 包含\\n: {has_newline}")
    print(f" 行數: {line_count}")


def debug_text_format_mismatch():
    """Print a four-step report on the Excel-vs-cache text format mismatch.

    Steps: (1) show the text extracted from the workbook, (2) list the
    matching cache rows, (3) compare the extracted text with the reference
    cache row, (4) try an exact and a newline-insensitive cache lookup.
    Everything is written to stdout; nothing is returned.
    """
    print("=" * 80)
    print("調試文字格式不匹配問題")
    print("Excel提取 vs 原始快取的文字格式")
    print("=" * 80)

    app = create_app()

    with app.app_context():
        from sqlalchemy import text as sql_text
        from app import db
        from app.services.translation_service import ExcelParser

        # 1. Inspect the format of the D2 text extracted from Excel.
        print("1. Excel提取的D2文字格式")
        print("-" * 60)

        # Initialised up front so the later sections can test it even when
        # the workbook is missing.
        d2_extracted = None

        if ORIGINAL_FILE.exists():
            parser = ExcelParser(str(ORIGINAL_FILE))
            segments = parser.extract_text_segments()

            # Pick the first segment that contains "WB inline".
            d2_extracted = next(
                (segment for segment in segments if "WB inline" in segment),
                None,
            )

            if d2_extracted:
                print("Excel提取的D2:")
                _print_text_stats(d2_extracted)
            else:
                print("❌ 沒有找到D2相關內容")
        else:
            print(f"❌ 找不到原始檔案: {ORIGINAL_FILE}")

        # 2. Inspect the format of the D2 text stored in the cache.
        print("\n2. 原始快取中的D2格式")
        print("-" * 60)

        result = db.session.execute(sql_text("""
            SELECT id, source_text, translated_text, target_language, created_at
            FROM dt_translation_cache
            WHERE source_text LIKE '%WB inline%' AND source_text LIKE '%Sn/Au%'
            ORDER BY created_at ASC
        """))

        d2_cache_records = result.fetchall()

        print(f"找到 {len(d2_cache_records)} 筆原始D2快取:")

        for i, record in enumerate(d2_cache_records, 1):
            print(f"\n記錄 {i} (ROW {record[0]}, {record[3]}):")
            _print_text_stats(record[1])
            print(f" 創建時間: {record[4]}")

            # Flag the row treated as the original DIFY translation.
            if record[0] == ORIGINAL_DIFY_ROW_ID:
                print(f" 🎯 這是原始DIFY韓文翻譯 (ROW {ORIGINAL_DIFY_ROW_ID})")

        # 3. Compare the two formats.
        print("\n3. 格式差異分析")
        print("-" * 60)

        if d2_extracted and d2_cache_records:
            original_cache = next(
                (r for r in d2_cache_records if r[0] == ORIGINAL_DIFY_ROW_ID),
                None,
            )

            if original_cache:
                cached_text = original_cache[1]
                excel_has_newline = "\n" in d2_extracted
                cache_has_newline = "\n" in cached_text

                print("Excel提取格式:")
                print(f" {d2_extracted!r}")
                print(f"\n原始快取格式 (ROW {ORIGINAL_DIFY_ROW_ID}):")
                print(f" {cached_text!r}")

                print("\n格式差異:")
                print(f" 長度差異: {len(d2_extracted)} vs {len(cached_text)}")
                print(f" Excel有\\n: {excel_has_newline}")
                print(f" 快取有\\n: {cache_has_newline}")

                # Compare again after normalising line breaks on both sides.
                excel_normalized = _normalize_newlines(d2_extracted)
                cache_normalized = _normalize_newlines(cached_text)

                print("\n標準化比較:")
                print(f" Excel標準化: {excel_normalized!r}")
                print(f" 快取標準化: {cache_normalized!r}")
                print(f" 標準化後相等: {excel_normalized == cache_normalized}")

                # Character-level analysis of the first difference.
                if excel_normalized != cache_normalized:
                    print("\n字符級差異分析:")
                    pos = _first_difference(excel_normalized, cache_normalized)
                    if pos is not None:
                        print(f" 位置{pos}: Excel='{excel_normalized[pos]}' vs 快取='{cache_normalized[pos]}'")
                    else:
                        # One string is a strict prefix of the other.
                        common = min(len(excel_normalized), len(cache_normalized))
                        print(
                            f" 前{common}個字符相同，僅長度不同: "
                            f"Excel={len(excel_normalized)} vs 快取={len(cache_normalized)}"
                        )

        # 4. Try the corrected lookup logic.
        print("\n4. 測試修正查找邏輯")
        print("-" * 60)

        if d2_extracted:
            # Original lookup: exact match on the extracted text.
            result1 = db.session.execute(sql_text("""
                SELECT id, translated_text
                FROM dt_translation_cache
                WHERE source_text = :text AND target_language = 'ko'
                ORDER BY created_at DESC
                LIMIT 1
            """), {'text': d2_extracted})

            row1 = result1.fetchone()
            print(f"原始查找 (精確匹配): {'✅ 找到' if row1 else '❌ 未找到'}")
            if row1:
                print(f" ROW {row1[0]}: {row1[1][:30]!r}...")

            # Normalized lookup: ignore line breaks on both sides. Python
            # turns the '\n' / '\r' escapes below into real control
            # characters before the statement is sent.
            normalized_text = _normalize_newlines(d2_extracted)
            result2 = db.session.execute(sql_text("""
                SELECT id, translated_text
                FROM dt_translation_cache
                WHERE REPLACE(REPLACE(source_text, '\n', ' '), '\r', ' ') = :text
                  AND target_language = 'ko'
                ORDER BY created_at DESC
                LIMIT 1
            """), {'text': normalized_text})

            row2 = result2.fetchone()
            print(f"標準化查找 (忽略換行): {'✅ 找到' if row2 else '❌ 未找到'}")
            if row2:
                print(f" ROW {row2[0]}: {row2[1][:30]!r}...")

    print("\n" + "=" * 80)
    print("文字格式不匹配調試完成!")
    print("建議: 修改翻譯映射邏輯以容忍換行符差異")
    print("=" * 80)


if __name__ == "__main__":
    debug_text_format_mismatch()