161 lines
6.3 KiB
Python
161 lines
6.3 KiB
Python
#!/usr/bin/env python3
|
|
# -*- coding: utf-8 -*-
|
|
"""
|
|
Debug text-format mismatches between Excel-extracted text and the translation cache.
|
|
"""
|
|
|
|
import sys
|
|
import os
|
|
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
|
|
|
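# Force stdout to UTF-8 so the non-ASCII output below can be written regardless of the console's default encoding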
|
|
sys.stdout.reconfigure(encoding='utf-8')
|
|
|
|
from pathlib import Path
|
|
from app import create_app
|
|
|
|
def _print_text_stats(value):
    """Print the length, repr, newline presence and line count of *value*."""
    # Both values are computed outside the f-strings: a backslash inside an
    # f-string replacement field is a SyntaxError before Python 3.12, and the
    # escaped form '\\n' would test for a literal backslash + "n" rather than
    # for an actual newline character.
    has_newline = "\n" in value
    line_count = len(value.split("\n"))
    print(f" 長度: {len(value)}")
    print(f" 內容: {repr(value)}")
    print(f" 包含\\n: {has_newline}")
    print(f" 行數: {line_count}")


def _first_difference(left, right):
    """Return the first index at which *left* and *right* differ.

    Returns None when the strings are equal. When one string is a strict
    prefix of the other, the returned index is the length of the shorter one.
    """
    for index, (left_char, right_char) in enumerate(zip(left, right)):
        if left_char != right_char:
            return index
    if len(left) != len(right):
        return min(len(left), len(right))
    return None


def _print_lookup_result(label, row):
    """Print whether a cache lookup returned a row, plus a short preview."""
    print(f"{label}: {'✅ 找到' if row is not None else '❌ 未找到'}")
    if row is not None:
        # translated_text may be NULL in the database; avoid slicing None.
        preview = (row[1] or "")[:30]
        print(f" ROW {row[0]}: {repr(preview)}...")


def debug_text_format_mismatch():
    """Compare the text extracted from an Excel file with cached source text.

    The function runs four diagnostic steps inside a Flask application
    context and prints the findings to stdout:

    1. Extract the text segments of a hard-coded Excel file and pick the
       first one containing "WB inline".
    2. List the rows of ``dt_translation_cache`` whose ``source_text``
       contains both "WB inline" and "Sn/Au".
    3. Compare the extracted segment with the cache row whose id is 449,
       both raw and after replacing newlines with spaces.
    4. Look the extracted segment up in the cache for target language
       ``'ko'``, once by exact match and once ignoring line breaks.

    Returns:
        None. All results are printed.
    """
    # Cache row id that the output labels as the original DIFY Korean
    # translation.
    reference_row_id = 449

    print("=" * 80)
    print("調試文字格式不匹配問題")
    print("Excel提取 vs 原始快取的文字格式")
    print("=" * 80)

    app = create_app()

    with app.app_context():
        from sqlalchemy import text as sql_text
        from app import db
        from app.services.translation_service import ExcelParser

        # 1. Inspect the format of the text extracted from the Excel file.
        print("1. Excel提取的D2文字格式")
        print("-" * 60)

        original_file = Path(r"C:\Users\EGG\WORK\data\user_scrip\TOOL\Document_translator_V2\uploads\98158984-f335-44f5-a0b4-88fb8ccd5d78") / "original_panjit_98158984.xlsx"

        # Initialised up front so the later steps can test it even when the
        # Excel file is missing (otherwise they would raise NameError).
        d2_extracted = None

        if original_file.exists():
            parser = ExcelParser(str(original_file))
            segments = parser.extract_text_segments()

            # Take the first segment that contains "WB inline".
            d2_extracted = next(
                (segment for segment in segments if "WB inline" in segment),
                None,
            )

            if d2_extracted:
                print("Excel提取的D2:")
                _print_text_stats(d2_extracted)
            else:
                print("❌ 沒有找到D2相關內容")
        else:
            print(f"❌ 找不到檔案: {original_file}")

        # 2. Inspect the format of the matching rows in the translation cache.
        print("\n2. 原始快取中的D2格式")
        print("-" * 60)

        result = db.session.execute(sql_text("""
            SELECT id, source_text, translated_text, target_language, created_at
            FROM dt_translation_cache
            WHERE source_text LIKE '%WB inline%' AND source_text LIKE '%Sn/Au%'
            ORDER BY created_at ASC
        """))

        d2_cache_records = result.fetchall()

        print(f"找到 {len(d2_cache_records)} 筆原始D2快取:")

        for i, record in enumerate(d2_cache_records, 1):
            print(f"\n記錄 {i} (ROW {record[0]}, {record[3]}):")
            _print_text_stats(record[1])
            print(f" 創建時間: {record[4]}")

            # Flag the row treated as the original DIFY translation.
            if record[0] == reference_row_id:
                print(f" 🎯 這是原始DIFY韓文翻譯 (ROW {reference_row_id})")

        # 3. Compare the two formats.
        print("\n3. 格式差異分析")
        print("-" * 60)

        if d2_extracted and d2_cache_records:
            original_cache = next(
                (r for r in d2_cache_records if r[0] == reference_row_id),
                None,
            )

            if original_cache:
                cached_text = original_cache[1]
                excel_has_newline = "\n" in d2_extracted
                cache_has_newline = "\n" in cached_text

                print("Excel提取格式:")
                print(f" {repr(d2_extracted)}")
                print(f"\n原始快取格式 (ROW {reference_row_id}):")
                print(f" {repr(cached_text)}")

                print("\n格式差異:")
                print(f" 長度差異: {len(d2_extracted)} vs {len(cached_text)}")
                print(f" Excel有\\n: {excel_has_newline}")
                print(f" 快取有\\n: {cache_has_newline}")

                # Normalise both sides (newline -> space, trimmed) and compare.
                excel_normalized = d2_extracted.replace('\n', ' ').strip()
                cache_normalized = cached_text.replace('\n', ' ').strip()

                print("\n標準化比較:")
                print(f" Excel標準化: {repr(excel_normalized)}")
                print(f" 快取標準化: {repr(cache_normalized)}")
                print(f" 標準化後相等: {excel_normalized == cache_normalized}")

                # Character-level diff: report the first differing position,
                # including the case where one string is a prefix of the other.
                diff_at = _first_difference(excel_normalized, cache_normalized)
                if diff_at is not None:
                    print("\n字符級差異分析:")
                    excel_char = (
                        excel_normalized[diff_at]
                        if diff_at < len(excel_normalized)
                        else "<END>"
                    )
                    cache_char = (
                        cache_normalized[diff_at]
                        if diff_at < len(cache_normalized)
                        else "<END>"
                    )
                    print(f" 位置{diff_at}: Excel='{excel_char}' vs 快取='{cache_char}'")

        # 4. Try the corrected lookup logic.
        print("\n4. 測試修正查找邏輯")
        print("-" * 60)

        if d2_extracted:
            # Original lookup: exact match on source_text.
            result1 = db.session.execute(sql_text("""
                SELECT id, translated_text
                FROM dt_translation_cache
                WHERE source_text = :text AND target_language = 'ko'
                ORDER BY created_at DESC
                LIMIT 1
            """), {'text': d2_extracted})

            _print_lookup_result("原始查找 (精確匹配)", result1.fetchone())

            # Normalised lookup: compare with line breaks replaced by spaces.
            # NOTE(review): the Python side replaces only '\n' and strips the
            # result, while the SQL side replaces both '\n' and '\r' and does
            # not trim - confirm whether the two should be aligned.
            normalized_text = d2_extracted.replace('\n', ' ').strip()
            # This is a regular (non-raw) Python string, so '\n' and '\r'
            # below reach the database as the actual control characters.
            result2 = db.session.execute(sql_text("""
                SELECT id, translated_text
                FROM dt_translation_cache
                WHERE REPLACE(REPLACE(source_text, '\n', ' '), '\r', ' ') = :text AND target_language = 'ko'
                ORDER BY created_at DESC
                LIMIT 1
            """), {'text': normalized_text})

            _print_lookup_result("標準化查找 (忽略換行)", result2.fetchone())

    print("\n" + "=" * 80)
    print("文字格式不匹配調試完成!")
    print("建議: 修改翻譯映射邏輯以容忍換行符差異")
    print("=" * 80)
|
|
|
# Run the diagnostic only when this file is executed as a script, so that
# importing the module has no side effects beyond the setup above.
if __name__ == "__main__":
    debug_text_format_mismatch()