4th_fix time error
This commit is contained in:
260
test_final_docx_fix.py
Normal file
260
test_final_docx_fix.py
Normal file
@@ -0,0 +1,260 @@
|
||||
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Final DOCX translation-fix verification - tests the paragraph re-matching fix.
"""

import sys
import os
import tempfile
import shutil
from pathlib import Path

# Force UTF-8 console output on Windows, where the default code page
# (e.g. cp950) cannot encode the emoji/CJK characters printed below.
# Fixes over the original:
#  - encoding comparison is case-insensitive (streams report 'UTF-8', 'cp950', ...);
#  - reconfigure() is guarded, since redirected/wrapped streams (or a None
#    stream under pythonw) may not provide it;
#  - stdout/stderr are handled by one loop instead of duplicated code.
for _stream in (sys.stdout, sys.stderr):
    _enc = getattr(_stream, "encoding", None) or ""
    if _enc.lower() != "utf-8" and hasattr(_stream, "reconfigure"):
        _stream.reconfigure(encoding="utf-8")

# Make the local 'app' package importable when running this script directly.
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'app'))
|
||||
|
||||
from app import create_app, db
|
||||
from app.services.translation_service import DocxParser
|
||||
from sqlalchemy import text as sql_text
|
||||
|
||||
def _has_chinese(text):
    """True if *text* contains at least one CJK unified ideograph."""
    return any('\u4e00' <= c <= '\u9fff' for c in text)


def _has_english(text):
    """True if *text* contains an ASCII letter outside 'PANJIT'.

    The letters of the company name 'PANJIT' appear even in otherwise pure
    Chinese paragraphs, so they are excluded from English detection.
    """
    return any(ord(c) < 128 and c.isalpha() and c not in 'PANJIT' for c in text)


def _report_cache_coverage(segments, lang):
    """Print translation-cache coverage (translated/total segments) for *lang*."""
    translated_count = 0
    total_count = 0

    for seg in segments:
        total_count += 1
        # Latest cache entry wins (ORDER BY created_at DESC LIMIT 1).
        result = db.session.execute(sql_text("""
            SELECT translated_text
            FROM dt_translation_cache
            WHERE source_text = :text AND target_language = :lang
            ORDER BY created_at DESC
            LIMIT 1
        """), {'text': seg.text, 'lang': lang})

        row = result.fetchone()
        if row and row[0]:
            translated_count += 1

    coverage = (translated_count / total_count * 100) if total_count > 0 else 0
    print(f"  {lang.upper()}翻譯覆蓋率: {coverage:.1f}% ({translated_count}/{total_count})")


def _analyze_english_doc(en_output_path):
    """Analyse the generated English document.

    Counts paragraph languages, translation markers (zero-width spaces in
    runs) and interleaved Chinese→English translation pairs, prints a
    detailed report and returns ``(success_rate, translation_pairs)``.
    """
    from docx import Document  # deferred: python-docx only needed here

    output_doc = Document(en_output_path)
    paragraphs = [p for p in output_doc.paragraphs if p.text.strip()]

    print(f"\n📄 英文翻譯文檔分析:")
    print(f"總段落數: {len(paragraphs)}")

    # Per-language paragraph statistics.
    chinese_paras = 0
    english_paras = 0
    mixed_paras = 0
    marker_paras = 0

    # Interleaved-format statistics.
    translation_pairs = 0
    consecutive_pairs = []

    for i, para in enumerate(paragraphs[:50]):  # inspect the first 50 paragraphs
        text = para.text.strip()

        has_chinese = _has_chinese(text)
        has_english = _has_english(text)
        # A zero-width space in any run marks a translation-inserted paragraph.
        has_marker = any('\u200b' in (r.text or '') for r in para.runs)

        if has_marker:
            marker_paras += 1

        if has_chinese and has_english:
            mixed_paras += 1
            lang_status = "🔄 中英混合"
        elif has_english:
            english_paras += 1
            lang_status = "🇺🇸 純英文"
        elif has_chinese:
            chinese_paras += 1
            lang_status = "🇨🇳 純中文"
        else:
            lang_status = "❓ 其他"

        # A pure-Chinese paragraph immediately followed by a pure-English
        # one counts as one interleaved translation pair.
        if i < len(paragraphs) - 1:
            next_text = paragraphs[i + 1].text.strip()
            next_has_chinese = _has_chinese(next_text)
            next_has_english = _has_english(next_text)

            if (has_chinese and not has_english and
                    next_has_english and not next_has_chinese):
                translation_pairs += 1
                if len(consecutive_pairs) < 5:  # keep the first 5 examples
                    consecutive_pairs.append({
                        'index': i,
                        'chinese': text[:60],
                        'english': next_text[:60]
                    })

        if i < 20:  # show details for the first 20 paragraphs only
            marker_status = " 🏷️" if has_marker else ""
            print(f"  段落 {i+1:2d}: {lang_status}{marker_status} - {text[:70]}...")

    print(f"\n📊 語言統計:")
    print(f"  純中文段落: {chinese_paras}")
    print(f"  純英文段落: {english_paras}")
    print(f"  中英混合段落: {mixed_paras}")
    print(f"  帶翻譯標記段落: {marker_paras}")
    print(f"  發現交錯翻譯對: {translation_pairs}")

    if consecutive_pairs:
        print(f"\n🔍 翻譯對示例:")
        for pair in consecutive_pairs:
            print(f"  對 {pair['index']//2 + 1}:")
            print(f"    中文: {pair['chinese']}...")
            print(f"    英文: {pair['english']}...")

    # Every pure-Chinese paragraph is expected to have a translated partner.
    total_expected_pairs = chinese_paras
    success_rate = (translation_pairs / total_expected_pairs * 100) if total_expected_pairs > 0 else 0

    print(f"\n🎯 翻譯效果評估:")
    print(f"  預期翻譯對: {total_expected_pairs}")
    print(f"  實際翻譯對: {translation_pairs}")
    print(f"  翻譯成功率: {success_rate:.1f}%")

    if success_rate >= 80:
        print(f"  ✅ 翻譯效果優秀!")
    elif success_rate >= 50:
        print(f"  ⚠️ 翻譯效果良好,但仍有改進空間")
    elif translation_pairs > 0:
        print(f"  🔍 翻譯部分成功,需要檢查具體問題")
    else:
        print(f"  ❌ 翻譯失敗,需要深入調試")

    return success_rate, translation_pairs


def _check_vietnamese_doc(vi_output_path):
    """Quick check of the Vietnamese document: count Chinese→Vietnamese pairs."""
    # Import locally so this check does not depend on the English-analysis
    # path having run (the original relied on its `from docx import Document`).
    from docx import Document

    vi_doc = Document(vi_output_path)
    vi_paragraphs = [p for p in vi_doc.paragraphs if p.text.strip()]

    vi_pairs = 0
    for i in range(len(vi_paragraphs) - 1):
        text = vi_paragraphs[i].text.strip()
        next_text = vi_paragraphs[i + 1].text.strip()

        # \u00C0-\u1EF9 (Latin-1 Supplement .. Latin Extended Additional)
        # covers Vietnamese diacritics.
        has_vietnamese = any('\u00C0' <= c <= '\u1EF9' for c in next_text)
        if _has_chinese(text) and has_vietnamese:
            vi_pairs += 1

    print(f"  越南文翻譯對: {vi_pairs}")


def test_final_docx_fix():
    """Final DOCX translation-fix verification (paragraph re-matching fix).

    Copies a known fixture document into a clean temp directory, reports
    translation-cache coverage for English and Vietnamese, regenerates both
    translated documents from the cache, and evaluates the interleaved
    (Chinese paragraph followed by translation paragraph) output format.

    Returns:
        bool: True when the measured translation success rate is >= 80%,
        False otherwise (including on any setup or generation failure).
        The original returned ``None`` from one failure path; the API is
        now consistently boolean.
    """
    app = create_app()

    with app.app_context():
        print("=== 最終DOCX翻譯修復驗證 ===")

        # Fixed fixture from a real upload; verify it exists before copying
        # so a missing file fails fast with a clear message.
        original_path = r"C:\Users\EGG\WORK\data\user_scrip\TOOL\Document_translator_V2\uploads\9c6548ac-2f59-45f4-aade-0a9b3895bbfd\original_-OR026_9c6548ac.docx"
        if not os.path.exists(original_path):
            print(f"❌ 找不到原始文件: {original_path}")
            return False

        # Build a pristine scratch directory so earlier runs cannot interfere.
        test_dir = Path(tempfile.gettempdir()) / "final_docx_test"
        if test_dir.exists():
            shutil.rmtree(test_dir)
        test_dir.mkdir(exist_ok=True)

        clean_input_path = test_dir / "clean_input.docx"
        shutil.copy2(original_path, clean_input_path)
        print(f"✅ 創建全新測試副本: {clean_input_path}")

        # --- translation-cache coverage --------------------------------
        try:
            parser = DocxParser(str(clean_input_path))
            segments = parser.processor.extract_docx_segments(str(clean_input_path))

            print(f"\n📊 翻譯快取檢查:")
            print(f"文檔段落數: {len(segments)}")

            for lang in ('en', 'vi'):
                _report_cache_coverage(segments, lang)
        except Exception as e:
            print(f"❌ 翻譯快取檢查失敗: {e}")
            return False  # was a bare `return` (None); keep a boolean API

        # Initialized up front so the final verdict needs no fragile
        # `'name' in locals()` checks (as the original used).
        success_rate = 0.0
        translation_pairs = 0

        # --- English document ------------------------------------------
        print(f"\n🔄 生成英文翻譯文檔...")
        try:
            # An empty mapping makes the generator read from the cache.
            en_output_path = parser.generate_translated_document({}, 'en', test_dir)
            print(f"✅ 英文翻譯文檔生成: {en_output_path}")

            try:
                success_rate, translation_pairs = _analyze_english_doc(en_output_path)
            except Exception as e:
                print(f"❌ 分析英文翻譯文檔失敗: {e}")
        except Exception as e:
            print(f"❌ 生成英文翻譯文檔失敗: {e}")

        # --- Vietnamese document ---------------------------------------
        print(f"\n🔄 生成越南文翻譯文檔...")
        try:
            vi_output_path = parser.generate_translated_document({}, 'vi', test_dir)
            print(f"✅ 越南文翻譯文檔生成: {vi_output_path}")

            try:
                _check_vietnamese_doc(vi_output_path)
            except Exception as e:
                print(f"  越南文文檔檢查失敗: {e}")
        except Exception as e:
            print(f"❌ 生成越南文翻譯文檔失敗: {e}")

        # --- final verdict ---------------------------------------------
        print(f"\n" + "="*60)
        print(f"🎯 DOCX翻譯修復最終驗證結果:")

        if success_rate >= 80:
            print(f"✅ 修復成功!DOCX翻譯功能已完美解決")
            print(f"  - 翻譯成功率: {success_rate:.1f}%")
            print(f"  - 交錯格式正確: {translation_pairs} 個翻譯對")
            print(f"  - 文檔實例匹配問題已解決")
            return True

        if translation_pairs > 0:
            print(f"⚠️ 修復部分成功,需要進一步調整")
            print(f"  - 翻譯成功率: {success_rate:.1f}% (目標: ≥80%)")
            print(f"  - 實際翻譯對: {translation_pairs}")
            return False

        print(f"❌ 修復尚未完全成功,需要繼續調試")
        print(f"  - 沒有發現有效的翻譯內容")
        return False
|
||||
|
||||
if __name__ == "__main__":
    # Run the verification and propagate the outcome through the process
    # exit code so CI / calling scripts can detect failure (the original
    # always exited 0, even when the verification failed).
    success = test_final_docx_fix()
    if success:
        print(f"\n🎉 DOCX翻譯問題已完美解決!")
    else:
        print(f"\n🔧 需要繼續修復調試...")
    sys.exit(0 if success else 1)
|
Reference in New Issue
Block a user