Files
Document_Translator/test_final_docx_fix.py
2025-09-03 09:05:51 +08:00

260 lines
11 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
最終DOCX翻譯修復驗證 - 測試段落重新匹配修復
"""
import sys
import os
import tempfile
import shutil
from pathlib import Path
# Force UTF-8 on the standard streams so the CJK/emoji output printed by this
# script does not fail on consoles whose default encoding is not UTF-8
# (e.g. the Windows console).
for _stream in (sys.stdout, sys.stderr):
    # A stream can be None (no console attached) or replaced by an object
    # without ``reconfigure`` (e.g. io.StringIO when output is redirected);
    # skip those instead of raising AttributeError at import time.
    _reconfigure = getattr(_stream, 'reconfigure', None)
    _encoding = (getattr(_stream, 'encoding', None) or '').lower()
    if _reconfigure is not None and _encoding not in ('utf-8', 'utf8'):
        _reconfigure(encoding='utf-8')

# Put the ``app`` directory that sits next to this file at the front of the
# module search path before the project imports below run.
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'app'))
from app import create_app, db
from app.services.translation_service import DocxParser
from sqlalchemy import text as sql_text
def _has_chinese(text):
    """Return True if *text* contains a character in U+4E00..U+9FFF."""
    return any('\u4e00' <= ch <= '\u9fff' for ch in text)


def _has_english(text):
    """Return True if *text* has an ASCII letter other than P, A, N, J, I, T.

    NOTE(review): the uppercase letters of "PANJIT" are excluded one by one,
    presumably so that this name embedded in Chinese text is not counted as
    English -- confirm. The same letters are ignored wherever they appear.
    """
    return any(
        ord(ch) < 128 and ch.isalpha() and ch not in 'PANJIT' for ch in text
    )


def _analyze_english_document(doc_path):
    """Print a language breakdown of the generated English document.

    Only the first 50 non-empty paragraphs are classified. A "translation
    pair" is a Chinese-only paragraph immediately followed by an English-only
    paragraph.

    Args:
        doc_path: Path of the .docx file to open.

    Returns:
        tuple: ``(translation_pairs, success_rate)`` where ``success_rate`` is
        the percentage of Chinese-only paragraphs that start a pair (0 when
        there are no Chinese-only paragraphs).

    Raises:
        Exception: whatever opening or reading the document raises; the
        caller is expected to report it.
    """
    # Local import so that a missing/broken python-docx is reported by the
    # caller's error handling instead of breaking module import.
    from docx import Document

    document = Document(doc_path)
    paragraphs = [p for p in document.paragraphs if p.text.strip()]
    print("\n📄 英文翻譯文檔分析:")
    print(f"總段落數: {len(paragraphs)}")

    # Language statistics.
    chinese_paras = 0
    english_paras = 0
    mixed_paras = 0
    marker_paras = 0
    # Interleaved-format statistics.
    translation_pairs = 0
    sample_pairs = []

    for i, para in enumerate(paragraphs[:50]):  # inspect the first 50 only
        text = para.text.strip()
        has_chinese = _has_chinese(text)
        has_english = _has_english(text)
        # A zero-width space (U+200B) in any run is counted as a marker.
        has_marker = any('\u200b' in (run.text or '') for run in para.runs)
        if has_marker:
            marker_paras += 1

        if has_chinese and has_english:
            mixed_paras += 1
            lang_status = "🔄 中英混合"
        elif has_english:
            english_paras += 1
            lang_status = "🇺🇸 純英文"
        elif has_chinese:
            chinese_paras += 1
            lang_status = "🇨🇳 純中文"
        else:
            lang_status = "❓ 其他"

        # Look at the following paragraph (taken from the full list, so the
        # 50th paragraph can still pair with the 51st).
        if i < len(paragraphs) - 1:
            next_text = paragraphs[i + 1].text.strip()
            is_pair = (
                has_chinese
                and not has_english
                and _has_english(next_text)
                and not _has_chinese(next_text)
            )
            if is_pair:
                translation_pairs += 1
                if len(sample_pairs) < 5:  # keep the first 5 as examples
                    sample_pairs.append({
                        'index': i,
                        'chinese': text[:60],
                        'english': next_text[:60],
                    })

        if i < 20:  # per-paragraph detail for the first 20 only
            marker_status = " 🏷️" if has_marker else ""
            print(f" 段落 {i+1:2d}: {lang_status}{marker_status} - {text[:70]}...")

    print("\n📊 語言統計:")
    print(f" 純中文段落: {chinese_paras}")
    print(f" 純英文段落: {english_paras}")
    print(f" 中英混合段落: {mixed_paras}")
    print(f" 帶翻譯標記段落: {marker_paras}")
    print(f" 發現交錯翻譯對: {translation_pairs}")

    if sample_pairs:
        print("\n🔍 翻譯對示例:")
        for pair in sample_pairs:
            print(f"{pair['index']//2 + 1}:")
            print(f" 中文: {pair['chinese']}...")
            print(f" 英文: {pair['english']}...")

    # Every Chinese-only paragraph is expected to be followed by a translation.
    total_expected_pairs = chinese_paras
    if total_expected_pairs > 0:
        success_rate = translation_pairs / total_expected_pairs * 100
    else:
        success_rate = 0
    print("\n🎯 翻譯效果評估:")
    print(f" 預期翻譯對: {total_expected_pairs}")
    print(f" 實際翻譯對: {translation_pairs}")
    print(f" 翻譯成功率: {success_rate:.1f}%")
    if success_rate >= 80:
        print(" ✅ 翻譯效果優秀!")
    elif success_rate >= 50:
        print(" ⚠️ 翻譯效果良好,但仍有改進空間")
    elif translation_pairs > 0:
        print(" 🔍 翻譯部分成功,需要檢查具體問題")
    else:
        print(" ❌ 翻譯失敗,需要深入調試")

    return translation_pairs, success_rate


def _count_vietnamese_pairs(doc_path):
    """Count adjacent non-empty paragraphs shaped like (Chinese, Vietnamese).

    The second paragraph counts as Vietnamese when it contains any character
    in U+00C0..U+1EF9.

    Args:
        doc_path: Path of the .docx file to open.

    Returns:
        int: Number of such adjacent pairs.
    """
    # Local import: previously this name was only bound by the English
    # analysis, so this check broke whenever that step had been skipped.
    from docx import Document

    texts = [
        p.text.strip() for p in Document(doc_path).paragraphs if p.text.strip()
    ]
    return sum(
        1
        for current, following in zip(texts, texts[1:])
        if _has_chinese(current)
        and any('\u00C0' <= ch <= '\u1EF9' for ch in following)
    )


def test_final_docx_fix():
    """Verify the final DOCX translation fix (paragraph re-matching).

    Copies the original document into a fresh temporary directory, reports
    how many extracted segments have a cached 'en' / 'vi' translation,
    generates the English and Vietnamese documents with an empty translation
    map, and analyses the interleaved source/translation layout of the output.

    Returns:
        bool: True when the English success rate is at least 80%, otherwise
        False (including when the cache check itself fails).

    Raises:
        OSError: if the test directory cannot be reset or the original
        document cannot be copied (these steps are not guarded).
    """
    app = create_app()
    with app.app_context():
        print("=== 最終DOCX翻譯修復驗證 ===")
        # Source document. NOTE: hard-coded, machine-specific absolute path.
        original_path = r"C:\Users\EGG\WORK\data\user_scrip\TOOL\Document_translator_V2\uploads\9c6548ac-2f59-45f4-aade-0a9b3895bbfd\original_-OR026_9c6548ac.docx"

        # Start from a brand-new test directory.
        test_dir = Path(tempfile.gettempdir()) / "final_docx_test"
        if test_dir.exists():
            shutil.rmtree(test_dir)
        test_dir.mkdir(exist_ok=True)
        clean_input_path = test_dir / "clean_input.docx"
        shutil.copy2(original_path, clean_input_path)
        print(f"✅ 創建全新測試副本: {clean_input_path}")

        # Translation-cache coverage for English and Vietnamese.
        try:
            parser = DocxParser(str(clean_input_path))
            segments = parser.processor.extract_docx_segments(str(clean_input_path))
            print("\n📊 翻譯快取檢查:")
            print(f"文檔段落數: {len(segments)}")
            # Build the statement once; only the bound parameters change.
            cache_query = sql_text("""
                SELECT translated_text
                FROM dt_translation_cache
                WHERE source_text = :text AND target_language = :lang
                ORDER BY created_at DESC
                LIMIT 1
            """)
            total_count = len(segments)
            for lang in ('en', 'vi'):
                translated_count = 0
                for seg in segments:
                    row = db.session.execute(
                        cache_query, {'text': seg.text, 'lang': lang}
                    ).fetchone()
                    if row and row[0]:
                        translated_count += 1
                coverage = (translated_count / total_count * 100) if total_count > 0 else 0
                print(f" {lang.upper()}翻譯覆蓋率: {coverage:.1f}% ({translated_count}/{total_count})")
        except Exception as e:
            print(f"❌ 翻譯快取檢查失敗: {e}")
            return False

        # Results of the English analysis; they keep these defaults when
        # generation or analysis fails, so the verdict below never touches an
        # unbound name.
        translation_pairs = 0
        success_rate = None

        # Generate the English document.
        print("\n🔄 生成英文翻譯文檔...")
        try:
            # Empty mapping: translations are read from the cache instead.
            en_output_path = parser.generate_translated_document({}, 'en', test_dir)
        except Exception as e:
            print(f"❌ 生成英文翻譯文檔失敗: {e}")
        else:
            print(f"✅ 英文翻譯文檔生成: {en_output_path}")
            try:
                translation_pairs, success_rate = _analyze_english_document(en_output_path)
            except Exception as e:
                print(f"❌ 分析英文翻譯文檔失敗: {e}")

        # Generate the Vietnamese document.
        print("\n🔄 生成越南文翻譯文檔...")
        try:
            vi_output_path = parser.generate_translated_document({}, 'vi', test_dir)
        except Exception as e:
            print(f"❌ 生成越南文翻譯文檔失敗: {e}")
        else:
            print(f"✅ 越南文翻譯文檔生成: {vi_output_path}")
            # Quick check of the Vietnamese output.
            try:
                vi_pairs = _count_vietnamese_pairs(vi_output_path)
                print(f" 越南文翻譯對: {vi_pairs}")
            except Exception as e:
                print(f" 越南文文檔檢查失敗: {e}")

        # Final verdict.
        print("\n" + "=" * 60)
        print("🎯 DOCX翻譯修復最終驗證結果:")
        if success_rate is not None and success_rate >= 80:
            print("✅ 修復成功DOCX翻譯功能已完美解決")
            print(f" - 翻譯成功率: {success_rate:.1f}%")
            print(f" - 交錯格式正確: {translation_pairs} 個翻譯對")
            print(" - 文檔實例匹配問題已解決")
            # The corresponding TODO item can be marked as done.
            return True
        if translation_pairs > 0:
            # translation_pairs > 0 implies the analysis completed, so
            # success_rate is a number here.
            print("⚠️ 修復部分成功,需要進一步調整")
            print(f" - 翻譯成功率: {success_rate:.1f}% (目標: ≥80%)")
            print(f" - 實際翻譯對: {translation_pairs}")
            return False
        print("❌ 修復尚未完全成功,需要繼續調試")
        print(" - 沒有發現有效的翻譯內容")
        return False
if __name__ == "__main__":
    # Run the verification once and print the matching one-line summary.
    if test_final_docx_fix():
        closing_message = "\n🎉 DOCX翻譯問題已完美解決"
    else:
        closing_message = "\n🔧 需要繼續修復調試..."
    print(closing_message)