#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Final verification of the DOCX translation fix.

Exercises the paragraph re-matching fix end to end: copies a source DOCX into
a clean temporary directory, reports translation-cache coverage, generates the
English and Vietnamese translated documents and inspects the results for
interleaved "source paragraph / translated paragraph" pairs.
"""

import sys
import os
import tempfile
import shutil
from pathlib import Path

# Fix encoding for Windows console
if sys.stdout.encoding != 'utf-8':
    sys.stdout.reconfigure(encoding='utf-8')
if sys.stderr.encoding != 'utf-8':
    sys.stderr.reconfigure(encoding='utf-8')

sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'app'))

from app import create_app, db
from app.services.translation_service import DocxParser
from sqlalchemy import text as sql_text

# Source document used when the caller does not supply one.
DEFAULT_ORIGINAL_PATH = r"C:\Users\EGG\WORK\data\user_scrip\TOOL\Document_translator_V2\uploads\9c6548ac-2f59-45f4-aade-0a9b3895bbfd\original_-OR026_9c6548ac.docx"

# Brand token that appears inside otherwise-Chinese paragraphs and must not
# make such a paragraph count as English.
_IGNORED_LATIN_TOKEN = 'PANJIT'

# Only this many leading paragraphs of the English output are analysed.
_MAX_ANALYZED_PARAGRAPHS = 50
# Only this many leading paragraphs get a detail line printed.
_MAX_DETAILED_PARAGRAPHS = 20
# At most this many translation pairs are kept as printed examples.
_MAX_PAIR_EXAMPLES = 5


def _has_chinese(text):
    """Return True if *text* contains a character in U+4E00..U+9FFF."""
    return any('\u4e00' <= c <= '\u9fff' for c in text)


def _has_english(text):
    """Return True if *text* contains an ASCII letter outside the brand token.

    The whole token is removed before scanning. A per-character test against
    the token's letters would also discard every other uppercase
    P/A/N/J/I/T and misclassify words such as "TIP" or "PIN".
    """
    remainder = text.replace(_IGNORED_LATIN_TOKEN, '')
    return any(ord(c) < 128 and c.isalpha() for c in remainder)


def _has_vietnamese(text):
    """Return True if *text* contains a character in U+00C0..U+1EF9."""
    return any('\u00C0' <= c <= '\u1EF9' for c in text)


def _report_cache_coverage(segments):
    """Print, per target language, how many segments have a cached translation.

    For every segment the most recent non-empty ``translated_text`` row in
    ``dt_translation_cache`` is looked up for the languages ``en`` and ``vi``.
    """
    print("\n📊 翻譯快取檢查:")
    print(f"文檔段落數: {len(segments)}")

    # Build the statement once; only the bound parameters vary per lookup.
    lookup = sql_text("""
        SELECT translated_text
        FROM dt_translation_cache
        WHERE source_text = :text AND target_language = :lang
        ORDER BY created_at DESC
        LIMIT 1
    """)

    total_count = len(segments)
    for lang in ('en', 'vi'):
        translated_count = 0
        for seg in segments:
            row = db.session.execute(
                lookup, {'text': seg.text, 'lang': lang}
            ).fetchone()
            if row and row[0]:
                translated_count += 1

        coverage = (translated_count / total_count * 100) if total_count > 0 else 0
        print(f" {lang.upper()}翻譯覆蓋率: {coverage:.1f}% ({translated_count}/{total_count})")


def _analyze_english_document(en_output_path):
    """Inspect the generated English document and print a detailed report.

    Returns:
        tuple: ``(translation_pairs, success_rate)`` where ``translation_pairs``
        is the number of pure-Chinese paragraphs immediately followed by a
        pure-English paragraph, and ``success_rate`` is that count as a
        percentage of the pure-Chinese paragraphs seen.
    """
    # Imported here so that every code path using Document binds it itself.
    from docx import Document

    output_doc = Document(en_output_path)
    paragraphs = [p for p in output_doc.paragraphs if p.text.strip()]

    print("\n📄 英文翻譯文檔分析:")
    print(f"總段落數: {len(paragraphs)}")

    # Language statistics
    chinese_paras = 0
    english_paras = 0
    mixed_paras = 0
    marker_paras = 0

    # Interleaved-format check
    translation_pairs = 0
    consecutive_pairs = []

    for i, para in enumerate(paragraphs[:_MAX_ANALYZED_PARAGRAPHS]):
        text = para.text.strip()

        # Language detection
        has_chinese = _has_chinese(text)
        has_english = _has_english(text)
        # A zero-width space in any run is counted as a translation marker.
        has_marker = any('\u200b' in (r.text or '') for r in para.runs)

        if has_marker:
            marker_paras += 1

        if has_chinese and has_english:
            mixed_paras += 1
            lang_status = "🔄 中英混合"
        elif has_english:
            english_paras += 1
            lang_status = "🇺🇸 純英文"
        elif has_chinese:
            chinese_paras += 1
            lang_status = "🇨🇳 純中文"
        else:
            lang_status = "❓ 其他"

        # Check whether this paragraph and the next form a translation pair
        if i < len(paragraphs) - 1:
            next_text = paragraphs[i + 1].text.strip()
            next_has_chinese = _has_chinese(next_text)
            next_has_english = _has_english(next_text)

            # Pure Chinese followed by pure English = translation pair
            if (has_chinese and not has_english
                    and next_has_english and not next_has_chinese):
                translation_pairs += 1
                if len(consecutive_pairs) < _MAX_PAIR_EXAMPLES:
                    consecutive_pairs.append({
                        'index': i,
                        'chinese': text[:60],
                        'english': next_text[:60],
                    })

        if i < _MAX_DETAILED_PARAGRAPHS:
            marker_status = " 🏷️" if has_marker else ""
            print(f" 段落 {i+1:2d}: {lang_status}{marker_status} - {text[:70]}...")

    print("\n📊 語言統計:")
    print(f" 純中文段落: {chinese_paras}")
    print(f" 純英文段落: {english_paras}")
    print(f" 中英混合段落: {mixed_paras}")
    print(f" 帶翻譯標記段落: {marker_paras}")
    print(f" 發現交錯翻譯對: {translation_pairs}")

    # Show example translation pairs
    if consecutive_pairs:
        print("\n🔍 翻譯對示例:")
        for pair in consecutive_pairs:
            print(f" 對 {pair['index']//2 + 1}:")
            print(f" 中文: {pair['chinese']}...")
            print(f" 英文: {pair['english']}...")

    # Judge the translation result: every pure-Chinese paragraph is expected
    # to be followed by its translation.
    total_expected_pairs = chinese_paras
    success_rate = (translation_pairs / total_expected_pairs * 100) if total_expected_pairs > 0 else 0

    print("\n🎯 翻譯效果評估:")
    print(f" 預期翻譯對: {total_expected_pairs}")
    print(f" 實際翻譯對: {translation_pairs}")
    print(f" 翻譯成功率: {success_rate:.1f}%")

    if success_rate >= 80:
        print(" ✅ 翻譯效果優秀!")
    elif success_rate >= 50:
        print(" ⚠️ 翻譯效果良好,但仍有改進空間")
    elif translation_pairs > 0:
        print(" 🔍 翻譯部分成功,需要檢查具體問題")
    else:
        print(" ❌ 翻譯失敗,需要深入調試")

    return translation_pairs, success_rate


def _count_vietnamese_pairs(vi_output_path):
    """Count paragraphs containing Chinese directly followed by Vietnamese text."""
    # Local import: this check must work even when the English analysis
    # (which also imports Document) never ran.
    from docx import Document

    vi_doc = Document(vi_output_path)
    texts = [p.text.strip() for p in vi_doc.paragraphs if p.text.strip()]
    return sum(
        1
        for current, following in zip(texts, texts[1:])
        if _has_chinese(current) and _has_vietnamese(following)
    )


def test_final_docx_fix(original_path=DEFAULT_ORIGINAL_PATH):
    """Run the final DOCX translation fix verification.

    Args:
        original_path: Path of the source DOCX to verify against. Defaults to
            the document this script was originally written for.

    Returns:
        bool: True when at least 80% of the pure-Chinese paragraphs in the
        generated English document are followed by an English paragraph;
        False otherwise, including when any step fails.

    Raises:
        OSError: If ``original_path`` cannot be copied into the test directory.
    """
    app = create_app()

    with app.app_context():
        print("=== 最終DOCX翻譯修復驗證 ===")

        # Create a brand-new test environment
        test_dir = Path(tempfile.gettempdir()) / "final_docx_test"
        if test_dir.exists():
            shutil.rmtree(test_dir)
        test_dir.mkdir(exist_ok=True)

        clean_input_path = test_dir / "clean_input.docx"
        shutil.copy2(original_path, clean_input_path)
        print(f"✅ 創建全新測試副本: {clean_input_path}")

        # Check translation-cache coverage
        try:
            parser = DocxParser(str(clean_input_path))
            segments = parser.processor.extract_docx_segments(str(clean_input_path))
            _report_cache_coverage(segments)
        except Exception as e:
            print(f"❌ 翻譯快取檢查失敗: {e}")
            return False

        # Defaults used by the final verdict when generation or analysis
        # fails, so the verdict never touches an unbound name.
        translation_pairs = 0
        success_rate = 0.0

        # Generate the English translated document
        print("\n🔄 生成英文翻譯文檔...")
        try:
            empty_translations = {}  # Empty dict: translations are read from the cache
            en_output_path = parser.generate_translated_document(
                empty_translations, 'en', test_dir
            )
            print(f"✅ 英文翻譯文檔生成: {en_output_path}")
        except Exception as e:
            print(f"❌ 生成英文翻譯文檔失敗: {e}")
        else:
            # Analyse the generated document in detail
            try:
                translation_pairs, success_rate = _analyze_english_document(en_output_path)
            except Exception as e:
                print(f"❌ 分析英文翻譯文檔失敗: {e}")

        # Generate the Vietnamese translated document
        print("\n🔄 生成越南文翻譯文檔...")
        try:
            vi_output_path = parser.generate_translated_document(
                {}, 'vi', test_dir
            )
            print(f"✅ 越南文翻譯文檔生成: {vi_output_path}")
        except Exception as e:
            print(f"❌ 生成越南文翻譯文檔失敗: {e}")
        else:
            # Quick check of the Vietnamese document
            try:
                vi_pairs = _count_vietnamese_pairs(vi_output_path)
                print(f" 越南文翻譯對: {vi_pairs}")
            except Exception as e:
                print(f" 越南文文檔檢查失敗: {e}")

        # Final conclusion
        print("\n" + "=" * 60)
        print("🎯 DOCX翻譯修復最終驗證結果:")

        if success_rate >= 80:
            print("✅ 修復成功!DOCX翻譯功能已完美解決")
            print(f" - 翻譯成功率: {success_rate:.1f}%")
            print(f" - 交錯格式正確: {translation_pairs} 個翻譯對")
            print(" - 文檔實例匹配問題已解決")
            # Mark the TODO as completed
            return True

        if translation_pairs > 0:
            print("⚠️ 修復部分成功,需要進一步調整")
            print(f" - 翻譯成功率: {success_rate:.1f}% (目標: ≥80%)")
            print(f" - 實際翻譯對: {translation_pairs}")
            return False

        print("❌ 修復尚未完全成功,需要繼續調試")
        print(" - 沒有發現有效的翻譯內容")
        return False


if __name__ == "__main__":
    success = test_final_docx_fix()
    if success:
        print("\n🎉 DOCX翻譯問題已完美解決!")
    else:
        print("\n🔧 需要繼續修復調試...")