4th_fix time error
This commit is contained in:
260
test_final_docx_fix.py
Normal file
260
test_final_docx_fix.py
Normal file
@@ -0,0 +1,260 @@
|
||||
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Final DOCX translation-fix verification - tests the paragraph re-matching fix.
"""

import sys
import os
import tempfile
import shutil
from pathlib import Path

# Force UTF-8 console output on Windows, where the default code page
# (e.g. cp950) cannot encode the emoji/CJK characters printed below.
# Fixes over the original:
#  - encoding comparison is case-insensitive (streams report 'UTF-8', 'cp950', ...);
#  - reconfigure() is guarded, since redirected/wrapped streams (or a None
#    stream under pythonw) may not provide it;
#  - stdout/stderr are handled by one loop instead of duplicated code.
for _stream in (sys.stdout, sys.stderr):
    _enc = getattr(_stream, "encoding", None) or ""
    if _enc.lower() != "utf-8" and hasattr(_stream, "reconfigure"):
        _stream.reconfigure(encoding="utf-8")

# Make the local 'app' package importable when running this script directly.
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'app'))
|
||||
|
||||
from app import create_app, db
|
||||
from app.services.translation_service import DocxParser
|
||||
from sqlalchemy import text as sql_text
|
||||
|
||||
def _has_chinese(text):
    """True if *text* contains at least one CJK unified ideograph."""
    return any('\u4e00' <= c <= '\u9fff' for c in text)


def _has_english(text):
    """True if *text* contains an ASCII letter outside 'PANJIT'.

    The letters of the company name 'PANJIT' appear even in otherwise pure
    Chinese paragraphs, so they are excluded from English detection.
    """
    return any(ord(c) < 128 and c.isalpha() and c not in 'PANJIT' for c in text)


def _report_cache_coverage(segments, lang):
    """Print translation-cache coverage (translated/total segments) for *lang*."""
    translated_count = 0
    total_count = 0

    for seg in segments:
        total_count += 1
        # Latest cache entry wins (ORDER BY created_at DESC LIMIT 1).
        result = db.session.execute(sql_text("""
            SELECT translated_text
            FROM dt_translation_cache
            WHERE source_text = :text AND target_language = :lang
            ORDER BY created_at DESC
            LIMIT 1
        """), {'text': seg.text, 'lang': lang})

        row = result.fetchone()
        if row and row[0]:
            translated_count += 1

    coverage = (translated_count / total_count * 100) if total_count > 0 else 0
    print(f"  {lang.upper()}翻譯覆蓋率: {coverage:.1f}% ({translated_count}/{total_count})")


def _analyze_english_doc(en_output_path):
    """Analyse the generated English document.

    Counts paragraph languages, translation markers (zero-width spaces in
    runs) and interleaved Chinese→English translation pairs, prints a
    detailed report and returns ``(success_rate, translation_pairs)``.
    """
    from docx import Document  # deferred: python-docx only needed here

    output_doc = Document(en_output_path)
    paragraphs = [p for p in output_doc.paragraphs if p.text.strip()]

    print(f"\n📄 英文翻譯文檔分析:")
    print(f"總段落數: {len(paragraphs)}")

    # Per-language paragraph statistics.
    chinese_paras = 0
    english_paras = 0
    mixed_paras = 0
    marker_paras = 0

    # Interleaved-format statistics.
    translation_pairs = 0
    consecutive_pairs = []

    for i, para in enumerate(paragraphs[:50]):  # inspect the first 50 paragraphs
        text = para.text.strip()

        has_chinese = _has_chinese(text)
        has_english = _has_english(text)
        # A zero-width space in any run marks a translation-inserted paragraph.
        has_marker = any('\u200b' in (r.text or '') for r in para.runs)

        if has_marker:
            marker_paras += 1

        if has_chinese and has_english:
            mixed_paras += 1
            lang_status = "🔄 中英混合"
        elif has_english:
            english_paras += 1
            lang_status = "🇺🇸 純英文"
        elif has_chinese:
            chinese_paras += 1
            lang_status = "🇨🇳 純中文"
        else:
            lang_status = "❓ 其他"

        # A pure-Chinese paragraph immediately followed by a pure-English
        # one counts as one interleaved translation pair.
        if i < len(paragraphs) - 1:
            next_text = paragraphs[i + 1].text.strip()
            next_has_chinese = _has_chinese(next_text)
            next_has_english = _has_english(next_text)

            if (has_chinese and not has_english and
                    next_has_english and not next_has_chinese):
                translation_pairs += 1
                if len(consecutive_pairs) < 5:  # keep the first 5 examples
                    consecutive_pairs.append({
                        'index': i,
                        'chinese': text[:60],
                        'english': next_text[:60]
                    })

        if i < 20:  # show details for the first 20 paragraphs only
            marker_status = " 🏷️" if has_marker else ""
            print(f"  段落 {i+1:2d}: {lang_status}{marker_status} - {text[:70]}...")

    print(f"\n📊 語言統計:")
    print(f"  純中文段落: {chinese_paras}")
    print(f"  純英文段落: {english_paras}")
    print(f"  中英混合段落: {mixed_paras}")
    print(f"  帶翻譯標記段落: {marker_paras}")
    print(f"  發現交錯翻譯對: {translation_pairs}")

    if consecutive_pairs:
        print(f"\n🔍 翻譯對示例:")
        for pair in consecutive_pairs:
            print(f"  對 {pair['index']//2 + 1}:")
            print(f"    中文: {pair['chinese']}...")
            print(f"    英文: {pair['english']}...")

    # Every pure-Chinese paragraph is expected to have a translated partner.
    total_expected_pairs = chinese_paras
    success_rate = (translation_pairs / total_expected_pairs * 100) if total_expected_pairs > 0 else 0

    print(f"\n🎯 翻譯效果評估:")
    print(f"  預期翻譯對: {total_expected_pairs}")
    print(f"  實際翻譯對: {translation_pairs}")
    print(f"  翻譯成功率: {success_rate:.1f}%")

    if success_rate >= 80:
        print(f"  ✅ 翻譯效果優秀!")
    elif success_rate >= 50:
        print(f"  ⚠️ 翻譯效果良好,但仍有改進空間")
    elif translation_pairs > 0:
        print(f"  🔍 翻譯部分成功,需要檢查具體問題")
    else:
        print(f"  ❌ 翻譯失敗,需要深入調試")

    return success_rate, translation_pairs


def _check_vietnamese_doc(vi_output_path):
    """Quick check of the Vietnamese document: count Chinese→Vietnamese pairs."""
    # Import locally so this check does not depend on the English-analysis
    # path having run (the original relied on its `from docx import Document`).
    from docx import Document

    vi_doc = Document(vi_output_path)
    vi_paragraphs = [p for p in vi_doc.paragraphs if p.text.strip()]

    vi_pairs = 0
    for i in range(len(vi_paragraphs) - 1):
        text = vi_paragraphs[i].text.strip()
        next_text = vi_paragraphs[i + 1].text.strip()

        # \u00C0-\u1EF9 (Latin-1 Supplement .. Latin Extended Additional)
        # covers Vietnamese diacritics.
        has_vietnamese = any('\u00C0' <= c <= '\u1EF9' for c in next_text)
        if _has_chinese(text) and has_vietnamese:
            vi_pairs += 1

    print(f"  越南文翻譯對: {vi_pairs}")


def test_final_docx_fix():
    """Final DOCX translation-fix verification (paragraph re-matching fix).

    Copies a known fixture document into a clean temp directory, reports
    translation-cache coverage for English and Vietnamese, regenerates both
    translated documents from the cache, and evaluates the interleaved
    (Chinese paragraph followed by translation paragraph) output format.

    Returns:
        bool: True when the measured translation success rate is >= 80%,
        False otherwise (including on any setup or generation failure).
        The original returned ``None`` from one failure path; the API is
        now consistently boolean.
    """
    app = create_app()

    with app.app_context():
        print("=== 最終DOCX翻譯修復驗證 ===")

        # Fixed fixture from a real upload; verify it exists before copying
        # so a missing file fails fast with a clear message.
        original_path = r"C:\Users\EGG\WORK\data\user_scrip\TOOL\Document_translator_V2\uploads\9c6548ac-2f59-45f4-aade-0a9b3895bbfd\original_-OR026_9c6548ac.docx"
        if not os.path.exists(original_path):
            print(f"❌ 找不到原始文件: {original_path}")
            return False

        # Build a pristine scratch directory so earlier runs cannot interfere.
        test_dir = Path(tempfile.gettempdir()) / "final_docx_test"
        if test_dir.exists():
            shutil.rmtree(test_dir)
        test_dir.mkdir(exist_ok=True)

        clean_input_path = test_dir / "clean_input.docx"
        shutil.copy2(original_path, clean_input_path)
        print(f"✅ 創建全新測試副本: {clean_input_path}")

        # --- translation-cache coverage --------------------------------
        try:
            parser = DocxParser(str(clean_input_path))
            segments = parser.processor.extract_docx_segments(str(clean_input_path))

            print(f"\n📊 翻譯快取檢查:")
            print(f"文檔段落數: {len(segments)}")

            for lang in ('en', 'vi'):
                _report_cache_coverage(segments, lang)
        except Exception as e:
            print(f"❌ 翻譯快取檢查失敗: {e}")
            return False  # was a bare `return` (None); keep a boolean API

        # Initialized up front so the final verdict needs no fragile
        # `'name' in locals()` checks (as the original used).
        success_rate = 0.0
        translation_pairs = 0

        # --- English document ------------------------------------------
        print(f"\n🔄 生成英文翻譯文檔...")
        try:
            # An empty mapping makes the generator read from the cache.
            en_output_path = parser.generate_translated_document({}, 'en', test_dir)
            print(f"✅ 英文翻譯文檔生成: {en_output_path}")

            try:
                success_rate, translation_pairs = _analyze_english_doc(en_output_path)
            except Exception as e:
                print(f"❌ 分析英文翻譯文檔失敗: {e}")
        except Exception as e:
            print(f"❌ 生成英文翻譯文檔失敗: {e}")

        # --- Vietnamese document ---------------------------------------
        print(f"\n🔄 生成越南文翻譯文檔...")
        try:
            vi_output_path = parser.generate_translated_document({}, 'vi', test_dir)
            print(f"✅ 越南文翻譯文檔生成: {vi_output_path}")

            try:
                _check_vietnamese_doc(vi_output_path)
            except Exception as e:
                print(f"  越南文文檔檢查失敗: {e}")
        except Exception as e:
            print(f"❌ 生成越南文翻譯文檔失敗: {e}")

        # --- final verdict ---------------------------------------------
        print(f"\n" + "="*60)
        print(f"🎯 DOCX翻譯修復最終驗證結果:")

        if success_rate >= 80:
            print(f"✅ 修復成功!DOCX翻譯功能已完美解決")
            print(f"  - 翻譯成功率: {success_rate:.1f}%")
            print(f"  - 交錯格式正確: {translation_pairs} 個翻譯對")
            print(f"  - 文檔實例匹配問題已解決")
            return True

        if translation_pairs > 0:
            print(f"⚠️ 修復部分成功,需要進一步調整")
            print(f"  - 翻譯成功率: {success_rate:.1f}% (目標: ≥80%)")
            print(f"  - 實際翻譯對: {translation_pairs}")
            return False

        print(f"❌ 修復尚未完全成功,需要繼續調試")
        print(f"  - 沒有發現有效的翻譯內容")
        return False
|
||||
|
||||
if __name__ == "__main__":
    # Run the verification and propagate the outcome through the process
    # exit code so CI / calling scripts can detect failure (the original
    # always exited 0, even when the verification failed).
    success = test_final_docx_fix()
    if success:
        print(f"\n🎉 DOCX翻譯問題已完美解決!")
    else:
        print(f"\n🔧 需要繼續修復調試...")
    sys.exit(0 if success else 1)
|
Reference in New Issue
Block a user