4th_fix time error

2025-09-03 09:05:51 +08:00
parent e6e5332705
commit cce3fd4925
26 changed files with 2551 additions and 82 deletions
--- a/examine_fixed_docx.py
+++ b/examine_fixed_docx.py
@@ -0,0 +1,107 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Inspect in detail the content of the repaired (fixed) DOCX translation file.
+"""
+
+import sys
+import os
+
+# Fix encoding for Windows console: this script prints Chinese text and emoji, so switch both streams to UTF-8 when they are not already
+if sys.stdout.encoding != 'utf-8':
+    sys.stdout.reconfigure(encoding='utf-8')
+if sys.stderr.encoding != 'utf-8':
+    sys.stderr.reconfigure(encoding='utf-8')
+
+def examine_fixed_docx():
+    """Print a per-paragraph language breakdown of the fixed DOCX file and look for Chinese/English alternating pairs."""
+    
+    print("=== 詳細檢查修復後的DOCX翻譯文件 ===")
+    
+    # Inspect the freshly generated test file (hard-coded, machine-specific absolute path)
+    test_file = r"C:\Users\EGG\AppData\Local\Temp\test_docx_translation\translated_original_-OR026_9c6548ac_en_translat.docx"
+    
+    try:
+        from docx import Document  # imported inside the try so an import failure is reported by the handler below
+        doc = Document(test_file)
+        
+        print(f"文件: {test_file}")
+        print(f"總段落數: {len(doc.paragraphs)}")
+        
+        # Classify every paragraph and count how many fall into each category
+        chinese_only = 0
+        english_only = 0
+        mixed = 0
+        empty = 0
+        
+        print(f"\n📄 詳細段落分析:")
+        
+        for i, para in enumerate(doc.paragraphs):
+            text = para.text.strip()
+            
+            if not text:
+                empty += 1
+                continue
+            
+            has_chinese = any('\u4e00' <= c <= '\u9fff' for c in text)  # any character in U+4E00..U+9FFF
+            has_english = any(ord(c) < 128 and c.isalpha() for c in text)  # any ASCII letter
+            
+            if has_chinese and has_english:
+                mixed += 1
+                status = "🔄 中英混合"
+            elif has_english:
+                english_only += 1  
+                status = "🇺🇸 純英文"
+            elif has_chinese:
+                chinese_only += 1
+                status = "🇨🇳 純中文"
+            else:  # neither Chinese characters nor ASCII letters; not added to any counter
+                status = "❓ 未知"
+                
+            if i < 20:  # only show paragraphs whose index is below 20 (blank paragraphs still consume an index)
+                print(f"  段落 {i+1:2d}: {status} - {text[:80]}...")
+        
+        print(f"\n📊 統計結果:")
+        print(f"  空段落: {empty}")
+        print(f"  純中文段落: {chinese_only}")
+        print(f"  純英文段落: {english_only}")
+        print(f"  中英混合段落: {mixed}")
+        
+        total_content = chinese_only + english_only + mixed
+        if total_content > 0:
+            print(f"  中文內容比例: {(chinese_only + mixed) / total_content * 100:.1f}%")
+            print(f"  英文內容比例: {(english_only + mixed) / total_content * 100:.1f}%")
+        
+        # Check whether the document uses an alternating (interleaved) translation layout
+        print(f"\n🔍 檢查交錯翻譯格式:")
+        potential_alternating = 0
+        
+        for i in range(len(doc.paragraphs) - 1):
+            current = doc.paragraphs[i].text.strip()
+            next_para = doc.paragraphs[i + 1].text.strip()
+            
+            if current and next_para:
+                current_chinese = any('\u4e00' <= c <= '\u9fff' for c in current)
+                current_english = any(ord(c) < 128 and c.isalpha() for c in current)
+                next_chinese = any('\u4e00' <= c <= '\u9fff' for c in next_para)
+                next_english = any(ord(c) < 128 and c.isalpha() for c in next_para)
+                
+                # A Chinese-only paragraph immediately followed by an English-only paragraph counts as one alternating pair
+                if current_chinese and not current_english and next_english and not next_chinese:
+                    potential_alternating += 1
+                    if potential_alternating <= 5:  # print only the first 5 example pairs
+                        print(f"  交錯範例 {potential_alternating}:")
+                        print(f"    中文: {current[:60]}...")
+                        print(f"    英文: {next_para[:60]}...")
+        
+        if potential_alternating > 0:  # a pair means at least two counted paragraphs, so the divisor below is >= 1
+            print(f"  ✅ 發現 {potential_alternating} 個潛在交錯翻譯對")
+            print(f"  📈 交錯格式覆蓋率: {potential_alternating / (total_content // 2) * 100:.1f}%")
+        else:
+            print(f"  ❌ 沒有發現明顯的交錯翻譯格式")
+            
+    except Exception as e:  # any failure in the block above is only printed, never re-raised
+        print(f"❌ 檢查失敗: {e}")
+
+if __name__ == "__main__":  # run the inspection only when executed as a script
+    examine_fixed_docx()