#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Inspect the paragraph contents of a repaired, translated DOCX file in detail.

The script prints a per-paragraph language classification, summary statistics,
and a count of "Chinese paragraph immediately followed by English paragraph"
pairs (the alternating translation layout).
"""

import sys
import os

# File inspected when examine_fixed_docx() is called without an argument.
DEFAULT_TEST_FILE = r"C:\Users\EGG\AppData\Local\Temp\test_docx_translation\translated_original_-OR026_9c6548ac_en_translat.docx"

# How many paragraphs / alternating examples are echoed in the report.
MAX_LISTED_PARAGRAPHS = 20
MAX_LISTED_EXAMPLES = 5


def _force_utf8(stream) -> None:
    """Switch *stream* to UTF-8 output if it can be reconfigured.

    Streams that are ``None`` or that do not expose ``reconfigure`` (for
    example a replacement object installed by a host application) are left
    untouched instead of raising at import time.
    """
    reconfigure = getattr(stream, "reconfigure", None)
    if reconfigure is None:
        return
    encoding = getattr(stream, "encoding", None) or ""
    # Accept every spelling of the codec name ("utf-8", "UTF-8", "utf8", ...).
    if encoding.lower().replace("-", "").replace("_", "") != "utf8":
        reconfigure(encoding="utf-8")


# Fix encoding for Windows console
_force_utf8(sys.stdout)
_force_utf8(sys.stderr)


def _has_chinese(text: str) -> bool:
    """Return True if *text* contains a character in U+4E00..U+9FFF."""
    return any('\u4e00' <= c <= '\u9fff' for c in text)


def _has_english(text: str) -> bool:
    """Return True if *text* contains an ASCII letter."""
    return any(ord(c) < 128 and c.isalpha() for c in text)


def _report_paragraph_stats(texts) -> int:
    """Print the per-paragraph classification and the summary statistics.

    Args:
        texts: Stripped paragraph texts, in document order.

    Returns:
        The number of paragraphs classified as Chinese-only, English-only or
        mixed. Empty paragraphs and paragraphs with neither kind of character
        are not part of this total.
    """
    chinese_only = 0
    english_only = 0
    mixed = 0
    empty = 0

    print("\n📄 詳細段落分析:")

    for i, text in enumerate(texts):
        if not text:
            empty += 1
            continue

        has_chinese = _has_chinese(text)
        has_english = _has_english(text)

        if has_chinese and has_english:
            mixed += 1
            status = "🔄 中英混合"
        elif has_english:
            english_only += 1
            status = "🇺🇸 純英文"
        elif has_chinese:
            chinese_only += 1
            status = "🇨🇳 純中文"
        else:
            # Digits / punctuation only: listed, but not counted anywhere.
            status = "❓ 未知"

        # Only paragraphs among the first MAX_LISTED_PARAGRAPHS are echoed.
        if i < MAX_LISTED_PARAGRAPHS:
            print(f" 段落 {i+1:2d}: {status} - {text[:80]}...")

    print("\n📊 統計結果:")
    print(f" 空段落: {empty}")
    print(f" 純中文段落: {chinese_only}")
    print(f" 純英文段落: {english_only}")
    print(f" 中英混合段落: {mixed}")

    total_content = chinese_only + english_only + mixed
    if total_content > 0:
        # Mixed paragraphs count toward both ratios, so they may sum to > 100%.
        print(f" 中文內容比例: {(chinese_only + mixed) / total_content * 100:.1f}%")
        print(f" 英文內容比例: {(english_only + mixed) / total_content * 100:.1f}%")

    return total_content


def _report_alternating_pairs(texts, total_content: int) -> None:
    """Print how many Chinese-only paragraphs are directly followed by an English-only one.

    Args:
        texts: Stripped paragraph texts, in document order.
        total_content: Classified paragraph count from _report_paragraph_stats,
            used as the denominator of the coverage figure.
    """
    print("\n🔍 檢查交錯翻譯格式:")

    potential_alternating = 0
    for current, next_para in zip(texts, texts[1:]):
        if not (current and next_para):
            continue

        # A pair is a Chinese-only paragraph followed by an English-only one.
        is_pair = (
            _has_chinese(current)
            and not _has_english(current)
            and _has_english(next_para)
            and not _has_chinese(next_para)
        )
        if not is_pair:
            continue

        potential_alternating += 1
        if potential_alternating <= MAX_LISTED_EXAMPLES:
            print(f" 交錯範例 {potential_alternating}:")
            print(f" 中文: {current[:60]}...")
            print(f" 英文: {next_para[:60]}...")

    if potential_alternating > 0:
        print(f" ✅ 發現 {potential_alternating} 個潛在交錯翻譯對")
        # Every pair contributes two classified paragraphs, so
        # total_content >= 2 here and the divisor cannot be zero.
        print(f" 📈 交錯格式覆蓋率: {potential_alternating / (total_content // 2) * 100:.1f}%")
    else:
        print(" ❌ 沒有發現明顯的交錯翻譯格式")


def examine_fixed_docx(test_file=DEFAULT_TEST_FILE):
    """Print a detailed language analysis of the paragraphs in a DOCX file.

    Args:
        test_file: Path of the DOCX file to inspect. Defaults to the most
            recently generated test file (DEFAULT_TEST_FILE).

    Returns:
        None. All results are written to standard output. Any failure
        (python-docx not installed, file missing or unreadable, ...) is
        reported as a single line instead of being raised.
    """
    print("=== 詳細檢查修復後的DOCX翻譯文件 ===")

    try:
        # Imported lazily so a missing python-docx is reported like any
        # other failure of the check.
        from docx import Document

        doc = Document(test_file)
        # Read the paragraph list once instead of re-reading doc.paragraphs
        # on every loop iteration.
        paragraphs = doc.paragraphs

        print(f"文件: {test_file}")
        print(f"總段落數: {len(paragraphs)}")

        texts = [para.text.strip() for para in paragraphs]

        total_content = _report_paragraph_stats(texts)
        _report_alternating_pairs(texts, total_content)

    except Exception as e:
        # Deliberately broad: this is the top-level boundary of a diagnostic
        # script, and a readable message is preferred over a traceback.
        print(f"❌ 檢查失敗: {e}")


if __name__ == "__main__":
    examine_fixed_docx()