4th_fix time error
This commit is contained in:
107
examine_fixed_docx.py
Normal file
107
examine_fixed_docx.py
Normal file
@@ -0,0 +1,107 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
詳細檢查修復後的DOCX翻譯文件內容
|
||||
"""
|
||||
|
||||
import sys
|
||||
import os
|
||||
|
||||
# Fix encoding for Windows console so the CJK text and emoji printed by this
# script can be encoded.
for _stream in (sys.stdout, sys.stderr):
    # A stream may be None or a replacement object that has no reconfigure()
    # method; leave such streams alone instead of crashing at import time.
    _reconfigure = getattr(_stream, "reconfigure", None)
    if _reconfigure is None:
        continue
    # Compare case-insensitively so a stream that already reports "UTF-8"
    # is not reconfigured needlessly.
    _encoding = getattr(_stream, "encoding", None) or ""
    if _encoding.lower() not in ("utf-8", "utf8"):
        _reconfigure(encoding="utf-8")
|
||||
|
||||
def _has_chinese(text):
    """Return True if *text* contains a character in U+4E00..U+9FFF."""
    return any('\u4e00' <= ch <= '\u9fff' for ch in text)


def _has_english(text):
    """Return True if *text* contains an ASCII alphabetic character."""
    return any(ord(ch) < 128 and ch.isalpha() for ch in text)


def examine_fixed_docx(test_file=None):
    """Print a language breakdown of the paragraphs of a translated DOCX file.

    The report has three parts: a per-paragraph classification (first 20
    paragraph positions only), overall counts and ratios, and a scan for
    "alternating" pairs where a Chinese-only paragraph is immediately
    followed by an English-only paragraph.

    Args:
        test_file: Path of the DOCX file to inspect. When omitted, the
            hard-coded sample path from the original script is used.

    Returns:
        None. All results are printed to standard output. Any failure
        (python-docx missing, file unreadable, ...) is reported as a single
        "檢查失敗" line instead of being raised.
    """
    print("=== 詳細檢查修復後的DOCX翻譯文件 ===")

    if test_file is None:
        # Default: the test file most recently generated on the author's
        # machine (kept for backward compatibility with argument-less calls).
        test_file = r"C:\Users\EGG\AppData\Local\Temp\test_docx_translation\translated_original_-OR026_9c6548ac_en_translat.docx"

    try:
        from docx import Document

        doc = Document(test_file)

        # Read every paragraph's text exactly once. Both passes below work on
        # these cached strings rather than indexing doc.paragraphs repeatedly
        # (python-docx appears to rebuild that list on each access -- confirm
        # against its documentation).
        texts = [para.text.strip() for para in doc.paragraphs]
        # (has_chinese, has_english) per paragraph; empty text gives
        # (False, False).
        flags = [(_has_chinese(text), _has_english(text)) for text in texts]

        print(f"文件: {test_file}")
        print(f"總段落數: {len(texts)}")

        # Per-paragraph analysis.
        chinese_only = 0
        english_only = 0
        mixed = 0
        empty = 0

        print("\n📄 詳細段落分析:")

        for index, (text, (chinese, english)) in enumerate(zip(texts, flags)):
            if not text:
                empty += 1
                continue

            if chinese and english:
                mixed += 1
                status = "🔄 中英混合"
            elif english:
                english_only += 1
                status = "🇺🇸 純英文"
            elif chinese:
                chinese_only += 1
                status = "🇨🇳 純中文"
            else:
                # Neither Chinese nor ASCII letters (digits, punctuation...);
                # shown in the listing but not counted in the statistics.
                status = "❓ 未知"

            # Only the first 20 paragraph positions are listed (empty
            # paragraphs consume a position but print nothing).
            if index < 20:
                print(f" 段落 {index + 1:2d}: {status} - {text[:80]}...")

        print("\n📊 統計結果:")
        print(f" 空段落: {empty}")
        print(f" 純中文段落: {chinese_only}")
        print(f" 純英文段落: {english_only}")
        print(f" 中英混合段落: {mixed}")

        total_content = chinese_only + english_only + mixed
        if total_content > 0:
            print(f" 中文內容比例: {(chinese_only + mixed) / total_content * 100:.1f}%")
            print(f" 英文內容比例: {(english_only + mixed) / total_content * 100:.1f}%")

        # Look for the alternating layout: a Chinese-only paragraph directly
        # followed by an English-only paragraph.
        print("\n🔍 檢查交錯翻譯格式:")
        potential_alternating = 0

        entries = list(zip(texts, flags))
        for (current, (cur_zh, cur_en)), (next_para, (nxt_zh, nxt_en)) in zip(entries, entries[1:]):
            # A set flag implies non-empty text, so no separate emptiness
            # check is needed for either paragraph.
            if cur_zh and not cur_en and nxt_en and not nxt_zh:
                potential_alternating += 1
                if potential_alternating <= 5:  # show the first 5 examples
                    print(f" 交錯範例 {potential_alternating}:")
                    print(f" 中文: {current[:60]}...")
                    print(f" 英文: {next_para[:60]}...")

        if potential_alternating > 0:
            print(f" ✅ 發現 {potential_alternating} 個潛在交錯翻譯對")
            # A match implies at least one Chinese-only and one English-only
            # paragraph, so total_content >= 2 and the divisor is never zero.
            print(f" 📈 交錯格式覆蓋率: {potential_alternating / (total_content // 2) * 100:.1f}%")
        else:
            print(" ❌ 沒有發現明顯的交錯翻譯格式")

    except Exception as e:
        # Top-level boundary of a diagnostic script: report the failure and
        # return normally rather than propagating a traceback.
        print(f"❌ 檢查失敗: {e}")
|
||||
|
||||
# Script entry point: run the inspection only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    examine_fixed_docx()
|
Reference in New Issue
Block a user