116 lines
4.5 KiB
Python
116 lines
4.5 KiB
Python
#!/usr/bin/env python3
|
|
# -*- coding: utf-8 -*-
|
|
"""
|
|
檢查中英混合段落的具體內容
|
|
"""
|
|
|
|
import sys
|
|
import os
|
|
|
|
# Fix encoding for Windows console
def ensure_utf8_stream(stream):
    """Switch a text stream to UTF-8 output when it supports that.

    The script prints Chinese text and emoji, which a legacy Windows console
    code page cannot encode.

    Args:
        stream: Normally ``sys.stdout`` or ``sys.stderr``. May be ``None`` or
            any replacement object; anything without a callable
            ``reconfigure`` attribute is left untouched instead of raising
            ``AttributeError``.
    """
    encoding = getattr(stream, "encoding", None)
    reconfigure = getattr(stream, "reconfigure", None)
    if encoding != 'utf-8' and callable(reconfigure):
        reconfigure(encoding='utf-8')


ensure_utf8_stream(sys.stdout)
ensure_utf8_stream(sys.stderr)
|
|
|
|
# Document inspected when the caller does not supply a path.
_DEFAULT_TEST_FILE = r"C:\Users\EGG\AppData\Local\Temp\test_docx_translation\translated_original_-OR026_9c6548ac_en_translat.docx"

# Maximum number of characters of each language run shown in the summary.
_PREVIEW_LENGTH = 50


def _is_chinese_char(char):
    """Return True if *char* is in the range U+4E00..U+9FFF."""
    return '\u4e00' <= char <= '\u9fff'


def _is_english_char(char):
    """Return True if *char* is an ASCII letter."""
    return char.isascii() and char.isalpha()


def _describe_line(line):
    """Return the status label printed for one line of a paragraph."""
    has_chinese = any(_is_chinese_char(c) for c in line)
    has_english = any(_is_english_char(c) for c in line)
    if has_chinese and has_english:
        return "🔄 中英混合"
    if has_english:
        return "🇺🇸 英文"
    if has_chinese:
        return "🇨🇳 中文"
    return "❓ 其他"


def _split_language_runs(text):
    """Split *text* into consecutive ``("ZH" | "EN", content)`` runs.

    A new run starts whenever a Chinese character follows English letters or
    vice versa. Characters that are neither (digits, punctuation, spaces)
    never start a run; they are kept in whichever run is currently open.
    Each run's content is stripped of surrounding whitespace. Text that
    contains no Chinese character and no ASCII letter yields an empty list.
    """
    runs = []
    buffer = []
    current_lang = None  # None until the first Chinese/English character

    for char in text:
        if _is_chinese_char(char):
            char_lang = "ZH"
        elif _is_english_char(char):
            char_lang = "EN"
        else:
            char_lang = None

        if char_lang is not None:
            if current_lang is not None and char_lang != current_lang:
                # Language switch: close the run collected so far.
                content = "".join(buffer).strip()
                if content:
                    runs.append((current_lang, content))
                buffer = []
            current_lang = char_lang
        buffer.append(char)

    tail = "".join(buffer).strip()
    if tail and current_lang is not None:
        runs.append((current_lang, tail))
    return runs


def _report_mixed_paragraph(ordinal, paragraph_number, text):
    """Print the detailed analysis of one mixed Chinese/English paragraph.

    Args:
        ordinal: 1-based count of mixed paragraphs found so far.
        paragraph_number: 1-based position of the paragraph in the document.
        text: The stripped paragraph text.
    """
    print(f"\n混合段落 {ordinal} (段落 {paragraph_number}):")
    print(f"完整內容: {text}")

    # Analyse the internal line structure of the paragraph.
    lines = text.split('\n')
    if len(lines) > 1:
        print(f"包含 {len(lines)} 行:")
        for j, line in enumerate(lines, start=1):
            print(f" 行 {j}: {_describe_line(line)} - {line}")

    # A zero-width space is reported as a translation-insertion marker
    # (that interpretation comes from the original author's comment).
    if '\u200b' in text:
        print(" 💡 包含零寬空格標記(翻譯插入標記)")

    # Try to separate the Chinese and English content.
    parts = _split_language_runs(text)
    if len(parts) > 1:
        print(f" 📝 內容分析 ({len(parts)} 部分):")
        for k, (lang, content) in enumerate(parts, start=1):
            # Only add an ellipsis when the preview was really truncated.
            if len(content) > _PREVIEW_LENGTH:
                preview = content[:_PREVIEW_LENGTH] + "..."
            else:
                preview = content
            print(f" {k}. [{lang}] {preview}")


def check_mixed_paragraph(test_file=None):
    """Print every paragraph of a .docx file that mixes Chinese and English.

    A paragraph counts as mixed when its stripped text contains at least one
    character in U+4E00..U+9FFF and at least one ASCII letter. For each such
    paragraph the full text, a per-line classification (when it has several
    lines), a zero-width-space notice and a Chinese/English run breakdown are
    printed, followed by a final total.

    Args:
        test_file: Value passed to ``docx.Document``. Defaults to the
            hard-coded test document this script was written for.

    Returns:
        None. All results are written to standard output. Any exception
        (python-docx not installed, missing or unreadable file, ...) is
        reported as a single failure line rather than propagated; this
        best-effort behaviour is deliberate for a diagnostic script.
    """
    print("=== 檢查中英混合段落的具體內容 ===")

    if test_file is None:
        test_file = _DEFAULT_TEST_FILE

    try:
        # Imported lazily so a missing python-docx is reported like any
        # other failure instead of breaking the import of this module.
        from docx import Document

        doc = Document(test_file)

        mixed_count = 0
        for index, para in enumerate(doc.paragraphs):
            text = para.text.strip()
            if not text:
                continue

            has_chinese = any(_is_chinese_char(c) for c in text)
            has_english = any(_is_english_char(c) for c in text)
            if not (has_chinese and has_english):
                continue

            mixed_count += 1
            _report_mixed_paragraph(mixed_count, index + 1, text)

        if mixed_count == 0:
            print("沒有找到中英混合段落")
        else:
            print(f"\n✅ 總共找到 {mixed_count} 個中英混合段落")

    except Exception as e:
        print(f"❌ 檢查失敗: {e}")
|
|
|
|
if __name__ == "__main__":
    # Run the inspection only when executed as a script, not on import.
    check_mixed_paragraph()