Files
Document_Translator/check_mixed_paragraph.py
2025-09-03 09:05:51 +08:00

116 lines
4.5 KiB
Python

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
檢查中英混合段落的具體內容
"""
import sys
import os
# Fix encoding for Windows console.
def _ensure_utf8(stream):
    """Switch *stream* to UTF-8 output when it supports being reconfigured.

    Streams that are ``None`` or that do not expose ``reconfigure`` (for
    example an ``io.StringIO`` installed by a caller or test harness) are
    left untouched instead of raising ``AttributeError``.
    """
    reconfigure = getattr(stream, "reconfigure", None)
    if reconfigure is None:
        return
    # Encoding names are case-insensitive and "utf8" is an accepted alias,
    # so normalise before deciding whether a change is needed.
    encoding = (getattr(stream, "encoding", None) or "").lower()
    if encoding not in ("utf-8", "utf8"):
        reconfigure(encoding="utf-8")


_ensure_utf8(sys.stdout)
_ensure_utf8(sys.stderr)
# Default document inspected when no path is supplied by the caller.
_DEFAULT_TEST_FILE = r"C:\Users\EGG\AppData\Local\Temp\test_docx_translation\translated_original_-OR026_9c6548ac_en_translat.docx"

# Maximum number of characters shown for each separated segment.
_PREVIEW_LENGTH = 50


def _is_chinese_char(char: str) -> bool:
    """Return True if *char* lies in the U+4E00..U+9FFF range."""
    return '\u4e00' <= char <= '\u9fff'


def _is_english_char(char: str) -> bool:
    """Return True if *char* is an ASCII letter."""
    return ord(char) < 128 and char.isalpha()


def _describe_line(line: str) -> str:
    """Return the status label printed for a single line of a paragraph."""
    line_chinese = any(_is_chinese_char(c) for c in line)
    line_english = any(_is_english_char(c) for c in line)
    if line_chinese and line_english:
        return "🔄 中英混合"
    if line_english:
        return "🇺🇸 英文"
    if line_chinese:
        return "🇨🇳 中文"
    return "❓ 其他"


def _split_by_language(text: str) -> list:
    """Split *text* into consecutive ``(lang, content)`` segments.

    ``lang`` is ``"ZH"`` or ``"EN"``. A new segment starts whenever the
    script of the letters switches between Chinese and English. Characters
    that are neither (digits, punctuation, whitespace) stay attached to the
    segment currently being built. Each segment's content is stripped, and
    empty segments are dropped.
    """
    parts = []
    buffer = []
    current_lang = None  # None until the first Chinese/English letter is seen
    for char in text:
        if _is_chinese_char(char):
            char_lang = "ZH"
        elif _is_english_char(char):
            char_lang = "EN"
        else:
            char_lang = None

        if char_lang is not None:
            if current_lang is not None and char_lang != current_lang:
                # Script switched: flush what was collected so far.
                segment = "".join(buffer).strip()
                if segment:
                    parts.append((current_lang, segment))
                buffer = []
            current_lang = char_lang
        buffer.append(char)

    tail = "".join(buffer).strip()
    if tail and current_lang is not None:
        parts.append((current_lang, tail))
    return parts


def check_mixed_paragraph(test_file=_DEFAULT_TEST_FILE):
    """Print every paragraph of a .docx file that mixes Chinese and English.

    For each mixed paragraph the full text is printed, followed by a per-line
    language classification (when the paragraph has several lines), a note if
    it contains a zero-width space (U+200B), and a breakdown into alternating
    Chinese/English segments.

    Args:
        test_file: Document to inspect; it is passed unchanged to
            ``docx.Document``. Defaults to the path this script was
            originally written for.

    Returns:
        None. All results are written to standard output. Any failure,
        including ``python-docx`` being unavailable or the file being
        unreadable, is reported as a single message instead of propagating.
    """
    print("=== 檢查中英混合段落的具體內容 ===")
    try:
        # Imported lazily so a missing dependency is reported through the
        # same failure message as any other problem.
        from docx import Document

        doc = Document(test_file)
        mixed_count = 0
        for index, para in enumerate(doc.paragraphs, start=1):
            text = para.text.strip()
            if not text:
                continue
            has_chinese = any(_is_chinese_char(c) for c in text)
            has_english = any(_is_english_char(c) for c in text)
            if not (has_chinese and has_english):
                continue

            mixed_count += 1
            print(f"\n混合段落 {mixed_count} (段落 {index}):")
            print(f"完整內容: {text}")

            # Analyse the internal line structure of the paragraph.
            lines = text.split('\n')
            if len(lines) > 1:
                print(f"包含 {len(lines)} 行:")
                for line_no, line in enumerate(lines, start=1):
                    print(f"{line_no}: {_describe_line(line)} - {line}")

            # A zero-width space marks text inserted by the translator.
            if '\u200b' in text:
                print(" 💡 包含零寬空格標記(翻譯插入標記)")

            # Try to separate the Chinese and English content.
            parts = _split_by_language(text)
            if len(parts) > 1:
                print(f" 📝 內容分析 ({len(parts)} 部分):")
                for part_no, (lang, content) in enumerate(parts, start=1):
                    # Only add an ellipsis when the content was truncated.
                    if len(content) > _PREVIEW_LENGTH:
                        preview = content[:_PREVIEW_LENGTH] + "..."
                    else:
                        preview = content
                    print(f" {part_no}. [{lang}] {preview}")

        if mixed_count == 0:
            print("沒有找到中英混合段落")
        else:
            print(f"\n✅ 總共找到 {mixed_count} 個中英混合段落")
    except Exception as e:
        # Top-level boundary of a diagnostic script: report and carry on.
        print(f"❌ 檢查失敗: {e}")
# Run the check only when this file is executed as a script, not on import.
if __name__ == "__main__":
    check_mixed_paragraph()