#!/usr/bin/env python3 # -*- coding: utf-8 -*- """ 檢查中英混合段落的具體內容 """ import sys import os # Fix encoding for Windows console if sys.stdout.encoding != 'utf-8': sys.stdout.reconfigure(encoding='utf-8') if sys.stderr.encoding != 'utf-8': sys.stderr.reconfigure(encoding='utf-8') def check_mixed_paragraph(): """檢查中英混合段落的具體內容""" print("=== 檢查中英混合段落的具體內容 ===") test_file = r"C:\Users\EGG\AppData\Local\Temp\test_docx_translation\translated_original_-OR026_9c6548ac_en_translat.docx" try: from docx import Document doc = Document(test_file) mixed_count = 0 for i, para in enumerate(doc.paragraphs): text = para.text.strip() if not text: continue has_chinese = any('\u4e00' <= c <= '\u9fff' for c in text) has_english = any(ord(c) < 128 and c.isalpha() for c in text) if has_chinese and has_english: mixed_count += 1 print(f"\n混合段落 {mixed_count} (段落 {i+1}):") print(f"完整內容: {text}") # 分析段落內部結構 lines = text.split('\n') if len(lines) > 1: print(f"包含 {len(lines)} 行:") for j, line in enumerate(lines): line_chinese = any('\u4e00' <= c <= '\u9fff' for c in line) line_english = any(ord(c) < 128 and c.isalpha() for c in line) if line_chinese and line_english: status = "🔄 中英混合" elif line_english: status = "🇺🇸 英文" elif line_chinese: status = "🇨🇳 中文" else: status = "❓ 其他" print(f" 行 {j+1}: {status} - {line}") # 檢查是否包含特殊字符(翻譯插入標記) if '\u200b' in text: print(" 💡 包含零寬空格標記(翻譯插入標記)") # 嘗試分離中英文內容 parts = [] current_part = "" current_is_chinese = None for char in text: is_chinese = '\u4e00' <= char <= '\u9fff' is_english = ord(char) < 128 and char.isalpha() if is_chinese: if current_is_chinese == False: # 切換到中文 if current_part.strip(): parts.append(("EN", current_part.strip())) current_part = char current_is_chinese = True else: current_part += char current_is_chinese = True elif is_english: if current_is_chinese == True: # 切換到英文 if current_part.strip(): parts.append(("ZH", current_part.strip())) current_part = char current_is_chinese = False else: current_part += char current_is_chinese = False else: current_part += char if current_part.strip(): if current_is_chinese: parts.append(("ZH", current_part.strip())) elif current_is_chinese == False: parts.append(("EN", current_part.strip())) if len(parts) > 1: print(f" 📝 內容分析 ({len(parts)} 部分):") for k, (lang, content) in enumerate(parts): print(f" {k+1}. [{lang}] {content[:50]}...") if mixed_count == 0: print("沒有找到中英混合段落") else: print(f"\n✅ 總共找到 {mixed_count} 個中英混合段落") except Exception as e: print(f"❌ 檢查失敗: {e}") if __name__ == "__main__": check_mixed_paragraph()