Document_Translator/check_mixed_paragraph.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
檢查中英混合段落的具體內容
"""

import sys
import os

# Fix encoding for Windows console: this script prints CJK text and emoji,
# which a non-UTF-8 console encoding cannot represent.
for _stream in (sys.stdout, sys.stderr):
    # A stream can be None or a replacement object that has no
    # ``reconfigure`` method (redirected/captured output); skip it rather
    # than crash at import time.
    _reconfigure = getattr(_stream, "reconfigure", None)
    # Compare case-insensitively so an encoding reported as "UTF-8" is
    # recognised as already correct.
    _encoding = (getattr(_stream, "encoding", None) or "").lower()
    if _reconfigure is not None and _encoding != "utf-8":
        _reconfigure(encoding="utf-8")
# Do not leave the loop temporaries behind as module attributes.
del _stream, _reconfigure, _encoding

# Default input: the document path that was originally hard-coded in the check.
_DEFAULT_TEST_FILE = r"C:\Users\EGG\AppData\Local\Temp\test_docx_translation\translated_original_-OR026_9c6548ac_en_translat.docx"

# Maximum number of characters of a segment shown in the content analysis.
_PREVIEW_LENGTH = 50

# Label printed for a line, keyed by (has_chinese, has_english).
_LINE_STATUS = {
    (True, True): "🔄 中英混合",
    (False, True): "🇺🇸 英文",
    (True, False): "🇨🇳 中文",
    (False, False): "❓ 其他",
}


def _is_chinese_char(char: str) -> bool:
    """Return True if *char* lies in the U+4E00..U+9FFF code point range."""
    return '\u4e00' <= char <= '\u9fff'


def _is_english_char(char: str) -> bool:
    """Return True if *char* is an ASCII letter."""
    return ord(char) < 128 and char.isalpha()


def _classify_text(text: str) -> tuple:
    """Return ``(has_chinese, has_english)`` for *text*."""
    return (
        any(_is_chinese_char(c) for c in text),
        any(_is_english_char(c) for c in text),
    )


def _split_by_language(text: str) -> list:
    """Split *text* into consecutive ``("ZH" | "EN", content)`` segments.

    A new segment starts whenever the script of the letters switches between
    Chinese and English.  Characters that are neither (digits, punctuation,
    whitespace, ...) stay in the segment that is currently being built.
    Each segment's content is stripped; text containing no Chinese or English
    letter at all yields an empty list.
    """
    parts = []
    buffer = []
    current_lang = None  # None until the first Chinese/English letter is seen

    for char in text:
        if _is_chinese_char(char):
            char_lang = "ZH"
        elif _is_english_char(char):
            char_lang = "EN"
        else:
            char_lang = None  # neutral character: never triggers a switch

        if char_lang is not None:
            if current_lang is not None and char_lang != current_lang:
                # Language switch: close the segment built so far.
                segment = "".join(buffer).strip()
                if segment:
                    parts.append((current_lang, segment))
                buffer = []
            current_lang = char_lang

        buffer.append(char)

    # Flush the trailing segment, if any letter was seen at all.
    if current_lang is not None:
        segment = "".join(buffer).strip()
        if segment:
            parts.append((current_lang, segment))

    return parts


def _report_mixed_paragraph(ordinal: int, paragraph_number: int, text: str) -> None:
    """Print the detailed analysis of one mixed Chinese/English paragraph.

    Args:
        ordinal: 1-based counter of mixed paragraphs found so far.
        paragraph_number: 1-based position of the paragraph in the document.
        text: Stripped paragraph text.
    """
    print(f"\n混合段落 {ordinal} (段落 {paragraph_number}):")
    print(f"完整內容: {text}")

    # Analyse the internal line structure of the paragraph.
    lines = text.split('\n')
    if len(lines) > 1:
        print(f"包含 {len(lines)} 行:")
        for line_number, line in enumerate(lines, start=1):
            status = _LINE_STATUS[_classify_text(line)]
            print(f"  行 {line_number}: {status} - {line}")

    # Check for the special character used as a translation insertion marker.
    if '\u200b' in text:
        print("  💡 包含零寬空格標記（翻譯插入標記）")

    # Try to separate the Chinese and English content.
    parts = _split_by_language(text)
    if len(parts) > 1:
        print(f"  📝 內容分析 ({len(parts)} 部分):")
        for part_number, (lang, content) in enumerate(parts, start=1):
            # Only mark the preview as truncated when it actually is.
            if len(content) > _PREVIEW_LENGTH:
                preview = content[:_PREVIEW_LENGTH] + "..."
            else:
                preview = content
            print(f"    {part_number}. [{lang}] {preview}")


def check_mixed_paragraph(test_file=_DEFAULT_TEST_FILE) -> None:
    """Print every paragraph of a .docx file that mixes Chinese and English.

    A paragraph counts as mixed when its stripped text contains at least one
    character in U+4E00..U+9FFF and at least one ASCII letter.  For each such
    paragraph the full text, a per-line classification (when the text has
    more than one line), a note about zero-width spaces, and a split into
    Chinese/English segments are printed, followed by a final summary.

    Args:
        test_file: Document to inspect, passed unchanged to ``docx.Document``.
            Defaults to the path that was originally hard-coded here.

    Returns:
        None.  All results are written to stdout.  Any exception (missing
        ``docx`` package, unreadable file, ...) is reported on stdout instead
        of being raised.
    """
    print("=== 檢查中英混合段落的具體內容 ===")

    # Top-level boundary of a diagnostic script: report failures, do not raise.
    try:
        # Imported lazily so a missing python-docx is reported like any
        # other failure instead of breaking the import of this module.
        from docx import Document
        doc = Document(test_file)

        mixed_count = 0
        for paragraph_number, para in enumerate(doc.paragraphs, start=1):
            text = para.text.strip()
            if not text:
                continue

            has_chinese, has_english = _classify_text(text)
            if not (has_chinese and has_english):
                continue

            mixed_count += 1
            _report_mixed_paragraph(mixed_count, paragraph_number, text)

        if mixed_count == 0:
            print("沒有找到中英混合段落")
        else:
            print(f"\n✅ 總共找到 {mixed_count} 個中英混合段落")

    except Exception as e:
        print(f"❌ 檢查失敗: {e}")

# Run the check only when this file is executed as a script, not on import.
if __name__ == "__main__":
    check_mixed_paragraph()