# Document_Translator/test_clean_docx_translation.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
使用乾淨的DOCX文件測試翻譯插入
"""

import sys
import os
import tempfile
import shutil
from pathlib import Path

# Fix encoding for the Windows console: switch the standard streams to UTF-8
# so the CJK text and emoji printed by this script can be written.
for _stream in (sys.stdout, sys.stderr):
    # A stream may be None (no console attached) or may have been replaced by
    # an object without ``reconfigure``; skip it instead of crashing on import.
    _reconfigure = getattr(_stream, 'reconfigure', None)
    _encoding = getattr(_stream, 'encoding', None) or ''
    # Encoding names are case-insensitive ('UTF-8' and 'utf-8' are the same).
    if _reconfigure is not None and _encoding.lower() != 'utf-8':
        _reconfigure(encoding='utf-8')

# Put the ``app`` directory that sits next to this file at the front of the
# module search path.
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'app'))

from app import create_app, db
from app.services.translation_service import DocxParser
from sqlalchemy import text

# Token that is ignored when looking for English text.
# NOTE(review): presumably a brand/company name that appears untranslated in
# the source documents — confirm with the document owner.
_IGNORED_LATIN_TOKEN = 'PANJIT'

# Zero-width space; the original author's comment describes it as the marker
# left behind by translation insertion.
_INSERTION_MARKER = '\u200b'


def _contains_chinese(content):
    """Return True if *content* has a character in the U+4E00–U+9FFF range."""
    return any('\u4e00' <= ch <= '\u9fff' for ch in content)


def _contains_english(content):
    """Return True if *content* has an ASCII letter outside the ignored token.

    The ignored token is removed as a whole word first. Testing each
    character against the token string instead would discard every
    uppercase P/A/N/J/I/T and miss real English text.
    """
    remainder = content.replace(_IGNORED_LATIN_TOKEN, '')
    return any(ord(ch) < 128 and ch.isalpha() for ch in remainder)


def _has_insertion_marker(paragraph):
    """Return True if any run of *paragraph* contains a zero-width space."""
    return any(_INSERTION_MARKER in (run.text or '') for run in paragraph.runs)


def _report_paragraph_analysis(paragraphs):
    """Print a language breakdown of the generated document.

    Args:
        paragraphs: Non-empty paragraph objects (each exposing ``.text`` and
            ``.runs``) from the generated document.

    Prints the classification of the first 20 paragraphs, the totals, a
    verdict based on those totals, and the Chinese-then-English adjacent
    pairs found across all paragraphs.
    """
    print("\n📄 生成文件詳細分析:")
    print(f"總段落數: {len(paragraphs)}")

    chinese_count = 0
    english_count = 0
    mixed_count = 0
    marker_count = 0

    print("\n前20段落詳情:")

    for index, para in enumerate(paragraphs[:20], start=1):
        content = para.text.strip()

        has_chinese = _contains_chinese(content)
        has_english = _contains_english(content)
        has_marker = _has_insertion_marker(para)

        if has_marker:
            marker_count += 1

        if has_chinese and has_english:
            mixed_count += 1
            lang_status = "🔄 中英混合"
        elif has_english:
            english_count += 1
            lang_status = "🇺🇸 純英文"
        elif has_chinese:
            chinese_count += 1
            lang_status = "🇨🇳 純中文"
        else:
            lang_status = "❓ 其他"

        marker_status = " 🏷️" if has_marker else ""

        print(f"  段落 {index:2d}: {lang_status}{marker_status} - {content[:70]}...")

    print("\n📊 統計結果:")
    print(f"  純中文段落: {chinese_count}")
    print(f"  純英文段落: {english_count}")
    print(f"  中英混合段落: {mixed_count}")
    print(f"  帶翻譯標記的段落: {marker_count}")

    # Verdict based on the first 20 paragraphs only.
    if english_count > 10:
        print(f"\n✅ 翻譯效果優秀 - 有 {english_count} 個純英文段落")
    elif english_count > 0:
        print(f"\n⚠️ 翻譯部分成功 - 有 {english_count} 個純英文段落")
    elif marker_count > 10:
        print(f"\n🔍 翻譯可能成功但格式問題 - 有 {marker_count} 個帶標記的段落")
    else:
        print("\n❌ 翻譯可能失敗 - 沒有明顯的英文內容")

    # Look for interleaved output: a Chinese-only paragraph immediately
    # followed by an English-only one. This runs for every verdict; when it
    # was nested under the failure branch it could only ever find pairs that
    # lie beyond the first 20 paragraphs.
    stripped = [para.text.strip() for para in paragraphs]
    alternating_pairs = 0
    for current, following in zip(stripped, stripped[1:]):
        current_is_chinese_only = (
            _contains_chinese(current) and not _contains_english(current)
        )
        following_is_english_only = (
            _contains_english(following) and not _contains_chinese(following)
        )
        if current_is_chinese_only and following_is_english_only:
            alternating_pairs += 1
            if alternating_pairs <= 3:  # show only the first three pairs
                print(f"\n  交錯對 {alternating_pairs}:")
                print(f"    中文: {current[:50]}...")
                print(f"    英文: {following[:50]}...")

    if alternating_pairs > 0:
        print(f"\n✅ 發現交錯翻譯格式！共 {alternating_pairs} 對")
    else:
        print("\n❌ 沒有發現交錯翻譯格式")


def test_clean_docx_translation():
    """Generate an English version of a clean DOCX copy and print a report.

    Steps, all inside the Flask application context:

    1. Copy the hard-coded source document into ``<tempdir>/clean_docx_test``.
    2. Print the first ten non-empty paragraphs of the copy, flagging any
       that already contain a zero-width-space marker.
    3. Call ``DocxParser.generate_translated_document`` with an empty
       translations dict, target language ``'en'`` and the temp directory.
    4. Print the generated file's size and a language analysis of it.

    This is a diagnostic: it reports through ``print`` and makes no
    assertions. A missing source file, or an exception while inspecting,
    generating or analysing, is reported and ends the run early. Exceptions
    from ``create_app``, ``shutil.copy2`` or the ``DocxParser`` constructor
    propagate to the caller.
    """
    app = create_app()

    with app.app_context():
        print("=== 使用乾淨的DOCX文件測試翻譯插入 ===")

        # Source document (machine-specific absolute path).
        original_path = r"C:\Users\EGG\WORK\data\user_scrip\TOOL\Document_translator_V2\uploads\9c6548ac-2f59-45f4-aade-0a9b3895bbfd\original_-OR026_9c6548ac.docx"

        # Report a missing source file the same way as every other failure
        # here, instead of letting shutil.copy2 raise FileNotFoundError.
        if not os.path.isfile(original_path):
            print(f"❌ 原始文件不存在: {original_path}")
            return

        # Work on a fresh copy so the source document is left untouched.
        clean_copy_dir = Path(tempfile.gettempdir()) / "clean_docx_test"
        clean_copy_dir.mkdir(exist_ok=True)
        clean_copy_path = clean_copy_dir / "clean_original.docx"

        shutil.copy2(original_path, clean_copy_path)
        print(f"✅ 創建乾淨副本: {clean_copy_path}")

        parser = DocxParser(str(clean_copy_path))

        # Show the current state of the first few paragraphs of the copy.
        try:
            from docx import Document
            doc = Document(str(clean_copy_path))

            print("\n📄 乾淨文檔當前狀態:")
            print(f"總段落數: {len(doc.paragraphs)}")

            for index, para in enumerate(doc.paragraphs[:10], start=1):
                preview = para.text.strip()
                if not preview:
                    continue
                print(f"  段落 {index}: {preview[:60]}...")

                if _has_insertion_marker(para):
                    print("    ⚠️ 此段落已包含翻譯插入標記")

        except Exception as e:
            print(f"❌ 檢查文檔狀態失敗: {e}")
            return

        print("\n🔄 測試翻譯生成...")
        try:
            # An empty dict is passed on purpose.
            # NOTE(review): the original author's comment says translations
            # are read from a cache; that lookup is not visible here.
            empty_translations = {}

            en_output_path = parser.generate_translated_document(
                empty_translations,
                'en',
                clean_copy_dir,
            )

            print(f"✅ 翻譯文件生成成功: {en_output_path}")

            output_file = Path(en_output_path)
            if output_file.exists():
                print(f"文件大小: {output_file.stat().st_size:,} bytes")

                try:
                    doc2 = Document(str(output_file))
                    paragraphs = [p for p in doc2.paragraphs if p.text.strip()]
                    _report_paragraph_analysis(paragraphs)
                except Exception as e:
                    print(f"❌ 分析生成文件失敗: {e}")
            else:
                print("❌ 生成的文件不存在")

        except Exception as e:
            print(f"❌ 翻譯生成失敗: {e}")

# Run the diagnostic only when this file is executed directly as a script.
if __name__ == "__main__":
    test_clean_docx_translation()