# Document_Translator/check_docx_content.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
檢查DOCX翻譯文件的實際內容
"""

import sys
import os
from pathlib import Path

# Force UTF-8 on the standard streams so the CJK text and emoji printed by this
# script do not fail on a console that uses another encoding (e.g. Windows).
for _stream in (sys.stdout, sys.stderr):
    # A stream may be None or may have been replaced by an object without
    # reconfigure() (e.g. io.StringIO); skip those instead of crashing.
    _reconfigure = getattr(_stream, 'reconfigure', None)
    # Normalise spellings such as 'UTF-8' / 'utf8' before comparing.
    _encoding = (getattr(_stream, 'encoding', None) or '').lower().replace('-', '')
    if _reconfigure is not None and _encoding != 'utf8':
        _reconfigure(encoding='utf-8')

# Put the `app` directory that sits next to this script at the front of the
# import search path.
# NOTE(review): presumably so modules inside `app` resolve by bare name — confirm.
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'app'))

from app import create_app
from app.models.job import TranslationJob

def _is_ascii_letter(ch):
    """Return True when *ch* is an ASCII letter (the "English" heuristic)."""
    return ord(ch) < 128 and ch.isalpha()


def _is_vietnamese_range(ch):
    """Return True when *ch* lies in U+00C0..U+1EF9 (the "Vietnamese" heuristic).

    NOTE(review): this range also contains non-Vietnamese scripts (e.g. Greek
    and Cyrillic), so the result is only a rough indicator.
    """
    return '\u00C0' <= ch <= '\u1EF9'


def _is_cjk(ch):
    """Return True when *ch* lies in U+4E00..U+9FFF (the "Chinese" heuristic)."""
    return '\u4e00' <= ch <= '\u9fff'


def _report_docx(file_path):
    """Print a paragraph preview and a language breakdown for one DOCX file.

    Any exception (including a missing ``python-docx`` install) propagates to
    the caller, which reports it.
    """
    # Imported lazily so that a missing python-docx is reported per file
    # instead of preventing the script from starting.
    from docx import Document

    doc = Document(str(file_path))

    # Keep only non-empty paragraphs, stripped of surrounding whitespace.
    paragraphs = [text for text in (p.text.strip() for p in doc.paragraphs) if text]
    print(f"總段落數: {len(paragraphs)}")
    if not paragraphs:
        return

    print("\n📄 前5段內容檢查:")
    for index, para in enumerate(paragraphs[:5], start=1):
        print(f"段落 {index}: {para[:100]}...")

        # A paragraph containing line breaks may hold source text and
        # translation interleaved; show its first three lines.
        lines = para.split('\n')
        if len(lines) > 1:
            print(f"  -> 多行內容（可能是交錯格式）: {len(lines)} 行")
            for line_no, line in enumerate(lines[:3], start=1):
                print(f"    行{line_no}: {line[:60]}...")

        # Per-paragraph check for English / Vietnamese characters.
        has_english = any(_is_ascii_letter(c) for c in para)
        has_vietnamese = any(_is_vietnamese_range(c) for c in para)

        print(f"  -> 包含英文: {has_english}")
        print(f"  -> 包含越南文: {has_vietnamese}")
        print("  ---")

    # Language distribution over the whole document.
    all_text = ' '.join(paragraphs)
    chinese_chars = sum(1 for c in all_text if _is_cjk(c))
    english_chars = sum(1 for c in all_text if _is_ascii_letter(c))
    vietnamese_chars = sum(1 for c in all_text if _is_vietnamese_range(c))

    print("\n📊 文件語言分析:")
    print(f"  中文字符: {chinese_chars}")
    print(f"  英文字符: {english_chars}")
    print(f"  越南文字符: {vietnamese_chars}")

    has_translation = english_chars > 0 or vietnamese_chars > 0
    if chinese_chars > 0 and not has_translation:
        print("  ❌ 只有中文，沒有翻譯內容！")
    elif chinese_chars > 0 and has_translation:
        print("  ✅ 包含中文和翻譯內容，可能是交錯格式")
    else:
        print("  ⚠️ 文件內容異常")


def check_docx_content(job_uuid='9c6548ac-2f59-45f4-aade-0a9b3895bbfd'):
    """Print diagnostics about the translated DOCX files of one translation job.

    Looks up the ``TranslationJob`` with the given UUID inside an application
    context, prints its summary fields, and for every translated file prints
    its path, existence and size. Existing ``.docx`` files are additionally
    opened and their paragraphs and character-class counts are printed.

    Args:
        job_uuid: UUID of the job to inspect. Defaults to the UUID this script
            was originally hard-coded to check.

    Returns:
        None. All results are written to standard output.
    """
    app = create_app()

    with app.app_context():
        print("=== 檢查DOCX翻譯文件內容 ===")

        # Look up the requested DOCX job.
        job = TranslationJob.query.filter_by(job_uuid=job_uuid).first()
        if not job:
            print("DOCX任務不存在")
            return

        print(f"任務狀態: {job.status}")
        print(f"總tokens: {job.total_tokens}")
        print(f"總成本: ${job.total_cost}")
        print(f"目標語言: {job.target_languages}")

        translated_files = job.get_translated_files()
        print(f"\n📁 翻譯檔案數: {len(translated_files)}")

        for tf in translated_files:
            file_path = Path(tf.file_path)
            exists = file_path.exists()
            print(f"\n【檢查】 {tf.filename} ({tf.language_code})")
            print(f"路徑: {tf.file_path}")
            print(f"存在: {exists}")

            # stat() raises FileNotFoundError for a missing file; skip the
            # size and content checks so the remaining files still get checked.
            if not exists:
                continue
            print(f"大小: {file_path.stat().st_size:,} bytes")

            if not tf.filename.endswith('.docx'):
                continue

            # Script-level boundary: report any failure for this file and
            # carry on with the next one.
            try:
                _report_docx(file_path)
            except Exception as e:
                print(f"❌ 讀取DOCX文件失敗: {e}")

# Run the check only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    check_docx_content()