#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Inspect the actual content of the translated DOCX files of a translation job.

Diagnostic script: looks up one TranslationJob, lists its translated files and,
for every existing ``.docx`` file, prints a preview of the first paragraphs and
a rough per-language character count.
"""

import sys
import os
from pathlib import Path

# Fix encoding for Windows console
if sys.stdout.encoding != 'utf-8':
    sys.stdout.reconfigure(encoding='utf-8')
if sys.stderr.encoding != 'utf-8':
    sys.stderr.reconfigure(encoding='utf-8')

# Put the sibling 'app' directory on the import path before the project imports.
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'app'))

from app import create_app
from app.models.job import TranslationJob

# Job inspected when no UUID is supplied by the caller.
DEFAULT_JOB_UUID = '9c6548ac-2f59-45f4-aade-0a9b3895bbfd'


def _is_chinese(ch: str) -> bool:
    """Return True if *ch* lies in the CJK Unified Ideographs block."""
    return '\u4e00' <= ch <= '\u9fff'


def _is_english(ch: str) -> bool:
    """Return True if *ch* is an ASCII letter."""
    return ord(ch) < 128 and ch.isalpha()


def _is_vietnamese(ch: str) -> bool:
    """Return True if *ch* lies in U+00C0..U+1EF9.

    NOTE: this is a coarse heuristic. The range also contains other accented
    Latin letters and several non-Latin scripts, so a hit does not prove the
    text is Vietnamese.
    """
    return '\u00C0' <= ch <= '\u1EF9'


def _count_language_chars(text: str) -> tuple:
    """Return ``(chinese, english, vietnamese)`` character counts for *text*."""
    chinese = sum(1 for ch in text if _is_chinese(ch))
    english = sum(1 for ch in text if _is_english(ch))
    vietnamese = sum(1 for ch in text if _is_vietnamese(ch))
    return chinese, english, vietnamese


def _report_paragraph(number: int, para: str) -> None:
    """Print a preview of one paragraph and which languages it seems to hold."""
    print(f"段落 {number}: {para[:100]}...")

    # A paragraph holding several lines may be in the interleaved
    # (source line / translated line) format.
    lines = para.split('\n')
    if len(lines) > 1:
        print(f" -> 多行內容(可能是交錯格式): {len(lines)} 行")
        for line_no, line in enumerate(lines[:3], start=1):  # first 3 lines only
            print(f" 行{line_no}: {line[:60]}...")

    # Check whether the paragraph contains English or Vietnamese characters.
    has_english = any(_is_english(ch) for ch in para)
    has_vietnamese = any(_is_vietnamese(ch) for ch in para)
    print(f" -> 包含英文: {has_english}")
    print(f" -> 包含越南文: {has_vietnamese}")
    print(" ---")


def _report_language_distribution(paragraphs: list) -> None:
    """Print character counts per language and a verdict for the whole file."""
    all_text = ' '.join(paragraphs)
    chinese_chars, english_chars, vietnamese_chars = _count_language_chars(all_text)

    print("\n📊 文件語言分析:")
    print(f" 中文字符: {chinese_chars}")
    print(f" 英文字符: {english_chars}")
    print(f" 越南文字符: {vietnamese_chars}")

    has_translation = english_chars > 0 or vietnamese_chars > 0
    if chinese_chars > 0 and not has_translation:
        print(" ❌ 只有中文,沒有翻譯內容!")
    elif chinese_chars > 0 and has_translation:
        print(" ✅ 包含中文和翻譯內容,可能是交錯格式")
    else:
        print(" ⚠️ 文件內容異常")


def _inspect_docx(file_path: Path) -> None:
    """Open the DOCX at *file_path* and print its paragraph/language report.

    Any failure (python-docx missing, corrupt file, ...) is reported on stdout
    instead of being raised, so one bad file does not stop the whole check.
    """
    try:
        # Imported lazily so a missing python-docx is reported like any other
        # read failure rather than preventing the script from starting.
        from docx import Document

        doc = Document(str(file_path))

        # Keep only paragraphs that still have text after stripping whitespace.
        stripped = (p.text.strip() for p in doc.paragraphs)
        paragraphs = [text for text in stripped if text]
        print(f"總段落數: {len(paragraphs)}")

        if not paragraphs:
            return

        print("\n📄 前5段內容檢查:")
        for number, para in enumerate(paragraphs[:5], start=1):
            _report_paragraph(number, para)

        # Language distribution over the whole document.
        _report_language_distribution(paragraphs)

    except Exception as e:
        print(f"❌ 讀取DOCX文件失敗: {e}")


def check_docx_content(job_uuid: str = DEFAULT_JOB_UUID) -> None:
    """Print a content report for every translated file of one job.

    Args:
        job_uuid: UUID of the TranslationJob to inspect. Defaults to
            ``DEFAULT_JOB_UUID``, the job this script was written to check.

    Returns:
        None. All results are printed to stdout. If the job does not exist a
        message is printed and the function returns early.
    """
    app = create_app()

    with app.app_context():
        print("=== 檢查DOCX翻譯文件內容 ===")

        job = TranslationJob.query.filter_by(job_uuid=job_uuid).first()
        if not job:
            print("DOCX任務不存在")
            return

        print(f"任務狀態: {job.status}")
        print(f"總tokens: {job.total_tokens}")
        print(f"總成本: ${job.total_cost}")
        print(f"目標語言: {job.target_languages}")

        translated_files = job.get_translated_files()
        print(f"\n📁 翻譯檔案數: {len(translated_files)}")

        for tf in translated_files:
            file_path = Path(tf.file_path)
            exists = file_path.exists()
            print(f"\n【檢查】 {tf.filename} ({tf.language_code})")
            print(f"路徑: {tf.file_path}")
            print(f"存在: {exists}")

            # A missing file has no size to report; calling stat() on it would
            # raise FileNotFoundError and abort the check of remaining files.
            if not exists:
                continue

            print(f"大小: {file_path.stat().st_size:,} bytes")

            if tf.filename.endswith('.docx'):
                _inspect_docx(file_path)


if __name__ == "__main__":
    # An optional first CLI argument selects a different job UUID.
    if len(sys.argv) > 1:
        check_docx_content(sys.argv[1])
    else:
        check_docx_content()