#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Debug the DOCX translation pipeline.

Inspects, for a single translation job, the segment extraction, the cached
translations, the translation-map construction and the generated output
file, printing diagnostics at every step.
"""

import sys
import os
from pathlib import Path

# Fix encoding for Windows console
if sys.stdout.encoding != 'utf-8':
    sys.stdout.reconfigure(encoding='utf-8')
if sys.stderr.encoding != 'utf-8':
    sys.stderr.reconfigure(encoding='utf-8')

sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'app'))

from app import create_app, db
from app.models.job import TranslationJob
from app.services.translation_service import DocxParser
from sqlalchemy import text

# Job inspected when no UUID is supplied by the caller or on the command line.
DEFAULT_JOB_UUID = "9c6548ac-2f59-45f4-aade-0a9b3895bbfd"


def _coverage_percent(covered, total):
    """Return ``covered / total`` as a percentage, or 0.0 when *total* is 0."""
    # A document with no extractable segments must not crash the report.
    return covered / total * 100 if total else 0.0


def _load_cached_translations(language):
    """Read the 10 most recent cache rows for *language*.

    Args:
        language: Value matched against ``dt_translation_cache.target_language``.

    Returns:
        A ``(mapping, row_count)`` tuple where *mapping* maps ``source_text``
        to ``translated_text`` and *row_count* is the number of rows read.
    """
    rows = db.session.execute(
        text("""
            SELECT source_text, translated_text
            FROM dt_translation_cache
            WHERE target_language = :language
            ORDER BY created_at DESC
            LIMIT 10
        """),
        {"language": language},
    ).fetchall()

    mapping = {}
    for row in rows:
        # NOTE(review): rows arrive newest first, so for a duplicated
        # source_text the OLDER row overwrites the newer one. Kept as in the
        # original script; confirm whether newest-wins is the intended rule.
        mapping[row[0]] = row[1]
    return mapping, len(rows)


def _inspect_translated_file(tf):
    """Print size and paragraph statistics for one translated output file.

    Args:
        tf: Translated-file record; ``file_path`` and ``filename`` are read.
    """
    file_path = Path(tf.file_path)
    if not file_path.exists():
        # Previously skipped silently, which hid missing output files.
        print(f"翻譯文件不存在: {file_path}")
        return

    print(f"翻譯文件: {tf.filename}")
    print(f"路徑: {tf.file_path}")
    print(f"大小: {file_path.stat().st_size:,} bytes")

    # Inspect the file content.
    try:
        # Imported lazily inside the try so a missing python-docx is reported
        # like any other read failure instead of aborting the whole run.
        from docx import Document

        doc = Document(str(file_path))
        paragraphs = [p.text.strip() for p in doc.paragraphs if p.text.strip()]
        # A paragraph counts as "English" if it has any ASCII letter and as
        # "Chinese" if it has any CJK Unified Ideograph; it may count as both.
        english_paras = [
            p for p in paragraphs
            if any(ord(c) < 128 and c.isalpha() for c in p)
        ]
        chinese_paras = [
            p for p in paragraphs
            if any('\u4e00' <= c <= '\u9fff' for c in p)
        ]

        print(f"  總段落: {len(paragraphs)}")
        print(f"  含英文段落: {len(english_paras)}")
        print(f"  含中文段落: {len(chinese_paras)}")

        if english_paras:
            print(f"  英文段落範例: {english_paras[0][:80]}...")
        else:
            print("  ❌ 沒有發現英文段落!")
    except Exception as e:
        print(f"❌ 讀取翻譯文件失敗: {e}")


def debug_docx_translation(job_uuid=DEFAULT_JOB_UUID):
    """Debug the DOCX translation flow for one job and print a report.

    Steps: (1) extract plain text segments, (2) extract segments with
    context, (3) read recent 'en'/'vi' rows from the translation cache,
    (4) build the English translation map, (5) count segments with and
    without a translation, (6) inspect the generated English output file.

    Args:
        job_uuid: UUID of the ``TranslationJob`` to inspect. Defaults to
            ``DEFAULT_JOB_UUID`` so existing zero-argument callers still work.

    Returns:
        None. All findings are printed; the function returns early when the
        job, its original file, or a segment extraction step is unavailable.
    """
    app = create_app()

    with app.app_context():
        print("=== 調試DOCX翻譯流程 ===")

        # Look up the requested DOCX job.
        job = TranslationJob.query.filter_by(job_uuid=job_uuid).first()
        if not job:
            print(f"任務不存在: {job_uuid}")
            return

        print(f"任務狀態: {job.status}")
        print(f"總tokens: {job.total_tokens:,}")
        print(f"總成本: ${job.total_cost}")
        print(f"目標語言: {job.target_languages}")

        # Locate the original uploaded file.
        original_file = job.get_original_file()
        if not original_file:
            print("找不到原始文件")
            return

        original_path = Path(original_file.file_path)
        print(f"\n📄 原始文件: {original_path}")
        print(f"存在: {original_path.exists()}")

        if not original_path.exists():
            print("原始文件不存在,無法調試")
            return

        # Create the DOCX parser.
        parser = DocxParser(str(original_path))

        # 1. Check plain text segment extraction.
        print("\n🔍 步驟1: 提取文本段落")
        try:
            text_segments = parser.extract_text_segments()
            print(f"提取到 {len(text_segments)} 個文本段落:")
            for i, seg in enumerate(text_segments[:5]):  # show the first 5
                print(f"  段落 {i+1}: {seg[:60]}...")
        except Exception as e:
            print(f"❌ 文本段落提取失敗: {e}")
            return

        # 2. Check segment extraction with context.
        print("\n🔍 步驟2: 提取帶上下文的段落")
        try:
            segments_with_context = parser.extract_segments_with_context()
            print(f"提取到 {len(segments_with_context)} 個段落(含上下文):")
            for i, seg in enumerate(segments_with_context[:3]):  # show the first 3
                print(f"  段落 {i+1}: {seg.kind} | {seg.text[:50]}... | {seg.ctx}")
        except Exception as e:
            print(f"❌ 帶上下文段落提取失敗: {e}")
            return

        # 3. Check translation results, read from the cache.
        print("\n🔍 步驟3: 檢查翻譯快取中的結果")
        en_translations, en_count = _load_cached_translations('en')
        # Only the row count is reported for Vietnamese; its mapping is unused.
        _, vi_count = _load_cached_translations('vi')
        print(f"從快取讀取翻譯: en={en_count}, vi={vi_count}")

        # 4. Check translation-map construction using the cached data.
        print("\n🔍 步驟4: 檢查翻譯映射構建")
        target_language = 'en'  # inspect the English translation

        translation_map = {}
        for seg in segments_with_context:
            # Map the segment only if the cache holds an English translation.
            if seg.text in en_translations:
                value = en_translations[seg.text]
                translation_map[(target_language, seg.text)] = value
                print(f"  映射: {seg.text[:40]}... -> {value[:40]}...")

        total_segments = len(segments_with_context)
        print(f"翻譯映射總數: {len(translation_map)}")
        print(f"段落總數: {total_segments}")
        print(f"映射覆蓋率: {_coverage_percent(len(translation_map), total_segments):.1f}%")

        # 5. Simulate the check the insertion logic performs per segment.
        print("\n🔍 步驟5: 檢查翻譯插入邏輯")
        segments_with_translation = 0
        segments_without_translation = 0

        for seg in segments_with_context:
            if (target_language, seg.text) in translation_map:
                segments_with_translation += 1
                print(f"  ✅ 有翻譯: {seg.text[:30]}...")
            else:
                segments_without_translation += 1
                print(f"  ❌ 無翻譯: {seg.text[:30]}...")

        checked = segments_with_translation + segments_without_translation
        print("\n📊 總結:")
        print(f"  有翻譯的段落: {segments_with_translation}")
        print(f"  無翻譯的段落: {segments_without_translation}")
        print(f"  翻譯覆蓋率: {_coverage_percent(segments_with_translation, checked):.1f}%")

        # 6. Check the content of the already generated translated file(s).
        print("\n🔍 步驟6: 檢查已生成的翻譯文件")
        for tf in job.get_translated_files():
            if tf.language_code == target_language:
                _inspect_translated_file(tf)


if __name__ == "__main__":
    # An optional first CLI argument overrides the default job UUID.
    if len(sys.argv) > 1:
        debug_docx_translation(sys.argv[1])
    else:
        debug_docx_translation()