Files
Document_Translator/debug_docx_translation.py
2025-09-03 09:05:51 +08:00

193 lines
7.5 KiB
Python

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
調試DOCX翻譯流程 - 詳細檢查翻譯映射和插入過程
"""
import sys
import os
from pathlib import Path
# Force UTF-8 on the standard streams so the non-ASCII report text can be
# printed on consoles whose default encoding differs (e.g. Windows).
# A stream may be None or a replacement object without ``reconfigure``;
# such streams are skipped instead of aborting the script at start-up.
for _stream in (sys.stdout, sys.stderr):
    if getattr(_stream, 'encoding', None) != 'utf-8' and hasattr(_stream, 'reconfigure'):
        _stream.reconfigure(encoding='utf-8')
del _stream
# Put the ``app`` directory that sits next to this script at the front of the
# module search path.
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'app'))
from app import create_app, db
from app.models.job import TranslationJob
from app.services.translation_service import DocxParser
from sqlalchemy import text
def _fetch_cached_translations(language):
    """Return the 10 newest cache rows for *language* as (source, translated) pairs.

    Must be called inside an active application context because it uses
    ``db.session``.
    """
    statement = text("""
        SELECT source_text, translated_text
        FROM dt_translation_cache
        WHERE target_language = :language
        ORDER BY created_at DESC
        LIMIT 10
    """)
    result = db.session.execute(statement, {"language": language})
    return [(row[0], row[1]) for row in result.fetchall()]


def _percent(part, whole):
    """Return ``part / whole`` as a percentage, or 0.0 when *whole* is zero."""
    return part / whole * 100 if whole else 0.0


def debug_docx_translation(job_uuid="9c6548ac-2f59-45f4-aade-0a9b3895bbfd"):
    """Trace the DOCX translation flow for one job and print a diagnostic report.

    The report covers, in order: the job record, plain text segment
    extraction, segment extraction with context, the newest English and
    Vietnamese rows of the translation cache, the English translation map
    built from those rows, per-segment translation coverage, and the content
    of the already generated English output files.

    Args:
        job_uuid: UUID of the translation job to inspect. Defaults to the job
            this script was originally written to debug.

    Returns:
        None. Every finding is printed to stdout; the function returns early
        when the job, its original file record, or the file on disk is
        missing, or when a segment extraction step raises.
    """
    app = create_app()
    with app.app_context():
        print("=== 調試DOCX翻譯流程 ===")

        # Look up the DOCX job under inspection.
        job = TranslationJob.query.filter_by(job_uuid=job_uuid).first()
        if not job:
            print(f"任務不存在: {job_uuid}")
            return
        print(f"任務狀態: {job.status}")
        print(f"總tokens: {job.total_tokens:,}")
        print(f"總成本: ${job.total_cost}")
        print(f"目標語言: {job.target_languages}")

        # Locate the original uploaded file.
        original_file = job.get_original_file()
        if not original_file:
            print("找不到原始文件")
            return
        original_path = Path(original_file.file_path)
        print(f"\n📄 原始文件: {original_path}")
        print(f"存在: {original_path.exists()}")
        if not original_path.exists():
            print("原始文件不存在,無法調試")
            return

        parser = DocxParser(str(original_path))

        # Step 1: plain text segment extraction.
        print(f"\n🔍 步驟1: 提取文本段落")
        try:
            text_segments = parser.extract_text_segments()
            print(f"提取到 {len(text_segments)} 個文本段落:")
            # Only the first five segments are shown.
            for number, seg in enumerate(text_segments[:5], start=1):
                print(f" 段落 {number}: {seg[:60]}...")
        except Exception as e:
            # Debug boundary: report the failure and stop the trace.
            print(f"❌ 文本段落提取失敗: {e}")
            return

        # Step 2: segment extraction with context.
        print(f"\n🔍 步驟2: 提取帶上下文的段落")
        try:
            segments_with_context = parser.extract_segments_with_context()
            print(f"提取到 {len(segments_with_context)} 個段落(含上下文):")
            # Only the first three segments are shown.
            for number, seg in enumerate(segments_with_context[:3], start=1):
                print(f" 段落 {number}: {seg.kind} | {seg.text[:50]}... | {seg.ctx}")
        except Exception as e:
            print(f"❌ 帶上下文段落提取失敗: {e}")
            return

        # Step 3: read translation results back from the cache table.
        # NOTE(review): the query is not filtered by job and is capped at the
        # 10 newest rows per language, so coverage reported below can
        # under-count for documents with more segments — confirm intent.
        print(f"\n🔍 步驟3: 檢查翻譯快取中的結果")
        en_rows = _fetch_cached_translations('en')
        vi_rows = _fetch_cached_translations('vi')
        # Later rows overwrite earlier ones for a repeated source text, the
        # same as assigning row by row.
        en_translations = dict(en_rows)
        print(f"從快取讀取翻譯: en={len(en_rows)}, vi={len(vi_rows)}")

        # Step 4: build the translation map from the cached English rows.
        print(f"\n🔍 步驟4: 檢查翻譯映射構建")
        target_language = 'en'  # only the English translation is checked
        translation_map = {}
        for seg in segments_with_context:
            if seg.text in en_translations:
                value = en_translations[seg.text]
                translation_map[(target_language, seg.text)] = value
                print(f" 映射: {seg.text[:40]}... -> {value[:40]}...")
        print(f"翻譯映射總數: {len(translation_map)}")
        print(f"段落總數: {len(segments_with_context)}")
        # _percent avoids ZeroDivisionError for a document with no segments.
        print(f"映射覆蓋率: {_percent(len(translation_map), len(segments_with_context)):.1f}%")

        # Step 5: simulate the lookup performed when inserting translations.
        print(f"\n🔍 步驟5: 檢查翻譯插入邏輯")
        segments_with_translation = 0
        segments_without_translation = 0
        for seg in segments_with_context:
            if (target_language, seg.text) in translation_map:
                segments_with_translation += 1
                print(f" ✅ 有翻譯: {seg.text[:30]}...")
            else:
                segments_without_translation += 1
                print(f" ❌ 無翻譯: {seg.text[:30]}...")
        total_segments = segments_with_translation + segments_without_translation
        print(f"\n📊 總結:")
        print(f" 有翻譯的段落: {segments_with_translation}")
        print(f" 無翻譯的段落: {segments_without_translation}")
        print(f" 翻譯覆蓋率: {_percent(segments_with_translation, total_segments):.1f}%")

        # Step 6: inspect the translated files that were already generated.
        print(f"\n🔍 步驟6: 檢查已生成的翻譯文件")
        for tf in job.get_translated_files():
            if tf.language_code != target_language:
                continue
            file_path = Path(tf.file_path)
            if not file_path.exists():
                continue
            print(f"翻譯文件: {tf.filename}")
            print(f"路徑: {tf.file_path}")
            print(f"大小: {file_path.stat().st_size:,} bytes")
            try:
                # Imported here so that a missing python-docx is reported as a
                # read failure instead of breaking the earlier steps.
                from docx import Document
                doc = Document(str(file_path))
                paragraphs = [p.text.strip() for p in doc.paragraphs if p.text.strip()]
                # "English" = contains at least one ASCII letter.
                english_paras = [p for p in paragraphs if any(ord(c) < 128 and c.isalpha() for c in p)]
                # "Chinese" = contains a character in U+4E00..U+9FFF.
                chinese_paras = [p for p in paragraphs if any('\u4e00' <= c <= '\u9fff' for c in p)]
                print(f" 總段落: {len(paragraphs)}")
                print(f" 含英文段落: {len(english_paras)}")
                print(f" 含中文段落: {len(chinese_paras)}")
                if english_paras:
                    print(f" 英文段落範例: {english_paras[0][:80]}...")
                else:
                    print(" ❌ 沒有發現英文段落!")
            except Exception as e:
                print(f"❌ 讀取翻譯文件失敗: {e}")
# Script entry point: run the diagnostic only when executed directly, not on import.
if __name__ == '__main__':
    debug_docx_translation()