193 lines
7.5 KiB
Python
193 lines
7.5 KiB
Python
#!/usr/bin/env python3
|
|
# -*- coding: utf-8 -*-
|
|
"""
|
|
調試DOCX翻譯流程 - 詳細檢查翻譯映射和插入過程
|
|
"""
|
|
|
|
import sys
|
|
import os
|
|
from pathlib import Path
|
|
|
|
# Force UTF-8 on the standard streams so the CJK text and emoji printed by
# this script do not raise UnicodeEncodeError on consoles that default to a
# legacy code page (e.g. the Windows console).
for _stream in (sys.stdout, sys.stderr):
    # A stream may be None (pythonw) or a replacement object that has no
    # ``reconfigure`` method (IDE consoles, capture wrappers); skip those
    # instead of crashing before the script even starts.
    _encoding = (getattr(_stream, 'encoding', None) or '').lower()
    if _encoding not in ('utf-8', 'utf8') and hasattr(_stream, 'reconfigure'):
        _stream.reconfigure(encoding='utf-8')

# Prepend the ``app`` directory that sits next to this script to the module
# search path.
# NOTE(review): presumably this lets modules inside ``app/`` be imported by
# their bare name -- confirm it is still needed.
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'app'))
|
|
|
|
from app import create_app, db
|
|
from app.models.job import TranslationJob
|
|
from app.services.translation_service import DocxParser
|
|
from sqlalchemy import text
|
|
|
|
def debug_docx_translation(job_uuid: str = "9c6548ac-2f59-45f4-aade-0a9b3895bbfd") -> None:
    """Walk through the DOCX translation pipeline for one job and print diagnostics.

    The report covers, in order: the job record, plain text segment
    extraction, context-aware segment extraction, the most recent rows of the
    translation cache, how many extracted segments can be mapped to a cached
    English translation, and the paragraph statistics of the already
    generated English output file(s).

    Args:
        job_uuid: UUID of the translation job to inspect. Defaults to the
            job this script was originally written to debug.

    Returns:
        None. All findings are printed to stdout; the function returns early
        when the job, its original file, or a parsing step is unavailable.
    """
    app = create_app()

    with app.app_context():
        print("=== 調試DOCX翻譯流程 ===")

        # Look up the DOCX job under inspection.
        job = TranslationJob.query.filter_by(job_uuid=job_uuid).first()
        if not job:
            print(f"任務不存在: {job_uuid}")
            return

        print(f"任務狀態: {job.status}")
        # ``or 0`` keeps the ``,`` format spec from raising TypeError when the
        # token count has not been recorded (None).
        print(f"總tokens: {(job.total_tokens or 0):,}")
        print(f"總成本: ${job.total_cost}")
        print(f"目標語言: {job.target_languages}")

        # Locate the original (source) document on disk.
        original_file = job.get_original_file()
        if not original_file:
            print("找不到原始文件")
            return

        original_path = Path(original_file.file_path)
        source_exists = original_path.exists()
        print(f"\n📄 原始文件: {original_path}")
        print(f"存在: {source_exists}")
        if not source_exists:
            print("原始文件不存在,無法調試")
            return

        parser = DocxParser(str(original_path))

        # Step 1: plain text segment extraction.
        print("\n🔍 步驟1: 提取文本段落")
        try:
            text_segments = parser.extract_text_segments()
            print(f"提取到 {len(text_segments)} 個文本段落:")
            for i, seg in enumerate(text_segments[:5]):  # preview the first 5
                print(f" 段落 {i+1}: {seg[:60]}...")
        except Exception as e:
            print(f"❌ 文本段落提取失敗: {e}")
            return

        # Step 2: segment extraction including kind/context metadata.
        print("\n🔍 步驟2: 提取帶上下文的段落")
        try:
            segments_with_context = parser.extract_segments_with_context()
            print(f"提取到 {len(segments_with_context)} 個段落(含上下文):")
            for i, seg in enumerate(segments_with_context[:3]):  # preview the first 3
                print(f" 段落 {i+1}: {seg.kind} | {seg.text[:50]}... | {seg.ctx}")
        except Exception as e:
            print(f"❌ 帶上下文段落提取失敗: {e}")
            return

        # Step 3: read translation results back from the cache table.
        # NOTE(review): only the 10 newest cache rows per language are read
        # and they are not filtered by this job's text, so the coverage
        # figures below can look artificially low -- confirm this is intended.
        print("\n🔍 步驟3: 檢查翻譯快取中的結果")
        en_rows = _fetch_cached_translations('en')
        vi_rows = _fetch_cached_translations('vi')
        # Later (older) rows overwrite earlier ones for a repeated source
        # text, exactly as a row-by-row dict assignment would.
        en_translations = {source: translated for source, translated in en_rows}
        print(f"從快取讀取翻譯: en={len(en_rows)}, vi={len(vi_rows)}")

        # Step 4: build the (language, source text) -> translation map.
        print("\n🔍 步驟4: 檢查翻譯映射構建")
        target_language = 'en'  # only the English output is checked

        translation_map = {}
        for seg in segments_with_context:
            if seg.text in en_translations:
                value = en_translations[seg.text]
                translation_map[(target_language, seg.text)] = value
                print(f" 映射: {seg.text[:40]}... -> {value[:40]}...")

        total_segments = len(segments_with_context)
        print(f"翻譯映射總數: {len(translation_map)}")
        print(f"段落總數: {total_segments}")
        print(f"映射覆蓋率: {_coverage_percent(len(translation_map), total_segments):.1f}%")

        # Step 5: simulate the lookup the insertion logic performs per segment.
        print("\n🔍 步驟5: 檢查翻譯插入邏輯")
        segments_with_translation = 0
        segments_without_translation = 0
        for seg in segments_with_context:
            if (target_language, seg.text) in translation_map:
                segments_with_translation += 1
                print(f" ✅ 有翻譯: {seg.text[:30]}...")
            else:
                segments_without_translation += 1
                print(f" ❌ 無翻譯: {seg.text[:30]}...")

        checked = segments_with_translation + segments_without_translation
        print("\n📊 總結:")
        print(f" 有翻譯的段落: {segments_with_translation}")
        print(f" 無翻譯的段落: {segments_without_translation}")
        print(f" 翻譯覆蓋率: {_coverage_percent(segments_with_translation, checked):.1f}%")

        # Step 6: inspect the translated file(s) that were already generated.
        print("\n🔍 步驟6: 檢查已生成的翻譯文件")
        for tf in job.get_translated_files():
            if tf.language_code == target_language:
                _report_translated_file(tf)


def _coverage_percent(covered, total):
    """Return ``covered / total`` as a percentage, or 0.0 when ``total`` is 0."""
    return covered / total * 100 if total else 0.0


def _fetch_cached_translations(language):
    """Return the 10 newest ``dt_translation_cache`` rows for ``language``.

    Each row holds ``(source_text, translated_text)``, ordered newest first.
    Called from inside the application context opened by the entry point.
    """
    result = db.session.execute(
        text("""
            SELECT source_text, translated_text
            FROM dt_translation_cache
            WHERE target_language = :lang
            ORDER BY created_at DESC
            LIMIT 10
        """),
        {'lang': language},
    )
    return result.fetchall()


def _report_translated_file(tf):
    """Print size and paragraph language statistics for one translated DOCX record."""
    file_path = Path(tf.file_path)
    if not file_path.exists():
        return

    print(f"翻譯文件: {tf.filename}")
    print(f"路徑: {tf.file_path}")
    print(f"大小: {file_path.stat().st_size:,} bytes")

    try:
        # Imported lazily so a missing python-docx install is reported as a
        # read failure for this step instead of aborting the whole script.
        from docx import Document

        doc = Document(str(file_path))
        paragraphs = [p.text.strip() for p in doc.paragraphs if p.text.strip()]

        # A paragraph counts as "English" if it has any ASCII letter and as
        # "Chinese" if it has any CJK Unified Ideograph; one paragraph can be both.
        english_paras = [p for p in paragraphs if any(ord(c) < 128 and c.isalpha() for c in p)]
        chinese_paras = [p for p in paragraphs if any('\u4e00' <= c <= '\u9fff' for c in p)]

        print(f" 總段落: {len(paragraphs)}")
        print(f" 含英文段落: {len(english_paras)}")
        print(f" 含中文段落: {len(chinese_paras)}")

        if english_paras:
            print(f" 英文段落範例: {english_paras[0][:80]}...")
        else:
            print(" ❌ 沒有發現英文段落!")
    except Exception as e:
        print(f"❌ 讀取翻譯文件失敗: {e}")
|
|
|
|
if __name__ == "__main__":
    # Run the diagnostic only when this file is executed as a script,
    # not when it is imported.
    debug_docx_translation()