Files
Document_Translator/debug_docx_insertion_path.py
2025-09-03 09:05:51 +08:00

153 lines
6.0 KiB
Python

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
調試DOCX翻譯插入的實際執行路徑
"""
import sys
import os
def force_utf8_stream(stream):
    """Switch a text stream to UTF-8 if it is not already using it.

    Streams that cannot be reconfigured are left untouched instead of
    raising: ``None`` (no console attached) and replacement objects without
    a ``reconfigure`` method, such as ``io.StringIO``.

    Args:
        stream: A text stream such as ``sys.stdout``, or ``None``.
    """
    reconfigure = getattr(stream, "reconfigure", None)
    if reconfigure is None:
        return
    current = getattr(stream, "encoding", None) or ""
    # Compare case-insensitively so "UTF-8" is not needlessly reconfigured.
    if current.lower() != 'utf-8':
        reconfigure(encoding='utf-8')


# Fix encoding for Windows console
force_utf8_stream(sys.stdout)
force_utf8_stream(sys.stderr)

# Put the ``app`` directory located next to this script at the front of the
# module search path before the project imports that follow.
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'app'))
from app import create_app, db
from app.services.translation_service import DocxParser
from sqlalchemy import text
# Document inspected when the caller does not supply a path.
_DEFAULT_DOCX_PATH = r"C:\Users\EGG\WORK\data\user_scrip\TOOL\Document_translator_V2\uploads\9c6548ac-2f59-45f4-aade-0a9b3895bbfd\original_-OR026_9c6548ac.docx"

# Display labels for the segment-type listing, keyed by category.
_SEGMENT_LABELS = {
    "table": "🏢 表格段落",
    "normal": "📄 普通段落",
    "sdt": "📋 SDT段落",
}

# Display labels for the insertion-path listing, keyed by category.
_PATH_LABELS = {
    "table": "🏢 表格路徑",
    "normal": "📄 普通段落路徑",
    "sdt": "📋 SDT路徑",
    "other": "❓ 其他路徑",
}


def _classify_ref(ref):
    """Return 'table', 'normal', 'sdt' or 'other' for a paragraph segment's ref."""
    # Imported here so both reporting passes share one definition instead of
    # relying on an import buried inside a loop branch.
    from docx.table import _Cell
    from docx.text.paragraph import Paragraph

    if isinstance(ref, Paragraph):
        return "table" if isinstance(ref._parent, _Cell) else "normal"
    tag = getattr(ref, 'tag', None)
    # Guard against a non-string ``tag`` attribute before calling endswith.
    if isinstance(tag, str) and tag.endswith('}sdt'):
        return "sdt"
    return "other"


def _report_segment_types(segments):
    """Print the type of each given segment followed by per-type totals."""
    counts = {"table": 0, "normal": 0, "sdt": 0, "other": 0}
    print("\n📊 段落類型分析:")
    for number, seg in enumerate(segments, start=1):
        if seg.kind == "para":
            category = _classify_ref(seg.ref)
            if category == "other":
                label = f"❓ 其他段落 ({type(seg.ref)})"
            else:
                label = _SEGMENT_LABELS[category]
        else:
            # Anything that is not a paragraph segment is counted as "other".
            category = "other"
            label = f"🔧 非段落 ({seg.kind})"
        counts[category] += 1
        print(f" 段落 {number:2d}: {label} - {seg.text[:50]}...")

    print("\n統計結果 (前20個段落):")
    print(f" 表格段落: {counts['table']}")
    print(f" 普通段落: {counts['normal']}")
    print(f" SDT段落: {counts['sdt']}")
    print(f" 其他類型: {counts['other']}")


def _report_translation_paths(segments, target_language):
    """Look up a cached translation per paragraph segment and tally its path."""
    # Built once; the source text and language are passed as bound parameters.
    lookup = text("""
        SELECT translated_text
        FROM dt_translation_cache
        WHERE source_text = :text AND target_language = :lang
        ORDER BY created_at DESC
        LIMIT 1
    """)
    stats = {"table": 0, "normal": 0, "sdt": 0, "other": 0, "skipped": 0}

    print("\n🔍 檢查有翻譯的段落執行路徑:")
    for number, seg in enumerate(segments, start=1):
        if seg.kind != "para":
            # Non-paragraph segments are not looked up and not counted.
            continue
        row = db.session.execute(
            lookup, {'text': seg.text, 'lang': target_language}
        ).fetchone()
        translated = row[0] if row else None
        if not translated:
            stats["skipped"] += 1
            print(f" 段落 {number:2d}: ❌ 無翻譯 - {seg.text[:30]}...")
            continue

        path = _classify_ref(seg.ref)
        stats[path] += 1
        print(f" 段落 {number:2d}: {_PATH_LABELS[path]} ✅ 有翻譯")
        print(f" 原文: {seg.text[:50]}...")
        print(f" 譯文: {translated[:50]}...")
    return stats


def _report_path_summary(path_stats):
    """Print the per-path totals and which of table/normal paths dominates."""
    print("\n📈 執行路徑統計:")
    print(f" 表格路徑: {path_stats['table']} 段落")
    print(f" 普通段落路徑: {path_stats['normal']} 段落")
    print(f" SDT路徑: {path_stats['sdt']} 段落")
    print(f" 其他路徑: {path_stats['other']} 段落")
    print(f" 跳過(無翻譯): {path_stats['skipped']} 段落")

    total_with_translation = sum(
        path_stats[key] for key in ('table', 'normal', 'sdt', 'other')
    )
    if total_with_translation <= 0:
        return

    print("\n💡 關鍵分析:")
    if path_stats['table'] > path_stats['normal']:
        print(f" ⚠️ 大多數段落走表格路徑 ({path_stats['table']}/{total_with_translation})")
        print(" 可能問題: 表格插入邏輯有問題")
    elif path_stats['normal'] > path_stats['table']:
        print(f" ✅ 大多數段落走普通段落路徑 ({path_stats['normal']}/{total_with_translation})")
        print(" 可能問題: 普通段落插入邏輯有問題")
    else:
        print(" 📊 表格和普通段落路徑數量相當")


def debug_docx_insertion_path(original_path=None, target_language="en"):
    """Report which insertion path each translated DOCX segment would take.

    Creates the application, and inside its app context parses the document
    with ``DocxParser``, then prints three reports:

    1. The type (table / normal / SDT / other) of the first 20 segments.
    2. For paragraph segments among the first 10, whether a row exists in
       ``dt_translation_cache`` for the segment text and target language,
       and which path (table / normal / SDT / other) the segment falls into.
    3. Totals per path and a note on whether the table or the normal
       paragraph path is more common.

    Args:
        original_path: Path of the DOCX file to inspect. Defaults to the
            sample upload path hard-coded in this script.
        target_language: Value matched against ``target_language`` in the
            translation cache. Defaults to ``"en"``.
    """
    if original_path is None:
        original_path = _DEFAULT_DOCX_PATH

    app = create_app()
    with app.app_context():
        print("=== 調試DOCX翻譯插入的實際執行路徑 ===")

        parser = DocxParser(original_path)
        segments = parser.extract_segments_with_context()
        print(f"文檔總段落數: {len(segments)}")

        # Only a leading sample is inspected: 20 segments for the type
        # listing, 10 for the database-backed path check.
        _report_segment_types(segments[:20])
        path_stats = _report_translation_paths(segments[:10], target_language)
        _report_path_summary(path_stats)
# Run the diagnostic only when this file is executed directly as a script.
if __name__ == "__main__":
    debug_docx_insertion_path()