Files
Document_Translator/debug_actual_insertion.py
2025-09-03 09:05:51 +08:00

213 lines
8.4 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
監控實際的DOCX翻譯插入過程
"""
import sys
import os
import tempfile
import shutil
from pathlib import Path
# Fix encoding for Windows console: force UTF-8 on stdout/stderr so the
# Chinese text and emoji printed by this script do not raise encode errors.
for _stream in (sys.stdout, sys.stderr):
    # The encoding may be None or spelled "UTF-8"; normalise before comparing.
    _encoding = (getattr(_stream, "encoding", None) or "").lower()
    # A replaced stream (e.g. a capture buffer) may not offer reconfigure().
    if _encoding != "utf-8" and hasattr(_stream, "reconfigure"):
        _stream.reconfigure(encoding="utf-8")

# Put the sibling "app" directory first on the import path.
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'app'))
from app import create_app, db
from app.services.document_processor import DocumentProcessor, _insert_docx_translations
from sqlalchemy import text as sql_text
# Default document inspected by the debug session (machine-specific path).
_DEFAULT_ORIGINAL_PATH = r"C:\Users\EGG\WORK\data\user_scrip\TOOL\Document_translator_V2\uploads\9c6548ac-2f59-45f4-aade-0a9b3895bbfd\original_-OR026_9c6548ac.docx"

# Newest cached translation for one (source text, target language) pair.
_CACHE_LOOKUP_SQL = """
    SELECT translated_text
    FROM dt_translation_cache
    WHERE source_text = :text AND target_language = :lang
    ORDER BY created_at DESC
    LIMIT 1
"""

# Display label for each paragraph category produced by _classify_text().
_KIND_LABELS = {
    'marker': "🏷️ 翻譯標記",
    'english': "🇺🇸 純英文",
    'mixed': "🔄 中英混合",
    'chinese': "🇨🇳 純中文",
    'other': "❓ 其他",
}


def _classify_text(text, runs):
    """Return the category key ('marker', 'english', 'mixed', 'chinese', 'other') for a paragraph."""
    # A zero-width space in any run is what this script treats as the
    # translation marker.
    if any('\u200b' in (run.text or '') for run in runs):
        return 'marker'
    has_chinese = any('\u4e00' <= ch <= '\u9fff' for ch in text)
    # ASCII letters count as English, except the capitals P/A/N/J/I/T
    # (presumably so the name "PANJIT" alone does not count -- TODO confirm).
    has_english = any(
        ord(ch) < 128 and ch.isalpha() and ch not in 'PANJIT' for ch in text
    )
    if has_english and not has_chinese:
        return 'english'
    if has_chinese and has_english:
        return 'mixed'
    if has_chinese:
        return 'chinese'
    return 'other'


def _scan_paragraphs(paragraphs, limit=20, verbose=False):
    """Count English-only and marker-bearing paragraphs among the first *limit*.

    Blank paragraphs are skipped. When *verbose* is true, one line per
    non-blank paragraph is printed with its category and a 60-char preview.

    Returns:
        A ``(english_only, with_marker)`` tuple of counts.
    """
    english_only = 0
    with_marker = 0
    for index, para in enumerate(paragraphs[:limit], start=1):
        text = para.text.strip()
        if not text:
            continue
        kind = _classify_text(text, para.runs)
        if kind == 'marker':
            with_marker += 1
        elif kind == 'english':
            english_only += 1
        if verbose:
            print(f" 段落 {index:2d}: {_KIND_LABELS[kind]} - {text[:60]}...")
    return english_only, with_marker


def _build_translation_map(segments, target_language):
    """Look up a cached translation for each segment and report the outcome.

    Returns:
        A dict keyed by ``(target_language, segment text)`` holding the cached
        translation, containing only the segments that had a non-empty one.
    """
    translation_map = {}
    for number, seg in enumerate(segments, start=1):
        result = db.session.execute(
            sql_text(_CACHE_LOOKUP_SQL),
            {'text': seg.text, 'lang': target_language},
        )
        row = result.fetchone()
        if row and row[0]:
            translation_map[(target_language, seg.text)] = row[0]
            print(f" 段落 {number}: ✅ 有翻譯")
            print(f" 原文: {seg.text[:50]}...")
            print(f" 譯文: {row[0][:50]}...")
        else:
            print(f" 段落 {number}: ❌ 無翻譯 - {seg.text[:50]}...")
    return translation_map


def _print_diagnosis(ok_count, saved_insertion_found, saved_marker_found):
    """Compare the reported insert count with what the saved file contains."""
    found_in_file = saved_insertion_found > 0 or saved_marker_found > 0
    if ok_count > 0 and not found_in_file:
        # Inserts were reported, yet nothing shows up after the save/reload.
        print("\n🚨 關鍵問題發現:")
        print(f" - 插入函數報告成功插入 {ok_count} 個翻譯")
        print(" - 但保存後的文檔中沒有發現任何翻譯內容或標記")
        print(" - 問題可能在於:")
        print(" 1. _append_after函數實際沒有插入")
        print(" 2. 插入位置不正確")
        print(" 3. 文檔保存過程有問題")
    elif ok_count > 0:
        print("\n✅ 插入成功!")
        print(f" - 插入函數報告: {ok_count} 個翻譯")
        print(f" - 保存後確認: {saved_insertion_found + saved_marker_found} 個翻譯段落")
    else:
        print("\n⚠️ 無翻譯插入(可能都被跳過)")


def _print_log_summary(insertion_logs):
    """Print counts of the collected log messages grouped by their tag."""
    # The tags are matched as plain substrings of the messages passed to the
    # log callback (presumably emitted by the insertion routine -- verify).
    success_logs = [log for log in insertion_logs if '[SUCCESS]' in log]
    skip_logs = [log for log in insertion_logs if '[SKIP]' in log]
    error_logs = [log for log in insertion_logs if '[ERROR]' in log]
    print("\n📝 插入日誌摘要:")
    print(f" 成功日誌: {len(success_logs)}")
    print(f" 跳過日誌: {len(skip_logs)}")
    print(f" 錯誤日誌: {len(error_logs)}")
    if success_logs:
        print(" 前3條成功日誌:")
        for log in success_logs[:3]:
            print(f" {log}")
    if error_logs:
        print(" 錯誤日誌:")
        for log in error_logs:
            print(f" {log}")


def debug_actual_insertion(original_path: str = _DEFAULT_ORIGINAL_PATH):
    """Monitor the actual DOCX translation insertion process.

    Copies the source document into a temp directory, extracts its segments,
    loads cached translations for the first five segments, runs
    ``_insert_docx_translations`` on the copy, then saves and re-reads the
    result to check whether translated paragraphs are present in the file.
    All findings are printed to stdout; nothing is returned.

    Args:
        original_path: DOCX file to inspect. Defaults to the upload path this
            script was originally written against.
    """
    import traceback  # local import: only needed for the failure report

    app = create_app()
    with app.app_context():
        print("=== 監控實際的DOCX翻譯插入過程 ===")

        # Report a missing source file clearly instead of failing inside copy2.
        if not Path(original_path).is_file():
            print(f"❌ 調試失敗: 找不到原始文件 {original_path}")
            return

        # Work on a copy so the uploaded original is never modified.
        test_dir = Path(tempfile.gettempdir()) / "debug_insertion"
        test_dir.mkdir(exist_ok=True)
        test_path = test_dir / "debug_original.docx"
        output_path = test_dir / "debug_translated.docx"
        shutil.copy2(original_path, test_path)
        print(f"✅ 創建測試副本: {test_path}")

        processor = DocumentProcessor()
        segments = processor.extract_docx_segments(str(test_path))
        print(f"📄 提取到 {len(segments)} 個段落")

        # Only the first five segments are debugged in detail.
        target_language = 'en'
        debug_segments = segments[:5]
        print("\n🔍 構建前5個段落的翻譯映射:")
        translation_map = _build_translation_map(debug_segments, target_language)
        print(f"\n翻譯映射總數: {len(translation_map)}")

        try:
            from docx import Document

            doc = Document(str(test_path))
            print("\n📊 插入前文檔狀態:")
            print(f"總段落數: {len(doc.paragraphs)}")

            # Echo every log message and keep it for the summary at the end.
            insertion_logs = []

            def detailed_log(msg: str):
                print(f"[LOG] {msg}")
                insertion_logs.append(msg)

            print("\n🔄 開始執行翻譯插入...")
            ok_count, skip_count = _insert_docx_translations(
                doc, debug_segments, translation_map, [target_language], detailed_log
            )
            print(f"\n插入結果: 成功 {ok_count}, 跳過 {skip_count}")

            # In-memory state after insertion (first 20 paragraphs, verbose).
            print("\n📊 插入後文檔狀態:")
            print(f"總段落數: {len(doc.paragraphs)}")
            insertion_found, marker_found = _scan_paragraphs(
                doc.paragraphs, verbose=True
            )
            print("\n發現的插入內容:")
            print(f" 純英文段落: {insertion_found}")
            print(f" 帶翻譯標記的段落: {marker_found}")

            doc.save(str(output_path))
            print(f"\n✅ 文檔已保存至: {output_path}")

            # Re-open the saved file to confirm the insertions survived the save.
            doc2 = Document(str(output_path))
            print("\n📊 保存後重新讀取驗證:")
            print(f"總段落數: {len(doc2.paragraphs)}")
            saved_insertion_found, saved_marker_found = _scan_paragraphs(
                doc2.paragraphs
            )
            print("保存後發現的插入內容:")
            print(f" 純英文段落: {saved_insertion_found}")
            print(f" 帶翻譯標記的段落: {saved_marker_found}")

            _print_diagnosis(ok_count, saved_insertion_found, saved_marker_found)
            _print_log_summary(insertion_logs)
        except Exception as e:
            # Top-level boundary of a debug script: report, keep the traceback.
            print(f"❌ 調試失敗: {e}")
            traceback.print_exc()
if __name__ == "__main__":
    # Run the debug session only when executed as a script, not on import.
    debug_actual_insertion()