Files
Document_Translator/check_docx_specific_translations.py
2025-09-03 09:05:51 +08:00

122 lines
4.5 KiB
Python

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
檢查DOCX任務的具體翻譯對應
"""
import sys
import os
# Fix encoding for Windows console: the report below prints CJK text and
# emoji, which a legacy code page cannot encode.
for _stream in (sys.stdout, sys.stderr):
    # A stream may be None (no console) or replaced by an object that has no
    # ``reconfigure`` (IDE / capture wrappers); skip those instead of raising.
    _encoding = (getattr(_stream, 'encoding', None) or '').lower()
    if _encoding != 'utf-8' and hasattr(_stream, 'reconfigure'):
        _stream.reconfigure(encoding='utf-8')

# Put the sibling ``app`` directory first on the import path. The script
# location is made absolute so the entry does not depend on the current
# working directory.
sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), 'app'))
from app import create_app, db
from sqlalchemy import text
from app.services.translation_service import DocxParser
def _latest_cached_translation(source_text, target_language):
    """Return the newest cache row for *source_text* in *target_language*.

    The row is ``(translated_text, created_at)`` from
    ``dt_translation_cache``, or ``None`` when no row matches.
    """
    result = db.session.execute(text("""
        SELECT translated_text, created_at
        FROM dt_translation_cache
        WHERE source_text = :text AND target_language = :lang
        ORDER BY created_at DESC
        LIMIT 1
    """), {'text': source_text, 'lang': target_language})
    return result.fetchone()


def _contains_chinese(value):
    """Return True if *value* has a character in U+4E00..U+9FFF."""
    return any('\u4e00' <= ch <= '\u9fff' for ch in value)


def _print_translation_verdict(label, translated, has_target_script):
    """Print one indented verdict line for a cached translation.

    Any Chinese character marks the translation as failed, regardless of
    what else it contains; otherwise the verdict depends on whether
    characters of the target language were seen.
    """
    snippet = translated[:60]
    if _contains_chinese(translated):
        print(f" {label}: ❌ 仍是中文: {snippet}...")
    elif has_target_script:
        print(f" {label}: ✅ {snippet}...")
    else:
        print(f" {label}: ❓ 未知: {snippet}...")


def check_docx_specific_translations(original_path=None):
    """Check the cached translations for every text segment of a DOCX file.

    Extracts the non-blank text segments of the document, looks up the most
    recent English and Vietnamese entry for each one in
    ``dt_translation_cache``, prints a per-segment report and then overall
    coverage statistics.

    Args:
        original_path: Path of the DOCX file to inspect. Defaults to the
            upload this script was originally written to diagnose.

    Returns:
        None. All results are printed to stdout.
    """
    if original_path is None:
        original_path = r"C:\Users\EGG\WORK\data\user_scrip\TOOL\Document_translator_V2\uploads\9c6548ac-2f59-45f4-aade-0a9b3895bbfd\original_-OR026_9c6548ac.docx"

    app = create_app()
    with app.app_context():
        print("=== 檢查DOCX任務的具體翻譯對應 ===")

        # Extract the paragraphs of the original document, dropping segments
        # whose text is missing or whitespace-only.
        parser = DocxParser(original_path)
        segments = parser.extract_segments_with_context()
        text_segments = [seg.text for seg in segments if seg.text and seg.text.strip()]
        total_segments = len(text_segments)
        print(f"原始文檔有 {total_segments} 個文本段落")

        if not total_segments:
            # Nothing to look up; the percentages below would divide by zero.
            print("⚠️ 沒有可檢查的文本段落,略過翻譯檢查")
            return

        print("\n=== 檢查每個段落的翻譯狀況 ===")
        found_en = 0
        found_vi = 0
        for number, segment_text in enumerate(text_segments, start=1):
            en_row = _latest_cached_translation(segment_text, 'en')
            vi_row = _latest_cached_translation(segment_text, 'vi')

            if en_row is not None:
                found_en += 1
            if vi_row is not None:
                found_vi += 1
            status = (
                ("EN✅ " if en_row is not None else "EN❌ ")
                + ("VI✅ " if vi_row is not None else "VI❌ ")
            )
            print(f"段落 {number:3d}: {status} {segment_text[:50]}...")

            # Show the translation content when there is any. A NULL or empty
            # ``translated_text`` is skipped rather than passed to len().
            if en_row is not None and en_row[0]:
                en_text = en_row[0]
                # Heuristic: any ASCII letter counts as English.
                has_english = any(ord(ch) < 128 and ch.isalpha() for ch in en_text)
                _print_translation_verdict("EN", en_text, has_english)

            if vi_row is not None and vi_row[0]:
                vi_text = vi_row[0]
                # Heuristic: any character in U+00C0..U+1EF9 counts as
                # Vietnamese. Text with no such character is reported as
                # unknown, not as a failure.
                has_vietnamese = any('\u00C0' <= ch <= '\u1EF9' for ch in vi_text)
                _print_translation_verdict("VI", vi_text, has_vietnamese)

        print("\n📊 統計結果:")
        print(f" 總段落數: {total_segments}")
        print(f" 有英文翻譯: {found_en} ({found_en/total_segments*100:.1f}%)")
        print(f" 有越南文翻譯: {found_vi} ({found_vi/total_segments*100:.1f}%)")

        # Only English coverage drives the overall verdict.
        if found_en < total_segments * 0.5:
            print(" ❌ 翻譯覆蓋率太低,可能是翻譯流程有問題")
        else:
            print(" ✅ 翻譯覆蓋率正常")
if __name__ == "__main__":
    # Run the cache check only when executed as a script, not on import.
    check_docx_specific_translations()