#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Check the concrete source-to-translation mapping of a DOCX task.

Diagnostic script: extracts the text segments of one original DOCX file
and reports, per segment, whether an English and a Vietnamese translation
exist in the dt_translation_cache table and whether the cached text looks
like the expected language.
"""

import sys
import os

# Fix encoding for the Windows console so CJK output does not crash.
# Compare case-insensitively (platforms report 'UTF-8' or 'utf-8'), treat a
# None encoding as "needs fixing", and skip streams that do not support
# reconfigure() (e.g. stdout replaced by a non-TextIOWrapper in IDEs/tests).
for _stream in (sys.stdout, sys.stderr):
    if (_stream.encoding or '').lower() != 'utf-8' and hasattr(_stream, 'reconfigure'):
        _stream.reconfigure(encoding='utf-8')

# Make the local 'app' package importable before the project imports below.
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'app'))

from app import create_app, db
from sqlalchemy import text
from app.services.translation_service import DocxParser
def _lookup_latest_translation(segment_text, lang):
    """Return the newest cached translation row for *segment_text*.

    Queries dt_translation_cache for ``(translated_text, created_at)`` rows
    matching the source text and target language, newest first.  Returns the
    first row or ``None`` when no cached translation exists.  Must run inside
    an active app context / DB session.
    """
    result = db.session.execute(text("""
        SELECT translated_text, created_at
        FROM dt_translation_cache
        WHERE source_text = :text AND target_language = :lang
        ORDER BY created_at DESC
        LIMIT 1
    """), {'text': segment_text, 'lang': lang})
    return result.fetchone()


def _contains_chinese(s):
    """True if *s* contains any CJK Unified Ideograph (U+4E00-U+9FFF)."""
    return any('\u4e00' <= c <= '\u9fff' for c in s)


def _contains_ascii_letter(s):
    """True if *s* contains an ASCII letter — crude 'looks English' check."""
    return any(ord(c) < 128 and c.isalpha() for c in s)


def _contains_vietnamese(s):
    """True if *s* contains characters in U+00C0-U+1EF9.

    This range covers the Latin Extended blocks used by Vietnamese
    diacritics — a crude 'looks Vietnamese' heuristic, since plain ASCII
    Vietnamese words are not detected.
    """
    return any('\u00C0' <= c <= '\u1EF9' for c in s)


def check_docx_specific_translations():
    """Check the per-segment translation mapping of one DOCX task.

    Parses the hard-coded original DOCX file, then for every non-empty
    text segment looks up the newest cached English ('en') and Vietnamese
    ('vi') translation, prints a per-segment status line, sanity-checks
    that each cached translation looks like the right language, and
    finally prints coverage statistics.
    """
    app = create_app()

    with app.app_context():
        print("=== 檢查DOCX任務的具體翻譯對應 ===")

        # Original file path of the task under inspection.
        # NOTE(review): hard-coded to one upload — point this at another
        # task's file to inspect a different job.
        original_path = r"C:\Users\EGG\WORK\data\user_scrip\TOOL\Document_translator_V2\uploads\9c6548ac-2f59-45f4-aade-0a9b3895bbfd\original_-OR026_9c6548ac.docx"

        # Extract the original document's paragraphs; drop whitespace-only ones.
        parser = DocxParser(original_path)
        segments = parser.extract_segments_with_context()
        text_segments = [seg.text for seg in segments if seg.text.strip()]

        print(f"原始文檔有 {len(text_segments)} 個文本段落")

        # Look up the cached translations corresponding to each paragraph.
        print(f"\n=== 檢查每個段落的翻譯狀況 ===")

        total_segments = len(text_segments)
        found_en = 0
        found_vi = 0

        for i, segment_text in enumerate(text_segments):
            en_row = _lookup_latest_translation(segment_text, 'en')
            vi_row = _lookup_latest_translation(segment_text, 'vi')

            status = ""
            if en_row:
                found_en += 1
                status += "EN✅ "
            else:
                status += "EN❌ "

            if vi_row:
                found_vi += 1
                status += "VI✅ "
            else:
                status += "VI❌ "

            print(f"段落 {i+1:3d}: {status} {segment_text[:50]}...")

            # Show the cached translation when present.  Guard against a
            # NULL translated_text column (en_row[0] is None) — calling
            # len() on it would raise TypeError.
            if en_row and en_row[0]:
                en_text = en_row[0]
                # Heuristic check that the cached text really is English.
                if _contains_ascii_letter(en_text) and not _contains_chinese(en_text):
                    print(f"   EN: ✅ {en_text[:60]}...")
                elif _contains_chinese(en_text):
                    print(f"   EN: ❌ 仍是中文: {en_text[:60]}...")
                else:
                    print(f"   EN: ❓ 未知: {en_text[:60]}...")

            if vi_row and vi_row[0]:
                vi_text = vi_row[0]
                if _contains_vietnamese(vi_text) and not _contains_chinese(vi_text):
                    print(f"   VI: ✅ {vi_text[:60]}...")
                elif _contains_chinese(vi_text):
                    print(f"   VI: ❌ 仍是中文: {vi_text[:60]}...")
                else:
                    print(f"   VI: ❓ 未知: {vi_text[:60]}...")

        print(f"\n📊 統計結果:")
        print(f"  總段落數: {total_segments}")
        # Avoid ZeroDivisionError when the document yields no text segments.
        if total_segments:
            print(f"  有英文翻譯: {found_en} ({found_en/total_segments*100:.1f}%)")
            print(f"  有越南文翻譯: {found_vi} ({found_vi/total_segments*100:.1f}%)")

        if found_en < total_segments * 0.5:
            print(f"  ❌ 翻譯覆蓋率太低,可能是翻譯流程有問題")
        else:
            print(f"  ✅ 翻譯覆蓋率正常")
if __name__ == "__main__":
    # Script entry point: run the full per-segment translation check.
    check_docx_specific_translations()