#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
使用乾淨的DOCX文件測試翻譯插入
"""
import sys
|
||
import os
|
||
import tempfile
|
||
import shutil
|
||
from pathlib import Path
|
||
|
||
# Fix encoding for Windows console
|
||
if sys.stdout.encoding != 'utf-8':
|
||
sys.stdout.reconfigure(encoding='utf-8')
|
||
if sys.stderr.encoding != 'utf-8':
|
||
sys.stderr.reconfigure(encoding='utf-8')
|
||
|
||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'app'))
|
||
|
||
from app import create_app, db
|
||
from app.services.translation_service import DocxParser
|
||
from sqlalchemy import text
|
||
|
||
def test_clean_docx_translation():
|
||
"""使用乾淨的DOCX文件測試翻譯插入"""
|
||
|
||
app = create_app()
|
||
|
||
with app.app_context():
|
||
print("=== 使用乾淨的DOCX文件測試翻譯插入 ===")
|
||
|
||
# 原始文件
|
||
original_path = r"C:\Users\EGG\WORK\data\user_scrip\TOOL\Document_translator_V2\uploads\9c6548ac-2f59-45f4-aade-0a9b3895bbfd\original_-OR026_9c6548ac.docx"
|
||
|
||
# 創建乾淨的副本
|
||
clean_copy_dir = Path(tempfile.gettempdir()) / "clean_docx_test"
|
||
clean_copy_dir.mkdir(exist_ok=True)
|
||
clean_copy_path = clean_copy_dir / "clean_original.docx"
|
||
|
||
shutil.copy2(original_path, clean_copy_path)
|
||
print(f"✅ 創建乾淨副本: {clean_copy_path}")
|
||
|
||
# 使用乾淨副本測試翻譯
|
||
parser = DocxParser(str(clean_copy_path))
|
||
|
||
# 檢查前幾個段落的當前狀態
|
||
try:
|
||
from docx import Document
|
||
doc = Document(str(clean_copy_path))
|
||
|
||
print(f"\n📄 乾淨文檔當前狀態:")
|
||
print(f"總段落數: {len(doc.paragraphs)}")
|
||
|
||
for i, para in enumerate(doc.paragraphs[:10]):
|
||
if para.text.strip():
|
||
print(f" 段落 {i+1}: {para.text.strip()[:60]}...")
|
||
|
||
# 檢查是否有零寬空格標記(翻譯插入標記)
|
||
has_marker = any('\u200b' in (r.text or '') for r in para.runs)
|
||
if has_marker:
|
||
print(f" ⚠️ 此段落已包含翻譯插入標記")
|
||
|
||
except Exception as e:
|
||
print(f"❌ 檢查文檔狀態失敗: {e}")
|
||
return
|
||
|
||
# 測試翻譯生成(只生成前3個段落來測試)
|
||
print(f"\n🔄 測試翻譯生成...")
|
||
try:
|
||
output_dir = clean_copy_dir
|
||
|
||
# 使用空的translations字典,因為我們從快取讀取
|
||
empty_translations = {}
|
||
|
||
en_output_path = parser.generate_translated_document(
|
||
empty_translations,
|
||
'en',
|
||
output_dir
|
||
)
|
||
|
||
print(f"✅ 翻譯文件生成成功: {en_output_path}")
|
||
|
||
# 檢查生成的文件
|
||
output_file = Path(en_output_path)
|
||
if output_file.exists():
|
||
print(f"文件大小: {output_file.stat().st_size:,} bytes")
|
||
|
||
try:
|
||
doc2 = Document(str(output_file))
|
||
paragraphs = [p for p in doc2.paragraphs if p.text.strip()]
|
||
|
||
print(f"\n📄 生成文件詳細分析:")
|
||
print(f"總段落數: {len(paragraphs)}")
|
||
|
||
chinese_count = 0
|
||
english_count = 0
|
||
mixed_count = 0
|
||
marker_count = 0
|
||
|
||
print(f"\n前20段落詳情:")
|
||
|
||
for i, para in enumerate(paragraphs[:20]):
|
||
text = para.text.strip()
|
||
|
||
# 語言檢測
|
||
has_chinese = any('\u4e00' <= c <= '\u9fff' for c in text)
|
||
has_english = any(ord(c) < 128 and c.isalpha() and c not in 'PANJIT' for c in text)
|
||
has_marker = any('\u200b' in (r.text or '') for r in para.runs)
|
||
|
||
if has_marker:
|
||
marker_count += 1
|
||
|
||
if has_chinese and has_english:
|
||
mixed_count += 1
|
||
lang_status = "🔄 中英混合"
|
||
elif has_english:
|
||
english_count += 1
|
||
lang_status = "🇺🇸 純英文"
|
||
elif has_chinese:
|
||
chinese_count += 1
|
||
lang_status = "🇨🇳 純中文"
|
||
else:
|
||
lang_status = "❓ 其他"
|
||
|
||
marker_status = " 🏷️" if has_marker else ""
|
||
|
||
print(f" 段落 {i+1:2d}: {lang_status}{marker_status} - {text[:70]}...")
|
||
|
||
print(f"\n📊 統計結果:")
|
||
print(f" 純中文段落: {chinese_count}")
|
||
print(f" 純英文段落: {english_count}")
|
||
print(f" 中英混合段落: {mixed_count}")
|
||
print(f" 帶翻譯標記的段落: {marker_count}")
|
||
|
||
# 判斷翻譯效果
|
||
if english_count > 10:
|
||
print(f"\n✅ 翻譯效果優秀 - 有 {english_count} 個純英文段落")
|
||
elif english_count > 0:
|
||
print(f"\n⚠️ 翻譯部分成功 - 有 {english_count} 個純英文段落")
|
||
elif marker_count > 10:
|
||
print(f"\n🔍 翻譯可能成功但格式問題 - 有 {marker_count} 個帶標記的段落")
|
||
else:
|
||
print(f"\n❌ 翻譯可能失敗 - 沒有明顯的英文內容")
|
||
|
||
# 檢查是否有連續的中英文段落(交錯格式)
|
||
alternating_pairs = 0
|
||
for i in range(len(paragraphs) - 1):
|
||
current = paragraphs[i].text.strip()
|
||
next_para = paragraphs[i + 1].text.strip()
|
||
|
||
current_chinese = any('\u4e00' <= c <= '\u9fff' for c in current)
|
||
current_english = any(ord(c) < 128 and c.isalpha() and c not in 'PANJIT' for c in current)
|
||
next_chinese = any('\u4e00' <= c <= '\u9fff' for c in next_para)
|
||
next_english = any(ord(c) < 128 and c.isalpha() and c not in 'PANJIT' for c in next_para)
|
||
|
||
if current_chinese and not current_english and next_english and not next_chinese:
|
||
alternating_pairs += 1
|
||
if alternating_pairs <= 3: # 顯示前3個交錯對
|
||
print(f"\n 交錯對 {alternating_pairs}:")
|
||
print(f" 中文: {current[:50]}...")
|
||
print(f" 英文: {next_para[:50]}...")
|
||
|
||
if alternating_pairs > 0:
|
||
print(f"\n✅ 發現交錯翻譯格式!共 {alternating_pairs} 對")
|
||
else:
|
||
print(f"\n❌ 沒有發現交錯翻譯格式")
|
||
|
||
except Exception as e:
|
||
print(f"❌ 分析生成文件失敗: {e}")
|
||
else:
|
||
print(f"❌ 生成的文件不存在")
|
||
|
||
except Exception as e:
|
||
print(f"❌ 翻譯生成失敗: {e}")
|
||
|
||
if __name__ == "__main__":
|
||
test_clean_docx_translation() |