4th fix: time error
This commit is contained in:
178
test_clean_docx_translation.py
Normal file
178
test_clean_docx_translation.py
Normal file
@@ -0,0 +1,178 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
使用乾淨的DOCX文件測試翻譯插入
|
||||
"""
|
||||
|
||||
import sys
|
||||
import os
|
||||
import tempfile
|
||||
import shutil
|
||||
from pathlib import Path
|
||||
|
||||
# Force UTF-8 on the standard streams so CJK output does not crash on
# Windows consoles (whose default code page, e.g. cp950, cannot encode it).
# Guards: ``encoding`` may be None on wrapped streams, and ``reconfigure()``
# only exists on io.TextIOWrapper (Python 3.7+) — redirected streams may
# lack it, so probe before calling instead of crashing with AttributeError.
for _stream in (sys.stdout, sys.stderr):
    if getattr(_stream, 'encoding', None) != 'utf-8' and hasattr(_stream, 'reconfigure'):
        _stream.reconfigure(encoding='utf-8')

# Make the local ``app`` package importable when running this file directly.
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'app'))
|
||||
|
||||
from app import create_app, db
|
||||
from app.services.translation_service import DocxParser
|
||||
from sqlalchemy import text
|
||||
|
||||
def _has_chinese(text):
    """Return True if *text* contains any CJK Unified Ideograph (U+4E00–U+9FFF)."""
    return any('\u4e00' <= c <= '\u9fff' for c in text)


def _has_english(text):
    """Return True if *text* contains an ASCII letter other than P/A/N/J/I/T.

    The uppercase letters of the brand name "PANJIT" are excluded so that the
    brand string alone does not classify a paragraph as English.
    """
    return any(ord(c) < 128 and c.isalpha() and c not in 'PANJIT' for c in text)


def _has_insert_marker(para):
    """Return True if any run of *para* contains the zero-width-space marker
    (U+200B) used to tag already-inserted translations."""
    return any('\u200b' in (r.text or '') for r in para.runs)


def test_clean_docx_translation():
    """Test translation insertion against a clean copy of a DOCX file.

    Copies a known uploaded document into a temp directory (so repeated runs
    don't accumulate insertion markers), inspects its first paragraphs for
    pre-existing markers, runs ``DocxParser.generate_translated_document``,
    then prints a language / marker breakdown of the generated file and
    checks for the expected interleaved zh→en paragraph layout.

    Diagnostic script: all results are reported via ``print``; returns None.
    """

    app = create_app()

    with app.app_context():
        print("=== 使用乾淨的DOCX文件測試翻譯插入 ===")

        # Source document: fixed local path from a previous upload.
        original_path = r"C:\Users\EGG\WORK\data\user_scrip\TOOL\Document_translator_V2\uploads\9c6548ac-2f59-45f4-aade-0a9b3895bbfd\original_-OR026_9c6548ac.docx"

        # Create a pristine working copy in the system temp directory.
        clean_copy_dir = Path(tempfile.gettempdir()) / "clean_docx_test"
        clean_copy_dir.mkdir(exist_ok=True)
        clean_copy_path = clean_copy_dir / "clean_original.docx"

        shutil.copy2(original_path, clean_copy_path)
        print(f"✅ 創建乾淨副本: {clean_copy_path}")

        # Translate using the clean copy.
        parser = DocxParser(str(clean_copy_path))

        # Inspect the current state of the first few paragraphs.
        try:
            from docx import Document
            doc = Document(str(clean_copy_path))

            print(f"\n📄 乾淨文檔當前狀態:")
            print(f"總段落數: {len(doc.paragraphs)}")

            for i, para in enumerate(doc.paragraphs[:10]):
                if para.text.strip():
                    print(f" 段落 {i+1}: {para.text.strip()[:60]}...")

                    # A marker here means a translation was already inserted.
                    if _has_insert_marker(para):
                        print(f" ⚠️ 此段落已包含翻譯插入標記")

        except Exception as e:
            print(f"❌ 檢查文檔狀態失敗: {e}")
            return

        # Exercise translation generation.
        print(f"\n🔄 測試翻譯生成...")
        try:
            output_dir = clean_copy_dir

            # Empty mapping on purpose: translations are read from the cache.
            empty_translations = {}

            en_output_path = parser.generate_translated_document(
                empty_translations,
                'en',
                output_dir
            )

            print(f"✅ 翻譯文件生成成功: {en_output_path}")

            # Analyze the generated file.
            output_file = Path(en_output_path)
            if output_file.exists():
                print(f"文件大小: {output_file.stat().st_size:,} bytes")

                try:
                    doc2 = Document(str(output_file))
                    paragraphs = [p for p in doc2.paragraphs if p.text.strip()]

                    print(f"\n📄 生成文件詳細分析:")
                    print(f"總段落數: {len(paragraphs)}")

                    chinese_count = 0
                    english_count = 0
                    mixed_count = 0
                    marker_count = 0

                    print(f"\n前20段落詳情:")

                    for i, para in enumerate(paragraphs[:20]):
                        text = para.text.strip()

                        # Per-paragraph language / marker detection.
                        has_chinese = _has_chinese(text)
                        has_english = _has_english(text)
                        has_marker = _has_insert_marker(para)

                        if has_marker:
                            marker_count += 1

                        if has_chinese and has_english:
                            mixed_count += 1
                            lang_status = "🔄 中英混合"
                        elif has_english:
                            english_count += 1
                            lang_status = "🇺🇸 純英文"
                        elif has_chinese:
                            chinese_count += 1
                            lang_status = "🇨🇳 純中文"
                        else:
                            lang_status = "❓ 其他"

                        marker_status = " 🏷️" if has_marker else ""

                        print(f" 段落 {i+1:2d}: {lang_status}{marker_status} - {text[:70]}...")

                    print(f"\n📊 統計結果:")
                    print(f" 純中文段落: {chinese_count}")
                    print(f" 純英文段落: {english_count}")
                    print(f" 中英混合段落: {mixed_count}")
                    print(f" 帶翻譯標記的段落: {marker_count}")

                    # Judge overall translation quality from the counts.
                    if english_count > 10:
                        print(f"\n✅ 翻譯效果優秀 - 有 {english_count} 個純英文段落")
                    elif english_count > 0:
                        print(f"\n⚠️ 翻譯部分成功 - 有 {english_count} 個純英文段落")
                    elif marker_count > 10:
                        print(f"\n🔍 翻譯可能成功但格式問題 - 有 {marker_count} 個帶標記的段落")
                    else:
                        print(f"\n❌ 翻譯可能失敗 - 沒有明顯的英文內容")

                    # Look for zh-only paragraphs immediately followed by
                    # en-only ones — the expected interleaved layout.
                    alternating_pairs = 0
                    for i in range(len(paragraphs) - 1):
                        current = paragraphs[i].text.strip()
                        next_para = paragraphs[i + 1].text.strip()

                        current_chinese = _has_chinese(current)
                        current_english = _has_english(current)
                        next_chinese = _has_chinese(next_para)
                        next_english = _has_english(next_para)

                        if current_chinese and not current_english and next_english and not next_chinese:
                            alternating_pairs += 1
                            if alternating_pairs <= 3:  # show only the first 3 pairs
                                print(f"\n 交錯對 {alternating_pairs}:")
                                print(f" 中文: {current[:50]}...")
                                print(f" 英文: {next_para[:50]}...")

                    if alternating_pairs > 0:
                        print(f"\n✅ 發現交錯翻譯格式!共 {alternating_pairs} 對")
                    else:
                        print(f"\n❌ 沒有發現交錯翻譯格式")

                except Exception as e:
                    print(f"❌ 分析生成文件失敗: {e}")
            else:
                print(f"❌ 生成的文件不存在")

        except Exception as e:
            print(f"❌ 翻譯生成失敗: {e}")
|
||||
# Allow running this diagnostic directly as a script (outside any test runner).
if __name__ == "__main__":
    test_clean_docx_translation()
|
Reference in New Issue
Block a user