Files
Document_Translator/test_clean_docx_translation.py
2025-09-03 09:05:51 +08:00

178 lines
7.8 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Test translation insertion using a clean DOCX file.
"""
import sys
import os
import tempfile
import shutil
from pathlib import Path
# Fix encoding for Windows console: the script prints CJK text and emoji, so
# both standard streams are switched to UTF-8 when they are not already.
# Streams that are None, or that were replaced by an object without
# ``reconfigure`` (e.g. io.StringIO under a capture harness), are skipped
# instead of raising AttributeError.
for _stream in (sys.stdout, sys.stderr):
    _encoding = (getattr(_stream, 'encoding', None) or '').lower()
    if _encoding != 'utf-8' and hasattr(_stream, 'reconfigure'):
        _stream.reconfigure(encoding='utf-8')

# Put the sibling ``app`` directory at the front of the module search path
# before the project imports that follow.
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'app'))
from app import create_app, db
from app.services.translation_service import DocxParser
from sqlalchemy import text
def _contains_chinese(text):
    """Return True if ``text`` contains a character in U+4E00..U+9FFF."""
    return any('\u4e00' <= ch <= '\u9fff' for ch in text)


def _contains_english(text):
    """Return True if ``text`` contains an ASCII letter outside "PANJIT".

    The brand name is removed as a whole substring before scanning. The
    previous per-character test (``c not in 'PANJIT'``) discarded every
    upper-case P, A, N, J, I and T, so all-caps text such as "PIN" was never
    counted as English.
    """
    remainder = text.replace('PANJIT', '')
    return any(ord(ch) < 128 and ch.isalpha() for ch in remainder)


def _has_insert_marker(paragraph):
    """Return True if any run of ``paragraph`` contains a zero-width space."""
    return any('\u200b' in (run.text or '') for run in paragraph.runs)


def test_clean_docx_translation():
    """Run translation insertion against a fresh copy of a DOCX file.

    Copies the hard-coded source document into a temp directory, asks
    ``DocxParser`` to generate the English document, then prints a language
    breakdown of the first 20 non-empty paragraphs and counts adjacent
    "Chinese paragraph followed by English paragraph" pairs.

    All results are reported with ``print``; the function returns ``None`` and
    returns early when the source file is missing or the copy cannot be
    inspected.
    """
    app = create_app()
    with app.app_context():
        print("=== 使用乾淨的DOCX文件測試翻譯插入 ===")

        # Source document. NOTE: machine-specific absolute path.
        original_path = r"C:\Users\EGG\WORK\data\user_scrip\TOOL\Document_translator_V2\uploads\9c6548ac-2f59-45f4-aade-0a9b3895bbfd\original_-OR026_9c6548ac.docx"
        if not Path(original_path).is_file():
            # Report a clear reason instead of an uncaught FileNotFoundError
            # from shutil.copy2 below.
            print(f"❌ 原始文件不存在: {original_path}")
            return

        # Work on a copy so the uploaded original is never modified.
        clean_copy_dir = Path(tempfile.gettempdir()) / "clean_docx_test"
        clean_copy_dir.mkdir(exist_ok=True)
        clean_copy_path = clean_copy_dir / "clean_original.docx"
        shutil.copy2(original_path, clean_copy_path)
        print(f"✅ 創建乾淨副本: {clean_copy_path}")

        parser = DocxParser(str(clean_copy_path))

        # Show the current state of the first paragraphs of the copy.
        try:
            from docx import Document

            doc = Document(str(clean_copy_path))
            print("\n📄 乾淨文檔當前狀態:")
            print(f"總段落數: {len(doc.paragraphs)}")
            for i, para in enumerate(doc.paragraphs[:10]):
                stripped = para.text.strip()
                if not stripped:
                    continue
                print(f" 段落 {i+1}: {stripped[:60]}...")
                # A zero-width space is the translation-insertion marker.
                if _has_insert_marker(para):
                    print(" ⚠️ 此段落已包含翻譯插入標記")
        except Exception as e:
            print(f"❌ 檢查文檔狀態失敗: {e}")
            return

        # Generate the translated document.
        print("\n🔄 測試翻譯生成...")
        try:
            output_dir = clean_copy_dir
            # Empty dict: per the original author's note, translations are
            # read from the cache rather than passed in.
            empty_translations = {}
            en_output_path = parser.generate_translated_document(
                empty_translations,
                'en',
                output_dir
            )
            print(f"✅ 翻譯文件生成成功: {en_output_path}")

            output_file = Path(en_output_path)
            if not output_file.exists():
                print("❌ 生成的文件不存在")
                return

            print(f"文件大小: {output_file.stat().st_size:,} bytes")
            try:
                doc2 = Document(str(output_file))
                paragraphs = [p for p in doc2.paragraphs if p.text.strip()]
                print("\n📄 生成文件詳細分析:")
                print(f"總段落數: {len(paragraphs)}")

                chinese_count = 0
                english_count = 0
                mixed_count = 0
                marker_count = 0

                # The counters below cover only the first 20 paragraphs.
                print("\n前20段落詳情:")
                for i, para in enumerate(paragraphs[:20]):
                    text = para.text.strip()
                    has_chinese = _contains_chinese(text)
                    has_english = _contains_english(text)
                    has_marker = _has_insert_marker(para)
                    if has_marker:
                        marker_count += 1
                    if has_chinese and has_english:
                        mixed_count += 1
                        lang_status = "🔄 中英混合"
                    elif has_english:
                        english_count += 1
                        lang_status = "🇺🇸 純英文"
                    elif has_chinese:
                        chinese_count += 1
                        lang_status = "🇨🇳 純中文"
                    else:
                        lang_status = "❓ 其他"
                    marker_status = " 🏷️" if has_marker else ""
                    print(f" 段落 {i+1:2d}: {lang_status}{marker_status} - {text[:70]}...")

                print("\n📊 統計結果:")
                print(f" 純中文段落: {chinese_count}")
                print(f" 純英文段落: {english_count}")
                print(f" 中英混合段落: {mixed_count}")
                print(f" 帶翻譯標記的段落: {marker_count}")

                # Rough verdict on the translation result.
                if english_count > 10:
                    print(f"\n✅ 翻譯效果優秀 - 有 {english_count} 個純英文段落")
                elif english_count > 0:
                    print(f"\n⚠️ 翻譯部分成功 - 有 {english_count} 個純英文段落")
                elif marker_count > 10:
                    print(f"\n🔍 翻譯可能成功但格式問題 - 有 {marker_count} 個帶標記的段落")
                else:
                    print("\n❌ 翻譯可能失敗 - 沒有明顯的英文內容")

                # Count adjacent pairs where a Chinese-only paragraph is
                # followed by an English-only one (interleaved format).
                alternating_pairs = 0
                for current_para, following_para in zip(paragraphs, paragraphs[1:]):
                    current = current_para.text.strip()
                    next_para = following_para.text.strip()
                    current_is_chinese_only = (
                        _contains_chinese(current) and not _contains_english(current)
                    )
                    next_is_english_only = (
                        _contains_english(next_para) and not _contains_chinese(next_para)
                    )
                    if current_is_chinese_only and next_is_english_only:
                        alternating_pairs += 1
                        if alternating_pairs <= 3:  # show only the first 3 pairs
                            print(f"\n 交錯對 {alternating_pairs}:")
                            print(f" 中文: {current[:50]}...")
                            print(f" 英文: {next_para[:50]}...")

                if alternating_pairs > 0:
                    print(f"\n✅ 發現交錯翻譯格式!共 {alternating_pairs}")
                else:
                    print("\n❌ 沒有發現交錯翻譯格式")
            except Exception as e:
                print(f"❌ 分析生成文件失敗: {e}")
        except Exception as e:
            print(f"❌ 翻譯生成失敗: {e}")
# Script entry point: run the diagnostic only when executed directly.
if __name__ == "__main__":
    test_clean_docx_translation()