#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Test translation insertion against a clean copy of a DOCX file.

The script copies an uploaded original document to a temporary directory,
asks ``DocxParser`` to generate the English version from that copy, and then
prints a heuristic analysis of the generated file (language mix per
paragraph, insertion markers, alternating Chinese/English pairs).
"""

import sys
import os
import tempfile
import shutil
from pathlib import Path

# Fix encoding for Windows console.  The encoding may be None or spelled in
# upper case, and a replaced stream may lack ``reconfigure`` -- guard both so
# the script does not die before doing any work.
for _stream in (sys.stdout, sys.stderr):
    _encoding = (getattr(_stream, 'encoding', None) or '').lower()
    if _encoding != 'utf-8' and hasattr(_stream, 'reconfigure'):
        _stream.reconfigure(encoding='utf-8')

# Must run before the project imports below.
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'app'))

# NOTE: ``db`` and ``text`` are not referenced in this script; the imports
# are kept exactly as the original file had them.
from app import create_app, db
from app.services.translation_service import DocxParser
from sqlalchemy import text

# Document used when no explicit path is supplied.
DEFAULT_ORIGINAL_PATH = r"C:\Users\EGG\WORK\data\user_scrip\TOOL\Document_translator_V2\uploads\9c6548ac-2f59-45f4-aade-0a9b3895bbfd\original_-OR026_9c6548ac.docx"

# Zero-width space; this script treats it as the translation-insertion marker.
ZERO_WIDTH_SPACE = '\u200b'

# Latin token that must not count as English content when classifying text.
IGNORED_LATIN_TOKEN = 'PANJIT'


def _has_chinese(content):
    """Return True when *content* holds a character in U+4E00..U+9FFF."""
    return any('\u4e00' <= ch <= '\u9fff' for ch in content)


def _has_english(content):
    """Return True when *content* holds an ASCII letter outside the ignored token.

    The ignored token is removed as a whole word first.  A per-character
    membership test would instead discard every upper-case P/A/N/J/I/T in the
    text and misclassify paragraphs.
    """
    remainder = content.replace(IGNORED_LATIN_TOKEN, '')
    return any(ord(ch) < 128 and ch.isalpha() for ch in remainder)


def _has_marker(para):
    """Return True when any run of *para* contains a zero-width space."""
    return any(ZERO_WIDTH_SPACE in (run.text or '') for run in para.runs)


def _print_source_preview(doc):
    """Print the paragraph count and the first ten paragraphs of *doc*."""
    all_paragraphs = doc.paragraphs
    print("\n📄 乾淨文檔當前狀態:")
    print(f"總段落數: {len(all_paragraphs)}")

    for number, para in enumerate(all_paragraphs[:10], start=1):
        snippet = para.text.strip()
        if not snippet:
            continue
        print(f" 段落 {number}: {snippet[:60]}...")
        # A marker here means the "clean" copy already contains inserted
        # translations.
        if _has_marker(para):
            print(" ⚠️ 此段落已包含翻譯插入標記")


def _print_paragraph_details(paragraphs):
    """Classify and print the first twenty paragraphs, then the verdict."""
    chinese_count = 0
    english_count = 0
    mixed_count = 0
    marker_count = 0

    print("\n前20段落詳情:")
    for number, para in enumerate(paragraphs[:20], start=1):
        para_text = para.text.strip()
        has_chinese = _has_chinese(para_text)
        has_english = _has_english(para_text)
        has_marker = _has_marker(para)

        if has_marker:
            marker_count += 1

        if has_chinese and has_english:
            mixed_count += 1
            lang_status = "🔄 中英混合"
        elif has_english:
            english_count += 1
            lang_status = "🇺🇸 純英文"
        elif has_chinese:
            chinese_count += 1
            lang_status = "🇨🇳 純中文"
        else:
            lang_status = "❓ 其他"

        marker_status = " 🏷️" if has_marker else ""
        print(f" 段落 {number:2d}: {lang_status}{marker_status} - {para_text[:70]}...")

    print("\n📊 統計結果:")
    print(f" 純中文段落: {chinese_count}")
    print(f" 純英文段落: {english_count}")
    print(f" 中英混合段落: {mixed_count}")
    print(f" 帶翻譯標記的段落: {marker_count}")

    # Verdict; the counters above only cover the first twenty paragraphs.
    if english_count > 10:
        print(f"\n✅ 翻譯效果優秀 - 有 {english_count} 個純英文段落")
    elif english_count > 0:
        print(f"\n⚠️ 翻譯部分成功 - 有 {english_count} 個純英文段落")
    elif marker_count > 10:
        print(f"\n🔍 翻譯可能成功但格式問題 - 有 {marker_count} 個帶標記的段落")
    else:
        print("\n❌ 翻譯可能失敗 - 沒有明顯的英文內容")


def _print_alternating_pairs(paragraphs):
    """Count pure-Chinese paragraphs directly followed by a pure-English one."""
    # Classify each paragraph once instead of twice per neighbouring pair.
    profiles = []
    for para in paragraphs:
        para_text = para.text.strip()
        profiles.append((para_text, _has_chinese(para_text), _has_english(para_text)))

    alternating_pairs = 0
    for current, following in zip(profiles, profiles[1:]):
        current_text, current_chinese, current_english = current
        next_text, next_chinese, next_english = following

        if current_chinese and not current_english and next_english and not next_chinese:
            alternating_pairs += 1
            if alternating_pairs <= 3:  # show only the first three pairs
                print(f"\n 交錯對 {alternating_pairs}:")
                print(f" 中文: {current_text[:50]}...")
                print(f" 英文: {next_text[:50]}...")

    if alternating_pairs > 0:
        print(f"\n✅ 發現交錯翻譯格式!共 {alternating_pairs} 對")
    else:
        print("\n❌ 沒有發現交錯翻譯格式")


def _report_generated_document(doc):
    """Print the full analysis of the generated document *doc*."""
    paragraphs = [p for p in doc.paragraphs if p.text.strip()]

    print("\n📄 生成文件詳細分析:")
    print(f"總段落數: {len(paragraphs)}")

    _print_paragraph_details(paragraphs)
    _print_alternating_pairs(paragraphs)


def test_clean_docx_translation(original_path=DEFAULT_ORIGINAL_PATH):
    """Generate a translated document from a clean copy and report on it.

    Args:
        original_path: Path of the source DOCX to copy.  Defaults to the
            upload the script was originally written against.

    Returns:
        None.  All results are printed.

    Raises:
        OSError: if the source file cannot be copied (for example when it
            does not exist).  Failures while inspecting or generating
            documents are caught and reported on stdout instead.
    """
    app = create_app()

    with app.app_context():
        print("=== 使用乾淨的DOCX文件測試翻譯插入 ===")

        # Work on a copy so the uploaded original is never modified.
        clean_copy_dir = Path(tempfile.gettempdir()) / "clean_docx_test"
        clean_copy_dir.mkdir(exist_ok=True)
        clean_copy_path = clean_copy_dir / "clean_original.docx"

        shutil.copy2(original_path, clean_copy_path)
        print(f"✅ 創建乾淨副本: {clean_copy_path}")

        parser = DocxParser(str(clean_copy_path))

        # Inspect the first paragraphs of the copy before translating.
        try:
            # Imported here so a missing python-docx is reported like any
            # other inspection failure.
            from docx import Document

            _print_source_preview(Document(str(clean_copy_path)))
        except Exception as e:
            print(f"❌ 檢查文檔狀態失敗: {e}")
            return

        print("\n🔄 測試翻譯生成...")
        try:
            output_dir = clean_copy_dir

            # Deliberately empty: per the original author's note the
            # translations are read from the cache (presumably inside
            # DocxParser -- not visible from this file).
            empty_translations = {}

            en_output_path = parser.generate_translated_document(
                empty_translations,
                'en',
                output_dir
            )
            print(f"✅ 翻譯文件生成成功: {en_output_path}")

            output_file = Path(en_output_path)
            if not output_file.exists():
                print("❌ 生成的文件不存在")
            else:
                print(f"文件大小: {output_file.stat().st_size:,} bytes")
                try:
                    _report_generated_document(Document(str(output_file)))
                except Exception as e:
                    print(f"❌ 分析生成文件失敗: {e}")

        except Exception as e:
            print(f"❌ 翻譯生成失敗: {e}")


if __name__ == "__main__":
    # An optional first command-line argument overrides the default source
    # document; with no argument the behaviour is unchanged.
    test_clean_docx_translation(*sys.argv[1:2])