Files
Document_Translator/test_xlsx_translation_format.py
2025-09-03 09:05:51 +08:00

220 lines
10 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
驗證XLSX翻譯格式 - 檢查翻譯文件內容
"""
import sys
import os
import tempfile
from pathlib import Path
# Force UTF-8 on the standard streams so the CJK/emoji output printed by this
# script does not fail on a console that uses a legacy encoding (e.g. Windows).
for _stream in (sys.stdout, sys.stderr):
    # A stream may be None (windowless interpreter) or a replacement object
    # without ``reconfigure`` (e.g. a capture buffer); leave those untouched
    # instead of crashing at import time.
    _encoding = (getattr(_stream, 'encoding', None) or '').lower()
    if _encoding != 'utf-8' and hasattr(_stream, 'reconfigure'):
        _stream.reconfigure(encoding='utf-8')
# Put the sibling ``app`` directory on the import path. This must run before
# the project imports that follow.
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'app'))
from app import create_app, db
from app.services.translation_service import ExcelParser
from sqlalchemy import text as sql_text
# Limits for the spot check of the generated workbook.
_MAX_SHEETS = 3      # only the first few worksheets are inspected
_SCAN_MAX_ROWS = 20  # rows inspected per worksheet
_SCAN_MAX_COLS = 5   # columns inspected per worksheet
_MAX_SAMPLES = 10    # cell samples echoed per worksheet
# Literal token removed before looking for English letters.
# NOTE(review): presumably a brand name that is expected to stay untranslated
# and therefore must not make a cell count as "English" - confirm.
_IGNORED_LATIN_TOKEN = 'PANJIT'


def _find_xlsx_files(uploads_dir):
    """Return the .xlsx/.xls files found one directory below *uploads_dir*."""
    found = []
    if not uploads_dir.exists():
        return found
    for job_dir in uploads_dir.iterdir():
        if not job_dir.is_dir():
            continue
        for file_path in job_dir.iterdir():
            if file_path.suffix.lower() in ('.xlsx', '.xls'):
                found.append(file_path)
    return found


def _report_cache_coverage(text_segments, languages):
    """Print, per language, how many segments have a cached translation."""
    # Only segments longer than two characters after stripping are counted.
    # The filter is language-independent, so it is computed once.
    candidates = [seg for seg in text_segments if len(seg.strip()) > 2]
    total_count = len(candidates)
    lookup = sql_text("""
        SELECT translated_text
        FROM dt_translation_cache
        WHERE source_text = :text AND target_language = :lang
        ORDER BY created_at DESC
        LIMIT 1
    """)
    for lang in languages:
        translated_count = 0
        for seg in candidates:
            cached = db.session.execute(lookup, {'text': seg, 'lang': lang}).fetchone()
            # A hit only counts when the cached translation is non-empty.
            if cached and cached[0]:
                translated_count += 1
        coverage = (translated_count / total_count * 100) if total_count > 0 else 0
        print(f" {lang.upper()}翻譯覆蓋率: {coverage:.1f}% ({translated_count}/{total_count})")


def _classify_cell_text(cell_text):
    """Return ``(kind, label)`` describing the languages seen in *cell_text*.

    *kind* is one of ``'mixed'``, ``'english'``, ``'chinese'`` or ``'other'``;
    *label* is the display string printed next to a sample.
    """
    has_chinese = any('\u4e00' <= ch <= '\u9fff' for ch in cell_text)
    # Drop the ignored token as a whole word. Testing single characters against
    # the string 'PANJIT' would also discard every P/A/N/J/I/T of unrelated
    # English words (e.g. "TIP" would not register as English at all).
    latin_probe = cell_text.replace(_IGNORED_LATIN_TOKEN, '')
    has_english = any(ch.isascii() and ch.isalpha() for ch in latin_probe)
    if has_chinese and has_english:
        return 'mixed', "🔄 中英混合"
    if has_english:
        return 'english', "🇺🇸 純英文"
    if has_chinese:
        return 'chinese', "🇨🇳 純中文"
    return 'other', "❓ 其他"


def _analyze_sheet(ws, sheet_name):
    """Print size, language statistics, samples and a verdict for one sheet."""
    print(f"\n📄 工作表: {sheet_name}")
    print(f" 最大行數: {ws.max_row}")
    print(f" 最大列數: {ws.max_column}")

    counts = {'chinese': 0, 'english': 0, 'mixed': 0, 'other': 0}
    empty_cells = 0
    samples = []
    for row_idx in range(1, min(_SCAN_MAX_ROWS, ws.max_row) + 1):
        for col_idx in range(1, min(_SCAN_MAX_COLS, ws.max_column) + 1):
            cell = ws.cell(row_idx, col_idx)
            # Test for None explicitly so that 0 / False are treated as content
            # rather than as an empty cell.
            cell_text = '' if cell.value is None else str(cell.value).strip()
            if not cell_text:
                empty_cells += 1
                continue
            kind, label = _classify_cell_text(cell_text)
            counts[kind] += 1
            if len(samples) < _MAX_SAMPLES:
                samples.append((cell.coordinate, label, cell_text[:50]))

    chinese_cells = counts['chinese']
    english_cells = counts['english']
    mixed_cells = counts['mixed']
    print(" 內容統計:")
    print(f" 純中文儲存格: {chinese_cells}")
    print(f" 純英文儲存格: {english_cells}")
    print(f" 中英混合儲存格: {mixed_cells}")
    print(f" 空儲存格: {empty_cells}")
    if samples:
        print(" 前10個內容樣本:")
        for position, status, content in samples:
            print(f" {position}: {status} - {content}...")

    # The verdict is reported per sheet because the counters are per sheet.
    # Cells classified as 'other' are not part of the ratio.
    total_content_cells = chinese_cells + english_cells + mixed_cells
    if total_content_cells == 0:
        print("\n❌ 沒有發現任何內容,可能翻譯失敗")
    elif english_cells > chinese_cells * 0.5:
        print("\n✅ XLSX翻譯格式良好")
        print(f" - 英文內容比例: {english_cells / total_content_cells * 100:.1f}%")
    elif mixed_cells > chinese_cells * 0.3:
        print("\n⚠️ XLSX翻譯採用混合格式")
        print(f" - 混合內容比例: {mixed_cells / total_content_cells * 100:.1f}%")
    else:
        print("\n🔍 XLSX翻譯可能使用原始格式主要為中文")
        print(f" - 中文內容比例: {chinese_cells / total_content_cells * 100:.1f}%")


def _analyze_generated_workbook(output_file):
    """Open the generated workbook and print an analysis of its first sheets.

    Raises whatever ``openpyxl`` raises (including ``ImportError`` when the
    package is missing); the caller reports the failure.
    """
    import openpyxl

    if not output_file.exists():
        print("❌ 生成的檔案不存在")
        return
    print(f"檔案大小: {output_file.stat().st_size:,} bytes")
    wb = openpyxl.load_workbook(str(output_file))
    try:
        print("\n📊 Excel文件分析:")
        print(f"工作表數量: {len(wb.sheetnames)}")
        for sheet_name in wb.sheetnames[:_MAX_SHEETS]:
            _analyze_sheet(wb[sheet_name], sheet_name)
    finally:
        # Release the file handle even when the analysis fails part-way.
        wb.close()


def test_xlsx_translation_format():
    """Inspect the XLSX translation output for one uploaded workbook.

    Diagnostic routine: it prints its findings instead of asserting them and
    returns ``None``. Steps, all inside the Flask application context:

    1. pick the first Excel file found under ``uploads/<job>/``;
    2. extract its text segments and print, for ``en`` and ``vi``, how many of
       them have an entry in ``dt_translation_cache``;
    3. generate the English workbook into a temp directory and print per-sheet
       language statistics for it;
    4. generate the Vietnamese workbook and print its size.

    Failures in steps 3 and 4 are reported and do not stop the run; any other
    exception after the test file was chosen is reported as an overall failure.
    """
    app = create_app()
    with app.app_context():
        print("=== 驗證XLSX翻譯格式 ===")

        # Look for an existing XLSX file to test with.
        xlsx_files = _find_xlsx_files(Path("uploads"))
        if not xlsx_files:
            print("❌ 沒有找到XLSX測試文件")
            return

        # Use the first file that was found.
        test_file = xlsx_files[0]
        print(f"✅ 使用測試文件: {test_file}")

        # Output directory for the generated workbooks.
        test_dir = Path(tempfile.gettempdir()) / "xlsx_format_test"
        test_dir.mkdir(exist_ok=True)

        try:
            parser = ExcelParser(str(test_file))
            text_segments = parser.extract_text_segments()
            print("\n📄 文件分析:")
            print(f"提取的文字段落數: {len(text_segments)}")

            # Translation cache coverage.
            _report_cache_coverage(text_segments, ['en', 'vi'])

            # English output, followed by a content analysis.
            print("\n🔄 生成英文翻譯XLSX文件...")
            try:
                en_output_path = parser.generate_translated_document(
                    {},  # empty mapping: per the original note, translations are read from the cache
                    'en',
                    test_dir
                )
                print(f"✅ 英文翻譯文件生成: {en_output_path}")
                try:
                    _analyze_generated_workbook(Path(en_output_path))
                except Exception as e:
                    print(f"❌ 分析Excel檔案失敗: {e}")
            except Exception as e:
                print(f"❌ 生成英文翻譯失敗: {e}")

            # Vietnamese output: only check that a file was produced.
            print("\n🔄 生成越南文翻譯XLSX文件...")
            try:
                vi_output_path = parser.generate_translated_document(
                    {},
                    'vi',
                    test_dir
                )
                print(f"✅ 越南文翻譯文件生成: {vi_output_path}")
                vi_file = Path(vi_output_path)
                if vi_file.exists():
                    print(f" 檔案大小: {vi_file.stat().st_size:,} bytes")
                else:
                    print(" ❌ 越南文文件不存在")
            except Exception as e:
                print(f"❌ 生成越南文翻譯失敗: {e}")
        except Exception as e:
            print(f"❌ XLSX格式驗證失敗: {e}")
# Script entry point: run the diagnostic when this file is executed directly.
if __name__ == '__main__':
    test_xlsx_translation_format()