#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Validate the XLSX translation format by inspecting a generated translated file.

The script picks an uploaded Excel file, reports how many of its text segments
have a cached translation, asks ``ExcelParser`` to generate translated
workbooks, and prints a per-sheet language breakdown of the English output.
"""

import sys
import os
import tempfile
from collections import Counter
from pathlib import Path


def _force_utf8_console():
    """Switch stdout/stderr to UTF-8 so the emoji/CJK output prints on Windows."""
    for stream in (sys.stdout, sys.stderr):
        encoding = (getattr(stream, 'encoding', None) or '').lower()
        # Replaced streams (test runners, some IDEs) may not offer reconfigure().
        if encoding != 'utf-8' and hasattr(stream, 'reconfigure'):
            stream.reconfigure(encoding='utf-8')


# Fix encoding for Windows console
_force_utf8_console()

sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'app'))

from app import create_app, db
from app.services.translation_service import ExcelParser
from sqlalchemy import text as sql_text

# Only this many leading rows/columns/sheets are sampled per workbook.
_MAX_SHEETS = 3
_MAX_ROWS = 20
_MAX_COLS = 5
_MAX_SAMPLES = 10

# Terms removed before looking for English letters.
# NOTE(review): presumably a brand name that stays untranslated - confirm.
_IGNORED_TERMS = ('PANJIT',)

_STATUS_LABELS = {
    'mixed': "🔄 中英混合",
    'english': "🇺🇸 純英文",
    'chinese': "🇨🇳 純中文",
    'other': "❓ 其他",
}

# Newest cached translation for one (source text, target language) pair.
_CACHE_LOOKUP_SQL = (
    " SELECT translated_text"
    " FROM dt_translation_cache"
    " WHERE source_text = :text AND target_language = :lang"
    " ORDER BY created_at DESC"
    " LIMIT 1 "
)


def _find_xlsx_files(uploads_dir):
    """Return every .xlsx/.xls file found one directory level below *uploads_dir*."""
    if not uploads_dir.exists():
        return []
    return [
        file_path
        for job_dir in uploads_dir.iterdir()
        if job_dir.is_dir()
        for file_path in job_dir.iterdir()
        if file_path.suffix.lower() in ('.xlsx', '.xls')
    ]


def _classify_cell_text(cell_text):
    """Return 'mixed', 'english', 'chinese' or 'other' for a non-empty string.

    Chinese means at least one character in U+4E00..U+9FFF; English means at
    least one ASCII letter once the ignored terms have been removed.
    """
    has_chinese = any('\u4e00' <= c <= '\u9fff' for c in cell_text)

    # Drop the ignored terms as whole substrings. A per-character test against
    # 'PANJIT' would also discard every P/A/N/J/I/T in ordinary English words.
    remaining = cell_text
    for term in _IGNORED_TERMS:
        remaining = remaining.replace(term, '')
    has_english = any(c.isascii() and c.isalpha() for c in remaining)

    if has_chinese and has_english:
        return 'mixed'
    if has_english:
        return 'english'
    if has_chinese:
        return 'chinese'
    return 'other'


def _report_translation_coverage(text_segments, languages):
    """Print, per language, how many eligible segments have a cached translation."""
    # Segments of two characters or fewer (after stripping) are not counted.
    eligible = [text for text in text_segments if len(text.strip()) > 2]
    total_count = len(eligible)
    lookup = sql_text(_CACHE_LOOKUP_SQL)

    for lang in languages:
        translated_count = 0
        for text in eligible:
            row = db.session.execute(lookup, {'text': text, 'lang': lang}).fetchone()
            if row and row[0]:
                translated_count += 1

        coverage = (translated_count / total_count * 100) if total_count > 0 else 0
        print(f" {lang.upper()}翻譯覆蓋率: {coverage:.1f}% ({translated_count}/{total_count})")


def _print_format_verdict(chinese_cells, english_cells, mixed_cells):
    """Print a verdict on the translation layout derived from the cell counts."""
    total_content_cells = chinese_cells + english_cells + mixed_cells

    if total_content_cells == 0:
        print("\n❌ 沒有發現任何內容,可能翻譯失敗")
    elif english_cells > chinese_cells * 0.5:
        print("\n✅ XLSX翻譯格式良好")
        print(f" - 英文內容比例: {english_cells / total_content_cells * 100:.1f}%")
    elif mixed_cells > chinese_cells * 0.3:
        print("\n⚠️ XLSX翻譯採用混合格式")
        print(f" - 混合內容比例: {mixed_cells / total_content_cells * 100:.1f}%")
    else:
        print("\n🔍 XLSX翻譯可能使用原始格式(主要為中文)")
        print(f" - 中文內容比例: {chinese_cells / total_content_cells * 100:.1f}%")


def _analyze_sheet(ws, sheet_name):
    """Print size, language statistics, samples and a verdict for one worksheet."""
    print(f"\n📄 工作表: {sheet_name}")
    print(f" 最大行數: {ws.max_row}")
    print(f" 最大列數: {ws.max_column}")

    counts = Counter()
    sample_data = []

    # Only the top-left corner of the sheet is inspected.
    for row in range(1, min(_MAX_ROWS, ws.max_row) + 1):
        for col in range(1, min(_MAX_COLS, ws.max_column) + 1):
            cell = ws.cell(row, col)
            # Compare against None so that 0 / False are not mistaken for blanks.
            cell_text = '' if cell.value is None else str(cell.value).strip()
            if not cell_text:
                counts['empty'] += 1
                continue

            category = _classify_cell_text(cell_text)
            counts[category] += 1

            # Keep the first few cells as samples
            if len(sample_data) < _MAX_SAMPLES:
                sample_data.append({
                    'position': cell.coordinate,
                    'status': _STATUS_LABELS[category],
                    'content': cell_text[:50],
                })

    print(" 內容統計:")
    print(f" 純中文儲存格: {counts['chinese']}")
    print(f" 純英文儲存格: {counts['english']}")
    print(f" 中英混合儲存格: {counts['mixed']}")
    print(f" 空儲存格: {counts['empty']}")

    if sample_data:
        print(" 前10個內容樣本:")
        for sample in sample_data:
            print(f" {sample['position']}: {sample['status']} - {sample['content']}...")

    # Judge the translation format from this sheet's counts
    _print_format_verdict(counts['chinese'], counts['english'], counts['mixed'])


def _analyze_workbook(output_file):
    """Open the generated workbook and analyze its first few worksheets."""
    # Imported lazily so a missing openpyxl is reported by the caller's handler.
    import openpyxl

    if not output_file.exists():
        print("❌ 生成的檔案不存在")
        return

    print(f"檔案大小: {output_file.stat().st_size:,} bytes")

    # Analyze the Excel contents
    wb = openpyxl.load_workbook(str(output_file))
    try:
        print("\n📊 Excel文件分析:")
        print(f"工作表數量: {len(wb.sheetnames)}")

        for sheet_name in wb.sheetnames[:_MAX_SHEETS]:
            _analyze_sheet(wb[sheet_name], sheet_name)
    finally:
        # Release the workbook even when the analysis raises.
        wb.close()


def test_xlsx_translation_format():
    """Validate the XLSX translation format.

    Uses the first Excel file found under ``uploads/<job>/``, prints the
    translation-cache coverage for English and Vietnamese, generates both
    translated workbooks into a temp directory and analyzes the English one.
    Failures are printed rather than raised; the function returns ``None``.
    """
    app = create_app()

    with app.app_context():
        print("=== 驗證XLSX翻譯格式 ===")

        # Look for an existing XLSX file to test with
        xlsx_files = _find_xlsx_files(Path("uploads"))
        if not xlsx_files:
            print("❌ 沒有找到XLSX測試文件")
            return

        # Use the first XLSX file found
        test_file = xlsx_files[0]
        print(f"✅ 使用測試文件: {test_file}")

        # Set up the output directory
        test_dir = Path(tempfile.gettempdir()) / "xlsx_format_test"
        test_dir.mkdir(exist_ok=True)

        try:
            parser = ExcelParser(str(test_file))

            # Extract the text segments
            text_segments = parser.extract_text_segments()
            print("\n📄 文件分析:")
            print(f"提取的文字段落數: {len(text_segments)}")

            # Check translation coverage
            _report_translation_coverage(text_segments, ['en', 'vi'])

            # Generate the English translation
            print("\n🔄 生成英文翻譯XLSX文件...")
            try:
                en_output_path = parser.generate_translated_document(
                    {},  # empty mapping: translations are read from the cache
                    'en',
                    test_dir
                )
                print(f"✅ 英文翻譯文件生成: {en_output_path}")

                # Inspect the generated file's contents
                try:
                    _analyze_workbook(Path(en_output_path))
                except Exception as e:
                    print(f"❌ 分析Excel檔案失敗: {e}")

            except Exception as e:
                print(f"❌ 生成英文翻譯失敗: {e}")

            # Quick check of the Vietnamese translation
            print("\n🔄 生成越南文翻譯XLSX文件...")
            try:
                vi_output_path = parser.generate_translated_document(
                    {},
                    'vi',
                    test_dir
                )
                print(f"✅ 越南文翻譯文件生成: {vi_output_path}")

                # Only verify that the file exists and report its size
                vi_file = Path(vi_output_path)
                if vi_file.exists():
                    print(f" 檔案大小: {vi_file.stat().st_size:,} bytes")
                else:
                    print(" ❌ 越南文文件不存在")

            except Exception as e:
                print(f"❌ 生成越南文翻譯失敗: {e}")

        except Exception as e:
            print(f"❌ XLSX格式驗證失敗: {e}")


if __name__ == "__main__":
    test_xlsx_translation_format()