4th_fix time error

2025-09-03 09:05:51 +08:00
parent e6e5332705
commit cce3fd4925
26 changed files with 2551 additions and 82 deletions
--- a/test_xlsx_translation_format.py
+++ b/test_xlsx_translation_format.py
@@ -0,0 +1,220 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+驗證XLSX翻譯格式 - 檢查翻譯文件內容
+"""
+
+import sys
+import os
+import tempfile
+from pathlib import Path
+
+# Fix encoding for Windows console.
+# A host (IDE, GUI launcher, test runner) may replace sys.stdout/sys.stderr
+# with None or with an object that has no reconfigure(); touching such a
+# stream unguarded would raise AttributeError at import time, so only real
+# text streams whose encoding is not already UTF-8 are reconfigured.
+for _stream in (sys.stdout, sys.stderr):
+    _encoding = (getattr(_stream, 'encoding', None) or '').lower()
+    if _encoding != 'utf-8' and hasattr(_stream, 'reconfigure'):
+        _stream.reconfigure(encoding='utf-8')
+
+# Put the "app" directory next to this script first on sys.path; this must
+# run before the project imports that follow.
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'app'))
+
+from app import create_app, db
+from app.services.translation_service import ExcelParser
+from sqlalchemy import text as sql_text
+
+# Display label for each language class returned by _classify_cell_text.
+_LANG_STATUS = {
+    'mixed': "🔄 中英混合",
+    'english': "🇺🇸 純英文",
+    'chinese': "🇨🇳 純中文",
+    'other': "❓ 其他",
+}
+
+
+def _classify_cell_text(cell_text):
+    """Classify a non-empty cell string as 'mixed', 'english', 'chinese' or 'other'.
+
+    Chinese means at least one character in U+4E00..U+9FFF.  English means at
+    least one ASCII letter once the literal token "PANJIT" has been removed.
+    """
+    has_chinese = any('\u4e00' <= c <= '\u9fff' for c in cell_text)
+
+    # Ignore the "PANJIT" token as a whole (presumably a brand name that is
+    # left untranslated).  The previous per-character test, c not in 'PANJIT',
+    # also discarded every uppercase P/A/N/J/I/T, so text such as "PAINT" was
+    # never recognised as English.
+    latin_text = cell_text.replace('PANJIT', '')
+    has_english = any(c.isascii() and c.isalpha() for c in latin_text)
+
+    if has_chinese and has_english:
+        return 'mixed'
+    if has_english:
+        return 'english'
+    if has_chinese:
+        return 'chinese'
+    return 'other'
+
+
+def _inspect_sheet(sheet_name, ws):
+    """Print language statistics for the top-left 20x5 window of one sheet.
+
+    Args:
+        sheet_name: Name shown in the report.
+        ws: Worksheet exposing max_row, max_column and cell(row, col).
+
+    Returns:
+        dict with the number of 'chinese', 'english', 'mixed', 'other' and
+        'empty' cells found in the sampled window.
+    """
+    print(f"\n📄 工作表: {sheet_name}")
+    print(f"  最大行數: {ws.max_row}")
+    print(f"  最大列數: {ws.max_column}")
+
+    counts = {'chinese': 0, 'english': 0, 'mixed': 0, 'other': 0, 'empty': 0}
+    samples = []
+
+    # Only rows 1-20 and the first 5 columns are sampled.
+    for row in range(1, min(21, ws.max_row + 1)):
+        for col in range(1, min(6, ws.max_column + 1)):
+            value = ws.cell(row, col).value
+            # Falsy values and whitespace-only strings count as empty.
+            cell_text = str(value).strip() if value else ''
+            if not cell_text:
+                counts['empty'] += 1
+                continue
+
+            label = _classify_cell_text(cell_text)
+            counts[label] += 1
+
+            # Keep the first 10 non-empty cells as examples.
+            if len(samples) < 10:
+                samples.append({
+                    # col is at most 5 here, so a single letter A-E suffices.
+                    'position': f"{chr(64+col)}{row}",
+                    'status': _LANG_STATUS[label],
+                    'content': cell_text[:50],
+                })
+
+    print(f"  內容統計:")
+    print(f"    純中文儲存格: {counts['chinese']}")
+    print(f"    純英文儲存格: {counts['english']}")
+    print(f"    中英混合儲存格: {counts['mixed']}")
+    print(f"    空儲存格: {counts['empty']}")
+
+    if samples:
+        print(f"  前10個內容樣本:")
+        for sample in samples:
+            print(f"    {sample['position']}: {sample['status']} - {sample['content']}...")
+
+    return counts
+
+
+def _print_format_verdict(chinese_cells, english_cells, mixed_cells):
+    """Print which translation layout the classified cell counts suggest."""
+    total_content_cells = chinese_cells + english_cells + mixed_cells
+    if total_content_cells == 0:
+        print(f"\n❌ 沒有發現任何內容，可能翻譯失敗")
+    elif english_cells > chinese_cells * 0.5:
+        print(f"\n✅ XLSX翻譯格式良好")
+        print(f"  - 英文內容比例: {english_cells / total_content_cells * 100:.1f}%")
+    elif mixed_cells > chinese_cells * 0.3:
+        print(f"\n⚠️ XLSX翻譯採用混合格式")
+        print(f"  - 混合內容比例: {mixed_cells / total_content_cells * 100:.1f}%")
+    else:
+        print(f"\n🔍 XLSX翻譯可能使用原始格式（主要為中文）")
+        print(f"  - 中文內容比例: {chinese_cells / total_content_cells * 100:.1f}%")
+
+
+def _analyze_workbook(output_file):
+    """Print size, sheet count, per-sheet statistics and the overall verdict.
+
+    Args:
+        output_file: pathlib.Path of an existing workbook readable by openpyxl.
+    """
+    import openpyxl
+
+    print(f"檔案大小: {output_file.stat().st_size:,} bytes")
+
+    wb = openpyxl.load_workbook(str(output_file))
+    try:
+        print(f"\n📊 Excel文件分析:")
+        print(f"工作表數量: {len(wb.sheetnames)}")
+
+        # Sum the counts of every inspected sheet.  Previously the verdict
+        # read counters that were reset per sheet, so it only reflected the
+        # last sheet that was inspected.
+        totals = {'chinese': 0, 'english': 0, 'mixed': 0}
+        for sheet_name in wb.sheetnames[:3]:  # inspect the first 3 sheets
+            sheet_counts = _inspect_sheet(sheet_name, wb[sheet_name])
+            for label in totals:
+                totals[label] += sheet_counts[label]
+
+        _print_format_verdict(totals['chinese'], totals['english'], totals['mixed'])
+    finally:
+        # Close the workbook even when the analysis above raises.
+        wb.close()
+
+
+def test_xlsx_translation_format():
+    """Smoke-check the XLSX translation output for one uploaded workbook.
+
+    Everything is reported through print(); the function returns None.
+
+    Steps:
+      1. Pick an .xlsx/.xls file found under uploads/<job_dir>/.
+      2. For 'en' and 'vi', report how many extracted text segments have a
+         non-empty translated_text row in dt_translation_cache.
+      3. Generate the English workbook and classify the cells in the
+         top-left window of up to three sheets.
+      4. Generate the Vietnamese workbook and report its size.
+
+    Exceptions raised after the test file has been chosen are caught and
+    printed, so one failing step does not hide the others.
+    """
+    app = create_app()
+
+    with app.app_context():
+        print("=== 驗證XLSX翻譯格式 ===")
+
+        # Look for existing workbooks one level below uploads/<job_dir>/.
+        uploads_dir = Path("uploads")
+        xlsx_files = []
+        if uploads_dir.is_dir():
+            # Sorted so the same file is picked on every run; iterdir()
+            # yields entries in arbitrary order.
+            xlsx_files = sorted(
+                file_path
+                for job_dir in uploads_dir.iterdir() if job_dir.is_dir()
+                for file_path in job_dir.iterdir()
+                if file_path.suffix.lower() in ('.xlsx', '.xls')
+            )
+
+        if not xlsx_files:
+            print("❌ 沒有找到XLSX測試文件")
+            return
+
+        test_file = xlsx_files[0]
+        print(f"✅ 使用測試文件: {test_file}")
+
+        # Generated workbooks are written to a fixed folder in the temp dir.
+        test_dir = Path(tempfile.gettempdir()) / "xlsx_format_test"
+        test_dir.mkdir(exist_ok=True)
+
+        try:
+            parser = ExcelParser(str(test_file))
+
+            text_segments = parser.extract_text_segments()
+            print(f"\n📄 文件分析:")
+            print(f"提取的文字段落數: {len(text_segments)}")
+
+            # Translation-cache coverage: only segments longer than two
+            # characters after stripping are considered.
+            candidates = [t for t in text_segments if len(t.strip()) > 2]
+            total_count = len(candidates)
+            cache_lookup = sql_text("""
+                SELECT translated_text
+                FROM dt_translation_cache
+                WHERE source_text = :text AND target_language = :lang
+                ORDER BY created_at DESC
+                LIMIT 1
+            """)
+
+            for lang in ('en', 'vi'):
+                translated_count = 0
+                for text in candidates:
+                    row = db.session.execute(
+                        cache_lookup, {'text': text, 'lang': lang}
+                    ).fetchone()
+                    if row and row[0]:
+                        translated_count += 1
+
+                coverage = (translated_count / total_count * 100) if total_count > 0 else 0
+                print(f"  {lang.upper()}翻譯覆蓋率: {coverage:.1f}% ({translated_count}/{total_count})")
+
+            # English workbook: generate, then inspect its content.
+            print(f"\n🔄 生成英文翻譯XLSX文件...")
+            try:
+                en_output_path = parser.generate_translated_document(
+                    {},  # empty mapping; per the original note, translations come from the cache
+                    'en',
+                    test_dir
+                )
+                print(f"✅ 英文翻譯文件生成: {en_output_path}")
+
+                try:
+                    output_file = Path(en_output_path)
+                    if output_file.exists():
+                        _analyze_workbook(output_file)
+                    else:
+                        print(f"❌ 生成的檔案不存在")
+                except Exception as e:
+                    print(f"❌ 分析Excel檔案失敗: {e}")
+
+            except Exception as e:
+                print(f"❌ 生成英文翻譯失敗: {e}")
+
+            # Vietnamese workbook: generate and only check that a file exists.
+            print(f"\n🔄 生成越南文翻譯XLSX文件...")
+            try:
+                vi_output_path = parser.generate_translated_document(
+                    {},
+                    'vi',
+                    test_dir
+                )
+                print(f"✅ 越南文翻譯文件生成: {vi_output_path}")
+
+                vi_file = Path(vi_output_path)
+                if vi_file.exists():
+                    print(f"  檔案大小: {vi_file.stat().st_size:,} bytes")
+                else:
+                    print(f"  ❌ 越南文文件不存在")
+
+            except Exception as e:
+                print(f"❌ 生成越南文翻譯失敗: {e}")
+
+        except Exception as e:
+            print(f"❌ XLSX格式驗證失敗: {e}")
+
+# Run the check when this file is executed directly as a script.
+if __name__ == "__main__":
+    test_xlsx_translation_format()