4th_fix time error

2025-09-03 09:05:51 +08:00
parent e6e5332705
commit cce3fd4925
26 changed files with 2551 additions and 82 deletions
--- a/check_docx_content.py
+++ b/check_docx_content.py
@@ -0,0 +1,101 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Inspect the actual content of translated DOCX files.
+"""
+
+import sys
+import os
+from pathlib import Path
+
+# Force UTF-8 on both standard streams so the CJK/emoji output prints on a Windows console
+for _stream in (sys.stdout, sys.stderr):
+    if _stream.encoding != 'utf-8':
+        _stream.reconfigure(encoding='utf-8')
+
+# Put the app/ directory that sits next to this script at the front of the import path
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'app'))
+
+from app import create_app
+from app.models.job import TranslationJob
+
+def check_docx_content(job_uuid='9c6548ac-2f59-45f4-aade-0a9b3895bbfd'):
+    """Print a diagnostic report on one job's translated DOCX files.
+
+    Looks up ``job_uuid`` (default: the job this script was written to debug)
+    inside the application context, prints the job summary, then lists every
+    translated file; existing ``.docx`` files get a paragraph preview and
+    Chinese / ASCII-letter / U+00C0..U+1EF9 character counts.
+    """
+    app = create_app()
+    with app.app_context():
+        print("=== 檢查DOCX翻譯文件內容 ===")
+
+        job = TranslationJob.query.filter_by(job_uuid=job_uuid).first()
+        if not job:
+            print("DOCX任務不存在")
+            return
+
+        print(f"任務狀態: {job.status}")
+        print(f"總tokens: {job.total_tokens}")
+        print(f"總成本: ${job.total_cost}")
+        print(f"目標語言: {job.target_languages}")
+
+        translated_files = job.get_translated_files()
+        print(f"\n📁 翻譯檔案數: {len(translated_files)}")
+
+        for tf in translated_files:
+            file_path = Path(tf.file_path)
+            exists = file_path.exists()
+            print(f"\n【檢查】 {tf.filename} ({tf.language_code})")
+            print(f"路徑: {tf.file_path}")
+            print(f"存在: {exists}")
+            if exists:  # stat() raises FileNotFoundError on a missing file
+                print(f"大小: {file_path.stat().st_size:,} bytes")
+
+            if exists and tf.filename.endswith('.docx'):
+                try:
+                    from docx import Document
+                    doc = Document(str(file_path))
+                    paragraphs = [p.text.strip() for p in doc.paragraphs if p.text.strip()]
+                    print(f"總段落數: {len(paragraphs)}")
+                    if paragraphs:
+                        print(f"\n📄 前5段內容檢查:")
+                        for i, para in enumerate(paragraphs[:5]):
+                            print(f"段落 {i+1}: {para[:100]}...")
+                            # Several lines in one paragraph suggest interleaved source/translation
+                            lines = para.split('\n')
+                            if len(lines) > 1:
+                                print(f"  -> 多行內容（可能是交錯格式）: {len(lines)} 行")
+                                for j, line in enumerate(lines[:3]):  # show the first 3 lines
+                                    print(f"    行{j+1}: {line[:60]}...")
+
+                            # Heuristic: ASCII letters count as English, U+00C0..U+1EF9 as Vietnamese
+                            has_english = any(ord(c) < 128 and c.isalpha() for c in para)
+                            has_vietnamese = any('\u00C0' <= c <= '\u1EF9' for c in para)
+                            print(f"  -> 包含英文: {has_english}")
+                            print(f"  -> 包含越南文: {has_vietnamese}")
+                            print("  ---")
+
+                        # Script distribution over the whole document
+                        all_text = ' '.join(paragraphs)
+                        chinese_chars = sum(1 for c in all_text if '\u4e00' <= c <= '\u9fff')
+                        english_chars = sum(1 for c in all_text if ord(c) < 128 and c.isalpha())
+                        vietnamese_chars = sum(1 for c in all_text if '\u00C0' <= c <= '\u1EF9')
+
+                        print(f"\n📊 文件語言分析:")
+                        print(f"  中文字符: {chinese_chars}")
+                        print(f"  英文字符: {english_chars}")
+                        print(f"  越南文字符: {vietnamese_chars}")
+
+                        if chinese_chars > 0 and (english_chars == 0 and vietnamese_chars == 0):
+                            print("  ❌ 只有中文，沒有翻譯內容！")
+                        elif chinese_chars > 0 and (english_chars > 0 or vietnamese_chars > 0):
+                            print("  ✅ 包含中文和翻譯內容，可能是交錯格式")
+                        else:
+                            print("  ⚠️ 文件內容異常")
+                except Exception as e:
+                    print(f"❌ 讀取DOCX文件失敗: {e}")
+
+if __name__ == "__main__":  # run the check only when executed as a script, not on import
+    check_docx_content()