4th_fix time error
This commit is contained in:
101
check_docx_content.py
Normal file
101
check_docx_content.py
Normal file
@@ -0,0 +1,101 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
檢查DOCX翻譯文件的實際內容
|
||||
"""
|
||||
|
||||
import sys
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
def ensure_utf8_stream(stream):
    """Switch a text stream to UTF-8 if it is not already using it.

    Windows consoles often default to a legacy code page, which cannot
    encode the CJK text and emoji this script prints.

    Args:
        stream: A text stream such as ``sys.stdout``. May be ``None`` or an
            object without ``reconfigure()`` (for example ``io.StringIO``);
            such streams are left untouched instead of raising.
    """
    reconfigure = getattr(stream, 'reconfigure', None)
    if reconfigure is None:
        return
    if getattr(stream, 'encoding', None) != 'utf-8':
        reconfigure(encoding='utf-8')


# Fix encoding for Windows console
ensure_utf8_stream(sys.stdout)
ensure_utf8_stream(sys.stderr)

# Put the sibling "app" directory at the front of the import search path.
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'app'))
|
||||
|
||||
from app import create_app
|
||||
from app.models.job import TranslationJob
|
||||
|
||||
# UUID of the job that was originally hard-coded in this script; kept as
# the default so calling check_docx_content() with no arguments still
# inspects the same job.
_DEFAULT_JOB_UUID = '9c6548ac-2f59-45f4-aade-0a9b3895bbfd'


def _is_ascii_letter(char):
    """Return True for ASCII letters (used as a proxy for English text)."""
    return ord(char) < 128 and char.isalpha()


def _in_vietnamese_range(char):
    """Return True if *char* lies in U+00C0..U+1EF9.

    NOTE(review): this is a coarse heuristic. The range also spans other
    scripts (e.g. Greek and Cyrillic), so non-Vietnamese text can match.
    """
    return '\u00C0' <= char <= '\u1EF9'


def _print_paragraph_samples(paragraphs):
    """Print the first five paragraphs with per-paragraph language hints."""
    print("\n📄 前5段內容檢查:")
    for index, para in enumerate(paragraphs[:5], start=1):
        print(f"段落 {index}: {para[:100]}...")

        # A paragraph with embedded line breaks may hold source text and
        # translation interleaved.
        lines = para.split('\n')
        if len(lines) > 1:
            print(f" -> 多行內容(可能是交錯格式): {len(lines)} 行")
            for line_no, line in enumerate(lines[:3], start=1):  # first 3 lines
                print(f" 行{line_no}: {line[:60]}...")

        has_english = any(_is_ascii_letter(c) for c in para)
        has_vietnamese = any(_in_vietnamese_range(c) for c in para)

        print(f" -> 包含英文: {has_english}")
        print(f" -> 包含越南文: {has_vietnamese}")
        print(" ---")


def _print_language_summary(paragraphs):
    """Print character counts per script and a verdict for the document."""
    all_text = ' '.join(paragraphs)
    chinese_chars = sum(1 for c in all_text if '\u4e00' <= c <= '\u9fff')
    english_chars = sum(1 for c in all_text if _is_ascii_letter(c))
    vietnamese_chars = sum(1 for c in all_text if _in_vietnamese_range(c))

    print("\n📊 文件語言分析:")
    print(f" 中文字符: {chinese_chars}")
    print(f" 英文字符: {english_chars}")
    print(f" 越南文字符: {vietnamese_chars}")

    has_translation = english_chars > 0 or vietnamese_chars > 0
    if chinese_chars > 0 and not has_translation:
        print(" ❌ 只有中文,沒有翻譯內容!")
    elif chinese_chars > 0 and has_translation:
        print(" ✅ 包含中文和翻譯內容,可能是交錯格式")
    else:
        print(" ⚠️ 文件內容異常")


def _report_docx(file_path):
    """Open one DOCX file and print its paragraph samples and language mix.

    Any failure (missing python-docx, corrupt file, ...) is reported on
    stdout rather than raised, so the remaining files are still checked.
    """
    try:
        # Imported lazily so a missing python-docx is reported per file
        # through the handler below instead of breaking module import.
        from docx import Document
        doc = Document(str(file_path))

        stripped = (p.text.strip() for p in doc.paragraphs)
        paragraphs = [text for text in stripped if text]
        print(f"總段落數: {len(paragraphs)}")

        if paragraphs:
            _print_paragraph_samples(paragraphs)

        _print_language_summary(paragraphs)

    except Exception as e:
        print(f"❌ 讀取DOCX文件失敗: {e}")


def check_docx_content(job_uuid=_DEFAULT_JOB_UUID):
    """Inspect the actual content of a job's translated DOCX files.

    Looks up the translation job by UUID inside a Flask app context, prints
    its status/token/cost summary, then reports on each translated file.

    Args:
        job_uuid: UUID of the job to inspect. Defaults to the job that was
            originally hard-coded in this script.

    Returns:
        None. All results are printed to stdout.
    """
    app = create_app()

    with app.app_context():
        print("=== 檢查DOCX翻譯文件內容 ===")

        job = TranslationJob.query.filter_by(job_uuid=job_uuid).first()
        if not job:
            print("DOCX任務不存在")
            return

        print(f"任務狀態: {job.status}")
        print(f"總tokens: {job.total_tokens}")
        print(f"總成本: ${job.total_cost}")
        print(f"目標語言: {job.target_languages}")

        translated_files = job.get_translated_files()
        print(f"\n📁 翻譯檔案數: {len(translated_files)}")

        for tf in translated_files:
            file_path = Path(tf.file_path)
            exists = file_path.exists()
            print(f"\n【檢查】 {tf.filename} ({tf.language_code})")
            print(f"路徑: {tf.file_path}")
            print(f"存在: {exists}")

            if not exists:
                # stat() raises FileNotFoundError for a missing file, which
                # previously aborted the whole report; skip to the next file.
                continue

            print(f"大小: {file_path.stat().st_size:,} bytes")

            if tf.filename.endswith('.docx'):
                _report_docx(file_path)


if __name__ == "__main__":
    # An optional first command-line argument overrides the default job UUID.
    check_docx_content(*sys.argv[1:2])
|
Reference in New Issue
Block a user