2ND
@@ -49,11 +49,13 @@ def create_app(config_name=None):

     # 載入配置
     config_name = config_name or os.getenv('FLASK_ENV', 'default')
-    app.config.from_object(config[config_name])

-    # 載入 Dify API 配置
+    # 先載入 Dify API 配置
     config[config_name].load_dify_config()

+    # 然後載入配置到 Flask app
+    app.config.from_object(config[config_name])
+
     # 初始化必要目錄
     config[config_name].init_directories()

@@ -92,7 +94,7 @@ def create_app(config_name=None):
     @app.after_request
     def after_request(response):
         origin = request.headers.get('Origin')
-        allowed_origins = ['http://localhost:3000', 'http://127.0.0.1:3000']
+        allowed_origins = ['http://localhost:3000', 'http://127.0.0.1:3000', 'http://localhost:3001', 'http://127.0.0.1:3001']

         if origin and origin in allowed_origins:
             response.headers['Access-Control-Allow-Origin'] = origin
@@ -109,7 +111,7 @@ def create_app(config_name=None):
     if request.method == 'OPTIONS':
         response = make_response()
         origin = request.headers.get('Origin')
-        allowed_origins = ['http://localhost:3000', 'http://127.0.0.1:3000']
+        allowed_origins = ['http://localhost:3000', 'http://127.0.0.1:3000', 'http://localhost:3001', 'http://127.0.0.1:3001']

         if origin and origin in allowed_origins:
             response.headers['Access-Control-Allow-Origin'] = origin
app/api/files.py (172 lines changed)
@@ -9,6 +9,8 @@ Modified: 2024-01-28
 """

 import json
+import zipfile
+import tempfile
 from pathlib import Path
 from flask import Blueprint, request, jsonify, send_file, current_app, g
 from werkzeug.utils import secure_filename
@@ -122,9 +124,36 @@ def upload_file():

         logger.info(f"File uploaded successfully: {job.job_uuid} - {file_info['filename']}")

-        # 觸發翻譯任務(這裡會在實作 Celery 時加入)
-        # from app.tasks.translation import process_translation_job
-        # process_translation_job.delay(job.id)
+        # 觸發翻譯任務
+        try:
+            from app.tasks.translation import process_translation_job
+
+            # 嘗試使用 Celery 異步處理
+            try:
+                task = process_translation_job.delay(job.id)
+                logger.info(f"Translation task queued with Celery: {task.id} for job {job.job_uuid}")
+            except Exception as celery_error:
+                logger.warning(f"Celery not available, falling back to synchronous processing: {str(celery_error)}")
+
+                # Celery 不可用時,使用同步處理
+                try:
+                    from app.services.translation_service import TranslationService
+                    service = TranslationService()
+
+                    # 在後台執行翻譯(同步處理)
+                    logger.info(f"Starting synchronous translation for job {job.job_uuid}")
+                    result = service.translate_document(job.job_uuid)
+                    logger.info(f"Synchronous translation completed for job {job.job_uuid}: {result}")
+
+                except Exception as sync_error:
+                    logger.error(f"Synchronous translation failed for job {job.job_uuid}: {str(sync_error)}")
+                    job.update_status('FAILED', error_message=f"翻譯處理失敗: {str(sync_error)}")
+                    db.session.commit()
+
+        except Exception as e:
+            logger.error(f"Failed to process translation for job {job.job_uuid}: {str(e)}")
+            job.update_status('FAILED', error_message=f"任務處理失敗: {str(e)}")
+            db.session.commit()

         return jsonify(create_response(
             success=True,
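Note: the block above is a queue-or-fallback dispatch. Celery is tried first, and if publishing the task fails (no reachable broker or worker) the translation runs inline in the request, at the cost of a long response. A minimal standalone sketch of that pattern, with illustrative names that are not part of the commit:

def dispatch_translation(job, process_translation_job, translation_service, logger, db):
    """Try Celery first; fall back to a synchronous run if the broker is unreachable."""
    try:
        task = process_translation_job.delay(job.id)   # raises when the broker cannot be reached
        logger.info(f"queued with Celery: {task.id}")
        return 'queued'
    except Exception as celery_error:
        logger.warning(f"Celery unavailable, running synchronously: {celery_error}")
        try:
            translation_service.translate_document(job.job_uuid)
            return 'completed'
        except Exception as sync_error:
            job.update_status('FAILED', error_message=str(sync_error))
            db.session.commit()
            return 'failed'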
@@ -440,4 +469,141 @@ def get_supported_languages():
            success=False,
            error='SYSTEM_ERROR',
            message='取得支援語言失敗'
        )), 500


@files_bp.route('/<job_uuid>/download/batch', methods=['GET'])
@jwt_login_required
def download_batch_files(job_uuid):
    """批量下載所有翻譯檔案為 ZIP"""
    try:
        # 驗證 UUID 格式
        validate_job_uuid(job_uuid)

        # 取得任務
        job = TranslationJob.query.filter_by(job_uuid=job_uuid).first()

        if not job:
            return jsonify(create_response(
                success=False,
                error='JOB_NOT_FOUND',
                message='任務不存在'
            )), 404

        # 檢查權限
        if job.user_id != g.current_user_id and not g.is_admin:
            return jsonify(create_response(
                success=False,
                error='PERMISSION_DENIED',
                message='無權限存取此檔案'
            )), 403

        # 檢查任務狀態
        if job.status != 'COMPLETED':
            return jsonify(create_response(
                success=False,
                error='JOB_NOT_COMPLETED',
                message='任務尚未完成'
            )), 400

        # 收集所有翻譯檔案
        translated_files = job.get_translated_files()

        if not translated_files:
            return jsonify(create_response(
                success=False,
                error='NO_TRANSLATED_FILES',
                message='沒有找到翻譯檔案'
            )), 404

        # 建立臨時 ZIP 檔案
        temp_dir = tempfile.gettempdir()
        zip_filename = f"{job.original_filename.split('.')[0]}_translations_{job.job_uuid[:8]}.zip"
        zip_path = Path(temp_dir) / zip_filename

        try:
            with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zip_file:
                files_added = 0

                # 添加原始檔案
                original_file = job.get_original_file()
                if original_file and Path(original_file.file_path).exists():
                    zip_file.write(
                        original_file.file_path,
                        f"original/{original_file.filename}"
                    )
                    files_added += 1

                # 添加所有翻譯檔案(避免重複)
                added_files = set()  # 追蹤已添加的檔案,避免重複
                for tf in translated_files:
                    file_path = Path(tf.file_path)
                    if file_path.exists():
                        # 按語言建立資料夾結構
                        archive_name = f"{tf.language_code}/{tf.filename}"

                        # 檢查是否已經添加過這個檔案
                        if archive_name not in added_files:
                            zip_file.write(str(file_path), archive_name)
                            added_files.add(archive_name)
                            files_added += 1
                    else:
                        logger.warning(f"Translation file not found: {tf.file_path}")

            if files_added == 0:
                return jsonify(create_response(
                    success=False,
                    error='NO_FILES_TO_ZIP',
                    message='沒有可用的檔案進行壓縮'
                )), 404

            # 檢查 ZIP 檔案是否建立成功
            if not zip_path.exists():
                return jsonify(create_response(
                    success=False,
                    error='ZIP_CREATION_FAILED',
                    message='ZIP 檔案建立失敗'
                )), 500

            # 記錄下載日誌
            SystemLog.info(
                'files.download_batch',
                f'Batch files downloaded: {zip_filename}',
                user_id=g.current_user_id,
                job_id=job.id,
                extra_data={
                    'zip_filename': zip_filename,
                    'files_count': files_added,
                    'job_uuid': job_uuid
                }
            )

            logger.info(f"Batch files downloaded: {job.job_uuid} - {files_added} files in ZIP")

            # 發送 ZIP 檔案
            return send_file(
                str(zip_path),
                as_attachment=True,
                download_name=zip_filename,
                mimetype='application/zip'
            )

        finally:
            # 清理臨時檔案(在發送後會自動清理)
            pass

    except ValidationError as e:
        return jsonify(create_response(
            success=False,
            error=e.error_code,
            message=str(e)
        )), 400

    except Exception as e:
        logger.error(f"Batch download error: {str(e)}")

        return jsonify(create_response(
            success=False,
            error='SYSTEM_ERROR',
            message='批量下載失敗'
        )), 500
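Note: the `finally: pass` above never actually removes the temporary ZIP; the comment's claim that it is cleaned up "automatically" only holds if the OS eventually purges the temp directory. One possible way to delete the file after the response body has been streamed is to register a hook before returning `send_file(...)`. This is a sketch under that assumption, not part of the commit:

import os
from flask import after_this_request

@after_this_request
def _schedule_zip_cleanup(response):
    # call_on_close fires once the WSGI server has finished with the response,
    # so the file is no longer in use when it is removed.
    response.call_on_close(lambda: zip_path.exists() and os.remove(zip_path))
    return response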
@@ -440,4 +440,95 @@ def cancel_job(job_uuid):
            success=False,
            error='SYSTEM_ERROR',
            message='取消任務失敗'
        )), 500


@jobs_bp.route('/<job_uuid>', methods=['DELETE'])
@jwt_login_required
def delete_job(job_uuid):
    """刪除任務"""
    try:
        # 驗證 UUID 格式
        validate_job_uuid(job_uuid)

        # 取得任務
        job = TranslationJob.query.filter_by(job_uuid=job_uuid).first()

        if not job:
            return jsonify(create_response(
                success=False,
                error='JOB_NOT_FOUND',
                message='任務不存在'
            )), 404

        # 檢查權限
        if job.user_id != g.current_user_id and not g.is_admin:
            return jsonify(create_response(
                success=False,
                error='PERMISSION_DENIED',
                message='無權限操作此任務'
            )), 403

        # 檢查任務狀態 - 不能刪除正在處理中的任務
        if job.status == 'PROCESSING':
            return jsonify(create_response(
                success=False,
                error='CANNOT_DELETE',
                message='無法刪除正在處理中的任務'
            )), 400

        # 刪除任務相關檔案
        import os
        import shutil
        from pathlib import Path

        try:
            if job.file_path and os.path.exists(job.file_path):
                # 取得任務目錄(通常是 uploads/job_uuid)
                job_dir = Path(job.file_path).parent
                if job_dir.exists() and job_dir.name == job.job_uuid:
                    shutil.rmtree(job_dir)
                    logger.info(f"Deleted job directory: {job_dir}")
        except Exception as file_error:
            logger.warning(f"Failed to delete job files: {str(file_error)}")

        # 記錄刪除日誌
        SystemLog.info(
            'jobs.delete',
            f'Job deleted by user: {job_uuid}',
            user_id=g.current_user_id,
            job_id=job.id,
            extra_data={
                'filename': job.original_filename,
                'status': job.status
            }
        )

        from app import db

        # 刪除資料庫記錄
        db.session.delete(job)
        db.session.commit()

        logger.info(f"Job deleted by user: {job_uuid}")

        return jsonify(create_response(
            success=True,
            message='任務已刪除'
        ))

    except ValidationError as e:
        return jsonify(create_response(
            success=False,
            error=e.error_code,
            message=str(e)
        )), 400

    except Exception as e:
        logger.error(f"Delete job error: {str(e)}")

        return jsonify(create_response(
            success=False,
            error='SYSTEM_ERROR',
            message='刪除任務失敗'
        )), 500
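For reference, a client call against the new endpoint might look like the following. The `/api/v1/jobs` prefix and bearer-token handling are assumptions based on the `@jobs_bp.route` and `@jwt_login_required` decorators, not something this diff shows:

import requests

job_uuid = "00000000-0000-0000-0000-000000000000"   # placeholder
access_token = "<jwt>"                               # placeholder

resp = requests.delete(
    f"http://localhost:5000/api/v1/jobs/{job_uuid}",
    headers={"Authorization": f"Bearer {access_token}"},
    timeout=30,
)
print(resp.status_code)  # 200 on success; 400/403/404 mirror the checks above
print(resp.json())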
@@ -58,16 +58,26 @@ class APIUsageStats(db.Model):
     def record_api_call(cls, user_id, job_id, api_endpoint, metadata, response_time_ms, success=True, error_message=None):
         """記錄 API 呼叫統計"""
         # 從 Dify API metadata 解析使用量資訊
-        prompt_tokens = metadata.get('usage', {}).get('prompt_tokens', 0)
-        completion_tokens = metadata.get('usage', {}).get('completion_tokens', 0)
-        total_tokens = metadata.get('usage', {}).get('total_tokens', prompt_tokens + completion_tokens)
+        usage_data = metadata.get('usage', {})

-        # 計算成本
-        prompt_unit_price = metadata.get('usage', {}).get('prompt_unit_price', 0.0)
-        prompt_price_unit = metadata.get('usage', {}).get('prompt_price_unit', 'USD')
+        prompt_tokens = usage_data.get('prompt_tokens', 0)
+        completion_tokens = usage_data.get('completion_tokens', 0)
+        total_tokens = usage_data.get('total_tokens', prompt_tokens + completion_tokens)

-        # 成本計算:通常是 prompt_tokens * prompt_unit_price
-        cost = prompt_tokens * float(prompt_unit_price) if prompt_unit_price else 0.0
+        # 計算成本 - 使用 Dify API 提供的總成本
+        if 'total_price' in usage_data:
+            # 直接使用 API 提供的總價格
+            cost = float(usage_data.get('total_price', 0.0))
+        else:
+            # 備用計算方式
+            prompt_price = float(usage_data.get('prompt_price', 0.0))
+            completion_price = float(usage_data.get('completion_price', 0.0))
+            cost = prompt_price + completion_price
+
+        # 單價資訊
+        prompt_unit_price = usage_data.get('prompt_unit_price', 0.0)
+        completion_unit_price = usage_data.get('completion_unit_price', 0.0)
+        prompt_price_unit = usage_data.get('currency', 'USD')

         stats = cls(
             user_id=user_id,
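The new cost logic prefers the aggregate `total_price` reported by Dify and only falls back to summing the per-side prices. A small worked example with an assumed usage payload (field values are illustrative, not taken from a real response):

usage_data = {
    'prompt_tokens': 1200,
    'completion_tokens': 800,
    'total_tokens': 2000,
    'prompt_price': '0.00090',
    'completion_price': '0.00120',
    'total_price': '0.00210',
    'currency': 'USD',
}

if 'total_price' in usage_data:
    cost = float(usage_data.get('total_price', 0.0))          # 0.0021
else:
    cost = float(usage_data.get('prompt_price', 0.0)) + \
        float(usage_data.get('completion_price', 0.0))         # same 0.0021 here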
@@ -142,15 +142,43 @@ class DifyClient:
         if not text.strip():
             raise APIError("翻譯文字不能為空")

-        # 構建請求資料
+        # 構建標準翻譯 prompt(英文指令格式)
+        language_names = {
+            'zh-tw': 'Traditional Chinese',
+            'zh-cn': 'Simplified Chinese',
+            'en': 'English',
+            'ja': 'Japanese',
+            'ko': 'Korean',
+            'vi': 'Vietnamese',
+            'th': 'Thai',
+            'id': 'Indonesian',
+            'ms': 'Malay',
+            'es': 'Spanish',
+            'fr': 'French',
+            'de': 'German',
+            'ru': 'Russian',
+            'ar': 'Arabic'
+        }
+
+        source_lang_name = language_names.get(source_language, source_language)
+        target_lang_name = language_names.get(target_language, target_language)
+
+        query = f"""Task: Translate ONLY into {target_lang_name} from {source_lang_name}.
+
+Rules:
+- Output translation text ONLY (no source text, no notes, no questions, no language-detection remarks).
+- Preserve original line breaks.
+- Do NOT wrap in quotes or code blocks.
+- Maintain original formatting and structure.
+
+{text.strip()}"""
+
+        # 構建請求資料 - 使用成功版本的格式
         request_data = {
-            'inputs': {
-                'text': text.strip(),
-                'source_language': source_language,
-                'target_language': target_language
-            },
+            'inputs': {},
             'response_mode': 'blocking',
-            'user': f"user_{user_id}" if user_id else "anonymous"
+            'user': f"user_{user_id}" if user_id else "doc-translator-user",
+            'query': query
         }

         try:
@@ -162,10 +190,10 @@ class DifyClient:
                 job_id=job_id
             )

-            # 從響應中提取翻譯結果
-            answer = response.get('answer', '')
+            # 從響應中提取翻譯結果 - 使用成功版本的方式
+            answer = response.get('answer')

-            if not answer:
+            if not isinstance(answer, str) or not answer.strip():
                 raise APIError("Dify API 返回空的翻譯結果")

             return {
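With `inputs` emptied and the whole instruction folded into `query`, the payload sent for a blocking chat call ends up looking roughly like this. The surrounding endpoint path, base URL, and auth headers live elsewhere in DifyClient, so the values below are only an illustration:

request_data = {
    'inputs': {},
    'query': (
        "Task: Translate ONLY into Traditional Chinese from English.\n"
        "\n"
        "Rules:\n"
        "- Output translation text ONLY (no source text, no notes, no questions, no language-detection remarks).\n"
        "- Preserve original line breaks.\n"
        "- Do NOT wrap in quotes or code blocks.\n"
        "- Maintain original formatting and structure.\n"
        "\n"
        "Hello, world."
    ),
    'response_mode': 'blocking',
    'user': 'doc-translator-user',
}
# The response handler then reads response.get('answer') and rejects anything
# that is not a non-empty string.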
app/services/document_processor.py (new file, 719 lines)
@@ -0,0 +1,719 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
核心文檔處理邏輯 - 移植自最佳版本
|
||||
包含完整的 DOCX 文字提取和翻譯插入功能
|
||||
|
||||
Author: PANJIT IT Team
|
||||
Created: 2024-09-02
|
||||
Modified: 2024-09-02
|
||||
"""
|
||||
|
||||
import re
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import List, Dict, Tuple, Optional, Any
|
||||
from docx.text.paragraph import Paragraph
|
||||
from docx.table import Table, _Cell
|
||||
from docx.shared import Pt
|
||||
from docx.oxml import OxmlElement
|
||||
from docx.oxml.ns import qn, nsdecls
|
||||
import docx
|
||||
|
||||
from app.utils.logger import get_logger
|
||||
from app.utils.exceptions import FileProcessingError
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
# ---------- Constants ----------
|
||||
INSERT_FONT_SIZE_PT = 10
|
||||
SENTENCE_MODE = True
|
||||
|
||||
# ---------- Optional dependencies detection ----------
|
||||
try:
|
||||
import blingfire
|
||||
_HAS_BLINGFIRE = True
|
||||
except ImportError:
|
||||
_HAS_BLINGFIRE = False
|
||||
|
||||
try:
|
||||
import pysbd
|
||||
_HAS_PYSBD = True
|
||||
except ImportError:
|
||||
_HAS_PYSBD = False
|
||||
|
||||
# ---------- Helper functions ----------
|
||||
def _has_cjk(text: str) -> bool:
    """Check if text contains CJK (Chinese/Japanese/Korean) characters."""
    for char in text:
        # Note: the Extension B range needs an 8-digit \U escape; a 5-digit \u
        # escape would silently parse as U+2000 followed by the character '0'.
        if '\u4e00' <= char <= '\u9fff' or \
           '\u3400' <= char <= '\u4dbf' or \
           '\U00020000' <= char <= '\U0002a6df' or \
           '\u3040' <= char <= '\u309f' or \
           '\u30a0' <= char <= '\u30ff' or \
           '\uac00' <= char <= '\ud7af':
            return True
    return False
|
||||
|
||||
def _normalize_text(text: str) -> str:
|
||||
"""Normalize text for comparison."""
|
||||
return re.sub(r'\s+', ' ', text.strip().lower())
|
||||
|
||||
def _append_after(p: Paragraph, text_block: str, italic: bool=True, font_size_pt: int=INSERT_FONT_SIZE_PT) -> Paragraph:
|
||||
"""Insert a new paragraph after p, return the new paragraph (for chain insert)."""
|
||||
new_p = OxmlElement("w:p")
|
||||
p._p.addnext(new_p)
|
||||
np = Paragraph(new_p, p._parent)
|
||||
lines = text_block.split("\n")
|
||||
for i, line in enumerate(lines):
|
||||
run = np.add_run(line)
|
||||
if italic:
|
||||
run.italic = True
|
||||
if font_size_pt:
|
||||
run.font.size = Pt(font_size_pt)
|
||||
if i < len(lines) - 1:
|
||||
run.add_break()
|
||||
tag = np.add_run("\u200b")
|
||||
if italic:
|
||||
tag.italic = True
|
||||
if font_size_pt:
|
||||
tag.font.size = Pt(font_size_pt)
|
||||
return np
|
||||
|
||||
def _is_our_insert_block(p: Paragraph) -> bool:
|
||||
"""Return True iff paragraph contains our zero-width marker."""
|
||||
return any("\u200b" in (r.text or "") for r in p.runs)
|
||||
|
||||
def _find_last_inserted_after(p: Paragraph, limit: int = 8) -> Optional[Paragraph]:
|
||||
"""Find the last paragraph that was inserted after p (up to limit paragraphs)."""
|
||||
try:
|
||||
# Get all paragraphs in the parent container
|
||||
if hasattr(p._parent, 'paragraphs'):
|
||||
all_paras = list(p._parent.paragraphs)
|
||||
else:
|
||||
# Handle cases where _parent doesn't have paragraphs (e.g., table cells)
|
||||
return None
|
||||
|
||||
# Find p's index
|
||||
p_index = -1
|
||||
for i, para in enumerate(all_paras):
|
||||
if para._element == p._element:
|
||||
p_index = i
|
||||
break
|
||||
|
||||
if p_index == -1:
|
||||
return None
|
||||
|
||||
# Check paragraphs after p
|
||||
last_found = None
|
||||
for i in range(p_index + 1, min(p_index + 1 + limit, len(all_paras))):
|
||||
if _is_our_insert_block(all_paras[i]):
|
||||
last_found = all_paras[i]
|
||||
else:
|
||||
break # Stop at first non-inserted paragraph
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
return last_found
|
||||
|
||||
def _p_text_with_breaks(p: Paragraph) -> str:
|
||||
"""Extract text from paragraph with line breaks preserved."""
|
||||
parts = []
|
||||
for node in p._element.xpath(".//*[local-name()='t' or local-name()='br' or local-name()='tab']"):
|
||||
tag = node.tag.split('}', 1)[-1]
|
||||
if tag == "t":
|
||||
parts.append(node.text or "")
|
||||
elif tag == "br":
|
||||
parts.append("\n")
|
||||
elif tag == "tab":
|
||||
parts.append("\t")
|
||||
return "".join(parts)
|
||||
|
||||
def _is_our_insert_block(p: Paragraph) -> bool:
|
||||
"""Check if paragraph is our inserted translation (contains zero-width space marker)."""
|
||||
text = _p_text_with_breaks(p)
|
||||
return "\u200b" in text
|
||||
|
||||
def should_translate(text: str, src_lang: str) -> bool:
|
||||
"""Determine if text should be translated based on content and source language."""
|
||||
text = text.strip()
|
||||
if len(text) < 3:
|
||||
return False
|
||||
|
||||
# Skip pure numbers, dates, etc.
|
||||
if re.match(r'^[\d\s\.\-\:\/]+$', text):
|
||||
return False
|
||||
|
||||
# For auto-detect, translate if has CJK or meaningful text
|
||||
if src_lang.lower() in ('auto', 'auto-detect'):
|
||||
return _has_cjk(text) or len(text) > 5
|
||||
|
||||
return True
|
||||
|
||||
def _split_sentences(text: str, lang: str = 'auto') -> List[str]:
|
||||
"""Split text into sentences using available libraries."""
|
||||
if not text.strip():
|
||||
return []
|
||||
|
||||
# Try blingfire first
|
||||
if _HAS_BLINGFIRE and SENTENCE_MODE:
|
||||
try:
|
||||
sentences = blingfire.text_to_sentences(text).split('\n')
|
||||
sentences = [s.strip() for s in sentences if s.strip()]
|
||||
if sentences:
|
||||
return sentences
|
||||
except Exception as e:
|
||||
logger.warning(f"Blingfire failed: {e}")
|
||||
|
||||
# Try pysbd
|
||||
if _HAS_PYSBD and SENTENCE_MODE:
|
||||
try:
|
||||
seg = pysbd.Segmenter(language="en" if lang == "auto" else lang)
|
||||
sentences = seg.segment(text)
|
||||
sentences = [s.strip() for s in sentences if s.strip()]
|
||||
if sentences:
|
||||
return sentences
|
||||
except Exception as e:
|
||||
logger.warning(f"PySBD failed: {e}")
|
||||
|
||||
# Fallback to simple splitting
|
||||
separators = ['. ', '。', '!', '?', '!', '?', '\n']
|
||||
sentences = [text]
|
||||
|
||||
for sep in separators:
|
||||
new_sentences = []
|
||||
for s in sentences:
|
||||
parts = s.split(sep)
|
||||
if len(parts) > 1:
|
||||
new_sentences.extend([p.strip() + sep.rstrip() for p in parts[:-1] if p.strip()])
|
||||
if parts[-1].strip():
|
||||
new_sentences.append(parts[-1].strip())
|
||||
else:
|
||||
new_sentences.append(s)
|
||||
sentences = new_sentences
|
||||
|
||||
return [s for s in sentences if len(s.strip()) > 3]
|
||||
|
||||
# ---------- Segment class ----------
|
||||
class Segment:
|
||||
"""Represents a translatable text segment in a document."""
|
||||
|
||||
def __init__(self, kind: str, ref: Any, ctx: str, text: str):
|
||||
self.kind = kind # 'para' | 'txbx'
|
||||
self.ref = ref # Reference to original document element
|
||||
self.ctx = ctx # Context information
|
||||
self.text = text # Text content
|
||||
|
||||
# ---------- TextBox helpers ----------
|
||||
def _txbx_iter_texts(doc: docx.Document):
|
||||
"""
|
||||
Yield (txbxContent_element, joined_source_text)
|
||||
- Deeply collect all descendant <w:p> under txbxContent
|
||||
- Skip our inserted translations: contains zero-width or (all italic and no CJK)
|
||||
- Keep only lines that still have CJK
|
||||
"""
|
||||
def _p_text_flags(p_el):
|
||||
parts = []
|
||||
for node in p_el.xpath(".//*[local-name()='t' or local-name()='br' or local-name()='tab']"):
|
||||
tag = node.tag.split('}', 1)[-1]
|
||||
if tag == "t":
|
||||
parts.append(node.text or "")
|
||||
elif tag == "br":
|
||||
parts.append("\n")
|
||||
else:
|
||||
parts.append(" ")
|
||||
text = "".join(parts)
|
||||
has_zero = ("\u200b" in text)
|
||||
runs = p_el.xpath(".//*[local-name()='r']")
|
||||
vis, ital = [], []
|
||||
for r in runs:
|
||||
rt = "".join([(t.text or "") for t in r.xpath(".//*[local-name()='t']")])
|
||||
if (rt or "").strip():
|
||||
vis.append(rt)
|
||||
ital.append(bool(r.xpath(".//*[local-name()='i']")))
|
||||
all_italic = (len(vis) > 0 and all(ital))
|
||||
return text, has_zero, all_italic
|
||||
|
||||
for tx in doc._element.xpath(".//*[local-name()='txbxContent']"):
|
||||
kept = []
|
||||
for p in tx.xpath(".//*[local-name()='p']"): # all descendant paragraphs
|
||||
text, has_zero, all_italic = _p_text_flags(p)
|
||||
if not (text or "").strip():
|
||||
continue
|
||||
if has_zero:
|
||||
continue # our inserted
|
||||
for line in text.split("\n"):
|
||||
if line.strip():
|
||||
kept.append(line.strip())
|
||||
if kept:
|
||||
joined = "\n".join(kept)
|
||||
yield tx, joined
|
||||
|
||||
def _txbx_append_paragraph(tx, text_block: str, italic: bool = True, font_size_pt: int = INSERT_FONT_SIZE_PT):
|
||||
"""Append a paragraph to textbox content."""
|
||||
p = OxmlElement("w:p")
|
||||
r = OxmlElement("w:r")
|
||||
rPr = OxmlElement("w:rPr")
|
||||
if italic:
|
||||
rPr.append(OxmlElement("w:i"))
|
||||
if font_size_pt:
|
||||
sz = OxmlElement("w:sz")
|
||||
sz.set(qn("w:val"), str(int(font_size_pt * 2)))
|
||||
rPr.append(sz)
|
||||
r.append(rPr)
|
||||
lines = text_block.split("\n")
|
||||
for i, line in enumerate(lines):
|
||||
if i > 0:
|
||||
r.append(OxmlElement("w:br"))
|
||||
t = OxmlElement("w:t")
|
||||
t.set(qn("xml:space"), "preserve")
|
||||
t.text = line
|
||||
r.append(t)
|
||||
tag = OxmlElement("w:t")
|
||||
tag.set(qn("xml:space"), "preserve")
|
||||
tag.text = "\u200b"
|
||||
r.append(tag)
|
||||
p.append(r)
|
||||
tx.append(p)
|
||||
|
||||
def _txbx_tail_equals(tx, translations: List[str]) -> bool:
|
||||
"""Check if textbox already contains the expected translations."""
|
||||
paras = tx.xpath("./*[local-name()='p']")
|
||||
if len(paras) < len(translations):
|
||||
return False
|
||||
tail = paras[-len(translations):]
|
||||
for q, expect in zip(tail, translations):
|
||||
parts = []
|
||||
for node in q.xpath(".//*[local-name()='t' or local-name()='br']"):
|
||||
tag = node.tag.split("}", 1)[-1]
|
||||
parts.append("\n" if tag == "br" else (node.text or ""))
|
||||
if _normalize_text("".join(parts).strip()) != _normalize_text(expect):
|
||||
return False
|
||||
return True
|
||||
|
||||
# ---------- Main extraction logic ----------
|
||||
def _get_paragraph_key(p: Paragraph) -> str:
|
||||
"""Generate a stable unique key for paragraph deduplication."""
|
||||
try:
|
||||
# Use XML content hash + text content for stable deduplication
|
||||
xml_content = p._p.xml if hasattr(p._p, 'xml') else str(p._p)
|
||||
text_content = _p_text_with_breaks(p)
|
||||
combined = f"{hash(xml_content)}_{len(text_content)}_{text_content[:50]}"
|
||||
return combined
|
||||
except Exception:
|
||||
# Fallback to simple text-based key
|
||||
text_content = _p_text_with_breaks(p)
|
||||
return f"fallback_{hash(text_content)}_{len(text_content)}"
|
||||
|
||||
def _collect_docx_segments(doc: docx.Document) -> List[Segment]:
|
||||
"""
|
||||
Enhanced segment collector with improved stability.
|
||||
Handles paragraphs, tables, textboxes, and SDT Content Controls.
|
||||
"""
|
||||
segs: List[Segment] = []
|
||||
seen_par_keys = set()
|
||||
|
||||
def _add_paragraph(p: Paragraph, ctx: str):
|
||||
try:
|
||||
p_key = _get_paragraph_key(p)
|
||||
if p_key in seen_par_keys:
|
||||
return
|
||||
|
||||
txt = _p_text_with_breaks(p)
|
||||
if txt.strip() and not _is_our_insert_block(p):
|
||||
segs.append(Segment("para", p, ctx, txt))
|
||||
seen_par_keys.add(p_key)
|
||||
except Exception as e:
|
||||
# Log error but continue processing
|
||||
logger.warning(f"段落處理錯誤: {e}, 跳過此段落")
|
||||
|
||||
def _process_container_content(container, ctx: str):
|
||||
"""
|
||||
Recursively processes content within a container (body, cell, or SDT content).
|
||||
Identifies and handles paragraphs, tables, and SDT elements.
|
||||
"""
|
||||
if container._element is None:
|
||||
return
|
||||
|
||||
for child_element in container._element:
|
||||
qname = child_element.tag
|
||||
|
||||
if qname.endswith('}p'): # Paragraph
|
||||
p = Paragraph(child_element, container)
|
||||
_add_paragraph(p, ctx)
|
||||
|
||||
elif qname.endswith('}tbl'): # Table
|
||||
table = Table(child_element, container)
|
||||
for r_idx, row in enumerate(table.rows, 1):
|
||||
for c_idx, cell in enumerate(row.cells, 1):
|
||||
cell_ctx = f"{ctx} > Tbl(r{r_idx},c{c_idx})"
|
||||
_process_container_content(cell, cell_ctx)
|
||||
|
||||
elif qname.endswith('}sdt'): # Structured Document Tag (SDT)
|
||||
sdt_ctx = f"{ctx} > SDT"
|
||||
|
||||
# 1. 提取 SDT 的元數據文本 (Placeholder, Dropdown items)
|
||||
ns = {'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}
|
||||
|
||||
# 提取 Placeholder text
|
||||
placeholder_texts = []
|
||||
for t in child_element.xpath('.//w:placeholder//w:t', namespaces=ns):
|
||||
if t.text:
|
||||
placeholder_texts.append(t.text)
|
||||
if placeholder_texts:
|
||||
full_placeholder = "".join(placeholder_texts).strip()
|
||||
if full_placeholder:
|
||||
segs.append(Segment("para", child_element, f"{sdt_ctx}-Placeholder", full_placeholder))
|
||||
|
||||
# 提取 Dropdown list items
|
||||
list_items = []
|
||||
for item in child_element.xpath('.//w:dropDownList/w:listItem', namespaces=ns):
|
||||
display_text = item.get(qn('w:displayText'))
|
||||
if display_text:
|
||||
list_items.append(display_text)
|
||||
if list_items:
|
||||
items_as_text = "\n".join(list_items)
|
||||
segs.append(Segment("para", child_element, f"{sdt_ctx}-Dropdown", items_as_text))
|
||||
|
||||
# 2. 遞迴處理 SDT 的實際內容 (sdtContent)
|
||||
sdt_content_element = child_element.find(qn('w:sdtContent'))
|
||||
if sdt_content_element is not None:
|
||||
class SdtContentWrapper:
|
||||
def __init__(self, element, parent):
|
||||
self._element = element
|
||||
self._parent = parent
|
||||
|
||||
sdt_content_wrapper = SdtContentWrapper(sdt_content_element, container)
|
||||
_process_container_content(sdt_content_wrapper, sdt_ctx)
|
||||
|
||||
# --- Main execution starts here ---
|
||||
|
||||
# 1. Process the main document body
|
||||
_process_container_content(doc._body, "Body")
|
||||
|
||||
# 2. Process textboxes
|
||||
for tx, s in _txbx_iter_texts(doc):
|
||||
if s.strip() and (_has_cjk(s) or should_translate(s, 'auto')):
|
||||
segs.append(Segment("txbx", tx, "TextBox", s))
|
||||
|
||||
return segs
|
||||
|
||||
def _insert_docx_translations(doc: docx.Document, segs: List[Segment],
|
||||
tmap: Dict[Tuple[str, str], str],
|
||||
targets: List[str], log=lambda s: None) -> Tuple[int, int]:
|
||||
"""
|
||||
Insert translations into DOCX document segments.
|
||||
|
||||
CRITICAL: This function contains the fix for the major translation insertion bug.
|
||||
The key fix is in the segment filtering logic - we now correctly check if any target
|
||||
language has translation available using the proper key format (target_lang, text).
|
||||
|
||||
Args:
|
||||
doc: The DOCX document object
|
||||
segs: List of segments to translate
|
||||
tmap: Translation map with keys as (target_language, source_text)
|
||||
targets: List of target languages in order
|
||||
log: Logging function
|
||||
|
||||
Returns:
|
||||
Tuple of (successful_insertions, skipped_insertions)
|
||||
|
||||
Key Bug Fix:
|
||||
OLD (INCORRECT): if (seg.kind, seg.text) not in tmap and (targets[0], seg.text) not in tmap
|
||||
NEW (CORRECT): has_any_translation = any((tgt, seg.text) in tmap for tgt in targets)
|
||||
"""
|
||||
ok_cnt = skip_cnt = 0
|
||||
|
||||
# Helper function to add a formatted run to a paragraph
|
||||
def _add_formatted_run(p: Paragraph, text: str, italic: bool, font_size_pt: int):
|
||||
lines = text.split("\n")
|
||||
for i, line in enumerate(lines):
|
||||
run = p.add_run(line)
|
||||
if italic:
|
||||
run.italic = True
|
||||
if font_size_pt:
|
||||
run.font.size = Pt(font_size_pt)
|
||||
if i < len(lines) - 1:
|
||||
run.add_break()
|
||||
# Add our zero-width space marker
|
||||
tag_run = p.add_run("\u200b")
|
||||
if italic:
|
||||
tag_run.italic = True
|
||||
if font_size_pt:
|
||||
tag_run.font.size = Pt(font_size_pt)
|
||||
|
||||
for seg in segs:
|
||||
# Check if any target language has translation for this segment
|
||||
has_any_translation = any((tgt, seg.text) in tmap for tgt in targets)
|
||||
if not has_any_translation:
|
||||
log(f"[SKIP] 無翻譯結果: {seg.ctx} | {seg.text[:50]}...")
|
||||
skip_cnt += 1
|
||||
continue
|
||||
|
||||
# Get translations for all targets, with fallback for missing ones
|
||||
translations = []
|
||||
for tgt in targets:
|
||||
if (tgt, seg.text) in tmap:
|
||||
translations.append(tmap[(tgt, seg.text)])
|
||||
else:
|
||||
log(f"[WARNING] 缺少 {tgt} 翻譯: {seg.text[:30]}...")
|
||||
translations.append(f"【翻譯查詢失敗|{tgt}】{seg.text[:50]}...")
|
||||
|
||||
log(f"[INSERT] 準備插入 {len(translations)} 個翻譯到 {seg.ctx}: {seg.text[:30]}...")
|
||||
|
||||
if seg.kind == "para":
|
||||
# Check if this is an SDT segment (ref is an XML element, not a Paragraph)
|
||||
if hasattr(seg.ref, 'tag') and seg.ref.tag.endswith('}sdt'):
|
||||
# Handle SDT segments - insert translation into sdtContent
|
||||
sdt_element = seg.ref
|
||||
ns = {'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}
|
||||
sdt_content = sdt_element.find(qn('w:sdtContent'))
|
||||
|
||||
if sdt_content is not None:
|
||||
# Check if translations already exist
|
||||
existing_paras = sdt_content.xpath('.//w:p', namespaces=ns)
|
||||
existing_texts = []
|
||||
for ep in existing_paras:
|
||||
p_obj = Paragraph(ep, None)
|
||||
if _is_our_insert_block(p_obj):
|
||||
existing_texts.append(_p_text_with_breaks(p_obj))
|
||||
|
||||
# Check if all translations already exist
|
||||
if len(existing_texts) >= len(translations):
|
||||
if all(_normalize_text(e) == _normalize_text(t) for e, t in zip(existing_texts[:len(translations)], translations)):
|
||||
skip_cnt += 1
|
||||
log(f"[SKIP] SDT 已存在翻譯: {seg.text[:30]}...")
|
||||
continue
|
||||
|
||||
# Add translations to SDT content
|
||||
for t in translations:
|
||||
if not any(_normalize_text(t) == _normalize_text(e) for e in existing_texts):
|
||||
# Create new paragraph in SDT content
|
||||
new_p_element = OxmlElement("w:p")
|
||||
sdt_content.append(new_p_element)
|
||||
new_p = Paragraph(new_p_element, None)
|
||||
_add_formatted_run(new_p, t, italic=True, font_size_pt=INSERT_FONT_SIZE_PT)
|
||||
|
||||
ok_cnt += 1
|
||||
log(f"[SUCCESS] SDT 插入翻譯(交錯格式)")
|
||||
continue
|
||||
|
||||
p: Paragraph = seg.ref
|
||||
|
||||
# --- CONTEXT-AWARE INSERTION LOGIC (from successful version) ---
|
||||
# Check if the paragraph's parent is a table cell
|
||||
if isinstance(p._parent, _Cell):
|
||||
cell = p._parent
|
||||
|
||||
try:
|
||||
# Find the current paragraph's position in the cell
|
||||
cell_paragraphs = list(cell.paragraphs)
|
||||
p_index = -1
|
||||
for idx, cell_p in enumerate(cell_paragraphs):
|
||||
if cell_p._element == p._element:
|
||||
p_index = idx
|
||||
break
|
||||
|
||||
if p_index == -1:
|
||||
log(f"[WARNING] 無法找到段落在單元格中的位置,使用原始方法")
|
||||
# Fallback to original method
|
||||
for block in translations:
|
||||
new_p = cell.add_paragraph()
|
||||
_add_formatted_run(new_p, block, italic=True, font_size_pt=INSERT_FONT_SIZE_PT)
|
||||
ok_cnt += 1
|
||||
continue
|
||||
|
||||
# Check if translations already exist right after this paragraph
|
||||
existing_texts = []
|
||||
check_limit = min(p_index + 1 + len(translations), len(cell_paragraphs))
|
||||
for idx in range(p_index + 1, check_limit):
|
||||
if _is_our_insert_block(cell_paragraphs[idx]):
|
||||
existing_texts.append(_p_text_with_breaks(cell_paragraphs[idx]))
|
||||
|
||||
# Check if all translations already exist in order
|
||||
if len(existing_texts) >= len(translations):
|
||||
if all(_normalize_text(e) == _normalize_text(t) for e, t in zip(existing_texts[:len(translations)], translations)):
|
||||
skip_cnt += 1
|
||||
log(f"[SKIP] 表格單元格已存在翻譯: {seg.text[:30]}...")
|
||||
continue
|
||||
|
||||
# Determine which translations need to be added
|
||||
to_add = []
|
||||
for t in translations:
|
||||
if not any(_normalize_text(t) == _normalize_text(e) for e in existing_texts):
|
||||
to_add.append(t)
|
||||
|
||||
if not to_add:
|
||||
skip_cnt += 1
|
||||
log(f"[SKIP] 表格單元格所有翻譯已存在: {seg.text[:30]}...")
|
||||
continue
|
||||
|
||||
# Insert new paragraphs right after the current paragraph
|
||||
insert_after = p
|
||||
for block in to_add:
|
||||
try:
|
||||
# Create new paragraph and insert it after the current position
|
||||
new_p_element = OxmlElement("w:p")
|
||||
insert_after._element.addnext(new_p_element)
|
||||
new_p = Paragraph(new_p_element, cell)
|
||||
_add_formatted_run(new_p, block, italic=True, font_size_pt=INSERT_FONT_SIZE_PT)
|
||||
insert_after = new_p # Update position for next insertion
|
||||
except Exception as e:
|
||||
log(f"[ERROR] 表格插入失敗: {e}, 嘗試fallback方法")
|
||||
# Fallback: add at the end of cell
|
||||
try:
|
||||
new_p = cell.add_paragraph()
|
||||
_add_formatted_run(new_p, block, italic=True, font_size_pt=INSERT_FONT_SIZE_PT)
|
||||
log(f"[SUCCESS] Fallback插入成功")
|
||||
except Exception as e2:
|
||||
log(f"[FATAL] Fallback也失敗: {e2}")
|
||||
continue
|
||||
ok_cnt += 1
|
||||
log(f"[SUCCESS] 表格單元格插入 {len(to_add)} 個翻譯(緊接原文後)")
|
||||
|
||||
except Exception as e:
|
||||
log(f"[ERROR] 表格處理全面失敗: {e}, 跳過此段落")
|
||||
continue
|
||||
|
||||
else:
|
||||
# Normal paragraph (not in table cell) - enhanced logic from successful version
|
||||
try:
|
||||
# Check existing translations using the enhanced method
|
||||
last = _find_last_inserted_after(p, limit=max(len(translations), 4))
|
||||
|
||||
# Check if all translations already exist
|
||||
existing_texts = []
|
||||
current_check = p
|
||||
for _ in range(len(translations)):
|
||||
try:
|
||||
# Get the next sibling paragraph
|
||||
next_sibling = current_check._element.getnext()
|
||||
if next_sibling is not None and next_sibling.tag.endswith('}p'):
|
||||
next_p = Paragraph(next_sibling, p._parent)
|
||||
if _is_our_insert_block(next_p):
|
||||
existing_texts.append(_p_text_with_breaks(next_p))
|
||||
current_check = next_p
|
||||
else:
|
||||
break
|
||||
else:
|
||||
break
|
||||
except Exception:
|
||||
break
|
||||
|
||||
# Skip if all translations already exist in order
|
||||
if len(existing_texts) >= len(translations):
|
||||
if all(_normalize_text(e) == _normalize_text(t) for e, t in zip(existing_texts[:len(translations)], translations)):
|
||||
skip_cnt += 1
|
||||
log(f"[SKIP] 段落已存在翻譯: {seg.text[:30]}...")
|
||||
continue
|
||||
|
||||
# Determine which translations need to be added
|
||||
to_add = []
|
||||
for t in translations:
|
||||
if not any(_normalize_text(t) == _normalize_text(e) for e in existing_texts):
|
||||
to_add.append(t)
|
||||
|
||||
if not to_add:
|
||||
skip_cnt += 1
|
||||
log(f"[SKIP] 段落所有翻譯已存在: {seg.text[:30]}...")
|
||||
continue
|
||||
|
||||
# Use enhanced insertion with proper positioning
|
||||
anchor = last if last else p
|
||||
|
||||
for block in to_add:
|
||||
try:
|
||||
anchor = _append_after(anchor, block, italic=True, font_size_pt=INSERT_FONT_SIZE_PT)
|
||||
except Exception as e:
|
||||
log(f"[ERROR] 段落插入失敗: {e}, 嘗試簡化插入")
|
||||
try:
|
||||
# Fallback: simple append
|
||||
if hasattr(p._parent, 'add_paragraph'):
|
||||
new_p = p._parent.add_paragraph()
|
||||
_add_formatted_run(new_p, block, italic=True, font_size_pt=INSERT_FONT_SIZE_PT)
|
||||
log(f"[SUCCESS] Fallback段落插入成功")
|
||||
else:
|
||||
log(f"[ERROR] 無法進行fallback插入")
|
||||
except Exception as e2:
|
||||
log(f"[FATAL] Fallback也失敗: {e2}")
|
||||
continue
|
||||
|
||||
ok_cnt += 1
|
||||
log(f"[SUCCESS] 段落插入 {len(to_add)} 個翻譯(交錯格式)")
|
||||
|
||||
except Exception as e:
|
||||
log(f"[ERROR] 段落處理失敗: {e}, 跳過此段落")
|
||||
continue
|
||||
|
||||
elif seg.kind == "txbx":
|
||||
tx = seg.ref
|
||||
# Check if textbox already has our translations at the end
|
||||
if _txbx_tail_equals(tx, translations):
|
||||
skip_cnt += 1
|
||||
log(f"[SKIP] 文字框已存在翻譯: {seg.text[:30]}...")
|
||||
continue
|
||||
|
||||
# Append translations to textbox
|
||||
for t in translations:
|
||||
_txbx_append_paragraph(tx, t, italic=True, font_size_pt=INSERT_FONT_SIZE_PT)
|
||||
|
||||
ok_cnt += 1
|
||||
log(f"[SUCCESS] 文字框插入 {len(translations)} 個翻譯")
|
||||
|
||||
return ok_cnt, skip_cnt
|
||||
|
||||
# ---------- Main DocumentProcessor class ----------
|
||||
class DocumentProcessor:
|
||||
"""Enhanced document processor with complete DOCX handling capabilities."""
|
||||
|
||||
def __init__(self):
|
||||
self.logger = logger
|
||||
|
||||
def extract_docx_segments(self, file_path: str) -> List[Segment]:
|
||||
"""Extract all translatable segments from DOCX file."""
|
||||
try:
|
||||
doc = docx.Document(file_path)
|
||||
segments = _collect_docx_segments(doc)
|
||||
|
||||
self.logger.info(f"Extracted {len(segments)} segments from {file_path}")
|
||||
for seg in segments[:5]: # Log first 5 segments for debugging
|
||||
self.logger.debug(f"Segment: {seg.kind} | {seg.ctx} | {seg.text[:50]}...")
|
||||
|
||||
return segments
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(f"Failed to extract DOCX segments from {file_path}: {str(e)}")
|
||||
raise FileProcessingError(f"DOCX 文件分析失敗: {str(e)}")
|
||||
|
||||
def insert_docx_translations(self, file_path: str, segments: List[Segment],
|
||||
translation_map: Dict[Tuple[str, str], str],
|
||||
target_languages: List[str], output_path: str) -> Tuple[int, int]:
|
||||
"""Insert translations into DOCX file and save to output path."""
|
||||
try:
|
||||
doc = docx.Document(file_path)
|
||||
|
||||
def log_func(msg: str):
|
||||
self.logger.debug(msg)
|
||||
|
||||
ok_count, skip_count = _insert_docx_translations(
|
||||
doc, segments, translation_map, target_languages, log_func
|
||||
)
|
||||
|
||||
# Save the modified document
|
||||
doc.save(output_path)
|
||||
|
||||
self.logger.info(f"Inserted {ok_count} translations, skipped {skip_count}. Saved to: {output_path}")
|
||||
return ok_count, skip_count
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(f"Failed to insert DOCX translations: {str(e)}")
|
||||
raise FileProcessingError(f"DOCX 翻譯插入失敗: {str(e)}")
|
||||
|
||||
def split_text_into_sentences(self, text: str, language: str = 'auto') -> List[str]:
|
||||
"""Split text into sentences using the best available method."""
|
||||
return _split_sentences(text, language)
|
||||
|
||||
def should_translate_text(self, text: str, source_language: str) -> bool:
|
||||
"""Determine if text should be translated."""
|
||||
return should_translate(text, source_language)
|
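Taken together, `extract_docx_segments`, `should_translate_text`, and `insert_docx_translations` form a two-pass pipeline: collect segments, translate them, then re-open the file and append the translations after each original. A minimal sketch of how a caller wires them up; the `translate()` helper is a stand-in, not part of this module:

from app.services.document_processor import DocumentProcessor

def translate(text: str, target: str) -> str:
    # Placeholder for whatever client actually performs the translation.
    return f"[{target}] {text}"

processor = DocumentProcessor()
segments = processor.extract_docx_segments("report.docx")

targets = ["en", "ja"]
tmap = {}
for seg in segments:
    if not processor.should_translate_text(seg.text, "auto"):
        continue
    for lang in targets:
        # Keys must be (target_language, source_text), matching the map
        # expected by _insert_docx_translations.
        tmap[(lang, seg.text)] = translate(seg.text, target=lang)

# Re-opens the source file, appends italic translation paragraphs marked with a
# zero-width space, and saves the result.
ok, skipped = processor.insert_docx_translations(
    "report.docx", segments, tmap, targets, "report.translated.docx"
)
print(f"inserted={ok} skipped={skipped}")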
@@ -11,10 +11,11 @@ Modified: 2024-01-28
 import hashlib
 import time
 from pathlib import Path
-from typing import List, Dict, Any, Optional
+from typing import List, Dict, Any, Optional, Tuple
 from app.utils.logger import get_logger
 from app.utils.exceptions import TranslationError, FileProcessingError
 from app.services.dify_client import DifyClient
+from app.services.document_processor import DocumentProcessor, Segment
 from app.models.cache import TranslationCache
 from app.models.job import TranslationJob
 from app.utils.helpers import generate_filename, create_job_directory
@@ -42,88 +43,39 @@ class DocumentParser:
|
||||
|
||||
|
||||
class DocxParser(DocumentParser):
|
||||
"""DOCX 文件解析器"""
|
||||
"""DOCX 文件解析器 - 使用增強的 DocumentProcessor"""
|
||||
|
||||
def __init__(self, file_path: str):
|
||||
super().__init__(file_path)
|
||||
self.processor = DocumentProcessor()
|
||||
|
||||
def extract_text_segments(self) -> List[str]:
|
||||
"""提取 DOCX 文件的文字片段"""
|
||||
"""提取 DOCX 文件的文字片段 - 使用增強邏輯"""
|
||||
try:
|
||||
import docx
|
||||
from docx.table import _Cell
|
||||
# 使用新的文檔處理器提取段落
|
||||
segments = self.processor.extract_docx_segments(str(self.file_path))
|
||||
|
||||
doc = docx.Document(str(self.file_path))
|
||||
# 轉換為文字列表
|
||||
text_segments = []
|
||||
for seg in segments:
|
||||
if seg.text.strip() and len(seg.text.strip()) > 3:
|
||||
text_segments.append(seg.text)
|
||||
|
||||
# 提取段落文字
|
||||
for paragraph in doc.paragraphs:
|
||||
text = paragraph.text.strip()
|
||||
if text and len(text) > 3: # 過濾太短的文字
|
||||
text_segments.append(text)
|
||||
|
||||
# 提取表格文字
|
||||
for table in doc.tables:
|
||||
for row in table.rows:
|
||||
for cell in row.cells:
|
||||
text = cell.text.strip()
|
||||
if text and len(text) > 3:
|
||||
text_segments.append(text)
|
||||
|
||||
logger.info(f"Extracted {len(text_segments)} text segments from DOCX")
|
||||
logger.info(f"Enhanced extraction: {len(text_segments)} text segments from DOCX")
|
||||
return text_segments
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to extract text from DOCX: {str(e)}")
|
||||
raise FileProcessingError(f"DOCX 文件解析失敗: {str(e)}")
|
||||
|
||||
def extract_segments_with_context(self) -> List[Segment]:
|
||||
"""提取帶上下文的段落資訊"""
|
||||
return self.processor.extract_docx_segments(str(self.file_path))
|
||||
|
||||
def generate_translated_document(self, translations: Dict[str, List[str]],
|
||||
target_language: str, output_dir: Path) -> str:
|
||||
"""生成翻譯後的 DOCX 文件"""
|
||||
"""生成翻譯後的 DOCX 文件 - 使用增強的翻譯插入邏輯"""
|
||||
try:
|
||||
import docx
|
||||
from docx.shared import Pt
|
||||
|
||||
# 開啟原始文件
|
||||
doc = docx.Document(str(self.file_path))
|
||||
|
||||
# 取得對應的翻譯
|
||||
translated_texts = translations.get(target_language, [])
|
||||
text_index = 0
|
||||
|
||||
# 處理段落
|
||||
for paragraph in doc.paragraphs:
|
||||
if paragraph.text.strip() and len(paragraph.text.strip()) > 3:
|
||||
if text_index < len(translated_texts):
|
||||
# 保留原文,添加翻譯
|
||||
original_text = paragraph.text
|
||||
translated_text = translated_texts[text_index]
|
||||
|
||||
# 清空段落
|
||||
paragraph.clear()
|
||||
|
||||
# 添加原文
|
||||
run = paragraph.add_run(original_text)
|
||||
|
||||
# 添加翻譯(新行,較小字體)
|
||||
paragraph.add_run('\n')
|
||||
trans_run = paragraph.add_run(translated_text)
|
||||
trans_run.font.size = Pt(10)
|
||||
trans_run.italic = True
|
||||
|
||||
text_index += 1
|
||||
|
||||
# 處理表格(簡化版本)
|
||||
for table in doc.tables:
|
||||
for row in table.rows:
|
||||
for cell in row.cells:
|
||||
if cell.text.strip() and len(cell.text.strip()) > 3:
|
||||
if text_index < len(translated_texts):
|
||||
original_text = cell.text
|
||||
translated_text = translated_texts[text_index]
|
||||
|
||||
# 清空儲存格
|
||||
cell.text = f"{original_text}\n{translated_text}"
|
||||
|
||||
text_index += 1
|
||||
|
||||
# 生成輸出檔名
|
||||
output_filename = generate_filename(
|
||||
self.file_path.name,
|
||||
@@ -133,10 +85,30 @@ class DocxParser(DocumentParser):
|
||||
)
|
||||
output_path = output_dir / output_filename
|
||||
|
||||
# 儲存文件
|
||||
doc.save(str(output_path))
|
||||
# 提取段落資訊
|
||||
segments = self.extract_segments_with_context()
|
||||
|
||||
logger.info(f"Generated translated DOCX: {output_path}")
|
||||
# 建立翻譯映射
|
||||
translation_map = {}
|
||||
translated_texts = translations.get(target_language, [])
|
||||
|
||||
# 對應文字段落與翻譯
|
||||
text_index = 0
|
||||
for seg in segments:
|
||||
if text_index < len(translated_texts):
|
||||
translation_map[(target_language, seg.text)] = translated_texts[text_index]
|
||||
text_index += 1
|
||||
|
||||
# 使用增強的翻譯插入邏輯
|
||||
ok_count, skip_count = self.processor.insert_docx_translations(
|
||||
str(self.file_path),
|
||||
segments,
|
||||
translation_map,
|
||||
[target_language],
|
||||
str(output_path)
|
||||
)
|
||||
|
||||
logger.info(f"Enhanced translation: Generated {output_path} with {ok_count} insertions, {skip_count} skips")
|
||||
return str(output_path)
|
||||
|
||||
except Exception as e:
|
||||
@@ -202,6 +174,7 @@ class TranslationService:
|
||||
|
||||
def __init__(self):
|
||||
self.dify_client = DifyClient()
|
||||
self.document_processor = DocumentProcessor()
|
||||
|
||||
# 文件解析器映射
|
||||
self.parsers = {
|
||||
@@ -222,31 +195,87 @@ class TranslationService:
|
||||
return parser_class(file_path)
|
||||
|
||||
def split_text_into_sentences(self, text: str, language: str = 'auto') -> List[str]:
|
||||
"""將文字分割成句子"""
|
||||
# 這裡可以使用更智能的句子分割
|
||||
# 暫時使用簡單的分割方式
|
||||
|
||||
sentences = []
|
||||
|
||||
# 基本的句子分割符號
|
||||
separators = ['. ', '。', '!', '?', '!', '?']
|
||||
|
||||
current_text = text
|
||||
for sep in separators:
|
||||
parts = current_text.split(sep)
|
||||
if len(parts) > 1:
|
||||
sentences.extend([part.strip() + sep.rstrip() for part in parts[:-1] if part.strip()])
|
||||
current_text = parts[-1]
|
||||
|
||||
# 添加最後一部分
|
||||
if current_text.strip():
|
||||
sentences.append(current_text.strip())
|
||||
|
||||
# 過濾太短的句子
|
||||
sentences = [s for s in sentences if len(s.strip()) > 5]
|
||||
|
||||
return sentences
|
||||
"""將文字分割成句子 - 使用增強的分句邏輯"""
|
||||
return self.document_processor.split_text_into_sentences(text, language)
|
||||
|
||||
def translate_segment_with_sentences(self, text: str, source_language: str,
|
||||
target_language: str, user_id: int = None,
|
||||
job_id: int = None) -> str:
|
||||
"""
|
||||
按段落翻譯,模仿成功版本的 translate_block_sentencewise 邏輯
|
||||
對多行文字進行逐行、逐句翻譯,並重新組合成完整段落
|
||||
"""
|
||||
if not text or not text.strip():
|
||||
return ""
|
||||
|
||||
# 檢查快取 - 先檢查整個段落的快取
|
||||
cached_whole = TranslationCache.get_translation(text, source_language, target_language)
|
||||
if cached_whole:
|
||||
logger.debug(f"Whole paragraph cache hit: {text[:30]}...")
|
||||
return cached_whole
|
||||
|
||||
# 按行處理
|
||||
out_lines = []
|
||||
all_successful = True
|
||||
|
||||
for raw_line in text.split('\n'):
|
||||
if not raw_line.strip():
|
||||
out_lines.append("")
|
||||
continue
|
||||
|
||||
# 分句處理
|
||||
sentences = self.document_processor.split_text_into_sentences(raw_line, source_language)
|
||||
if not sentences:
|
||||
sentences = [raw_line]
|
||||
|
||||
translated_parts = []
|
||||
for sentence in sentences:
|
||||
sentence = sentence.strip()
|
||||
if not sentence:
|
||||
continue
|
||||
|
||||
# 檢查句子級快取
|
||||
cached_sentence = TranslationCache.get_translation(sentence, source_language, target_language)
|
||||
if cached_sentence:
|
||||
translated_parts.append(cached_sentence)
|
||||
continue
|
||||
|
||||
# 呼叫 Dify API 翻譯句子
|
||||
try:
|
||||
result = self.dify_client.translate_text(
|
||||
text=sentence,
|
||||
source_language=source_language,
|
||||
target_language=target_language,
|
||||
user_id=user_id,
|
||||
job_id=job_id
|
||||
)
|
||||
|
||||
translated_sentence = result['translated_text']
|
||||
|
||||
# 儲存句子級快取
|
||||
TranslationCache.save_translation(
|
||||
sentence, source_language, target_language, translated_sentence
|
||||
)
|
||||
|
||||
translated_parts.append(translated_sentence)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to translate sentence: {sentence[:30]}... Error: {str(e)}")
|
||||
translated_parts.append(f"【翻譯失敗|{target_language}】{sentence}")
|
||||
all_successful = False
|
||||
|
||||
# 重新組合句子為一行
|
||||
out_lines.append(" ".join(translated_parts))
|
||||
|
||||
# 重新組合所有行
|
||||
final_result = "\n".join(out_lines)
|
||||
|
||||
# 如果全部成功,儲存整個段落的快取
|
||||
if all_successful:
|
||||
TranslationCache.save_translation(text, source_language, target_language, final_result)
|
||||
|
||||
return final_result
|
||||
|
||||
def translate_text_with_cache(self, text: str, source_language: str,
|
||||
target_language: str, user_id: int = None,
|
||||
job_id: int = None) -> str:
|
||||
@@ -285,82 +314,173 @@ class TranslationService:
|
||||
raise TranslationError(f"翻譯失敗: {str(e)}")
|
||||
|
||||
def translate_document(self, job_uuid: str) -> Dict[str, Any]:
|
||||
"""翻譯文件(主要入口點)"""
|
||||
"""翻譯文件(主要入口點)- 使用增強的文檔處理邏輯"""
|
||||
try:
|
||||
# 取得任務資訊
|
||||
job = TranslationJob.query.filter_by(job_uuid=job_uuid).first()
|
||||
if not job:
|
||||
raise TranslationError(f"找不到任務: {job_uuid}")
|
||||
|
||||
logger.info(f"Starting document translation: {job_uuid}")
|
||||
logger.info(f"Starting enhanced document translation: {job_uuid}")
|
||||
|
||||
# 更新任務狀態
|
||||
job.update_status('PROCESSING', progress=0)
|
||||
|
||||
# 取得文件解析器
|
||||
parser = self.get_document_parser(job.file_path)
|
||||
# 使用增強的文檔處理器直接提取段落
|
||||
file_ext = Path(job.file_path).suffix.lower()
|
||||
|
||||
# 提取文字片段
|
||||
logger.info("Extracting text segments from document")
|
||||
text_segments = parser.extract_text_segments()
|
||||
|
||||
if not text_segments:
|
||||
raise TranslationError("文件中未找到可翻譯的文字")
|
||||
|
||||
# 分割成句子
|
||||
logger.info("Splitting text into sentences")
|
||||
all_sentences = []
|
||||
for segment in text_segments:
|
||||
sentences = self.split_text_into_sentences(segment, job.source_language)
|
||||
all_sentences.extend(sentences)
|
||||
|
||||
# 去重複
|
||||
unique_sentences = list(dict.fromkeys(all_sentences)) # 保持順序的去重
|
||||
logger.info(f"Found {len(unique_sentences)} unique sentences to translate")
|
||||
|
||||
# 批次翻譯
|
||||
translation_results = {}
|
||||
total_sentences = len(unique_sentences)
|
||||
|
||||
for target_language in job.target_languages:
|
||||
logger.info(f"Translating to {target_language}")
|
||||
translated_sentences = []
|
||||
if file_ext in ['.docx', '.doc']:
|
||||
# 使用增強的 DOCX 處理邏輯
|
||||
segments = self.document_processor.extract_docx_segments(job.file_path)
|
||||
logger.info(f"Enhanced extraction: Found {len(segments)} segments to translate")
|
||||
|
||||
for i, sentence in enumerate(unique_sentences):
|
||||
if not segments:
|
||||
raise TranslationError("文件中未找到可翻譯的文字段落")
|
||||
|
||||
# 使用成功版本的翻譯邏輯 - 直接按段落翻譯,不做複雜分割
|
||||
translatable_segments = []
|
||||
for seg in segments:
|
||||
if self.document_processor.should_translate_text(seg.text, job.source_language):
|
||||
translatable_segments.append(seg)
|
||||
|
||||
logger.info(f"Found {len(translatable_segments)} segments to translate")
|
||||
|
||||
# 批次翻譯 - 直接按原始段落翻譯
|
||||
translation_map = {} # 格式: (target_language, source_text) -> translated_text
|
||||
total_segments = len(translatable_segments)
|
||||
|
||||
for target_language in job.target_languages:
|
||||
logger.info(f"Translating to {target_language}")
|
||||
|
||||
for i, seg in enumerate(translatable_segments):
|
||||
try:
|
||||
# 使用整段文字進行翻譯
|
||||
translated = self.translate_segment_with_sentences(
|
||||
text=seg.text,
|
||||
source_language=job.source_language,
|
||||
target_language=target_language,
|
||||
user_id=job.user_id,
|
||||
job_id=job.id
|
||||
)
|
||||
|
||||
# 直接以原始段落文字為鍵儲存翻譯結果
|
||||
translation_map[(target_language, seg.text)] = translated
|
||||
|
||||
# 更新進度
|
||||
progress = (i + 1) / total_segments * 100 / len(job.target_languages)
|
||||
current_lang_index = job.target_languages.index(target_language)
|
||||
total_progress = (current_lang_index * 100 + progress) / len(job.target_languages)
|
||||
job.update_status('PROCESSING', progress=total_progress)
|
||||
|
||||
# 短暫延遲避免過快請求
|
||||
time.sleep(0.1)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to translate segment: {seg.text[:50]}... Error: {str(e)}")
|
||||
# 翻譯失敗時保留原文
|
||||
translation_map[(target_language, seg.text)] = f"[翻譯失敗] {seg.text}"
|
||||
|
||||
# 生成翻譯文件
|
||||
logger.info("Generating translated documents with enhanced insertion")
|
||||
output_dir = Path(job.file_path).parent
|
||||
output_files = {}
|
||||
|
||||
for target_language in job.target_languages:
|
||||
try:
|
||||
translated = self.translate_text_with_cache(
|
||||
text=sentence,
|
||||
source_language=job.source_language,
|
||||
target_language=target_language,
|
||||
user_id=job.user_id,
|
||||
job_id=job.id
|
||||
# 生成輸出檔名
|
||||
output_filename = generate_filename(
|
||||
Path(job.file_path).name,
|
||||
'translated',
|
||||
'translated',
|
||||
target_language
|
||||
)
|
||||
translated_sentences.append(translated)
|
||||
output_path = output_dir / output_filename
|
||||
|
||||
# 更新進度
|
||||
progress = (i + 1) / total_sentences * 100 / len(job.target_languages)
|
||||
current_lang_index = job.target_languages.index(target_language)
|
||||
total_progress = (current_lang_index * 100 + progress) / len(job.target_languages)
|
||||
job.update_status('PROCESSING', progress=total_progress)
|
||||
# 使用增強的翻譯插入邏輯
|
||||
ok_count, skip_count = self.document_processor.insert_docx_translations(
|
||||
job.file_path,
|
||||
segments,
|
||||
translation_map,
|
||||
[target_language],
|
||||
str(output_path)
|
||||
)
|
||||
|
||||
# 短暫延遲避免過快請求
|
||||
time.sleep(0.1)
|
||||
output_files[target_language] = str(output_path)
|
||||
|
||||
# 記錄翻譯檔案到資料庫
|
||||
file_size = Path(output_path).stat().st_size
|
||||
job.add_translated_file(
|
||||
language_code=target_language,
|
||||
filename=Path(output_path).name,
|
||||
file_path=str(output_path),
|
||||
file_size=file_size
|
||||
)
|
||||
|
||||
logger.info(f"Generated {target_language}: {ok_count} insertions, {skip_count} skips")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to translate sentence: {sentence[:50]}... Error: {str(e)}")
|
||||
# 翻譯失敗時保留原文
|
||||
translated_sentences.append(f"[翻譯失敗] {sentence}")
|
||||
logger.error(f"Failed to generate translated document for {target_language}: {str(e)}")
|
||||
raise TranslationError(f"生成 {target_language} 翻譯文件失敗: {str(e)}")
|
||||
|
||||
else:
|
||||
# 對於非 DOCX 文件,使用原有邏輯
|
||||
logger.info(f"Using legacy processing for {file_ext} files")
|
||||
parser = self.get_document_parser(job.file_path)
|
||||
|
||||
translation_results[target_language] = translated_sentences
|
||||
|
||||
# 生成翻譯文件
|
||||
logger.info("Generating translated documents")
|
||||
output_dir = Path(job.file_path).parent
|
||||
output_files = {}
|
||||
|
||||
for target_language, translations in translation_results.items():
|
||||
try:
|
||||
# 重建翻譯映射
|
||||
# 提取文字片段
|
||||
text_segments = parser.extract_text_segments()
|
||||
|
||||
if not text_segments:
|
||||
raise TranslationError("文件中未找到可翻譯的文字")
|
||||
|
||||
# 分割成句子
|
||||
all_sentences = []
|
||||
for segment in text_segments:
|
||||
sentences = self.split_text_into_sentences(segment, job.source_language)
|
||||
all_sentences.extend(sentences)
|
||||
|
||||
# 去重複
|
||||
unique_sentences = list(dict.fromkeys(all_sentences))
|
||||
logger.info(f"Found {len(unique_sentences)} unique sentences to translate")
|
||||
|
||||
# 批次翻譯
|
||||
translation_results = {}
|
||||
total_sentences = len(unique_sentences)

for target_language in job.target_languages:
    logger.info(f"Translating to {target_language}")
    translated_sentences = []

    for i, sentence in enumerate(unique_sentences):
        try:
            translated = self.translate_text_with_cache(
                text=sentence,
                source_language=job.source_language,
                target_language=target_language,
                user_id=job.user_id,
                job_id=job.id
            )
            translated_sentences.append(translated)

            # Update progress
            progress = (i + 1) / total_sentences * 100
            current_lang_index = job.target_languages.index(target_language)
            total_progress = (current_lang_index * 100 + progress) / len(job.target_languages)
            job.update_status('PROCESSING', progress=total_progress)

            time.sleep(0.1)

        except Exception as e:
            logger.error(f"Failed to translate sentence: {sentence[:50]}... Error: {str(e)}")
            translated_sentences.append(f"[翻譯失敗] {sentence}")

    translation_results[target_language] = translated_sentences

# Generate the translated documents
output_dir = Path(job.file_path).parent
output_files = {}

for target_language, translations in translation_results.items():
    translation_mapping = {target_language: translations}

    output_file = parser.generate_translated_document(
@@ -371,7 +491,6 @@ class TranslationService:

    output_files[target_language] = output_file

    # Record the translated file in the database
    file_size = Path(output_file).stat().st_size
    job.add_translated_file(
        language_code=target_language,
@@ -379,29 +498,33 @@ class TranslationService:
        file_path=output_file,
        file_size=file_size
    )

except Exception as e:
    logger.error(f"Failed to generate translated document for {target_language}: {str(e)}")
    raise TranslationError(f"生成 {target_language} 翻譯文件失敗: {str(e)}")

# Calculate the total cost (taken from the API usage statistics)
# Calculate the total cost
total_cost = self._calculate_job_cost(job.id)

# Mark the job as completed
job.update_status('COMPLETED', progress=100)
job.total_cost = total_cost
job.total_tokens = len(unique_sentences)  # simplified token count

# Compute the actual token usage (from the API usage statistics)
from sqlalchemy import func
from app.models.stats import APIUsageStats
from app import db

actual_tokens = db.session.query(
    func.sum(APIUsageStats.total_tokens)
).filter_by(job_id=job.id).scalar()

job.total_tokens = int(actual_tokens) if actual_tokens else 0

db.session.commit()

logger.info(f"Document translation completed: {job_uuid}")
logger.info(f"Enhanced document translation completed: {job_uuid}")

return {
    'success': True,
    'job_uuid': job_uuid,
    'output_files': output_files,
    'total_sentences': len(unique_sentences),
    'total_sentences': len(texts_to_translate) if 'texts_to_translate' in locals() else len(unique_sentences) if 'unique_sentences' in locals() else 0,
    'total_cost': float(total_cost),
    'target_languages': job.target_languages
}
@@ -409,13 +532,14 @@ class TranslationService:
except TranslationError:
    raise
except Exception as e:
    logger.error(f"Document translation failed: {job_uuid}. Error: {str(e)}")
    logger.error(f"Enhanced document translation failed: {job_uuid}. Error: {str(e)}")
    raise TranslationError(f"文件翻譯失敗: {str(e)}")

def _calculate_job_cost(self, job_id: int) -> float:
    """Calculate the total cost for a job."""
    from app import db
    from sqlalchemy import func
    from app.models.stats import APIUsageStats

    total_cost = db.session.query(
        func.sum(APIUsageStats.cost)
@@ -12,17 +12,30 @@ import os
import shutil
from datetime import datetime, timedelta
from pathlib import Path
from celery import current_task
from app import create_app, db, celery
from celery import Celery, current_task
from celery.schedules import crontab
from app import create_app, db

logger = None

def get_celery_instance():
    """Get the Celery instance."""
    app = create_app()
    return app.celery

# Create the Celery instance
celery = get_celery_instance()
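# Note: because the module-level call above runs at import time, importing
# app.tasks.translation builds a full Flask app via create_app() and reuses
# whatever Celery instance that app exposes as app.celery.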

# Initialize the logger
from app.utils.logger import get_logger
logger = get_logger(__name__)

from app.models.job import TranslationJob
from app.models.log import SystemLog
from app.services.translation_service import TranslationService
from app.services.notification_service import NotificationService
from app.utils.logger import get_logger
from app.utils.exceptions import TranslationError

logger = get_logger(__name__)


@celery.task(bind=True, max_retries=3)
def process_translation_job(self, job_id: int):
@@ -319,5 +332,3 @@ def setup_periodic_tasks(sender, **kwargs):
    )


# Import crontab
from celery.schedules import crontab
84
app/utils/response.py
Normal file
@@ -0,0 +1,84 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
API response handling utilities.

Author: PANJIT IT Team
Created: 2025-09-02
"""

from datetime import datetime
from typing import Dict, Any, List, Union
from app.utils.timezone import to_taiwan_time, format_taiwan_time


def convert_datetime_to_taiwan(data: Union[Dict, List, Any]) -> Union[Dict, List, Any]:
    """Recursively convert datetime fields in the data to Taiwan time.

    Args:
        data: The data to convert (dict, list, or anything else).

    Returns:
        The converted data.
    """
    if isinstance(data, dict):
        result = {}
        for key, value in data.items():
            if isinstance(value, datetime):
                # Convert the datetime to an ISO string in Taiwan time
                taiwan_dt = to_taiwan_time(value)
                result[key] = taiwan_dt.isoformat()
            elif key in ['created_at', 'updated_at', 'completed_at', 'processing_started_at', 'last_login', 'timestamp']:
                # Well-known timestamp fields
                if isinstance(value, str):
                    try:
                        # Try to parse an ISO-format time string
                        dt = datetime.fromisoformat(value.replace('Z', '+00:00'))
                        taiwan_dt = to_taiwan_time(dt)
                        result[key] = taiwan_dt.isoformat()
                    except (ValueError, TypeError):
                        result[key] = value
                else:
                    result[key] = convert_datetime_to_taiwan(value)
            else:
                result[key] = convert_datetime_to_taiwan(value)
        return result
    elif isinstance(data, list):
        return [convert_datetime_to_taiwan(item) for item in data]
    else:
        return data


def create_taiwan_response(success: bool = True, data: Any = None, message: str = '',
                           error: str = '', **kwargs) -> Dict[str, Any]:
    """Create an API response whose datetime values are converted to Taiwan time.

    Args:
        success: Whether the request succeeded.
        data: Response payload.
        message: Success message.
        error: Error message.
        **kwargs: Additional fields.

    Returns:
        A response dict with Taiwan-timezone timestamps.
    """
    response = {
        'success': success,
        'timestamp': format_taiwan_time(datetime.now(), "%Y-%m-%d %H:%M:%S")
    }

    if data is not None:
        response['data'] = convert_datetime_to_taiwan(data)

    if message:
        response['message'] = message

    if error:
        response['error'] = error

    # Merge the extra fields
    for key, value in kwargs.items():
        response[key] = convert_datetime_to_taiwan(value)

    return response
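A minimal usage sketch (not part of the commit) of create_taiwan_response inside a Flask view; the jobs_bp blueprint and the route are hypothetical and only illustrate the intended call pattern:

from datetime import datetime
from flask import Blueprint, jsonify
from app.utils.response import create_taiwan_response

jobs_bp = Blueprint('jobs', __name__)  # hypothetical blueprint for illustration

@jobs_bp.route('/api/v1/jobs/<job_uuid>', methods=['GET'])
def get_job(job_uuid):
    # created_at is a naive UTC datetime; create_taiwan_response converts it to a
    # Taiwan-time ISO string inside the 'data' payload.
    payload = {'job_uuid': job_uuid, 'status': 'PENDING', 'created_at': datetime.utcnow()}
    return jsonify(create_taiwan_response(success=True, data=payload, message='OK'))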
104
app/utils/timezone.py
Normal file
@@ -0,0 +1,104 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Timezone utility functions.

Author: PANJIT IT Team
Created: 2025-09-02
"""

from datetime import datetime, timezone, timedelta
from typing import Optional

# Taiwan timezone (UTC+8)
TAIWAN_TZ = timezone(timedelta(hours=8))


def now_taiwan() -> datetime:
    """Return the current Taiwan time (UTC+8)."""
    return datetime.now(TAIWAN_TZ)


def now_utc() -> datetime:
    """Return the current UTC time."""
    return datetime.now(timezone.utc)


def to_taiwan_time(dt: datetime) -> Optional[datetime]:
    """Convert a datetime to Taiwan time.

    Args:
        dt: A datetime object (naive or aware).

    Returns:
        The datetime in the Taiwan timezone, or None if dt is None.
    """
    if dt is None:
        return None

    # Naive datetimes are assumed to be UTC
    if dt.tzinfo is None:
        dt = dt.replace(tzinfo=timezone.utc)

    # Convert to the Taiwan timezone
    return dt.astimezone(TAIWAN_TZ)


def to_utc_time(dt: datetime) -> Optional[datetime]:
    """Convert a datetime to UTC.

    Args:
        dt: A datetime object (naive or aware).

    Returns:
        The datetime in UTC, or None if dt is None.
    """
    if dt is None:
        return None

    # Naive datetimes are assumed to be Taiwan time
    if dt.tzinfo is None:
        dt = dt.replace(tzinfo=TAIWAN_TZ)

    # Convert to UTC
    return dt.astimezone(timezone.utc)


def format_taiwan_time(dt: datetime, format_str: str = "%Y-%m-%d %H:%M:%S") -> str:
    """Format a datetime as a Taiwan-time string.

    Args:
        dt: A datetime object.
        format_str: The format string.

    Returns:
        The formatted time string.
    """
    if dt is None:
        return ""

    taiwan_dt = to_taiwan_time(dt)
    return taiwan_dt.strftime(format_str)


def parse_taiwan_time(time_str: str, format_str: str = "%Y-%m-%d %H:%M:%S") -> datetime:
    """Parse a Taiwan-time string into a datetime.

    Args:
        time_str: The time string.
        format_str: The parsing format.

    Returns:
        A datetime in the Taiwan timezone.
    """
    naive_dt = datetime.strptime(time_str, format_str)
    return naive_dt.replace(tzinfo=TAIWAN_TZ)


# Backwards-compatible replacement for datetime.utcnow()
def utcnow() -> datetime:
    """Return the current UTC time (replacement for datetime.utcnow()).

    Note: new code should prefer now_taiwan() or now_utc().
    """
    return now_utc().replace(tzinfo=None)  # return a naive UTC datetime for compatibility
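A quick usage sketch (not part of the commit) for the helpers above; the expected values in the comments follow from TAIWAN_TZ being UTC+8:

from datetime import datetime
from app.utils.timezone import to_taiwan_time, format_taiwan_time, parse_taiwan_time

naive_utc = datetime(2025, 9, 2, 4, 30, 0)               # naive, treated as UTC
print(to_taiwan_time(naive_utc).isoformat())              # 2025-09-02T12:30:00+08:00
print(format_taiwan_time(naive_utc))                      # 2025-09-02 12:30:00
print(parse_taiwan_time("2025-09-02 12:30:00").tzinfo)    # UTC+08:00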