1st_fix_login_issue

Commit a60d965317 by beabigegg, 2025-09-02 10:31:35 +08:00
103 changed files with 12402 additions and 0 deletions


@@ -0,0 +1,424 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Translation service
Author: PANJIT IT Team
Created: 2024-01-28
Modified: 2024-01-28
"""
import hashlib
import time
from pathlib import Path
from typing import List, Dict, Any, Optional
from app.utils.logger import get_logger
from app.utils.exceptions import TranslationError, FileProcessingError
from app.services.dify_client import DifyClient
from app.models.cache import TranslationCache
from app.models.job import TranslationJob
from app.utils.helpers import generate_filename, create_job_directory
logger = get_logger(__name__)
class DocumentParser:
"""文件解析器基類"""
def __init__(self, file_path: str):
self.file_path = Path(file_path)
if not self.file_path.exists():
raise FileProcessingError(f"檔案不存在: {file_path}")
def extract_text_segments(self) -> List[str]:
"""提取文字片段"""
raise NotImplementedError
def generate_translated_document(self, translations: Dict[str, List[str]],
target_language: str, output_dir: Path) -> str:
"""生成翻譯後的文件"""
raise NotImplementedError
class DocxParser(DocumentParser):
"""DOCX 文件解析器"""
def extract_text_segments(self) -> List[str]:
"""提取 DOCX 文件的文字片段"""
try:
import docx
doc = docx.Document(str(self.file_path))
text_segments = []
            # Extract paragraph text
for paragraph in doc.paragraphs:
text = paragraph.text.strip()
                if text and len(text) > 3:  # Skip very short text
text_segments.append(text)
            # Extract table text
for table in doc.tables:
for row in table.rows:
for cell in row.cells:
text = cell.text.strip()
if text and len(text) > 3:
text_segments.append(text)
logger.info(f"Extracted {len(text_segments)} text segments from DOCX")
return text_segments
except Exception as e:
logger.error(f"Failed to extract text from DOCX: {str(e)}")
raise FileProcessingError(f"DOCX 文件解析失敗: {str(e)}")
def generate_translated_document(self, translations: Dict[str, List[str]],
target_language: str, output_dir: Path) -> str:
"""生成翻譯後的 DOCX 文件"""
try:
import docx
from docx.shared import Pt
            # Open the original document
doc = docx.Document(str(self.file_path))
            # Look up the translations for the target language
translated_texts = translations.get(target_language, [])
text_index = 0
            # Process paragraphs
for paragraph in doc.paragraphs:
if paragraph.text.strip() and len(paragraph.text.strip()) > 3:
if text_index < len(translated_texts):
                        # Keep the original text and append the translation
original_text = paragraph.text
translated_text = translated_texts[text_index]
                        # Clear the paragraph
paragraph.clear()
                        # Re-add the original text
run = paragraph.add_run(original_text)
                        # Add the translation (new line, smaller italic font)
paragraph.add_run('\n')
trans_run = paragraph.add_run(translated_text)
trans_run.font.size = Pt(10)
trans_run.italic = True
text_index += 1
            # Process tables (simplified handling)
for table in doc.tables:
for row in table.rows:
for cell in row.cells:
if cell.text.strip() and len(cell.text.strip()) > 3:
if text_index < len(translated_texts):
original_text = cell.text
translated_text = translated_texts[text_index]
                                # Overwrite the cell with original text plus translation
                                cell.text = f"{original_text}\n{translated_text}"
text_index += 1
            # Build the output filename
output_filename = generate_filename(
self.file_path.name,
'translated',
'translated',
target_language
)
output_path = output_dir / output_filename
            # Save the document
doc.save(str(output_path))
logger.info(f"Generated translated DOCX: {output_path}")
return str(output_path)
except Exception as e:
logger.error(f"Failed to generate translated DOCX: {str(e)}")
raise FileProcessingError(f"生成翻譯 DOCX 失敗: {str(e)}")
class PdfParser(DocumentParser):
"""PDF 文件解析器(只讀)"""
def extract_text_segments(self) -> List[str]:
"""提取 PDF 文件的文字片段"""
try:
from PyPDF2 import PdfReader
reader = PdfReader(str(self.file_path))
text_segments = []
for page in reader.pages:
text = page.extract_text()
                # Naive sentence split on periods
sentences = text.split('.')
for sentence in sentences:
sentence = sentence.strip()
if sentence and len(sentence) > 10:
text_segments.append(sentence)
logger.info(f"Extracted {len(text_segments)} text segments from PDF")
return text_segments
except Exception as e:
logger.error(f"Failed to extract text from PDF: {str(e)}")
raise FileProcessingError(f"PDF 文件解析失敗: {str(e)}")
def generate_translated_document(self, translations: Dict[str, List[str]],
target_language: str, output_dir: Path) -> str:
"""生成翻譯文字檔PDF 不支援直接編輯)"""
try:
translated_texts = translations.get(target_language, [])
            # Produce a plain-text output file
output_filename = f"{self.file_path.stem}_{target_language}_translated.txt"
output_path = output_dir / output_filename
with open(output_path, 'w', encoding='utf-8') as f:
f.write(f"翻譯結果 - {target_language}\n")
f.write("=" * 50 + "\n\n")
for i, text in enumerate(translated_texts):
f.write(f"{i+1}. {text}\n\n")
logger.info(f"Generated translated text file: {output_path}")
return str(output_path)
except Exception as e:
logger.error(f"Failed to generate translated text file: {str(e)}")
raise FileProcessingError(f"生成翻譯文字檔失敗: {str(e)}")
class TranslationService:
"""翻譯服務"""
def __init__(self):
self.dify_client = DifyClient()
        # Map file extensions to parser classes
self.parsers = {
'.docx': DocxParser,
            '.doc': DocxParser,  # NOTE: python-docx cannot open legacy .doc files, so this mapping will fail at parse time
'.pdf': PdfParser,
            # Additional formats can be registered later (see the note below)
}
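        # Registering another format is a one-line mapping entry; for example,
        # a hypothetical plain-text parser (TxtParser is an assumption, not
        # part of this commit):
        #   self.parsers['.txt'] = TxtParser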
def get_document_parser(self, file_path: str) -> DocumentParser:
"""取得文件解析器"""
file_ext = Path(file_path).suffix.lower()
parser_class = self.parsers.get(file_ext)
if not parser_class:
raise FileProcessingError(f"不支援的檔案格式: {file_ext}")
return parser_class(file_path)
def split_text_into_sentences(self, text: str, language: str = 'auto') -> List[str]:
"""將文字分割成句子"""
# 這裡可以使用更智能的句子分割
# 暫時使用簡單的分割方式
sentences = []
# 基本的句子分割符號
separators = ['. ', '', '', '', '!', '?']
current_text = text
for sep in separators:
parts = current_text.split(sep)
if len(parts) > 1:
sentences.extend([part.strip() + sep.rstrip() for part in parts[:-1] if part.strip()])
current_text = parts[-1]
        # Append the remaining tail
if current_text.strip():
sentences.append(current_text.strip())
        # Filter out very short sentences
sentences = [s for s in sentences if len(s.strip()) > 5]
return sentences
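    # Illustrative behaviour of split_text_into_sentences (a sketch, not a
    # doctest):
    #   split_text_into_sentences("First sentence here. Second one follows!")
    #   -> ['First sentence here.', 'Second one follows!']
    # Note that the final length filter (> 5 chars) silently drops short CJK
    # sentences, which are often only 2-4 characters long.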
def translate_text_with_cache(self, text: str, source_language: str,
target_language: str, user_id: int = None,
job_id: int = None) -> str:
"""帶快取的文字翻譯"""
# 檢查快取
cached_translation = TranslationCache.get_translation(
text, source_language, target_language
)
if cached_translation:
logger.debug(f"Cache hit for translation: {text[:50]}...")
return cached_translation
        # Call the Dify API
try:
result = self.dify_client.translate_text(
text=text,
source_language=source_language,
target_language=target_language,
user_id=user_id,
job_id=job_id
)
translated_text = result['translated_text']
            # Save to the cache
TranslationCache.save_translation(
text, source_language, target_language, translated_text
)
return translated_text
except Exception as e:
logger.error(f"Translation failed for text: {text[:50]}... Error: {str(e)}")
raise TranslationError(f"翻譯失敗: {str(e)}")
def translate_document(self, job_uuid: str) -> Dict[str, Any]:
"""翻譯文件(主要入口點)"""
try:
            # Load the job record
job = TranslationJob.query.filter_by(job_uuid=job_uuid).first()
if not job:
raise TranslationError(f"找不到任務: {job_uuid}")
logger.info(f"Starting document translation: {job_uuid}")
            # Update job status
job.update_status('PROCESSING', progress=0)
            # Get a document parser
parser = self.get_document_parser(job.file_path)
            # Extract text segments
logger.info("Extracting text segments from document")
text_segments = parser.extract_text_segments()
if not text_segments:
raise TranslationError("文件中未找到可翻譯的文字")
            # Split into sentences
logger.info("Splitting text into sentences")
all_sentences = []
for segment in text_segments:
sentences = self.split_text_into_sentences(segment, job.source_language)
all_sentences.extend(sentences)
            # Deduplicate while preserving order
            unique_sentences = list(dict.fromkeys(all_sentences))
logger.info(f"Found {len(unique_sentences)} unique sentences to translate")
            # Translate per target language
translation_results = {}
total_sentences = len(unique_sentences)
for target_language in job.target_languages:
logger.info(f"Translating to {target_language}")
translated_sentences = []
for i, sentence in enumerate(unique_sentences):
try:
translated = self.translate_text_with_cache(
text=sentence,
source_language=job.source_language,
target_language=target_language,
user_id=job.user_id,
job_id=job.id
)
translated_sentences.append(translated)
                        # Update overall progress: completed languages plus
                        # the fraction done of the current language
                        current_lang_index = job.target_languages.index(target_language)
                        lang_fraction = (i + 1) / total_sentences
                        total_progress = (current_lang_index + lang_fraction) / len(job.target_languages) * 100
                        job.update_status('PROCESSING', progress=total_progress)
                        # Brief delay to avoid flooding the API
time.sleep(0.1)
except Exception as e:
logger.error(f"Failed to translate sentence: {sentence[:50]}... Error: {str(e)}")
                        # On failure, keep the original text with a marker
                        translated_sentences.append(f"[translation failed] {sentence}")
translation_results[target_language] = translated_sentences
            # Generate the translated documents
logger.info("Generating translated documents")
output_dir = Path(job.file_path).parent
output_files = {}
for target_language, translations in translation_results.items():
try:
                    # Build the translation mapping for this language
translation_mapping = {target_language: translations}
output_file = parser.generate_translated_document(
translations=translation_mapping,
target_language=target_language,
output_dir=output_dir
)
output_files[target_language] = output_file
                    # Record the translated file in the database
file_size = Path(output_file).stat().st_size
job.add_translated_file(
language_code=target_language,
filename=Path(output_file).name,
file_path=output_file,
file_size=file_size
)
except Exception as e:
logger.error(f"Failed to generate translated document for {target_language}: {str(e)}")
raise TranslationError(f"生成 {target_language} 翻譯文件失敗: {str(e)}")
            # Calculate the total cost (from API usage statistics)
total_cost = self._calculate_job_cost(job.id)
            # Mark the job as completed
job.update_status('COMPLETED', progress=100)
job.total_cost = total_cost
            job.total_tokens = len(unique_sentences)  # Simplified: counts sentences, not real tokens
from app import db
db.session.commit()
logger.info(f"Document translation completed: {job_uuid}")
return {
'success': True,
'job_uuid': job_uuid,
'output_files': output_files,
'total_sentences': len(unique_sentences),
'total_cost': float(total_cost),
'target_languages': job.target_languages
}
except TranslationError:
raise
except Exception as e:
logger.error(f"Document translation failed: {job_uuid}. Error: {str(e)}")
raise TranslationError(f"文件翻譯失敗: {str(e)}")
def _calculate_job_cost(self, job_id: int) -> float:
"""計算任務總成本"""
from app import db
from sqlalchemy import func
total_cost = db.session.query(
func.sum(APIUsageStats.cost)
).filter_by(job_id=job_id).scalar()
return float(total_cost) if total_cost else 0.0
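
# --- Usage sketch (illustrative only, not part of this commit) ---
# Assumes a Flask application factory `create_app` in app/__init__.py and an
# existing TranslationJob row; both names here are assumptions.
if __name__ == '__main__':
    from app import create_app

    app = create_app()
    with app.app_context():
        service = TranslationService()
        result = service.translate_document('00000000-0000-0000-0000-000000000000')
        print(result['output_files'])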