@@ -11,10 +11,11 @@ Modified: 2024-01-28
import hashlib
import time
from pathlib import Path
from typing import List, Dict, Any, Optional
from typing import List, Dict, Any, Optional, Tuple
from app.utils.logger import get_logger
from app.utils.exceptions import TranslationError, FileProcessingError
from app.services.dify_client import DifyClient
from app.services.document_processor import DocumentProcessor, Segment
from app.models.cache import TranslationCache
from app.models.job import TranslationJob
from app.utils.helpers import generate_filename, create_job_directory
@@ -42,88 +43,39 @@ class DocumentParser:


class DocxParser(DocumentParser):
    """DOCX file parser"""
    """DOCX file parser - uses the enhanced DocumentProcessor"""

    def __init__(self, file_path: str):
        super().__init__(file_path)
        self.processor = DocumentProcessor()

    def extract_text_segments(self) -> List[str]:
        """Extract the text segments of a DOCX file"""
        """Extract the text segments of a DOCX file - enhanced logic"""
        try:
            import docx
            from docx.table import _Cell
            # Use the new document processor to extract paragraphs
            segments = self.processor.extract_docx_segments(str(self.file_path))

            doc = docx.Document(str(self.file_path))
            # Convert to a plain list of text strings
            text_segments = []
            for seg in segments:
                if seg.text.strip() and len(seg.text.strip()) > 3:
                    text_segments.append(seg.text)

            # Extract paragraph text
            for paragraph in doc.paragraphs:
                text = paragraph.text.strip()
                if text and len(text) > 3:  # filter out very short text
                    text_segments.append(text)

            # Extract table text
            for table in doc.tables:
                for row in table.rows:
                    for cell in row.cells:
                        text = cell.text.strip()
                        if text and len(text) > 3:
                            text_segments.append(text)

            logger.info(f"Extracted {len(text_segments)} text segments from DOCX")
            logger.info(f"Enhanced extraction: {len(text_segments)} text segments from DOCX")
            return text_segments

        except Exception as e:
            logger.error(f"Failed to extract text from DOCX: {str(e)}")
            raise FileProcessingError(f"DOCX 文件解析失敗: {str(e)}")

    def extract_segments_with_context(self) -> List[Segment]:
        """Extract segments together with their context information"""
        return self.processor.extract_docx_segments(str(self.file_path))
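
    # Note: only the .text attribute of each Segment is read in this parser. A minimal
    # sketch of the shape this code assumes (the real class in
    # app.services.document_processor may carry additional positional/context fields
    # used by insert_docx_translations):
    #
    #     @dataclass
    #     class Segment:
    #         text: str  # raw paragraph or table-cell text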

    def generate_translated_document(self, translations: Dict[str, List[str]],
                                     target_language: str, output_dir: Path) -> str:
        """Generate the translated DOCX file"""
        """Generate the translated DOCX file - uses the enhanced translation insertion logic"""
        try:
            import docx
            from docx.shared import Pt

            # Open the original document
            doc = docx.Document(str(self.file_path))

            # Get the matching translations
            translated_texts = translations.get(target_language, [])
            text_index = 0

            # Process paragraphs
            for paragraph in doc.paragraphs:
                if paragraph.text.strip() and len(paragraph.text.strip()) > 3:
                    if text_index < len(translated_texts):
                        # Keep the original text and append the translation
                        original_text = paragraph.text
                        translated_text = translated_texts[text_index]

                        # Clear the paragraph
                        paragraph.clear()

                        # Add the original text
                        run = paragraph.add_run(original_text)

                        # Add the translation (new line, smaller font)
                        paragraph.add_run('\n')
                        trans_run = paragraph.add_run(translated_text)
                        trans_run.font.size = Pt(10)
                        trans_run.italic = True

                        text_index += 1

            # Process tables (simplified version)
            for table in doc.tables:
                for row in table.rows:
                    for cell in row.cells:
                        if cell.text.strip() and len(cell.text.strip()) > 3:
                            if text_index < len(translated_texts):
                                original_text = cell.text
                                translated_text = translated_texts[text_index]

                                # Overwrite the cell with original text plus translation
                                cell.text = f"{original_text}\n{translated_text}"

                                text_index += 1

            # Build the output file name
            output_filename = generate_filename(
                self.file_path.name,
@@ -133,10 +85,30 @@ class DocxParser(DocumentParser):
            )
            output_path = output_dir / output_filename

            # Save the document
            doc.save(str(output_path))
            # Extract the segment information
            segments = self.extract_segments_with_context()

            logger.info(f"Generated translated DOCX: {output_path}")
            # Build the translation map
            translation_map = {}
            translated_texts = translations.get(target_language, [])

            # Match text segments with their translations
            text_index = 0
            for seg in segments:
                if text_index < len(translated_texts):
                    translation_map[(target_language, seg.text)] = translated_texts[text_index]
                    text_index += 1
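
            # Illustrative example only: for a two-segment document translated to
            # Vietnamese ("vi"), the map built above would look like
            #     {("vi", "Hello"): "Xin chào", ("vi", "Thank you"): "Cảm ơn"}
            # and insert_docx_translations is expected to look up each segment's
            # original text under the requested target language while writing the output.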

            # Use the enhanced translation insertion logic
            ok_count, skip_count = self.processor.insert_docx_translations(
                str(self.file_path),
                segments,
                translation_map,
                [target_language],
                str(output_path)
            )

            logger.info(f"Enhanced translation: Generated {output_path} with {ok_count} insertions, {skip_count} skips")
            return str(output_path)

        except Exception as e:
@@ -202,6 +174,7 @@ class TranslationService:

    def __init__(self):
        self.dify_client = DifyClient()
        self.document_processor = DocumentProcessor()

        # Document parser mapping
        self.parsers = {
@@ -222,31 +195,87 @@ class TranslationService:
        return parser_class(file_path)

    def split_text_into_sentences(self, text: str, language: str = 'auto') -> List[str]:
        """Split text into sentences"""
        # A smarter sentence splitter could be used here;
        # for now, use a simple separator-based split

        sentences = []

        # Basic sentence separators
        separators = ['. ', '。', '!', '?', '!', '?']

        current_text = text
        for sep in separators:
            parts = current_text.split(sep)
            if len(parts) > 1:
                sentences.extend([part.strip() + sep.rstrip() for part in parts[:-1] if part.strip()])
                current_text = parts[-1]

        # Append the final remaining part
        if current_text.strip():
            sentences.append(current_text.strip())

        # Filter out sentences that are too short
        sentences = [s for s in sentences if len(s.strip()) > 5]

        return sentences
        """Split text into sentences - uses the enhanced splitting logic"""
        return self.document_processor.split_text_into_sentences(text, language)
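
    # The legacy splitter above breaks on '. ', '。' and half-/full-width '!' and '?',
    # then drops any piece of 5 characters or fewer, which also discards short CJK
    # sentences such as "你好。". The replacement delegates to
    # DocumentProcessor.split_text_into_sentences, which is assumed to handle mixed
    # CJK/Latin punctuation more robustly.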

    def translate_segment_with_sentences(self, text: str, source_language: str,
                                         target_language: str, user_id: int = None,
                                         job_id: int = None) -> str:
        """
        Translate a block paragraph by paragraph, mirroring the translate_block_sentencewise
        logic of the proven implementation: multi-line text is translated line by line and
        sentence by sentence, then reassembled into a complete paragraph.
        """
        if not text or not text.strip():
            return ""

        # Check the cache - whole-paragraph cache first
        cached_whole = TranslationCache.get_translation(text, source_language, target_language)
        if cached_whole:
            logger.debug(f"Whole paragraph cache hit: {text[:30]}...")
            return cached_whole

        # Process line by line
        out_lines = []
        all_successful = True

        for raw_line in text.split('\n'):
            if not raw_line.strip():
                out_lines.append("")
                continue

            # Split the line into sentences
            sentences = self.document_processor.split_text_into_sentences(raw_line, source_language)
            if not sentences:
                sentences = [raw_line]

            translated_parts = []
            for sentence in sentences:
                sentence = sentence.strip()
                if not sentence:
                    continue

                # Check the sentence-level cache
                cached_sentence = TranslationCache.get_translation(sentence, source_language, target_language)
                if cached_sentence:
                    translated_parts.append(cached_sentence)
                    continue

                # Call the Dify API to translate the sentence
                try:
                    result = self.dify_client.translate_text(
                        text=sentence,
                        source_language=source_language,
                        target_language=target_language,
                        user_id=user_id,
                        job_id=job_id
                    )

                    translated_sentence = result['translated_text']

                    # Save the sentence-level cache entry
                    TranslationCache.save_translation(
                        sentence, source_language, target_language, translated_sentence
                    )

                    translated_parts.append(translated_sentence)

                except Exception as e:
                    logger.error(f"Failed to translate sentence: {sentence[:30]}... Error: {str(e)}")
                    translated_parts.append(f"【翻譯失敗|{target_language}】{sentence}")
                    all_successful = False

            # Recombine the sentences into one line
            out_lines.append(" ".join(translated_parts))

        # Recombine all lines
        final_result = "\n".join(out_lines)

        # If everything succeeded, cache the whole paragraph
        if all_successful:
            TranslationCache.save_translation(text, source_language, target_language, final_result)

        return final_result
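
    # Caching is two-level here: the whole paragraph is looked up first, then each
    # individual sentence. The paragraph-level entry is only written back when every
    # sentence succeeded, so a paragraph containing failure markers is retried on the
    # next call instead of being cached.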

    def translate_text_with_cache(self, text: str, source_language: str,
                                  target_language: str, user_id: int = None,
                                  job_id: int = None) -> str:
@@ -285,82 +314,173 @@ class TranslationService:
            raise TranslationError(f"翻譯失敗: {str(e)}")

    def translate_document(self, job_uuid: str) -> Dict[str, Any]:
        """Translate a document (main entry point)"""
        """Translate a document (main entry point) - uses the enhanced document processing logic"""
        try:
            # Fetch the job record
            job = TranslationJob.query.filter_by(job_uuid=job_uuid).first()
            if not job:
                raise TranslationError(f"找不到任務: {job_uuid}")

            logger.info(f"Starting document translation: {job_uuid}")
            logger.info(f"Starting enhanced document translation: {job_uuid}")

            # Update the job status
            job.update_status('PROCESSING', progress=0)

            # Get the document parser
            parser = self.get_document_parser(job.file_path)
            # Use the enhanced document processor to extract segments directly
            file_ext = Path(job.file_path).suffix.lower()

            # Extract text segments
            logger.info("Extracting text segments from document")
            text_segments = parser.extract_text_segments()

            if not text_segments:
                raise TranslationError("文件中未找到可翻譯的文字")

            # Split into sentences
            logger.info("Splitting text into sentences")
            all_sentences = []
            for segment in text_segments:
                sentences = self.split_text_into_sentences(segment, job.source_language)
                all_sentences.extend(sentences)

            # De-duplicate
            unique_sentences = list(dict.fromkeys(all_sentences))  # order-preserving de-duplication
            logger.info(f"Found {len(unique_sentences)} unique sentences to translate")

            # Batch translation
            translation_results = {}
            total_sentences = len(unique_sentences)

            for target_language in job.target_languages:
                logger.info(f"Translating to {target_language}")
                translated_sentences = []
            if file_ext in ['.docx', '.doc']:
                # Use the enhanced DOCX processing logic
                segments = self.document_processor.extract_docx_segments(job.file_path)
                logger.info(f"Enhanced extraction: Found {len(segments)} segments to translate")

                for i, sentence in enumerate(unique_sentences):
                if not segments:
                    raise TranslationError("文件中未找到可翻譯的文字段落")

                # Use the proven translation logic - translate whole segments directly, no complex splitting
                translatable_segments = []
                for seg in segments:
                    if self.document_processor.should_translate_text(seg.text, job.source_language):
                        translatable_segments.append(seg)

                logger.info(f"Found {len(translatable_segments)} segments to translate")

                # Batch translation - translate the original segments directly
                translation_map = {}  # format: (target_language, source_text) -> translated_text
                total_segments = len(translatable_segments)

                for target_language in job.target_languages:
                    logger.info(f"Translating to {target_language}")

                    for i, seg in enumerate(translatable_segments):
                        try:
                            # Translate the whole segment text
                            translated = self.translate_segment_with_sentences(
                                text=seg.text,
                                source_language=job.source_language,
                                target_language=target_language,
                                user_id=job.user_id,
                                job_id=job.id
                            )

                            # Store the result keyed directly by the original segment text
                            translation_map[(target_language, seg.text)] = translated

                            # Update progress
                            progress = (i + 1) / total_segments * 100 / len(job.target_languages)
                            current_lang_index = job.target_languages.index(target_language)
                            total_progress = (current_lang_index * 100 + progress) / len(job.target_languages)
                            job.update_status('PROCESSING', progress=total_progress)
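
                            # Progress example (illustrative): with 2 target languages and
                            # 50 segments, finishing segment 25 of the first language gives
                            # progress = 25/50 * 100 / 2 = 25 and total_progress =
                            # (0*100 + 25) / 2 = 12.5. The per-language share is divided by
                            # len(job.target_languages) twice, so reported progress stays low
                            # within the first language; worth verifying if the progress bar
                            # seems to stall early.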

                            # Short delay to avoid sending requests too quickly
                            time.sleep(0.1)

                        except Exception as e:
                            logger.error(f"Failed to translate segment: {seg.text[:50]}... Error: {str(e)}")
                            # Keep the original text when translation fails
                            translation_map[(target_language, seg.text)] = f"[翻譯失敗] {seg.text}"
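
                            # A failed segment keeps a "[翻譯失敗]"-prefixed copy of the
                            # original text in the map, so a single bad segment does not
                            # abort the job; that marker is what gets written into the
                            # generated document.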

                # Generate the translated documents
                logger.info("Generating translated documents with enhanced insertion")
                output_dir = Path(job.file_path).parent
                output_files = {}

                for target_language in job.target_languages:
                    try:
                        translated = self.translate_text_with_cache(
                            text=sentence,
                            source_language=job.source_language,
                            target_language=target_language,
                            user_id=job.user_id,
                            job_id=job.id
                        # Build the output file name
                        output_filename = generate_filename(
                            Path(job.file_path).name,
                            'translated',
                            'translated',
                            target_language
                        )
                        translated_sentences.append(translated)
                        output_path = output_dir / output_filename

                        # Update progress
                        progress = (i + 1) / total_sentences * 100 / len(job.target_languages)
                        current_lang_index = job.target_languages.index(target_language)
                        total_progress = (current_lang_index * 100 + progress) / len(job.target_languages)
                        job.update_status('PROCESSING', progress=total_progress)
                        # Use the enhanced translation insertion logic
                        ok_count, skip_count = self.document_processor.insert_docx_translations(
                            job.file_path,
                            segments,
                            translation_map,
                            [target_language],
                            str(output_path)
                        )

                        # Short delay to avoid sending requests too quickly
                        time.sleep(0.1)
                        output_files[target_language] = str(output_path)

                        # Record the translated file in the database
                        file_size = Path(output_path).stat().st_size
                        job.add_translated_file(
                            language_code=target_language,
                            filename=Path(output_path).name,
                            file_path=str(output_path),
                            file_size=file_size
                        )

                        logger.info(f"Generated {target_language}: {ok_count} insertions, {skip_count} skips")

                    except Exception as e:
                        logger.error(f"Failed to translate sentence: {sentence[:50]}... Error: {str(e)}")
                        # Keep the original text when translation fails
                        translated_sentences.append(f"[翻譯失敗] {sentence}")
                        logger.error(f"Failed to generate translated document for {target_language}: {str(e)}")
                        raise TranslationError(f"生成 {target_language} 翻譯文件失敗: {str(e)}")

            else:
                # For non-DOCX files, use the original logic
                logger.info(f"Using legacy processing for {file_ext} files")
                parser = self.get_document_parser(job.file_path)

                translation_results[target_language] = translated_sentences

            # Generate the translated documents
            logger.info("Generating translated documents")
            output_dir = Path(job.file_path).parent
            output_files = {}

            for target_language, translations in translation_results.items():
                try:
                    # Rebuild the translation mapping
                # Extract text segments
                text_segments = parser.extract_text_segments()

                if not text_segments:
                    raise TranslationError("文件中未找到可翻譯的文字")

                # Split into sentences
                all_sentences = []
                for segment in text_segments:
                    sentences = self.split_text_into_sentences(segment, job.source_language)
                    all_sentences.extend(sentences)

                # De-duplicate
                unique_sentences = list(dict.fromkeys(all_sentences))
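
                # dict.fromkeys de-duplicates while preserving first-occurrence order
                # (dicts keep insertion order since Python 3.7), so each repeated
                # sentence is sent to the API only once.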
                logger.info(f"Found {len(unique_sentences)} unique sentences to translate")

                # Batch translation
                translation_results = {}
                total_sentences = len(unique_sentences)

                for target_language in job.target_languages:
                    logger.info(f"Translating to {target_language}")
                    translated_sentences = []

                    for i, sentence in enumerate(unique_sentences):
                        try:
                            translated = self.translate_text_with_cache(
                                text=sentence,
                                source_language=job.source_language,
                                target_language=target_language,
                                user_id=job.user_id,
                                job_id=job.id
                            )
                            translated_sentences.append(translated)

                            # Update progress
                            progress = (i + 1) / total_sentences * 100 / len(job.target_languages)
                            current_lang_index = job.target_languages.index(target_language)
                            total_progress = (current_lang_index * 100 + progress) / len(job.target_languages)
                            job.update_status('PROCESSING', progress=total_progress)

                            time.sleep(0.1)

                        except Exception as e:
                            logger.error(f"Failed to translate sentence: {sentence[:50]}... Error: {str(e)}")
                            translated_sentences.append(f"[翻譯失敗] {sentence}")

                    translation_results[target_language] = translated_sentences

                # Generate the translated documents
                output_dir = Path(job.file_path).parent
                output_files = {}

                for target_language, translations in translation_results.items():
                    translation_mapping = {target_language: translations}

                    output_file = parser.generate_translated_document(
@@ -371,7 +491,6 @@ class TranslationService:

                    output_files[target_language] = output_file

                    # Record the translated file in the database
                    file_size = Path(output_file).stat().st_size
                    job.add_translated_file(
                        language_code=target_language,
@@ -379,29 +498,33 @@ class TranslationService:
                        file_path=output_file,
                        file_size=file_size
                    )

                except Exception as e:
                    logger.error(f"Failed to generate translated document for {target_language}: {str(e)}")
                    raise TranslationError(f"生成 {target_language} 翻譯文件失敗: {str(e)}")

            # Compute the total cost (taken from the API usage statistics)
            # Compute the total cost
            total_cost = self._calculate_job_cost(job.id)

            # Mark the job as completed
            job.update_status('COMPLETED', progress=100)
            job.total_cost = total_cost
            job.total_tokens = len(unique_sentences)  # simplified token count

            # Compute the actual token usage (from the API usage statistics)
            from sqlalchemy import func
            from app.models.stats import APIUsageStats
            from app import db

            actual_tokens = db.session.query(
                func.sum(APIUsageStats.total_tokens)
            ).filter_by(job_id=job.id).scalar()

            job.total_tokens = int(actual_tokens) if actual_tokens else 0
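
            # The SUM query returns None when no APIUsageStats rows exist for this job
            # (e.g. every sentence was served from cache), hence the fallback to 0.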

            db.session.commit()

            logger.info(f"Document translation completed: {job_uuid}")
            logger.info(f"Enhanced document translation completed: {job_uuid}")

            return {
                'success': True,
                'job_uuid': job_uuid,
                'output_files': output_files,
                'total_sentences': len(unique_sentences),
                'total_sentences': len(texts_to_translate) if 'texts_to_translate' in locals() else len(unique_sentences) if 'unique_sentences' in locals() else 0,
                'total_cost': float(total_cost),
                'target_languages': job.target_languages
            }
@@ -409,13 +532,14 @@ class TranslationService:
        except TranslationError:
            raise
        except Exception as e:
            logger.error(f"Document translation failed: {job_uuid}. Error: {str(e)}")
            logger.error(f"Enhanced document translation failed: {job_uuid}. Error: {str(e)}")
            raise TranslationError(f"文件翻譯失敗: {str(e)}")

    def _calculate_job_cost(self, job_id: int) -> float:
        """Compute the total cost of a job"""
        from app import db
        from sqlalchemy import func
        from app.models.stats import APIUsageStats

        total_cost = db.session.query(
            func.sum(APIUsageStats.cost)