5th_fix excel problem

This commit is contained in:
beabigegg
2025-09-03 15:07:34 +08:00
parent cce3fd4925
commit 5fd0671b4f
28 changed files with 4484 additions and 97 deletions

View File

@@ -307,9 +307,15 @@ class ExcelParser(DocumentParser):
return None
def _should_translate(self, text: str, src_lang: str) -> bool:
"""判斷文字是否需要翻譯(移植自參考檔案"""
"""判斷文字是否需要翻譯(修正中文長度判斷"""
text = text.strip()
if len(text) < 3:
# 檢查是否包含中日韓文字
has_cjk = self._has_cjk(text)
# 對於包含CJK字符的文字,放寬長度限制為2個字符
min_length = 2 if has_cjk else 3
if len(text) < min_length:
return False
# Skip pure numbers, dates, etc.
@@ -319,7 +325,7 @@ class ExcelParser(DocumentParser):
# For auto-detect, translate if has CJK or meaningful text
if src_lang.lower() in ('auto', 'auto-detect'):
return self._has_cjk(text) or len(text) > 5
return has_cjk or len(text) > 5
return True
@@ -337,11 +343,13 @@ class ExcelParser(DocumentParser):
def generate_translated_document(self, translations: Dict[str, List[str]],
target_language: str, output_dir: Path) -> str:
"""生成翻譯後的 Excel 文件(移植自參考檔案邏輯"""
"""生成翻譯後的 Excel 文件(使用翻譯快取確保正確映射"""
try:
import openpyxl
from openpyxl.styles import Alignment
from openpyxl.comments import Comment
from sqlalchemy import text as sql_text
from app import db
# 載入原始工作簿
wb = openpyxl.load_workbook(str(self.file_path), data_only=False)
@@ -350,25 +358,70 @@ class ExcelParser(DocumentParser):
except Exception:
wb_vals = None
# 建立翻譯對應
translated_texts = translations.get(target_language, [])
# 建立翻譯映射 - 改用翻譯快取查詢,確保正確對應
original_segments = self.extract_text_segments()
# 建立翻譯映射(按照參考檔案的格式)
tmap = {}
for i, original_text in enumerate(original_segments):
if i < len(translated_texts):
tmap[original_text] = translated_texts[i]
# 處理每個工作表(完全按照參考檔案邏輯)
logger.info(f"Building translation map for {len(original_segments)} segments in language {target_language}")
for original_text in original_segments:
# 從翻譯快取中查詢每個原文的翻譯
# 使用聯合查詢,優先使用最早的翻譯記錄(原始DIFY翻譯)
normalized_text = original_text.replace('\n', ' ').replace('\r', ' ').strip()
result = db.session.execute(sql_text("""
SELECT translated_text, created_at, 'exact' as match_type
FROM dt_translation_cache
WHERE source_text = :exact_text AND target_language = :lang
UNION ALL
SELECT translated_text, created_at, 'normalized' as match_type
FROM dt_translation_cache
WHERE REPLACE(REPLACE(TRIM(source_text), '\n', ' '), '\r', ' ') = :norm_text
AND target_language = :lang
AND source_text != :exact_text
ORDER BY created_at ASC
LIMIT 1
"""), {'exact_text': original_text, 'norm_text': normalized_text, 'lang': target_language})
row = result.fetchone()
if row and row[0]:
tmap[original_text] = row[0]
logger.debug(f"Cache hit for Excel: {original_text[:30]}... -> {row[0][:30]}...")
else:
logger.warning(f"No translation found in cache for: {original_text[:50]}...")
logger.info(f"Translation map built with {len(tmap)} mappings from cache")
# 處理每個工作表(加入詳細調試日誌)
translation_count = 0
skip_count = 0
for ws in wb.worksheets:
logger.info(f"Processing worksheet: {ws.title}")
ws_vals = wb_vals[ws.title] if wb_vals and ws.title in wb_vals.sheetnames else None
max_row, max_col = ws.max_row, ws.max_column
for r in range(1, max_row + 1):
for c in range(1, max_col + 1):
cell_name = f"{openpyxl.utils.get_column_letter(c)}{r}"
src_text = self._get_display_text_for_translation(ws, ws_vals, r, c)
if not src_text or src_text not in tmap:
if not src_text:
continue
# 檢查是否需要翻譯
should_translate = self._should_translate(src_text, 'auto')
if not should_translate:
logger.debug(f"Skip {cell_name}: '{src_text[:30]}...' (should not translate)")
skip_count += 1
continue
# 檢查翻譯映射
if src_text not in tmap:
logger.warning(f"No translation mapping for {cell_name}: '{src_text[:30]}...'")
skip_count += 1
continue
val = ws.cell(row=r, column=c).value
@@ -383,6 +436,8 @@ class ExcelParser(DocumentParser):
exist = cell.comment
if not exist or exist.text.strip() != txt_comment:
cell.comment = Comment(txt_comment, "translator")
logger.debug(f"Added comment to {cell_name}: {translated_text[:30]}...")
translation_count += 1
else:
# 一般儲存格:使用交錯格式(原文+翻譯)
combined = f"{src_text}\n{translated_text}"
@@ -390,9 +445,12 @@ class ExcelParser(DocumentParser):
# 檢查是否已經是預期的格式
current_text = str(cell.value) if cell.value else ""
if current_text.strip() == combined.strip():
logger.debug(f"Skip {cell_name}: already translated")
continue
cell.value = combined
logger.info(f"Translated {cell_name}: '{src_text[:20]}...' -> '{translated_text[:20]}...'")
translation_count += 1
# 設定自動換行(移植自參考檔案)
try:
@@ -412,6 +470,7 @@ class ExcelParser(DocumentParser):
output_path = output_dir / output_filename
wb.save(str(output_path))
logger.info(f"Excel translation completed: {translation_count} translations, {skip_count} skips")
logger.info(f"Generated translated Excel file: {output_path}")
return str(output_path)
@@ -504,12 +563,90 @@ class TranslationService:
"""將文字分割成句子 - 使用增強的分句邏輯"""
return self.document_processor.split_text_into_sentences(text, language)
def _translate_whole_cell(self, text: str, source_language: str,
                          target_language: str, user_id: int = None,
                          job_id: int = None,
                          cell_label: str = "Cell") -> str:
    """Translate one cell's full text as a single unit, using the cache.

    Shared implementation behind ``translate_excel_cell`` and
    ``translate_word_table_cell``. The text is never split into sentences
    or paragraphs: it is looked up, translated and cached as one string.

    Args:
        text: Cell content; callers have already rejected blank input.
        source_language: Language passed through to the cache and client.
        target_language: Language passed through to the cache and client.
        user_id: Forwarded unchanged to ``dify_client.translate_text``.
        job_id: Forwarded unchanged to ``dify_client.translate_text``.
        cell_label: Human-readable cell kind used only in log messages.

    Returns:
        The cached translation if one is found (truthy), otherwise the
        freshly translated text, otherwise a failure marker of the form
        ``【翻譯失敗|<target_language>】<text>``.
    """
    # Cache lookup is keyed on the whole cell content.
    cached_translation = TranslationCache.get_translation(
        text, source_language, target_language
    )
    if cached_translation:
        logger.debug("%s cache hit: %s...", cell_label, text[:30])
        return cached_translation

    # Best-effort boundary: any failure (client error, missing
    # 'translated_text' key, cache write error) is logged and turned into
    # a failure marker instead of aborting the caller.
    try:
        result = self.dify_client.translate_text(
            text=text,
            source_language=source_language,
            target_language=target_language,
            user_id=user_id,
            job_id=job_id,
        )
        translated_text = result['translated_text']

        # Store the translation for the whole cell so later lookups by the
        # exact cell text can find it.
        TranslationCache.save_translation(
            text, source_language, target_language, translated_text
        )
        return translated_text
    except Exception as e:
        logger.error(
            "Failed to translate %s: %s... Error: %s",
            cell_label, text[:30], str(e),
        )
        # The closing bracket separates the language code from the original
        # text; without it the two run together and the marker is ambiguous.
        # NOTE(review): confirm any consumer that parses this marker expects
        # the bracketed form.
        return f"【翻譯失敗|{target_language}】{text}"


def translate_excel_cell(self, text: str, source_language: str,
                         target_language: str, user_id: int = None,
                         job_id: int = None) -> str:
    """Translate an Excel cell as one unit, without any sentence splitting.

    Args:
        text: Cell content to translate.
        source_language: Source language identifier.
        target_language: Target language identifier.
        user_id: Optional user id forwarded to the translation client.
        job_id: Optional job id forwarded to the translation client.

    Returns:
        ``""`` for empty or whitespace-only input; otherwise the cached or
        newly translated text, or a failure marker if translation raised.
    """
    if not text or not text.strip():
        return ""
    return self._translate_whole_cell(
        text, source_language, target_language,
        user_id=user_id, job_id=job_id, cell_label="Excel cell",
    )


def translate_word_table_cell(self, text: str, source_language: str,
                              target_language: str, user_id: int = None,
                              job_id: int = None) -> str:
    """Translate a Word table cell as one unit, without paragraph splitting.

    Args:
        text: Cell content to translate.
        source_language: Source language identifier.
        target_language: Target language identifier.
        user_id: Optional user id forwarded to the translation client.
        job_id: Optional job id forwarded to the translation client.

    Returns:
        ``""`` for empty or whitespace-only input; otherwise the cached or
        newly translated text, or a failure marker if translation raised.
    """
    if not text or not text.strip():
        return ""
    return self._translate_whole_cell(
        text, source_language, target_language,
        user_id=user_id, job_id=job_id, cell_label="Word table cell",
    )
def translate_segment_with_sentences(self, text: str, source_language: str,
target_language: str, user_id: int = None,
job_id: int = None) -> str:
"""
按段落翻譯,模仿成功版本的 translate_block_sentencewise 邏輯
對多行文字進行逐行、逐句翻譯,並重新組合成完整段落
僅用於Word文檔;Excel請使用 translate_excel_cell
"""
if not text or not text.strip():
return ""
@@ -660,14 +797,25 @@ class TranslationService:
for i, seg in enumerate(translatable_segments):
try:
# 使用整段文字進行翻譯
translated = self.translate_segment_with_sentences(
text=seg.text,
source_language=job.source_language,
target_language=target_language,
user_id=job.user_id,
job_id=job.id
)
# 根據段落類型選擇適當的翻譯方法
if seg.kind == "table_cell":
# 表格儲存格使用整個儲存格為單位的翻譯方法
translated = self.translate_word_table_cell(
text=seg.text,
source_language=job.source_language,
target_language=target_language,
user_id=job.user_id,
job_id=job.id
)
else:
# 一般段落使用原有的句子切片方法
translated = self.translate_segment_with_sentences(
text=seg.text,
source_language=job.source_language,
target_language=target_language,
user_id=job.user_id,
job_id=job.id
)
# 直接以原始段落文字為鍵儲存翻譯結果
translation_map[(target_language, seg.text)] = translated
@@ -728,9 +876,79 @@ class TranslationService:
logger.error(f"Failed to generate translated document for {target_language}: {str(e)}")
raise TranslationError(f"生成 {target_language} 翻譯文件失敗: {str(e)}")
elif file_ext in ['.xlsx', '.xls']:
# Excel 文件使用儲存格為單位的翻譯邏輯
logger.info(f"Using cell-based processing for Excel files")
parser = self.get_document_parser(job.file_path)
# 提取儲存格文字內容(不進行句子切片)
cell_segments = parser.extract_text_segments()
if not cell_segments:
raise TranslationError("Excel 文件中未找到可翻譯的文字")
logger.info(f"Found {len(cell_segments)} cell segments to translate")
# 批次翻譯 - 使用儲存格為單位的翻譯方法
translation_results = {}
total_segments = len(cell_segments)
for target_language in job.target_languages:
logger.info(f"Translating Excel cells to {target_language}")
translated_cells = []
for i, cell_text in enumerate(cell_segments):
try:
# 使用新的儲存格翻譯方法(整個儲存格作為單位)
translated = self.translate_excel_cell(
text=cell_text,
source_language=job.source_language,
target_language=target_language,
user_id=job.user_id,
job_id=job.id
)
translated_cells.append(translated)
# 更新進度
progress = (i + 1) / total_segments * 100 / len(job.target_languages)
current_lang_index = job.target_languages.index(target_language)
total_progress = (current_lang_index * 100 + progress) / len(job.target_languages)
job.update_status('PROCESSING', progress=total_progress)
time.sleep(0.1)
except Exception as e:
logger.error(f"Failed to translate Excel cell: {cell_text[:50]}... Error: {str(e)}")
translated_cells.append(f"[翻譯失敗] {cell_text}")
translation_results[target_language] = translated_cells
# 生成翻譯文件
output_dir = Path(job.file_path).parent
output_files = {}
for target_language, translations in translation_results.items():
translation_mapping = {target_language: translations}
output_file = parser.generate_translated_document(
translations=translation_mapping,
target_language=target_language,
output_dir=output_dir
)
output_files[target_language] = output_file
file_size = Path(output_file).stat().st_size
job.add_translated_file(
language_code=target_language,
filename=Path(output_file).name,
file_path=output_file,
file_size=file_size
)
else:
# 對於非 DOCX 文件,使用原有邏輯
logger.info(f"Using legacy processing for {file_ext} files")
# 對於其他文件格式,使用原有邏輯
logger.info(f"Using legacy sentence-based processing for {file_ext} files")
parser = self.get_document_parser(job.file_path)
# 提取文字片段