5th_fix excel problem
This commit is contained in:
@@ -130,6 +130,37 @@ def _p_text_with_breaks(p: Paragraph) -> str:
|
||||
parts.append("\t")
|
||||
return "".join(parts)
|
||||
|
||||
def _get_cell_full_text(cell) -> str:
|
||||
"""
|
||||
提取表格儲存格的完整文字內容,包含所有段落
|
||||
"""
|
||||
try:
|
||||
cell_texts = []
|
||||
for para in cell.paragraphs:
|
||||
para_text = _p_text_with_breaks(para)
|
||||
if para_text.strip():
|
||||
cell_texts.append(para_text.strip())
|
||||
|
||||
# 用換行符連接所有段落
|
||||
return '\n'.join(cell_texts)
|
||||
except Exception as e:
|
||||
logger.warning(f"提取儲存格文字失敗: {e}")
|
||||
return ""
|
||||
|
||||
def _is_our_insert_block_text(text: str) -> bool:
|
||||
"""檢查文字是否為翻譯插入區塊"""
|
||||
if not text:
|
||||
return False
|
||||
text_lower = text.lower().strip()
|
||||
return (
|
||||
text_lower.startswith('【') or
|
||||
text_lower.startswith('[翻譯') or
|
||||
'翻譯:' in text_lower or
|
||||
'translation:' in text_lower or
|
||||
text_lower.startswith('translated:') or
|
||||
"\u200b" in text
|
||||
)
|
||||
|
||||
def _is_our_insert_block(p: Paragraph) -> bool:
|
||||
"""Check if paragraph is our inserted translation (contains zero-width space marker)."""
|
||||
text = _p_text_with_breaks(p)
|
||||
@@ -348,7 +379,11 @@ def _collect_docx_segments(doc: docx.Document) -> List[Segment]:
|
||||
for r_idx, row in enumerate(table.rows, 1):
|
||||
for c_idx, cell in enumerate(row.cells, 1):
|
||||
cell_ctx = f"{ctx} > Tbl(r{r_idx},c{c_idx})"
|
||||
_process_container_content(cell, cell_ctx)
|
||||
|
||||
# 使用儲存格為單位的提取方式(而非逐段落提取)
|
||||
cell_text = _get_cell_full_text(cell)
|
||||
if cell_text.strip() and not _is_our_insert_block_text(cell_text):
|
||||
segs.append(Segment("table_cell", cell, cell_ctx, cell_text))
|
||||
|
||||
elif qname.endswith('}sdt'): # Structured Document Tag (SDT)
|
||||
sdt_ctx = f"{ctx} > SDT"
|
||||
|
@@ -307,9 +307,15 @@ class ExcelParser(DocumentParser):
|
||||
return None
|
||||
|
||||
def _should_translate(self, text: str, src_lang: str) -> bool:
|
||||
"""判斷文字是否需要翻譯(移植自參考檔案)"""
|
||||
"""判斷文字是否需要翻譯(修正中文長度判斷)"""
|
||||
text = text.strip()
|
||||
if len(text) < 3:
|
||||
|
||||
# 檢查是否包含中日韓文字
|
||||
has_cjk = self._has_cjk(text)
|
||||
|
||||
# 對於包含CJK字符的文字,放寬長度限制為2個字符
|
||||
min_length = 2 if has_cjk else 3
|
||||
if len(text) < min_length:
|
||||
return False
|
||||
|
||||
# Skip pure numbers, dates, etc.
|
||||
@@ -319,7 +325,7 @@ class ExcelParser(DocumentParser):
|
||||
|
||||
# For auto-detect, translate if has CJK or meaningful text
|
||||
if src_lang.lower() in ('auto', 'auto-detect'):
|
||||
return self._has_cjk(text) or len(text) > 5
|
||||
return has_cjk or len(text) > 5
|
||||
|
||||
return True
|
||||
|
||||
@@ -337,11 +343,13 @@ class ExcelParser(DocumentParser):
|
||||
|
||||
def generate_translated_document(self, translations: Dict[str, List[str]],
|
||||
target_language: str, output_dir: Path) -> str:
|
||||
"""生成翻譯後的 Excel 文件(移植自參考檔案邏輯)"""
|
||||
"""生成翻譯後的 Excel 文件(使用翻譯快取確保正確映射)"""
|
||||
try:
|
||||
import openpyxl
|
||||
from openpyxl.styles import Alignment
|
||||
from openpyxl.comments import Comment
|
||||
from sqlalchemy import text as sql_text
|
||||
from app import db
|
||||
|
||||
# 載入原始工作簿
|
||||
wb = openpyxl.load_workbook(str(self.file_path), data_only=False)
|
||||
@@ -350,25 +358,70 @@ class ExcelParser(DocumentParser):
|
||||
except Exception:
|
||||
wb_vals = None
|
||||
|
||||
# 建立翻譯對應表
|
||||
translated_texts = translations.get(target_language, [])
|
||||
# 建立翻譯映射 - 改用翻譯快取查詢,確保正確對應
|
||||
original_segments = self.extract_text_segments()
|
||||
|
||||
# 建立翻譯映射(按照參考檔案的格式)
|
||||
tmap = {}
|
||||
for i, original_text in enumerate(original_segments):
|
||||
if i < len(translated_texts):
|
||||
tmap[original_text] = translated_texts[i]
|
||||
|
||||
# 處理每個工作表(完全按照參考檔案邏輯)
|
||||
logger.info(f"Building translation map for {len(original_segments)} segments in language {target_language}")
|
||||
|
||||
for original_text in original_segments:
|
||||
# 從翻譯快取中查詢每個原文的翻譯
|
||||
# 使用聯合查詢,優先使用最早的翻譯記錄(原始DIFY翻譯)
|
||||
normalized_text = original_text.replace('\n', ' ').replace('\r', ' ').strip()
|
||||
result = db.session.execute(sql_text("""
|
||||
SELECT translated_text, created_at, 'exact' as match_type
|
||||
FROM dt_translation_cache
|
||||
WHERE source_text = :exact_text AND target_language = :lang
|
||||
|
||||
UNION ALL
|
||||
|
||||
SELECT translated_text, created_at, 'normalized' as match_type
|
||||
FROM dt_translation_cache
|
||||
WHERE REPLACE(REPLACE(TRIM(source_text), '\n', ' '), '\r', ' ') = :norm_text
|
||||
AND target_language = :lang
|
||||
AND source_text != :exact_text
|
||||
|
||||
ORDER BY created_at ASC
|
||||
LIMIT 1
|
||||
"""), {'exact_text': original_text, 'norm_text': normalized_text, 'lang': target_language})
|
||||
|
||||
row = result.fetchone()
|
||||
if row and row[0]:
|
||||
tmap[original_text] = row[0]
|
||||
logger.debug(f"Cache hit for Excel: {original_text[:30]}... -> {row[0][:30]}...")
|
||||
else:
|
||||
logger.warning(f"No translation found in cache for: {original_text[:50]}...")
|
||||
|
||||
logger.info(f"Translation map built with {len(tmap)} mappings from cache")
|
||||
|
||||
# 處理每個工作表(加入詳細調試日誌)
|
||||
translation_count = 0
|
||||
skip_count = 0
|
||||
|
||||
for ws in wb.worksheets:
|
||||
logger.info(f"Processing worksheet: {ws.title}")
|
||||
ws_vals = wb_vals[ws.title] if wb_vals and ws.title in wb_vals.sheetnames else None
|
||||
max_row, max_col = ws.max_row, ws.max_column
|
||||
|
||||
for r in range(1, max_row + 1):
|
||||
for c in range(1, max_col + 1):
|
||||
cell_name = f"{openpyxl.utils.get_column_letter(c)}{r}"
|
||||
src_text = self._get_display_text_for_translation(ws, ws_vals, r, c)
|
||||
if not src_text or src_text not in tmap:
|
||||
|
||||
if not src_text:
|
||||
continue
|
||||
|
||||
# 檢查是否需要翻譯
|
||||
should_translate = self._should_translate(src_text, 'auto')
|
||||
if not should_translate:
|
||||
logger.debug(f"Skip {cell_name}: '{src_text[:30]}...' (should not translate)")
|
||||
skip_count += 1
|
||||
continue
|
||||
|
||||
# 檢查翻譯映射
|
||||
if src_text not in tmap:
|
||||
logger.warning(f"No translation mapping for {cell_name}: '{src_text[:30]}...'")
|
||||
skip_count += 1
|
||||
continue
|
||||
|
||||
val = ws.cell(row=r, column=c).value
|
||||
@@ -383,6 +436,8 @@ class ExcelParser(DocumentParser):
|
||||
exist = cell.comment
|
||||
if not exist or exist.text.strip() != txt_comment:
|
||||
cell.comment = Comment(txt_comment, "translator")
|
||||
logger.debug(f"Added comment to {cell_name}: {translated_text[:30]}...")
|
||||
translation_count += 1
|
||||
else:
|
||||
# 一般儲存格:使用交錯格式(原文+翻譯)
|
||||
combined = f"{src_text}\n{translated_text}"
|
||||
@@ -390,9 +445,12 @@ class ExcelParser(DocumentParser):
|
||||
# 檢查是否已經是預期的格式
|
||||
current_text = str(cell.value) if cell.value else ""
|
||||
if current_text.strip() == combined.strip():
|
||||
logger.debug(f"Skip {cell_name}: already translated")
|
||||
continue
|
||||
|
||||
cell.value = combined
|
||||
logger.info(f"Translated {cell_name}: '{src_text[:20]}...' -> '{translated_text[:20]}...'")
|
||||
translation_count += 1
|
||||
|
||||
# 設定自動換行(移植自參考檔案)
|
||||
try:
|
||||
@@ -412,6 +470,7 @@ class ExcelParser(DocumentParser):
|
||||
output_path = output_dir / output_filename
|
||||
wb.save(str(output_path))
|
||||
|
||||
logger.info(f"Excel translation completed: {translation_count} translations, {skip_count} skips")
|
||||
logger.info(f"Generated translated Excel file: {output_path}")
|
||||
return str(output_path)
|
||||
|
||||
@@ -504,12 +563,90 @@ class TranslationService:
|
||||
"""將文字分割成句子 - 使用增強的分句邏輯"""
|
||||
return self.document_processor.split_text_into_sentences(text, language)
|
||||
|
||||
def translate_excel_cell(self, text: str, source_language: str,
                         target_language: str, user_id: int = None,
                         job_id: int = None) -> str:
    """Translate an Excel cell as one unit, without sentence splitting.

    Args:
        text: Full cell content.
        source_language: Source language passed to the cache and translator.
        target_language: Target language passed to the cache and translator.
        user_id: Forwarded unchanged to ``dify_client.translate_text``.
        job_id: Forwarded unchanged to ``dify_client.translate_text``.

    Returns:
        ``""`` for empty or whitespace-only input; otherwise the cached or
        freshly translated text, or ``【翻譯失敗|<target_language>】<text>``
        when translation raises.
    """
    if not text or not text.strip():
        return ""

    return self._translate_whole_cell(
        text, source_language, target_language, user_id, job_id,
        label="Excel cell",
    )


def translate_word_table_cell(self, text: str, source_language: str,
                              target_language: str, user_id: int = None,
                              job_id: int = None) -> str:
    """Translate a Word table cell as one unit, without paragraph splitting.

    Args:
        text: Full cell content.
        source_language: Source language passed to the cache and translator.
        target_language: Target language passed to the cache and translator.
        user_id: Forwarded unchanged to ``dify_client.translate_text``.
        job_id: Forwarded unchanged to ``dify_client.translate_text``.

    Returns:
        ``""`` for empty or whitespace-only input; otherwise the cached or
        freshly translated text, or ``【翻譯失敗|<target_language>】<text>``
        when translation raises.
    """
    if not text or not text.strip():
        return ""

    return self._translate_whole_cell(
        text, source_language, target_language, user_id, job_id,
        label="Word table cell",
    )


def _translate_whole_cell(self, text: str, source_language: str,
                          target_language: str, user_id: int = None,
                          job_id: int = None, *, label: str) -> str:
    """Shared cache-then-translate flow for whole-cell translation.

    Callers are expected to have rejected blank *text* already. *label* only
    affects log messages ("Excel cell" / "Word table cell").
    """
    # Check the cache using the entire cell content as the key.
    cached_translation = TranslationCache.get_translation(text, source_language, target_language)
    if cached_translation:
        logger.debug(f"{label} cache hit: {text[:30]}...")
        return cached_translation

    # Translate the whole cell content directly, with no splitting.
    try:
        result = self.dify_client.translate_text(
            text=text,
            source_language=source_language,
            target_language=target_language,
            user_id=user_id,
            job_id=job_id
        )

        translated_text = result['translated_text']

        # Store the whole-cell translation in the cache.
        TranslationCache.save_translation(
            text, source_language, target_language, translated_text
        )

        return translated_text

    except Exception as e:
        logger.error(f"Failed to translate {label}: {text[:30]}... Error: {str(e)}")
        # On failure return a marker-prefixed copy of the source text; this
        # value is not written to the cache.
        return f"【翻譯失敗|{target_language}】{text}"
|
||||
|
||||
def translate_segment_with_sentences(self, text: str, source_language: str,
|
||||
target_language: str, user_id: int = None,
|
||||
job_id: int = None) -> str:
|
||||
"""
|
||||
按段落翻譯,模仿成功版本的 translate_block_sentencewise 邏輯
|
||||
對多行文字進行逐行、逐句翻譯,並重新組合成完整段落
|
||||
僅用於Word文檔,Excel請使用 translate_excel_cell
|
||||
"""
|
||||
if not text or not text.strip():
|
||||
return ""
|
||||
@@ -660,14 +797,25 @@ class TranslationService:
|
||||
|
||||
for i, seg in enumerate(translatable_segments):
|
||||
try:
|
||||
# 使用整段文字進行翻譯
|
||||
translated = self.translate_segment_with_sentences(
|
||||
text=seg.text,
|
||||
source_language=job.source_language,
|
||||
target_language=target_language,
|
||||
user_id=job.user_id,
|
||||
job_id=job.id
|
||||
)
|
||||
# 根據段落類型選擇適當的翻譯方法
|
||||
if seg.kind == "table_cell":
|
||||
# 表格儲存格使用整個儲存格為單位的翻譯方法
|
||||
translated = self.translate_word_table_cell(
|
||||
text=seg.text,
|
||||
source_language=job.source_language,
|
||||
target_language=target_language,
|
||||
user_id=job.user_id,
|
||||
job_id=job.id
|
||||
)
|
||||
else:
|
||||
# 一般段落使用原有的句子切片方法
|
||||
translated = self.translate_segment_with_sentences(
|
||||
text=seg.text,
|
||||
source_language=job.source_language,
|
||||
target_language=target_language,
|
||||
user_id=job.user_id,
|
||||
job_id=job.id
|
||||
)
|
||||
|
||||
# 直接以原始段落文字為鍵儲存翻譯結果
|
||||
translation_map[(target_language, seg.text)] = translated
|
||||
@@ -728,9 +876,79 @@ class TranslationService:
|
||||
logger.error(f"Failed to generate translated document for {target_language}: {str(e)}")
|
||||
raise TranslationError(f"生成 {target_language} 翻譯文件失敗: {str(e)}")
|
||||
|
||||
elif file_ext in ['.xlsx', '.xls']:
|
||||
# Excel 文件使用儲存格為單位的翻譯邏輯
|
||||
logger.info(f"Using cell-based processing for Excel files")
|
||||
parser = self.get_document_parser(job.file_path)
|
||||
|
||||
# 提取儲存格文字內容(不進行句子切片)
|
||||
cell_segments = parser.extract_text_segments()
|
||||
|
||||
if not cell_segments:
|
||||
raise TranslationError("Excel 文件中未找到可翻譯的文字")
|
||||
|
||||
logger.info(f"Found {len(cell_segments)} cell segments to translate")
|
||||
|
||||
# 批次翻譯 - 使用儲存格為單位的翻譯方法
|
||||
translation_results = {}
|
||||
total_segments = len(cell_segments)
|
||||
|
||||
for target_language in job.target_languages:
|
||||
logger.info(f"Translating Excel cells to {target_language}")
|
||||
translated_cells = []
|
||||
|
||||
for i, cell_text in enumerate(cell_segments):
|
||||
try:
|
||||
# 使用新的儲存格翻譯方法(整個儲存格作為單位)
|
||||
translated = self.translate_excel_cell(
|
||||
text=cell_text,
|
||||
source_language=job.source_language,
|
||||
target_language=target_language,
|
||||
user_id=job.user_id,
|
||||
job_id=job.id
|
||||
)
|
||||
translated_cells.append(translated)
|
||||
|
||||
# 更新進度
|
||||
progress = (i + 1) / total_segments * 100 / len(job.target_languages)
|
||||
current_lang_index = job.target_languages.index(target_language)
|
||||
total_progress = (current_lang_index * 100 + progress) / len(job.target_languages)
|
||||
job.update_status('PROCESSING', progress=total_progress)
|
||||
|
||||
time.sleep(0.1)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to translate Excel cell: {cell_text[:50]}... Error: {str(e)}")
|
||||
translated_cells.append(f"[翻譯失敗] {cell_text}")
|
||||
|
||||
translation_results[target_language] = translated_cells
|
||||
|
||||
# 生成翻譯文件
|
||||
output_dir = Path(job.file_path).parent
|
||||
output_files = {}
|
||||
|
||||
for target_language, translations in translation_results.items():
|
||||
translation_mapping = {target_language: translations}
|
||||
|
||||
output_file = parser.generate_translated_document(
|
||||
translations=translation_mapping,
|
||||
target_language=target_language,
|
||||
output_dir=output_dir
|
||||
)
|
||||
|
||||
output_files[target_language] = output_file
|
||||
|
||||
file_size = Path(output_file).stat().st_size
|
||||
job.add_translated_file(
|
||||
language_code=target_language,
|
||||
filename=Path(output_file).name,
|
||||
file_path=output_file,
|
||||
file_size=file_size
|
||||
)
|
||||
|
||||
else:
|
||||
# 對於非 DOCX 文件,使用原有邏輯
|
||||
logger.info(f"Using legacy processing for {file_ext} files")
|
||||
# 對於其他文件格式,使用原有邏輯
|
||||
logger.info(f"Using legacy sentence-based processing for {file_ext} files")
|
||||
parser = self.get_document_parser(job.file_path)
|
||||
|
||||
# 提取文字片段
|
||||
|
Reference in New Issue
Block a user