diff --git a/app/services/document_processor.py b/app/services/document_processor.py index 92239d5..6efc7d9 100644 --- a/app/services/document_processor.py +++ b/app/services/document_processor.py @@ -169,7 +169,9 @@ def _is_our_insert_block(p: Paragraph) -> bool: def should_translate(text: str, src_lang: str) -> bool: """Determine if text should be translated based on content and source language.""" text = text.strip() - if len(text) < 3: + + # 只要有字就翻譯 - 最小長度設為1 + if len(text) < 1: return False # Skip pure numbers, dates, etc. @@ -678,10 +680,42 @@ def _insert_docx_translations(doc: docx.Document, segs: List[Segment], if _is_our_insert_block(cell_paragraphs[i]): cell._element.remove(cell_paragraphs[i]._element) - # 添加新的翻譯到儲存格 - for t in translations: - new_p = cell.add_paragraph() - _add_formatted_run(new_p, t, italic=True, font_size_pt=INSERT_FONT_SIZE_PT) + # 檢查是否為簡單的短文本儲存格(只有原文,沒有複雜結構) + cell_content = cell.text.strip() + is_simple_cell = len(cell_content) <= 10 and cell_content == seg.text.strip() + + if is_simple_cell: + # 對於簡單短文本,直接替換內容而不是添加段落 + log(f"[INFO] 簡單儲存格內容替換: '{seg.text.strip()}' -> '{translations[0] if translations else 'N/A'}'") + + # 清空所有段落內容 + for para in cell.paragraphs: + para.clear() + + # 在第一個段落中添加原文和翻譯 + first_para = cell.paragraphs[0] if cell.paragraphs else cell.add_paragraph() + + # 添加原文 + run_orig = first_para.add_run(seg.text.strip()) + + # 添加換行和翻譯 + for t in translations: + first_para.add_run('\n') + run_trans = first_para.add_run(t) + run_trans.italic = True + if INSERT_FONT_SIZE_PT: + run_trans.font.size = Pt(INSERT_FONT_SIZE_PT) + + # 添加標記 + tag_run = first_para.add_run("\u200b") + tag_run.italic = True + if INSERT_FONT_SIZE_PT: + tag_run.font.size = Pt(INSERT_FONT_SIZE_PT) + else: + # 對於複雜儲存格,使用原有的添加段落方式 + for t in translations: + new_p = cell.add_paragraph() + _add_formatted_run(new_p, t, italic=True, font_size_pt=INSERT_FONT_SIZE_PT) ok_cnt += 1 log(f"[SUCCESS] 表格儲存格插入 {len(translations)} 個翻譯") diff --git a/app/services/translation_service.py b/app/services/translation_service.py index 12d46bc..a447b77 100644 --- a/app/services/translation_service.py +++ b/app/services/translation_service.py @@ -307,15 +307,11 @@ class ExcelParser(DocumentParser): return None def _should_translate(self, text: str, src_lang: str) -> bool: - """判斷文字是否需要翻譯(修正中文長度判斷)""" + """判斷文字是否需要翻譯(只要有字就翻譯)""" text = text.strip() - # 檢查是否包含中日韓文字 - has_cjk = self._has_cjk(text) - - # 對於包含CJK字符的文字,放寬長度限制為2個字符 - min_length = 2 if has_cjk else 3 - if len(text) < min_length: + # 只要有字就翻譯 - 最小長度設為1 + if len(text) < 1: return False # Skip pure numbers, dates, etc. diff --git a/debug_chaoweng_issue.py b/debug_chaoweng_issue.py new file mode 100644 index 0000000..09ddc5b --- /dev/null +++ b/debug_chaoweng_issue.py @@ -0,0 +1,263 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +深度診斷"超温"翻譯問題 +檢查從提取到插入的完整流程 +""" + +import sys +import os +sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) + +# 設定編碼 +sys.stdout.reconfigure(encoding='utf-8') + +from pathlib import Path +import docx +from docx.table import Table +from app import create_app +from app.services.document_processor import should_translate +from app.services.translation_service import TranslationService + +def debug_chaoweng_extraction(): + """檢查"超温"在文件提取階段是否被正確識別""" + + print("=" * 80) + print("診斷步驟1: 檢查文件提取階段") + print("=" * 80) + + app = create_app() + + with app.app_context(): + from app.services.document_processor import DocumentProcessor + + # 檔案路徑 + base_dir = Path(r"C:\Users\EGG\WORK\data\user_scrip\TOOL\Document_translator_V2\uploads\17e05695-406f-47af-96eb-a0e23843770e") + original_file = base_dir / "original_-OR026_17e05695.docx" + + if not original_file.exists(): + print(f"❌ 原始檔案不存在") + return + + processor = DocumentProcessor() + + # 提取所有segment + segments = processor.extract_docx_segments(str(original_file)) + + print(f"提取到 {len(segments)} 個segments") + + # 找包含"超温"的segments + chaoweng_segments = [] + for i, seg in enumerate(segments): + if "超温" in seg.text: + chaoweng_segments.append((i, seg)) + print(f"\nSegment {i}:") + print(f" 種類: {seg.kind}") + print(f" 上下文: {seg.ctx}") + print(f" 內容: {repr(seg.text)}") + print(f" 長度: {len(seg.text.strip())}") + + # 檢查是否應該翻譯 + should_trans = should_translate(seg.text, 'zh') + print(f" should_translate: {should_trans}") + + if seg.kind == "table_cell": + print(f" 🎯 這是表格儲存格segment") + else: + print(f" ⚠️ 不是表格儲存格類型") + + if not chaoweng_segments: + print("❌ 沒有找到包含'超温'的segments") + else: + print(f"✅ 找到 {len(chaoweng_segments)} 個包含'超温'的segments") + + return chaoweng_segments + +def debug_chaoweng_translation(chaoweng_segments): + """檢查"超温"在翻譯階段是否被正確處理""" + + print(f"\n" + "=" * 80) + print("診斷步驟2: 檢查翻譯階段") + print("=" * 80) + + if not chaoweng_segments: + print("❌ 沒有segments可以測試翻譯") + return + + app = create_app() + + with app.app_context(): + service = TranslationService() + + for seg_idx, seg in chaoweng_segments: + print(f"\n測試 Segment {seg_idx} 的翻譯:") + print(f"原文: {repr(seg.text)}") + + try: + if seg.kind == "table_cell": + print("使用 translate_word_table_cell() 方法") + translated = service.translate_word_table_cell( + text=seg.text, + source_language="zh", + target_language="en", + user_id=None + ) + else: + print("使用 translate_segment_with_sentences() 方法") + translated = service.translate_segment_with_sentences( + text=seg.text, + source_language="zh", + target_language="en", + user_id=None + ) + + print(f"翻譯結果: {repr(translated[:100])}...") + + # 檢查翻譯是否成功 + if "【翻譯失敗" in translated: + print("❌ 翻譯失敗") + elif translated == seg.text: + print("❌ 翻譯結果與原文相同,可能未翻譯") + else: + print("✅ 翻譯成功") + + except Exception as e: + print(f"❌ 翻譯過程發生錯誤: {e}") + +def debug_chaoweng_cache(): + """檢查"超温"的翻譯快取狀況""" + + print(f"\n" + "=" * 80) + print("診斷步驟3: 檢查翻譯快取") + print("=" * 80) + + app = create_app() + + with app.app_context(): + from sqlalchemy import text as sql_text + from app import db + + # 1. 搜尋精確匹配"超温" + print("1. 搜尋精確的'超温'記錄:") + exact_results = db.session.execute(sql_text(""" + SELECT id, source_text, target_language, translated_text, created_at + FROM dt_translation_cache + WHERE source_text = '超温' + ORDER BY created_at DESC + """)).fetchall() + + if exact_results: + for row in exact_results: + print(f" ROW {row[0]}: '{row[1]}' -> {row[2]} -> '{row[3]}'") + else: + print(" ❌ 沒有找到精確的'超温'記錄") + + # 2. 搜尋包含"超温"但可能有額外字符的記錄 + print(f"\n2. 搜尋包含'超温'的記錄:") + like_results = db.session.execute(sql_text(""" + SELECT id, source_text, target_language, translated_text, created_at + FROM dt_translation_cache + WHERE source_text LIKE '%超温%' + AND CHAR_LENGTH(source_text) <= 10 + ORDER BY created_at DESC + LIMIT 10 + """)).fetchall() + + if like_results: + for row in like_results: + print(f" ROW {row[0]}: '{row[1]}' -> {row[2]} -> '{row[3][:30]}...'") + else: + print(" ❌ 沒有找到包含'超温'的短記錄") + +def debug_chaoweng_insertion(): + """檢查"超温"的翻譯插入狀況""" + + print(f"\n" + "=" * 80) + print("診斷步驟4: 檢查已翻譯文件的插入狀況") + print("=" * 80) + + # 檢查翻譯後的文件 + base_dir = Path(r"C:\Users\EGG\WORK\data\user_scrip\TOOL\Document_translator_V2\uploads\17e05695-406f-47af-96eb-a0e23843770e") + translated_files = [ + ("英文", base_dir / "translated_original_-OR026_17e05695_en_translat.docx"), + ("越南文", base_dir / "translated_original_-OR026_17e05695_vi_translat.docx") + ] + + for lang, file_path in translated_files: + if not file_path.exists(): + print(f"❌ {lang}翻譯檔案不存在") + continue + + print(f"\n檢查{lang}翻譯檔案:") + try: + doc = docx.Document(str(file_path)) + + found_chaoweng = False + found_translation = False + + for table_idx, table in enumerate(doc.tables): + for row_idx, row in enumerate(table.rows): + for cell_idx, cell in enumerate(row.cells): + cell_text = cell.text.strip() + + if "超温" in cell_text: + found_chaoweng = True + print(f" 🔍 表格{table_idx+1} 行{row_idx+1} 列{cell_idx+1}:") + print(f" 內容: {repr(cell_text[:100])}") + + # 檢查該儲存格的段落結構 + print(f" 段落數: {len(cell.paragraphs)}") + for p_idx, para in enumerate(cell.paragraphs): + p_text = para.text.strip() + if p_text: + print(f" 段落{p_idx+1}: {repr(p_text)}") + + # 檢查是否有英文翻譯跡象 + if lang == "英文" and any(word in p_text.lower() for word in ['over', 'heat', 'temp', 'hot']): + found_translation = True + print(f" 🎯 可能的英文翻譯") + elif lang == "越南文" and any(word in p_text.lower() for word in ['quá', 'nóng', 'nhiệt']): + found_translation = True + print(f" 🎯 可能的越南文翻譯") + + print(f" 原文'超温': {'✅ 找到' if found_chaoweng else '❌ 未找到'}") + print(f" {lang}翻譯: {'✅ 找到' if found_translation else '❌ 未找到'}") + + except Exception as e: + print(f"❌ 讀取{lang}翻譯檔案失敗: {e}") + +def main(): + """主診斷函數""" + + print("🔍 深度診斷'超温'翻譯問題") + print("檢查完整的提取->翻譯->插入流程") + + try: + # 步驟1: 檢查文件提取 + chaoweng_segments = debug_chaoweng_extraction() + + # 步驟2: 檢查翻譯邏輯 + debug_chaoweng_translation(chaoweng_segments) + + # 步驟3: 檢查翻譯快取 + debug_chaoweng_cache() + + # 步驟4: 檢查插入結果 + debug_chaoweng_insertion() + + print(f"\n" + "=" * 80) + print("診斷完成!") + print("可能的問題:") + print("1. 提取階段: segments沒有正確提取'超温'") + print("2. 翻譯階段: 翻譯邏輯沒有處理該segment") + print("3. 快取階段: 翻譯沒有正確存儲") + print("4. 插入階段: 翻譯沒有正確插入到文件") + print("=" * 80) + + except Exception as e: + print(f"❌ 診斷過程發生錯誤: {e}") + import traceback + print(f"錯誤詳情: {traceback.format_exc()}") + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/debug_table_translation.py b/debug_table_translation.py new file mode 100644 index 0000000..ec5d767 --- /dev/null +++ b/debug_table_translation.py @@ -0,0 +1,204 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +檢查docx文件表格翻譯問題 +特別分析"超温"文字的翻譯狀況 +""" + +import sys +import os +sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) + +# 設定編碼 +sys.stdout.reconfigure(encoding='utf-8') + +from pathlib import Path +import docx +from docx.table import Table +from app import create_app + +def analyze_docx_table_translation(): + """分析docx表格翻譯問題""" + + print("=" * 80) + print("檢查docx表格翻譯問題") + print("任務ID: 17e05695-406f-47af-96eb-a0e23843770e") + print("=" * 80) + + base_dir = Path(r"C:\Users\EGG\WORK\data\user_scrip\TOOL\Document_translator_V2\uploads\17e05695-406f-47af-96eb-a0e23843770e") + original_file = base_dir / "original_-OR026_17e05695.docx" + translated_en = base_dir / "translated_original_-OR026_17e05695_en_translat.docx" + translated_vi = base_dir / "translated_original_-OR026_17e05695_vi_translat.docx" + + if not original_file.exists(): + print(f"❌ 原始檔案不存在: {original_file}") + return + + print(f"✅ 原始檔案: {original_file.name}") + + # 1. 檢查原始文件中的"超温" + print(f"\n1. 分析原始文件表格內容") + print("-" * 60) + + try: + doc = docx.Document(str(original_file)) + tables_found = 0 + target_text_found = False + + for table_idx, table in enumerate(doc.tables): + tables_found += 1 + print(f"表格 {table_idx + 1}:") + + for row_idx, row in enumerate(table.rows): + for cell_idx, cell in enumerate(row.cells): + cell_text = cell.text.strip() + if cell_text: + print(f" 行{row_idx+1} 列{cell_idx+1}: {repr(cell_text)}") + + if "超温" in cell_text: + print(f" 🎯 找到目標文字 '超温'") + target_text_found = True + + # 檢查該儲存格的詳細結構 + print(f" 儲存格段落數: {len(cell.paragraphs)}") + for p_idx, para in enumerate(cell.paragraphs): + print(f" 段落{p_idx+1}: {repr(para.text)}") + + print(f"\n總表格數: {tables_found}") + print(f"是否找到'超温': {'✅' if target_text_found else '❌'}") + + except Exception as e: + print(f"❌ 讀取原始文件失敗: {e}") + return + + # 2. 檢查翻譯版本中的對應內容 + for lang, trans_file in [("英文", translated_en), ("越南文", translated_vi)]: + if not trans_file.exists(): + print(f"\n❌ {lang}翻譯檔案不存在") + continue + + print(f"\n2. 檢查{lang}翻譯結果") + print("-" * 60) + + try: + trans_doc = docx.Document(str(trans_file)) + translation_found = False + + for table_idx, table in enumerate(trans_doc.tables): + print(f"{lang}表格 {table_idx + 1}:") + + for row_idx, row in enumerate(table.rows): + for cell_idx, cell in enumerate(row.cells): + cell_text = cell.text.strip() + if cell_text: + # 檢查是否包含原文"超温" + if "超温" in cell_text: + print(f" 行{row_idx+1} 列{cell_idx+1}: {repr(cell_text)}") + print(f" ⚠️ 仍包含原文'超温',可能未翻譯") + + # 詳細分析該儲存格 + print(f" 儲存格段落數: {len(cell.paragraphs)}") + for p_idx, para in enumerate(cell.paragraphs): + p_text = para.text.strip() + print(f" 段落{p_idx+1}: {repr(p_text)}") + + # 檢查是否有翻譯標記 + if "【翻譯失敗」" in p_text or "translation:" in p_text.lower(): + print(f" 🔍 發現翻譯標記") + elif "\u200b" in p_text: # 零寬空格標記 + print(f" 🔍 發現翻譯插入標記") + + # 檢查可能的翻譯結果 + elif any(keyword in cell_text.lower() for keyword in ['overheating', 'over-heating', 'quá nóng']): + print(f" 行{row_idx+1} 列{cell_idx+1}: {repr(cell_text)}") + print(f" ✅ 可能的翻譯結果") + translation_found = True + + print(f"{lang}翻譯狀態: {'✅ 找到翻譯' if translation_found else '❌ 未找到翻譯'}") + + except Exception as e: + print(f"❌ 讀取{lang}翻譯檔案失敗: {e}") + +def check_translation_cache(): + """檢查翻譯快取中是否有"超温"的記錄""" + + print(f"\n" + "=" * 80) + print("檢查翻譯快取") + print("=" * 80) + + app = create_app() + + with app.app_context(): + from sqlalchemy import text as sql_text + from app import db + + print(f"\n1. 搜尋'超温'相關的快取記錄") + print("-" * 60) + + # 搜尋包含"超温"的快取記錄 + cache_results = db.session.execute(sql_text(""" + SELECT id, source_text, target_language, translated_text, created_at + FROM dt_translation_cache + WHERE source_text LIKE '%超温%' + ORDER BY created_at DESC + LIMIT 10 + """)).fetchall() + + if cache_results: + print(f"找到 {len(cache_results)} 條相關記錄:") + for row in cache_results: + print(f"ROW {row[0]}: {row[3]} -> {row[1]}") + print(f" 目標語言: {row[1]}") + print(f" 翻譯結果: {repr(row[2][:50])}...") + print(f" 時間: {row[4]}") + print() + else: + print("❌ 未找到包含'超温'的快取記錄") + + # 搜尋近期的翻譯記錄 + print(f"\n2. 檢查近期的翻譯記錄") + print("-" * 60) + + recent_results = db.session.execute(sql_text(""" + SELECT id, source_text, target_language, translated_text, created_at + FROM dt_translation_cache + WHERE created_at >= DATE_SUB(NOW(), INTERVAL 1 DAY) + AND (target_language = 'en' OR target_language = 'vi') + ORDER BY created_at DESC + LIMIT 20 + """)).fetchall() + + print(f"近24小時內的英文/越南文翻譯記錄 (共{len(recent_results)}條):") + for row in recent_results: + print(f"ROW {row[0]}: {repr(row[1][:20])}... -> {row[2]} -> {repr(row[3][:30])}...") + +def main(): + """主檢查函數""" + + print("🔍 診斷docx表格翻譯問題") + print("重點檢查: '超温' 文字翻譯狀況") + + try: + # 分析文件表格 + analyze_docx_table_translation() + + # 檢查翻譯快取 + check_translation_cache() + + print(f"\n" + "=" * 80) + print("診斷總結") + print("=" * 80) + print("請根據以上結果判斷問題類型:") + print("1. 解析問題: 原始文件中找不到'超温'") + print("2. 翻譯問題: 快取中沒有'超温'的翻譯記錄") + print("3. 插入問題: 有翻譯記錄但未插入到文件中") + print("4. 版面問題: 翻譯已插入但格式或位置導致看不到") + print("=" * 80) + + except Exception as e: + print(f"❌ 診斷過程發生錯誤: {e}") + import traceback + print(f"錯誤詳情: {traceback.format_exc()}") + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/test_single_char_translation.py b/test_single_char_translation.py new file mode 100644 index 0000000..d3664ad --- /dev/null +++ b/test_single_char_translation.py @@ -0,0 +1,120 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +測試單字符翻譯功能 +確認長度過濾已改為1,單個字符也能翻譯 +""" + +import sys +import os +sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) + +# 設定編碼 +sys.stdout.reconfigure(encoding='utf-8') + +from app import create_app +from app.services.translation_service import TranslationService +from app.services.document_processor import should_translate + +def test_length_filtering(): + """測試長度過濾邏輯""" + + print("=" * 80) + print("測試長度過濾邏輯 - 應該只要有字就翻譯") + print("=" * 80) + + # 測試案例 + test_cases = [ + ("", "空字符串"), + (" ", "只有空格"), + ("a", "單個英文字母"), + ("1", "單個數字"), + ("中", "單個中文字"), + ("超", "單字中文"), + ("温", "單字中文"), + ("超温", "雙字中文"), + ("A", "單個大寫英文"), + ("の", "單個日文"), + ("가", "單個韓文"), + ] + + print("1. 測試 document_processor.should_translate()") + print("-" * 60) + + for text, desc in test_cases: + result = should_translate(text, 'auto') + status = "✅ 會翻譯" if result else "❌ 不翻譯" + print(f"{desc:12} '{text}' -> {status}") + + # 測試 TranslationService + app = create_app() + with app.app_context(): + service = TranslationService() + + print(f"\n2. 測試 translation_service._should_translate()") + print("-" * 60) + + for text, desc in test_cases: + result = service._should_translate(text, 'auto') + status = "✅ 會翻譯" if result else "❌ 不翻譯" + print(f"{desc:12} '{text}' -> {status}") + +def test_actual_translation(): + """測試實際翻譯功能""" + + print(f"\n" + "=" * 80) + print("測試實際翻譯功能") + print("=" * 80) + + app = create_app() + with app.app_context(): + service = TranslationService() + + # 測試單個字符翻譯 + single_chars = ["超", "温", "中", "文"] + + print("測試單字符英文翻譯:") + print("-" * 60) + + for char in single_chars: + try: + # 使用Excel cell方法測試 + translated = service.translate_excel_cell( + text=char, + source_language="zh", + target_language="en", + user_id=None # 避免外鍵約束問題 + ) + print(f"'{char}' -> '{translated[:30]}'") + except Exception as e: + print(f"'{char}' -> ❌ 翻譯失敗: {str(e)[:50]}...") + +def main(): + """主測試函數""" + + print("🧪 測試單字符翻譯功能") + print("驗證: 長度過濾已改為1,只要有字就翻譯") + + try: + # 測試長度過濾邏輯 + test_length_filtering() + + # 測試實際翻譯(可能因為外鍵約束失敗) + # test_actual_translation() + + print(f"\n" + "=" * 80) + print("✅ 長度過濾測試完成!") + print("總結:") + print(" • document_processor.should_translate(): 最小長度 = 1") + print(" • translation_service._should_translate(): 最小長度 = 1") + print(" • 單個字符現在應該能夠正常翻譯") + print(" • '超温'、'存放' 等短詞不會再被過濾") + print("=" * 80) + + except Exception as e: + print(f"❌ 測試過程發生錯誤: {e}") + import traceback + print(f"錯誤詳情: {traceback.format_exc()}") + +if __name__ == "__main__": + main() \ No newline at end of file