3rd_fix download

2025-09-02 16:47:16 +08:00
parent b11a8272c4
commit e6e5332705
24 changed files with 1671 additions and 167 deletions
--- a/app/services/translation_service.py
+++ b/app/services/translation_service.py
@@ -116,6 +116,294 @@ class DocxParser(DocumentParser):
            raise FileProcessingError(f"生成翻譯 DOCX 失敗: {str(e)}")


+class DocParser(DocumentParser):
+    """Parser for legacy DOC files.
+
+    DOC files are not parsed directly. They are first converted to a temporary
+    DOCX file through Word COM automation (pywin32) and then processed by
+    ``DocxParser``.
+    """
+
+    # Numeric file-format code passed to Word's SaveAs2 to produce DOCX output.
+    _DOCX_FILE_FORMAT = 16
+
+    def extract_text_segments(self) -> List[str]:
+        """Extract text segments from the DOC file via a temporary DOCX copy.
+
+        Returns:
+            The segments produced by ``DocxParser.extract_text_segments`` on
+            the converted file.
+
+        Raises:
+            FileProcessingError: If pywin32 is unavailable, the conversion
+                fails, or the DOCX parser fails.
+        """
+        try:
+            # Probe for pywin32 first so a missing installation produces an
+            # actionable message instead of a generic conversion failure.
+            try:
+                import win32com.client  # noqa: F401
+                import pythoncom  # noqa: F401
+            except ImportError:
+                raise FileProcessingError("DOC 格式需要 Word COM 支援，請先手動轉換為 DOCX 格式或安裝 Microsoft Office") from None
+
+            temp_docx = self._convert_to_temp_docx()
+            try:
+                segments = DocxParser(temp_docx).extract_text_segments()
+            finally:
+                self._remove_temp_file(temp_docx)
+
+            logger.info(f"Converted DOC to DOCX and extracted {len(segments)} segments")
+            return segments
+
+        except Exception as e:
+            logger.error(f"Failed to extract text from DOC file: {str(e)}")
+            raise FileProcessingError(f"DOC 文件解析失敗: {str(e)}") from e
+
+    def _convert_to_temp_docx(self) -> str:
+        """Convert ``self.file_path`` to a temporary DOCX file and return its path.
+
+        The caller owns the returned file and must delete it. If the
+        conversion fails, the temporary file is removed before re-raising.
+        """
+        import tempfile
+
+        # delete=False: only the unique name is needed; the handle is closed
+        # right away so that Word can write to the path.
+        with tempfile.NamedTemporaryFile(suffix='.docx', delete=False) as tmp:
+            temp_docx = tmp.name
+        try:
+            self._word_convert(str(self.file_path), temp_docx, self._DOCX_FILE_FORMAT)
+        except Exception:
+            self._remove_temp_file(temp_docx)
+            raise
+        return temp_docx
+
+    @staticmethod
+    def _remove_temp_file(path: Optional[str]) -> None:
+        """Best-effort removal of a temporary file; failures are logged, never raised."""
+        import os
+
+        if not path or not os.path.exists(path):
+            return
+        try:
+            os.remove(path)
+        except OSError as e:
+            logger.warning(f"Failed to remove temporary file {path}: {str(e)}")
+
+    def _word_convert(self, input_path: str, output_path: str, target_format: int):
+        """Convert a document with Word COM automation.
+
+        Args:
+            input_path: Path of the document to open in Word.
+            output_path: Path the converted document is saved to.
+            target_format: Numeric ``FileFormat`` value passed to ``SaveAs2``.
+
+        Raises:
+            FileProcessingError: If pywin32 cannot be imported or any COM call fails.
+        """
+        try:
+            # Imported locally: this method must not rely on imports made
+            # inside sibling methods.
+            import os
+            import win32com.client as win32
+            import pythoncom
+
+            pythoncom.CoInitialize()
+            word = None
+            doc = None
+            try:
+                word = win32.Dispatch("Word.Application")
+                word.Visible = False
+                doc = word.Documents.Open(os.path.abspath(input_path))
+                doc.SaveAs2(os.path.abspath(output_path), FileFormat=target_format)
+            finally:
+                # Release every COM resource that was actually acquired, in
+                # reverse order, even when an earlier step failed.
+                try:
+                    if doc is not None:
+                        doc.Close(False)
+                finally:
+                    try:
+                        if word is not None:
+                            word.Quit()
+                    finally:
+                        pythoncom.CoUninitialize()
+        except Exception as e:
+            raise FileProcessingError(f"Word COM 轉換失敗: {str(e)}") from e
+
+    def generate_translated_document(self, translations: Dict[str, List[str]],
+                                   target_language: str, output_dir: Path) -> str:
+        """Generate the translated document for a DOC source.
+
+        The source is converted to DOCX and processed by ``DocxParser``; the
+        final output is a DOCX file named after the original DOC file.
+
+        Args:
+            translations: Mapping of language to translated segments, passed
+                through to ``DocxParser.generate_translated_document``.
+            target_language: Language used for the translation and in the
+                output file name.
+            output_dir: Directory that receives the output file.
+
+        Returns:
+            Path of the generated DOCX file as a string.
+
+        Raises:
+            FileProcessingError: If conversion or document generation fails.
+        """
+        try:
+            import os
+
+            temp_docx = self._convert_to_temp_docx()
+            try:
+                result_path = DocxParser(temp_docx).generate_translated_document(
+                    translations, target_language, output_dir)
+            finally:
+                self._remove_temp_file(temp_docx)
+
+            # The output is DOCX because writing DOC directly is impractical.
+            output_filename = f"{self.file_path.stem}_{target_language}_translated.docx"
+            output_path = output_dir / output_filename
+
+            # NOTE(review): DocxParser is defined outside this block and
+            # presumably names its output after the temporary file it was
+            # given; rename so the result carries the original DOC name.
+            produced = os.path.normcase(os.path.abspath(str(result_path)))
+            wanted = os.path.normcase(os.path.abspath(str(output_path)))
+            if produced != wanted and os.path.isfile(str(result_path)):
+                os.replace(str(result_path), str(output_path))
+                result_path = str(output_path)
+
+            logger.info(f"Generated translated DOC file (as DOCX): {result_path}")
+            return result_path
+
+        except Exception as e:
+            logger.error(f"Failed to generate translated DOC file: {str(e)}")
+            raise FileProcessingError(f"DOC 翻譯檔生成失敗: {str(e)}") from e
+
+
+class ExcelParser(DocumentParser):
+    """Parser for Excel workbooks (XLSX; XLS input is rejected with a hint to convert)."""
+
+    # Inclusive code-point ranges treated as CJK text. The Extension B range
+    # lies outside the BMP and therefore needs 8-digit \U escapes; a 4-digit
+    # \u escape followed by a fifth hex digit is a two-character string.
+    _CJK_RANGES = (
+        ('\u4e00', '\u9fff'),            # CJK Unified Ideographs
+        ('\u3400', '\u4dbf'),            # CJK Unified Ideographs Extension A
+        ('\U00020000', '\U0002a6df'),    # CJK Unified Ideographs Extension B
+        ('\u3040', '\u309f'),            # Hiragana
+        ('\u30a0', '\u30ff'),            # Katakana
+        ('\uac00', '\ud7af'),            # Hangul Syllables block
+    )
+
+    def extract_text_segments(self) -> List[str]:
+        """Extract the unique translatable text segments of the workbook.
+
+        Returns:
+            Cell texts that pass ``_should_translate``, de-duplicated while
+            keeping first-seen order.
+
+        Raises:
+            FileProcessingError: If the workbook cannot be loaded or read.
+        """
+        try:
+            wb, wb_vals = self._load_workbooks()
+            unique_segments = self._collect_segments(wb, wb_vals)
+
+            logger.info(f"Extracted {len(unique_segments)} unique text segments from Excel file")
+            return unique_segments
+
+        except Exception as e:
+            logger.error(f"Failed to extract text from Excel file: {str(e)}")
+            raise FileProcessingError(f"Excel 文件解析失敗: {str(e)}") from e
+
+    def _load_workbooks(self):
+        """Load the workbook twice: once with formulas, once with cached values.
+
+        Returns:
+            ``(wb, wb_vals)`` where ``wb`` keeps formulas (``data_only=False``)
+            and ``wb_vals`` holds cached values (``data_only=True``), or
+            ``None`` when that second load fails.
+
+        Raises:
+            FileProcessingError: For ``.xls`` files that openpyxl rejects.
+            Exception: Any other failure while loading the formula workbook.
+        """
+        import openpyxl
+        from openpyxl.utils.exceptions import InvalidFileException
+
+        path = str(self.file_path)
+        try:
+            wb = openpyxl.load_workbook(path, data_only=False)
+        except InvalidFileException:
+            if self.file_path.suffix.lower() == '.xls':
+                raise FileProcessingError("XLS 格式需要先轉換為 XLSX 格式")
+            raise
+
+        # The values workbook is optional: it is only used to read the
+        # displayed result of formula cells.
+        try:
+            wb_vals = openpyxl.load_workbook(path, data_only=True)
+        except Exception:
+            wb_vals = None
+        return wb, wb_vals
+
+    def _iter_cell_texts(self, wb, wb_vals):
+        """Yield ``(ws, row, col, text)`` for every cell that has display text."""
+        for ws in wb.worksheets:
+            ws_vals = wb_vals[ws.title] if wb_vals and ws.title in wb_vals.sheetnames else None
+            # Capture the sheet bounds once, before any cell is touched.
+            max_row, max_col = ws.max_row, ws.max_column
+
+            for r in range(1, max_row + 1):
+                for c in range(1, max_col + 1):
+                    src_text = self._get_display_text_for_translation(ws, ws_vals, r, c)
+                    if src_text:
+                        yield ws, r, c, src_text
+
+    def _collect_segments(self, wb, wb_vals) -> List[str]:
+        """Return translatable cell texts, de-duplicated in first-seen order."""
+        segs = [
+            text
+            for _, _, _, text in self._iter_cell_texts(wb, wb_vals)
+            if self._should_translate(text, 'auto')
+        ]
+        # dict preserves insertion order, so this de-duplicates stably.
+        return list(dict.fromkeys(segs))
+
+    def _get_display_text_for_translation(self, ws, ws_vals, r: int, c: int) -> Optional[str]:
+        """Return the text of cell (r, c) that should be offered for translation.
+
+        Args:
+            ws: Worksheet loaded with formulas preserved.
+            ws_vals: Matching worksheet loaded with cached values, or ``None``.
+            r: 1-based row index.
+            c: 1-based column index.
+
+        Returns:
+            For formula cells, the cached string result if one is available;
+            for other cells, the non-blank string value (falling back to the
+            cached value); ``None`` when there is no non-blank string.
+        """
+        val = ws.cell(row=r, column=c).value
+        if isinstance(val, str) and val.startswith("="):
+            # Formula cell: only the cached result is meaningful text.
+            if ws_vals is not None:
+                shown = ws_vals.cell(row=r, column=c).value
+                return shown if isinstance(shown, str) and shown.strip() else None
+            return None
+        if isinstance(val, str) and val.strip():
+            return val
+        if ws_vals is not None:
+            shown = ws_vals.cell(row=r, column=c).value
+            if isinstance(shown, str) and shown.strip():
+                return shown
+        return None
+
+    def _should_translate(self, text: str, src_lang: str) -> bool:
+        """Decide whether a cell text is worth translating.
+
+        Args:
+            text: Candidate text; surrounding whitespace is ignored.
+            src_lang: Source language; ``'auto'``/``'auto-detect'`` enables
+                the CJK-or-length heuristic.
+
+        Returns:
+            ``False`` for texts shorter than 3 characters or made only of
+            digits, whitespace and ``. - : /``; otherwise ``True``, except in
+            auto mode where the text must contain CJK or exceed 5 characters.
+        """
+        text = text.strip()
+        if len(text) < 3:
+            return False
+
+        # Skip pure numbers, dates, etc.
+        import re
+        if re.match(r'^[\d\s\.\-\:\/]+$', text):
+            return False
+
+        # For auto-detect, translate if has CJK or meaningful text
+        if src_lang.lower() in ('auto', 'auto-detect'):
+            return self._has_cjk(text) or len(text) > 5
+
+        return True
+
+    def _has_cjk(self, text: str) -> bool:
+        """Return True if ``text`` contains a character in one of ``_CJK_RANGES``."""
+        return any(
+            low <= char <= high
+            for char in text
+            for low, high in self._CJK_RANGES
+        )
+
+    def generate_translated_document(self, translations: Dict[str, List[str]],
+                                   target_language: str, output_dir: Path) -> str:
+        """Write a copy of the workbook with translations added.
+
+        Plain text cells become ``"<original>\\n<translation>"`` with wrap-text
+        enabled; formula cells keep their formula and receive the translation
+        as a cell comment.
+
+        Args:
+            translations: Mapping of language to translated segments, aligned
+                by position with the segments from ``extract_text_segments``.
+            target_language: Key into ``translations`` and part of the output
+                file name.
+            output_dir: Directory that receives the output file.
+
+        Returns:
+            Path of the saved ``.xlsx`` file as a string.
+
+        Raises:
+            FileProcessingError: If loading, updating or saving fails.
+        """
+        try:
+            from openpyxl.styles import Alignment
+            from openpyxl.comments import Comment
+
+            wb, wb_vals = self._load_workbooks()
+
+            # Pair each original segment with its translation by position;
+            # zip stops at the shorter list, so untranslated segments are skipped.
+            translated_texts = translations.get(target_language, [])
+            original_segments = self._collect_segments(wb, wb_vals)
+            tmap = dict(zip(original_segments, translated_texts))
+
+            for ws, r, c, src_text in self._iter_cell_texts(wb, wb_vals):
+                if src_text not in tmap:
+                    continue
+
+                cell = ws.cell(row=r, column=c)
+                translated_text = tmap[src_text]
+                val = cell.value
+                is_formula = isinstance(val, str) and val.startswith("=")
+
+                if is_formula:
+                    # Formula cell: keep the formula, attach the translation as a comment.
+                    txt_comment = f"翻譯: {translated_text}"
+                    exist = cell.comment
+                    if not exist or exist.text.strip() != txt_comment:
+                        cell.comment = Comment(txt_comment, "translator")
+                    continue
+
+                # Plain cell: original text followed by the translation.
+                combined = f"{src_text}\n{translated_text}"
+
+                # Skip cells that already hold the expected content.
+                current_text = str(cell.value) if cell.value else ""
+                if current_text.strip() == combined.strip():
+                    continue
+
+                cell.value = combined
+
+                # Enable wrapping so both lines are visible, keeping the
+                # existing horizontal/vertical alignment where possible.
+                try:
+                    if cell.alignment:
+                        cell.alignment = Alignment(
+                            horizontal=cell.alignment.horizontal,
+                            vertical=cell.alignment.vertical,
+                            wrap_text=True
+                        )
+                    else:
+                        cell.alignment = Alignment(wrap_text=True)
+                except Exception:
+                    cell.alignment = Alignment(wrap_text=True)
+
+            # Save under a name derived from the source file and target language.
+            output_filename = f"{self.file_path.stem}_{target_language}_translated.xlsx"
+            output_path = output_dir / output_filename
+            wb.save(str(output_path))
+
+            logger.info(f"Generated translated Excel file: {output_path}")
+            return str(output_path)
+
+        except Exception as e:
+            logger.error(f"Failed to generate translated Excel file: {str(e)}")
+            raise FileProcessingError(f"Excel 翻譯檔生成失敗: {str(e)}") from e
+
+
 class PdfParser(DocumentParser):
    """PDF 文件解析器（只讀）"""
    
@@ -179,7 +467,9 @@ class TranslationService:
        # 文件解析器映射
        self.parsers = {
            '.docx': DocxParser,
-            '.doc': DocxParser,  # 假設可以用 docx 處理
+            '.doc': DocParser,  # 需要先轉換為 DOCX
+            '.xlsx': ExcelParser,
+            '.xls': ExcelParser,  # NOTE: ExcelParser does not convert XLS; it rejects .xls with an error asking for XLSX
            '.pdf': PdfParser,
            # 其他格式可以稍後添加
        }