5th_fix excel problem

2025-09-03 15:07:34 +08:00
parent cce3fd4925
commit 5fd0671b4f
28 changed files with 4484 additions and 97 deletions
--- a/app/services/document_processor.py
+++ b/app/services/document_processor.py
@@ -130,6 +130,37 @@ def _p_text_with_breaks(p: Paragraph) -> str:
            parts.append("\t")
    return "".join(parts)

+def _get_cell_full_text(cell) -> str:
+    """
+    Return the non-blank paragraph texts of a table cell, each stripped and joined by newlines.
+    """
+    try:
+        cell_texts = []
+        for para in cell.paragraphs:
+            para_text = _p_text_with_breaks(para)
+            if para_text.strip():  # skip paragraphs that are empty or whitespace-only
+                cell_texts.append(para_text.strip())
+        
+        # Join all kept paragraphs with newline characters
+        return '\n'.join(cell_texts)
+    except Exception as e:  # best effort: log a warning and fall back to an empty string
+        logger.warning(f"提取儲存格文字失敗: {e}")
+        return ""
+
+def _is_our_insert_block_text(text: str) -> bool:
+    """Check whether the text is a translation insert block (known marker prefix/substring or zero-width space)."""
+    if not text:
+        return False
+    text_lower = text.lower().strip()  # lowercased and stripped copy used for the marker checks
+    return (
+        text_lower.startswith('【') or
+        text_lower.startswith('[翻譯') or
+        '翻譯：' in text_lower or
+        'translation:' in text_lower or
+        text_lower.startswith('translated:') or
+        "\u200b" in text  # zero-width space, tested against the original (unstripped) text
+    )
+
 def _is_our_insert_block(p: Paragraph) -> bool:
    """Check if paragraph is our inserted translation (contains zero-width space marker)."""
    text = _p_text_with_breaks(p)
@@ -348,7 +379,11 @@ def _collect_docx_segments(doc: docx.Document) -> List[Segment]:
                for r_idx, row in enumerate(table.rows, 1):
                    for c_idx, cell in enumerate(row.cells, 1):
                        cell_ctx = f"{ctx} > Tbl(r{r_idx},c{c_idx})"
-                        _process_container_content(cell, cell_ctx)
+                        
+                        # 使用儲存格為單位的提取方式（而非逐段落提取）
+                        cell_text = _get_cell_full_text(cell)
+                        if cell_text.strip() and not _is_our_insert_block_text(cell_text):
+                            segs.append(Segment("table_cell", cell, cell_ctx, cell_text))

            elif qname.endswith('}sdt'):  # Structured Document Tag (SDT)
                sdt_ctx = f"{ctx} > SDT"