5th_fix excel problem

This commit is contained in:
beabigegg
2025-09-03 15:07:34 +08:00
parent cce3fd4925
commit 5fd0671b4f
28 changed files with 4484 additions and 97 deletions

View File

@@ -130,6 +130,37 @@ def _p_text_with_breaks(p: Paragraph) -> str:
parts.append("\t")
return "".join(parts)
def _get_cell_full_text(cell) -> str:
"""
提取表格儲存格的完整文字內容,包含所有段落
"""
try:
cell_texts = []
for para in cell.paragraphs:
para_text = _p_text_with_breaks(para)
if para_text.strip():
cell_texts.append(para_text.strip())
# 用換行符連接所有段落
return '\n'.join(cell_texts)
except Exception as e:
logger.warning(f"提取儲存格文字失敗: {e}")
return ""
def _is_our_insert_block_text(text: str) -> bool:
"""檢查文字是否為翻譯插入區塊"""
if not text:
return False
text_lower = text.lower().strip()
return (
text_lower.startswith('') or
text_lower.startswith('[翻譯') or
'翻譯:' in text_lower or
'translation:' in text_lower or
text_lower.startswith('translated:') or
"\u200b" in text
)
def _is_our_insert_block(p: Paragraph) -> bool:
"""Check if paragraph is our inserted translation (contains zero-width space marker)."""
text = _p_text_with_breaks(p)
@@ -348,7 +379,11 @@ def _collect_docx_segments(doc: docx.Document) -> List[Segment]:
for r_idx, row in enumerate(table.rows, 1):
for c_idx, cell in enumerate(row.cells, 1):
cell_ctx = f"{ctx} > Tbl(r{r_idx},c{c_idx})"
_process_container_content(cell, cell_ctx)
# 使用儲存格為單位的提取方式(而非逐段落提取)
cell_text = _get_cell_full_text(cell)
if cell_text.strip() and not _is_our_insert_block_text(cell_text):
segs.append(Segment("table_cell", cell, cell_ctx, cell_text))
elif qname.endswith('}sdt'): # Structured Document Tag (SDT)
sdt_ctx = f"{ctx} > SDT"