5th_fix excel problem
This commit is contained in:
@@ -130,6 +130,37 @@ def _p_text_with_breaks(p: Paragraph) -> str:
|
||||
parts.append("\t")
|
||||
return "".join(parts)
|
||||
|
||||
def _get_cell_full_text(cell) -> str:
|
||||
"""
|
||||
提取表格儲存格的完整文字內容,包含所有段落
|
||||
"""
|
||||
try:
|
||||
cell_texts = []
|
||||
for para in cell.paragraphs:
|
||||
para_text = _p_text_with_breaks(para)
|
||||
if para_text.strip():
|
||||
cell_texts.append(para_text.strip())
|
||||
|
||||
# 用換行符連接所有段落
|
||||
return '\n'.join(cell_texts)
|
||||
except Exception as e:
|
||||
logger.warning(f"提取儲存格文字失敗: {e}")
|
||||
return ""
|
||||
|
||||
def _is_our_insert_block_text(text: str) -> bool:
|
||||
"""檢查文字是否為翻譯插入區塊"""
|
||||
if not text:
|
||||
return False
|
||||
text_lower = text.lower().strip()
|
||||
return (
|
||||
text_lower.startswith('【') or
|
||||
text_lower.startswith('[翻譯') or
|
||||
'翻譯:' in text_lower or
|
||||
'translation:' in text_lower or
|
||||
text_lower.startswith('translated:') or
|
||||
"\u200b" in text
|
||||
)
|
||||
|
||||
def _is_our_insert_block(p: Paragraph) -> bool:
|
||||
"""Check if paragraph is our inserted translation (contains zero-width space marker)."""
|
||||
text = _p_text_with_breaks(p)
|
||||
@@ -348,7 +379,11 @@ def _collect_docx_segments(doc: docx.Document) -> List[Segment]:
|
||||
for r_idx, row in enumerate(table.rows, 1):
|
||||
for c_idx, cell in enumerate(row.cells, 1):
|
||||
cell_ctx = f"{ctx} > Tbl(r{r_idx},c{c_idx})"
|
||||
_process_container_content(cell, cell_ctx)
|
||||
|
||||
# 使用儲存格為單位的提取方式(而非逐段落提取)
|
||||
cell_text = _get_cell_full_text(cell)
|
||||
if cell_text.strip() and not _is_our_insert_block_text(cell_text):
|
||||
segs.append(Segment("table_cell", cell, cell_ctx, cell_text))
|
||||
|
||||
elif qname.endswith('}sdt'): # Structured Document Tag (SDT)
|
||||
sdt_ctx = f"{ctx} > SDT"
|
||||
|
Reference in New Issue
Block a user