Files
Document_Translator/app/services/document_processor.py
2025-09-03 19:43:49 +08:00

864 lines
36 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
核心文檔處理邏輯 - 移植自最佳版本
包含完整的 DOCX 文字提取和翻譯插入功能
Author: PANJIT IT Team
Created: 2024-09-02
Modified: 2024-09-02
"""
import re
import sys
import time
from pathlib import Path
from typing import List, Dict, Tuple, Optional, Any
from docx.text.paragraph import Paragraph
from docx.table import Table, _Cell
from docx.shared import Pt
from docx.oxml import OxmlElement
from docx.oxml.ns import qn, nsdecls
import docx
from app.utils.logger import get_logger
from app.utils.exceptions import FileProcessingError
logger = get_logger(__name__)
# ---------- Constants ----------
INSERT_FONT_SIZE_PT = 10  # point size applied to inserted translation runs
SENTENCE_MODE = True      # enable sentence-level splitting via blingfire/pysbd
# ---------- Optional dependencies detection ----------
try:
import blingfire
_HAS_BLINGFIRE = True
except ImportError:
_HAS_BLINGFIRE = False
try:
import pysbd
_HAS_PYSBD = True
except ImportError:
_HAS_PYSBD = False
# ---------- Helper functions ----------
def _has_cjk(text: str) -> bool:
"""Check if text contains CJK (Chinese/Japanese/Korean) characters."""
for char in text:
if '\u4e00' <= char <= '\u9fff' or \
'\u3400' <= char <= '\u4dbf' or \
'\u20000' <= char <= '\u2a6df' or \
'\u3040' <= char <= '\u309f' or \
'\u30a0' <= char <= '\u30ff' or \
'\uac00' <= char <= '\ud7af':
return True
return False
def _normalize_text(text: str) -> str:
"""Normalize text for comparison."""
return re.sub(r'\s+', ' ', text.strip().lower())
def _append_after(p: Paragraph, text_block: str, italic: bool=True, font_size_pt: int=INSERT_FONT_SIZE_PT) -> Paragraph:
    """Insert a new paragraph directly after *p*; return it so inserts can chain."""
    element = OxmlElement("w:p")
    p._p.addnext(element)
    new_para = Paragraph(element, p._parent)

    def _style(run):
        # Apply the shared formatting for inserted translation runs.
        if italic:
            run.italic = True
        if font_size_pt:
            run.font.size = Pt(font_size_pt)

    lines = text_block.split("\n")
    last = len(lines) - 1
    for idx, line in enumerate(lines):
        run = new_para.add_run(line)
        _style(run)
        if idx != last:
            run.add_break()
    # Trailing zero-width space marks this paragraph as inserted by us.
    marker = new_para.add_run("\u200b")
    _style(marker)
    return new_para
def _is_our_insert_block(p: Paragraph) -> bool:
    """Return True iff any run of *p* carries our zero-width-space marker.

    NOTE(review): a later definition of the same name in this module
    shadows this one at import time.
    """
    for run in p.runs:
        if "\u200b" in (run.text or ""):
            return True
    return False
def _find_last_inserted_after(p: Paragraph, limit: int = 8) -> Optional[Paragraph]:
    """Return the last of our inserted paragraphs immediately following *p*.

    Scans at most *limit* paragraphs after *p* and stops at the first
    paragraph that is not one of ours.  Returns None when *p* cannot be
    located or its parent exposes no paragraph list (e.g. some table cells).
    """
    try:
        parent = p._parent
        if not hasattr(parent, 'paragraphs'):
            # Parent (e.g. a table cell wrapper) has no paragraph list.
            return None
        siblings = list(parent.paragraphs)
        anchor = next(
            (i for i, para in enumerate(siblings) if para._element == p._element),
            None,
        )
        if anchor is None:
            return None
        found = None
        stop = min(anchor + 1 + limit, len(siblings))
        for idx in range(anchor + 1, stop):
            candidate = siblings[idx]
            if not _is_our_insert_block(candidate):
                break  # the contiguous run of our inserts ends here
            found = candidate
    except Exception:
        return None
    return found
def _p_text_with_breaks(p: Paragraph) -> str:
    """Flatten *p* to plain text, mapping <w:br> to \\n and <w:tab> to \\t."""
    pieces = []
    selector = ".//*[local-name()='t' or local-name()='br' or local-name()='tab']"
    for node in p._element.xpath(selector):
        local = node.tag.rsplit('}', 1)[-1]
        if local == "br":
            pieces.append("\n")
        elif local == "tab":
            pieces.append("\t")
        else:  # w:t text node
            pieces.append(node.text or "")
    return "".join(pieces)
def _get_cell_full_text(cell) -> str:
    """Return the full text of a table cell: non-blank paragraphs joined by newlines.

    Returns "" (with a warning logged) if extraction fails.
    """
    try:
        lines = []
        for paragraph in cell.paragraphs:
            stripped = _p_text_with_breaks(paragraph).strip()
            if stripped:
                lines.append(stripped)
        return '\n'.join(lines)
    except Exception as e:
        logger.warning(f"提取儲存格文字失敗: {e}")
        return ""
def _is_our_insert_block_text(text: str) -> bool:
"""檢查文字是否為翻譯插入區塊"""
if not text:
return False
text_lower = text.lower().strip()
return (
text_lower.startswith('') or
text_lower.startswith('[翻譯') or
'翻譯:' in text_lower or
'translation:' in text_lower or
text_lower.startswith('translated:') or
"\u200b" in text
)
def _is_our_insert_block(p: Paragraph) -> bool:
    """Detect our inserted translation paragraphs via the zero-width-space marker."""
    return "\u200b" in _p_text_with_breaks(p)
def should_translate(text: str, src_lang: str) -> bool:
    """Decide whether *text* should be sent for translation given *src_lang*."""
    stripped = text.strip()
    # Translate anything non-empty (minimum length 1).
    if not stripped:
        return False
    # Skip strings made only of digits, whitespace, dots, dashes, colons,
    # slashes — pure numbers, dates, ratios, etc.
    if re.match(r'^[\d\s\.\-\:\/]+$', stripped):
        return False
    if src_lang.lower() in ('auto', 'auto-detect'):
        # Auto-detect: needs CJK content or a reasonably long string.
        return _has_cjk(stripped) or len(stripped) > 5
    return True
def _split_sentences(text: str, lang: str = 'auto') -> List[str]:
    """Split *text* into sentences: blingfire first, then pysbd, then a
    simple separator-based fallback.

    BUG FIX: the fallback ``separators`` list contained empty strings (the
    fullwidth CJK punctuation was stripped by an encoding scrub), and
    ``str.split('')`` raises ValueError — so the fallback path always
    crashed.  The fullwidth sentence terminators are restored.
    """
    if not text.strip():
        return []
    # Preferred: blingfire's statistical sentence breaker.
    if _HAS_BLINGFIRE and SENTENCE_MODE:
        try:
            sentences = blingfire.text_to_sentences(text).split('\n')
            sentences = [s.strip() for s in sentences if s.strip()]
            if sentences:
                return sentences
        except Exception as e:
            logger.warning(f"Blingfire failed: {e}")
    # Second choice: pysbd's rule-based segmenter.
    if _HAS_PYSBD and SENTENCE_MODE:
        try:
            seg = pysbd.Segmenter(language="en" if lang == "auto" else lang)
            sentences = seg.segment(text)
            sentences = [s.strip() for s in sentences if s.strip()]
            if sentences:
                return sentences
        except Exception as e:
            logger.warning(f"PySBD failed: {e}")
    # Fallback: iteratively split on ASCII and fullwidth sentence terminators.
    separators = ['. ', '。', '!', '?', '!', '?', '\n']
    sentences = [text]
    for sep in separators:
        new_sentences = []
        for s in sentences:
            parts = s.split(sep)
            if len(parts) > 1:
                # Re-attach the separator (minus trailing space) to each piece.
                new_sentences.extend([p.strip() + sep.rstrip() for p in parts[:-1] if p.strip()])
                if parts[-1].strip():
                    new_sentences.append(parts[-1].strip())
            else:
                new_sentences.append(s)
        sentences = new_sentences
    # Drop tiny fragments (3 characters or fewer).
    return [s for s in sentences if len(s.strip()) > 3]
# ---------- Segment class ----------
class Segment:
    """A translatable text segment extracted from a document."""

    def __init__(self, kind: str, ref: Any, ctx: str, text: str):
        # kind — segment category: 'para' | 'table_cell' | 'txbx'
        self.kind = kind
        # ref — the underlying document element this segment came from
        self.ref = ref
        # ctx — human-readable location context (e.g. "Body > Tbl(r1,c2)")
        self.ctx = ctx
        # text — the extracted source text
        self.text = text
# ---------- TextBox helpers ----------
def _txbx_iter_texts(doc: docx.Document):
    """Yield (txbxContent_element, joined_source_text) for each textbox.

    - Deeply collects all descendant <w:p> under each txbxContent element.
    - Skips paragraphs that carry our zero-width insert marker.
    - NOTE(review): the all-italic flag is computed below but never used in
      the filter, and every non-blank line is kept regardless of CJK content
      — confirm whether the italic/CJK filtering mentioned in the original
      docstring was intended.
    """
    def _p_text_flags(p_el):
        # Returns (text, has_zero_width_marker, all_visible_runs_italic).
        parts = []
        for node in p_el.xpath(".//*[local-name()='t' or local-name()='br' or local-name()='tab']"):
            tag = node.tag.split('}', 1)[-1]
            if tag == "t":
                parts.append(node.text or "")
            elif tag == "br":
                parts.append("\n")
            else:
                parts.append(" ")  # tabs flattened to a single space
        text = "".join(parts)
        has_zero = ("\u200b" in text)
        runs = p_el.xpath(".//*[local-name()='r']")
        vis, ital = [], []
        for r in runs:
            rt = "".join([(t.text or "") for t in r.xpath(".//*[local-name()='t']")])
            if (rt or "").strip():
                vis.append(rt)
                ital.append(bool(r.xpath(".//*[local-name()='i']")))
        all_italic = (len(vis) > 0 and all(ital))
        return text, has_zero, all_italic

    for tx in doc._element.xpath(".//*[local-name()='txbxContent']"):
        kept = []
        for p in tx.xpath(".//*[local-name()='p']"):  # all descendant paragraphs
            text, has_zero, all_italic = _p_text_flags(p)
            if not (text or "").strip():
                continue
            if has_zero:
                continue  # our inserted translation — skip
            for line in text.split("\n"):
                if line.strip():
                    kept.append(line.strip())
        if kept:
            joined = "\n".join(kept)
            yield tx, joined
def _txbx_append_paragraph(tx, text_block: str, italic: bool = True, font_size_pt: int = INSERT_FONT_SIZE_PT):
    """Append one paragraph (single run; \\n becomes <w:br/>) to a txbxContent element."""
    para = OxmlElement("w:p")
    run = OxmlElement("w:r")
    props = OxmlElement("w:rPr")
    if italic:
        props.append(OxmlElement("w:i"))
    if font_size_pt:
        size = OxmlElement("w:sz")
        size.set(qn("w:val"), str(int(font_size_pt * 2)))  # w:sz is in half-points
        props.append(size)
    run.append(props)
    for idx, line in enumerate(text_block.split("\n")):
        if idx:
            run.append(OxmlElement("w:br"))
        text_el = OxmlElement("w:t")
        text_el.set(qn("xml:space"), "preserve")
        text_el.text = line
        run.append(text_el)
    # Trailing zero-width space marks the paragraph as inserted by us.
    marker = OxmlElement("w:t")
    marker.set(qn("xml:space"), "preserve")
    marker.text = "\u200b"
    run.append(marker)
    para.append(run)
    tx.append(para)
def _txbx_tail_equals(tx, translations: List[str]) -> bool:
    """True when the trailing paragraphs of *tx* already equal *translations* (normalized)."""
    paras = tx.xpath("./*[local-name()='p']")
    if len(paras) < len(translations):
        return False
    tail = paras[-len(translations):]
    for para, expected in zip(tail, translations):
        chunks = []
        for node in para.xpath(".//*[local-name()='t' or local-name()='br']"):
            local = node.tag.split("}", 1)[-1]
            chunks.append("\n" if local == "br" else (node.text or ""))
        actual = "".join(chunks).strip()
        if _normalize_text(actual) != _normalize_text(expected):
            return False
    return True
# ---------- Main extraction logic ----------
def _get_paragraph_key(p: Paragraph) -> str:
    """Build a stable key for paragraph de-duplication (XML hash + text fingerprint)."""
    try:
        raw_xml = p._p.xml if hasattr(p._p, 'xml') else str(p._p)
        text = _p_text_with_breaks(p)
        return f"{hash(raw_xml)}_{len(text)}_{text[:50]}"
    except Exception:
        # Degraded key when XML access fails: text-only fingerprint.
        text = _p_text_with_breaks(p)
        return f"fallback_{hash(text)}_{len(text)}"
def _collect_docx_segments(doc: docx.Document) -> List[Segment]:
    """
    Enhanced segment collector with improved stability.
    Handles paragraphs, tables, textboxes, and SDT Content Controls.

    Returns a list of Segment objects in document order (body first, then
    textboxes).
    """
    segs: List[Segment] = []
    seen_par_keys = set()

    def _add_paragraph(p: Paragraph, ctx: str):
        # Deduplicate by stable key; skip blanks and our own inserted blocks.
        try:
            p_key = _get_paragraph_key(p)
            if p_key in seen_par_keys:
                return
            txt = _p_text_with_breaks(p)
            if txt.strip() and not _is_our_insert_block(p):
                segs.append(Segment("para", p, ctx, txt))
                seen_par_keys.add(p_key)
        except Exception as e:
            # Log error but continue processing the rest of the document.
            logger.warning(f"段落處理錯誤: {e}, 跳過此段落")

    def _process_container_content(container, ctx: str):
        """
        Recursively processes content within a container (body, cell, or SDT content).
        Identifies and handles paragraphs, tables, and SDT elements.
        """
        if container._element is None:
            return
        for child_element in container._element:
            qname = child_element.tag
            if qname.endswith('}p'):  # Paragraph
                p = Paragraph(child_element, container)
                _add_paragraph(p, ctx)
            elif qname.endswith('}tbl'):  # Table
                table = Table(child_element, container)
                for r_idx, row in enumerate(table.rows, 1):
                    for c_idx, cell in enumerate(row.cells, 1):
                        cell_ctx = f"{ctx} > Tbl(r{r_idx},c{c_idx})"
                        # Extract whole cells (not individual paragraphs) so a
                        # multi-paragraph cell is translated as one unit.
                        cell_text = _get_cell_full_text(cell)
                        if cell_text.strip() and not _is_our_insert_block_text(cell_text):
                            segs.append(Segment("table_cell", cell, cell_ctx, cell_text))
            elif qname.endswith('}sdt'):  # Structured Document Tag (SDT)
                sdt_ctx = f"{ctx} > SDT"
                # 1. Extract SDT metadata text (placeholder text, dropdown items).
                ns = {'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}
                # Placeholder text shown before the control is filled in.
                placeholder_texts = []
                for t in child_element.xpath('.//w:placeholder//w:t', namespaces=ns):
                    if t.text:
                        placeholder_texts.append(t.text)
                if placeholder_texts:
                    full_placeholder = "".join(placeholder_texts).strip()
                    if full_placeholder:
                        segs.append(Segment("para", child_element, f"{sdt_ctx}-Placeholder", full_placeholder))
                # Dropdown list display items, one per line.
                list_items = []
                for item in child_element.xpath('.//w:dropDownList/w:listItem', namespaces=ns):
                    display_text = item.get(qn('w:displayText'))
                    if display_text:
                        list_items.append(display_text)
                if list_items:
                    items_as_text = "\n".join(list_items)
                    segs.append(Segment("para", child_element, f"{sdt_ctx}-Dropdown", items_as_text))
                # 2. Recurse into the SDT's actual content (sdtContent).
                sdt_content_element = child_element.find(qn('w:sdtContent'))
                if sdt_content_element is not None:
                    class SdtContentWrapper:
                        """Duck-typed container: only ._element/._parent are used above."""
                        def __init__(self, element, parent):
                            self._element = element
                            self._parent = parent
                    sdt_content_wrapper = SdtContentWrapper(sdt_content_element, container)
                    _process_container_content(sdt_content_wrapper, sdt_ctx)

    # --- Main execution starts here ---
    # 1. Process the main document body
    _process_container_content(doc._body, "Body")
    # 2. Process textboxes
    for tx, s in _txbx_iter_texts(doc):
        if s.strip() and (_has_cjk(s) or should_translate(s, 'auto')):
            segs.append(Segment("txbx", tx, "TextBox", s))
    return segs
def _insert_docx_translations(doc: docx.Document, segs: List[Segment],
                              tmap: Dict[Tuple[str, str], str],
                              targets: List[str], log=lambda s: None) -> Tuple[int, int]:
    """
    Insert translations into DOCX document segments.

    CRITICAL: This function contains the fix for the major translation insertion bug.
    The key fix is in the segment filtering logic - we now correctly check if any target
    language has translation available using the proper key format (target_lang, text).

    Args:
        doc: The DOCX document object
        segs: List of segments to translate
        tmap: Translation map with keys as (target_language, source_text)
        targets: List of target languages in order
        log: Logging function

    Returns:
        Tuple of (successful_insertions, skipped_insertions)

    Key Bug Fix:
        OLD (INCORRECT): if (seg.kind, seg.text) not in tmap and (targets[0], seg.text) not in tmap
        NEW (CORRECT): has_any_translation = any((tgt, seg.text) in tmap for tgt in targets)
    """
    ok_cnt = skip_cnt = 0

    # Helper function to add a formatted run to a paragraph
    def _add_formatted_run(p: Paragraph, text: str, italic: bool, font_size_pt: int):
        lines = text.split("\n")
        for i, line in enumerate(lines):
            run = p.add_run(line)
            if italic:
                run.italic = True
            if font_size_pt:
                run.font.size = Pt(font_size_pt)
            if i < len(lines) - 1:
                run.add_break()
        # Add our zero-width space marker so the paragraph is recognized later.
        tag_run = p.add_run("\u200b")
        if italic:
            tag_run.italic = True
        if font_size_pt:
            tag_run.font.size = Pt(font_size_pt)

    for seg in segs:
        # Check if any target language has a translation for this segment.
        has_any_translation = any((tgt, seg.text) in tmap for tgt in targets)
        if not has_any_translation:
            log(f"[SKIP] 無翻譯結果: {seg.ctx} | {seg.text[:50]}...")
            skip_cnt += 1
            continue
        # Gather translations for all targets; fall back to an error marker
        # string for any missing language so target count stays aligned.
        translations = []
        for tgt in targets:
            if (tgt, seg.text) in tmap:
                translations.append(tmap[(tgt, seg.text)])
            else:
                log(f"[WARNING] 缺少 {tgt} 翻譯: {seg.text[:30]}...")
                translations.append(f"【翻譯查詢失敗|{tgt}{seg.text[:50]}...")
        log(f"[INSERT] 準備插入 {len(translations)} 個翻譯到 {seg.ctx}: {seg.text[:30]}...")

        if seg.kind == "para":
            # Check if this is an SDT segment (ref is an XML element, not a Paragraph).
            if hasattr(seg.ref, 'tag') and seg.ref.tag.endswith('}sdt'):
                # Handle SDT segments - insert translation into sdtContent.
                sdt_element = seg.ref
                ns = {'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}
                sdt_content = sdt_element.find(qn('w:sdtContent'))
                if sdt_content is not None:
                    # Collect translations we previously inserted into this SDT.
                    existing_paras = sdt_content.xpath('.//w:p', namespaces=ns)
                    existing_texts = []
                    for ep in existing_paras:
                        p_obj = Paragraph(ep, None)
                        if _is_our_insert_block(p_obj):
                            existing_texts.append(_p_text_with_breaks(p_obj))
                    # Skip when all expected translations already exist in order.
                    if len(existing_texts) >= len(translations):
                        if all(_normalize_text(e) == _normalize_text(t) for e, t in zip(existing_texts[:len(translations)], translations)):
                            skip_cnt += 1
                            log(f"[SKIP] SDT 已存在翻譯: {seg.text[:30]}...")
                            continue
                    # Append only translations not already present.
                    for t in translations:
                        if not any(_normalize_text(t) == _normalize_text(e) for e in existing_texts):
                            # Create new paragraph in SDT content
                            new_p_element = OxmlElement("w:p")
                            sdt_content.append(new_p_element)
                            new_p = Paragraph(new_p_element, None)
                            _add_formatted_run(new_p, t, italic=True, font_size_pt=INSERT_FONT_SIZE_PT)
                    ok_cnt += 1
                    log(f"[SUCCESS] SDT 插入翻譯(交錯格式)")
                continue

            p: Paragraph = seg.ref
            # --- CONTEXT-AWARE INSERTION LOGIC (from successful version) ---
            # Check if the paragraph's parent is a table cell.
            if isinstance(p._parent, _Cell):
                cell = p._parent
                try:
                    # Find the current paragraph's position in the cell.
                    cell_paragraphs = list(cell.paragraphs)
                    p_index = -1
                    for idx, cell_p in enumerate(cell_paragraphs):
                        if cell_p._element == p._element:
                            p_index = idx
                            break
                    if p_index == -1:
                        log(f"[WARNING] 無法找到段落在單元格中的位置,使用原始方法")
                        # Fallback to original method: append at end of cell.
                        for block in translations:
                            new_p = cell.add_paragraph()
                            _add_formatted_run(new_p, block, italic=True, font_size_pt=INSERT_FONT_SIZE_PT)
                        ok_cnt += 1
                        continue
                    # Check if translations already exist right after this paragraph.
                    existing_texts = []
                    check_limit = min(p_index + 1 + len(translations), len(cell_paragraphs))
                    for idx in range(p_index + 1, check_limit):
                        if _is_our_insert_block(cell_paragraphs[idx]):
                            existing_texts.append(_p_text_with_breaks(cell_paragraphs[idx]))
                    # Check if all translations already exist in order.
                    if len(existing_texts) >= len(translations):
                        if all(_normalize_text(e) == _normalize_text(t) for e, t in zip(existing_texts[:len(translations)], translations)):
                            skip_cnt += 1
                            log(f"[SKIP] 表格單元格已存在翻譯: {seg.text[:30]}...")
                            continue
                    # Determine which translations still need to be added.
                    to_add = []
                    for t in translations:
                        if not any(_normalize_text(t) == _normalize_text(e) for e in existing_texts):
                            to_add.append(t)
                    if not to_add:
                        skip_cnt += 1
                        log(f"[SKIP] 表格單元格所有翻譯已存在: {seg.text[:30]}...")
                        continue
                    # Insert new paragraphs right after the current paragraph.
                    insert_after = p
                    for block in to_add:
                        try:
                            # Create new paragraph and insert it after the current position.
                            new_p_element = OxmlElement("w:p")
                            insert_after._element.addnext(new_p_element)
                            new_p = Paragraph(new_p_element, cell)
                            _add_formatted_run(new_p, block, italic=True, font_size_pt=INSERT_FONT_SIZE_PT)
                            insert_after = new_p  # Update position for next insertion
                        except Exception as e:
                            log(f"[ERROR] 表格插入失敗: {e}, 嘗試fallback方法")
                            # Fallback: add at the end of cell.
                            try:
                                new_p = cell.add_paragraph()
                                _add_formatted_run(new_p, block, italic=True, font_size_pt=INSERT_FONT_SIZE_PT)
                                log(f"[SUCCESS] Fallback插入成功")
                            except Exception as e2:
                                log(f"[FATAL] Fallback也失敗: {e2}")
                                continue
                    ok_cnt += 1
                    log(f"[SUCCESS] 表格單元格插入 {len(to_add)} 個翻譯(緊接原文後)")
                except Exception as e:
                    log(f"[ERROR] 表格處理全面失敗: {e}, 跳過此段落")
                    continue
            else:
                # Normal paragraph (not in table cell) - SIMPLIFIED FOR DEBUGGING
                try:
                    # TEMPORARILY DISABLE existing translation check to force insertion.
                    # NOTE(review): this means re-running over an already-translated
                    # document will insert duplicates for body paragraphs.
                    log(f"[DEBUG] 強制插入翻譯到段落: {seg.text[:30]}...")
                    # Force all translations to be added.
                    to_add = translations
                    # Use simple positioning - always insert after current paragraph.
                    anchor = p
                    for block in to_add:
                        try:
                            log(f"[DEBUG] 嘗試插入: {block[:50]}...")
                            anchor = _append_after(anchor, block, italic=True, font_size_pt=INSERT_FONT_SIZE_PT)
                            log(f"[SUCCESS] _append_after成功插入")
                        except Exception as e:
                            log(f"[ERROR] _append_after失敗: {e}, 嘗試簡化插入")
                            try:
                                # Fallback: simple append at end of parent container.
                                if hasattr(p._parent, 'add_paragraph'):
                                    new_p = p._parent.add_paragraph()
                                    _add_formatted_run(new_p, block, italic=True, font_size_pt=INSERT_FONT_SIZE_PT)
                                    log(f"[SUCCESS] Fallback段落插入成功")
                                else:
                                    log(f"[ERROR] 無法進行fallback插入")
                            except Exception as e2:
                                log(f"[FATAL] Fallback也失敗: {e2}")
                                continue
                    ok_cnt += 1
                    log(f"[SUCCESS] 段落強制插入 {len(to_add)} 個翻譯")
                except Exception as e:
                    log(f"[ERROR] 段落處理失敗: {e}, 跳過此段落")
                    continue

        elif seg.kind == "table_cell":
            # Insert translations into a whole table cell.
            cell = seg.ref  # cell is a _Cell object
            # Scan backwards from the end of the cell for translations we
            # inserted previously.
            existing_translations = []
            cell_paragraphs = list(cell.paragraphs)
            translation_start_index = len(cell_paragraphs)
            for i in range(len(cell_paragraphs) - 1, -1, -1):
                if _is_our_insert_block(cell_paragraphs[i]):
                    existing_translations.insert(0, _p_text_with_breaks(cell_paragraphs[i]))
                    translation_start_index = i
                else:
                    break
            # Skip if every expected translation already exists unchanged.
            if len(existing_translations) >= len(translations):
                if all(_normalize_text(e) == _normalize_text(t) for e, t in zip(existing_translations[:len(translations)], translations)):
                    skip_cnt += 1
                    log(f"[SKIP] 表格儲存格已存在翻譯: {seg.text[:30]}...")
                    continue
            # Remove stale translation paragraphs (if any) before re-inserting.
            for i in range(len(cell_paragraphs) - 1, translation_start_index - 1, -1):
                if _is_our_insert_block(cell_paragraphs[i]):
                    cell._element.remove(cell_paragraphs[i]._element)
            # Simple short-text cell: only the original text, no complex structure.
            cell_content = cell.text.strip()
            is_simple_cell = len(cell_content) <= 10 and cell_content == seg.text.strip()
            if is_simple_cell:
                # For simple short text, replace content in place rather than
                # appending extra paragraphs (keeps small cells compact).
                log(f"[INFO] 簡單儲存格內容替換: '{seg.text.strip()}' -> '{translations[0] if translations else 'N/A'}'")
                # Clear every paragraph's content.
                for para in cell.paragraphs:
                    para.clear()
                # Put the original text plus translations into the first paragraph.
                first_para = cell.paragraphs[0] if cell.paragraphs else cell.add_paragraph()
                # Original text first.
                run_orig = first_para.add_run(seg.text.strip())
                # Then one line per translation, italic and small.
                for t in translations:
                    first_para.add_run('\n')
                    run_trans = first_para.add_run(t)
                    run_trans.italic = True
                    if INSERT_FONT_SIZE_PT:
                        run_trans.font.size = Pt(INSERT_FONT_SIZE_PT)
                # Zero-width marker tags this cell as handled.
                tag_run = first_para.add_run("\u200b")
                tag_run.italic = True
                if INSERT_FONT_SIZE_PT:
                    tag_run.font.size = Pt(INSERT_FONT_SIZE_PT)
            else:
                # Complex cell: append translation paragraphs as before.
                for t in translations:
                    new_p = cell.add_paragraph()
                    _add_formatted_run(new_p, t, italic=True, font_size_pt=INSERT_FONT_SIZE_PT)
            ok_cnt += 1
            log(f"[SUCCESS] 表格儲存格插入 {len(translations)} 個翻譯")

        elif seg.kind == "txbx":
            tx = seg.ref
            # Check if textbox already has our translations at the end.
            if _txbx_tail_equals(tx, translations):
                skip_cnt += 1
                log(f"[SKIP] 文字框已存在翻譯: {seg.text[:30]}...")
                continue
            # Append translations to textbox.
            for t in translations:
                _txbx_append_paragraph(tx, t, italic=True, font_size_pt=INSERT_FONT_SIZE_PT)
            ok_cnt += 1
            log(f"[SUCCESS] 文字框插入 {len(translations)} 個翻譯")
    return ok_cnt, skip_cnt
# ---------- Main DocumentProcessor class ----------
class DocumentProcessor:
"""Enhanced document processor with complete DOCX handling capabilities."""
def __init__(self):
self.logger = logger
def extract_docx_segments(self, file_path: str) -> List[Segment]:
"""Extract all translatable segments from DOCX file."""
try:
doc = docx.Document(file_path)
segments = _collect_docx_segments(doc)
self.logger.info(f"Extracted {len(segments)} segments from {file_path}")
for seg in segments[:5]: # Log first 5 segments for debugging
self.logger.debug(f"Segment: {seg.kind} | {seg.ctx} | {seg.text[:50]}...")
return segments
except Exception as e:
self.logger.error(f"Failed to extract DOCX segments from {file_path}: {str(e)}")
raise FileProcessingError(f"DOCX 文件分析失敗: {str(e)}")
def _rematch_segments_to_document(self, doc: docx.Document, old_segments: List[Segment]) -> List[Segment]:
"""Re-match segments from old document instance to new document instance."""
try:
# Extract fresh segments from the current document instance
fresh_segments = _collect_docx_segments(doc)
# Match old segments with fresh segments based on text content
matched_segments = []
for old_seg in old_segments:
# Find matching segment in fresh segments
matched = False
for fresh_seg in fresh_segments:
if (old_seg.kind == fresh_seg.kind and
old_seg.ctx == fresh_seg.ctx and
_normalize_text(old_seg.text) == _normalize_text(fresh_seg.text)):
matched_segments.append(fresh_seg)
matched = True
break
if not matched:
self.logger.warning(f"Failed to match segment: {old_seg.text[:50]}...")
# Still add the old segment but it might not work for insertion
matched_segments.append(old_seg)
self.logger.debug(f"Re-matched {len(matched_segments)} segments to current document")
return matched_segments
except Exception as e:
self.logger.error(f"Failed to re-match segments: {str(e)}")
# Return original segments as fallback
return old_segments
def insert_docx_translations(self, file_path: str, segments: List[Segment],
translation_map: Dict[Tuple[str, str], str],
target_languages: List[str], output_path: str) -> Tuple[int, int]:
"""Insert translations into DOCX file and save to output path."""
try:
doc = docx.Document(file_path)
# CRITICAL FIX: Re-match segments with the current document instance
# The original segments were extracted from a different document instance
matched_segments = self._rematch_segments_to_document(doc, segments)
def log_func(msg: str):
self.logger.debug(msg)
ok_count, skip_count = _insert_docx_translations(
doc, matched_segments, translation_map, target_languages, log_func
)
# Save the modified document
doc.save(output_path)
self.logger.info(f"Inserted {ok_count} translations, skipped {skip_count}. Saved to: {output_path}")
return ok_count, skip_count
except Exception as e:
self.logger.error(f"Failed to insert DOCX translations: {str(e)}")
raise FileProcessingError(f"DOCX 翻譯插入失敗: {str(e)}")
def split_text_into_sentences(self, text: str, language: str = 'auto') -> List[str]:
"""Split text into sentences using the best available method."""
return _split_sentences(text, language)
def should_translate_text(self, text: str, source_language: str) -> bool:
"""Determine if text should be translated."""
return should_translate(text, source_language)
def insert_docx_combined_translations(self, file_path: str, segments: List[Segment],
translation_map: Dict[Tuple[str, str], str],
target_languages: List[str], output_path: str) -> Tuple[int, int]:
"""Insert all translations into a single DOCX file with combined multi-language output.
This creates a combined file where each original text is followed by all translations
in the format: original\n英文\n越南文 etc.
"""
try:
doc = docx.Document(file_path)
# Re-match segments with the current document instance
matched_segments = self._rematch_segments_to_document(doc, segments)
def log_func(msg: str):
self.logger.debug(msg)
# Use the existing _insert_docx_translations function which already supports
# multiple target languages in a single document
ok_count, skip_count = _insert_docx_translations(
doc, matched_segments, translation_map, target_languages, log_func
)
# Save the combined document
doc.save(output_path)
self.logger.info(f"Generated combined multi-language file: {output_path}")
self.logger.info(f"Inserted {ok_count} translations, skipped {skip_count}")
return ok_count, skip_count
except Exception as e:
self.logger.error(f"Failed to create combined DOCX translations: {str(e)}")
raise FileProcessingError(f"組合多語言 DOCX 檔案生成失敗: {str(e)}")