chore: backup before code cleanup

Backup commit before executing remove-unused-code proposal.
This includes all pending changes and new features.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
egg
2025-12-11 11:55:39 +08:00
parent eff9b0bcd5
commit 940a406dce
58 changed files with 8226 additions and 175 deletions

View File

@@ -0,0 +1,583 @@
"""
Cell Validation Engine
Validates PP-StructureV3 table detections using metric-based heuristics
to filter over-detected cells and reclassify invalid tables as TEXT elements.
Metrics used:
- Cell density: cells per 10,000 px² (normal: 0.4-1.0, over-detected: 6+)
- Average cell area: px² per cell (normal: 10,000-25,000, over-detected: ~1,600)
- Cell height: table_height / cell_count (minimum: 10px for readable text)
"""
import logging
from dataclasses import dataclass
from typing import List, Dict, Any, Optional, Tuple
from html.parser import HTMLParser
import re
logger = logging.getLogger(__name__)
@dataclass
class CellValidationConfig:
    """Configuration for cell validation thresholds.

    Defaults follow the empirical ranges described in the module docstring;
    tables falling outside these bounds are treated as over-detected.
    """
    # Tables with more cells per 10,000 px² than this are rejected.
    max_cell_density: float = 3.0  # cells per 10,000 px²
    # Tables whose average per-cell area falls below this are rejected.
    min_avg_cell_area: float = 3000.0  # px² per cell
    # Tables whose height divided by cell count falls below this are rejected.
    min_cell_height: float = 10.0  # px per cell row
    # Master switch; when False, every table passes validation unchanged.
    enabled: bool = True
@dataclass
class TableValidationResult:
    """Result of table validation.

    Returned by CellValidationEngine.validate_table for every table element.
    """
    # True when the table passed all metric and content checks.
    is_valid: bool
    # The original (unmodified) table element that was validated.
    table_element: Dict[str, Any]
    # Human-readable explanation; always set on failure, sometimes on pass.
    reason: Optional[str] = None
    # Metrics from calculate_table_metrics; None when metrics were not computed.
    metrics: Optional[Dict[str, float]] = None
class CellValidationEngine:
    """
    Validates table elements from PP-StructureV3 output.

    Over-detected tables are identified by abnormal metrics (cell density,
    average cell area, average cell height) plus content heuristics, and are
    reclassified as TEXT elements while preserving content.
    """

    def __init__(self, config: Optional[CellValidationConfig] = None):
        # Fall back to the default thresholds when no config is supplied.
        self.config = config or CellValidationConfig()
def calculate_table_metrics(
self,
bbox: List[float],
cell_boxes: List[List[float]]
) -> Dict[str, float]:
"""
Calculate validation metrics for a table.
Args:
bbox: Table bounding box [x0, y0, x1, y1]
cell_boxes: List of cell bounding boxes
Returns:
Dictionary with calculated metrics
"""
if len(bbox) < 4:
return {"cell_count": 0, "cell_density": 0, "avg_cell_area": 0, "avg_cell_height": 0}
cell_count = len(cell_boxes)
if cell_count == 0:
return {"cell_count": 0, "cell_density": 0, "avg_cell_area": 0, "avg_cell_height": 0}
# Calculate table dimensions
table_width = bbox[2] - bbox[0]
table_height = bbox[3] - bbox[1]
table_area = table_width * table_height
if table_area <= 0:
return {"cell_count": cell_count, "cell_density": 0, "avg_cell_area": 0, "avg_cell_height": 0}
# Cell density: cells per 10,000 px²
cell_density = (cell_count / table_area) * 10000
# Average cell area
avg_cell_area = table_area / cell_count
# Average cell height (table height / cell count)
avg_cell_height = table_height / cell_count
return {
"cell_count": cell_count,
"table_width": table_width,
"table_height": table_height,
"table_area": table_area,
"cell_density": cell_density,
"avg_cell_area": avg_cell_area,
"avg_cell_height": avg_cell_height
}
def validate_table(
self,
element: Dict[str, Any]
) -> TableValidationResult:
"""
Validate a single table element.
Args:
element: Table element from PP-StructureV3 output
Returns:
TableValidationResult with validation status and metrics
"""
if not self.config.enabled:
return TableValidationResult(is_valid=True, table_element=element)
# Extract bbox and cell_boxes
bbox = element.get("bbox", [])
cell_boxes = element.get("cell_boxes", [])
# Tables without cells pass validation (structure-only tables)
if not cell_boxes:
return TableValidationResult(
is_valid=True,
table_element=element,
reason="No cells to validate"
)
# Calculate metrics
metrics = self.calculate_table_metrics(bbox, cell_boxes)
# Check cell density
if metrics["cell_density"] > self.config.max_cell_density:
return TableValidationResult(
is_valid=False,
table_element=element,
reason=f"Cell density {metrics['cell_density']:.2f} exceeds threshold {self.config.max_cell_density}",
metrics=metrics
)
# Check average cell area
if metrics["avg_cell_area"] < self.config.min_avg_cell_area:
return TableValidationResult(
is_valid=False,
table_element=element,
reason=f"Avg cell area {metrics['avg_cell_area']:.0f}px² below threshold {self.config.min_avg_cell_area}px²",
metrics=metrics
)
# Check cell height
if metrics["avg_cell_height"] < self.config.min_cell_height:
return TableValidationResult(
is_valid=False,
table_element=element,
reason=f"Avg cell height {metrics['avg_cell_height']:.1f}px below threshold {self.config.min_cell_height}px",
metrics=metrics
)
# Content-based validation: check if content looks like prose vs tabular data
content_check = self._validate_table_content(element)
if not content_check["is_tabular"]:
return TableValidationResult(
is_valid=False,
table_element=element,
reason=content_check["reason"],
metrics=metrics
)
return TableValidationResult(
is_valid=True,
table_element=element,
metrics=metrics
)
def _validate_table_content(self, element: Dict[str, Any]) -> Dict[str, Any]:
"""
Validate table content to detect false positive tables.
Checks:
1. Sparsity: text coverage ratio (text area / table area)
2. Header: does table have proper header structure
3. Key-Value: for 2-col tables, is it a key-value list or random layout
4. Prose: are cells containing long prose text
Returns:
Dict with is_tabular (bool) and reason (str)
"""
html_content = element.get("content", "")
bbox = element.get("bbox", [])
cell_boxes = element.get("cell_boxes", [])
if not html_content or '<table' not in html_content.lower():
return {"is_tabular": True, "reason": "no_html_content"}
try:
from bs4 import BeautifulSoup
soup = BeautifulSoup(html_content, 'html.parser')
table = soup.find('table')
if not table:
return {"is_tabular": True, "reason": "no_table_element"}
rows = table.find_all('tr')
if not rows:
return {"is_tabular": True, "reason": "no_rows"}
# Extract cell contents with row structure
row_data = []
all_cells = []
for row_idx, row in enumerate(rows):
cells = row.find_all(['td', 'th'])
row_cells = []
for cell in cells:
text = cell.get_text(strip=True)
colspan = int(cell.get('colspan', 1))
is_header = cell.name == 'th'
cell_info = {
"text": text,
"length": len(text),
"colspan": colspan,
"is_header": is_header,
"row": row_idx
}
row_cells.append(cell_info)
all_cells.append(cell_info)
row_data.append(row_cells)
if not all_cells:
return {"is_tabular": True, "reason": "no_cells"}
num_rows = len(row_data)
num_cols = max(len(r) for r in row_data) if row_data else 0
# === Check 1: Sparsity (text coverage) ===
sparsity_result = self._check_sparsity(bbox, cell_boxes, all_cells)
if not sparsity_result["is_valid"]:
return {"is_tabular": False, "reason": sparsity_result["reason"]}
# === Check 2: Header structure ===
header_result = self._check_header_structure(row_data, num_cols)
if not header_result["has_header"] and num_rows > 3:
# Large table without header is suspicious
logger.debug(f"Table has no header structure with {num_rows} rows")
# === Check 3: Key-Value pattern for 2-column tables ===
if num_cols == 2:
kv_result = self._check_key_value_pattern(row_data)
if kv_result["is_kv_list"] and kv_result["confidence"] > 0.7:
# High confidence key-value list - keep as table but log
logger.debug(f"Table identified as key-value list (conf={kv_result['confidence']:.2f})")
elif not kv_result["is_kv_list"] and kv_result["is_random_layout"]:
# Random 2-column layout, not a real table
return {
"is_tabular": False,
"reason": f"random_two_column_layout (not key-value)"
}
# === Check 4: Prose content ===
long_cells = [c for c in all_cells if c["length"] > 80]
prose_ratio = len(long_cells) / len(all_cells) if all_cells else 0
if prose_ratio > 0.3:
return {
"is_tabular": False,
"reason": f"prose_content ({len(long_cells)}/{len(all_cells)} cells > 80 chars)"
}
# === Check 5: Section header as table ===
if num_rows <= 2 and num_cols <= 2:
first_row = row_data[0] if row_data else []
if len(first_row) == 1:
text = first_row[0]["text"]
if text.isupper() and len(text) < 50:
return {
"is_tabular": False,
"reason": f"section_header_only ({text[:30]})"
}
return {"is_tabular": True, "reason": "content_valid"}
except Exception as e:
logger.warning(f"Content validation failed: {e}")
return {"is_tabular": True, "reason": f"validation_error: {e}"}
def _check_sparsity(
self,
bbox: List[float],
cell_boxes: List[List[float]],
all_cells: List[Dict]
) -> Dict[str, Any]:
"""
Check text coverage ratio (sparsity).
Two-column layouts have large empty gaps in the middle.
Real tables have more uniform cell distribution.
"""
if len(bbox) < 4:
return {"is_valid": True, "reason": "no_bbox"}
table_width = bbox[2] - bbox[0]
table_height = bbox[3] - bbox[1]
table_area = table_width * table_height
if table_area <= 0:
return {"is_valid": True, "reason": "invalid_area"}
# Calculate text area from cell_boxes
if cell_boxes:
text_area = 0
for cb in cell_boxes:
if len(cb) >= 4:
w = abs(cb[2] - cb[0])
h = abs(cb[3] - cb[1])
text_area += w * h
coverage = text_area / table_area
else:
# Estimate from cell content length
total_chars = sum(c["length"] for c in all_cells)
# Rough estimate: 1 char ≈ 8x12 pixels = 96 px²
estimated_text_area = total_chars * 96
coverage = min(estimated_text_area / table_area, 1.0)
# Very sparse table (< 15% coverage) is suspicious
if coverage < 0.15:
return {
"is_valid": False,
"reason": f"sparse_content (coverage={coverage:.1%})"
}
return {"is_valid": True, "coverage": coverage}
def _check_header_structure(
self,
row_data: List[List[Dict]],
num_cols: int
) -> Dict[str, Any]:
"""
Check if table has proper header structure.
Real tables usually have:
- First row with <th> elements
- Or first row with different content pattern (labels vs values)
"""
if not row_data:
return {"has_header": False}
first_row = row_data[0]
# Check for <th> elements
th_count = sum(1 for c in first_row if c.get("is_header", False))
if th_count > 0 and th_count >= len(first_row) * 0.5:
return {"has_header": True, "type": "th_elements"}
# Check for header-like content (short, distinct from body)
if len(row_data) > 1:
first_row_avg_len = sum(c["length"] for c in first_row) / len(first_row) if first_row else 0
body_rows = row_data[1:]
body_cells = [c for row in body_rows for c in row]
body_avg_len = sum(c["length"] for c in body_cells) / len(body_cells) if body_cells else 0
# Header row should be shorter (labels) than body (data)
if first_row_avg_len < body_avg_len * 0.7:
return {"has_header": True, "type": "short_labels"}
return {"has_header": False}
def _check_key_value_pattern(
self,
row_data: List[List[Dict]]
) -> Dict[str, Any]:
"""
For 2-column tables, check if it's a key-value list.
Key-value characteristics:
- Left column: short labels (< 30 chars)
- Right column: values (can be longer)
- Consistent pattern across rows
Random layout characteristics:
- Both columns have similar length distribution
- No clear label-value relationship
"""
if not row_data:
return {"is_kv_list": False, "is_random_layout": False, "confidence": 0}
left_lengths = []
right_lengths = []
kv_rows = 0
total_rows = 0
for row in row_data:
if len(row) != 2:
continue
total_rows += 1
left = row[0]
right = row[1]
left_lengths.append(left["length"])
right_lengths.append(right["length"])
# Key-value pattern: left is short label, right is value
if left["length"] < 40 and left["length"] < right["length"] * 2:
kv_rows += 1
if total_rows == 0:
return {"is_kv_list": False, "is_random_layout": False, "confidence": 0}
kv_ratio = kv_rows / total_rows
avg_left = sum(left_lengths) / len(left_lengths) if left_lengths else 0
avg_right = sum(right_lengths) / len(right_lengths) if right_lengths else 0
# High KV ratio and left column is shorter = key-value list
if kv_ratio > 0.6 and avg_left < avg_right:
return {
"is_kv_list": True,
"is_random_layout": False,
"confidence": kv_ratio,
"avg_left": avg_left,
"avg_right": avg_right
}
# Similar lengths on both sides = random layout
if avg_left > 0 and 0.5 < avg_right / avg_left < 2.0:
# Both columns have similar content length
return {
"is_kv_list": False,
"is_random_layout": True,
"confidence": 1 - kv_ratio,
"avg_left": avg_left,
"avg_right": avg_right
}
return {
"is_kv_list": False,
"is_random_layout": False,
"confidence": 0,
"avg_left": avg_left,
"avg_right": avg_right
}
def extract_text_from_table_html(self, html_content: str) -> str:
"""
Extract plain text from table HTML content.
Args:
html_content: HTML string containing table structure
Returns:
Plain text extracted from table cells
"""
if not html_content:
return ""
try:
class TableTextExtractor(HTMLParser):
def __init__(self):
super().__init__()
self.text_parts = []
self.in_cell = False
def handle_starttag(self, tag, attrs):
if tag in ('td', 'th'):
self.in_cell = True
def handle_endtag(self, tag):
if tag in ('td', 'th'):
self.in_cell = False
def handle_data(self, data):
if self.in_cell:
stripped = data.strip()
if stripped:
self.text_parts.append(stripped)
parser = TableTextExtractor()
parser.feed(html_content)
return ' '.join(parser.text_parts)
except Exception as e:
logger.warning(f"Failed to parse table HTML: {e}")
# Fallback: strip HTML tags with regex
text = re.sub(r'<[^>]+>', ' ', html_content)
text = re.sub(r'\s+', ' ', text).strip()
return text
def reclassify_as_text(self, element: Dict[str, Any]) -> Dict[str, Any]:
"""
Convert an over-detected table element to a TEXT element.
Args:
element: Table element to reclassify
Returns:
New TEXT element with preserved content
"""
# Extract text content from HTML
html_content = element.get("content", "")
text_content = self.extract_text_from_table_html(html_content)
# Create new TEXT element
text_element = {
"element_id": element.get("element_id", ""),
"type": "text",
"original_type": "table_reclassified", # Mark as reclassified
"content": text_content,
"page": element.get("page", 0),
"bbox": element.get("bbox", []),
"index": element.get("index", 0),
"confidence": element.get("confidence", 1.0),
"reclassified_from": "table",
"reclassification_reason": "over_detection"
}
return text_element
def validate_and_filter_elements(
self,
elements: List[Dict[str, Any]]
) -> Tuple[List[Dict[str, Any]], Dict[str, Any]]:
"""
Validate all elements and filter/reclassify over-detected tables.
Args:
elements: List of elements from PP-StructureV3 output
Returns:
Tuple of (filtered_elements, statistics)
"""
filtered_elements = []
stats = {
"total_tables": 0,
"valid_tables": 0,
"reclassified_tables": 0,
"reclassification_details": []
}
for element in elements:
if element.get("type") != "table":
# Non-table elements pass through unchanged
filtered_elements.append(element)
continue
stats["total_tables"] += 1
# Validate table
result = self.validate_table(element)
if result.is_valid:
stats["valid_tables"] += 1
filtered_elements.append(element)
else:
# Reclassify as TEXT
stats["reclassified_tables"] += 1
text_element = self.reclassify_as_text(element)
filtered_elements.append(text_element)
stats["reclassification_details"].append({
"element_id": element.get("element_id"),
"reason": result.reason,
"metrics": result.metrics
})
logger.info(
f"Reclassified table {element.get('element_id')} as TEXT: {result.reason}"
)
# Re-sort by reading order (y0 then x0)
filtered_elements = self._sort_by_reading_order(filtered_elements)
return filtered_elements, stats
def _sort_by_reading_order(
self,
elements: List[Dict[str, Any]]
) -> List[Dict[str, Any]]:
"""Sort elements by reading order (top-to-bottom, left-to-right)."""
def sort_key(elem):
bbox = elem.get("bbox", [0, 0, 0, 0])
if isinstance(bbox, dict):
y0 = bbox.get("y0", 0)
x0 = bbox.get("x0", 0)
elif isinstance(bbox, list) and len(bbox) >= 2:
x0, y0 = bbox[0], bbox[1]
else:
y0, x0 = 0, 0
return (y0, x0)
return sorted(elements, key=sort_key)

View File

@@ -83,12 +83,34 @@ class TextRegion:
return ((x0 + x1) / 2, (y0 + y1) / 2)
# Element type to IoA threshold mapping
# TABLE needs strict filtering (low threshold) to prevent duplicate content
# FIGURE allows more text through (high threshold) to preserve axis labels, legends
# TEXT/TITLE uses moderate threshold to tolerate boundary detection errors
ELEMENT_TYPE_IOA_THRESHOLDS = {
ElementType.TABLE: 'table',
ElementType.FIGURE: 'figure',
ElementType.IMAGE: 'figure',
ElementType.CHART: 'figure',
ElementType.DIAGRAM: 'figure',
}
class GapFillingService:
"""
Service for detecting and filling gaps in PP-StructureV3 output.
This service uses IoA (Intersection over Area) algorithm for coverage detection,
which correctly measures "small box contained in large box" relationship.
Key improvements over IoU:
- IoA = intersection_area / ocr_box_area (non-symmetric)
- Better for detecting if OCR text is covered by larger layout regions
- Different thresholds per element type (TEXT, TABLE, FIGURE)
- Optional boundary shrinking to reduce edge duplicates
This service:
1. Calculates coverage of PP-StructureV3 elements over raw OCR regions
1. Calculates coverage of PP-StructureV3 elements over raw OCR regions using IoA
2. Identifies uncovered raw OCR regions
3. Supplements uncovered regions as TEXT elements
4. Deduplicates against existing PP-StructureV3 TEXT elements
@@ -98,9 +120,12 @@ class GapFillingService:
def __init__(
self,
coverage_threshold: float = None,
iou_threshold: float = None,
confidence_threshold: float = None,
dedup_iou_threshold: float = None,
ioa_threshold_text: float = None,
ioa_threshold_table: float = None,
ioa_threshold_figure: float = None,
dedup_ioa_threshold: float = None,
shrink_pixels: int = None,
enabled: bool = None
):
"""
@@ -108,27 +133,48 @@ class GapFillingService:
Args:
coverage_threshold: Coverage ratio below which gap filling activates (default: 0.7)
iou_threshold: IoU threshold for coverage detection (default: 0.15)
confidence_threshold: Minimum confidence for raw OCR regions (default: 0.3)
dedup_iou_threshold: IoU threshold for deduplication (default: 0.5)
ioa_threshold_text: IoA threshold for TEXT/TITLE elements (default: 0.6)
ioa_threshold_table: IoA threshold for TABLE elements (default: 0.1)
ioa_threshold_figure: IoA threshold for FIGURE/IMAGE elements (default: 0.8)
dedup_ioa_threshold: IoA threshold for deduplication (default: 0.5)
shrink_pixels: Shrink OCR bbox inward by this many pixels (default: 1)
enabled: Whether gap filling is enabled (default: True)
"""
self.coverage_threshold = coverage_threshold if coverage_threshold is not None else getattr(
settings, 'gap_filling_coverage_threshold', 0.7
)
self.iou_threshold = iou_threshold if iou_threshold is not None else getattr(
settings, 'gap_filling_iou_threshold', 0.15
)
self.confidence_threshold = confidence_threshold if confidence_threshold is not None else getattr(
settings, 'gap_filling_confidence_threshold', 0.3
)
self.dedup_iou_threshold = dedup_iou_threshold if dedup_iou_threshold is not None else getattr(
settings, 'gap_filling_dedup_iou_threshold', 0.5
# IoA thresholds per element type
self.ioa_threshold_text = ioa_threshold_text if ioa_threshold_text is not None else getattr(
settings, 'gap_filling_ioa_threshold_text', 0.6
)
self.ioa_threshold_table = ioa_threshold_table if ioa_threshold_table is not None else getattr(
settings, 'gap_filling_ioa_threshold_table', 0.1
)
self.ioa_threshold_figure = ioa_threshold_figure if ioa_threshold_figure is not None else getattr(
settings, 'gap_filling_ioa_threshold_figure', 0.8
)
self.dedup_ioa_threshold = dedup_ioa_threshold if dedup_ioa_threshold is not None else getattr(
settings, 'gap_filling_dedup_ioa_threshold', 0.5
)
# Boundary shrinking
self.shrink_pixels = shrink_pixels if shrink_pixels is not None else getattr(
settings, 'gap_filling_shrink_pixels', 1
)
self.enabled = enabled if enabled is not None else getattr(
settings, 'gap_filling_enabled', True
)
# Legacy compatibility
self.iou_threshold = getattr(settings, 'gap_filling_iou_threshold', 0.15)
self.dedup_iou_threshold = getattr(settings, 'gap_filling_dedup_iou_threshold', 0.5)
def should_activate(
self,
raw_ocr_regions: List[TextRegion],
@@ -209,21 +255,83 @@ class GapFillingService:
logger.debug(f"Found {len(uncovered)} uncovered regions out of {len(raw_ocr_regions)}")
return uncovered
def _get_ioa_threshold_for_element(self, element_type: ElementType) -> float:
"""
Get the IoA threshold for a specific element type.
Different element types have different thresholds:
- TABLE: 0.1 (strict, prevents duplicate table content)
- FIGURE/IMAGE: 0.8 (preserves text inside figures)
- TEXT/others: 0.6 (tolerates boundary errors)
Args:
element_type: The element type to get threshold for
Returns:
IoA threshold value
"""
threshold_type = ELEMENT_TYPE_IOA_THRESHOLDS.get(element_type, 'text')
if threshold_type == 'table':
return self.ioa_threshold_table
elif threshold_type == 'figure':
return self.ioa_threshold_figure
else:
return self.ioa_threshold_text
def _shrink_bbox(
self,
bbox: Tuple[float, float, float, float],
pixels: int
) -> Tuple[float, float, float, float]:
"""
Shrink a bounding box inward by the specified number of pixels.
This reduces false "uncovered" detection at region boundaries.
Args:
bbox: Original bbox (x0, y0, x1, y1)
pixels: Number of pixels to shrink on each side
Returns:
Shrunk bbox (x0, y0, x1, y1)
"""
x0, y0, x1, y1 = bbox
# Ensure we don't shrink to negative width/height
width = x1 - x0
height = y1 - y0
max_shrink = min(width / 2, height / 2, pixels)
return (
x0 + max_shrink,
y0 + max_shrink,
x1 - max_shrink,
y1 - max_shrink
)
def _is_region_covered(
self,
region: TextRegion,
pp_structure_elements: List[DocumentElement],
skip_table_coverage: bool = True
skip_table_coverage: bool = False
) -> bool:
"""
Check if a raw OCR region is covered by any PP-StructureV3 element.
Uses IoA (Intersection over Area) instead of IoU for better coverage detection.
IoA = intersection_area / ocr_box_area
This correctly measures "OCR box is contained in layout region".
Different element types use different IoA thresholds:
- TABLE: 0.1 (strict, any overlap means covered)
- FIGURE/IMAGE: 0.8 (preserve text inside figures like axis labels)
- TEXT/others: 0.6 (tolerate boundary errors)
Args:
region: Raw OCR text region
pp_structure_elements: List of PP-StructureV3 elements
skip_table_coverage: If True, don't consider TABLE elements as covering
(allows raw OCR text inside tables to pass through
for layered rendering)
skip_table_coverage: If True, don't consider TABLE elements as covering.
Default is False - TABLE elements DO cover regions
to prevent duplicate rendering of table cell content.
Returns:
True if the region is covered
@@ -231,10 +339,13 @@ class GapFillingService:
center_x, center_y = region.center
region_bbox = region.normalized_bbox
# Apply boundary shrinking to reduce edge duplicates
if self.shrink_pixels > 0:
region_bbox = self._shrink_bbox(region_bbox, self.shrink_pixels)
for element in pp_structure_elements:
# Skip TABLE elements when checking coverage
# This allows raw OCR text inside tables to be preserved
# PDF generator will render: table borders + raw text positions
# Check TABLE elements for coverage (default behavior)
# This prevents gap_fill from adding duplicate text inside table areas
if skip_table_coverage and element.type == ElementType.TABLE:
continue
@@ -247,9 +358,11 @@ class GapFillingService:
if self._point_in_bbox(center_x, center_y, elem_bbox):
return True
# Check 2: IoU exceeds threshold
iou = self._calculate_iou(region_bbox, elem_bbox)
if iou > self.iou_threshold:
# Check 2: IoA exceeds element-type-specific threshold
# IoA = intersection_area / ocr_box_area
ioa = self._calculate_ioa(region_bbox, elem_bbox)
threshold = self._get_ioa_threshold_for_element(element.type)
if ioa > threshold:
return True
return False
@@ -262,6 +375,9 @@ class GapFillingService:
"""
Remove regions that highly overlap with existing PP-StructureV3 TEXT elements.
Uses IoA (Intersection over Area) for deduplication to correctly detect
when an OCR region is already covered by an existing TEXT element.
Args:
uncovered_regions: List of uncovered raw OCR regions
pp_structure_elements: List of PP-StructureV3 elements
@@ -278,6 +394,11 @@ class GapFillingService:
deduplicated = []
for region in uncovered_regions:
region_bbox = region.normalized_bbox
# Apply boundary shrinking for deduplication as well
if self.shrink_pixels > 0:
region_bbox = self._shrink_bbox(region_bbox, self.shrink_pixels)
is_duplicate = False
for element in text_elements:
@@ -286,10 +407,11 @@ class GapFillingService:
element.bbox.x1, element.bbox.y1
)
iou = self._calculate_iou(region_bbox, elem_bbox)
if iou > self.dedup_iou_threshold:
# Use IoA for deduplication
ioa = self._calculate_ioa(region_bbox, elem_bbox)
if ioa > self.dedup_ioa_threshold:
logger.debug(
f"Skipping duplicate region (IoU={iou:.2f}): '{region.text[:30]}...'"
f"Skipping duplicate region (IoA={ioa:.2f}): '{region.text[:30]}...'"
)
is_duplicate = True
break
@@ -622,6 +744,52 @@ class GapFillingService:
x0, y0, x1, y1 = bbox
return x0 <= x <= x1 and y0 <= y <= y1
@staticmethod
def _calculate_ioa(
ocr_bbox: Tuple[float, float, float, float],
layout_bbox: Tuple[float, float, float, float]
) -> float:
"""
Calculate Intersection over Area (IoA) of OCR bbox relative to layout bbox.
IoA = intersection_area / ocr_box_area
This is the recommended algorithm for detecting if an OCR text region
is contained within a larger layout region. Unlike IoU which is symmetric,
IoA correctly measures "how much of the OCR box is inside the layout region".
Example:
- OCR box: 100x20 pixels (small text line)
- Layout box: 500x800 pixels (large paragraph region)
- IoU would be very small (~0.005) even if OCR is fully inside layout
- IoA would be 1.0 if OCR is fully inside layout, which is correct
Args:
ocr_bbox: OCR text region bbox (x0, y0, x1, y1) - typically smaller
layout_bbox: Layout element bbox (x0, y0, x1, y1) - typically larger
Returns:
IoA value between 0 and 1
"""
# Calculate intersection
x0 = max(ocr_bbox[0], layout_bbox[0])
y0 = max(ocr_bbox[1], layout_bbox[1])
x1 = min(ocr_bbox[2], layout_bbox[2])
y1 = min(ocr_bbox[3], layout_bbox[3])
if x1 <= x0 or y1 <= y0:
return 0.0
intersection = (x1 - x0) * (y1 - y0)
# Calculate OCR box area (denominator for IoA)
ocr_area = (ocr_bbox[2] - ocr_bbox[0]) * (ocr_bbox[3] - ocr_bbox[1])
if ocr_area <= 0:
return 0.0
return intersection / ocr_area
@staticmethod
def _calculate_iou(
bbox1: Tuple[float, float, float, float],
@@ -630,6 +798,9 @@ class GapFillingService:
"""
Calculate Intersection over Union (IoU) of two bboxes.
Note: This method is kept for backward compatibility.
For coverage detection, use _calculate_ioa() instead.
Args:
bbox1: First bbox (x0, y0, x1, y1)
bbox2: Second bbox (x0, y0, x1, y1)

View File

@@ -6,7 +6,7 @@ Supports both PaddleOCR (for scanned documents) and direct extraction (for edita
import json
import logging
from pathlib import Path
from typing import Dict, List, Optional, Tuple, Union
from typing import Any, Dict, List, Optional, Tuple, Union
from datetime import datetime
import uuid
import gc # For garbage collection
@@ -446,6 +446,47 @@ class OCRService:
except Exception as e:
logger.warning(f"Failed to clear GPU cache: {e}")
def _apply_ocr_config(self, ocr_config: 'OCRConfig'):
"""
Apply OCR configuration from preset or custom settings.
This modifies the runtime settings used by PP-Structure.
Args:
ocr_config: OCRConfig object with processing settings
"""
logger.info(f"Applying OCR config: {ocr_config.model_dump()}")
# Store the config for use in PP-Structure initialization
self._runtime_ocr_config = ocr_config
# Apply table parsing mode settings
# These will be used when initializing PP-StructureV3
settings.table_parsing_mode = ocr_config.table_parsing_mode.value if hasattr(ocr_config.table_parsing_mode, 'value') else ocr_config.table_parsing_mode
# Apply preprocessing settings
settings.use_doc_orientation_classify = ocr_config.use_doc_orientation_classify
settings.use_doc_unwarping = ocr_config.use_doc_unwarping
settings.use_textline_orientation = ocr_config.use_textline_orientation
# Apply recognition module settings
settings.enable_chart_recognition = ocr_config.enable_chart_recognition
settings.enable_formula_recognition = ocr_config.enable_formula_recognition
settings.enable_seal_recognition = ocr_config.enable_seal_recognition
settings.enable_region_detection = ocr_config.enable_region_detection
# Apply layout threshold if specified
if ocr_config.layout_threshold is not None:
settings.layout_detection_threshold = ocr_config.layout_threshold
if ocr_config.layout_nms_threshold is not None:
settings.layout_nms_threshold = ocr_config.layout_nms_threshold
# Invalidate existing structure engine to force re-initialization with new settings
if self.structure_engine is not None:
logger.info("Invalidating PP-StructureV3 engine to apply new OCR config")
self._unload_structure_engine()
logger.info(f"OCR config applied: table_parsing_mode={settings.table_parsing_mode}")
def get_ocr_engine(self, lang: str = 'ch') -> PaddleOCR:
"""
Get or create OCR engine for specified language with GPU support
@@ -615,6 +656,39 @@ class OCRService:
formula_model = settings.formula_recognition_model_name
chart_model = settings.chart_recognition_model_name
# Apply table_parsing_mode settings
# This is the KEY configuration to prevent "cell explosion" on datasheet-type documents
table_parsing_mode = settings.table_parsing_mode
logger.info(f"Table parsing mode: {table_parsing_mode}")
if table_parsing_mode == "disabled":
# 方案A: 完全關閉 table recognition
use_table = False
wired_table_model = None
wireless_table_model = None
wired_cell_det_model = None
wireless_cell_det_model = None
logger.info("Table parsing DISABLED - no cell segmentation")
elif table_parsing_mode == "classification_only":
# 方案C: 只做 table classification不做 cell segmentation
use_table = False # Don't parse table structure
wired_table_model = None
wireless_table_model = None
wired_cell_det_model = None
wireless_cell_det_model = None
# Keep table_cls_model to identify table regions
logger.info("Table parsing CLASSIFICATION_ONLY - regions identified but no cell parsing")
elif table_parsing_mode == "conservative":
# 方案B: 保守模式 - 只禁用 wireless tables (aggressive)
# 注意:不要修改 layout_threshold它會影響所有元素偵測不只是表格
wireless_table_model = None
wireless_cell_det_model = None
logger.info(f"Table parsing CONSERVATIVE - wireless disabled (layout_threshold unchanged)")
# else: "full" mode - use all default settings (aggressive)
# Apply table detection config overrides for individual table types
if table_detection_config:
if not table_detection_config.enable_wired_table:
@@ -1343,6 +1417,7 @@ class OCRService:
if detect_layout:
# Pass current_page to analyze_layout for correct page numbering
# Also pass text_regions for table content rebuilding
layout_data, images_metadata = self.analyze_layout(
image_path,
output_dir=output_dir,
@@ -1350,7 +1425,8 @@ class OCRService:
layout_model=layout_model,
preprocessing_mode=preprocessing_mode,
preprocessing_config=preprocessing_config,
table_detection_config=table_detection_config
table_detection_config=table_detection_config,
raw_ocr_regions=text_regions # For table content rebuilding
)
# Generate Markdown
@@ -1379,6 +1455,12 @@ class OCRService:
# If layout data is enhanced, add enhanced results for converter
if layout_data and layout_data.get('enhanced'):
# Debug: check if table elements have rebuild_stats
for elem in layout_data.get('elements', []):
if elem.get('type') == 'table':
has_rebuild = 'rebuild_stats' in elem
logger.info(f"[OCR_SERVICE] Table {elem.get('element_id')}: has rebuild_stats={has_rebuild}, keys={list(elem.keys())[:10]}")
result['enhanced_results'] = [{
'elements': layout_data.get('elements', []),
'reading_order': layout_data.get('reading_order', []),
@@ -1509,7 +1591,8 @@ class OCRService:
layout_model: Optional[str] = None,
preprocessing_mode: Optional[PreprocessingModeEnum] = None,
preprocessing_config: Optional[PreprocessingConfig] = None,
table_detection_config: Optional[TableDetectionConfig] = None
table_detection_config: Optional[TableDetectionConfig] = None,
raw_ocr_regions: Optional[List[Dict[str, Any]]] = None
) -> Tuple[Optional[Dict], List[Dict]]:
"""
Analyze document layout using PP-StructureV3 with enhanced element extraction
@@ -1522,6 +1605,7 @@ class OCRService:
preprocessing_mode: Preprocessing mode ('auto', 'manual', 'disabled')
preprocessing_config: Manual preprocessing config (used when mode='manual')
table_detection_config: Table detection config (wired/wireless/region options)
raw_ocr_regions: Optional list of raw OCR text regions for table content rebuilding
Returns:
Tuple of (layout_data, images_metadata)
@@ -1607,7 +1691,8 @@ class OCRService:
preprocessed_image=preprocessed_image,
scaling_info=scaling_info,
save_visualization=True, # Save layout detection visualization images
use_cv_table_detection=use_cv_table_detection
use_cv_table_detection=use_cv_table_detection,
raw_ocr_regions=raw_ocr_regions # For table content rebuilding
)
if result.get('has_parsing_res_list'):
@@ -2225,7 +2310,8 @@ class OCRService:
layout_model: Optional[str] = None,
preprocessing_mode: Optional[PreprocessingModeEnum] = None,
preprocessing_config: Optional[PreprocessingConfig] = None,
table_detection_config: Optional[TableDetectionConfig] = None
table_detection_config: Optional[TableDetectionConfig] = None,
ocr_config: Optional['OCRConfig'] = None
) -> Union[UnifiedDocument, Dict]:
"""
Main processing method with dual-track support.
@@ -2242,11 +2328,16 @@ class OCRService:
preprocessing_mode: Layout preprocessing mode ('auto', 'manual', 'disabled')
preprocessing_config: Manual preprocessing config (used when mode='manual')
table_detection_config: Table detection config (wired/wireless/region options)
ocr_config: OCR processing config from preset or custom settings
Returns:
UnifiedDocument if dual-track is enabled and use_dual_track=True,
Dict with legacy format otherwise
"""
# Apply OCR config to settings if provided
if ocr_config:
self._apply_ocr_config(ocr_config)
# Use dual-track processing if:
# 1. use_dual_track is True (auto-detection), OR
# 2. force_track is specified (explicit track selection)

View File

@@ -189,7 +189,7 @@ def validate_cell_boxes(
Validate cell_boxes coordinates against page boundaries and table bbox.
PP-StructureV3 sometimes returns cell_boxes with coordinates that exceed
page boundaries. This function validates and reports issues.
page boundaries or table bbox. This function validates and clamps to valid boundaries.
Args:
cell_boxes: List of cell bounding boxes [[x0, y0, x1, y1], ...]
@@ -213,10 +213,22 @@ def validate_cell_boxes(
clamped_boxes = []
# Page boundaries with tolerance
min_x = -tolerance
min_y = -tolerance
max_x = page_width + tolerance
max_y = page_height + tolerance
page_min_x = -tolerance
page_min_y = -tolerance
page_max_x = page_width + tolerance
page_max_y = page_height + tolerance
# Table boundaries with tolerance (prefer clamping to table bbox)
table_min_x = table_bbox[0] - tolerance if len(table_bbox) >= 4 else page_min_x
table_min_y = table_bbox[1] - tolerance if len(table_bbox) >= 4 else page_min_y
table_max_x = table_bbox[2] + tolerance if len(table_bbox) >= 4 else page_max_x
table_max_y = table_bbox[3] + tolerance if len(table_bbox) >= 4 else page_max_y
# For clamping, use the intersection of page and expanded table bbox
clamp_min_x = max(0, table_bbox[0] - tolerance) if len(table_bbox) >= 4 else 0
clamp_min_y = max(0, table_bbox[1] - tolerance) if len(table_bbox) >= 4 else 0
clamp_max_x = min(page_width, table_bbox[2] + tolerance) if len(table_bbox) >= 4 else page_width
clamp_max_y = min(page_height, table_bbox[3] + tolerance) if len(table_bbox) >= 4 else page_height
for idx, box in enumerate(cell_boxes):
if not box or len(box) < 4:
@@ -230,19 +242,38 @@ def validate_cell_boxes(
cell_issues = []
# Check if coordinates exceed page boundaries
if x0 < min_x:
if x0 < page_min_x:
cell_issues.append(f"x0={x0:.1f} < 0")
is_valid = False
if y0 < min_y:
if y0 < page_min_y:
cell_issues.append(f"y0={y0:.1f} < 0")
is_valid = False
if x1 > max_x:
if x1 > page_max_x:
cell_issues.append(f"x1={x1:.1f} > page_width={page_width:.1f}")
is_valid = False
if y1 > max_y:
if y1 > page_max_y:
cell_issues.append(f"y1={y1:.1f} > page_height={page_height:.1f}")
is_valid = False
# Check if coordinates significantly exceed table bbox (more than 20% of table size)
if len(table_bbox) >= 4:
table_w = table_bbox[2] - table_bbox[0]
table_h = table_bbox[3] - table_bbox[1]
expand_tolerance = max(tolerance, table_h * 0.2) # 20% of table height
if y0 < table_bbox[1] - expand_tolerance:
cell_issues.append(f"y0={y0:.1f} above table (table_y0={table_bbox[1]:.1f})")
is_valid = False
if y1 > table_bbox[3] + expand_tolerance:
cell_issues.append(f"y1={y1:.1f} below table (table_y1={table_bbox[3]:.1f})")
is_valid = False
if x0 < table_bbox[0] - expand_tolerance:
cell_issues.append(f"x0={x0:.1f} left of table (table_x0={table_bbox[0]:.1f})")
is_valid = False
if x1 > table_bbox[2] + expand_tolerance:
cell_issues.append(f"x1={x1:.1f} right of table (table_x1={table_bbox[2]:.1f})")
is_valid = False
# Check for inverted coordinates
if x0 > x1:
cell_issues.append(f"x0={x0:.1f} > x1={x1:.1f}")
@@ -255,12 +286,12 @@ def validate_cell_boxes(
invalid_count += 1
issues.append(f"Cell {idx}: {', '.join(cell_issues)}")
# Clamp to valid boundaries
# Clamp to valid boundaries (table bbox with some tolerance)
clamped_box = [
max(0, min(x0, page_width)),
max(0, min(y0, page_height)),
max(0, min(x1, page_width)),
max(0, min(y1, page_height))
max(clamp_min_x, min(x0, clamp_max_x)),
max(clamp_min_y, min(y0, clamp_max_y)),
max(clamp_min_x, min(x1, clamp_max_x)),
max(clamp_min_y, min(y1, clamp_max_y))
]
# Ensure proper ordering after clamping
@@ -395,10 +426,15 @@ class OCRToUnifiedConverter:
Handles both enhanced PP-StructureV3 results (with parsing_res_list)
and traditional markdown results. Applies gap filling when enabled.
Gap filling can use either:
1. overall_ocr_res from PP-StructureV3 (preferred, no extra inference)
2. Separate raw OCR text_regions (fallback)
"""
pages = []
# Extract raw OCR text regions for gap filling
# Prefer overall_ocr_res from PP-StructureV3 when available
raw_text_regions = ocr_results.get('text_regions', [])
ocr_dimensions = ocr_results.get('ocr_dimensions', {})
@@ -461,13 +497,22 @@ class OCRToUnifiedConverter:
if element:
elements.append(element)
# Apply gap filling if enabled and raw regions available
if self.gap_filling_service and raw_text_regions:
# Filter raw regions for current page
page_raw_regions = [
r for r in raw_text_regions
if r.get('page', 0) == page_idx or r.get('page', 1) == page_idx + 1
]
# Apply gap filling if enabled
# Priority: 1) overall_ocr_res from page_result, 2) raw_text_regions from separate OCR
if self.gap_filling_service:
# Check for overall_ocr_res from PP-StructureV3 (preferred, no extra inference)
page_raw_regions = page_result.get('overall_ocr_res', [])
if page_raw_regions:
logger.debug(f"Page {page_idx + 1}: Using overall_ocr_res ({len(page_raw_regions)} regions)")
elif raw_text_regions:
# Fallback to separate raw OCR regions
page_raw_regions = [
r for r in raw_text_regions
if r.get('page', 0) == page_idx or r.get('page', 1) == page_idx + 1
]
if page_raw_regions:
logger.debug(f"Page {page_idx + 1}: Using separate raw OCR ({len(page_raw_regions)} regions)")
if page_raw_regions:
supplemented, stats = self.gap_filling_service.fill_gaps(
@@ -711,8 +756,33 @@ class OCRToUnifiedConverter:
# Prepare content based on element type
if element_type == ElementType.TABLE:
# For tables, use TableData as content
# Pass cell_boxes for accurate cell positioning
table_data = self._extract_table_data(elem_data)
# Priority: rebuilt_table > HTML parsing
# rebuilt_table contains clean cells without empty padding
if 'rebuilt_table' in elem_data:
rebuilt = elem_data['rebuilt_table']
# Use rebuilt cells directly - they don't include empty cells
rebuilt_cells = rebuilt.get('cells', [])
from app.models.unified_document import TableCell
table_cells = [
TableCell(
row=c.get('row', 0),
col=c.get('col', 0),
row_span=c.get('row_span', 1),
col_span=c.get('col_span', 1),
content=c.get('content', '')
)
for c in rebuilt_cells
]
table_data = TableData(
rows=rebuilt.get('rows', 0),
cols=rebuilt.get('cols', 0),
cells=table_cells,
caption=elem_data.get('extracted_text')
)
logger.info(f"[CONVERTER] Table {elem_data.get('element_id')}: Using rebuilt_table directly ({len(rebuilt_cells)} cells)")
else:
# Fallback to HTML parsing for non-rebuilt tables
table_data = self._extract_table_data(elem_data)
content = table_data if table_data else elem_data.get('content', '')
# Preserve cell_boxes and embedded_images in metadata for PDF generation
@@ -756,6 +826,18 @@ class OCRToUnifiedConverter:
if 'embedded_images' in elem_data:
elem_data.setdefault('metadata', {})['embedded_images'] = elem_data['embedded_images']
# Pass through rebuild information for tables that were rebuilt
# This tells the PDF renderer to use HTML content instead of cell_boxes
logger.info(f"[CONVERTER] Table {elem_data.get('element_id')}: checking for rebuild_stats, keys={list(elem_data.keys())}")
if 'rebuild_stats' in elem_data:
elem_data.setdefault('metadata', {})['rebuild_stats'] = elem_data['rebuild_stats']
elem_data['metadata']['was_rebuilt'] = True
logger.info(f"[CONVERTER] Table {elem_data.get('element_id')}: FOUND rebuild_stats, setting was_rebuilt=True")
if 'rebuilt_table' in elem_data:
elem_data.setdefault('metadata', {})['rebuilt_table'] = elem_data['rebuilt_table']
elif element_type in [
ElementType.IMAGE, ElementType.FIGURE, ElementType.CHART,
ElementType.DIAGRAM, ElementType.LOGO, ElementType.STAMP

View File

@@ -26,6 +26,23 @@ from html.parser import HTMLParser
from app.core.config import settings
# Import table column corrector for column alignment fix
try:
from app.services.table_column_corrector import TableColumnCorrector
TABLE_COLUMN_CORRECTOR_AVAILABLE = True
except ImportError:
TABLE_COLUMN_CORRECTOR_AVAILABLE = False
TableColumnCorrector = None
# Import text region renderer for simple text positioning
try:
from app.services.text_region_renderer import TextRegionRenderer, load_raw_ocr_regions
TEXT_REGION_RENDERER_AVAILABLE = True
except ImportError:
TEXT_REGION_RENDERER_AVAILABLE = False
TextRegionRenderer = None
load_raw_ocr_regions = None
# Import UnifiedDocument for dual-track support
try:
from app.models.unified_document import (
@@ -596,7 +613,8 @@ class PDFGeneratorService:
'content': html_content,
'bbox': [element.bbox.x0, element.bbox.y0,
element.bbox.x1, element.bbox.y1],
'page': page_num - 1 # layout uses 0-based
'page': page_num - 1, # layout uses 0-based
'element_id': element.element_id # For _use_border_only matching
}
# Preserve cell_boxes and embedded_images from metadata
@@ -607,18 +625,29 @@ class PDFGeneratorService:
table_element['cell_boxes_source'] = element.metadata.get('cell_boxes_source', 'metadata')
if 'embedded_images' in element.metadata:
table_element['embedded_images'] = element.metadata['embedded_images']
# Pass through rebuild flag - rebuilt tables should use HTML content
if element.metadata.get('was_rebuilt'):
table_element['was_rebuilt'] = True
logger.debug(f"Table {element.element_id}: marked as rebuilt")
layout_elements.append(table_element)
# Add bbox to images_metadata for text overlap filtering
# (no actual image file, just bbox for filtering)
images_metadata.append({
img_metadata = {
'image_path': None, # No fake table image
'bbox': bbox_polygon,
'page': page_num - 1, # 0-based for images_metadata
'type': 'table',
'element_id': element.element_id
})
}
# Also copy cell_boxes for quality checking
if element.metadata and 'cell_boxes' in element.metadata:
img_metadata['cell_boxes'] = element.metadata['cell_boxes']
# Mark if table was rebuilt
if element.metadata and element.metadata.get('was_rebuilt'):
img_metadata['was_rebuilt'] = True
images_metadata.append(img_metadata)
# Handle image/visual elements (including stamps/seals)
elif element.is_visual or element.type in [
@@ -1022,15 +1051,25 @@ class PDFGeneratorService:
# Set current track
self.current_processing_track = 'ocr'
# Convert UnifiedDocument to OCR data format (legacy)
ocr_data = self.convert_unified_document_to_ocr_data(unified_doc)
# Check if simple text positioning mode is enabled
if (settings.simple_text_positioning_enabled and
TEXT_REGION_RENDERER_AVAILABLE):
logger.info("Using simple text positioning mode")
result = self._generate_simple_text_pdf(
unified_doc=unified_doc,
output_path=output_path,
source_file_path=source_file_path
)
else:
# Convert UnifiedDocument to OCR data format (legacy)
ocr_data = self.convert_unified_document_to_ocr_data(unified_doc)
# Use existing generation pipeline
result = self._generate_pdf_from_data(
ocr_data=ocr_data,
output_path=output_path,
source_file_path=source_file_path
)
# Use existing generation pipeline
result = self._generate_pdf_from_data(
ocr_data=ocr_data,
output_path=output_path,
source_file_path=source_file_path
)
# Reset track
self.current_processing_track = None
@@ -1043,6 +1082,235 @@ class PDFGeneratorService:
self.current_processing_track = None
return False
def _generate_simple_text_pdf(
self,
unified_doc: 'UnifiedDocument',
output_path: Path,
source_file_path: Optional[Path] = None
) -> bool:
"""
Generate PDF using simple text positioning from raw OCR regions.
This approach bypasses complex table structure reconstruction and renders
raw OCR text directly at detected positions with rotation correction.
Images, charts, figures, seals, and formulas are still rendered normally.
Args:
unified_doc: UnifiedDocument from OCR processing
output_path: Path to save generated PDF
source_file_path: Optional path to original source file
Returns:
True if successful, False otherwise
"""
try:
logger.info("=== Simple Text Positioning PDF Generation ===")
# Initialize text region renderer
text_renderer = TextRegionRenderer(
font_name=self.font_name,
debug=settings.simple_text_positioning_debug
)
# Get result directory from output_path
result_dir = output_path.parent
# Try to determine task_id from result directory or output filename
# Output path is typically: result_dir/task_id_edited.pdf
task_id = None
if output_path.stem.endswith('_edited'):
task_id = output_path.stem.replace('_edited', '')
elif result_dir.name:
# result_dir is typically the task_id directory
task_id = result_dir.name
if not task_id:
logger.warning("Could not determine task_id, falling back to legacy method")
ocr_data = self.convert_unified_document_to_ocr_data(unified_doc)
return self._generate_pdf_from_data(
ocr_data=ocr_data,
output_path=output_path,
source_file_path=source_file_path
)
logger.info(f"Task ID: {task_id}, Result dir: {result_dir}")
# Get total pages from UnifiedDocument
total_pages = len(unified_doc.pages) if unified_doc.pages else 1
# Get page dimensions from first page (for canvas initialization)
if not unified_doc.pages:
logger.error("No pages in document")
return False
first_page = unified_doc.pages[0]
if hasattr(first_page, 'dimensions') and first_page.dimensions:
page_width = float(first_page.dimensions.width)
page_height = float(first_page.dimensions.height)
else:
# Fallback to default size
page_width = 612.0 # Letter width
page_height = 792.0 # Letter height
logger.warning(f"No page dimensions found, using default {page_width}x{page_height}")
logger.info(f"Initial page size: {page_width:.1f} x {page_height:.1f}")
# Create PDF canvas
pdf_canvas = canvas.Canvas(str(output_path), pagesize=(page_width, page_height))
# Collect image-type elements from UnifiedDocument for rendering
# Types that should be rendered as images: figure, image, chart, seal, formula
image_element_types = {'figure', 'image', 'chart', 'seal', 'formula'}
# Process each page
for page_num in range(1, total_pages + 1):
logger.info(f">>> Processing page {page_num}/{total_pages}")
# Get page dimensions for current page
if page_num <= len(unified_doc.pages):
current_page = unified_doc.pages[page_num - 1]
if hasattr(current_page, 'dimensions') and current_page.dimensions:
current_width = float(current_page.dimensions.width)
current_height = float(current_page.dimensions.height)
else:
current_width = page_width
current_height = page_height
else:
current_width = page_width
current_height = page_height
if page_num > 1:
pdf_canvas.showPage()
# Set page size
pdf_canvas.setPageSize((current_width, current_height))
# === Layer 1: Render images, charts, figures, seals, formulas ===
# Also collect exclusion zones for text avoidance
exclusion_zones = [] # List of (x0, y0, x1, y1) tuples
if page_num <= len(unified_doc.pages):
current_page = unified_doc.pages[page_num - 1]
page_elements = current_page.elements if hasattr(current_page, 'elements') else []
image_elements_rendered = 0
for elem in page_elements:
elem_type = elem.type if hasattr(elem, 'type') else elem.get('type', '')
# Handle enum type
if hasattr(elem_type, 'value'):
elem_type = elem_type.value
if elem_type in image_element_types:
# Get image path from element content
content = elem.content if hasattr(elem, 'content') else elem.get('content', {})
if isinstance(content, dict):
saved_path = content.get('saved_path') or content.get('path')
else:
saved_path = None
# Get bbox for exclusion zone (even if image file not found)
bbox = elem.bbox if hasattr(elem, 'bbox') else elem.get('bbox', {})
if hasattr(bbox, 'x0'):
x0, y0, x1, y1 = bbox.x0, bbox.y0, bbox.x1, bbox.y1
elif isinstance(bbox, dict):
x0 = bbox.get('x0', 0)
y0 = bbox.get('y0', 0)
x1 = bbox.get('x1', x0 + bbox.get('width', 0))
y1 = bbox.get('y1', y0 + bbox.get('height', 0))
else:
continue
# Add to exclusion zones for text avoidance
# Use original image coordinates (not PDF flipped)
exclusion_zones.append((x0, y0, x1, y1))
if saved_path:
# Try to find the image file
image_path = result_dir / saved_path
if not image_path.exists():
# Try in imgs subdirectory
image_path = result_dir / 'imgs' / saved_path
if not image_path.exists():
# Try just the filename
image_path = result_dir / Path(saved_path).name
if image_path.exists():
try:
# Convert coordinates (flip Y for PDF)
pdf_x = x0
pdf_y = current_height - y1 # Bottom of image in PDF coords
img_width = x1 - x0
img_height = y1 - y0
# Draw image
pdf_canvas.drawImage(
str(image_path),
pdf_x, pdf_y,
width=img_width,
height=img_height,
preserveAspectRatio=True,
mask='auto'
)
image_elements_rendered += 1
logger.debug(f"Rendered {elem_type}: {saved_path} at ({pdf_x:.1f}, {pdf_y:.1f})")
except Exception as e:
logger.warning(f"Failed to render {elem_type} {saved_path}: {e}")
else:
logger.warning(f"Image file not found: {saved_path}")
if image_elements_rendered > 0:
logger.info(f"Rendered {image_elements_rendered} image elements (figures/charts/seals/formulas)")
if exclusion_zones:
logger.info(f"Collected {len(exclusion_zones)} exclusion zones for text avoidance")
# === Layer 2: Render text from raw OCR regions ===
raw_regions = load_raw_ocr_regions(str(result_dir), task_id, page_num)
if not raw_regions:
logger.warning(f"No raw OCR regions found for page {page_num}")
else:
logger.info(f"Loaded {len(raw_regions)} raw OCR regions for page {page_num}")
# Collect texts inside exclusion zones for position-aware deduplication
# This prevents duplicate axis labels from being rendered near charts
zone_texts = None
if exclusion_zones:
zone_texts = text_renderer.collect_zone_texts(
raw_regions, exclusion_zones, threshold=0.5, include_axis_labels=True
)
if zone_texts:
logger.info(f"Collected {len(zone_texts)} zone texts for deduplication: {list(zone_texts)[:10]}...")
# Render all text regions, avoiding exclusion zones (images/charts)
# Scale factors are 1.0 since OCR dimensions match page dimensions
rendered = text_renderer.render_all_regions(
pdf_canvas=pdf_canvas,
regions=raw_regions,
page_height=current_height,
scale_x=1.0,
scale_y=1.0,
exclusion_zones=exclusion_zones,
zone_texts=zone_texts
)
logger.info(f"Rendered {rendered} text regions")
logger.info(f"<<< Page {page_num} complete")
# Save PDF
pdf_canvas.save()
file_size = output_path.stat().st_size
logger.info(f"Generated PDF: {output_path.name} ({file_size} bytes)")
return True
except Exception as e:
logger.error(f"Failed to generate simple text PDF: {e}")
import traceback
traceback.print_exc()
return False
def _generate_pdf_from_data(
self,
ocr_data: Dict,
@@ -1093,8 +1361,15 @@ class PDFGeneratorService:
logger.info("No page_dimensions found, using first page size for all pages")
# Step 3: Get original file dimensions for all pages
# For OCR track, we use OCR coordinate system dimensions directly to avoid scaling issues
original_page_sizes = {}
if source_file_path:
use_ocr_dimensions_for_pdf = (self.current_processing_track == 'ocr')
if use_ocr_dimensions_for_pdf:
# OCR Track: Use OCR coordinate system dimensions directly
# This ensures no scaling is needed (scale = 1.0)
logger.info(f"OCR Track: 使用 OCR 座標系尺寸作為 PDF 頁面尺寸(避免縮放)")
elif source_file_path:
original_page_sizes = self.get_all_page_sizes(source_file_path)
if original_page_sizes:
logger.info(f"從原始文件獲取到 {len(original_page_sizes)} 頁尺寸")
@@ -1104,8 +1379,12 @@ class PDFGeneratorService:
logger.info(f"無原始文件,將使用 OCR/UnifiedDocument 尺寸")
# Determine initial canvas size (will be updated per page)
# Priority: original file first page > OCR/UnifiedDocument first page
if 0 in original_page_sizes:
# Priority for OCR track: OCR dimensions (no scaling)
# Priority for Direct track: original file first page > OCR/UnifiedDocument first page
if use_ocr_dimensions_for_pdf:
target_width, target_height = ocr_width, ocr_height
logger.info(f"初始 PDF 尺寸OCR Track, 使用 OCR 座標系): {target_width:.1f} x {target_height:.1f}")
elif 0 in original_page_sizes:
target_width, target_height = original_page_sizes[0]
logger.info(f"初始 PDF 尺寸(來自原始文件首頁): {target_width:.1f} x {target_height:.1f}")
else:
@@ -1159,14 +1438,49 @@ class PDFGeneratorService:
# Create PDF canvas with initial page size (will be updated per page)
pdf_canvas = canvas.Canvas(str(output_path), pagesize=(target_width, target_height))
# LAYERED RENDERING: Exclude tables from regions_to_avoid
# Text inside tables will be rendered at raw OCR positions (via GapFillingService)
# while table borders are drawn separately using cell_boxes
# Only avoid overlap with actual images/figures/charts
regions_to_avoid = [img for img in images_metadata if img.get('type') != 'table']
table_count = len([img for img in images_metadata if img.get('type') == 'table'])
# Smart filtering: only include tables with good cell_boxes quality in regions_to_avoid
# Tables with bad cell_boxes will use raw OCR text positioning instead
# Exception: Rebuilt tables always use HTML content and filter text
regions_to_avoid = []
good_quality_tables = []
bad_quality_tables = []
rebuilt_tables = []
logger.info(f"過濾文字區域: {len(regions_to_avoid)} 個區域需要避免 (不含表格), {table_count} 個表格使用分層渲染")
for img in images_metadata:
if img.get('type') == 'table':
elem_id = img.get('element_id', 'unknown')
# Check if this table was rebuilt - rebuilt tables have good content
was_rebuilt = img.get('was_rebuilt', False)
if was_rebuilt:
# Rebuilt tables have accurate content - filter text, use HTML
regions_to_avoid.append(img)
rebuilt_tables.append(elem_id)
else:
# Check cell_boxes quality for non-rebuilt tables
cell_boxes = img.get('cell_boxes', [])
quality = self._check_cell_boxes_quality(cell_boxes, elem_id)
if quality == 'good':
# Good quality: filter text, render with cell_boxes
regions_to_avoid.append(img)
good_quality_tables.append(elem_id)
else:
# Bad quality: don't filter text, just draw border
bad_quality_tables.append(elem_id)
img['_use_border_only'] = True # Mark for border-only rendering
else:
# Non-table elements (images, figures, charts) always avoid
regions_to_avoid.append(img)
logger.info(f"過濾文字區域: {len(regions_to_avoid)} 個區域需要避免")
if rebuilt_tables:
logger.info(f" 重建表格用 HTML: {rebuilt_tables}")
if good_quality_tables:
logger.info(f" 表格用 cell_boxes: {good_quality_tables}")
if bad_quality_tables:
logger.info(f" 表格用 raw OCR text (border only): {bad_quality_tables}")
filtered_text_regions = self._filter_text_in_regions(text_regions, regions_to_avoid)
@@ -1178,10 +1492,24 @@ class PDFGeneratorService:
pages_data[page_num] = []
pages_data[page_num].append(region)
# Get table elements from layout_data
# Get table elements from layout_data and copy _use_border_only flags
table_elements = []
if layout_data and layout_data.get('elements'):
table_elements = [e for e in layout_data['elements'] if e.get('type') == 'table']
# Create a lookup for _use_border_only flags from images_metadata
border_only_tables = {img.get('element_id') for img in images_metadata
if img.get('type') == 'table' and img.get('_use_border_only')}
logger.debug(f"[DEBUG] border_only_tables from images_metadata: {border_only_tables}")
for e in layout_data['elements']:
if e.get('type') == 'table':
elem_id = e.get('element_id')
logger.debug(f"[DEBUG] layout_data table element_id: {elem_id}")
# Copy the flag if this table should use border only
if elem_id in border_only_tables:
e['_use_border_only'] = True
logger.info(f"[DEBUG] Set _use_border_only=True for table {elem_id}")
table_elements.append(e)
# Process each page
total_pages = ocr_data.get('total_pages', 1)
@@ -1195,14 +1523,23 @@ class PDFGeneratorService:
logger.info(f">>> 處理第 {page_num}/{total_pages}")
# Get current page dimensions with priority order:
# 1. Original file dimensions (highest priority)
# 2. OCR/UnifiedDocument dimensions
# 3. Fallback to first page dimensions
# For OCR Track: always use OCR dimensions (scale = 1.0)
# For Direct Track:
# 1. Original file dimensions (highest priority)
# 2. OCR/UnifiedDocument dimensions
# 3. Fallback to first page dimensions
page_idx = page_num - 1
dimension_source = "unknown"
# Priority 1: Original file dimensions
if page_idx in original_page_sizes:
# For OCR Track: always use OCR dimensions
if use_ocr_dimensions_for_pdf and page_idx in page_dimensions:
current_page_dims = page_dimensions[page_idx]
current_target_w = float(current_page_dims['width'])
current_target_h = float(current_page_dims['height'])
dimension_source = "ocr_track_direct"
# Priority 1: Original file dimensions (Direct Track only)
elif page_idx in original_page_sizes:
current_target_w, current_target_h = original_page_sizes[page_idx]
dimension_source = "original_file"
@@ -1774,12 +2111,26 @@ class PDFGeneratorService:
non_empty_lines = [l for l in lines if l.strip()]
num_lines = max(len(non_empty_lines), 1)
# Font size = bbox_height / num_lines * factor
# Font size calculation with stabilization
# Use 0.8 factor to leave room for line spacing
font_size = (bbox_height / num_lines) * 0.8
font_size = max(min(font_size, 72), 4) # Clamp between 4pt and 72pt
raw_font_size = (bbox_height / num_lines) * 0.8
logger.debug(f"Text has {num_lines} non-empty lines, bbox_height={bbox_height:.1f}, calculated font_size={font_size:.1f}")
# Stabilize font size for body text (most common case)
# Normal body text should be 9-11pt, only deviate for clear outliers
element_type = region.get('element_type', 'text')
if element_type in ('text', 'paragraph'):
# For body text, bias toward 10pt baseline
if 7 <= raw_font_size <= 14:
# Near-normal range: use weighted average toward 10pt
font_size = raw_font_size * 0.7 + 10 * 0.3
else:
# Clear outlier: use raw but clamp more aggressively
font_size = max(min(raw_font_size, 14), 7)
else:
# For titles/headers/etc, use raw calculation with wider range
font_size = max(min(raw_font_size, 72), 4)
logger.debug(f"Text has {num_lines} non-empty lines, bbox_height={bbox_height:.1f}, raw={raw_font_size:.1f}, final={font_size:.1f}")
# Transform coordinates: OCR (top-left origin) → PDF (bottom-left origin)
# CRITICAL: Y-axis flip!
@@ -2008,24 +2359,45 @@ class PDFGeneratorService:
result_dir: Directory containing result files (for embedded images)
"""
try:
elem_id = table_element.get('element_id', 'unknown')
use_border_only = table_element.get('_use_border_only', False)
logger.info(f"[DEBUG] draw_table_region: elem_id={elem_id}, _use_border_only={use_border_only}")
html_content = table_element.get('content', '')
if not html_content:
# Even without HTML, draw border if requested
if use_border_only:
self._draw_table_border_only(pdf_canvas, table_element, page_height, scale_w, scale_h)
return
# Try to use cell_boxes for direct rendering first (more accurate)
# Apply column correction if enabled
cell_boxes = table_element.get('cell_boxes', [])
if cell_boxes:
logger.info(f"[TABLE] Using cell_boxes direct rendering ({len(cell_boxes)} cells)")
success = self._draw_table_with_cell_boxes(
pdf_canvas, table_element, page_height,
scale_w, scale_h, result_dir
)
if success:
return # Successfully rendered with cell_boxes
if (settings.table_column_correction_enabled and
TABLE_COLUMN_CORRECTOR_AVAILABLE and
cell_boxes):
try:
corrector = TableColumnCorrector(
correction_threshold=settings.table_column_correction_threshold,
vertical_merge_enabled=settings.vertical_fragment_merge_enabled,
vertical_aspect_ratio=settings.vertical_fragment_aspect_ratio
)
# Get table bbox for vertical fragment detection
table_bbox = table_element.get('bbox', [])
if isinstance(table_bbox, dict):
table_bbox = [table_bbox['x0'], table_bbox['y0'], table_bbox['x1'], table_bbox['y1']]
logger.info("[TABLE] Falling back to ReportLab Table")
corrected_html, stats = corrector.correct(
html=html_content,
cell_boxes=cell_boxes,
table_bbox=table_bbox if isinstance(table_bbox, list) and len(table_bbox) >= 4 else None
)
if stats.get('column_corrections', 0) > 0:
logger.info(f"[TABLE] {elem_id}: Column correction applied - {stats}")
html_content = corrected_html
except Exception as e:
logger.warning(f"[TABLE] {elem_id}: Column correction failed: {e}, using original HTML")
# Fallback: Parse HTML to extract table structure and use ReportLab Table
# Parse HTML first to get table structure for grid validation
parser = HTMLTableParser()
parser.feed(html_content)
@@ -2040,6 +2412,83 @@ class PDFGeneratorService:
if not rows:
return
# Calculate number of rows and columns from HTML for grid validation
num_rows = len(rows)
max_cols = 0
for row in rows:
row_cols = sum(cell.get('colspan', 1) for cell in row['cells'])
max_cols = max(max_cols, row_cols)
# Check if table was rebuilt - if so, use HTML content directly
was_rebuilt = table_element.get('was_rebuilt', False)
cell_boxes_rendered = False # Track if we rendered borders with cell_boxes
if was_rebuilt:
logger.info(f"[TABLE] {elem_id}: Table was rebuilt, using HTML content directly")
elif use_border_only:
# Bad quality cell_boxes: skip cell_boxes rendering, use ReportLab Table with borders
logger.info(f"[TABLE] {elem_id}: Bad cell_boxes quality, using ReportLab Table with borders")
else:
# Check if cell_boxes can produce a valid grid before rendering borders
cell_boxes = table_element.get('cell_boxes', [])
if cell_boxes:
# Get table bbox for grid calculation
temp_bbox = table_element.get('bbox', [])
if isinstance(temp_bbox, dict):
raw_bbox = [temp_bbox['x0'], temp_bbox['y0'], temp_bbox['x1'], temp_bbox['y1']]
elif isinstance(temp_bbox, list) and len(temp_bbox) >= 4:
if isinstance(temp_bbox[0], (int, float)):
raw_bbox = temp_bbox[:4]
else:
raw_bbox = [temp_bbox[0][0], temp_bbox[0][1], temp_bbox[2][0], temp_bbox[2][1]]
else:
raw_bbox = None
# Pre-check: can we compute a valid grid from cell_boxes?
if raw_bbox:
test_col_widths, test_row_heights = self._compute_table_grid_from_cell_boxes(
cell_boxes, raw_bbox, num_rows, max_cols
)
grid_valid = test_col_widths is not None and test_row_heights is not None
if grid_valid:
logger.info(f"[TABLE] Grid validation passed, rendering borders with cell_boxes")
success = self._draw_table_with_cell_boxes(
pdf_canvas, table_element, page_height,
scale_w, scale_h, result_dir
)
if success:
cell_boxes_rendered = True
logger.info("[TABLE] cell_boxes rendered borders, continuing with text-only ReportLab Table")
else:
logger.info("[TABLE] cell_boxes rendering failed, using ReportLab Table with borders")
else:
# Grid mismatch: try cellboxes-first rendering if enabled
if settings.table_rendering_prefer_cellboxes:
logger.info(f"[TABLE] Grid mismatch, trying cellboxes-first rendering")
from app.services.pdf_table_renderer import TableRenderer, TableRenderConfig
renderer = TableRenderer(TableRenderConfig())
success = renderer.render_from_cellboxes_grid(
pdf_canvas,
cell_boxes,
html_content,
tuple(raw_bbox),
page_height,
scale_w,
scale_h,
row_threshold=settings.table_cellboxes_row_threshold,
col_threshold=settings.table_cellboxes_col_threshold
)
if success:
logger.info("[TABLE] cellboxes-first rendering succeeded, skipping HTML-based rendering")
return # Table fully rendered, exit early
else:
logger.info("[TABLE] cellboxes-first rendering failed, falling back to HTML-based")
else:
logger.info(f"[TABLE] Grid validation failed (mismatch), using ReportLab Table with borders")
else:
logger.info("[TABLE] No valid bbox for grid validation, using ReportLab Table with borders")
# Get bbox directly from table element
table_bbox = table_element.get('bbox')
@@ -2106,15 +2555,7 @@ class PDFGeneratorService:
pdf_y = page_height - ocr_y_bottom
# Build table data for ReportLab with proper colspan/rowspan handling
# First pass: determine the actual grid size by accounting for spans
num_rows = len(rows)
# Calculate actual number of columns by checking first row's total span
max_cols = 0
for row in rows:
row_cols = sum(cell.get('colspan', 1) for cell in row['cells'])
max_cols = max(max_cols, row_cols)
# num_rows and max_cols already calculated above for grid validation
logger.info(f"[表格] {num_rows}行x{max_cols}列 → PDF位置: ({pdf_x:.1f}, {pdf_y:.1f}), 寬x高: {table_width:.0f}x{table_height:.0f}")
# Create a grid to track occupied cells (for rowspan handling)
@@ -2223,16 +2664,25 @@ class PDFGeneratorService:
logger.info(f"[TABLE] Created with {len(col_widths)} cols, {len(row_heights)} rows")
# Apply table style
style = TableStyle([
# If cell_boxes rendered borders, skip GRID style (text-only rendering)
style_commands = [
('FONT', (0, 0), (-1, -1), self.font_name if self.font_registered else 'Helvetica', font_size),
('GRID', (0, 0), (-1, -1), 0.5, colors.black),
('VALIGN', (0, 0), (-1, -1), 'MIDDLE'),
('ALIGN', (0, 0), (-1, -1), 'CENTER'),
('LEFTPADDING', (0, 0), (-1, -1), 2),
('RIGHTPADDING', (0, 0), (-1, -1), 2),
('TOPPADDING', (0, 0), (-1, -1), 2),
('BOTTOMPADDING', (0, 0), (-1, -1), 2),
])
]
# Only add GRID if cell_boxes didn't render borders
if not cell_boxes_rendered:
style_commands.insert(1, ('GRID', (0, 0), (-1, -1), 0.5, colors.black))
logger.info("[TABLE] Adding GRID style (cell_boxes not used)")
else:
logger.info("[TABLE] Skipping GRID style (cell_boxes rendered borders)")
style = TableStyle(style_commands)
# Add header style if first row has headers
if rows and rows[0]['cells'] and rows[0]['cells'][0].get('is_header'):
@@ -2435,6 +2885,106 @@ class PDFGeneratorService:
logger.debug(f"[TABLE] Normalized {len(cell_boxes)} cell boxes to grid")
return normalized_boxes
def _draw_table_border_only(
    self,
    pdf_canvas: canvas.Canvas,
    table_element: Dict,
    page_height: float,
    scale_w: float = 1.0,
    scale_h: float = 1.0
) -> None:
    """
    Draw only the outer border of a table (for tables with bad cell_boxes quality).

    Text inside the table will be rendered separately using raw OCR positions.

    Args:
        pdf_canvas: ReportLab canvas object
        table_element: Table element dict (reads 'bbox' and 'element_id')
        page_height: Height of page in PDF coordinates
        scale_w: Scale factor for X coordinates
        scale_h: Scale factor for Y coordinates
    """
    table_bbox = table_element.get('bbox', [])
    if not table_bbox or len(table_bbox) < 4:
        return
    element_id = table_element.get('element_id', 'unknown')
    # Handle different bbox formats: dict {'x0','y0','x1','y1'} or flat list [x0, y0, x1, y1].
    if isinstance(table_bbox, dict):
        x0, y0, x1, y1 = table_bbox['x0'], table_bbox['y0'], table_bbox['x1'], table_bbox['y1']
    elif isinstance(table_bbox[0], (int, float)):
        x0, y0, x1, y1 = table_bbox[0], table_bbox[1], table_bbox[2], table_bbox[3]
    else:
        # Point-list / polygon bbox formats are not supported here.
        return
    # Apply scaling from source image space to PDF space.
    pdf_x0 = x0 * scale_w
    pdf_y0 = y0 * scale_h
    pdf_x1 = x1 * scale_w
    pdf_y1 = y1 * scale_h
    # Convert to PDF coordinates (flip Y: PDF origin is bottom-left).
    pdf_bottom = page_height - pdf_y1
    width = pdf_x1 - pdf_x0
    height = pdf_y1 - pdf_y0
    # Draw outer border only (no cell grid, no fill).
    pdf_canvas.setStrokeColor(colors.black)
    pdf_canvas.setLineWidth(0.5)
    pdf_canvas.rect(pdf_x0, pdf_bottom, width, height, stroke=1, fill=0)
    logger.info(f"[TABLE] {element_id}: Drew border only (bad cell_boxes quality)")
def _check_cell_boxes_quality(self, cell_boxes: List, element_id: str = "") -> str:
    """
    Classify cell_boxes quality to choose a table rendering strategy.

    Args:
        cell_boxes: List of cell bounding boxes
        element_id: Optional element ID for logging

    Returns:
        'good' if cell_boxes form a proper grid, 'bad' otherwise
    """
    # Quality check disabled: trust PP-Structure output unconditionally.
    if not settings.table_quality_check_enabled:
        logger.debug(f"[TABLE QUALITY] {element_id}: good - quality check disabled (pure PP-Structure mode)")
        return 'good'
    # No cell_boxes or too few to form a grid.
    if not cell_boxes or len(cell_boxes) < 2:
        logger.debug(f"[TABLE QUALITY] {element_id}: bad - too few cells ({len(cell_boxes) if cell_boxes else 0})")
        return 'bad'
    # Count unordered pairs of cells that overlap on both axes.
    n = len(cell_boxes)
    overlap_count = 0
    for i in range(n):
        first = cell_boxes[i]
        if not isinstance(first, (list, tuple)) or len(first) < 4:
            continue
        for j in range(i + 1, n):
            second = cell_boxes[j]
            if not isinstance(second, (list, tuple)) or len(second) < 4:
                continue
            x_overlap = first[0] < second[2] and first[2] > second[0]
            y_overlap = first[1] < second[3] and first[3] > second[1]
            if x_overlap and y_overlap:
                overlap_count += 1
    total_pairs = n * (n - 1) // 2
    overlap_ratio = overlap_count / total_pairs if total_pairs > 0 else 0
    # Relaxed threshold: 20% overlap instead of 10% to allow more tables through.
    # PP-StructureV3's cell detection sometimes produces slight overlaps.
    if overlap_ratio > 0.20:
        logger.info(f"[TABLE QUALITY] {element_id}: bad - overlap ratio {overlap_ratio:.2%} > 20%")
        return 'bad'
    logger.debug(f"[TABLE QUALITY] {element_id}: good - {len(cell_boxes)} cells, overlap {overlap_ratio:.2%}")
    return 'good'
def _draw_table_with_cell_boxes(
self,
pdf_canvas: canvas.Canvas,
@@ -2465,39 +3015,64 @@ class PDFGeneratorService:
"""
try:
cell_boxes = table_element.get('cell_boxes', [])
# Always draw outer table border first (fallback for incomplete cell_boxes)
table_bbox = table_element.get('bbox', [])
if table_bbox and len(table_bbox) >= 4:
# Handle different bbox formats (list or dict)
if isinstance(table_bbox, dict):
tx1 = float(table_bbox.get('x0', 0))
ty1 = float(table_bbox.get('y0', 0))
tx2 = float(table_bbox.get('x1', 0))
ty2 = float(table_bbox.get('y1', 0))
else:
tx1, ty1, tx2, ty2 = table_bbox[:4]
# Apply scaling
tx1_scaled = tx1 * scale_w
ty1_scaled = ty1 * scale_h
tx2_scaled = tx2 * scale_w
ty2_scaled = ty2 * scale_h
# Check cell_boxes quality - skip if they don't form a proper grid
if cell_boxes and len(cell_boxes) > 2:
# Count overlapping cell pairs
overlap_count = 0
for i, box1 in enumerate(cell_boxes):
for j, box2 in enumerate(cell_boxes):
if i >= j:
continue
x_overlap = box1[0] < box2[2] and box1[2] > box2[0]
y_overlap = box1[1] < box2[3] and box1[3] > box2[1]
if x_overlap and y_overlap:
overlap_count += 1
table_width = tx2_scaled - tx1_scaled
table_height = ty2_scaled - ty1_scaled
# If more than 25% of cell pairs overlap, cell_boxes are unreliable
# Increased from 10% to 25% to allow more tables to use cell_boxes rendering
# which provides better visual fidelity than ReportLab Table fallback
total_pairs = len(cell_boxes) * (len(cell_boxes) - 1) // 2
overlap_ratio = overlap_count / total_pairs if total_pairs > 0 else 0
# Transform Y coordinate (PDF uses bottom-left origin)
pdf_x = tx1_scaled
pdf_y = page_height - ty2_scaled # Bottom of table in PDF coords
# Draw outer table border (slightly thicker for visibility)
pdf_canvas.setStrokeColor(colors.black)
pdf_canvas.setLineWidth(1.0)
pdf_canvas.rect(pdf_x, pdf_y, table_width, table_height, stroke=1, fill=0)
logger.info(f"[TABLE] Drew outer table border at [{int(tx1)},{int(ty1)},{int(tx2)},{int(ty2)}]")
if overlap_ratio > 0.25:
logger.warning(
f"[TABLE] Skipping cell_boxes rendering: {overlap_count}/{total_pairs} "
f"({overlap_ratio:.1%}) cell pairs overlap - using ReportLab Table fallback"
)
return False # Return False to trigger ReportLab Table fallback
if not cell_boxes:
# Fallback: draw outer border only when no cell_boxes
if table_bbox and len(table_bbox) >= 4:
# Handle different bbox formats (list or dict)
if isinstance(table_bbox, dict):
tx1 = float(table_bbox.get('x0', 0))
ty1 = float(table_bbox.get('y0', 0))
tx2 = float(table_bbox.get('x1', 0))
ty2 = float(table_bbox.get('y1', 0))
else:
tx1, ty1, tx2, ty2 = table_bbox[:4]
# Apply scaling
tx1_scaled = tx1 * scale_w
ty1_scaled = ty1 * scale_h
tx2_scaled = tx2 * scale_w
ty2_scaled = ty2 * scale_h
table_width = tx2_scaled - tx1_scaled
table_height = ty2_scaled - ty1_scaled
# Transform Y coordinate (PDF uses bottom-left origin)
pdf_x = tx1_scaled
pdf_y = page_height - ty2_scaled # Bottom of table in PDF coords
# Draw outer table border (slightly thicker for visibility)
pdf_canvas.setStrokeColor(colors.black)
pdf_canvas.setLineWidth(1.0)
pdf_canvas.rect(pdf_x, pdf_y, table_width, table_height, stroke=1, fill=0)
logger.info(f"[TABLE] Drew outer table border at [{int(tx1)},{int(ty1)},{int(tx2)},{int(ty2)}]")
logger.warning("[TABLE] No cell_boxes available, only outer border drawn")
# Still draw embedded images even without cell borders
embedded_images = table_element.get('embedded_images', [])
@@ -2511,31 +3086,47 @@ class PDFGeneratorService:
# Normalize cell boxes to create aligned grid
cell_boxes = self._normalize_cell_boxes_to_grid(cell_boxes)
logger.info(f"[TABLE] Drawing {len(cell_boxes)} cell borders (layered mode, grid-aligned)")
logger.info(f"[TABLE] Drawing {len(cell_boxes)} cells using grid lines (avoiding duplicates)")
# Collect unique grid lines to avoid drawing duplicate/overlapping lines
h_lines = set() # Horizontal lines: (y, x_start, x_end)
v_lines = set() # Vertical lines: (x, y_start, y_end)
# Draw each cell border
for box in cell_boxes:
x1, y1, x2, y2 = box[0], box[1], box[2], box[3]
# Apply scaling
x1_scaled = x1 * scale_w
y1_scaled = y1 * scale_h
x2_scaled = x2 * scale_w
y2_scaled = y2 * scale_h
x1_s = x1 * scale_w
y1_s = y1 * scale_h
x2_s = x2 * scale_w
y2_s = y2 * scale_h
cell_width = x2_scaled - x1_scaled
cell_height = y2_scaled - y1_scaled
# Round to 1 decimal place to help with deduplication
x1_s, y1_s, x2_s, y2_s = round(x1_s, 1), round(y1_s, 1), round(x2_s, 1), round(y2_s, 1)
# Transform Y coordinate (PDF uses bottom-left origin)
pdf_x = x1_scaled
pdf_y = page_height - y2_scaled # Bottom of cell in PDF coords
# Add horizontal lines (top and bottom of cell)
h_lines.add((y1_s, x1_s, x2_s)) # Top line
h_lines.add((y2_s, x1_s, x2_s)) # Bottom line
# Draw cell border only (no fill, no text)
pdf_canvas.setStrokeColor(colors.black)
pdf_canvas.setLineWidth(0.5)
pdf_canvas.rect(pdf_x, pdf_y, cell_width, cell_height, stroke=1, fill=0)
# Add vertical lines (left and right of cell)
v_lines.add((x1_s, y1_s, y2_s)) # Left line
v_lines.add((x2_s, y1_s, y2_s)) # Right line
logger.info(f"[TABLE] Drew {len(cell_boxes)} cell borders")
# Draw unique horizontal lines
pdf_canvas.setStrokeColor(colors.black)
pdf_canvas.setLineWidth(0.5)
for y, x_start, x_end in h_lines:
pdf_y = page_height - y # Transform Y coordinate
pdf_canvas.line(x_start, pdf_y, x_end, pdf_y)
# Draw unique vertical lines
for x, y_start, y_end in v_lines:
pdf_y_start = page_height - y_start
pdf_y_end = page_height - y_end
pdf_canvas.line(x, pdf_y_start, x, pdf_y_end)
logger.info(f"[TABLE] Drew {len(h_lines)} horizontal + {len(v_lines)} vertical grid lines")
# Draw embedded images
embedded_images = table_element.get('embedded_images', [])

View File

@@ -24,6 +24,256 @@ from reportlab.platypus import Paragraph, Table, TableStyle
logger = logging.getLogger(__name__)
# ============================================================================
# Cell Box Grid Inferrer
# ============================================================================
class CellBoxGridInferrer:
    """
    Infer table grid structure from cell_boxes coordinates.

    Rows and columns are recovered by clustering the cells' Y-coordinates
    (rows) and X-coordinates (columns), independently of any HTML
    colspan/rowspan information.
    """

    def __init__(
        self,
        row_threshold: float = 15.0,
        col_threshold: float = 15.0
    ):
        """
        Initialize grid inferrer.

        Args:
            row_threshold: Y-coordinate threshold for row clustering
            col_threshold: X-coordinate threshold for column clustering
        """
        self.row_threshold = row_threshold
        self.col_threshold = col_threshold

    def infer_grid(
        self,
        cell_boxes: List[List[float]]
    ) -> Optional[Dict]:
        """
        Infer grid structure from cell_boxes.

        Args:
            cell_boxes: List of [x0, y0, x1, y1] coordinates

        Returns:
            Dict with 'grid', 'num_rows', 'num_cols', 'row_boundaries',
            'col_boundaries', 'row_heights', 'col_widths' — or None if
            inference fails.
        """
        if not cell_boxes:
            return None
        try:
            # Keep only boxes that actually carry four coordinates.
            usable = [b for b in cell_boxes if b is not None and len(b) >= 4]
            if not usable:
                return None
            # Collect every cell edge coordinate on each axis.
            y_edges = sorted({edge for b in usable for edge in (b[1], b[3])})
            x_edges = sorted({edge for b in usable for edge in (b[0], b[2])})
            # Cluster edges into row / column boundaries.
            y_boundaries = self._cluster_to_boundaries(y_edges, self.row_threshold)
            x_boundaries = self._cluster_to_boundaries(x_edges, self.col_threshold)
            if len(y_boundaries) < 2 or len(x_boundaries) < 2:
                return None
            num_rows = len(y_boundaries) - 1
            num_cols = len(x_boundaries) - 1
            # Assign each box to the grid slot that contains its center.
            grid: Dict = {}
            for idx, box in enumerate(usable):
                x0, y0, x1, y1 = box[:4]
                row = self._find_position((y0 + y1) / 2, y_boundaries)
                col = self._find_position((x0 + x1) / 2, x_boundaries)
                if row is not None and col is not None:
                    grid[(row, col)] = {
                        'bbox': box,
                        'index': idx,
                        'content': ''
                    }
            return {
                'grid': grid,
                'num_rows': num_rows,
                'num_cols': num_cols,
                'row_boundaries': y_boundaries,
                'col_boundaries': x_boundaries,
                'row_heights': [
                    y_boundaries[r + 1] - y_boundaries[r] for r in range(num_rows)
                ],
                'col_widths': [
                    x_boundaries[c + 1] - x_boundaries[c] for c in range(num_cols)
                ],
            }
        except Exception as e:
            logger.error(f"Grid inference failed: {e}")
            return None

    def _cluster_to_boundaries(
        self,
        values: List[float],
        threshold: float
    ) -> List[float]:
        """
        Cluster nearby values; each cluster's mean becomes one boundary.

        Args:
            values: Sorted list of coordinate values
            threshold: Maximum gap between consecutive values in one cluster

        Returns:
            List of boundary values (one per cluster, cluster means)
        """
        if not values:
            return []
        clusters: List[List[float]] = []
        for v in values:
            # Chain onto the current cluster while the gap to the cluster's
            # last member stays within the threshold.
            if clusters and v - clusters[-1][-1] <= threshold:
                clusters[-1].append(v)
            else:
                clusters.append([v])
        return [sum(cluster) / len(cluster) for cluster in clusters]

    def _find_position(
        self,
        value: float,
        boundaries: List[float]
    ) -> Optional[int]:
        """
        Find which boundary interval a value falls into.

        Args:
            value: Coordinate value
            boundaries: List of boundary values

        Returns:
            Index of interval, or None if no interval matches
        """
        intervals = list(zip(boundaries, boundaries[1:]))
        # Exact containment first.
        for idx, (lo, hi) in enumerate(intervals):
            if lo <= value <= hi:
                return idx
        # Fallback: accept a value whose distance to an interval midpoint is
        # smaller than that interval's width.
        for idx, (lo, hi) in enumerate(intervals):
            if abs(value - (lo + hi) / 2) < (hi - lo):
                return idx
        return None
def extract_cell_contents_from_html(html: str) -> List[str]:
    """
    Extract cell text contents from HTML in reading order.

    Args:
        html: HTML table string

    Returns:
        List of text strings, one per cell (empty list on parse failure)
    """
    try:
        parser = HTMLTableParser()
        parser.feed(html)
        tables = parser.tables
        if not tables:
            return []
        # Walk the first table row by row, cell by cell, in reading order.
        return [
            cell.get('text', '').strip()
            for row in tables[0].get('rows', [])
            for cell in row.get('cells', [])
        ]
    except Exception as e:
        logger.error(f"HTML content extraction failed: {e}")
        return []
def map_content_to_grid(
    grid: Dict[Tuple[int, int], Dict],
    contents: List[str],
    num_rows: int,
    num_cols: int
) -> Dict[Tuple[int, int], Dict]:
    """
    Assign extracted HTML contents to grid cells row by row.

    Args:
        grid: Dict mapping (row, col) to cell info
        contents: List of text contents from HTML
        num_rows: Number of rows in grid
        num_cols: Number of columns in grid

    Returns:
        Updated grid with content assigned (mutated in place and returned)
    """
    cursor = 0
    for r in range(num_rows):
        for c in range(num_cols):
            if (r, c) not in grid:
                continue
            if cursor < len(contents):
                grid[(r, c)]['content'] = contents[cursor]
                cursor += 1
            else:
                # More grid cells than HTML cells: leave the remainder empty.
                grid[(r, c)]['content'] = ''
    # Log when HTML produced more cells than the inferred grid could hold.
    if cursor < len(contents):
        logger.debug(
            f"Content mismatch: {len(contents)} HTML cells, "
            f"only {cursor} mapped to {len(grid)} grid cells"
        )
    return grid
# ============================================================================
# Configuration
# ============================================================================
@@ -405,6 +655,147 @@ class TableRenderer:
traceback.print_exc()
return False
def render_from_cellboxes_grid(
    self,
    pdf_canvas,
    cell_boxes: List[List[float]],
    html_content: str,
    table_bbox: Tuple[float, float, float, float],
    page_height: float,
    scale_w: float = 1.0,
    scale_h: float = 1.0,
    row_threshold: float = 15.0,
    col_threshold: float = 15.0
) -> bool:
    """
    Render table using cell_boxes as the primary structure source.

    This method infers grid structure from cell_boxes coordinates and
    maps HTML content to cells, regardless of HTML colspan/rowspan.

    Args:
        pdf_canvas: ReportLab canvas
        cell_boxes: List of [x0, y0, x1, y1] for each cell
        html_content: HTML table string (for text content)
        table_bbox: Table bounding box
        page_height: PDF page height
        scale_w: Horizontal scale factor
        scale_h: Vertical scale factor
        row_threshold: Y-coordinate threshold for row clustering
        col_threshold: X-coordinate threshold for column clustering

    Returns:
        True if successful, False otherwise
    """
    try:
        if not cell_boxes:
            logger.debug("No cell_boxes provided for grid rendering")
            return False
        # Infer grid structure from cell_boxes
        inferrer = CellBoxGridInferrer(
            row_threshold=row_threshold,
            col_threshold=col_threshold
        )
        grid_info = inferrer.infer_grid(cell_boxes)
        if not grid_info:
            logger.debug("Failed to infer grid from cell_boxes")
            return False
        grid = grid_info['grid']
        num_rows = grid_info['num_rows']
        num_cols = grid_info['num_cols']
        row_boundaries = grid_info['row_boundaries']
        col_boundaries = grid_info['col_boundaries']
        logger.info(
            f"[TABLE] CellBoxes grid inferred: {num_rows} rows x {num_cols} cols "
            f"from {len(cell_boxes)} cell_boxes"
        )
        # Map HTML text content onto the inferred grid (reading order)
        if html_content:
            contents = extract_cell_contents_from_html(html_content)
            grid = map_content_to_grid(grid, contents, num_rows, num_cols)
            logger.debug(f"[TABLE] Mapped {len(contents)} HTML cells to grid")
        # Scale boundaries from source image space to PDF space
        scaled_row_boundaries = [y * scale_h for y in row_boundaries]
        scaled_col_boundaries = [x * scale_w for x in col_boundaries]
        pdf_canvas.saveState()
        pdf_canvas.setStrokeColor(self.config.border_color)
        pdf_canvas.setLineWidth(self.config.border_width)
        for row in range(num_rows):
            for col in range(num_cols):
                # Cell boundaries; the fallback sizes are defensive only —
                # a valid grid always has num_cols+1 / num_rows+1 boundaries
                x0 = scaled_col_boundaries[col]
                x1 = scaled_col_boundaries[col + 1] if col + 1 < len(scaled_col_boundaries) else x0 + 50
                y0 = scaled_row_boundaries[row]
                y1 = scaled_row_boundaries[row + 1] if row + 1 < len(scaled_row_boundaries) else y0 + 20
                # Convert to PDF coordinates (flip Y: PDF origin is bottom-left)
                pdf_x0 = x0
                pdf_y0 = page_height - y1
                cell_width = x1 - x0
                cell_height = y1 - y0
                # Draw cell border for every grid slot (even empty ones)
                pdf_canvas.rect(pdf_x0, pdf_y0, cell_width, cell_height)
                # Draw text only when the grid has content for this slot
                cell = grid.get((row, col))
                if not cell:
                    continue
                cell_content = cell.get('content', '')
                if not cell_content:
                    continue
                # Shrink the font until the text fits the available width
                available_width = cell_width - self.config.left_padding - self.config.right_padding
                font_size = self._fit_text_to_cell(
                    pdf_canvas, cell_content, available_width, cell_height
                )
                pdf_canvas.setFont(self.config.font_name, font_size)
                text_width = pdf_canvas.stringWidth(
                    cell_content, self.config.font_name, font_size
                )
                # Center the text horizontally and vertically within the cell
                text_x = pdf_x0 + (cell_width - text_width) / 2
                text_y = pdf_y0 + (cell_height - font_size) / 2
                pdf_canvas.drawString(text_x, text_y, cell_content)
        pdf_canvas.restoreState()
        logger.info(f"[TABLE] Successfully rendered {num_rows}x{num_cols} table from cell_boxes")
        return True
    except Exception as e:
        logger.error(f"CellBoxes grid rendering failed: {e}")
        import traceback
        traceback.print_exc()
        return False
# =========================================================================
# Grid and Cell Box Helpers
# =========================================================================

View File

@@ -28,9 +28,11 @@ from PIL import Image
import numpy as np
import cv2
from app.models.unified_document import ElementType
from app.services.cell_validation_engine import CellValidationEngine, CellValidationConfig
from app.core.config import settings
from app.services.memory_manager import prediction_context
from app.services.cv_table_detector import CVTableDetector
from app.services.table_content_rebuilder import TableContentRebuilder
logger = logging.getLogger(__name__)
@@ -91,7 +93,8 @@ class PPStructureEnhanced:
preprocessed_image: Optional[Image.Image] = None,
scaling_info: Optional['ScalingInfo'] = None,
save_visualization: bool = False,
use_cv_table_detection: bool = False
use_cv_table_detection: bool = False,
raw_ocr_regions: Optional[List[Dict[str, Any]]] = None
) -> Dict[str, Any]:
"""
Analyze document with full PP-StructureV3 capabilities.
@@ -110,6 +113,8 @@ class PPStructureEnhanced:
(layout_det_res, layout_order_res, overall_ocr_res, etc.)
use_cv_table_detection: If True, use CV-based line detection for wired tables
instead of ML-based cell detection (RT-DETR-L)
raw_ocr_regions: Optional list of raw OCR text regions for table content
rebuilding. Used when PP-StructureV3's table HTML is incorrect.
Returns:
Dictionary with complete structure information including:
@@ -222,6 +227,7 @@ class PPStructureEnhanced:
# Extract table_res_list which contains cell_box_list
layout_det_res = None
overall_ocr_res = None
if result_dict:
if 'table_res_list' in result_dict:
table_res_list = result_dict['table_res_list']
@@ -235,13 +241,20 @@ class PPStructureEnhanced:
layout_det_res = result_dict['layout_det_res']
logger.info(f"Found layout_det_res with {len(layout_det_res.get('boxes', []))} boxes")
# Extract overall_ocr_res for gap filling (avoid separate Raw OCR inference)
if 'overall_ocr_res' in result_dict:
overall_ocr_res = result_dict['overall_ocr_res']
ocr_count = len(overall_ocr_res.get('rec_texts', []))
logger.info(f"Found overall_ocr_res with {ocr_count} text regions")
# Process parsing_res_list if found
if parsing_res_list:
elements = self._process_parsing_res_list(
parsing_res_list, current_page, output_dir, image_path, scaling_info,
table_res_list=table_res_list, # Pass table_res_list for cell_box_list
layout_det_res=layout_det_res, # Pass layout_det_res for Image-in-Table
use_cv_table_detection=use_cv_table_detection # Use CV for wired tables
use_cv_table_detection=use_cv_table_detection, # Use CV for wired tables
raw_ocr_regions=raw_ocr_regions # Pass raw OCR for table content rebuilding
)
all_elements.extend(elements)
@@ -289,6 +302,15 @@ class PPStructureEnhanced:
if visualization_dir:
result['visualization_dir'] = str(visualization_dir)
# Add overall_ocr_res for gap filling (converted to standard format)
# This allows gap_filling_service to use PP-StructureV3's internal OCR
# instead of running a separate Raw OCR inference
if overall_ocr_res:
result['overall_ocr_res'] = self._convert_overall_ocr_to_regions(
overall_ocr_res, scaling_info
)
logger.info(f"Converted {len(result['overall_ocr_res'])} OCR regions from overall_ocr_res")
return result
except Exception as e:
@@ -327,7 +349,8 @@ class PPStructureEnhanced:
scaling_info: Optional['ScalingInfo'] = None,
table_res_list: Optional[List[Dict]] = None,
layout_det_res: Optional[Dict] = None,
use_cv_table_detection: bool = False
use_cv_table_detection: bool = False,
raw_ocr_regions: Optional[List[Dict[str, Any]]] = None
) -> List[Dict[str, Any]]:
"""
Process parsing_res_list to extract all elements.
@@ -341,6 +364,7 @@ class PPStructureEnhanced:
table_res_list: Optional list of table results containing cell_box_list
layout_det_res: Optional layout detection result for Image-in-Table processing
use_cv_table_detection: If True, use CV line detection for wired tables
raw_ocr_regions: Optional list of raw OCR text regions for table content rebuilding
Returns:
List of processed elements with normalized structure
@@ -415,6 +439,11 @@ class PPStructureEnhanced:
mapped_type = ElementType.TABLE
html_table_content = content # Store for later use
# Strip LaTeX math formatting from text content (PP-Structure formula detection)
if content and mapped_type in [ElementType.TEXT, ElementType.TITLE, ElementType.HEADER]:
if '$' in content and '\\' in content:
content = self._strip_latex_math(content)
# Create element
element = {
'element_id': f"pp3_{current_page}_{idx}",
@@ -468,18 +497,84 @@ class PPStructureEnhanced:
logger.info(f"[TABLE] Found {len(cell_boxes)} cell boxes from table_res_list (HTML match)")
break
# If no HTML match, use first available table_res with cell_box_list
# If no HTML match, find best matching table_res by bbox overlap
if not cell_boxes_extracted:
best_match = None
best_overlap = 0.0
for tbl_res in table_res_list:
if 'cell_box_list' in tbl_res and tbl_res['cell_box_list']:
cell_boxes = tbl_res['cell_box_list']
element['cell_boxes'] = [[float(c) for c in box] for box in cell_boxes]
element['cell_boxes_source'] = 'table_res_list'
cell_boxes_extracted = True
logger.info(f"[TABLE] Found {len(cell_boxes)} cell boxes from table_res_list (first available)")
# Remove used table_res to avoid reuse
table_res_list.remove(tbl_res)
break
if 'cell_box_list' not in tbl_res or not tbl_res['cell_box_list']:
continue
# Get table_res bbox from its cell_box_list
cell_boxes_temp = tbl_res['cell_box_list']
if not cell_boxes_temp:
continue
# Calculate bounding box of all cells
tbl_x1 = min(cb[0] for cb in cell_boxes_temp)
tbl_y1 = min(cb[1] for cb in cell_boxes_temp)
tbl_x2 = max(cb[2] for cb in cell_boxes_temp)
tbl_y2 = max(cb[3] for cb in cell_boxes_temp)
# Calculate IoU (Intersection over Union) with element bbox
# bbox is [x1, y1, x2, y2]
elem_x1, elem_y1, elem_x2, elem_y2 = bbox[0], bbox[1], bbox[2], bbox[3]
# Intersection
inter_x1 = max(tbl_x1, elem_x1)
inter_y1 = max(tbl_y1, elem_y1)
inter_x2 = min(tbl_x2, elem_x2)
inter_y2 = min(tbl_y2, elem_y2)
if inter_x1 < inter_x2 and inter_y1 < inter_y2:
inter_area = (inter_x2 - inter_x1) * (inter_y2 - inter_y1)
elem_area = (elem_x2 - elem_x1) * (elem_y2 - elem_y1)
tbl_area = (tbl_x2 - tbl_x1) * (tbl_y2 - tbl_y1)
# Use overlap ratio with element bbox (how much of element is covered)
overlap_ratio = inter_area / elem_area if elem_area > 0 else 0
if overlap_ratio > best_overlap:
best_overlap = overlap_ratio
best_match = tbl_res
# Use best match if overlap is significant (>10%)
if best_match and best_overlap > 0.1:
cell_boxes = best_match['cell_box_list']
element['cell_boxes'] = [[float(c) for c in box] for box in cell_boxes]
element['cell_boxes_source'] = 'table_res_list'
cell_boxes_extracted = True
logger.info(f"[TABLE] Found {len(cell_boxes)} cell boxes from table_res_list (bbox match, overlap={best_overlap:.2f})")
# Extract pred_html if not already set
if not html_content and 'pred_html' in best_match:
html_content = best_match['pred_html']
element['html'] = html_content
element['extracted_text'] = self._extract_text_from_html(html_content)
logger.info(f"[TABLE] Extracted HTML from table_res_list (bbox match, {len(html_content)} chars)")
# Remove used table_res to avoid reuse
table_res_list.remove(best_match)
elif table_res_list:
# Fallback to first available if no bbox match found
for tbl_res in table_res_list:
if 'cell_box_list' in tbl_res and tbl_res['cell_box_list']:
cell_boxes = tbl_res['cell_box_list']
element['cell_boxes'] = [[float(c) for c in box] for box in cell_boxes]
element['cell_boxes_source'] = 'table_res_list'
cell_boxes_extracted = True
logger.warning(f"[TABLE] Using first available table_res (no bbox match, {len(cell_boxes)} cells)")
# Extract pred_html if not already set
if not html_content and 'pred_html' in tbl_res:
html_content = tbl_res['pred_html']
element['html'] = html_content
element['extracted_text'] = self._extract_text_from_html(html_content)
logger.info(f"[TABLE] Extracted HTML from table_res_list (fallback, {len(html_content)} chars)")
table_res_list.remove(tbl_res)
break
if not cell_boxes_extracted and 'boxes' in res_data:
# PPStructureV3 returned cell boxes in res (unlikely in PaddleX 3.x)
@@ -558,6 +653,42 @@ class PPStructureEnhanced:
element['embedded_images'] = embedded_images
logger.info(f"[TABLE] Embedded {len(embedded_images)} images into table")
# 4. Table content rebuilding from raw OCR regions
# When cell_boxes have boundary issues, rebuild table content from raw OCR
# Only if table_content_rebuilder is enabled (disabled by default as it's a patch behavior)
logger.info(f"[TABLE] raw_ocr_regions available: {raw_ocr_regions is not None and len(raw_ocr_regions) if raw_ocr_regions else 0}")
logger.info(f"[TABLE] cell_boxes available: {len(element.get('cell_boxes', []))}")
if settings.table_content_rebuilder_enabled and raw_ocr_regions and element.get('cell_boxes'):
rebuilder = TableContentRebuilder()
should_rebuild, rebuild_reason = rebuilder.should_rebuild(
element['cell_boxes'],
bbox,
element.get('html', '')
)
if should_rebuild:
logger.info(f"[TABLE] Triggering table rebuild: {rebuild_reason}")
rebuilt_table, rebuild_stats = rebuilder.rebuild_table(
cell_boxes=element['cell_boxes'],
table_bbox=bbox,
raw_ocr_regions=raw_ocr_regions,
original_html=element.get('html', '')
)
if rebuilt_table:
# Update element with rebuilt content
element['html'] = rebuilt_table['html']
element['rebuilt_table'] = rebuilt_table
element['rebuild_stats'] = rebuild_stats
element['extracted_text'] = self._extract_text_from_html(rebuilt_table['html'])
logger.info(
f"[TABLE] Rebuilt table: {rebuilt_table['rows']}x{rebuilt_table['cols']} "
f"with {len(rebuilt_table['cells'])} cells"
)
else:
logger.warning(f"[TABLE] Rebuild failed: {rebuild_stats.get('reason', 'unknown')}")
element['rebuild_stats'] = rebuild_stats
# Special handling for images/figures/charts/stamps (visual elements that need cropping)
elif mapped_type in [ElementType.IMAGE, ElementType.FIGURE, ElementType.CHART, ElementType.DIAGRAM, ElementType.STAMP, ElementType.LOGO]:
# Save image if path provided
@@ -587,6 +718,21 @@ class PPStructureEnhanced:
elements.append(element)
logger.debug(f"Processed element {idx}: type={mapped_type}, bbox={bbox}")
# Apply cell validation to filter over-detected tables
if settings.cell_validation_enabled:
cell_validator = CellValidationEngine(CellValidationConfig(
max_cell_density=settings.cell_validation_max_density,
min_avg_cell_area=settings.cell_validation_min_cell_area,
min_cell_height=settings.cell_validation_min_cell_height,
enabled=True
))
elements, validation_stats = cell_validator.validate_and_filter_elements(elements)
if validation_stats['reclassified_tables'] > 0:
logger.info(
f"Cell validation: {validation_stats['reclassified_tables']}/{validation_stats['total_tables']} "
f"tables reclassified as TEXT due to over-detection"
)
return elements
def _embed_images_in_table(
@@ -911,18 +1057,145 @@ class PPStructureEnhanced:
type_counts[elem_type] = type_counts.get(elem_type, 0) + 1
return type_counts
def _convert_overall_ocr_to_regions(
self,
overall_ocr_res: Dict[str, Any],
scaling_info: Optional['ScalingInfo'] = None
) -> List[Dict[str, Any]]:
"""
Convert PP-StructureV3's overall_ocr_res to standard OCR region format.
This allows gap_filling_service to use PP-StructureV3's internal OCR results
instead of running a separate Raw OCR inference, saving approximately 50%
of total inference time.
The overall_ocr_res structure:
- dt_polys: List of polygon coordinates [[x1,y1], [x2,y2], [x3,y3], [x4,y4]]
- rec_texts: List of recognized text strings
- rec_scores: List of confidence scores
Args:
overall_ocr_res: Dictionary containing OCR results from PP-StructureV3
scaling_info: Optional scaling info for coordinate restoration
Returns:
List of OCR region dictionaries in standard format:
[{'text': str, 'bbox': [[x1,y1],...], 'confidence': float}, ...]
"""
regions = []
dt_polys = overall_ocr_res.get('dt_polys', [])
rec_texts = overall_ocr_res.get('rec_texts', [])
rec_scores = overall_ocr_res.get('rec_scores', [])
# Ensure all lists have the same length
num_regions = min(len(dt_polys), len(rec_texts))
if len(rec_scores) < num_regions:
# Pad with default confidence if scores are missing
rec_scores = list(rec_scores) + [0.9] * (num_regions - len(rec_scores))
for i in range(num_regions):
text = rec_texts[i]
if not text or not text.strip():
continue
poly = dt_polys[i]
confidence = rec_scores[i] if i < len(rec_scores) else 0.9
# Apply scaling restoration if needed
if scaling_info and hasattr(scaling_info, 'scale_factor') and scaling_info.scale_factor != 1.0:
scale = scaling_info.scale_factor
poly = [[pt[0] / scale, pt[1] / scale] for pt in poly]
regions.append({
'text': text,
'bbox': poly, # Keep polygon format for compatibility
'confidence': confidence
})
return regions
def _extract_text_from_html(self, html: str) -> str:
    """Extract plain text from HTML content.

    Uses BeautifulSoup when available; otherwise (ImportError or parse
    failure) falls back to a regex-based tag strip.  Either way the result
    is passed through ``_strip_latex_math`` so LaTeX fragments emitted by
    PP-StructureV3 become readable plain text.

    Note: the previous version returned early from both branches, which
    made the LaTeX-stripping step unreachable; the early returns are gone.
    """
    try:
        from bs4 import BeautifulSoup
        soup = BeautifulSoup(html, 'html.parser')
        text = soup.get_text(separator=' ', strip=True)
    except Exception:
        # Fallback: just remove HTML tags (also taken when bs4 is absent)
        import re
        text = re.sub(r'<[^>]+>', ' ', html)
        text = re.sub(r'\s+', ' ', text)
    text = text.strip()
    # Strip LaTeX math formatting if present
    return self._strip_latex_math(text)
def _strip_latex_math(self, text: str) -> str:
"""
Convert LaTeX math notation to plain text.
PP-StructureV3 outputs formulas in LaTeX format like:
$N\\cdot m\\times8.851=|b\\cdot|$
This converts them to readable plain text.
"""
import re
if not text or '$' not in text:
return text
# Remove $...$ delimiters but keep content
text = re.sub(r'\$([^$]+)\$', r'\1', text)
# Convert common LaTeX math commands to plain text
replacements = [
(r'\\cdot', '·'), # Multiplication dot
(r'\\times', '×'), # Multiplication sign
(r'\\div', '÷'), # Division sign
(r'\\pm', '±'), # Plus-minus
(r'\\leq', ''), # Less than or equal
(r'\\geq', ''), # Greater than or equal
(r'\\neq', ''), # Not equal
(r'\\approx', ''), # Approximately equal
(r'\\circ', '°'), # Degree symbol
(r'\\degree', '°'), # Degree symbol
(r'\\alpha', 'α'),
(r'\\beta', 'β'),
(r'\\gamma', 'γ'),
(r'\\delta', 'δ'),
(r'\\mu', 'μ'),
(r'\\Omega', 'Ω'),
(r'\\infty', ''),
(r'\^\\{2\\}', '²'), # Superscript 2
(r'\^\\{3\\}', '³'), # Superscript 3
(r'\^2', '²'),
(r'\^3', '³'),
(r'_\\{([^}]+)\\}', r'_\1'), # Subscript
(r'\\mathrm\{([^}]+)\}', r'\1'), # Roman text
(r'\\mathsf\{([^}]+)\}', r'\1'), # Sans-serif text
(r'\\mathbf\{([^}]+)\}', r'\1'), # Bold text
(r'\\text\{([^}]+)\}', r'\1'), # Text mode
(r'\\left', ''),
(r'\\right', ''),
(r'\\[|]', '|'), # Pipe symbols
(r'\\ ', ' '), # Escaped space
(r'\\,', ' '), # Thin space
(r'\\;', ' '), # Medium space
(r'\\quad', ' '), # Quad space
(r'\\qquad', ' '), # Double quad space
]
for pattern, replacement in replacements:
text = re.sub(pattern, replacement, text)
# Clean up any remaining backslashes followed by letters (unknown commands)
text = re.sub(r'\\[a-zA-Z]+', '', text)
# Clean up multiple spaces
text = re.sub(r'\s+', ' ', text)
return text.strip()
def _extract_bbox_from_filename(self, filename: str) -> List[int]:
"""Extract bbox from filename if it contains coordinate information."""

View File

@@ -335,6 +335,14 @@ class OCRPipeline(ProcessingPipeline):
processing_time = time.time() - start_time
# Debug: Check if ocr_result has rebuild_stats
if 'enhanced_results' in ocr_result:
for page_result in ocr_result['enhanced_results']:
for elem in page_result.get('elements', []):
if elem.get('type') == 'table' or (hasattr(elem.get('type'), 'value') and elem.get('type').value == 'table'):
has_rebuild = 'rebuild_stats' in elem
logger.info(f"[ORCHESTRATOR] Before converter - Table {elem.get('element_id')}: has rebuild_stats={has_rebuild}")
# Convert to UnifiedDocument
unified_doc = self.converter.convert(
ocr_result,

View File

@@ -0,0 +1,790 @@
"""
Table Column Alignment Corrector
This module provides post-processing correction for PP-Structure's table
structure recognition, which frequently outputs cells with incorrect column
indices (column shift).
The correction uses a "Header-Anchor Alignment" strategy:
1. Extract header row (row_idx=0) column X-coordinate ranges as anchors
2. Validate each cell's column assignment against header X-ranges
3. Correct column index if cell X-overlap with assigned column is insufficient
Additionally supports "Vertical Fragment Merging" for Chinese vertical text
that gets split into multiple narrow text blocks.
"""
import logging
import re
from dataclasses import dataclass, field
from typing import Dict, List, Optional, Tuple
from html.parser import HTMLParser
logger = logging.getLogger(__name__)
@dataclass
class BBox:
    """Axis-aligned bounding box defined by its two corners (x0, y0, x1, y1)."""
    x0: float
    y0: float
    x1: float
    y1: float

    @classmethod
    def from_list(cls, coords: List[float]) -> 'BBox':
        """Create BBox from [x0, y0, x1, y1] list."""
        if len(coords) < 4:
            raise ValueError(f"Invalid bbox coords: {coords}")
        left, top, right, bottom = coords[:4]
        return cls(left, top, right, bottom)

    @property
    def width(self) -> float:
        return self.x1 - self.x0

    @property
    def height(self) -> float:
        return self.y1 - self.y0

    @property
    def center_x(self) -> float:
        return (self.x0 + self.x1) / 2

    @property
    def center_y(self) -> float:
        return (self.y0 + self.y1) / 2
@dataclass
class ColumnAnchor:
    """X-coordinate range of one table column, derived from the header row."""
    col_idx: int
    x_min: float
    x_max: float
    colspan: int = 1

    @property
    def center_x(self) -> float:
        """Midpoint of the column's X range."""
        return (self.x_min + self.x_max) / 2
@dataclass
class TableCell:
    """Represents a cell extracted from HTML with position info."""
    row_idx: int  # 0-based row index within the parsed table
    col_idx: int  # 0-based starting column index (left edge for spanned cells)
    content: str  # text accumulated between the <td>/<th> tags
    colspan: int = 1  # number of grid columns this cell covers
    rowspan: int = 1  # number of grid rows this cell covers
    bbox: Optional[BBox] = None  # matched cell box; assigned later during correction
    is_header: bool = False  # True when the cell came from a <th> tag
@dataclass
class TextBlock:
    """A piece of OCR text with its bounding box; candidate for vertical merging."""
    text: str
    bbox: BBox

    @property
    def aspect_ratio(self) -> float:
        """Width / Height ratio. Vertical text has low aspect ratio."""
        h = self.bbox.height
        return self.bbox.width / h if h else float('inf')
class TableHTMLParser(HTMLParser):
    """
    Parse table HTML to extract cells with row/col indices and spans.

    PP-Structure outputs HTML like:
        <table><tr><td>content</td><td colspan="2">merged</td></tr></table>

    Grid positions consumed by colspan/rowspan are recorded in ``occupied``
    so later cells in the same or following rows are shifted to the next
    free column, mirroring browser table layout.
    """

    def __init__(self):
        super().__init__()
        self.cells: List[TableCell] = []
        self.current_row_idx = -1
        self.current_col_idx = 0
        self.current_cell: Optional[TableCell] = None
        self.in_table = False
        # Track occupied cells for colspan/rowspan handling
        self.occupied: Dict[Tuple[int, int], bool] = {}

    @staticmethod
    def _parse_span(value: Optional[str]) -> int:
        """Parse a colspan/rowspan attribute value.

        Returns 1 for missing, valueless (None) or non-numeric attributes —
        the previous ``int(attrs_dict.get(..., 1))`` raised on those — and
        clamps to a minimum of 1 so the grid always advances.
        """
        try:
            return max(1, int(value))
        except (TypeError, ValueError):
            return 1

    def handle_starttag(self, tag: str, attrs: List[Tuple[str, Optional[str]]]):
        attrs_dict = dict(attrs)
        if tag == 'table':
            self.in_table = True
            self.current_row_idx = -1
            self.occupied = {}
        elif tag == 'tr' and self.in_table:
            self.current_row_idx += 1
            self.current_col_idx = 0
            # Skip occupied columns from previous rowspans
            while (self.current_row_idx, self.current_col_idx) in self.occupied:
                self.current_col_idx += 1
        elif tag in ('td', 'th') and self.in_table:
            # Skip occupied columns
            while (self.current_row_idx, self.current_col_idx) in self.occupied:
                self.current_col_idx += 1
            colspan = self._parse_span(attrs_dict.get('colspan'))
            rowspan = self._parse_span(attrs_dict.get('rowspan'))
            self.current_cell = TableCell(
                row_idx=self.current_row_idx,
                col_idx=self.current_col_idx,
                content='',
                colspan=colspan,
                rowspan=rowspan,
                is_header=(tag == 'th')
            )
            # Mark occupied cells for spans
            for r in range(rowspan):
                for c in range(colspan):
                    self.occupied[(self.current_row_idx + r, self.current_col_idx + c)] = True

    def handle_endtag(self, tag: str):
        if tag == 'table':
            self.in_table = False
        elif tag in ('td', 'th') and self.current_cell is not None:
            self.current_cell.content = self.current_cell.content.strip()
            self.cells.append(self.current_cell)
            self.current_col_idx += self.current_cell.colspan
            self.current_cell = None

    def handle_data(self, data: str):
        # Accumulate text chunks; a cell's data may arrive in pieces.
        if self.current_cell is not None:
            self.current_cell.content += data
def calculate_x_overlap(cell_bbox: BBox, anchor: ColumnAnchor) -> float:
    """
    Fraction of the cell's width that lies inside the anchor's X range.

    Returns:
        0.0 (no horizontal overlap, or zero-width cell) up to 1.0
        (cell fully contained in the anchor's X range).
    """
    cell_width = cell_bbox.width
    if cell_width == 0:
        return 0.0
    lo = max(cell_bbox.x0, anchor.x_min)
    hi = min(cell_bbox.x1, anchor.x_max)
    if hi <= lo:
        return 0.0
    return (hi - lo) / cell_width
def calculate_iou(bbox1: BBox, bbox2: BBox) -> float:
    """Intersection over Union of two axis-aligned boxes (0.0 when disjoint)."""
    ix0 = max(bbox1.x0, bbox2.x0)
    iy0 = max(bbox1.y0, bbox2.y0)
    ix1 = min(bbox1.x1, bbox2.x1)
    iy1 = min(bbox1.y1, bbox2.y1)
    if ix1 <= ix0 or iy1 <= iy0:
        return 0.0
    intersection = (ix1 - ix0) * (iy1 - iy0)
    union = bbox1.width * bbox1.height + bbox2.width * bbox2.height - intersection
    # Guard the degenerate case of two zero-area boxes.
    return intersection / union if union else 0.0
def parse_table_html(html: str) -> List[TableCell]:
    """
    Extract TableCell records (with row/col indices) from PP-Structure HTML.

    Args:
        html: Table HTML string from PP-Structure

    Returns:
        Parsed cells, or an empty list when the HTML cannot be parsed.
    """
    parser = TableHTMLParser()
    try:
        parser.feed(html)
    except Exception as exc:
        logger.warning(f"Failed to parse table HTML: {exc}")
        return []
    return parser.cells
def find_header_row(cells: List[TableCell], min_columns: int = 3) -> Optional[int]:
    """
    Pick the row best suited to serve as the header anchor.

    Prefers the first row containing at least ``min_columns`` un-merged
    (colspan == 1) cells so merged title rows are not mistaken for headers;
    otherwise falls back to the row with the most un-merged cells, requiring
    at least two of them.

    Args:
        cells: All parsed cells
        min_columns: Minimum number of individual columns required

    Returns:
        Row index of the chosen header row, or None when no row qualifies.
    """
    rows: Dict[int, List[TableCell]] = {}
    for cell in cells:
        rows.setdefault(cell.row_idx, []).append(cell)

    def unmerged_count(row_cells: List[TableCell]) -> int:
        """Number of cells in the row that do not span multiple columns."""
        return sum(1 for c in row_cells if c.colspan == 1)

    # First pass: earliest row with enough individual columns
    for row_idx in sorted(rows):
        count = unmerged_count(rows[row_idx])
        if count >= min_columns:
            logger.debug(f"[COLUMN CORRECTION] Found header row {row_idx} with {count} individual columns")
            return row_idx

    # Fallback: row with the most individual cells (need at least 2)
    best_row: Optional[int] = None
    best_count = 0
    for row_idx, row_cells in rows.items():
        count = unmerged_count(row_cells)
        if count > best_count:
            best_row, best_count = row_idx, count
    if best_row is not None and best_count >= 2:
        logger.debug(f"[COLUMN CORRECTION] Using fallback header row {best_row} with {best_count} columns")
        return best_row
    return None
def build_column_anchors(
    header_cells: List[TableCell],
    cell_boxes: List[List[float]],
    all_cells: Optional[List[TableCell]] = None
) -> List[ColumnAnchor]:
    """
    Build column anchors from header row cells matched with cell_boxes.

    The header row is the authoritative reference for column X-coordinate
    ranges.  When the supplied header row is dominated by merged cells and
    ``all_cells`` is given, a better row (first with >= 3 un-merged cells)
    is looked up via find_header_row().

    Changes vs. the previous version: removed an unused local
    (``header_row_idx`` computed from header_cells) and fixed the running
    Y-mean of a cluster, which previously divided by ``len + 1`` after the
    new box had already been appended, biasing the centroid.

    Args:
        header_cells: Cells from the identified header row
        cell_boxes: List of [x0, y0, x1, y1] coordinates from PP-Structure
        all_cells: All cells for finding actual header row (optional)

    Returns:
        List of ColumnAnchor sorted by x_min (empty when no match found)
    """
    if not header_cells or not cell_boxes:
        return []

    # If header row has too many merged cells, try to find a better header row
    individual_cells = [c for c in header_cells if c.colspan == 1]
    if len(individual_cells) < 3 and all_cells:
        header_row_idx = find_header_row(all_cells, min_columns=3)
        if header_row_idx is not None:
            header_cells = [c for c in all_cells if c.row_idx == header_row_idx]
            individual_cells = [c for c in header_cells if c.colspan == 1]
            logger.info(f"[COLUMN CORRECTION] Switched to row {header_row_idx} as header ({len(individual_cells)} columns)")

    # Only use individual cells (no colspan) for accurate column boundaries
    if individual_cells:
        header_cells = individual_cells

    # Convert cell_boxes to BBox objects, skipping malformed entries
    boxes = []
    for coords in cell_boxes:
        try:
            boxes.append(BBox.from_list(coords))
        except (ValueError, IndexError):
            continue
    if not boxes:
        return []

    # Cluster boxes into rows by Y proximity, tracking the running mean Y
    # of the current cluster.
    boxes_by_y = sorted(boxes, key=lambda b: b.y0)
    row_groups: List[List[BBox]] = []
    current_group: List[BBox] = []
    current_y = None
    y_threshold = 40  # pixels tolerance for same row
    for box in boxes_by_y:
        if current_y is None or abs(box.center_y - current_y) < y_threshold:
            current_group.append(box)
            if current_y is None:
                current_y = box.center_y
            else:
                # Correct incremental mean: the previous mean covered
                # len(current_group) - 1 boxes before this append.
                n = len(current_group)
                current_y = (current_y * (n - 1) + box.center_y) / n
        else:
            row_groups.append(sorted(current_group, key=lambda b: b.x0))
            current_group = [box]
            current_y = box.center_y
    if current_group:
        row_groups.append(sorted(current_group, key=lambda b: b.x0))

    # Pick the row group whose box count is closest to the header cell count
    target_count = len(header_cells)
    best_row_group = None
    best_diff = float('inf')
    for group in row_groups:
        diff = abs(len(group) - target_count)
        if diff < best_diff:
            best_diff = diff
            best_row_group = group
    if not best_row_group:
        logger.warning("[COLUMN CORRECTION] Could not find matching cell_boxes row for header")
        return []
    logger.debug(f"[COLUMN CORRECTION] Matched header row with {len(best_row_group)} cell_boxes")

    # Pair header cells (by column order) with matched boxes (by X order)
    header_sorted = sorted(header_cells, key=lambda c: c.col_idx)
    anchors = [
        ColumnAnchor(
            col_idx=cell.col_idx,
            x_min=box.x0,
            x_max=box.x1,
            colspan=cell.colspan
        )
        for cell, box in zip(header_sorted, best_row_group)
    ]
    return sorted(anchors, key=lambda a: a.x_min)
def match_cell_to_cellbox(
    cell: TableCell,
    cell_boxes: List[BBox],
    row_cells: List[TableCell]
) -> Optional[BBox]:
    """
    Match a table cell to its corresponding cell_box using position heuristics.

    Implementation: cluster all cell_boxes into rows by Y proximity (30px
    tolerance), take the row group at index ``cell.row_idx``, and return the
    box at the cell's ordinal position within its HTML row.

    NOTE(review): despite earlier design notes mentioning IoU, this function
    performs purely position-based matching; calculate_iou is not used here.
    It also assumes HTML rows correspond 1:1 (in order) to the clustered
    box rows — TODO confirm this holds for tables with missing boxes.

    Args:
        cell: The cell to match
        cell_boxes: All cell_boxes for this table
        row_cells: All cells in the same row (for position context)

    Returns:
        Matched BBox or None if no match found
    """
    if not cell_boxes:
        return None
    # Sort cell_boxes by Y first, then X
    sorted_boxes = sorted(cell_boxes, key=lambda b: (b.y0, b.x0))
    # Group boxes by approximate Y position (same row)
    row_groups: List[List[BBox]] = []
    current_group: List[BBox] = []
    current_y = None
    for box in sorted_boxes:
        if current_y is None or abs(box.center_y - current_y) < 30:  # 30px tolerance
            current_group.append(box)
            if current_y is None:
                current_y = box.center_y
            else:
                # NOTE(review): pairwise averaging weights recent boxes more
                # heavily than a true running mean — presumably acceptable
                # for the 30px tolerance; confirm on tall tables.
                current_y = (current_y + box.center_y) / 2
        else:
            if current_group:
                row_groups.append(sorted(current_group, key=lambda b: b.x0))
            current_group = [box]
            current_y = box.center_y
    if current_group:
        row_groups.append(sorted(current_group, key=lambda b: b.x0))
    # Find the row that best matches cell.row_idx
    if cell.row_idx < len(row_groups):
        row_boxes = row_groups[cell.row_idx]
        # Sort cells in this row by col_idx
        row_cells_sorted = sorted(row_cells, key=lambda c: c.col_idx)
        cell_position = row_cells_sorted.index(cell) if cell in row_cells_sorted else -1
        if 0 <= cell_position < len(row_boxes):
            return row_boxes[cell_position]
    return None
def correct_cell_column(
    cell: TableCell,
    anchors: List[ColumnAnchor],
    threshold: float = 0.5
) -> int:
    """
    Resolve the column index a cell should occupy, using header anchors.

    The anchor with the highest X-overlap wins when that overlap reaches
    ``threshold``.  With essentially no overlap (< 10%) the anchor nearest
    by center distance is used instead.  Otherwise the cell keeps its
    original column index.

    Args:
        cell: The cell to check
        anchors: Column anchors from header row
        threshold: Minimum overlap ratio to trigger correction

    Returns:
        Corrected column index (may be same as original)
    """
    if not cell.bbox or not anchors:
        return cell.col_idx

    # Find the anchor with the strongest X-overlap (strictly positive)
    best_anchor: Optional[ColumnAnchor] = None
    best_overlap = 0.0
    for anchor in anchors:
        overlap = calculate_x_overlap(cell.bbox, anchor)
        if overlap > best_overlap:
            best_overlap, best_anchor = overlap, anchor

    if best_anchor is not None and best_overlap >= threshold:
        if best_anchor.col_idx != cell.col_idx:
            logger.info(
                f"[COLUMN CORRECTION] Row {cell.row_idx}: "
                f"'{cell.content[:20]}...' col {cell.col_idx} -> {best_anchor.col_idx} "
                f"(overlap: {best_overlap:.1%})"
            )
        return best_anchor.col_idx

    if best_overlap < 0.1:
        # Nearly no overlap with any anchor: fall back to nearest center
        cell_center = cell.bbox.center_x
        nearest_anchor = min(anchors, key=lambda a: abs(a.center_x - cell_center))
        if nearest_anchor.col_idx != cell.col_idx:
            logger.info(
                f"[COLUMN CORRECTION] Row {cell.row_idx}: "
                f"'{cell.content[:20]}...' col {cell.col_idx} -> {nearest_anchor.col_idx} "
                f"(nearest by center)"
            )
        return nearest_anchor.col_idx

    return cell.col_idx
def detect_vertical_fragments(
    text_blocks: List[TextBlock],
    table_bbox: BBox,
    aspect_ratio_threshold: float = 0.3,
    left_margin_ratio: float = 0.15
) -> List[TextBlock]:
    """
    Pick out text blocks that look like fragments of vertical text.

    A fragment must be much taller than wide (aspect ratio below the
    threshold) and sit within the leftmost ``left_margin_ratio`` of the
    table's width.

    Args:
        text_blocks: All text blocks in/around the table
        table_bbox: Table bounding box
        aspect_ratio_threshold: Max width/height to be considered vertical
        left_margin_ratio: Fraction of table width treated as left margin

    Returns:
        Blocks that are likely vertical text fragments
    """
    left_boundary = table_bbox.x0 + table_bbox.width * left_margin_ratio
    return [
        block for block in text_blocks
        if block.aspect_ratio < aspect_ratio_threshold
        and block.bbox.center_x < left_boundary
    ]
def should_merge_blocks(block1: TextBlock, block2: TextBlock, x_tolerance: float = 10.0, y_gap_max: float = 20.0) -> bool:
    """
    Decide whether two vertically stacked blocks belong to one text run.

    ``block1`` is assumed to sit above ``block2``; they merge when their
    X-centers are nearly aligned and the vertical gap between them is small
    and non-negative.

    Args:
        block1: First block (should be above block2)
        block2: Second block
        x_tolerance: Max X-center deviation in pixels
        y_gap_max: Max vertical gap between blocks

    Returns:
        True if blocks should be merged
    """
    centers_aligned = abs(block1.bbox.center_x - block2.bbox.center_x) < x_tolerance
    gap = block2.bbox.y0 - block1.bbox.y1
    return centers_aligned and 0 <= gap < y_gap_max
def merge_vertical_fragments(
    fragments: List[TextBlock],
    x_tolerance: float = 10.0,
    y_gap_max: float = 20.0
) -> List[TextBlock]:
    """
    Collapse runs of vertically adjacent fragments into single blocks.

    Fragments are scanned top-to-bottom; consecutive blocks satisfying
    should_merge_blocks() are accumulated and fused via _merge_group().

    Args:
        fragments: List of vertical text fragments
        x_tolerance: Max X-center deviation for merging
        y_gap_max: Max Y-gap between mergeable blocks

    Returns:
        List of merged text blocks
    """
    if not fragments:
        return []

    merged: List[TextBlock] = []
    group: List[TextBlock] = []
    for block in sorted(fragments, key=lambda b: b.bbox.y0):
        if group and not should_merge_blocks(group[-1], block, x_tolerance, y_gap_max):
            # Current run ends here; flush it and start a new one
            merged.append(_merge_group(group))
            group = []
        group.append(block)
    if group:
        merged.append(_merge_group(group))
    return merged
def _merge_group(blocks: List[TextBlock]) -> TextBlock:
    """Fuse a run of blocks into one: concatenated text, union bounding box."""
    if len(blocks) == 1:
        return blocks[0]
    union_bbox = BBox(
        min(b.bbox.x0 for b in blocks),
        min(b.bbox.y0 for b in blocks),
        max(b.bbox.x1 for b in blocks),
        max(b.bbox.y1 for b in blocks),
    )
    # Text is joined top-to-bottom (blocks arrive sorted by Y)
    return TextBlock(text=''.join(b.text for b in blocks), bbox=union_bbox)
def correct_table_columns(
    html: str,
    cell_boxes: List[List[float]],
    threshold: float = 0.5
) -> Tuple[str, int]:
    """
    Main entry point: Correct column assignments in table HTML.

    This function:
    1. Parses the HTML to extract cells with row/col
    2. Builds column anchors from header row
    3. Matches cells to cell_boxes
    4. Corrects column indices based on X-overlap
    5. Rebuilds the HTML with corrected indices

    NOTE(review): step 5 is not implemented yet — corrections are counted
    and logged, but the original HTML is returned unchanged (see
    ``corrected_html = html`` below).

    Args:
        html: Original table HTML from PP-Structure
        cell_boxes: List of [x0, y0, x1, y1] from PP-Structure
        threshold: Minimum overlap ratio for correction

    Returns:
        Tuple of (corrected_html, correction_count)
    """
    # Parse HTML
    cells = parse_table_html(html)
    if not cells:
        logger.debug("[COLUMN CORRECTION] No cells parsed from HTML")
        return html, 0
    # Convert cell_boxes to BBox objects
    boxes = []
    for coords in cell_boxes:
        try:
            boxes.append(BBox.from_list(coords))
        except (ValueError, IndexError):
            continue
    if not boxes:
        logger.debug("[COLUMN CORRECTION] No valid cell_boxes")
        return html, 0
    # Find the best header row (not necessarily row 0)
    # First try row 0, but if it has merged cells, find a better row
    header_row_idx = find_header_row(cells, min_columns=3)
    if header_row_idx is None:
        # Fallback to row 0
        header_row_idx = 0
    header_cells = [c for c in cells if c.row_idx == header_row_idx]
    if not header_cells:
        logger.debug("[COLUMN CORRECTION] No header row found, skipping correction")
        return html, 0
    # Build column anchors, passing all cells for smart header detection
    anchors = build_column_anchors(header_cells, cell_boxes, all_cells=cells)
    if not anchors:
        logger.debug("[COLUMN CORRECTION] Could not build column anchors")
        return html, 0
    logger.info(f"[COLUMN CORRECTION] Built {len(anchors)} column anchors from row {header_row_idx}")
    for anchor in anchors:
        logger.debug(f"  Column {anchor.col_idx}: X range [{anchor.x_min:.1f}, {anchor.x_max:.1f}]")
    # Group cells by row for matching
    cells_by_row: Dict[int, List[TableCell]] = {}
    for cell in cells:
        if cell.row_idx not in cells_by_row:
            cells_by_row[cell.row_idx] = []
        cells_by_row[cell.row_idx].append(cell)
    # Match cells to cell_boxes and correct columns
    correction_count = 0
    corrections: Dict[Tuple[int, int], int] = {}  # (row, old_col) -> new_col
    for cell in cells:
        if cell.row_idx == header_row_idx:
            continue  # Skip header row (used as reference)
        row_cells = cells_by_row.get(cell.row_idx, [])
        matched_box = match_cell_to_cellbox(cell, boxes, row_cells)
        if matched_box:
            # Attach the matched box so correct_cell_column can use it
            cell.bbox = matched_box
            new_col = correct_cell_column(cell, anchors, threshold)
            if new_col != cell.col_idx:
                corrections[(cell.row_idx, cell.col_idx)] = new_col
                correction_count += 1
    if correction_count == 0:
        logger.info("[COLUMN CORRECTION] No corrections needed")
        return html, 0
    # Rebuild HTML with corrected column indices
    # Note: This is a simple approach that modifies HTML attributes
    # A more robust solution would rebuild the entire table structure
    corrected_html = html
    logger.info(f"[COLUMN CORRECTION] Made {correction_count} column corrections")
    return corrected_html, correction_count
class TableColumnCorrector:
    """
    Facade for the table column correction pipeline.

    Bundles optional vertical-fragment merging with header-anchor column
    correction behind a single configurable entry point.
    """

    def __init__(
        self,
        correction_threshold: float = 0.5,
        vertical_merge_enabled: bool = True,
        vertical_aspect_ratio: float = 0.3
    ):
        self.correction_threshold = correction_threshold
        self.vertical_merge_enabled = vertical_merge_enabled
        self.vertical_aspect_ratio = vertical_aspect_ratio

    @staticmethod
    def _to_text_blocks(raw_blocks: List[Dict]) -> List[TextBlock]:
        """Convert raw OCR dicts into TextBlock objects, skipping bad entries."""
        blocks: List[TextBlock] = []
        for tb in raw_blocks:
            if 'bbox' not in tb or 'text' not in tb:
                continue
            try:
                blocks.append(TextBlock(text=tb['text'], bbox=BBox.from_list(tb['bbox'])))
            except (ValueError, KeyError):
                continue
        return blocks

    def correct(
        self,
        html: str,
        cell_boxes: List[List[float]],
        table_bbox: Optional[List[float]] = None,
        text_blocks: Optional[List[Dict]] = None
    ) -> Tuple[str, Dict]:
        """
        Apply column correction to a table.

        Args:
            html: Table HTML from PP-Structure
            cell_boxes: Cell bounding boxes
            table_bbox: Table bounding box (for vertical fragment detection)
            text_blocks: Raw OCR text blocks (for vertical fragment merging)

        Returns:
            Tuple of (corrected_html, stats_dict)
        """
        stats = {
            'column_corrections': 0,
            'vertical_merges': 0,
            'anchors_built': 0
        }

        # Step 1: Vertical fragment merging (if enabled and data available)
        if self.vertical_merge_enabled and table_bbox and text_blocks:
            blocks = self._to_text_blocks(text_blocks)
            if blocks:
                fragments = detect_vertical_fragments(
                    blocks, BBox.from_list(table_bbox),
                    aspect_ratio_threshold=self.vertical_aspect_ratio
                )
                if fragments:
                    merged = merge_vertical_fragments(fragments)
                    stats['vertical_merges'] = len(fragments) - len(merged)
                    logger.info(f"[VERTICAL MERGE] Merged {len(fragments)} fragments into {len(merged)} blocks")

        # Step 2: Column correction
        corrected_html, corrections = correct_table_columns(
            html, cell_boxes, self.correction_threshold
        )
        stats['column_corrections'] = corrections
        return corrected_html, stats

View File

@@ -0,0 +1,806 @@
"""
Table Content Rebuilder
Rebuilds table content from raw OCR regions when PP-StructureV3's HTML output
is incorrect due to cell merge errors or boundary detection issues.
This module addresses the key problem: PP-StructureV3's ML-based table recognition
often merges multiple cells incorrectly, especially for borderless tables.
The solution uses:
1. cell_boxes validation (filter out-of-bounds cells)
2. Raw OCR regions to rebuild accurate cell content
3. Grid-based row/col position calculation
"""
import logging
from collections import defaultdict
from dataclasses import dataclass, field
from typing import List, Dict, Any, Optional, Tuple
logger = logging.getLogger(__name__)
@dataclass
class CellBox:
    """A validated cell bounding box that remembers its index in the source list."""
    x0: float
    y0: float
    x1: float
    y1: float
    original_index: int

    @property
    def center_y(self) -> float:
        return (self.y0 + self.y1) / 2

    @property
    def center_x(self) -> float:
        return (self.x0 + self.x1) / 2

    @property
    def area(self) -> float:
        """Box area, clamped to 0 for degenerate (inverted) boxes."""
        return max(0, (self.x1 - self.x0) * (self.y1 - self.y0))
@dataclass
class OCRTextRegion:
    """One raw OCR detection: recognized text plus its axis-aligned box."""
    text: str
    x0: float
    y0: float
    x1: float
    y1: float
    confidence: float = 1.0

    @property
    def center_y(self) -> float:
        return (self.y0 + self.y1) / 2

    @property
    def center_x(self) -> float:
        return (self.x0 + self.x1) / 2
@dataclass
class RebuiltCell:
    """Represents a rebuilt table cell."""
    row: int        # target row in the rebuilt grid
    col: int        # target column in the rebuilt grid
    row_span: int
    col_span: int
    content: str    # text assembled from matched OCR regions
    bbox: Optional[List[float]] = None
    # default_factory avoids the shared-mutable-default pitfall; the
    # __post_init__ guard is kept so callers explicitly passing
    # ocr_regions=None still get an empty list (backward compatible).
    ocr_regions: List[OCRTextRegion] = field(default_factory=list)

    def __post_init__(self):
        if self.ocr_regions is None:
            self.ocr_regions = []
class TableContentRebuilder:
"""
Rebuilds table content from raw OCR regions and validated cell_boxes.
This class solves the problem where PP-StructureV3's HTML output incorrectly
merges multiple cells. Instead of relying on the ML-generated HTML, it:
1. Validates cell_boxes against table bbox
2. Groups cell_boxes into rows/columns by coordinate clustering
3. Fills each cell with matching raw OCR text
4. Generates correct table structure
"""
def __init__(
    self,
    boundary_tolerance: float = 20.0,
    row_clustering_threshold: float = 15.0,
    col_clustering_threshold: float = 15.0,
    iou_threshold_for_ocr_match: float = 0.3,
    min_text_coverage: float = 0.5
):
    """
    Initialize the rebuilder.

    Args:
        boundary_tolerance: Tolerance for cell_boxes boundary check (pixels)
        row_clustering_threshold: Max Y-distance for cells in same row (pixels)
        col_clustering_threshold: Max X-distance for cells in same column (pixels)
        iou_threshold_for_ocr_match: Min IoU to consider OCR region inside cell
        min_text_coverage: Min overlap ratio for OCR text to be assigned to cell
    """
    # Thresholds are stored verbatim and consumed by the validation /
    # clustering / OCR-matching methods of this class.
    self.boundary_tolerance = boundary_tolerance
    self.row_clustering_threshold = row_clustering_threshold
    self.col_clustering_threshold = col_clustering_threshold
    self.iou_threshold = iou_threshold_for_ocr_match
    self.min_text_coverage = min_text_coverage
def validate_cell_boxes(
    self,
    cell_boxes: List[List[float]],
    table_bbox: List[float]
) -> Tuple[List[CellBox], Dict[str, Any]]:
    """
    Validate cell_boxes against table bbox, filtering invalid ones.

    A box is rejected when it is malformed, extends past the table bounds
    (expanded by ``boundary_tolerance``), has inverted coordinates, or is
    shorter than 8px (too small for readable text).

    Args:
        cell_boxes: List of cell bounding boxes [[x0, y0, x1, y1], ...]
        table_bbox: Table bounding box [x0, y0, x1, y1]

    Returns:
        Tuple of (valid_cells, validation_stats)
    """
    if not cell_boxes or len(table_bbox) < 4:
        return [], {"total": 0, "valid": 0, "invalid": 0, "reason": "empty_input"}

    tx0, ty0, tx1, ty1 = table_bbox[:4]
    tol = self.boundary_tolerance
    min_x, min_y = tx0 - tol, ty0 - tol
    max_x, max_y = tx1 + tol, ty1 + tol

    def rejection_reason(box) -> Optional[str]:
        """Return the rejection label for a box, or None when it is valid."""
        if not box or len(box) < 4:
            return "invalid_format"
        x0, y0, x1, y1 = box[:4]
        if y1 > max_y:
            return "y1_exceeds_table"
        if y0 < min_y:
            return "y0_above_table"
        if x1 > max_x:
            return "x1_exceeds_table"
        if x0 < min_x:
            return "x0_left_of_table"
        if x0 >= x1 or y0 >= y1:
            return "inverted_coords"
        if y1 - y0 < 8:  # at least 8px tall for readable text
            return "too_small"
        return None

    valid_cells: List[CellBox] = []
    invalid_reasons: Dict[str, int] = defaultdict(int)
    for idx, box in enumerate(cell_boxes):
        reason = rejection_reason(box)
        if reason is not None:
            invalid_reasons[reason] += 1
            continue
        x0, y0, x1, y1 = box[:4]
        valid_cells.append(CellBox(x0=x0, y0=y0, x1=x1, y1=y1, original_index=idx))

    stats = {
        "total": len(cell_boxes),
        "valid": len(valid_cells),
        "invalid": len(cell_boxes) - len(valid_cells),
        "invalid_reasons": dict(invalid_reasons),
        "validity_ratio": len(valid_cells) / len(cell_boxes) if cell_boxes else 0
    }
    logger.info(
        f"Cell box validation: {stats['valid']}/{stats['total']} valid "
        f"(ratio={stats['validity_ratio']:.2%})"
    )
    if invalid_reasons:
        logger.debug(f"Invalid reasons: {dict(invalid_reasons)}")
    return valid_cells, stats
def parse_raw_ocr_regions(
    self,
    raw_regions: List[Dict[str, Any]],
    table_bbox: List[float]
) -> List[OCRTextRegion]:
    """
    Extract the OCR regions that fall inside (or close to) the table area.

    Args:
        raw_regions: Raw OCR region dicts carrying 'text', 'bbox', 'confidence'
        table_bbox: Table bounding box [x0, y0, x1, y1]

    Returns:
        OCRTextRegion objects whose boxes overlap the (slightly expanded) table
    """
    if not raw_regions or len(table_bbox) < 4:
        return []
    tx0, ty0, tx1, ty1 = table_bbox[:4]
    # Small tolerance so text sitting on the table edge is not dropped.
    margin = 10
    regions: List[OCRTextRegion] = []
    for raw in raw_regions:
        content = raw.get('text', '').strip()
        if not content:
            continue
        box = raw.get('bbox', [])
        if not box:
            continue
        conf = raw.get('confidence', 1.0)
        # Bboxes arrive either as a quadrilateral or as a flat rectangle.
        if isinstance(box[0], (list, tuple)):
            # Quadrilateral form: [[x1,y1], [x2,y2], [x3,y3], [x4,y4]]
            xs = [pt[0] for pt in box if len(pt) >= 2]
            ys = [pt[1] for pt in box if len(pt) >= 2]
            if not xs or not ys:
                continue
            rx0, ry0, rx1, ry1 = min(xs), min(ys), max(xs), max(ys)
        elif len(box) == 4:
            rx0, ry0, rx1, ry1 = box
        else:
            continue
        # Drop regions that lie completely outside the expanded table area.
        fully_outside = (
            rx1 < tx0 - margin or rx0 > tx1 + margin or
            ry1 < ty0 - margin or ry0 > ty1 + margin
        )
        if fully_outside:
            continue
        regions.append(OCRTextRegion(
            text=content,
            x0=float(rx0), y0=float(ry0),
            x1=float(rx1), y1=float(ry1),
            confidence=conf
        ))
    logger.debug(f"Parsed {len(regions)} OCR regions within table area")
    return regions
def cluster_cells_into_grid(
    self,
    cells: List[CellBox]
) -> Tuple[List[float], List[float], Dict[Tuple[int, int], CellBox]]:
    """
    Cluster cells into rows and columns based on coordinates.

    Args:
        cells: List of validated CellBox objects

    Returns:
        Tuple of (row_boundaries, col_boundaries, cell_grid)
        - row_boundaries: Y coordinates for row divisions
        - col_boundaries: X coordinates for column divisions
        - cell_grid: Dict mapping (row, col) to CellBox (first cell wins
          when two cells map to the same grid position)
    """
    if not cells:
        return [], [], {}
    # Collect all unique edge coordinates (rounded to damp OCR jitter).
    y_coords = set()
    x_coords = set()
    for cell in cells:
        y_coords.add(round(cell.y0, 1))
        y_coords.add(round(cell.y1, 1))
        x_coords.add(round(cell.x0, 1))
        x_coords.add(round(cell.x1, 1))
    # Merge nearby coordinates into distinct row/column boundaries.
    row_boundaries = self._cluster_coordinates(sorted(y_coords), self.row_clustering_threshold)
    col_boundaries = self._cluster_coordinates(sorted(x_coords), self.col_clustering_threshold)
    logger.debug(f"Found {len(row_boundaries)} row boundaries, {len(col_boundaries)} col boundaries")
    # Map each cell to a grid position keyed by its top-left corner.
    # NOTE: span detection (row_end/col_end from y1/x1) was computed here
    # previously but never used, so it has been removed as dead code.
    cell_grid: Dict[Tuple[int, int], CellBox] = {}
    for cell in cells:
        row = self._find_position(cell.y0, row_boundaries)
        col = self._find_position(cell.x0, col_boundaries)
        if row is not None and col is not None and (row, col) not in cell_grid:
            cell_grid[(row, col)] = cell
    return row_boundaries, col_boundaries, cell_grid
def _cluster_coordinates(
self,
coords: List[float],
threshold: float
) -> List[float]:
"""Cluster nearby coordinates into distinct values."""
if not coords:
return []
clustered = [coords[0]]
for coord in coords[1:]:
if coord - clustered[-1] > threshold:
clustered.append(coord)
return clustered
def _find_position(
self,
value: float,
boundaries: List[float]
) -> Optional[int]:
"""Find which position (index) a value falls into."""
for i, boundary in enumerate(boundaries):
if value <= boundary + self.row_clustering_threshold:
return i
return len(boundaries) - 1 if boundaries else None
def assign_ocr_to_cells(
    self,
    cells: List[CellBox],
    ocr_regions: List[OCRTextRegion],
    row_boundaries: List[float],
    col_boundaries: List[float]
) -> Dict[Tuple[int, int], List[OCRTextRegion]]:
    """
    Assign OCR text regions to cells based on spatial overlap.

    Each region goes to the cell covering the largest fraction of it,
    provided that fraction reaches min_text_coverage.

    Args:
        cells: List of validated CellBox objects
        ocr_regions: List of OCRTextRegion objects
        row_boundaries: Y coordinates for row divisions
        col_boundaries: X coordinates for column divisions

    Returns:
        Dict mapping (row, col) to list of OCR regions in that cell
    """
    assignments: Dict[Tuple[int, int], List[OCRTextRegion]] = defaultdict(list)
    for region in ocr_regions:
        region_rect = (region.x0, region.y0, region.x1, region.y1)
        chosen = None
        chosen_overlap = 0
        for candidate in cells:
            ratio = self._calculate_overlap_ratio(
                region_rect,
                (candidate.x0, candidate.y0, candidate.x1, candidate.y1)
            )
            # Strictly-greater comparison keeps the earliest cell on ties.
            if ratio >= self.min_text_coverage and ratio > chosen_overlap:
                chosen_overlap = ratio
                chosen = candidate
        if chosen is None:
            continue
        row = self._find_position(chosen.y0, row_boundaries)
        col = self._find_position(chosen.x0, col_boundaries)
        if row is not None and col is not None:
            assignments[(row, col)].append(region)
    return assignments
def _calculate_overlap_ratio(
self,
box1: Tuple[float, float, float, float],
box2: Tuple[float, float, float, float]
) -> float:
"""Calculate overlap ratio of box1 with box2."""
x0_1, y0_1, x1_1, y1_1 = box1
x0_2, y0_2, x1_2, y1_2 = box2
# Calculate intersection
inter_x0 = max(x0_1, x0_2)
inter_y0 = max(y0_1, y0_2)
inter_x1 = min(x1_1, x1_2)
inter_y1 = min(y1_1, y1_2)
if inter_x0 >= inter_x1 or inter_y0 >= inter_y1:
return 0.0
inter_area = (inter_x1 - inter_x0) * (inter_y1 - inter_y0)
box1_area = (x1_1 - x0_1) * (y1_1 - y0_1)
return inter_area / box1_area if box1_area > 0 else 0.0
def rebuild_table(
    self,
    cell_boxes: List[List[float]],
    table_bbox: List[float],
    raw_ocr_regions: List[Dict[str, Any]],
    original_html: str = ""
) -> Tuple[Optional[Dict[str, Any]], Dict[str, Any]]:
    """
    Rebuild table content from cell_boxes and raw OCR regions.

    This is the main entry point. It:
    1. Validates cell_boxes
    2. If validity ratio is low, uses pure OCR-based rebuild
    3. Otherwise, uses cell_boxes + OCR hybrid rebuild

    Args:
        cell_boxes: List of cell bounding boxes from PP-StructureV3
        table_bbox: Table bounding box [x0, y0, x1, y1]
        raw_ocr_regions: List of raw OCR region dicts
        original_html: Original HTML from PP-StructureV3 (for fallback).
            NOTE(review): not referenced in this method body — presumably
            kept for interface compatibility; confirm before removing.

    Returns:
        Tuple of (rebuilt_table_dict, rebuild_stats). The table dict is
        None when no OCR regions fall inside the table area (stats carry
        the skip reason).
    """
    # Stats dict is threaded through the helper rebuild methods, which
    # update it in place before returning.
    stats = {
        "action": "none",
        "reason": "",
        "original_cell_count": len(cell_boxes) if cell_boxes else 0,
        "valid_cell_count": 0,
        "ocr_regions_in_table": 0,
        "rebuilt_rows": 0,
        "rebuilt_cols": 0
    }
    # Step 1: Validate cell_boxes
    valid_cells, validation_stats = self.validate_cell_boxes(cell_boxes, table_bbox)
    stats["valid_cell_count"] = validation_stats["valid"]
    stats["validation"] = validation_stats
    # Step 2: Parse raw OCR regions in table area
    ocr_regions = self.parse_raw_ocr_regions(raw_ocr_regions, table_bbox)
    stats["ocr_regions_in_table"] = len(ocr_regions)
    if not ocr_regions:
        # Nothing to rebuild from: report skip rather than emit an empty table.
        stats["action"] = "skip"
        stats["reason"] = "no_ocr_regions_in_table"
        return None, stats
    # Step 3: Choose rebuild strategy based on cell_boxes validity
    # If validity ratio is too low (< 50%), use pure OCR-based rebuild
    if validation_stats["validity_ratio"] < 0.5 or len(valid_cells) < 2:
        logger.info(
            f"Using pure OCR-based rebuild (validity={validation_stats['validity_ratio']:.2%})"
        )
        return self._rebuild_from_ocr_only(ocr_regions, table_bbox, stats)
    # Otherwise, use hybrid cell_boxes + OCR rebuild
    return self._rebuild_with_cell_boxes(valid_cells, ocr_regions, stats, table_bbox)
def _rebuild_from_ocr_only(
    self,
    ocr_regions: List[OCRTextRegion],
    table_bbox: List[float],
    stats: Dict[str, Any]
) -> Tuple[Optional[Dict[str, Any]], Dict[str, Any]]:
    """
    Rebuild table using only OCR regions (when cell_boxes are unreliable).

    Strategy:
    1. Detect column boundary from OCR x-coordinates
    2. Cluster OCR regions by Y coordinate into rows
    3. Split each row into left/right columns

    Args:
        ocr_regions: OCR text regions already filtered to the table area
        table_bbox: Table bounding box [x0, y0, x1, y1]
        stats: Mutable stats dict, updated in place

    Returns:
        Tuple of (rebuilt_table_dict or None, stats)
    """
    if not ocr_regions:
        stats["action"] = "skip"
        stats["reason"] = "no_ocr_regions"
        return None, stats
    # Step 1: Detect column split point by analyzing x-coordinates.
    # (The previous unpacking of table_bbox into local coords was dead
    # code and has been removed; the helper takes table_bbox directly.)
    col_split_x = self._detect_column_split(ocr_regions, table_bbox)
    logger.debug(f"Detected column split at x={col_split_x}")
    # Step 2: Cluster OCR regions by Y coordinate into rows.
    # Use smaller threshold (12px) to properly separate rows.
    row_threshold = 12.0
    sorted_ocr = sorted(ocr_regions, key=lambda r: r.center_y)
    rows = []
    current_row = [sorted_ocr[0]]
    for ocr in sorted_ocr[1:]:
        # Chain rows: compare against the previous region, not the row start.
        if ocr.center_y - current_row[-1].center_y <= row_threshold:
            current_row.append(ocr)
        else:
            rows.append(current_row)
            current_row = [ocr]
    rows.append(current_row)
    logger.debug(f"Detected {len(rows)} rows")
    # Step 3: Analyze column structure — require at least two regions on
    # each side of the split before committing to a two-column layout.
    left_regions = [r for r in ocr_regions if r.x0 < col_split_x]
    right_regions = [r for r in ocr_regions if r.x0 >= col_split_x]
    num_cols = 2 if len(left_regions) >= 2 and len(right_regions) >= 2 else 1
    # Step 4: Build cells for each row (shared helper logic replaces the
    # previously duplicated left/right cell-construction code).
    rebuilt_cells = []
    for row_idx, row_ocrs in enumerate(rows):
        row_ocrs_sorted = sorted(row_ocrs, key=lambda r: r.center_x)
        if num_cols == 2:
            column_groups = [
                (0, [r for r in row_ocrs_sorted if r.x0 < col_split_x]),
                (1, [r for r in row_ocrs_sorted if r.x0 >= col_split_x]),
            ]
        else:
            column_groups = [(0, row_ocrs_sorted)]
        for col_idx, col_ocrs in column_groups:
            if not col_ocrs:
                continue
            rebuilt_cells.append({
                "row": row_idx,
                "col": col_idx,
                "row_span": 1,
                "col_span": 1,
                "content": " ".join(r.text for r in col_ocrs),
                "bbox": [
                    min(r.x0 for r in col_ocrs),
                    min(r.y0 for r in col_ocrs),
                    max(r.x1 for r in col_ocrs),
                    max(r.y1 for r in col_ocrs)
                ]
            })
    num_rows = len(rows)
    stats["rebuilt_rows"] = num_rows
    stats["rebuilt_cols"] = num_cols
    # Build result
    rebuilt_table = {
        "rows": num_rows,
        "cols": num_cols,
        "cells": rebuilt_cells,
        "html": self._generate_html(rebuilt_cells, num_rows, num_cols),
        "rebuild_source": "pure_ocr"
    }
    stats["action"] = "rebuilt"
    stats["reason"] = "pure_ocr_success"
    stats["rebuilt_cell_count"] = len(rebuilt_cells)
    logger.info(
        f"Table rebuilt (pure OCR): {num_rows}x{num_cols} with {len(rebuilt_cells)} cells"
    )
    return rebuilt_table, stats
def _detect_column_split(
    self,
    ocr_regions: List[OCRTextRegion],
    table_bbox: List[float]
) -> float:
    """
    Detect the column split point by analyzing x-coordinates.

    For tables with left/right structure (e.g., property-value tables),
    there's usually a gap between left column text and right column text;
    the midpoint of the widest gap (if over 50px) becomes the split.
    Falls back to the horizontal center of the table otherwise.
    """
    fallback = (table_bbox[0] + table_bbox[2]) / 2
    if not ocr_regions:
        return fallback
    # Distinct left edges of all text regions, sorted left to right.
    left_edges = sorted(set(round(r.x0) for r in ocr_regions))
    if len(left_edges) < 2:
        return fallback
    best_gap = 0
    split = fallback
    for prev_edge, next_edge in zip(left_edges, left_edges[1:]):
        gap = next_edge - prev_edge
        # Require a minimum 50px gap to treat it as a column boundary.
        if gap > best_gap and gap > 50:
            best_gap = gap
            split = (prev_edge + next_edge) / 2
    # best_gap stays 0 unless a >50px gap was found, so this mirrors the
    # "no clear gap" fallback.
    return split if best_gap > 50 else fallback
def _rebuild_with_cell_boxes(
    self,
    valid_cells: List[CellBox],
    ocr_regions: List[OCRTextRegion],
    stats: Dict[str, Any],
    table_bbox: Optional[List[float]] = None
) -> Tuple[Dict[str, Any], Dict[str, Any]]:
    """Rebuild table using cell_boxes structure + OCR content.

    Falls back to the pure-OCR rebuild when the clustered grid looks
    implausible (too many columns / too sparse) or when too few cells
    receive any OCR content — but only if table_bbox was provided.
    """
    # Step 3: Cluster cells into grid
    row_boundaries, col_boundaries, cell_grid = self.cluster_cells_into_grid(valid_cells)
    # N boundaries delimit N-1 rows/cols; degenerate boundary lists count as 1.
    num_rows = len(row_boundaries) - 1 if len(row_boundaries) > 1 else 1
    num_cols = len(col_boundaries) - 1 if len(col_boundaries) > 1 else 1
    # Quality check: if hybrid produces too many columns or sparse grid, fall back to pure OCR
    # A well-formed table typically has 2-5 columns. Too many columns indicates poor clustering.
    total_expected_cells = num_rows * num_cols
    if num_cols > 5 or total_expected_cells > 100:
        logger.info(
            f"Hybrid mode produced {num_rows}x{num_cols} grid (too sparse), "
            f"falling back to pure OCR mode"
        )
        # NOTE(review): when table_bbox is None this logs a fallback but
        # then continues with the sparse grid — confirm whether that is
        # intentional or whether it should skip instead.
        if table_bbox:
            return self._rebuild_from_ocr_only(ocr_regions, table_bbox, stats)
    stats["rebuilt_rows"] = num_rows
    stats["rebuilt_cols"] = num_cols
    # Step 4: Assign OCR text to cells
    cell_ocr_map = self.assign_ocr_to_cells(
        valid_cells, ocr_regions, row_boundaries, col_boundaries
    )
    # Step 5: Build rebuilt cells
    rebuilt_cells = []
    for (row, col), ocr_list in cell_ocr_map.items():
        # Sort OCR regions by position (top to bottom, left to right)
        sorted_ocr = sorted(ocr_list, key=lambda r: (r.center_y, r.center_x))
        content = " ".join(r.text for r in sorted_ocr)
        # Find the cell bbox for this position (first cell mapping to the
        # same grid slot wins, matching cluster_cells_into_grid).
        cell_bbox = None
        for cell in valid_cells:
            cell_row = self._find_position(cell.y0, row_boundaries)
            cell_col = self._find_position(cell.x0, col_boundaries)
            if cell_row == row and cell_col == col:
                cell_bbox = [cell.x0, cell.y0, cell.x1, cell.y1]
                break
        rebuilt_cells.append({
            "row": row,
            "col": col,
            "row_span": 1,
            "col_span": 1,
            "content": content,
            "bbox": cell_bbox
        })
    # Quality check: if too few cells have content compared to grid size, fall back to pure OCR
    content_ratio = len(rebuilt_cells) / total_expected_cells if total_expected_cells > 0 else 0
    if content_ratio < 0.3 and table_bbox:
        logger.info(
            f"Hybrid mode has low content ratio ({content_ratio:.2%}), "
            f"falling back to pure OCR mode"
        )
        return self._rebuild_from_ocr_only(ocr_regions, table_bbox, stats)
    # Build result
    rebuilt_table = {
        "rows": num_rows,
        "cols": num_cols,
        "cells": rebuilt_cells,
        "html": self._generate_html(rebuilt_cells, num_rows, num_cols),
        "rebuild_source": "cell_boxes_hybrid"
    }
    stats["action"] = "rebuilt"
    stats["reason"] = "hybrid_success"
    stats["rebuilt_cell_count"] = len(rebuilt_cells)
    logger.info(
        f"Table rebuilt (hybrid): {num_rows}x{num_cols} with {len(rebuilt_cells)} cells "
        f"(from {len(ocr_regions)} OCR regions)"
    )
    return rebuilt_table, stats
def _generate_html(
self,
cells: List[Dict[str, Any]],
num_rows: int,
num_cols: int
) -> str:
"""Generate HTML table from rebuilt cells."""
# Create grid
grid = [[None for _ in range(num_cols)] for _ in range(num_rows)]
for cell in cells:
row, col = cell["row"], cell["col"]
if 0 <= row < num_rows and 0 <= col < num_cols:
grid[row][col] = cell["content"]
# Build HTML
html_parts = ["<html><body><table>"]
for row_idx in range(num_rows):
html_parts.append("<tr>")
for col_idx in range(num_cols):
content = grid[row_idx][col_idx] or ""
tag = "th" if row_idx == 0 else "td"
html_parts.append(f"<{tag}>{content}</{tag}>")
html_parts.append("</tr>")
html_parts.append("</table></body></html>")
return "".join(html_parts)
def should_rebuild(
    self,
    cell_boxes: List[List[float]],
    table_bbox: List[float],
    original_html: str = ""
) -> Tuple[bool, str]:
    """
    Determine if table should be rebuilt based on cell_boxes validity.

    Args:
        cell_boxes: List of cell bounding boxes
        table_bbox: Table bounding box
        original_html: Original HTML from PP-StructureV3 (unused here;
            kept for interface compatibility)

    Returns:
        Tuple of (should_rebuild, reason). Rebuild is requested for every
        table that has any cell_boxes at all.
    """
    if not cell_boxes:
        return False, "no_cell_boxes"
    # Run validation for its stats (and its logging side effect).
    _, validation_stats = self.validate_cell_boxes(cell_boxes, table_bbox)
    # Always rebuild if ANY cells are invalid - PP-Structure HTML often merges cells incorrectly
    # even when most cell_boxes are valid
    if validation_stats["invalid"] > 0:
        return True, f"invalid_cells_{validation_stats['invalid']}/{validation_stats['total']}"
    # NOTE: a separate boundary-violation check previously lived here, but
    # it was unreachable: every boundary violation is also counted in
    # validation_stats["invalid"], so the branch above always fires first.
    # Removed as dead code.
    # Also rebuild to ensure OCR-based content is used instead of PP-Structure HTML
    # PP-Structure's HTML often has incorrect cell merging
    return True, "ocr_content_preferred"

View File

@@ -0,0 +1,664 @@
"""
Simple Text Region Renderer
Renders raw OCR text regions directly to PDF at their detected positions,
with rotation correction based on bbox quadrilateral geometry.
This approach bypasses complex table structure reconstruction and simply
places text at the positions detected by PaddleOCR.
"""
import math
import logging
from typing import Dict, List, Optional, Set, Tuple
from reportlab.pdfgen import canvas
from reportlab.lib.colors import black
logger = logging.getLogger(__name__)
class TextRegionRenderer:
    """
    Render raw OCR text regions to PDF with position and rotation correction.

    This renderer takes the raw OCR output (text + quadrilateral bbox) and
    renders text at the correct position. Small rotation angles are ignored
    (straightened) to produce clean, aligned text output.
    """

    # Minimum font size to prevent illegible text
    MIN_FONT_SIZE = 6.0
    # Maximum font size to prevent oversized text
    MAX_FONT_SIZE = 72.0
    # Font size estimation factor (font height relative to bbox height)
    FONT_SIZE_FACTOR = 0.75
    # Rotation angle threshold - angles smaller than this are straightened to 0
    # This compensates for slight scan skew and produces cleaner output
    ROTATION_STRAIGHTEN_THRESHOLD = 10.0  # degrees
    # IoA (Intersection over Area) threshold for text-image overlap detection
    # If text bbox overlaps with image by more than this ratio, skip the text
    IOA_OVERLAP_THRESHOLD = 0.3  # 30% overlap

    def __init__(
        self,
        font_name: str = 'NotoSansSC',
        debug: bool = False,
        straighten_threshold: Optional[float] = None,
        ioa_threshold: Optional[float] = None
    ):
        """
        Initialize the text region renderer.

        Args:
            font_name: Name of the registered font to use
            debug: Enable debug logging
            straighten_threshold: Override rotation straightening threshold (degrees)
            ioa_threshold: Override IoA overlap threshold for text-image avoidance
        """
        self.font_name = font_name
        self.debug = debug
        # Fix: compare against None instead of using `or`, so an explicit
        # 0.0 override is honored rather than silently replaced by the default.
        self.straighten_threshold = (
            self.ROTATION_STRAIGHTEN_THRESHOLD if straighten_threshold is None
            else straighten_threshold
        )
        self.ioa_threshold = (
            self.IOA_OVERLAP_THRESHOLD if ioa_threshold is None
            else ioa_threshold
        )

    def calculate_rotation(self, bbox: List[List[float]]) -> float:
        """
        Calculate text rotation angle from bbox quadrilateral.

        The bbox is a quadrilateral with 4 corner points in order:
        [top-left, top-right, bottom-right, bottom-left]

        Returns angle in degrees (counter-clockwise from horizontal).
        Positive angle means text is tilted upward to the right.

        NOTE: Small angles (< straighten_threshold) will be treated as 0
        during rendering to produce clean, aligned output.

        Args:
            bbox: List of 4 [x, y] coordinate pairs

        Returns:
            Rotation angle in degrees
        """
        if len(bbox) < 2:
            return 0.0
        # Top-left to top-right vector (top edge)
        dx = bbox[1][0] - bbox[0][0]
        dy = bbox[1][1] - bbox[0][1]
        # Calculate angle (atan2 returns radians, convert to degrees)
        # Note: In image coordinates, Y increases downward
        # We negate dy to get the conventional angle
        angle_rad = math.atan2(-dy, dx)
        angle_deg = math.degrees(angle_rad)
        if self.debug:
            logger.debug(f"Rotation calculation: dx={dx:.1f}, dy={dy:.1f}, angle={angle_deg:.2f}°")
        return angle_deg

    def estimate_font_size(
        self,
        bbox: List[List[float]],
        text: str,
        scale_factor: float = 1.0
    ) -> float:
        """
        Estimate appropriate font size from bbox dimensions.

        Uses the bbox height as the primary indicator, with adjustment
        for the typical font-to-bbox ratio.

        Args:
            bbox: List of 4 [x, y] coordinate pairs
            text: The text content (for width-based adjustments)
            scale_factor: Coordinate scaling factor

        Returns:
            Estimated font size in points
        """
        if len(bbox) < 4:
            return 12.0  # Default font size
        # Calculate bbox height (average of left and right edges)
        left_height = math.dist(bbox[0], bbox[3])
        right_height = math.dist(bbox[1], bbox[2])
        avg_height = (left_height + right_height) / 2
        # Apply scale factor and font size ratio
        font_size = avg_height * scale_factor * self.FONT_SIZE_FACTOR
        # Clamp to reasonable range
        font_size = max(self.MIN_FONT_SIZE, min(self.MAX_FONT_SIZE, font_size))
        if self.debug:
            logger.debug(f"Font size estimation: bbox_h={avg_height:.1f}, "
                         f"scale={scale_factor:.3f}, font={font_size:.1f}pt")
        return font_size

    def get_bbox_center(self, bbox: List[List[float]]) -> Tuple[float, float]:
        """
        Calculate the center point of a bbox quadrilateral.

        Args:
            bbox: List of 4 [x, y] coordinate pairs

        Returns:
            Tuple of (center_x, center_y)
        """
        if len(bbox) < 4:
            return (0.0, 0.0)
        center_x = sum(p[0] for p in bbox) / 4
        center_y = sum(p[1] for p in bbox) / 4
        return (center_x, center_y)

    def get_bbox_as_rect(self, bbox: List[List[float]]) -> Tuple[float, float, float, float]:
        """
        Convert quadrilateral bbox to axis-aligned rectangle (x0, y0, x1, y1).

        Args:
            bbox: List of 4 [x, y] coordinate pairs

        Returns:
            Tuple of (x0, y0, x1, y1) - min/max coordinates
        """
        if len(bbox) < 4:
            return (0.0, 0.0, 0.0, 0.0)
        x_coords = [p[0] for p in bbox]
        y_coords = [p[1] for p in bbox]
        return (min(x_coords), min(y_coords), max(x_coords), max(y_coords))

    def get_bbox_left_baseline(
        self,
        bbox: List[List[float]]
    ) -> Tuple[float, float]:
        """
        Get the left baseline point for text rendering.

        For left-aligned text, we use the bottom-left corner as the
        baseline starting point (text baseline is at the bottom).

        Args:
            bbox: List of 4 [x, y] coordinate pairs

        Returns:
            Tuple of (x, y) for the left baseline point
        """
        if len(bbox) < 4:
            return (0.0, 0.0)
        # Use bottom-left corner for baseline
        # bbox[3] is bottom-left in the standard ordering
        x = bbox[3][0]
        y = bbox[3][1]
        return (x, y)

    def calculate_ioa(
        self,
        text_rect: Tuple[float, float, float, float],
        image_rect: Tuple[float, float, float, float]
    ) -> float:
        """
        Calculate Intersection over Area (IoA) of text bbox with image bbox.

        IoA = intersection_area / text_area
        This measures how much of the text region overlaps with the image.

        Args:
            text_rect: Text bbox as (x0, y0, x1, y1)
            image_rect: Image bbox as (x0, y0, x1, y1)

        Returns:
            IoA ratio (0.0 to 1.0)
        """
        tx0, ty0, tx1, ty1 = text_rect
        ix0, iy0, ix1, iy1 = image_rect
        # Calculate text area
        text_area = (tx1 - tx0) * (ty1 - ty0)
        if text_area <= 0:
            return 0.0
        # Calculate intersection
        inter_x0 = max(tx0, ix0)
        inter_y0 = max(ty0, iy0)
        inter_x1 = min(tx1, ix1)
        inter_y1 = min(ty1, iy1)
        if inter_x0 >= inter_x1 or inter_y0 >= inter_y1:
            return 0.0  # No intersection
        inter_area = (inter_x1 - inter_x0) * (inter_y1 - inter_y0)
        return inter_area / text_area

    def is_overlapping_exclusion_zones(
        self,
        bbox: List[List[float]],
        exclusion_zones: List[Tuple[float, float, float, float]]
    ) -> bool:
        """
        Check if text bbox overlaps significantly with any exclusion zone.

        Args:
            bbox: Text bbox as quadrilateral
            exclusion_zones: List of (x0, y0, x1, y1) rectangles to avoid

        Returns:
            True if text should be skipped due to overlap
        """
        if not exclusion_zones:
            return False
        text_rect = self.get_bbox_as_rect(bbox)
        for zone in exclusion_zones:
            ioa = self.calculate_ioa(text_rect, zone)
            if ioa >= self.ioa_threshold:
                if self.debug:
                    logger.debug(f"Text overlaps exclusion zone: IoA={ioa:.2f} >= {self.ioa_threshold}")
                return True
        return False

    def is_inside_zone(
        self,
        bbox: List[List[float]],
        zone: Tuple[float, float, float, float],
        threshold: float = 0.5
    ) -> bool:
        """
        Check if text bbox is inside a zone (for collecting chart texts).

        Args:
            bbox: Text bbox as quadrilateral
            zone: Zone as (x0, y0, x1, y1) rectangle
            threshold: Minimum IoA to consider "inside"

        Returns:
            True if text is inside the zone
        """
        text_rect = self.get_bbox_as_rect(bbox)
        ioa = self.calculate_ioa(text_rect, zone)
        return ioa >= threshold

    def is_axis_label(
        self,
        bbox: List[List[float]],
        zone: Tuple[float, float, float, float],
        margin: float = 50.0
    ) -> bool:
        """
        Check if text bbox is an axis label for a chart/image zone.

        Axis labels are typically:
        - Vertical text to the LEFT of the chart (Y-axis label)
        - Horizontal text BELOW the chart (X-axis label)

        Args:
            bbox: Text bbox as quadrilateral
            zone: Chart/image zone as (x0, y0, x1, y1) rectangle
            margin: Maximum distance from zone edge to be considered axis label

        Returns:
            True if text appears to be an axis label for this zone
        """
        if len(bbox) < 4:
            return False
        text_rect = self.get_bbox_as_rect(bbox)
        tx0, ty0, tx1, ty1 = text_rect
        zx0, zy0, zx1, zy1 = zone
        # Calculate text dimensions
        text_width = tx1 - tx0
        text_height = ty1 - ty0
        # Check for Y-axis label: vertical text to the LEFT of zone
        # - Text is to the left of zone (tx1 <= zx0 + small overlap)
        # - Text's Y range overlaps with zone's Y range
        # - Text is taller than wide (aspect ratio > 2) OR very narrow
        is_left_of_zone = tx1 <= zx0 + margin and tx1 >= zx0 - margin
        y_overlaps = not (ty1 < zy0 or ty0 > zy1)
        is_vertical_text = text_height > text_width * 2
        if is_left_of_zone and y_overlaps and is_vertical_text:
            if self.debug:
                logger.debug(f"Detected Y-axis label: text is left of zone, vertical")
            return True
        # Check for X-axis label: horizontal text BELOW the zone
        # - Text is below zone (ty0 >= zy1 - small overlap)
        # - Text's X range overlaps with zone's X range
        # - Text is wider than tall (normal horizontal text)
        is_below_zone = ty0 >= zy1 - margin and ty0 <= zy1 + margin
        x_overlaps = not (tx1 < zx0 or tx0 > zx1)
        is_horizontal_text = text_width > text_height
        if is_below_zone and x_overlaps and is_horizontal_text:
            if self.debug:
                logger.debug(f"Detected X-axis label: text is below zone, horizontal")
            return True
        return False

    def is_near_zone(
        self,
        bbox: List[List[float]],
        zone: Tuple[float, float, float, float],
        margin: float = 100.0
    ) -> bool:
        """
        Check if text bbox is near (within margin) of a zone.

        Args:
            bbox: Text bbox as quadrilateral
            zone: Zone as (x0, y0, x1, y1) rectangle
            margin: Maximum distance from zone to be considered "near"

        Returns:
            True if text is near the zone
        """
        if len(bbox) < 4:
            return False
        text_rect = self.get_bbox_as_rect(bbox)
        tx0, ty0, tx1, ty1 = text_rect
        zx0, zy0, zx1, zy1 = zone
        # Expand zone by margin
        expanded_zone = (zx0 - margin, zy0 - margin, zx1 + margin, zy1 + margin)
        # Check if text overlaps with expanded zone
        ex0, ey0, ex1, ey1 = expanded_zone
        return not (tx1 < ex0 or tx0 > ex1 or ty1 < ey0 or ty0 > ey1)

    def collect_zone_texts(
        self,
        regions: List[Dict],
        zones: List[Tuple[float, float, float, float]],
        threshold: float = 0.5,
        include_axis_labels: bool = True
    ) -> Set[str]:
        """
        Collect text content from regions inside zones or identified as axis labels.

        This set is used during rendering for position-aware deduplication:
        - Text that matches this set AND is near a zone will be skipped
        - Text that matches but is far from zones will still be rendered

        Args:
            regions: List of raw OCR region dicts
            zones: List of (x0, y0, x1, y1) rectangles (e.g., chart bboxes)
            threshold: Minimum IoA to consider text as "inside" zone
            include_axis_labels: Also collect axis labels adjacent to zones

        Returns:
            Set of text strings found inside zones or as axis labels
        """
        zone_texts = set()
        for region in regions:
            text = region.get('text', '').strip()
            bbox = region.get('bbox', [])
            if not text or len(bbox) < 4:
                continue
            for zone in zones:
                # Check if inside zone
                if self.is_inside_zone(bbox, zone, threshold):
                    zone_texts.add(text)
                    if self.debug:
                        logger.debug(f"Collected zone text (inside): '{text}'")
                    break
                # Check if it's an axis label
                if include_axis_labels and self.is_axis_label(bbox, zone):
                    zone_texts.add(text)
                    if self.debug:
                        logger.debug(f"Collected zone text (axis label): '{text}'")
                    break
        return zone_texts

    def render_text_region(
        self,
        pdf_canvas: "canvas.Canvas",
        region: Dict,
        page_height: float,
        scale_x: float = 1.0,
        scale_y: float = 1.0,
        exclusion_zones: Optional[List[Tuple[float, float, float, float]]] = None,
        zone_texts: Optional[Set[str]] = None
    ) -> Tuple[bool, str]:
        """
        Render a single OCR text region to the PDF canvas.

        Handles coordinate transformation from image coordinates (origin top-left)
        to PDF coordinates (origin bottom-left).

        Small rotation angles are straightened to produce clean output.
        Text overlapping with exclusion zones (images) is skipped.

        Deduplication logic (position-aware):
        - If text matches zone_texts AND is NEAR the zone (or is axis label),
          skip it to avoid duplicate chart labels
        - Text far from zones is rendered even if it matches zone content

        Args:
            pdf_canvas: ReportLab canvas to draw on
            region: Raw OCR region dict with 'text' and 'bbox'
            page_height: Height of the PDF page (for Y-flip)
            scale_x: X coordinate scaling factor
            scale_y: Y coordinate scaling factor
            exclusion_zones: List of (x0, y0, x1, y1) rectangles to avoid
            zone_texts: Set of zone-internal texts (dedupe only if near zone)

        Returns:
            Tuple of (success: bool, skip_reason: str)
            - success=True, skip_reason='' if rendered successfully
            - success=False, skip_reason='overlap'/'dedupe'/'error'/'' if skipped
        """
        text = region.get('text', '').strip()
        bbox = region.get('bbox', [])
        if not text or len(bbox) < 4:
            return (False, '')
        # Check if text overlaps with exclusion zones (images/charts)
        if exclusion_zones and self.is_overlapping_exclusion_zones(bbox, exclusion_zones):
            if self.debug:
                logger.debug(f"Skipping text '{text[:20]}...' due to exclusion zone overlap")
            return (False, 'overlap')
        # Check if text should be deduplicated based on position
        # Only skip if text matches zone content AND is near a zone (or is axis label)
        if zone_texts and text in zone_texts and exclusion_zones:
            for zone in exclusion_zones:
                # Check if it's an axis label for this zone
                if self.is_axis_label(bbox, zone):
                    if self.debug:
                        logger.debug(f"Skipping text '{text[:20]}...' - axis label for zone")
                    return (False, 'dedupe')
                # Check if it's near this zone (for zone-internal text deduplication)
                if self.is_near_zone(bbox, zone, margin=100.0):
                    if self.debug:
                        logger.debug(f"Skipping text '{text[:20]}...' - matches zone text and is near zone")
                    return (False, 'dedupe')
        try:
            # Calculate text properties
            rotation = self.calculate_rotation(bbox)
            font_size = self.estimate_font_size(bbox, text, scale_y)
            # Straighten small rotations for cleaner output
            # Only apply rotation for significant angles (e.g., 90° rotated text)
            if abs(rotation) < self.straighten_threshold:
                rotation = 0.0
            # Get left baseline point in image coordinates
            img_x, img_y = self.get_bbox_left_baseline(bbox)
            # Apply scaling
            scaled_x = img_x * scale_x
            scaled_y = img_y * scale_y
            # Convert to PDF coordinates (flip Y axis)
            pdf_x = scaled_x
            pdf_y = page_height - scaled_y
            # Save canvas state
            pdf_canvas.saveState()
            # Try to set font with fallback
            try:
                pdf_canvas.setFont(self.font_name, font_size)
            except KeyError:
                # Font not registered, try fallback fonts
                fallback_fonts = ['Helvetica', 'Times-Roman', 'Courier']
                font_set = False
                for fallback in fallback_fonts:
                    try:
                        pdf_canvas.setFont(fallback, font_size)
                        font_set = True
                        if self.debug:
                            logger.debug(f"Using fallback font: {fallback}")
                        break
                    except KeyError:
                        continue
                if not font_set:
                    logger.warning(f"No available font found, skipping region")
                    pdf_canvas.restoreState()
                    return (False, 'error')
            pdf_canvas.setFillColor(black)
            # Apply rotation if needed (only for significant angles like 90°)
            if abs(rotation) > 0.5:
                pdf_canvas.translate(pdf_x, pdf_y)
                pdf_canvas.rotate(rotation)
                pdf_canvas.drawString(0, 0, text)
            else:
                pdf_canvas.drawString(pdf_x, pdf_y, text)
            # Restore canvas state
            pdf_canvas.restoreState()
            if self.debug:
                logger.debug(f"Rendered text '{text[:20]}...' at ({pdf_x:.1f}, {pdf_y:.1f}), "
                             f"rot={rotation:.1f}°, size={font_size:.1f}pt")
            return (True, '')
        except Exception as e:
            logger.warning(f"Failed to render text region: {e}")
            return (False, 'error')

    def render_all_regions(
        self,
        pdf_canvas: "canvas.Canvas",
        regions: List[Dict],
        page_height: float,
        scale_x: float = 1.0,
        scale_y: float = 1.0,
        page_filter: Optional[int] = None,
        exclusion_zones: Optional[List[Tuple[float, float, float, float]]] = None,
        zone_texts: Optional[Set[str]] = None
    ) -> int:
        """
        Render all OCR text regions to the PDF canvas.

        Args:
            pdf_canvas: ReportLab canvas to draw on
            regions: List of raw OCR region dicts
            page_height: Height of the PDF page
            scale_x: X coordinate scaling factor
            scale_y: Y coordinate scaling factor
            page_filter: If set, only render regions for this page index
            exclusion_zones: List of (x0, y0, x1, y1) rectangles to avoid
            zone_texts: Set of zone-internal texts (for position-aware deduplication)

        Returns:
            Number of regions successfully rendered
        """
        rendered_count = 0
        skipped_overlap = 0
        skipped_dedupe = 0
        for region in regions:
            # Filter by page if specified
            if page_filter is not None:
                region_page = region.get('page', 0)
                if region_page != page_filter:
                    continue
            success, skip_reason = self.render_text_region(
                pdf_canvas, region, page_height, scale_x, scale_y,
                exclusion_zones, zone_texts
            )
            if success:
                rendered_count += 1
            elif skip_reason == 'overlap':
                skipped_overlap += 1
            elif skip_reason == 'dedupe':
                skipped_dedupe += 1
        # Log results with skip counts
        total_processed = rendered_count + skipped_overlap + skipped_dedupe
        skip_parts = []
        if skipped_overlap > 0:
            skip_parts.append(f"{skipped_overlap} overlap")
        if skipped_dedupe > 0:
            skip_parts.append(f"{skipped_dedupe} dedupe")
        if skip_parts:
            logger.info(f"Rendered {rendered_count}/{total_processed} text regions "
                        f"(skipped: {', '.join(skip_parts)})")
        else:
            logger.info(f"Rendered {rendered_count}/{len(regions)} text regions")
        return rendered_count
def load_raw_ocr_regions(result_dir: str, task_id: str, page_num: int) -> List[Dict]:
    """
    Load raw OCR regions from the result directory.

    The regions are read from a JSON file named
    ``{task_id}_edit_page_{page_num}_raw_ocr_regions.json`` inside
    ``result_dir``.

    Args:
        result_dir: Path to the result directory
        task_id: Task ID
        page_num: Page number (1-indexed)

    Returns:
        List of raw OCR region dictionaries; an empty list if the file
        is missing or cannot be read/parsed.
    """
    from pathlib import Path
    import json

    # Construct filename pattern
    filename = f"{task_id}_edit_page_{page_num}_raw_ocr_regions.json"
    file_path = Path(result_dir) / filename
    if not file_path.exists():
        logger.warning(f"Raw OCR regions file not found: {file_path}")
        return []
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            regions = json.load(f)
        # Bug fix: this message previously logged the literal text
        # "(unknown)" instead of the actual file path.
        logger.info(f"Loaded {len(regions)} raw OCR regions from {file_path}")
        return regions
    except Exception as e:
        logger.error(f"Failed to load raw OCR regions: {e}")
        return []