chore: backup before code cleanup

Backup commit before executing remove-unused-code proposal.
This includes all pending changes and new features.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
egg
2025-12-11 11:55:39 +08:00
parent eff9b0bcd5
commit 940a406dce
58 changed files with 8226 additions and 175 deletions

View File

@@ -0,0 +1,583 @@
"""
Cell Validation Engine
Validates PP-StructureV3 table detections using metric-based heuristics
to filter over-detected cells and reclassify invalid tables as TEXT elements.
Metrics used:
- Cell density: cells per 10,000 px² (normal: 0.4-1.0, over-detected: 6+)
- Average cell area: px² per cell (normal: 10,000-25,000, over-detected: ~1,600)
- Cell height: table_height / cell_count (minimum: 10px for readable text)
"""
import logging
from dataclasses import dataclass
from typing import List, Dict, Any, Optional, Tuple
from html.parser import HTMLParser
import re
logger = logging.getLogger(__name__)
@dataclass
class CellValidationConfig:
    """Tunable thresholds for table-cell validation.

    Defaults follow the heuristics in the module docstring: normally
    detected tables sit well inside these bounds, while over-detected
    tables violate at least one of them.
    """
    # Maximum allowed cells per 10,000 px² of table area.
    max_cell_density: float = 3.0  # cells per 10,000 px²
    # Minimum average area (px²) each cell must occupy.
    min_avg_cell_area: float = 3000.0  # px² per cell
    # Minimum height (px) per cell row for readable text.
    min_cell_height: float = 10.0  # px per cell row
    # Master switch: when False, every table passes validation unchanged.
    enabled: bool = True
@dataclass
class TableValidationResult:
    """Outcome of validating a single table element.

    Carries the element itself so callers can pass it through or
    reclassify it without a second lookup.
    """
    # True when the table passed every enabled check.
    is_valid: bool
    # The element that was validated (not modified by validation).
    table_element: Dict[str, Any]
    # Human-readable explanation of the verdict, if any.
    reason: Optional[str] = None
    # Geometry metrics computed during validation, when available.
    metrics: Optional[Dict[str, float]] = None
class CellValidationEngine:
    """
    Metric- and content-based validator for PP-StructureV3 table elements.

    Tables whose geometry or cell content looks over-detected are flagged
    so callers can reclassify them as TEXT while keeping their contents.
    """

    def __init__(self, config: Optional[CellValidationConfig] = None):
        """Create an engine; falls back to default thresholds when no config is given."""
        self.config = config or CellValidationConfig()
def calculate_table_metrics(
        self,
        bbox: List[float],
        cell_boxes: List[List[float]]
) -> Dict[str, float]:
    """
    Calculate validation metrics for a table.

    Args:
        bbox: Table bounding box [x0, y0, x1, y1]
        cell_boxes: List of cell bounding boxes

    Returns:
        Dictionary with calculated metrics. Degenerate inputs (short bbox,
        no cells, non-positive area) yield zeroed ratio metrics, and every
        return now carries the same key set so downstream readers of e.g.
        ``metrics["table_area"]`` never hit a KeyError.
    """
    def _zeroed(count: int, width: float = 0.0, height: float = 0.0,
                area: float = 0.0) -> Dict[str, float]:
        # Same key set as the happy path; ratio metrics forced to 0.
        return {
            "cell_count": count,
            "table_width": width,
            "table_height": height,
            "table_area": area,
            "cell_density": 0,
            "avg_cell_area": 0,
            "avg_cell_height": 0,
        }

    # A bbox with fewer than 4 coords cannot describe a table region.
    if len(bbox) < 4:
        return _zeroed(0)

    cell_count = len(cell_boxes)
    table_width = bbox[2] - bbox[0]
    table_height = bbox[3] - bbox[1]
    table_area = table_width * table_height

    if cell_count == 0:
        return _zeroed(0, table_width, table_height, table_area)
    # Inverted or empty boxes make the ratio metrics meaningless.
    if table_area <= 0:
        return _zeroed(cell_count, table_width, table_height, table_area)

    return {
        "cell_count": cell_count,
        "table_width": table_width,
        "table_height": table_height,
        "table_area": table_area,
        # Cells per 10,000 px² — over-detected tables score far above normal.
        "cell_density": (cell_count / table_area) * 10000,
        "avg_cell_area": table_area / cell_count,
        # Table height divided by cell count approximates the row height.
        "avg_cell_height": table_height / cell_count,
    }
def validate_table(
        self,
        element: Dict[str, Any]
) -> TableValidationResult:
    """
    Validate a single table element.

    Args:
        element: Table element from PP-StructureV3 output

    Returns:
        TableValidationResult with validation status and metrics
    """
    # Validation disabled: accept everything unconditionally.
    if not self.config.enabled:
        return TableValidationResult(is_valid=True, table_element=element)

    bbox = element.get("bbox", [])
    cell_boxes = element.get("cell_boxes", [])

    # Structure-only tables (no cells) are accepted as-is.
    if not cell_boxes:
        return TableValidationResult(
            is_valid=True,
            table_element=element,
            reason="No cells to validate"
        )

    metrics = self.calculate_table_metrics(bbox, cell_boxes)
    cfg = self.config

    # Geometry checks, in fixed order; the first violation wins.
    failure = None
    if metrics["cell_density"] > cfg.max_cell_density:
        failure = (f"Cell density {metrics['cell_density']:.2f} "
                   f"exceeds threshold {cfg.max_cell_density}")
    elif metrics["avg_cell_area"] < cfg.min_avg_cell_area:
        failure = (f"Avg cell area {metrics['avg_cell_area']:.0f}px² "
                   f"below threshold {cfg.min_avg_cell_area}px²")
    elif metrics["avg_cell_height"] < cfg.min_cell_height:
        failure = (f"Avg cell height {metrics['avg_cell_height']:.1f}px "
                   f"below threshold {cfg.min_cell_height}px")
    if failure is not None:
        return TableValidationResult(
            is_valid=False,
            table_element=element,
            reason=failure,
            metrics=metrics
        )

    # Content heuristics: prose-like or layout-like HTML fails the table.
    content_check = self._validate_table_content(element)
    if not content_check["is_tabular"]:
        return TableValidationResult(
            is_valid=False,
            table_element=element,
            reason=content_check["reason"],
            metrics=metrics
        )

    return TableValidationResult(
        is_valid=True,
        table_element=element,
        metrics=metrics
    )
def _validate_table_content(self, element: Dict[str, Any]) -> Dict[str, Any]:
    """
    Validate table content to detect false positive tables.

    Checks:
    1. Sparsity: text coverage ratio (text area / table area)
    2. Header: does table have proper header structure
    3. Key-Value: for 2-col tables, is it a key-value list or random layout
    4. Prose: are cells containing long prose text

    Returns:
        Dict with is_tabular (bool) and reason (str)
    """
    html_content = element.get("content", "")
    bbox = element.get("bbox", [])
    cell_boxes = element.get("cell_boxes", [])
    # No table HTML to analyse: fail open and keep the element as a table.
    if not html_content or '<table' not in html_content.lower():
        return {"is_tabular": True, "reason": "no_html_content"}
    try:
        # Lazy third-party import; any failure (including a missing bs4)
        # is caught by the except below and the table is accepted.
        from bs4 import BeautifulSoup
        soup = BeautifulSoup(html_content, 'html.parser')
        table = soup.find('table')
        if not table:
            return {"is_tabular": True, "reason": "no_table_element"}
        rows = table.find_all('tr')
        if not rows:
            return {"is_tabular": True, "reason": "no_rows"}
        # Extract cell contents with row structure: one dict per cell,
        # collected both per-row (row_data) and flat (all_cells).
        row_data = []
        all_cells = []
        for row_idx, row in enumerate(rows):
            cells = row.find_all(['td', 'th'])
            row_cells = []
            for cell in cells:
                text = cell.get_text(strip=True)
                # NOTE(review): a non-numeric colspan attribute would raise
                # here; that lands in the fail-open except handler below.
                colspan = int(cell.get('colspan', 1))
                is_header = cell.name == 'th'
                cell_info = {
                    "text": text,
                    "length": len(text),
                    "colspan": colspan,
                    "is_header": is_header,
                    "row": row_idx
                }
                row_cells.append(cell_info)
                all_cells.append(cell_info)
            row_data.append(row_cells)
        if not all_cells:
            return {"is_tabular": True, "reason": "no_cells"}
        num_rows = len(row_data)
        num_cols = max(len(r) for r in row_data) if row_data else 0
        # === Check 1: Sparsity (text coverage) ===
        sparsity_result = self._check_sparsity(bbox, cell_boxes, all_cells)
        if not sparsity_result["is_valid"]:
            return {"is_tabular": False, "reason": sparsity_result["reason"]}
        # === Check 2: Header structure ===
        # Advisory only: a missing header lowers confidence but does not
        # reject the table by itself.
        header_result = self._check_header_structure(row_data, num_cols)
        if not header_result["has_header"] and num_rows > 3:
            # Large table without header is suspicious
            logger.debug(f"Table has no header structure with {num_rows} rows")
        # === Check 3: Key-Value pattern for 2-column tables ===
        if num_cols == 2:
            kv_result = self._check_key_value_pattern(row_data)
            if kv_result["is_kv_list"] and kv_result["confidence"] > 0.7:
                # High confidence key-value list - keep as table but log
                logger.debug(f"Table identified as key-value list (conf={kv_result['confidence']:.2f})")
            elif not kv_result["is_kv_list"] and kv_result["is_random_layout"]:
                # Random 2-column layout, not a real table
                return {
                    "is_tabular": False,
                    "reason": f"random_two_column_layout (not key-value)"
                }
        # === Check 4: Prose content ===
        # Cells over 80 chars look like sentences, not tabular values.
        long_cells = [c for c in all_cells if c["length"] > 80]
        prose_ratio = len(long_cells) / len(all_cells) if all_cells else 0
        if prose_ratio > 0.3:
            return {
                "is_tabular": False,
                "reason": f"prose_content ({len(long_cells)}/{len(all_cells)} cells > 80 chars)"
            }
        # === Check 5: Section header as table ===
        # A tiny table whose only first-row cell is a short ALL-CAPS string
        # is treated as a mis-detected section heading.
        if num_rows <= 2 and num_cols <= 2:
            first_row = row_data[0] if row_data else []
            if len(first_row) == 1:
                text = first_row[0]["text"]
                if text.isupper() and len(text) < 50:
                    return {
                        "is_tabular": False,
                        "reason": f"section_header_only ({text[:30]})"
                    }
        return {"is_tabular": True, "reason": "content_valid"}
    except Exception as e:
        # Fail open: content heuristics must never drop a real table.
        logger.warning(f"Content validation failed: {e}")
        return {"is_tabular": True, "reason": f"validation_error: {e}"}
def _check_sparsity(
self,
bbox: List[float],
cell_boxes: List[List[float]],
all_cells: List[Dict]
) -> Dict[str, Any]:
"""
Check text coverage ratio (sparsity).
Two-column layouts have large empty gaps in the middle.
Real tables have more uniform cell distribution.
"""
if len(bbox) < 4:
return {"is_valid": True, "reason": "no_bbox"}
table_width = bbox[2] - bbox[0]
table_height = bbox[3] - bbox[1]
table_area = table_width * table_height
if table_area <= 0:
return {"is_valid": True, "reason": "invalid_area"}
# Calculate text area from cell_boxes
if cell_boxes:
text_area = 0
for cb in cell_boxes:
if len(cb) >= 4:
w = abs(cb[2] - cb[0])
h = abs(cb[3] - cb[1])
text_area += w * h
coverage = text_area / table_area
else:
# Estimate from cell content length
total_chars = sum(c["length"] for c in all_cells)
# Rough estimate: 1 char ≈ 8x12 pixels = 96 px²
estimated_text_area = total_chars * 96
coverage = min(estimated_text_area / table_area, 1.0)
# Very sparse table (< 15% coverage) is suspicious
if coverage < 0.15:
return {
"is_valid": False,
"reason": f"sparse_content (coverage={coverage:.1%})"
}
return {"is_valid": True, "coverage": coverage}
def _check_header_structure(
self,
row_data: List[List[Dict]],
num_cols: int
) -> Dict[str, Any]:
"""
Check if table has proper header structure.
Real tables usually have:
- First row with <th> elements
- Or first row with different content pattern (labels vs values)
"""
if not row_data:
return {"has_header": False}
first_row = row_data[0]
# Check for <th> elements
th_count = sum(1 for c in first_row if c.get("is_header", False))
if th_count > 0 and th_count >= len(first_row) * 0.5:
return {"has_header": True, "type": "th_elements"}
# Check for header-like content (short, distinct from body)
if len(row_data) > 1:
first_row_avg_len = sum(c["length"] for c in first_row) / len(first_row) if first_row else 0
body_rows = row_data[1:]
body_cells = [c for row in body_rows for c in row]
body_avg_len = sum(c["length"] for c in body_cells) / len(body_cells) if body_cells else 0
# Header row should be shorter (labels) than body (data)
if first_row_avg_len < body_avg_len * 0.7:
return {"has_header": True, "type": "short_labels"}
return {"has_header": False}
def _check_key_value_pattern(
self,
row_data: List[List[Dict]]
) -> Dict[str, Any]:
"""
For 2-column tables, check if it's a key-value list.
Key-value characteristics:
- Left column: short labels (< 30 chars)
- Right column: values (can be longer)
- Consistent pattern across rows
Random layout characteristics:
- Both columns have similar length distribution
- No clear label-value relationship
"""
if not row_data:
return {"is_kv_list": False, "is_random_layout": False, "confidence": 0}
left_lengths = []
right_lengths = []
kv_rows = 0
total_rows = 0
for row in row_data:
if len(row) != 2:
continue
total_rows += 1
left = row[0]
right = row[1]
left_lengths.append(left["length"])
right_lengths.append(right["length"])
# Key-value pattern: left is short label, right is value
if left["length"] < 40 and left["length"] < right["length"] * 2:
kv_rows += 1
if total_rows == 0:
return {"is_kv_list": False, "is_random_layout": False, "confidence": 0}
kv_ratio = kv_rows / total_rows
avg_left = sum(left_lengths) / len(left_lengths) if left_lengths else 0
avg_right = sum(right_lengths) / len(right_lengths) if right_lengths else 0
# High KV ratio and left column is shorter = key-value list
if kv_ratio > 0.6 and avg_left < avg_right:
return {
"is_kv_list": True,
"is_random_layout": False,
"confidence": kv_ratio,
"avg_left": avg_left,
"avg_right": avg_right
}
# Similar lengths on both sides = random layout
if avg_left > 0 and 0.5 < avg_right / avg_left < 2.0:
# Both columns have similar content length
return {
"is_kv_list": False,
"is_random_layout": True,
"confidence": 1 - kv_ratio,
"avg_left": avg_left,
"avg_right": avg_right
}
return {
"is_kv_list": False,
"is_random_layout": False,
"confidence": 0,
"avg_left": avg_left,
"avg_right": avg_right
}
def extract_text_from_table_html(self, html_content: str) -> str:
    """
    Extract plain text from table HTML content.

    Args:
        html_content: HTML string containing table structure

    Returns:
        Space-joined text of all <td>/<th> cells; on a parser failure,
        a regex-based tag strip of the whole input is returned instead.
    """
    if not html_content:
        return ""
    try:
        class _CellTextCollector(HTMLParser):
            """Accumulates stripped text that appears inside td/th tags."""

            def __init__(self):
                super().__init__()
                self.pieces = []
                self._inside_cell = False

            def handle_starttag(self, tag, attrs):
                if tag in ('td', 'th'):
                    self._inside_cell = True

            def handle_endtag(self, tag):
                if tag in ('td', 'th'):
                    self._inside_cell = False

            def handle_data(self, data):
                # Only keep non-blank text found inside a cell.
                if self._inside_cell:
                    piece = data.strip()
                    if piece:
                        self.pieces.append(piece)

        collector = _CellTextCollector()
        collector.feed(html_content)
        return ' '.join(collector.pieces)
    except Exception as e:
        logger.warning(f"Failed to parse table HTML: {e}")
        # Fallback: strip HTML tags with regex
        plain = re.sub(r'<[^>]+>', ' ', html_content)
        return re.sub(r'\s+', ' ', plain).strip()
def reclassify_as_text(self, element: Dict[str, Any]) -> Dict[str, Any]:
"""
Convert an over-detected table element to a TEXT element.
Args:
element: Table element to reclassify
Returns:
New TEXT element with preserved content
"""
# Extract text content from HTML
html_content = element.get("content", "")
text_content = self.extract_text_from_table_html(html_content)
# Create new TEXT element
text_element = {
"element_id": element.get("element_id", ""),
"type": "text",
"original_type": "table_reclassified", # Mark as reclassified
"content": text_content,
"page": element.get("page", 0),
"bbox": element.get("bbox", []),
"index": element.get("index", 0),
"confidence": element.get("confidence", 1.0),
"reclassified_from": "table",
"reclassification_reason": "over_detection"
}
return text_element
def validate_and_filter_elements(
self,
elements: List[Dict[str, Any]]
) -> Tuple[List[Dict[str, Any]], Dict[str, Any]]:
"""
Validate all elements and filter/reclassify over-detected tables.
Args:
elements: List of elements from PP-StructureV3 output
Returns:
Tuple of (filtered_elements, statistics)
"""
filtered_elements = []
stats = {
"total_tables": 0,
"valid_tables": 0,
"reclassified_tables": 0,
"reclassification_details": []
}
for element in elements:
if element.get("type") != "table":
# Non-table elements pass through unchanged
filtered_elements.append(element)
continue
stats["total_tables"] += 1
# Validate table
result = self.validate_table(element)
if result.is_valid:
stats["valid_tables"] += 1
filtered_elements.append(element)
else:
# Reclassify as TEXT
stats["reclassified_tables"] += 1
text_element = self.reclassify_as_text(element)
filtered_elements.append(text_element)
stats["reclassification_details"].append({
"element_id": element.get("element_id"),
"reason": result.reason,
"metrics": result.metrics
})
logger.info(
f"Reclassified table {element.get('element_id')} as TEXT: {result.reason}"
)
# Re-sort by reading order (y0 then x0)
filtered_elements = self._sort_by_reading_order(filtered_elements)
return filtered_elements, stats
def _sort_by_reading_order(
self,
elements: List[Dict[str, Any]]
) -> List[Dict[str, Any]]:
"""Sort elements by reading order (top-to-bottom, left-to-right)."""
def sort_key(elem):
bbox = elem.get("bbox", [0, 0, 0, 0])
if isinstance(bbox, dict):
y0 = bbox.get("y0", 0)
x0 = bbox.get("x0", 0)
elif isinstance(bbox, list) and len(bbox) >= 2:
x0, y0 = bbox[0], bbox[1]
else:
y0, x0 = 0, 0
return (y0, x0)
return sorted(elements, key=sort_key)