# OCR/backend/app/services/cell_validation_engine.py

"""
Cell Validation Engine

Validates PP-StructureV3 table detections using metric-based heuristics
to filter over-detected cells and reclassify invalid tables as TEXT elements.

Metrics used:
- Cell density: cells per 10,000 px² (normal: 0.4-1.0, over-detected: 6+)
- Average cell area: px² per cell (normal: 10,000-25,000, over-detected: ~1,600)
- Cell height: table_height / cell_count (minimum: 10px for readable text)
"""

import logging
from dataclasses import dataclass
from typing import List, Dict, Any, Optional, Tuple
from html.parser import HTMLParser
import re

logger = logging.getLogger(__name__)


@dataclass
class CellValidationConfig:
    """
    Thresholds used by CellValidationEngine.validate_table.

    A table fails validation as soon as one of the geometric metrics
    crosses its threshold; the defaults below are the values shipped
    with this module.
    """
    # Upper bound on cell density (cells per 10,000 px² of table area).
    # A table whose density is strictly greater than this is rejected.
    max_cell_density: float = 3.0
    # Lower bound on average cell area (table area / cell count, in px²).
    # A table whose average is strictly below this is rejected.
    min_avg_cell_area: float = 3000.0
    # Lower bound on "average cell height" in px. Note the metric is
    # table height divided by the total number of cells, not by rows.
    min_cell_height: float = 10.0
    # When False, validate_table reports every table as valid without
    # computing any metric.
    enabled: bool = True


@dataclass
class TableValidationResult:
    """
    Outcome of validating one table element.

    Produced by CellValidationEngine.validate_table and consumed by
    validate_and_filter_elements to decide whether to keep the table or
    reclassify it as text.
    """
    # True when the table passed (or validation was skipped).
    is_valid: bool
    # The element dict that was validated, passed through as-is.
    table_element: Dict[str, Any]
    # Human-readable explanation; set for every failure and for some
    # short-circuit passes (e.g. "No cells to validate").
    reason: Optional[str] = None
    # Metrics from calculate_table_metrics; None when validation
    # returned before metrics were computed.
    metrics: Optional[Dict[str, float]] = None


class CellValidationEngine:
    """
    Validates table elements from PP-StructureV3 output.

    Over-detected tables are identified by abnormal metrics and
    reclassified as TEXT elements while preserving content.
    """

    def __init__(self, config: Optional[CellValidationConfig] = None):
        self.config = config or CellValidationConfig()

    def calculate_table_metrics(
        self,
        bbox: List[float],
        cell_boxes: List[List[float]]
    ) -> Dict[str, float]:
        """
        Calculate validation metrics for a table.

        Args:
            bbox: Table bounding box [x0, y0, x1, y1]
            cell_boxes: List of cell bounding boxes

        Returns:
            Dictionary with calculated metrics
        """
        if len(bbox) < 4:
            return {"cell_count": 0, "cell_density": 0, "avg_cell_area": 0, "avg_cell_height": 0}

        cell_count = len(cell_boxes)
        if cell_count == 0:
            return {"cell_count": 0, "cell_density": 0, "avg_cell_area": 0, "avg_cell_height": 0}

        # Calculate table dimensions
        table_width = bbox[2] - bbox[0]
        table_height = bbox[3] - bbox[1]
        table_area = table_width * table_height

        if table_area <= 0:
            return {"cell_count": cell_count, "cell_density": 0, "avg_cell_area": 0, "avg_cell_height": 0}

        # Cell density: cells per 10,000 px²
        cell_density = (cell_count / table_area) * 10000

        # Average cell area
        avg_cell_area = table_area / cell_count

        # Average cell height (table height / cell count)
        avg_cell_height = table_height / cell_count

        return {
            "cell_count": cell_count,
            "table_width": table_width,
            "table_height": table_height,
            "table_area": table_area,
            "cell_density": cell_density,
            "avg_cell_area": avg_cell_area,
            "avg_cell_height": avg_cell_height
        }

    def validate_table(
        self,
        element: Dict[str, Any]
    ) -> TableValidationResult:
        """
        Validate a single table element.

        Args:
            element: Table element from PP-StructureV3 output

        Returns:
            TableValidationResult with validation status and metrics
        """
        if not self.config.enabled:
            return TableValidationResult(is_valid=True, table_element=element)

        # Extract bbox and cell_boxes
        bbox = element.get("bbox", [])
        cell_boxes = element.get("cell_boxes", [])

        # Tables without cells pass validation (structure-only tables)
        if not cell_boxes:
            return TableValidationResult(
                is_valid=True,
                table_element=element,
                reason="No cells to validate"
            )

        # Calculate metrics
        metrics = self.calculate_table_metrics(bbox, cell_boxes)

        # Check cell density
        if metrics["cell_density"] > self.config.max_cell_density:
            return TableValidationResult(
                is_valid=False,
                table_element=element,
                reason=f"Cell density {metrics['cell_density']:.2f} exceeds threshold {self.config.max_cell_density}",
                metrics=metrics
            )

        # Check average cell area
        if metrics["avg_cell_area"] < self.config.min_avg_cell_area:
            return TableValidationResult(
                is_valid=False,
                table_element=element,
                reason=f"Avg cell area {metrics['avg_cell_area']:.0f}px² below threshold {self.config.min_avg_cell_area}px²",
                metrics=metrics
            )

        # Check cell height
        if metrics["avg_cell_height"] < self.config.min_cell_height:
            return TableValidationResult(
                is_valid=False,
                table_element=element,
                reason=f"Avg cell height {metrics['avg_cell_height']:.1f}px below threshold {self.config.min_cell_height}px",
                metrics=metrics
            )

        # Content-based validation: check if content looks like prose vs tabular data
        content_check = self._validate_table_content(element)
        if not content_check["is_tabular"]:
            return TableValidationResult(
                is_valid=False,
                table_element=element,
                reason=content_check["reason"],
                metrics=metrics
            )

        return TableValidationResult(
            is_valid=True,
            table_element=element,
            metrics=metrics
        )

    def _validate_table_content(self, element: Dict[str, Any]) -> Dict[str, Any]:
        """
        Validate table content to detect false positive tables.

        Checks:
        1. Sparsity: text coverage ratio (text area / table area)
        2. Header: does table have proper header structure
        3. Key-Value: for 2-col tables, is it a key-value list or random layout
        4. Prose: are cells containing long prose text

        Returns:
            Dict with is_tabular (bool) and reason (str)
        """
        html_content = element.get("content", "")
        bbox = element.get("bbox", [])
        cell_boxes = element.get("cell_boxes", [])

        if not html_content or '<table' not in html_content.lower():
            return {"is_tabular": True, "reason": "no_html_content"}

        try:
            from bs4 import BeautifulSoup
            soup = BeautifulSoup(html_content, 'html.parser')
            table = soup.find('table')
            if not table:
                return {"is_tabular": True, "reason": "no_table_element"}

            rows = table.find_all('tr')
            if not rows:
                return {"is_tabular": True, "reason": "no_rows"}

            # Extract cell contents with row structure
            row_data = []
            all_cells = []
            for row_idx, row in enumerate(rows):
                cells = row.find_all(['td', 'th'])
                row_cells = []
                for cell in cells:
                    text = cell.get_text(strip=True)
                    colspan = int(cell.get('colspan', 1))
                    is_header = cell.name == 'th'
                    cell_info = {
                        "text": text,
                        "length": len(text),
                        "colspan": colspan,
                        "is_header": is_header,
                        "row": row_idx
                    }
                    row_cells.append(cell_info)
                    all_cells.append(cell_info)
                row_data.append(row_cells)

            if not all_cells:
                return {"is_tabular": True, "reason": "no_cells"}

            num_rows = len(row_data)
            num_cols = max(len(r) for r in row_data) if row_data else 0

            # === Check 1: Sparsity (text coverage) ===
            sparsity_result = self._check_sparsity(bbox, cell_boxes, all_cells)
            if not sparsity_result["is_valid"]:
                return {"is_tabular": False, "reason": sparsity_result["reason"]}

            # === Check 2: Header structure ===
            header_result = self._check_header_structure(row_data, num_cols)
            if not header_result["has_header"] and num_rows > 3:
                # Large table without header is suspicious
                logger.debug(f"Table has no header structure with {num_rows} rows")

            # === Check 3: Key-Value pattern for 2-column tables ===
            if num_cols == 2:
                kv_result = self._check_key_value_pattern(row_data)
                if kv_result["is_kv_list"] and kv_result["confidence"] > 0.7:
                    # High confidence key-value list - keep as table but log
                    logger.debug(f"Table identified as key-value list (conf={kv_result['confidence']:.2f})")
                elif not kv_result["is_kv_list"] and kv_result["is_random_layout"]:
                    # Random 2-column layout, not a real table
                    return {
                        "is_tabular": False,
                        "reason": f"random_two_column_layout (not key-value)"
                    }

            # === Check 4: Prose content ===
            long_cells = [c for c in all_cells if c["length"] > 80]
            prose_ratio = len(long_cells) / len(all_cells) if all_cells else 0
            if prose_ratio > 0.3:
                return {
                    "is_tabular": False,
                    "reason": f"prose_content ({len(long_cells)}/{len(all_cells)} cells > 80 chars)"
                }

            # === Check 5: Section header as table ===
            if num_rows <= 2 and num_cols <= 2:
                first_row = row_data[0] if row_data else []
                if len(first_row) == 1:
                    text = first_row[0]["text"]
                    if text.isupper() and len(text) < 50:
                        return {
                            "is_tabular": False,
                            "reason": f"section_header_only ({text[:30]})"
                        }

            return {"is_tabular": True, "reason": "content_valid"}

        except Exception as e:
            logger.warning(f"Content validation failed: {e}")
            return {"is_tabular": True, "reason": f"validation_error: {e}"}

    def _check_sparsity(
        self,
        bbox: List[float],
        cell_boxes: List[List[float]],
        all_cells: List[Dict]
    ) -> Dict[str, Any]:
        """
        Check text coverage ratio (sparsity).

        Two-column layouts have large empty gaps in the middle.
        Real tables have more uniform cell distribution.
        """
        if len(bbox) < 4:
            return {"is_valid": True, "reason": "no_bbox"}

        table_width = bbox[2] - bbox[0]
        table_height = bbox[3] - bbox[1]
        table_area = table_width * table_height

        if table_area <= 0:
            return {"is_valid": True, "reason": "invalid_area"}

        # Calculate text area from cell_boxes
        if cell_boxes:
            text_area = 0
            for cb in cell_boxes:
                if len(cb) >= 4:
                    w = abs(cb[2] - cb[0])
                    h = abs(cb[3] - cb[1])
                    text_area += w * h
            coverage = text_area / table_area
        else:
            # Estimate from cell content length
            total_chars = sum(c["length"] for c in all_cells)
            # Rough estimate: 1 char ≈ 8x12 pixels = 96 px²
            estimated_text_area = total_chars * 96
            coverage = min(estimated_text_area / table_area, 1.0)

        # Very sparse table (< 15% coverage) is suspicious
        if coverage < 0.15:
            return {
                "is_valid": False,
                "reason": f"sparse_content (coverage={coverage:.1%})"
            }

        return {"is_valid": True, "coverage": coverage}

    def _check_header_structure(
        self,
        row_data: List[List[Dict]],
        num_cols: int
    ) -> Dict[str, Any]:
        """
        Check if table has proper header structure.

        Real tables usually have:
        - First row with <th> elements
        - Or first row with different content pattern (labels vs values)
        """
        if not row_data:
            return {"has_header": False}

        first_row = row_data[0]

        # Check for <th> elements
        th_count = sum(1 for c in first_row if c.get("is_header", False))
        if th_count > 0 and th_count >= len(first_row) * 0.5:
            return {"has_header": True, "type": "th_elements"}

        # Check for header-like content (short, distinct from body)
        if len(row_data) > 1:
            first_row_avg_len = sum(c["length"] for c in first_row) / len(first_row) if first_row else 0
            body_rows = row_data[1:]
            body_cells = [c for row in body_rows for c in row]
            body_avg_len = sum(c["length"] for c in body_cells) / len(body_cells) if body_cells else 0

            # Header row should be shorter (labels) than body (data)
            if first_row_avg_len < body_avg_len * 0.7:
                return {"has_header": True, "type": "short_labels"}

        return {"has_header": False}

    def _check_key_value_pattern(
        self,
        row_data: List[List[Dict]]
    ) -> Dict[str, Any]:
        """
        For 2-column tables, check if it's a key-value list.

        Key-value characteristics:
        - Left column: short labels (< 30 chars)
        - Right column: values (can be longer)
        - Consistent pattern across rows

        Random layout characteristics:
        - Both columns have similar length distribution
        - No clear label-value relationship
        """
        if not row_data:
            return {"is_kv_list": False, "is_random_layout": False, "confidence": 0}

        left_lengths = []
        right_lengths = []
        kv_rows = 0
        total_rows = 0

        for row in row_data:
            if len(row) != 2:
                continue
            total_rows += 1
            left = row[0]
            right = row[1]
            left_lengths.append(left["length"])
            right_lengths.append(right["length"])

            # Key-value pattern: left is short label, right is value
            if left["length"] < 40 and left["length"] < right["length"] * 2:
                kv_rows += 1

        if total_rows == 0:
            return {"is_kv_list": False, "is_random_layout": False, "confidence": 0}

        kv_ratio = kv_rows / total_rows
        avg_left = sum(left_lengths) / len(left_lengths) if left_lengths else 0
        avg_right = sum(right_lengths) / len(right_lengths) if right_lengths else 0

        # High KV ratio and left column is shorter = key-value list
        if kv_ratio > 0.6 and avg_left < avg_right:
            return {
                "is_kv_list": True,
                "is_random_layout": False,
                "confidence": kv_ratio,
                "avg_left": avg_left,
                "avg_right": avg_right
            }

        # Similar lengths on both sides = random layout
        if avg_left > 0 and 0.5 < avg_right / avg_left < 2.0:
            # Both columns have similar content length
            return {
                "is_kv_list": False,
                "is_random_layout": True,
                "confidence": 1 - kv_ratio,
                "avg_left": avg_left,
                "avg_right": avg_right
            }

        return {
            "is_kv_list": False,
            "is_random_layout": False,
            "confidence": 0,
            "avg_left": avg_left,
            "avg_right": avg_right
        }

    def extract_text_from_table_html(self, html_content: str) -> str:
        """
        Extract plain text from table HTML content.

        Args:
            html_content: HTML string containing table structure

        Returns:
            Plain text extracted from table cells
        """
        if not html_content:
            return ""

        try:
            class TableTextExtractor(HTMLParser):
                def __init__(self):
                    super().__init__()
                    self.text_parts = []
                    self.in_cell = False

                def handle_starttag(self, tag, attrs):
                    if tag in ('td', 'th'):
                        self.in_cell = True

                def handle_endtag(self, tag):
                    if tag in ('td', 'th'):
                        self.in_cell = False

                def handle_data(self, data):
                    if self.in_cell:
                        stripped = data.strip()
                        if stripped:
                            self.text_parts.append(stripped)

            parser = TableTextExtractor()
            parser.feed(html_content)
            return ' '.join(parser.text_parts)
        except Exception as e:
            logger.warning(f"Failed to parse table HTML: {e}")
            # Fallback: strip HTML tags with regex
            text = re.sub(r'<[^>]+>', ' ', html_content)
            text = re.sub(r'\s+', ' ', text).strip()
            return text

    def reclassify_as_text(self, element: Dict[str, Any]) -> Dict[str, Any]:
        """
        Convert an over-detected table element to a TEXT element.

        Args:
            element: Table element to reclassify

        Returns:
            New TEXT element with preserved content
        """
        # Extract text content from HTML
        html_content = element.get("content", "")
        text_content = self.extract_text_from_table_html(html_content)

        # Create new TEXT element
        text_element = {
            "element_id": element.get("element_id", ""),
            "type": "text",
            "original_type": "table_reclassified",  # Mark as reclassified
            "content": text_content,
            "page": element.get("page", 0),
            "bbox": element.get("bbox", []),
            "index": element.get("index", 0),
            "confidence": element.get("confidence", 1.0),
            "reclassified_from": "table",
            "reclassification_reason": "over_detection"
        }

        return text_element

    def validate_and_filter_elements(
        self,
        elements: List[Dict[str, Any]]
    ) -> Tuple[List[Dict[str, Any]], Dict[str, Any]]:
        """
        Validate all elements and filter/reclassify over-detected tables.

        Args:
            elements: List of elements from PP-StructureV3 output

        Returns:
            Tuple of (filtered_elements, statistics)
        """
        filtered_elements = []
        stats = {
            "total_tables": 0,
            "valid_tables": 0,
            "reclassified_tables": 0,
            "reclassification_details": []
        }

        for element in elements:
            if element.get("type") != "table":
                # Non-table elements pass through unchanged
                filtered_elements.append(element)
                continue

            stats["total_tables"] += 1

            # Validate table
            result = self.validate_table(element)

            if result.is_valid:
                stats["valid_tables"] += 1
                filtered_elements.append(element)
            else:
                # Reclassify as TEXT
                stats["reclassified_tables"] += 1
                text_element = self.reclassify_as_text(element)
                filtered_elements.append(text_element)

                stats["reclassification_details"].append({
                    "element_id": element.get("element_id"),
                    "reason": result.reason,
                    "metrics": result.metrics
                })

                logger.info(
                    f"Reclassified table {element.get('element_id')} as TEXT: {result.reason}"
                )

        # Re-sort by reading order (y0 then x0)
        filtered_elements = self._sort_by_reading_order(filtered_elements)

        return filtered_elements, stats

    def _sort_by_reading_order(
        self,
        elements: List[Dict[str, Any]]
    ) -> List[Dict[str, Any]]:
        """Sort elements by reading order (top-to-bottom, left-to-right)."""
        def sort_key(elem):
            bbox = elem.get("bbox", [0, 0, 0, 0])
            if isinstance(bbox, dict):
                y0 = bbox.get("y0", 0)
                x0 = bbox.get("x0", 0)
            elif isinstance(bbox, list) and len(bbox) >= 2:
                x0, y0 = bbox[0], bbox[1]
            else:
                y0, x0 = 0, 0
            return (y0, x0)

        return sorted(elements, key=sort_key)