""" Cell Validation Engine Validates PP-StructureV3 table detections using metric-based heuristics to filter over-detected cells and reclassify invalid tables as TEXT elements. Metrics used: - Cell density: cells per 10,000 px² (normal: 0.4-1.0, over-detected: 6+) - Average cell area: px² per cell (normal: 10,000-25,000, over-detected: ~1,600) - Cell height: table_height / cell_count (minimum: 10px for readable text) """ import logging from dataclasses import dataclass from typing import List, Dict, Any, Optional, Tuple from html.parser import HTMLParser import re logger = logging.getLogger(__name__) @dataclass class CellValidationConfig: """Configuration for cell validation thresholds.""" max_cell_density: float = 3.0 # cells per 10,000 px² min_avg_cell_area: float = 3000.0 # px² per cell min_cell_height: float = 10.0 # px per cell row enabled: bool = True @dataclass class TableValidationResult: """Result of table validation.""" is_valid: bool table_element: Dict[str, Any] reason: Optional[str] = None metrics: Optional[Dict[str, float]] = None class CellValidationEngine: """ Validates table elements from PP-StructureV3 output. Over-detected tables are identified by abnormal metrics and reclassified as TEXT elements while preserving content. """ def __init__(self, config: Optional[CellValidationConfig] = None): self.config = config or CellValidationConfig() def calculate_table_metrics( self, bbox: List[float], cell_boxes: List[List[float]] ) -> Dict[str, float]: """ Calculate validation metrics for a table. Args: bbox: Table bounding box [x0, y0, x1, y1] cell_boxes: List of cell bounding boxes Returns: Dictionary with calculated metrics """ if len(bbox) < 4: return {"cell_count": 0, "cell_density": 0, "avg_cell_area": 0, "avg_cell_height": 0} cell_count = len(cell_boxes) if cell_count == 0: return {"cell_count": 0, "cell_density": 0, "avg_cell_area": 0, "avg_cell_height": 0} # Calculate table dimensions table_width = bbox[2] - bbox[0] table_height = bbox[3] - bbox[1] table_area = table_width * table_height if table_area <= 0: return {"cell_count": cell_count, "cell_density": 0, "avg_cell_area": 0, "avg_cell_height": 0} # Cell density: cells per 10,000 px² cell_density = (cell_count / table_area) * 10000 # Average cell area avg_cell_area = table_area / cell_count # Average cell height (table height / cell count) avg_cell_height = table_height / cell_count return { "cell_count": cell_count, "table_width": table_width, "table_height": table_height, "table_area": table_area, "cell_density": cell_density, "avg_cell_area": avg_cell_area, "avg_cell_height": avg_cell_height } def validate_table( self, element: Dict[str, Any] ) -> TableValidationResult: """ Validate a single table element. Args: element: Table element from PP-StructureV3 output Returns: TableValidationResult with validation status and metrics """ if not self.config.enabled: return TableValidationResult(is_valid=True, table_element=element) # Extract bbox and cell_boxes bbox = element.get("bbox", []) cell_boxes = element.get("cell_boxes", []) # Tables without cells pass validation (structure-only tables) if not cell_boxes: return TableValidationResult( is_valid=True, table_element=element, reason="No cells to validate" ) # Calculate metrics metrics = self.calculate_table_metrics(bbox, cell_boxes) # Check cell density if metrics["cell_density"] > self.config.max_cell_density: return TableValidationResult( is_valid=False, table_element=element, reason=f"Cell density {metrics['cell_density']:.2f} exceeds threshold {self.config.max_cell_density}", metrics=metrics ) # Check average cell area if metrics["avg_cell_area"] < self.config.min_avg_cell_area: return TableValidationResult( is_valid=False, table_element=element, reason=f"Avg cell area {metrics['avg_cell_area']:.0f}px² below threshold {self.config.min_avg_cell_area}px²", metrics=metrics ) # Check cell height if metrics["avg_cell_height"] < self.config.min_cell_height: return TableValidationResult( is_valid=False, table_element=element, reason=f"Avg cell height {metrics['avg_cell_height']:.1f}px below threshold {self.config.min_cell_height}px", metrics=metrics ) # Content-based validation: check if content looks like prose vs tabular data content_check = self._validate_table_content(element) if not content_check["is_tabular"]: return TableValidationResult( is_valid=False, table_element=element, reason=content_check["reason"], metrics=metrics ) return TableValidationResult( is_valid=True, table_element=element, metrics=metrics ) def _validate_table_content(self, element: Dict[str, Any]) -> Dict[str, Any]: """ Validate table content to detect false positive tables. Checks: 1. Sparsity: text coverage ratio (text area / table area) 2. Header: does table have proper header structure 3. Key-Value: for 2-col tables, is it a key-value list or random layout 4. Prose: are cells containing long prose text Returns: Dict with is_tabular (bool) and reason (str) """ html_content = element.get("content", "") bbox = element.get("bbox", []) cell_boxes = element.get("cell_boxes", []) if not html_content or ' 3: # Large table without header is suspicious logger.debug(f"Table has no header structure with {num_rows} rows") # === Check 3: Key-Value pattern for 2-column tables === if num_cols == 2: kv_result = self._check_key_value_pattern(row_data) if kv_result["is_kv_list"] and kv_result["confidence"] > 0.7: # High confidence key-value list - keep as table but log logger.debug(f"Table identified as key-value list (conf={kv_result['confidence']:.2f})") elif not kv_result["is_kv_list"] and kv_result["is_random_layout"]: # Random 2-column layout, not a real table return { "is_tabular": False, "reason": f"random_two_column_layout (not key-value)" } # === Check 4: Prose content === long_cells = [c for c in all_cells if c["length"] > 80] prose_ratio = len(long_cells) / len(all_cells) if all_cells else 0 if prose_ratio > 0.3: return { "is_tabular": False, "reason": f"prose_content ({len(long_cells)}/{len(all_cells)} cells > 80 chars)" } # === Check 5: Section header as table === if num_rows <= 2 and num_cols <= 2: first_row = row_data[0] if row_data else [] if len(first_row) == 1: text = first_row[0]["text"] if text.isupper() and len(text) < 50: return { "is_tabular": False, "reason": f"section_header_only ({text[:30]})" } return {"is_tabular": True, "reason": "content_valid"} except Exception as e: logger.warning(f"Content validation failed: {e}") return {"is_tabular": True, "reason": f"validation_error: {e}"} def _check_sparsity( self, bbox: List[float], cell_boxes: List[List[float]], all_cells: List[Dict] ) -> Dict[str, Any]: """ Check text coverage ratio (sparsity). Two-column layouts have large empty gaps in the middle. Real tables have more uniform cell distribution. """ if len(bbox) < 4: return {"is_valid": True, "reason": "no_bbox"} table_width = bbox[2] - bbox[0] table_height = bbox[3] - bbox[1] table_area = table_width * table_height if table_area <= 0: return {"is_valid": True, "reason": "invalid_area"} # Calculate text area from cell_boxes if cell_boxes: text_area = 0 for cb in cell_boxes: if len(cb) >= 4: w = abs(cb[2] - cb[0]) h = abs(cb[3] - cb[1]) text_area += w * h coverage = text_area / table_area else: # Estimate from cell content length total_chars = sum(c["length"] for c in all_cells) # Rough estimate: 1 char ≈ 8x12 pixels = 96 px² estimated_text_area = total_chars * 96 coverage = min(estimated_text_area / table_area, 1.0) # Very sparse table (< 15% coverage) is suspicious if coverage < 0.15: return { "is_valid": False, "reason": f"sparse_content (coverage={coverage:.1%})" } return {"is_valid": True, "coverage": coverage} def _check_header_structure( self, row_data: List[List[Dict]], num_cols: int ) -> Dict[str, Any]: """ Check if table has proper header structure. Real tables usually have: - First row with elements - Or first row with different content pattern (labels vs values) """ if not row_data: return {"has_header": False} first_row = row_data[0] # Check for elements th_count = sum(1 for c in first_row if c.get("is_header", False)) if th_count > 0 and th_count >= len(first_row) * 0.5: return {"has_header": True, "type": "th_elements"} # Check for header-like content (short, distinct from body) if len(row_data) > 1: first_row_avg_len = sum(c["length"] for c in first_row) / len(first_row) if first_row else 0 body_rows = row_data[1:] body_cells = [c for row in body_rows for c in row] body_avg_len = sum(c["length"] for c in body_cells) / len(body_cells) if body_cells else 0 # Header row should be shorter (labels) than body (data) if first_row_avg_len < body_avg_len * 0.7: return {"has_header": True, "type": "short_labels"} return {"has_header": False} def _check_key_value_pattern( self, row_data: List[List[Dict]] ) -> Dict[str, Any]: """ For 2-column tables, check if it's a key-value list. Key-value characteristics: - Left column: short labels (< 30 chars) - Right column: values (can be longer) - Consistent pattern across rows Random layout characteristics: - Both columns have similar length distribution - No clear label-value relationship """ if not row_data: return {"is_kv_list": False, "is_random_layout": False, "confidence": 0} left_lengths = [] right_lengths = [] kv_rows = 0 total_rows = 0 for row in row_data: if len(row) != 2: continue total_rows += 1 left = row[0] right = row[1] left_lengths.append(left["length"]) right_lengths.append(right["length"]) # Key-value pattern: left is short label, right is value if left["length"] < 40 and left["length"] < right["length"] * 2: kv_rows += 1 if total_rows == 0: return {"is_kv_list": False, "is_random_layout": False, "confidence": 0} kv_ratio = kv_rows / total_rows avg_left = sum(left_lengths) / len(left_lengths) if left_lengths else 0 avg_right = sum(right_lengths) / len(right_lengths) if right_lengths else 0 # High KV ratio and left column is shorter = key-value list if kv_ratio > 0.6 and avg_left < avg_right: return { "is_kv_list": True, "is_random_layout": False, "confidence": kv_ratio, "avg_left": avg_left, "avg_right": avg_right } # Similar lengths on both sides = random layout if avg_left > 0 and 0.5 < avg_right / avg_left < 2.0: # Both columns have similar content length return { "is_kv_list": False, "is_random_layout": True, "confidence": 1 - kv_ratio, "avg_left": avg_left, "avg_right": avg_right } return { "is_kv_list": False, "is_random_layout": False, "confidence": 0, "avg_left": avg_left, "avg_right": avg_right } def extract_text_from_table_html(self, html_content: str) -> str: """ Extract plain text from table HTML content. Args: html_content: HTML string containing table structure Returns: Plain text extracted from table cells """ if not html_content: return "" try: class TableTextExtractor(HTMLParser): def __init__(self): super().__init__() self.text_parts = [] self.in_cell = False def handle_starttag(self, tag, attrs): if tag in ('td', 'th'): self.in_cell = True def handle_endtag(self, tag): if tag in ('td', 'th'): self.in_cell = False def handle_data(self, data): if self.in_cell: stripped = data.strip() if stripped: self.text_parts.append(stripped) parser = TableTextExtractor() parser.feed(html_content) return ' '.join(parser.text_parts) except Exception as e: logger.warning(f"Failed to parse table HTML: {e}") # Fallback: strip HTML tags with regex text = re.sub(r'<[^>]+>', ' ', html_content) text = re.sub(r'\s+', ' ', text).strip() return text def reclassify_as_text(self, element: Dict[str, Any]) -> Dict[str, Any]: """ Convert an over-detected table element to a TEXT element. Args: element: Table element to reclassify Returns: New TEXT element with preserved content """ # Extract text content from HTML html_content = element.get("content", "") text_content = self.extract_text_from_table_html(html_content) # Create new TEXT element text_element = { "element_id": element.get("element_id", ""), "type": "text", "original_type": "table_reclassified", # Mark as reclassified "content": text_content, "page": element.get("page", 0), "bbox": element.get("bbox", []), "index": element.get("index", 0), "confidence": element.get("confidence", 1.0), "reclassified_from": "table", "reclassification_reason": "over_detection" } return text_element def validate_and_filter_elements( self, elements: List[Dict[str, Any]] ) -> Tuple[List[Dict[str, Any]], Dict[str, Any]]: """ Validate all elements and filter/reclassify over-detected tables. Args: elements: List of elements from PP-StructureV3 output Returns: Tuple of (filtered_elements, statistics) """ filtered_elements = [] stats = { "total_tables": 0, "valid_tables": 0, "reclassified_tables": 0, "reclassification_details": [] } for element in elements: if element.get("type") != "table": # Non-table elements pass through unchanged filtered_elements.append(element) continue stats["total_tables"] += 1 # Validate table result = self.validate_table(element) if result.is_valid: stats["valid_tables"] += 1 filtered_elements.append(element) else: # Reclassify as TEXT stats["reclassified_tables"] += 1 text_element = self.reclassify_as_text(element) filtered_elements.append(text_element) stats["reclassification_details"].append({ "element_id": element.get("element_id"), "reason": result.reason, "metrics": result.metrics }) logger.info( f"Reclassified table {element.get('element_id')} as TEXT: {result.reason}" ) # Re-sort by reading order (y0 then x0) filtered_elements = self._sort_by_reading_order(filtered_elements) return filtered_elements, stats def _sort_by_reading_order( self, elements: List[Dict[str, Any]] ) -> List[Dict[str, Any]]: """Sort elements by reading order (top-to-bottom, left-to-right).""" def sort_key(elem): bbox = elem.get("bbox", [0, 0, 0, 0]) if isinstance(bbox, dict): y0 = bbox.get("y0", 0) x0 = bbox.get("x0", 0) elif isinstance(bbox, list) and len(bbox) >= 2: x0, y0 = bbox[0], bbox[1] else: y0, x0 = 0, 0 return (y0, x0) return sorted(elements, key=sort_key)