Backup commit before executing remove-unused-code proposal. This includes all pending changes and new features. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
584 lines
20 KiB
Python
584 lines
20 KiB
Python
"""
|
|
Cell Validation Engine
|
|
|
|
Validates PP-StructureV3 table detections using metric-based heuristics
|
|
to filter over-detected cells and reclassify invalid tables as TEXT elements.
|
|
|
|
Metrics used:
|
|
- Cell density: cells per 10,000 px² (normal: 0.4-1.0, over-detected: 6+)
|
|
- Average cell area: px² per cell (normal: 10,000-25,000, over-detected: ~1,600)
|
|
- Cell height: table_height / cell_count (minimum: 10px for readable text)
|
|
"""
|
|
|
|
import logging
|
|
from dataclasses import dataclass
|
|
from typing import List, Dict, Any, Optional, Tuple
|
|
from html.parser import HTMLParser
|
|
import re
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
@dataclass
class CellValidationConfig:
    """Threshold settings used when validating detected table cells."""

    # Upper limit on cell density, measured in cells per 10,000 px².
    max_cell_density: float = 3.0

    # Lower limit on the average area of a cell, in px².
    min_avg_cell_area: float = 3_000.0

    # Lower limit on the average height of a cell row, in px.
    min_cell_height: float = 10.0

    # Master switch for validation.
    enabled: bool = True
|
|
|
|
|
|
@dataclass
class TableValidationResult:
    """Outcome of validating one table element."""

    # True when the element is accepted as a table.
    is_valid: bool

    # The element dictionary that was validated.
    table_element: Dict[str, Any]

    # Optional explanation attached to the outcome.
    reason: Optional[str] = None

    # Optional metrics computed for the table during validation.
    metrics: Optional[Dict[str, float]] = None
|
|
|
|
|
|
class CellValidationEngine:
    """
    Validates table elements from PP-StructureV3 output.

    A table is checked against geometric metrics (cell density, average cell
    area, average cell height) and then against content heuristics.  Tables
    that fail are reclassified as TEXT elements while preserving their text.
    """

    def __init__(self, config: Optional["CellValidationConfig"] = None):
        """
        Args:
            config: Validation thresholds. A default ``CellValidationConfig``
                is created when omitted.
        """
        self.config = config if config is not None else CellValidationConfig()

    # ------------------------------------------------------------------
    # Small shared helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _normalize_bbox(bbox: Any) -> List[float]:
        """
        Return ``bbox`` as ``[x0, y0, x1, y1]``, or ``[]`` if it is unusable.

        Accepts indexable sequences with at least four entries and dicts.
        NOTE(review): the dict form is assumed to use the keys
        ``x0``/``y0``/``x1``/``y1`` (``_sort_by_reading_order`` reads ``x0``
        and ``y0``) - confirm ``x1``/``y1`` against the producer.
        """
        if bbox is None:
            return []
        if isinstance(bbox, dict):
            try:
                return [bbox["x0"], bbox["y0"], bbox["x1"], bbox["y1"]]
            except KeyError:
                return []
        try:
            if len(bbox) < 4:
                return []
            return [bbox[0], bbox[1], bbox[2], bbox[3]]
        except (TypeError, IndexError, KeyError):
            return []

    @staticmethod
    def _parse_span(value: Any) -> int:
        """Parse an HTML span attribute, falling back to 1 when it is not a positive integer."""
        try:
            span = int(str(value).strip())
        except (TypeError, ValueError):
            return 1
        return max(span, 1)

    @staticmethod
    def _strip_tags(html_content: str) -> str:
        """Remove anything that looks like a tag and collapse whitespace."""
        text = re.sub(r'<[^>]+>', ' ', html_content)
        return re.sub(r'\s+', ' ', text).strip()

    # ------------------------------------------------------------------
    # Metric-based validation
    # ------------------------------------------------------------------

    def calculate_table_metrics(
        self,
        bbox: List[float],
        cell_boxes: List[List[float]]
    ) -> Dict[str, float]:
        """
        Calculate validation metrics for a table.

        Args:
            bbox: Table bounding box [x0, y0, x1, y1] (a dict with
                x0/y0/x1/y1 keys is also accepted)
            cell_boxes: List of cell bounding boxes

        Returns:
            Dictionary with calculated metrics. The keys ``table_width``,
            ``table_height`` and ``table_area`` are present only when the
            metrics could actually be computed (usable bbox, at least one
            cell, positive area); otherwise the remaining metrics are 0.
        """
        box = self._normalize_bbox(bbox)
        if not box:
            return {"cell_count": 0, "cell_density": 0, "avg_cell_area": 0, "avg_cell_height": 0}

        cell_count = len(cell_boxes) if cell_boxes is not None else 0
        if cell_count == 0:
            return {"cell_count": 0, "cell_density": 0, "avg_cell_area": 0, "avg_cell_height": 0}

        table_width = box[2] - box[0]
        table_height = box[3] - box[1]
        table_area = table_width * table_height

        if table_area <= 0:
            return {"cell_count": cell_count, "cell_density": 0, "avg_cell_area": 0, "avg_cell_height": 0}

        return {
            "cell_count": cell_count,
            "table_width": table_width,
            "table_height": table_height,
            "table_area": table_area,
            # Cells per 10,000 px²
            "cell_density": (cell_count / table_area) * 10000,
            "avg_cell_area": table_area / cell_count,
            # Table height divided by the number of cells
            "avg_cell_height": table_height / cell_count,
        }

    def _find_metric_violation(self, metrics: Dict[str, float]) -> Optional[str]:
        """
        Return the reason for the first violated metric threshold, or None.

        Metrics without a ``table_area`` entry are zero placeholders produced
        for a missing or degenerate bbox. Comparing those zeros against the
        minimum thresholds would reject every such table, so they are treated
        as "cannot judge" and never count as a violation.
        """
        if "table_area" not in metrics:
            return None

        cfg = self.config
        if metrics["cell_density"] > cfg.max_cell_density:
            return f"Cell density {metrics['cell_density']:.2f} exceeds threshold {cfg.max_cell_density}"
        if metrics["avg_cell_area"] < cfg.min_avg_cell_area:
            return f"Avg cell area {metrics['avg_cell_area']:.0f}px² below threshold {cfg.min_avg_cell_area}px²"
        if metrics["avg_cell_height"] < cfg.min_cell_height:
            return f"Avg cell height {metrics['avg_cell_height']:.1f}px below threshold {cfg.min_cell_height}px"
        return None

    def validate_table(
        self,
        element: Dict[str, Any]
    ) -> "TableValidationResult":
        """
        Validate a single table element.

        Args:
            element: Table element from PP-StructureV3 output

        Returns:
            TableValidationResult with validation status and metrics
        """
        if not self.config.enabled:
            return TableValidationResult(is_valid=True, table_element=element)

        cell_boxes = element.get("cell_boxes", [])

        # Tables without cells pass validation (structure-only tables)
        if not cell_boxes:
            return TableValidationResult(
                is_valid=True,
                table_element=element,
                reason="No cells to validate"
            )

        metrics = self.calculate_table_metrics(element.get("bbox", []), cell_boxes)

        # Geometric checks: density, average cell area, average cell height
        violation = self._find_metric_violation(metrics)
        if violation is not None:
            return TableValidationResult(
                is_valid=False,
                table_element=element,
                reason=violation,
                metrics=metrics
            )

        # Content-based validation: check if content looks like prose vs tabular data
        content_check = self._validate_table_content(element)
        if not content_check["is_tabular"]:
            return TableValidationResult(
                is_valid=False,
                table_element=element,
                reason=content_check["reason"],
                metrics=metrics
            )

        return TableValidationResult(
            is_valid=True,
            table_element=element,
            metrics=metrics
        )

    # ------------------------------------------------------------------
    # Content-based validation
    # ------------------------------------------------------------------

    def _validate_table_content(self, element: Dict[str, Any]) -> Dict[str, Any]:
        """
        Validate table content to detect false positive tables.

        Checks:
        1. Sparsity: text coverage ratio (text area / table area)
        2. Header: does table have proper header structure (logged only)
        3. Key-Value: for 2-col tables, is it a key-value list or random layout
        4. Prose: are cells containing long prose text
        5. Section header: a lone short upper-case cell in a tiny table

        Any failure while parsing is logged and the table is kept
        (best-effort: a table is only rejected on positive evidence).

        Returns:
            Dict with is_tabular (bool) and reason (str)
        """
        html_content = element.get("content", "")
        bbox = element.get("bbox", [])
        cell_boxes = element.get("cell_boxes", [])

        if not isinstance(html_content, str) or '<table' not in html_content.lower():
            return {"is_tabular": True, "reason": "no_html_content"}

        try:
            from bs4 import BeautifulSoup

            soup = BeautifulSoup(html_content, 'html.parser')
            table = soup.find('table')
            if not table:
                return {"is_tabular": True, "reason": "no_table_element"}

            rows = table.find_all('tr')
            if not rows:
                return {"is_tabular": True, "reason": "no_rows"}

            # Extract cell contents with row structure
            row_data = []
            all_cells = []
            for row_idx, row in enumerate(rows):
                row_cells = []
                for cell in row.find_all(['td', 'th']):
                    text = cell.get_text(strip=True)
                    cell_info = {
                        "text": text,
                        "length": len(text),
                        # A malformed colspan must not abort the whole validation.
                        "colspan": self._parse_span(cell.get('colspan', 1)),
                        "is_header": cell.name == 'th',
                        "row": row_idx
                    }
                    row_cells.append(cell_info)
                    all_cells.append(cell_info)
                row_data.append(row_cells)

            if not all_cells:
                return {"is_tabular": True, "reason": "no_cells"}

            num_rows = len(row_data)
            num_cols = max(len(r) for r in row_data) if row_data else 0

            # === Check 1: Sparsity (text coverage) ===
            sparsity_result = self._check_sparsity(bbox, cell_boxes, all_cells)
            if not sparsity_result["is_valid"]:
                return {"is_tabular": False, "reason": sparsity_result["reason"]}

            # === Check 2: Header structure ===
            header_result = self._check_header_structure(row_data, num_cols)
            if not header_result["has_header"] and num_rows > 3:
                # Large table without header is suspicious (informational only)
                logger.debug("Table has no header structure with %s rows", num_rows)

            # === Check 3: Key-Value pattern for 2-column tables ===
            if num_cols == 2:
                kv_result = self._check_key_value_pattern(row_data)
                if kv_result["is_kv_list"] and kv_result["confidence"] > 0.7:
                    # High confidence key-value list - keep as table but log
                    logger.debug(
                        "Table identified as key-value list (conf=%.2f)",
                        kv_result["confidence"],
                    )
                elif not kv_result["is_kv_list"] and kv_result["is_random_layout"]:
                    # Random 2-column layout, not a real table
                    return {
                        "is_tabular": False,
                        "reason": "random_two_column_layout (not key-value)"
                    }

            # === Check 4: Prose content ===
            long_cells = [c for c in all_cells if c["length"] > 80]
            prose_ratio = len(long_cells) / len(all_cells) if all_cells else 0
            if prose_ratio > 0.3:
                return {
                    "is_tabular": False,
                    "reason": f"prose_content ({len(long_cells)}/{len(all_cells)} cells > 80 chars)"
                }

            # === Check 5: Section header as table ===
            if num_rows <= 2 and num_cols <= 2:
                first_row = row_data[0] if row_data else []
                if len(first_row) == 1:
                    text = first_row[0]["text"]
                    if text.isupper() and len(text) < 50:
                        return {
                            "is_tabular": False,
                            "reason": f"section_header_only ({text[:30]})"
                        }

            return {"is_tabular": True, "reason": "content_valid"}

        except Exception as e:
            # Deliberate best-effort: never reject a table because parsing failed.
            logger.warning("Content validation failed: %s", e)
            return {"is_tabular": True, "reason": f"validation_error: {e}"}

    def _check_sparsity(
        self,
        bbox: List[float],
        cell_boxes: List[List[float]],
        all_cells: List[Dict]
    ) -> Dict[str, Any]:
        """
        Check text coverage ratio (sparsity).

        Coverage is the summed area of ``cell_boxes`` divided by the table
        area; without cell boxes it is estimated from the character count.
        Coverage below 15% marks the table as invalid.

        Returns:
            Dict with ``is_valid`` plus either ``reason`` or ``coverage``.
        """
        box = self._normalize_bbox(bbox)
        if not box:
            return {"is_valid": True, "reason": "no_bbox"}

        table_area = (box[2] - box[0]) * (box[3] - box[1])
        if table_area <= 0:
            return {"is_valid": True, "reason": "invalid_area"}

        if cell_boxes:
            # Sum the area of every cell box that has four coordinates
            text_area = sum(
                abs(cb[2] - cb[0]) * abs(cb[3] - cb[1])
                for cb in cell_boxes
                if len(cb) >= 4
            )
            coverage = text_area / table_area
        else:
            # Rough estimate: 1 char ≈ 8x12 pixels = 96 px²
            total_chars = sum(c["length"] for c in all_cells)
            coverage = min(total_chars * 96 / table_area, 1.0)

        # Very sparse table (< 15% coverage) is suspicious
        if coverage < 0.15:
            return {
                "is_valid": False,
                "reason": f"sparse_content (coverage={coverage:.1%})"
            }

        return {"is_valid": True, "coverage": coverage}

    def _check_header_structure(
        self,
        row_data: List[List[Dict]],
        num_cols: int
    ) -> Dict[str, Any]:
        """
        Check if table has proper header structure.

        A header is recognised when:
        - at least half of the first-row cells are <th> elements, or
        - the first row's average text length is below 70% of the body's
          average (short labels above longer data)

        ``num_cols`` is accepted for interface compatibility and not used.
        """
        if not row_data:
            return {"has_header": False}

        first_row = row_data[0]

        # Check for <th> elements
        th_count = sum(1 for c in first_row if c.get("is_header", False))
        if th_count > 0 and th_count >= len(first_row) * 0.5:
            return {"has_header": True, "type": "th_elements"}

        # Check for header-like content (short, distinct from body)
        if len(row_data) > 1:
            first_row_avg_len = (
                sum(c["length"] for c in first_row) / len(first_row) if first_row else 0
            )
            body_cells = [c for row in row_data[1:] for c in row]
            body_avg_len = (
                sum(c["length"] for c in body_cells) / len(body_cells) if body_cells else 0
            )

            # Header row should be shorter (labels) than body (data)
            if first_row_avg_len < body_avg_len * 0.7:
                return {"has_header": True, "type": "short_labels"}

        return {"has_header": False}

    def _check_key_value_pattern(
        self,
        row_data: List[List[Dict]]
    ) -> Dict[str, Any]:
        """
        For 2-column tables, check if it's a key-value list.

        Only rows with exactly two cells are considered.

        Key-value characteristics:
        - Left column: short label (< 40 chars and shorter than twice the
          right cell)
        - More than 60% of rows match, and the left column is shorter on
          average than the right

        Random layout characteristics:
        - Both columns have similar average length (ratio between 0.5 and 2)

        Returns:
            Dict with ``is_kv_list``, ``is_random_layout``, ``confidence`` and,
            when at least one 2-cell row exists, ``avg_left`` / ``avg_right``.
        """
        if not row_data:
            return {"is_kv_list": False, "is_random_layout": False, "confidence": 0}

        left_lengths = []
        right_lengths = []
        kv_rows = 0

        for row in row_data:
            if len(row) != 2:
                continue
            left_len = row[0]["length"]
            right_len = row[1]["length"]
            left_lengths.append(left_len)
            right_lengths.append(right_len)

            # Key-value pattern: left is short label, right is value
            if left_len < 40 and left_len < right_len * 2:
                kv_rows += 1

        total_rows = len(left_lengths)
        if total_rows == 0:
            return {"is_kv_list": False, "is_random_layout": False, "confidence": 0}

        kv_ratio = kv_rows / total_rows
        avg_left = sum(left_lengths) / total_rows
        avg_right = sum(right_lengths) / total_rows

        # High KV ratio and left column is shorter = key-value list
        if kv_ratio > 0.6 and avg_left < avg_right:
            return {
                "is_kv_list": True,
                "is_random_layout": False,
                "confidence": kv_ratio,
                "avg_left": avg_left,
                "avg_right": avg_right
            }

        # Similar lengths on both sides = random layout
        if avg_left > 0 and 0.5 < avg_right / avg_left < 2.0:
            return {
                "is_kv_list": False,
                "is_random_layout": True,
                "confidence": 1 - kv_ratio,
                "avg_left": avg_left,
                "avg_right": avg_right
            }

        return {
            "is_kv_list": False,
            "is_random_layout": False,
            "confidence": 0,
            "avg_left": avg_left,
            "avg_right": avg_right
        }

    # ------------------------------------------------------------------
    # Reclassification
    # ------------------------------------------------------------------

    def extract_text_from_table_html(self, html_content: str) -> str:
        """
        Extract plain text from table HTML content.

        Text inside <td>/<th> cells is collected and joined with spaces.
        When no cell text is found (for example the content is plain text
        rather than table markup), the content with tags stripped is
        returned instead so that reclassification does not lose it.

        Args:
            html_content: HTML string containing table structure

        Returns:
            Plain text extracted from table cells
        """
        if not html_content:
            return ""

        class TableTextExtractor(HTMLParser):
            """Collects the stripped text found inside td/th elements."""

            def __init__(self):
                super().__init__()
                self.text_parts = []
                self.in_cell = False

            def handle_starttag(self, tag, attrs):
                if tag in ('td', 'th'):
                    self.in_cell = True

            def handle_endtag(self, tag):
                if tag in ('td', 'th'):
                    self.in_cell = False

            def handle_data(self, data):
                if self.in_cell:
                    stripped = data.strip()
                    if stripped:
                        self.text_parts.append(stripped)

        try:
            parser = TableTextExtractor()
            parser.feed(html_content)
            # close() forces processing of any text still buffered by feed()
            parser.close()
            if parser.text_parts:
                return ' '.join(parser.text_parts)
        except Exception as e:
            logger.warning("Failed to parse table HTML: %s", e)

        # Fallback: strip HTML tags with regex
        return self._strip_tags(html_content)

    def reclassify_as_text(self, element: Dict[str, Any]) -> Dict[str, Any]:
        """
        Convert an over-detected table element to a TEXT element.

        Args:
            element: Table element to reclassify

        Returns:
            New TEXT element with preserved content. Only the fields listed
            below are carried over; the input element is not modified.
        """
        text_content = self.extract_text_from_table_html(element.get("content", ""))

        return {
            "element_id": element.get("element_id", ""),
            "type": "text",
            "original_type": "table_reclassified",  # Mark as reclassified
            "content": text_content,
            "page": element.get("page", 0),
            "bbox": element.get("bbox", []),
            "index": element.get("index", 0),
            "confidence": element.get("confidence", 1.0),
            "reclassified_from": "table",
            "reclassification_reason": "over_detection"
        }

    def validate_and_filter_elements(
        self,
        elements: List[Dict[str, Any]]
    ) -> Tuple[List[Dict[str, Any]], Dict[str, Any]]:
        """
        Validate all elements and filter/reclassify over-detected tables.

        Non-table elements pass through unchanged. The resulting list is
        re-sorted into reading order.

        Args:
            elements: List of elements from PP-StructureV3 output

        Returns:
            Tuple of (filtered_elements, statistics)
        """
        filtered_elements = []
        stats = {
            "total_tables": 0,
            "valid_tables": 0,
            "reclassified_tables": 0,
            "reclassification_details": []
        }

        for element in elements:
            if element.get("type") != "table":
                filtered_elements.append(element)
                continue

            stats["total_tables"] += 1
            result = self.validate_table(element)

            if result.is_valid:
                stats["valid_tables"] += 1
                filtered_elements.append(element)
                continue

            # Reclassify as TEXT
            stats["reclassified_tables"] += 1
            filtered_elements.append(self.reclassify_as_text(element))
            stats["reclassification_details"].append({
                "element_id": element.get("element_id"),
                "reason": result.reason,
                "metrics": result.metrics
            })
            logger.info(
                "Reclassified table %s as TEXT: %s",
                element.get('element_id'),
                result.reason,
            )

        return self._sort_by_reading_order(filtered_elements), stats

    def _sort_by_reading_order(
        self,
        elements: List[Dict[str, Any]]
    ) -> List[Dict[str, Any]]:
        """
        Sort elements by reading order (page, then top-to-bottom, left-to-right).

        The page is the primary key so that elements of different pages are
        never interleaved by their y-coordinate. Missing pages or coordinates
        count as 0. The sort is stable.
        """
        def or_zero(value):
            return 0 if value is None else value

        def sort_key(elem):
            bbox = elem.get("bbox", [0, 0, 0, 0])
            if isinstance(bbox, dict):
                x0, y0 = bbox.get("x0", 0), bbox.get("y0", 0)
            elif isinstance(bbox, (list, tuple)) and len(bbox) >= 2:
                x0, y0 = bbox[0], bbox[1]
            else:
                x0, y0 = 0, 0
            return (or_zero(elem.get("page")), or_zero(y0), or_zero(x0))

        return sorted(elements, key=sort_key)
|