chore: backup before code cleanup
Backup commit before executing the remove-unused-code proposal. This includes all pending changes and new features.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
backend/app/services/cell_validation_engine.py · 583 lines · new file
@@ -0,0 +1,583 @@
"""
Cell Validation Engine

Validates PP-StructureV3 table detections using metric-based heuristics
to filter over-detected cells and reclassify invalid tables as TEXT elements.

Metrics used:
- Cell density: cells per 10,000 px² (normal: 0.4-1.0, over-detected: 6+)
- Average cell area: px² per cell (normal: 10,000-25,000, over-detected: ~1,600)
- Cell height: table_height / cell_count (minimum: 10px for readable text)
"""

import logging
from dataclasses import dataclass
from typing import List, Dict, Any, Optional, Tuple
from html.parser import HTMLParser
import re

logger = logging.getLogger(__name__)


@dataclass
class CellValidationConfig:
    """Configuration for cell validation thresholds."""
    max_cell_density: float = 3.0      # cells per 10,000 px²
    min_avg_cell_area: float = 3000.0  # px² per cell
    min_cell_height: float = 10.0      # px per cell row
    enabled: bool = True


@dataclass
class TableValidationResult:
    """Result of table validation."""
    is_valid: bool
    table_element: Dict[str, Any]
    reason: Optional[str] = None
    metrics: Optional[Dict[str, float]] = None


class CellValidationEngine:
    """
    Validates table elements from PP-StructureV3 output.

    Over-detected tables are identified by abnormal metrics and
    reclassified as TEXT elements while preserving content.
    """

    def __init__(self, config: Optional[CellValidationConfig] = None):
        self.config = config or CellValidationConfig()

    def calculate_table_metrics(
        self,
        bbox: List[float],
        cell_boxes: List[List[float]]
    ) -> Dict[str, float]:
        """
        Calculate validation metrics for a table.

        Args:
            bbox: Table bounding box [x0, y0, x1, y1]
            cell_boxes: List of cell bounding boxes

        Returns:
            Dictionary with calculated metrics
        """
        if len(bbox) < 4:
            return {"cell_count": 0, "cell_density": 0, "avg_cell_area": 0, "avg_cell_height": 0}

        cell_count = len(cell_boxes)
        if cell_count == 0:
            return {"cell_count": 0, "cell_density": 0, "avg_cell_area": 0, "avg_cell_height": 0}

        # Calculate table dimensions
        table_width = bbox[2] - bbox[0]
        table_height = bbox[3] - bbox[1]
        table_area = table_width * table_height

        if table_area <= 0:
            return {"cell_count": cell_count, "cell_density": 0, "avg_cell_area": 0, "avg_cell_height": 0}

        # Cell density: cells per 10,000 px²
        cell_density = (cell_count / table_area) * 10000

        # Average cell area
        avg_cell_area = table_area / cell_count

        # Average cell height (table height / cell count)
        avg_cell_height = table_height / cell_count

        return {
            "cell_count": cell_count,
            "table_width": table_width,
            "table_height": table_height,
            "table_area": table_area,
            "cell_density": cell_density,
            "avg_cell_area": avg_cell_area,
            "avg_cell_height": avg_cell_height
        }
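
    # Illustrative numbers only (not taken from real PP-StructureV3 output): for an
    # assumed 500 x 400 px table region (area 200,000 px²) in which 120 cells are
    # reported, the metrics above come out to:
    #   cell_density    = 120 / 200000 * 10000 ≈ 6.0   (> max_cell_density 3.0)
    #   avg_cell_area   = 200000 / 120        ≈ 1667   (< min_avg_cell_area 3000)
    #   avg_cell_height = 400 / 120           ≈ 3.3    (< min_cell_height 10)
    # so validate_table() below would flag such a table as over-detected.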

    def validate_table(
        self,
        element: Dict[str, Any]
    ) -> TableValidationResult:
        """
        Validate a single table element.

        Args:
            element: Table element from PP-StructureV3 output

        Returns:
            TableValidationResult with validation status and metrics
        """
        if not self.config.enabled:
            return TableValidationResult(is_valid=True, table_element=element)

        # Extract bbox and cell_boxes
        bbox = element.get("bbox", [])
        cell_boxes = element.get("cell_boxes", [])

        # Tables without cells pass validation (structure-only tables)
        if not cell_boxes:
            return TableValidationResult(
                is_valid=True,
                table_element=element,
                reason="No cells to validate"
            )

        # Calculate metrics
        metrics = self.calculate_table_metrics(bbox, cell_boxes)

        # Check cell density
        if metrics["cell_density"] > self.config.max_cell_density:
            return TableValidationResult(
                is_valid=False,
                table_element=element,
                reason=f"Cell density {metrics['cell_density']:.2f} exceeds threshold {self.config.max_cell_density}",
                metrics=metrics
            )

        # Check average cell area
        if metrics["avg_cell_area"] < self.config.min_avg_cell_area:
            return TableValidationResult(
                is_valid=False,
                table_element=element,
                reason=f"Avg cell area {metrics['avg_cell_area']:.0f}px² below threshold {self.config.min_avg_cell_area}px²",
                metrics=metrics
            )

        # Check cell height
        if metrics["avg_cell_height"] < self.config.min_cell_height:
            return TableValidationResult(
                is_valid=False,
                table_element=element,
                reason=f"Avg cell height {metrics['avg_cell_height']:.1f}px below threshold {self.config.min_cell_height}px",
                metrics=metrics
            )

        # Content-based validation: check if content looks like prose vs tabular data
        content_check = self._validate_table_content(element)
        if not content_check["is_tabular"]:
            return TableValidationResult(
                is_valid=False,
                table_element=element,
                reason=content_check["reason"],
                metrics=metrics
            )

        return TableValidationResult(
            is_valid=True,
            table_element=element,
            metrics=metrics
        )

    def _validate_table_content(self, element: Dict[str, Any]) -> Dict[str, Any]:
        """
        Validate table content to detect false positive tables.

        Checks:
        1. Sparsity: text coverage ratio (text area / table area)
        2. Header: does table have proper header structure
        3. Key-Value: for 2-col tables, is it a key-value list or random layout
        4. Prose: are cells containing long prose text

        Returns:
            Dict with is_tabular (bool) and reason (str)
        """
        html_content = element.get("content", "")
        bbox = element.get("bbox", [])
        cell_boxes = element.get("cell_boxes", [])

        if not html_content or '<table' not in html_content.lower():
            return {"is_tabular": True, "reason": "no_html_content"}

        try:
            from bs4 import BeautifulSoup
            soup = BeautifulSoup(html_content, 'html.parser')
            table = soup.find('table')
            if not table:
                return {"is_tabular": True, "reason": "no_table_element"}

            rows = table.find_all('tr')
            if not rows:
                return {"is_tabular": True, "reason": "no_rows"}

            # Extract cell contents with row structure
            row_data = []
            all_cells = []
            for row_idx, row in enumerate(rows):
                cells = row.find_all(['td', 'th'])
                row_cells = []
                for cell in cells:
                    text = cell.get_text(strip=True)
                    colspan = int(cell.get('colspan', 1))
                    is_header = cell.name == 'th'
                    cell_info = {
                        "text": text,
                        "length": len(text),
                        "colspan": colspan,
                        "is_header": is_header,
                        "row": row_idx
                    }
                    row_cells.append(cell_info)
                    all_cells.append(cell_info)
                row_data.append(row_cells)

            if not all_cells:
                return {"is_tabular": True, "reason": "no_cells"}

            num_rows = len(row_data)
            num_cols = max(len(r) for r in row_data) if row_data else 0

            # === Check 1: Sparsity (text coverage) ===
            sparsity_result = self._check_sparsity(bbox, cell_boxes, all_cells)
            if not sparsity_result["is_valid"]:
                return {"is_tabular": False, "reason": sparsity_result["reason"]}

            # === Check 2: Header structure ===
            header_result = self._check_header_structure(row_data, num_cols)
            if not header_result["has_header"] and num_rows > 3:
                # Large table without header is suspicious
                logger.debug(f"Table has no header structure with {num_rows} rows")

            # === Check 3: Key-value pattern for 2-column tables ===
            if num_cols == 2:
                kv_result = self._check_key_value_pattern(row_data)
                if kv_result["is_kv_list"] and kv_result["confidence"] > 0.7:
                    # High-confidence key-value list - keep as table but log
                    logger.debug(f"Table identified as key-value list (conf={kv_result['confidence']:.2f})")
                elif not kv_result["is_kv_list"] and kv_result["is_random_layout"]:
                    # Random 2-column layout, not a real table
                    return {
                        "is_tabular": False,
                        "reason": "random_two_column_layout (not key-value)"
                    }

            # === Check 4: Prose content ===
            long_cells = [c for c in all_cells if c["length"] > 80]
            prose_ratio = len(long_cells) / len(all_cells) if all_cells else 0
            if prose_ratio > 0.3:
                return {
                    "is_tabular": False,
                    "reason": f"prose_content ({len(long_cells)}/{len(all_cells)} cells > 80 chars)"
                }

            # === Check 5: Section header as table ===
            if num_rows <= 2 and num_cols <= 2:
                first_row = row_data[0] if row_data else []
                if len(first_row) == 1:
                    text = first_row[0]["text"]
                    if text.isupper() and len(text) < 50:
                        return {
                            "is_tabular": False,
                            "reason": f"section_header_only ({text[:30]})"
                        }

            return {"is_tabular": True, "reason": "content_valid"}

        except Exception as e:
            logger.warning(f"Content validation failed: {e}")
            return {"is_tabular": True, "reason": f"validation_error: {e}"}
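
    # Example of the false positive Check 4 above is meant to catch (hypothetical
    # HTML, not taken from real output): a paragraph split into a one-column "table"
    #   <table><tr><td>first sentence of well over eighty characters ...</td></tr>
    #          <tr><td>second sentence of well over eighty characters ...</td></tr></table>
    # would give prose_ratio = 2/2 = 1.0 > 0.3, so it is reported as prose_content
    # and later reclassified as TEXT.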

    def _check_sparsity(
        self,
        bbox: List[float],
        cell_boxes: List[List[float]],
        all_cells: List[Dict]
    ) -> Dict[str, Any]:
        """
        Check text coverage ratio (sparsity).

        Two-column layouts have large empty gaps in the middle.
        Real tables have more uniform cell distribution.
        """
        if len(bbox) < 4:
            return {"is_valid": True, "reason": "no_bbox"}

        table_width = bbox[2] - bbox[0]
        table_height = bbox[3] - bbox[1]
        table_area = table_width * table_height

        if table_area <= 0:
            return {"is_valid": True, "reason": "invalid_area"}

        # Calculate text area from cell_boxes
        if cell_boxes:
            text_area = 0
            for cb in cell_boxes:
                if len(cb) >= 4:
                    w = abs(cb[2] - cb[0])
                    h = abs(cb[3] - cb[1])
                    text_area += w * h
            coverage = text_area / table_area
        else:
            # Estimate from cell content length
            total_chars = sum(c["length"] for c in all_cells)
            # Rough estimate: 1 char ≈ 8x12 pixels = 96 px²
            estimated_text_area = total_chars * 96
            coverage = min(estimated_text_area / table_area, 1.0)

        # Very sparse table (< 15% coverage) is suspicious
        if coverage < 0.15:
            return {
                "is_valid": False,
                "reason": f"sparse_content (coverage={coverage:.1%})"
            }

        return {"is_valid": True, "coverage": coverage}

    def _check_header_structure(
        self,
        row_data: List[List[Dict]],
        num_cols: int
    ) -> Dict[str, Any]:
        """
        Check if table has proper header structure.

        Real tables usually have:
        - First row with <th> elements
        - Or first row with different content pattern (labels vs values)
        """
        if not row_data:
            return {"has_header": False}

        first_row = row_data[0]

        # Check for <th> elements
        th_count = sum(1 for c in first_row if c.get("is_header", False))
        if th_count > 0 and th_count >= len(first_row) * 0.5:
            return {"has_header": True, "type": "th_elements"}

        # Check for header-like content (short, distinct from body)
        if len(row_data) > 1:
            first_row_avg_len = sum(c["length"] for c in first_row) / len(first_row) if first_row else 0
            body_rows = row_data[1:]
            body_cells = [c for row in body_rows for c in row]
            body_avg_len = sum(c["length"] for c in body_cells) / len(body_cells) if body_cells else 0

            # Header row should be shorter (labels) than body (data)
            if first_row_avg_len < body_avg_len * 0.7:
                return {"has_header": True, "type": "short_labels"}

        return {"has_header": False}

    def _check_key_value_pattern(
        self,
        row_data: List[List[Dict]]
    ) -> Dict[str, Any]:
        """
        For 2-column tables, check if it's a key-value list.

        Key-value characteristics:
        - Left column: short labels (< 40 chars)
        - Right column: values (can be longer)
        - Consistent pattern across rows

        Random layout characteristics:
        - Both columns have similar length distribution
        - No clear label-value relationship
        """
        if not row_data:
            return {"is_kv_list": False, "is_random_layout": False, "confidence": 0}

        left_lengths = []
        right_lengths = []
        kv_rows = 0
        total_rows = 0

        for row in row_data:
            if len(row) != 2:
                continue
            total_rows += 1
            left = row[0]
            right = row[1]
            left_lengths.append(left["length"])
            right_lengths.append(right["length"])

            # Key-value pattern: left is short label, right is value
            if left["length"] < 40 and left["length"] < right["length"] * 2:
                kv_rows += 1

        if total_rows == 0:
            return {"is_kv_list": False, "is_random_layout": False, "confidence": 0}

        kv_ratio = kv_rows / total_rows
        avg_left = sum(left_lengths) / len(left_lengths) if left_lengths else 0
        avg_right = sum(right_lengths) / len(right_lengths) if right_lengths else 0

        # High KV ratio and left column is shorter = key-value list
        if kv_ratio > 0.6 and avg_left < avg_right:
            return {
                "is_kv_list": True,
                "is_random_layout": False,
                "confidence": kv_ratio,
                "avg_left": avg_left,
                "avg_right": avg_right
            }

        # Similar lengths on both sides = random layout
        if avg_left > 0 and 0.5 < avg_right / avg_left < 2.0:
            # Both columns have similar content length
            return {
                "is_kv_list": False,
                "is_random_layout": True,
                "confidence": 1 - kv_ratio,
                "avg_left": avg_left,
                "avg_right": avg_right
            }

        return {
            "is_kv_list": False,
            "is_random_layout": False,
            "confidence": 0,
            "avg_left": avg_left,
            "avg_right": avg_right
        }
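
    # Illustrative (made-up) rows to show how the heuristic above splits the two cases:
    #   [("Invoice date", "2024-01-15"), ("Customer", "ACME GmbH"), ("Total", "1,250.00 EUR")]
    # gives kv_ratio = 3/3 = 1.0 with avg_left < avg_right, so it is kept as a
    # key-value table, while two prose columns of similar length fall into the
    # 0.5 < avg_right / avg_left < 2.0 branch and are reported as a random layout.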

    def extract_text_from_table_html(self, html_content: str) -> str:
        """
        Extract plain text from table HTML content.

        Args:
            html_content: HTML string containing table structure

        Returns:
            Plain text extracted from table cells
        """
        if not html_content:
            return ""

        try:
            class TableTextExtractor(HTMLParser):
                def __init__(self):
                    super().__init__()
                    self.text_parts = []
                    self.in_cell = False

                def handle_starttag(self, tag, attrs):
                    if tag in ('td', 'th'):
                        self.in_cell = True

                def handle_endtag(self, tag):
                    if tag in ('td', 'th'):
                        self.in_cell = False

                def handle_data(self, data):
                    if self.in_cell:
                        stripped = data.strip()
                        if stripped:
                            self.text_parts.append(stripped)

            parser = TableTextExtractor()
            parser.feed(html_content)
            return ' '.join(parser.text_parts)
        except Exception as e:
            logger.warning(f"Failed to parse table HTML: {e}")
            # Fallback: strip HTML tags with regex
            text = re.sub(r'<[^>]+>', ' ', html_content)
            text = re.sub(r'\s+', ' ', text).strip()
            return text
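
    # Expected behaviour on a minimal (hypothetical) input:
    #   engine.extract_text_from_table_html("<table><tr><td>A</td><td>B</td></tr></table>")
    # collects the cell texts in document order and returns "A B"; if parsing ever
    # raises, the regex fallback above strips the tags instead.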

    def reclassify_as_text(self, element: Dict[str, Any]) -> Dict[str, Any]:
        """
        Convert an over-detected table element to a TEXT element.

        Args:
            element: Table element to reclassify

        Returns:
            New TEXT element with preserved content
        """
        # Extract text content from HTML
        html_content = element.get("content", "")
        text_content = self.extract_text_from_table_html(html_content)

        # Create new TEXT element
        text_element = {
            "element_id": element.get("element_id", ""),
            "type": "text",
            "original_type": "table_reclassified",  # Mark as reclassified
            "content": text_content,
            "page": element.get("page", 0),
            "bbox": element.get("bbox", []),
            "index": element.get("index", 0),
            "confidence": element.get("confidence", 1.0),
            "reclassified_from": "table",
            "reclassification_reason": "over_detection"
        }

        return text_element
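
    # Sketch of the transformation (field names as used above, values hypothetical):
    #   {"element_id": "tbl_3", "type": "table", "content": "<table>...</table>", ...}
    # becomes
    #   {"element_id": "tbl_3", "type": "text", "original_type": "table_reclassified",
    #    "content": "<flattened cell text>", "reclassified_from": "table", ...}
    # with page, bbox, index and confidence carried over from the source element.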

    def validate_and_filter_elements(
        self,
        elements: List[Dict[str, Any]]
    ) -> Tuple[List[Dict[str, Any]], Dict[str, Any]]:
        """
        Validate all elements and filter/reclassify over-detected tables.

        Args:
            elements: List of elements from PP-StructureV3 output

        Returns:
            Tuple of (filtered_elements, statistics)
        """
        filtered_elements = []
        stats = {
            "total_tables": 0,
            "valid_tables": 0,
            "reclassified_tables": 0,
            "reclassification_details": []
        }

        for element in elements:
            if element.get("type") != "table":
                # Non-table elements pass through unchanged
                filtered_elements.append(element)
                continue

            stats["total_tables"] += 1

            # Validate table
            result = self.validate_table(element)

            if result.is_valid:
                stats["valid_tables"] += 1
                filtered_elements.append(element)
            else:
                # Reclassify as TEXT
                stats["reclassified_tables"] += 1
                text_element = self.reclassify_as_text(element)
                filtered_elements.append(text_element)

                stats["reclassification_details"].append({
                    "element_id": element.get("element_id"),
                    "reason": result.reason,
                    "metrics": result.metrics
                })

                logger.info(
                    f"Reclassified table {element.get('element_id')} as TEXT: {result.reason}"
                )

        # Re-sort by reading order (y0 then x0)
        filtered_elements = self._sort_by_reading_order(filtered_elements)

        return filtered_elements, stats

    def _sort_by_reading_order(
        self,
        elements: List[Dict[str, Any]]
    ) -> List[Dict[str, Any]]:
        """Sort elements by reading order (top-to-bottom, left-to-right)."""
        def sort_key(elem):
            bbox = elem.get("bbox", [0, 0, 0, 0])
            if isinstance(bbox, dict):
                y0 = bbox.get("y0", 0)
                x0 = bbox.get("x0", 0)
            elif isinstance(bbox, list) and len(bbox) >= 2:
                x0, y0 = bbox[0], bbox[1]
            else:
                y0, x0 = 0, 0
            return (y0, x0)

        return sorted(elements, key=sort_key)
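

# Minimal usage sketch (assumed element dicts, not real PP-StructureV3 output);
# included only to illustrate how the engine is driven end to end.
if __name__ == "__main__":
    engine = CellValidationEngine(CellValidationConfig(max_cell_density=3.0))

    # A plausible over-detected table: 120 tiny cells crammed into a 500 x 400 px region.
    fake_table = {
        "element_id": "tbl_0",
        "type": "table",
        "page": 1,
        "bbox": [0, 0, 500, 400],
        "cell_boxes": [[0, 0, 40, 12]] * 120,
        "content": "<table><tr><td>sample</td></tr></table>",
    }
    elements, stats = engine.validate_and_filter_elements([fake_table])
    print(stats["reclassified_tables"], elements[0]["type"])  # expected: 1 text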