chore: backup before code cleanup

Backup commit before executing remove-unused-code proposal.
This includes all pending changes and new features.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
egg
2025-12-11 11:55:39 +08:00
parent eff9b0bcd5
commit 940a406dce
58 changed files with 8226 additions and 175 deletions

View File

@@ -0,0 +1,583 @@
"""
Cell Validation Engine
Validates PP-StructureV3 table detections using metric-based heuristics
to filter over-detected cells and reclassify invalid tables as TEXT elements.
Metrics used:
- Cell density: cells per 10,000 px² (normal: 0.4-1.0, over-detected: 6+)
- Average cell area: px² per cell (normal: 10,000-25,000, over-detected: ~1,600)
- Cell height: table_height / cell_count (minimum: 10px for readable text)
"""
import logging
from dataclasses import dataclass
from typing import List, Dict, Any, Optional, Tuple
from html.parser import HTMLParser
import re
logger = logging.getLogger(__name__)
@dataclass
class CellValidationConfig:
    """Tunable thresholds for table-cell validation.

    Defaults follow the heuristics in the module docstring: normally
    detected tables sit well inside these bounds, while over-detected
    tables violate at least one of them.
    """
    # Maximum allowed cells per 10,000 px² of table area.
    max_cell_density: float = 3.0  # cells per 10,000 px²
    # Minimum average area (px²) each cell must occupy.
    min_avg_cell_area: float = 3000.0  # px² per cell
    # Minimum height (px) per cell row for readable text.
    min_cell_height: float = 10.0  # px per cell row
    # Master switch: when False, every table passes validation unchanged.
    enabled: bool = True
@dataclass
class TableValidationResult:
    """Outcome of validating a single table element.

    Carries the element itself so callers can pass it through or
    reclassify it without a second lookup.
    """
    # True when the table passed every enabled check.
    is_valid: bool
    # The element that was validated (not modified by validation).
    table_element: Dict[str, Any]
    # Human-readable explanation of the verdict, if any.
    reason: Optional[str] = None
    # Geometry metrics computed during validation, when available.
    metrics: Optional[Dict[str, float]] = None
class CellValidationEngine:
    """
    Metric- and content-based validator for PP-StructureV3 table elements.

    Tables whose geometry or cell content looks over-detected are flagged
    so callers can reclassify them as TEXT while keeping their contents.
    """

    def __init__(self, config: Optional[CellValidationConfig] = None):
        """Create an engine; falls back to default thresholds when no config is given."""
        self.config = config or CellValidationConfig()
def calculate_table_metrics(
        self,
        bbox: List[float],
        cell_boxes: List[List[float]]
) -> Dict[str, float]:
    """
    Calculate validation metrics for a table.

    Args:
        bbox: Table bounding box [x0, y0, x1, y1]
        cell_boxes: List of cell bounding boxes

    Returns:
        Dictionary with calculated metrics. Degenerate inputs (short bbox,
        no cells, non-positive area) yield zeroed ratio metrics, and every
        return now carries the same key set so downstream readers of e.g.
        ``metrics["table_area"]`` never hit a KeyError.
    """
    def _zeroed(count: int, width: float = 0.0, height: float = 0.0,
                area: float = 0.0) -> Dict[str, float]:
        # Same key set as the happy path; ratio metrics forced to 0.
        return {
            "cell_count": count,
            "table_width": width,
            "table_height": height,
            "table_area": area,
            "cell_density": 0,
            "avg_cell_area": 0,
            "avg_cell_height": 0,
        }

    # A bbox with fewer than 4 coords cannot describe a table region.
    if len(bbox) < 4:
        return _zeroed(0)

    cell_count = len(cell_boxes)
    table_width = bbox[2] - bbox[0]
    table_height = bbox[3] - bbox[1]
    table_area = table_width * table_height

    if cell_count == 0:
        return _zeroed(0, table_width, table_height, table_area)
    # Inverted or empty boxes make the ratio metrics meaningless.
    if table_area <= 0:
        return _zeroed(cell_count, table_width, table_height, table_area)

    return {
        "cell_count": cell_count,
        "table_width": table_width,
        "table_height": table_height,
        "table_area": table_area,
        # Cells per 10,000 px² — over-detected tables score far above normal.
        "cell_density": (cell_count / table_area) * 10000,
        "avg_cell_area": table_area / cell_count,
        # Table height divided by cell count approximates the row height.
        "avg_cell_height": table_height / cell_count,
    }
def validate_table(
        self,
        element: Dict[str, Any]
) -> TableValidationResult:
    """
    Validate a single table element.

    Args:
        element: Table element from PP-StructureV3 output

    Returns:
        TableValidationResult with validation status and metrics
    """
    # Validation disabled: accept everything unconditionally.
    if not self.config.enabled:
        return TableValidationResult(is_valid=True, table_element=element)

    bbox = element.get("bbox", [])
    cell_boxes = element.get("cell_boxes", [])

    # Structure-only tables (no cells) are accepted as-is.
    if not cell_boxes:
        return TableValidationResult(
            is_valid=True,
            table_element=element,
            reason="No cells to validate"
        )

    metrics = self.calculate_table_metrics(bbox, cell_boxes)
    cfg = self.config

    # Geometry checks, in fixed order; the first violation wins.
    failure = None
    if metrics["cell_density"] > cfg.max_cell_density:
        failure = (f"Cell density {metrics['cell_density']:.2f} "
                   f"exceeds threshold {cfg.max_cell_density}")
    elif metrics["avg_cell_area"] < cfg.min_avg_cell_area:
        failure = (f"Avg cell area {metrics['avg_cell_area']:.0f}px² "
                   f"below threshold {cfg.min_avg_cell_area}px²")
    elif metrics["avg_cell_height"] < cfg.min_cell_height:
        failure = (f"Avg cell height {metrics['avg_cell_height']:.1f}px "
                   f"below threshold {cfg.min_cell_height}px")
    if failure is not None:
        return TableValidationResult(
            is_valid=False,
            table_element=element,
            reason=failure,
            metrics=metrics
        )

    # Content heuristics: prose-like or layout-like HTML fails the table.
    content_check = self._validate_table_content(element)
    if not content_check["is_tabular"]:
        return TableValidationResult(
            is_valid=False,
            table_element=element,
            reason=content_check["reason"],
            metrics=metrics
        )

    return TableValidationResult(
        is_valid=True,
        table_element=element,
        metrics=metrics
    )
def _validate_table_content(self, element: Dict[str, Any]) -> Dict[str, Any]:
    """
    Validate table content to detect false positive tables.

    Checks:
    1. Sparsity: text coverage ratio (text area / table area)
    2. Header: does table have proper header structure
    3. Key-Value: for 2-col tables, is it a key-value list or random layout
    4. Prose: are cells containing long prose text

    Returns:
        Dict with is_tabular (bool) and reason (str)
    """
    html_content = element.get("content", "")
    bbox = element.get("bbox", [])
    cell_boxes = element.get("cell_boxes", [])
    # No table HTML to analyse: fail open and keep the element as a table.
    if not html_content or '<table' not in html_content.lower():
        return {"is_tabular": True, "reason": "no_html_content"}
    try:
        # Lazy third-party import; any failure (including a missing bs4)
        # is caught by the except below and the table is accepted.
        from bs4 import BeautifulSoup
        soup = BeautifulSoup(html_content, 'html.parser')
        table = soup.find('table')
        if not table:
            return {"is_tabular": True, "reason": "no_table_element"}
        rows = table.find_all('tr')
        if not rows:
            return {"is_tabular": True, "reason": "no_rows"}
        # Extract cell contents with row structure: one dict per cell,
        # collected both per-row (row_data) and flat (all_cells).
        row_data = []
        all_cells = []
        for row_idx, row in enumerate(rows):
            cells = row.find_all(['td', 'th'])
            row_cells = []
            for cell in cells:
                text = cell.get_text(strip=True)
                # NOTE(review): a non-numeric colspan attribute would raise
                # here; that lands in the fail-open except handler below.
                colspan = int(cell.get('colspan', 1))
                is_header = cell.name == 'th'
                cell_info = {
                    "text": text,
                    "length": len(text),
                    "colspan": colspan,
                    "is_header": is_header,
                    "row": row_idx
                }
                row_cells.append(cell_info)
                all_cells.append(cell_info)
            row_data.append(row_cells)
        if not all_cells:
            return {"is_tabular": True, "reason": "no_cells"}
        num_rows = len(row_data)
        num_cols = max(len(r) for r in row_data) if row_data else 0
        # === Check 1: Sparsity (text coverage) ===
        sparsity_result = self._check_sparsity(bbox, cell_boxes, all_cells)
        if not sparsity_result["is_valid"]:
            return {"is_tabular": False, "reason": sparsity_result["reason"]}
        # === Check 2: Header structure ===
        # Advisory only: a missing header lowers confidence but does not
        # reject the table by itself.
        header_result = self._check_header_structure(row_data, num_cols)
        if not header_result["has_header"] and num_rows > 3:
            # Large table without header is suspicious
            logger.debug(f"Table has no header structure with {num_rows} rows")
        # === Check 3: Key-Value pattern for 2-column tables ===
        if num_cols == 2:
            kv_result = self._check_key_value_pattern(row_data)
            if kv_result["is_kv_list"] and kv_result["confidence"] > 0.7:
                # High confidence key-value list - keep as table but log
                logger.debug(f"Table identified as key-value list (conf={kv_result['confidence']:.2f})")
            elif not kv_result["is_kv_list"] and kv_result["is_random_layout"]:
                # Random 2-column layout, not a real table
                return {
                    "is_tabular": False,
                    "reason": f"random_two_column_layout (not key-value)"
                }
        # === Check 4: Prose content ===
        # Cells over 80 chars look like sentences, not tabular values.
        long_cells = [c for c in all_cells if c["length"] > 80]
        prose_ratio = len(long_cells) / len(all_cells) if all_cells else 0
        if prose_ratio > 0.3:
            return {
                "is_tabular": False,
                "reason": f"prose_content ({len(long_cells)}/{len(all_cells)} cells > 80 chars)"
            }
        # === Check 5: Section header as table ===
        # A tiny table whose only first-row cell is a short ALL-CAPS string
        # is treated as a mis-detected section heading.
        if num_rows <= 2 and num_cols <= 2:
            first_row = row_data[0] if row_data else []
            if len(first_row) == 1:
                text = first_row[0]["text"]
                if text.isupper() and len(text) < 50:
                    return {
                        "is_tabular": False,
                        "reason": f"section_header_only ({text[:30]})"
                    }
        return {"is_tabular": True, "reason": "content_valid"}
    except Exception as e:
        # Fail open: content heuristics must never drop a real table.
        logger.warning(f"Content validation failed: {e}")
        return {"is_tabular": True, "reason": f"validation_error: {e}"}
def _check_sparsity(
self,
bbox: List[float],
cell_boxes: List[List[float]],
all_cells: List[Dict]
) -> Dict[str, Any]:
"""
Check text coverage ratio (sparsity).
Two-column layouts have large empty gaps in the middle.
Real tables have more uniform cell distribution.
"""
if len(bbox) < 4:
return {"is_valid": True, "reason": "no_bbox"}
table_width = bbox[2] - bbox[0]
table_height = bbox[3] - bbox[1]
table_area = table_width * table_height
if table_area <= 0:
return {"is_valid": True, "reason": "invalid_area"}
# Calculate text area from cell_boxes
if cell_boxes:
text_area = 0
for cb in cell_boxes:
if len(cb) >= 4:
w = abs(cb[2] - cb[0])
h = abs(cb[3] - cb[1])
text_area += w * h
coverage = text_area / table_area
else:
# Estimate from cell content length
total_chars = sum(c["length"] for c in all_cells)
# Rough estimate: 1 char ≈ 8x12 pixels = 96 px²
estimated_text_area = total_chars * 96
coverage = min(estimated_text_area / table_area, 1.0)
# Very sparse table (< 15% coverage) is suspicious
if coverage < 0.15:
return {
"is_valid": False,
"reason": f"sparse_content (coverage={coverage:.1%})"
}
return {"is_valid": True, "coverage": coverage}
def _check_header_structure(
self,
row_data: List[List[Dict]],
num_cols: int
) -> Dict[str, Any]:
"""
Check if table has proper header structure.
Real tables usually have:
- First row with <th> elements
- Or first row with different content pattern (labels vs values)
"""
if not row_data:
return {"has_header": False}
first_row = row_data[0]
# Check for <th> elements
th_count = sum(1 for c in first_row if c.get("is_header", False))
if th_count > 0 and th_count >= len(first_row) * 0.5:
return {"has_header": True, "type": "th_elements"}
# Check for header-like content (short, distinct from body)
if len(row_data) > 1:
first_row_avg_len = sum(c["length"] for c in first_row) / len(first_row) if first_row else 0
body_rows = row_data[1:]
body_cells = [c for row in body_rows for c in row]
body_avg_len = sum(c["length"] for c in body_cells) / len(body_cells) if body_cells else 0
# Header row should be shorter (labels) than body (data)
if first_row_avg_len < body_avg_len * 0.7:
return {"has_header": True, "type": "short_labels"}
return {"has_header": False}
def _check_key_value_pattern(
self,
row_data: List[List[Dict]]
) -> Dict[str, Any]:
"""
For 2-column tables, check if it's a key-value list.
Key-value characteristics:
- Left column: short labels (< 30 chars)
- Right column: values (can be longer)
- Consistent pattern across rows
Random layout characteristics:
- Both columns have similar length distribution
- No clear label-value relationship
"""
if not row_data:
return {"is_kv_list": False, "is_random_layout": False, "confidence": 0}
left_lengths = []
right_lengths = []
kv_rows = 0
total_rows = 0
for row in row_data:
if len(row) != 2:
continue
total_rows += 1
left = row[0]
right = row[1]
left_lengths.append(left["length"])
right_lengths.append(right["length"])
# Key-value pattern: left is short label, right is value
if left["length"] < 40 and left["length"] < right["length"] * 2:
kv_rows += 1
if total_rows == 0:
return {"is_kv_list": False, "is_random_layout": False, "confidence": 0}
kv_ratio = kv_rows / total_rows
avg_left = sum(left_lengths) / len(left_lengths) if left_lengths else 0
avg_right = sum(right_lengths) / len(right_lengths) if right_lengths else 0
# High KV ratio and left column is shorter = key-value list
if kv_ratio > 0.6 and avg_left < avg_right:
return {
"is_kv_list": True,
"is_random_layout": False,
"confidence": kv_ratio,
"avg_left": avg_left,
"avg_right": avg_right
}
# Similar lengths on both sides = random layout
if avg_left > 0 and 0.5 < avg_right / avg_left < 2.0:
# Both columns have similar content length
return {
"is_kv_list": False,
"is_random_layout": True,
"confidence": 1 - kv_ratio,
"avg_left": avg_left,
"avg_right": avg_right
}
return {
"is_kv_list": False,
"is_random_layout": False,
"confidence": 0,
"avg_left": avg_left,
"avg_right": avg_right
}
def extract_text_from_table_html(self, html_content: str) -> str:
    """
    Extract plain text from table HTML content.

    Args:
        html_content: HTML string containing table structure

    Returns:
        Space-joined text of all <td>/<th> cells; on a parser failure,
        a regex-based tag strip of the whole input is returned instead.
    """
    if not html_content:
        return ""
    try:
        class _CellTextCollector(HTMLParser):
            """Accumulates stripped text that appears inside td/th tags."""

            def __init__(self):
                super().__init__()
                self.pieces = []
                self._inside_cell = False

            def handle_starttag(self, tag, attrs):
                if tag in ('td', 'th'):
                    self._inside_cell = True

            def handle_endtag(self, tag):
                if tag in ('td', 'th'):
                    self._inside_cell = False

            def handle_data(self, data):
                # Only keep non-blank text found inside a cell.
                if self._inside_cell:
                    piece = data.strip()
                    if piece:
                        self.pieces.append(piece)

        collector = _CellTextCollector()
        collector.feed(html_content)
        return ' '.join(collector.pieces)
    except Exception as e:
        logger.warning(f"Failed to parse table HTML: {e}")
        # Fallback: strip HTML tags with regex
        plain = re.sub(r'<[^>]+>', ' ', html_content)
        return re.sub(r'\s+', ' ', plain).strip()
def reclassify_as_text(self, element: Dict[str, Any]) -> Dict[str, Any]:
"""
Convert an over-detected table element to a TEXT element.
Args:
element: Table element to reclassify
Returns:
New TEXT element with preserved content
"""
# Extract text content from HTML
html_content = element.get("content", "")
text_content = self.extract_text_from_table_html(html_content)
# Create new TEXT element
text_element = {
"element_id": element.get("element_id", ""),
"type": "text",
"original_type": "table_reclassified", # Mark as reclassified
"content": text_content,
"page": element.get("page", 0),
"bbox": element.get("bbox", []),
"index": element.get("index", 0),
"confidence": element.get("confidence", 1.0),
"reclassified_from": "table",
"reclassification_reason": "over_detection"
}
return text_element
def validate_and_filter_elements(
self,
elements: List[Dict[str, Any]]
) -> Tuple[List[Dict[str, Any]], Dict[str, Any]]:
"""
Validate all elements and filter/reclassify over-detected tables.
Args:
elements: List of elements from PP-StructureV3 output
Returns:
Tuple of (filtered_elements, statistics)
"""
filtered_elements = []
stats = {
"total_tables": 0,
"valid_tables": 0,
"reclassified_tables": 0,
"reclassification_details": []
}
for element in elements:
if element.get("type") != "table":
# Non-table elements pass through unchanged
filtered_elements.append(element)
continue
stats["total_tables"] += 1
# Validate table
result = self.validate_table(element)
if result.is_valid:
stats["valid_tables"] += 1
filtered_elements.append(element)
else:
# Reclassify as TEXT
stats["reclassified_tables"] += 1
text_element = self.reclassify_as_text(element)
filtered_elements.append(text_element)
stats["reclassification_details"].append({
"element_id": element.get("element_id"),
"reason": result.reason,
"metrics": result.metrics
})
logger.info(
f"Reclassified table {element.get('element_id')} as TEXT: {result.reason}"
)
# Re-sort by reading order (y0 then x0)
filtered_elements = self._sort_by_reading_order(filtered_elements)
return filtered_elements, stats
def _sort_by_reading_order(
self,
elements: List[Dict[str, Any]]
) -> List[Dict[str, Any]]:
"""Sort elements by reading order (top-to-bottom, left-to-right)."""
def sort_key(elem):
bbox = elem.get("bbox", [0, 0, 0, 0])
if isinstance(bbox, dict):
y0 = bbox.get("y0", 0)
x0 = bbox.get("x0", 0)
elif isinstance(bbox, list) and len(bbox) >= 2:
x0, y0 = bbox[0], bbox[1]
else:
y0, x0 = 0, 0
return (y0, x0)
return sorted(elements, key=sort_key)