fix: resolve Direct track PDF regression issues
- Add _is_likely_chart() to detect charts misclassified as tables
  - High empty cell ratio (>70%) indicates chart grid
  - Axis label patterns (numbers, °C, %, Time, Temperature)
  - Multi-line cells with axis text
- Add _build_rows_from_cells_dict() to handle JSON table content
  - Properly parse cells structure from Direct extraction
  - Avoid HTML round-trip conversion issues
- Remove rowHeights parameter from Table() to fix content overlap
  - Let ReportLab auto-calculate row heights based on content
  - Use scaling to fit within bbox

Fixes edit.pdf table overlap and chart misclassification issues.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -690,6 +690,89 @@ class DirectExtractionEngine:
|
||||
# Default to paragraph for longer text, text for shorter
|
||||
return ElementType.PARAGRAPH if len(text) > 150 else ElementType.TEXT
|
||||
|
||||
def _is_likely_chart(self, data: list, table) -> bool:
    """
    Detect if a "table" detected by find_tables() is actually a chart/graph.

    Charts often get misclassified as tables because they have grid lines.
    The table is reported as a chart when ANY of these rules holds:

    1. More than 70% of the cells are empty (a bare chart grid).
    2. More than 50% of the cells are empty and at least 3 non-empty
       cells look like axis labels (numbers, units like °C, %, etc.).
    3. At least one cell is multi-line, at least 2 cells look like axis
       labels, and the first cell holds 5 or more lines of text
       (often chart legend/axis text collapsed into one cell).

    Falsy cells (None, "") and whitespace-only cells count as empty.

    Args:
        data: Extracted table data (list of lists)
        table: PyMuPDF table object (not inspected here; kept so the
            signature stays stable for callers)

    Returns:
        True if the table is likely a chart
    """
    if not data:
        return False

    # Function-scope import, as in the original: whether the module
    # imports `re` at top level is not visible from this block.
    import re

    # One alternation instead of a per-cell loop over separate patterns:
    # a cell counts at most once, and search() is truthy exactly when at
    # least one alternative matches.
    axis_regex = re.compile(
        "|".join(
            f"(?:{pattern})"
            for pattern in (
                r'^-?\d+\.?\d*$',     # Integer or decimal numbers (axis ticks)
                r'°[CF]',             # Temperature units
                r'%$',                # Percentage
                r'\bppm\b',           # Parts per million
                r'\bmin\b',           # Minutes
                r'\bsec\b',           # Seconds
                r'\bTime\b',          # Time axis label
                r'\bTemperature\b',   # Temperature axis label
                r'[Aa]xis',           # Axis label
            )
        ),
        re.IGNORECASE,
    )

    # Count total cells and classify the non-empty ones
    total_cells = 0
    empty_cells = 0
    multi_line_cells = 0
    axis_pattern_cells = 0

    for row in data:
        for cell in row:
            total_cells += 1
            cell_text = str(cell).strip() if cell else ""

            if not cell_text:
                empty_cells += 1
                continue

            # Check for multi-line content
            if '\n' in cell_text:
                multi_line_cells += 1

            # Check for axis patterns
            if axis_regex.search(cell_text):
                axis_pattern_cells += 1

    # Guard against a table made only of empty rows (no cells at all)
    empty_ratio = empty_cells / total_cells if total_cells > 0 else 0

    # Decision criteria for chart detection:
    # 1. Very high empty cell ratio (>70%) suggests it's a chart grid
    if empty_ratio > 0.7:
        logger.debug(f"Chart detection: high empty ratio {empty_ratio:.2f} (>70%)")
        return True

    # 2. High empty ratio + axis patterns suggests chart
    if empty_ratio > 0.5 and axis_pattern_cells >= 3:
        logger.debug(f"Chart detection: empty ratio {empty_ratio:.2f} + {axis_pattern_cells} axis patterns")
        return True

    # 3. Multi-line cell with axis patterns in first cell (often chart legend text)
    if multi_line_cells >= 1 and axis_pattern_cells >= 2:
        # Treat a missing/None first cell as empty text, like the loop above
        first_cell = str(data[0][0]).strip() if data[0] and data[0][0] else ""
        line_count = len(first_cell.split('\n'))
        # Five or more lines implies the cell contains a newline
        if line_count >= 5:
            logger.debug(f"Chart detection: first cell has {line_count} lines with axis patterns")
            return True

    return False
|
||||
|
||||
def _process_native_table(self, table, page_num: int, counter: int) -> Optional[DocumentElement]:
|
||||
"""Process a natively detected table"""
|
||||
try:
|
||||
@@ -698,6 +781,11 @@ class DirectExtractionEngine:
|
||||
if not data or len(data) < self.min_table_rows:
|
||||
return None
|
||||
|
||||
# Check if this "table" is actually a chart (misclassified by find_tables)
|
||||
if self._is_likely_chart(data, table):
|
||||
logger.info(f"Skipping table_{page_num}_{counter} - detected as chart (not table)")
|
||||
return None
|
||||
|
||||
# Get table bounding box
|
||||
bbox_data = table.bbox
|
||||
bbox = BoundingBox(
|
||||
|
||||
Reference in New Issue
Block a user