fix: resolve Direct track PDF regression issues

- Add _is_likely_chart() to detect charts misclassified as tables
  - High empty cell ratio (>70%) indicates chart grid
  - Axis label patterns (numbers, °C, %, Time, Temperature)
  - Multi-line cells with axis text

- Add _build_rows_from_cells_dict() to handle JSON table content
  - Properly parse cells structure from Direct extraction
  - Avoid HTML round-trip conversion issues

- Remove rowHeights parameter from Table() to fix content overlap
  - Let ReportLab auto-calculate row heights based on content
  - Use scaling to fit within bbox

Fixes edit.pdf table overlap and chart misclassification issues.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
egg
2025-11-26 12:29:46 +08:00
parent 1afdb822c3
commit 79cffe6da0
2 changed files with 168 additions and 32 deletions

View File

@@ -690,6 +690,89 @@ class DirectExtractionEngine:
# Default to paragraph for longer text, text for shorter
return ElementType.PARAGRAPH if len(text) > 150 else ElementType.TEXT
def _is_likely_chart(self, data: list, table) -> bool:
    """
    Detect if a "table" detected by find_tables() is actually a chart/graph.

    Charts often get misclassified as tables because they have grid lines.
    The check is purely textual and applies three rules, in order:

    1. More than 70% of the cells are empty (typical of a chart grid).
    2. More than 50% of the cells are empty AND at least 3 cells look like
       axis labels (numbers, units such as °C or %, "Time", "Temperature").
    3. At least one cell is multi-line, at least 2 cells look like axis
       labels, AND the first cell of the first row spans 5 or more lines
       (often chart legend text).

    Args:
        data: Extracted table data (list of rows, each a list of cells).
            A cell counts as empty when it is None or blank after
            stripping whitespace.
        table: PyMuPDF table object. Not used by the current heuristics;
            kept so the call signature stays unchanged.

    Returns:
        True if the table is likely a chart, False otherwise (including
        when ``data`` is empty).
    """
    if not data:
        return False

    import re

    # Compile the axis-label patterns once per call instead of re-resolving
    # them for every cell. The numeric pattern is anchored without
    # re.MULTILINE, so it only matches when the whole cell is a number.
    axis_regexes = [
        re.compile(pattern, re.IGNORECASE)
        for pattern in (
            r'^-?\d+\.?\d*$',     # Integer or decimal numbers (axis ticks)
            r'°[CF]',             # Temperature units
            r'%$',                # Percentage
            r'\bppm\b',           # Parts per million
            r'\bmin\b',           # Minutes
            r'\bsec\b',           # Seconds
            r'\bTime\b',          # Time axis label
            r'\bTemperature\b',   # Temperature axis label
            r'[Aa]xis',           # Axis label
        )
    ]

    total_cells = 0
    empty_cells = 0
    multi_line_cells = 0
    axis_pattern_cells = 0

    for row in data:
        for cell in row:
            total_cells += 1
            # Only None/blank counts as empty; a falsy value such as the
            # number 0 is real content and must not inflate the empty ratio.
            cell_text = "" if cell is None else str(cell).strip()
            if not cell_text:
                empty_cells += 1
                continue
            if '\n' in cell_text:
                multi_line_cells += 1
            # A cell is counted at most once, however many patterns match.
            if any(regex.search(cell_text) for regex in axis_regexes):
                axis_pattern_cells += 1

    empty_ratio = empty_cells / total_cells if total_cells > 0 else 0

    # Rule 1: very high empty cell ratio (>70%) suggests a chart grid.
    if empty_ratio > 0.7:
        logger.debug(f"Chart detection: high empty ratio {empty_ratio:.2f} (>70%)")
        return True

    # Rule 2: high empty ratio combined with axis-like cells.
    if empty_ratio > 0.5 and axis_pattern_cells >= 3:
        logger.debug(f"Chart detection: empty ratio {empty_ratio:.2f} + {axis_pattern_cells} axis patterns")
        return True

    # Rule 3: a long multi-line first cell alongside axis-like cells.
    if multi_line_cells >= 1 and axis_pattern_cells >= 2:
        first_cell = str(data[0][0]).strip() if data[0] else ""
        line_count = len(first_cell.split('\n'))
        # Five or more lines necessarily means the cell contains a newline.
        if line_count >= 5:
            logger.debug(f"Chart detection: first cell has {line_count} lines with axis patterns")
            return True

    return False
def _process_native_table(self, table, page_num: int, counter: int) -> Optional[DocumentElement]:
"""Process a natively detected table"""
try:
@@ -698,6 +781,11 @@ class DirectExtractionEngine:
if not data or len(data) < self.min_table_rows:
return None
# Check if this "table" is actually a chart (misclassified by find_tables)
if self._is_likely_chart(data, table):
logger.info(f"Skipping table_{page_num}_{counter} - detected as chart (not table)")
return None
# Get table bounding box
bbox_data = table.bbox
bbox = BoundingBox(