fix: resolve Direct track PDF regression issues

- Add _is_likely_chart() to detect charts misclassified as tables
  - High empty cell ratio (>70%) indicates chart grid
  - Axis label patterns (numbers, °C, %, Time, Temperature)
  - Multi-line cells with axis text

- Add _build_rows_from_cells_dict() to handle JSON table content
  - Properly parse cells structure from Direct extraction
  - Avoid HTML round-trip conversion issues

- Remove rowHeights parameter from Table() to fix content overlap
  - Let ReportLab auto-calculate row heights based on content
  - Use scaling to fit within bbox

Fixes edit.pdf table overlap and chart misclassification issues.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
egg
2025-11-26 12:29:46 +08:00
parent 1afdb822c3
commit 79cffe6da0
2 changed files with 168 additions and 32 deletions

View File

@@ -690,6 +690,89 @@ class DirectExtractionEngine:
# Default to paragraph for longer text, text for shorter # Default to paragraph for longer text, text for shorter
return ElementType.PARAGRAPH if len(text) > 150 else ElementType.TEXT return ElementType.PARAGRAPH if len(text) > 150 else ElementType.TEXT
def _is_likely_chart(self, data: list, table) -> bool:
"""
Detect if a "table" detected by find_tables() is actually a chart/graph.
Charts often get misclassified as tables because they have grid lines.
Characteristics of a chart misclassified as table:
1. High percentage of empty cells (>60%)
2. Content patterns that look like axis labels (numbers, units like °C, %, etc.)
3. Single cell contains multi-line text with chart-like patterns
4. Cell content contains typical chart axis patterns
Args:
data: Extracted table data (list of lists)
table: PyMuPDF table object
Returns:
True if the table is likely a chart
"""
if not data:
return False
# Count total cells and empty cells
total_cells = 0
empty_cells = 0
multi_line_cells = 0
axis_pattern_cells = 0
# Patterns that suggest chart axis labels
import re
axis_patterns = [
r'^-?\d+$', # Simple numbers (axis ticks)
r'^-?\d+\.?\d*$', # Decimal numbers
r'°[CF]', # Temperature units
r'%$', # Percentage
r'\bppm\b', # Parts per million
r'\bmin\b', # Minutes
r'\bsec\b', # Seconds
r'\bTime\b', # Time axis label
r'\bTemperature\b', # Temperature axis label
r'[Aa]xis', # Axis label
]
for row in data:
for cell in row:
total_cells += 1
cell_text = str(cell).strip() if cell else ""
if not cell_text:
empty_cells += 1
else:
# Check for multi-line content
if '\n' in cell_text:
multi_line_cells += 1
# Check for axis patterns
for pattern in axis_patterns:
if re.search(pattern, cell_text, re.IGNORECASE):
axis_pattern_cells += 1
break
# Calculate metrics
empty_ratio = empty_cells / total_cells if total_cells > 0 else 0
# Decision criteria for chart detection:
# 1. Very high empty cell ratio (>70%) suggests it's a chart grid
if empty_ratio > 0.7:
logger.debug(f"Chart detection: high empty ratio {empty_ratio:.2f} (>70%)")
return True
# 2. High empty ratio + axis patterns suggests chart
if empty_ratio > 0.5 and axis_pattern_cells >= 3:
logger.debug(f"Chart detection: empty ratio {empty_ratio:.2f} + {axis_pattern_cells} axis patterns")
return True
# 3. Multi-line cell with axis patterns in first cell (often chart legend text)
if multi_line_cells >= 1 and axis_pattern_cells >= 2:
first_cell = str(data[0][0]).strip() if data and data[0] else ""
if '\n' in first_cell and len(first_cell.split('\n')) >= 5:
logger.debug(f"Chart detection: first cell has {len(first_cell.split(chr(10)))} lines with axis patterns")
return True
return False
def _process_native_table(self, table, page_num: int, counter: int) -> Optional[DocumentElement]: def _process_native_table(self, table, page_num: int, counter: int) -> Optional[DocumentElement]:
"""Process a natively detected table""" """Process a natively detected table"""
try: try:
@@ -698,6 +781,11 @@ class DirectExtractionEngine:
if not data or len(data) < self.min_table_rows: if not data or len(data) < self.min_table_rows:
return None return None
# Check if this "table" is actually a chart (misclassified by find_tables)
if self._is_likely_chart(data, table):
logger.info(f"Skipping table_{page_num}_{counter} - detected as chart (not table)")
return None
# Get table bounding box # Get table bounding box
bbox_data = table.bbox bbox_data = table.bbox
bbox = BoundingBox( bbox = BoundingBox(

View File

@@ -2235,6 +2235,57 @@ class PDFGeneratorService:
except Exception as e: except Exception as e:
logger.error(f"Failed to draw text element {element.element_id}: {e}") logger.error(f"Failed to draw text element {element.element_id}: {e}")
def _build_rows_from_cells_dict(self, content: dict) -> list:
"""
Build row structure from cells dict (from Direct extraction JSON).
The cells structure from Direct extraction:
{
"rows": 6,
"cols": 2,
"cells": [
{"row": 0, "col": 0, "content": "..."},
{"row": 0, "col": 1, "content": "..."},
...
]
}
Returns format compatible with HTMLTableParser output:
[
{"cells": [{"text": "..."}, {"text": "..."}]}, # row 0
{"cells": [{"text": "..."}, {"text": "..."}]}, # row 1
...
]
"""
try:
num_rows = content.get('rows', 0)
num_cols = content.get('cols', 0)
cells = content.get('cells', [])
if not cells or num_rows == 0 or num_cols == 0:
return []
# Initialize rows structure
rows_data = []
for _ in range(num_rows):
rows_data.append({'cells': [{'text': ''} for _ in range(num_cols)]})
# Fill in cell content
for cell in cells:
row_idx = cell.get('row', 0)
col_idx = cell.get('col', 0)
cell_content = cell.get('content', '')
if 0 <= row_idx < num_rows and 0 <= col_idx < num_cols:
rows_data[row_idx]['cells'][col_idx]['text'] = str(cell_content) if cell_content else ''
logger.debug(f"Built {num_rows} rows from cells dict")
return rows_data
except Exception as e:
logger.error(f"Error building rows from cells dict: {e}")
return []
def _draw_table_element_direct( def _draw_table_element_direct(
self, self,
pdf_canvas: canvas.Canvas, pdf_canvas: canvas.Canvas,
@@ -2250,28 +2301,34 @@ class PDFGeneratorService:
page_height: Page height for coordinate transformation page_height: Page height for coordinate transformation
""" """
try: try:
# Get table HTML content # Get table data - can be TableData object or dict from JSON
rows_data = None
if isinstance(element.content, TableData): if isinstance(element.content, TableData):
# Direct TableData object - convert to HTML then parse
html_content = element.content.to_html() html_content = element.content.to_html()
elif isinstance(element.content, dict):
html_content = element.content.get('html', str(element.content))
else:
html_content = str(element.content)
if not html_content:
logger.warning(f"No HTML content for table {element.element_id}")
return
# Parse HTML
parser = HTMLTableParser() parser = HTMLTableParser()
parser.feed(html_content) parser.feed(html_content)
if parser.tables and parser.tables[0]['rows']:
rows_data = parser.tables[0]['rows']
elif isinstance(element.content, dict):
# Dict from JSON - check if it has cells structure (from Direct extraction)
if 'cells' in element.content:
# Build rows from cells structure directly (avoid HTML round-trip)
rows_data = self._build_rows_from_cells_dict(element.content)
elif 'html' in element.content:
# Has HTML content - parse it
html_content = element.content['html']
parser = HTMLTableParser()
parser.feed(html_content)
if parser.tables and parser.tables[0]['rows']:
rows_data = parser.tables[0]['rows']
if not parser.tables or not parser.tables[0]['rows']: if not rows_data:
logger.warning(f"No table data parsed for {element.element_id}") logger.warning(f"No table data for {element.element_id}")
return return
table_data = parser.tables[0] rows = rows_data
rows = table_data['rows']
# Get bbox # Get bbox
bbox = element.bbox bbox = element.bbox
@@ -2304,23 +2361,14 @@ class PDFGeneratorService:
col_widths = element.metadata['column_widths'] col_widths = element.metadata['column_widths']
logger.debug(f"Using extracted column widths: {col_widths}") logger.debug(f"Using extracted column widths: {col_widths}")
# Use original row heights from extraction if available # NOTE: Don't use rowHeights from extraction - it causes content overlap
# Row heights must match the number of data rows exactly # The extracted row heights are based on cell boundaries, not text content height.
row_heights_list = None # When text wraps or uses different font sizes, the heights don't match.
if element.metadata and 'row_heights' in element.metadata: # Let ReportLab auto-calculate row heights based on content, then use scaling
extracted_row_heights = element.metadata['row_heights'] # to fit within the bbox (same approach as old commit ba8ddf2b).
num_data_rows = len(table_content)
num_height_rows = len(extracted_row_heights)
if num_height_rows == num_data_rows: # Create table without rowHeights - let ReportLab auto-calculate
row_heights_list = extracted_row_heights t = Table(table_content, colWidths=col_widths)
logger.debug(f"Using extracted row heights ({num_height_rows} rows): {row_heights_list}")
else:
# Row counts don't match - this can happen with merged cells or empty rows
logger.warning(f"Row height mismatch: {num_height_rows} heights for {num_data_rows} data rows, falling back to auto-sizing")
# Create table with both column widths and row heights for accurate sizing
t = Table(table_content, colWidths=col_widths, rowHeights=row_heights_list)
# Apply style with minimal padding to reduce table extension # Apply style with minimal padding to reduce table extension
# Use Chinese font to support special characters (℃, μm, ≦, ×, Ω, etc.) # Use Chinese font to support special characters (℃, μm, ≦, ×, Ω, etc.)