fix: resolve Direct track PDF regression issues
- Add _is_likely_chart() to detect charts misclassified as tables
  - High empty cell ratio (>70%) indicates chart grid
  - Axis label patterns (numbers, °C, %, Time, Temperature)
  - Multi-line cells with axis text
- Add _build_rows_from_cells_dict() to handle JSON table content
  - Properly parse cells structure from Direct extraction
  - Avoid HTML round-trip conversion issues
- Remove rowHeights parameter from Table() to fix content overlap
  - Let ReportLab auto-calculate row heights based on content
  - Use scaling to fit within bbox

Fixes edit.pdf table overlap and chart misclassification issues.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -2235,6 +2235,57 @@ class PDFGeneratorService:
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to draw text element {element.element_id}: {e}")
|
||||
|
||||
def _build_rows_from_cells_dict(self, content: dict) -> list:
    """
    Build a row structure from a cells dict (from Direct extraction JSON).

    Expected input shape:
        {
            "rows": 6,
            "cols": 2,
            "cells": [
                {"row": 0, "col": 0, "content": "..."},
                {"row": 0, "col": 1, "content": "..."},
                ...
            ]
        }

    Returned shape (one dict per row, one dict per column):
        [
            {"cells": [{"text": "..."}, {"text": "..."}]},  # row 0
            {"cells": [{"text": "..."}, {"text": "..."}]},  # row 1
            ...
        ]

    Args:
        content: Dict carrying ``rows``, ``cols`` and ``cells`` keys.

    Returns:
        A list of ``rows`` row dicts, each holding ``cols`` cell dicts.
        Positions not mentioned in ``cells`` keep an empty ``text``.
        Cells whose ``row``/``col`` fall outside the declared grid are
        ignored. An empty list is returned when ``cells`` is empty, when
        either dimension is 0, or when any error occurs while building
        (the error is logged, not raised).
    """
    try:
        num_rows = content.get('rows', 0)
        num_cols = content.get('cols', 0)
        cells = content.get('cells', [])

        if not cells or num_rows == 0 or num_cols == 0:
            return []

        # Pre-fill the full grid so missing cells render as empty strings.
        # Every cell gets its own dict; nothing is shared between positions.
        rows_data = [
            {'cells': [{'text': ''} for _ in range(num_cols)]}
            for _ in range(num_rows)
        ]

        for cell in cells:
            row_idx = cell.get('row', 0)
            col_idx = cell.get('col', 0)
            cell_content = cell.get('content', '')

            # Silently drop cells that point outside the declared grid.
            if not (0 <= row_idx < num_rows and 0 <= col_idx < num_cols):
                continue

            # A plain truthiness test would blank out a numeric 0 / 0.0,
            # which is legitimate table data. Keep numeric zeros; every
            # other falsy value (None, '', False, empty containers) still
            # maps to an empty string.
            is_number = (
                isinstance(cell_content, (int, float))
                and not isinstance(cell_content, bool)
            )
            if cell_content or is_number:
                text = str(cell_content)
            else:
                text = ''
            rows_data[row_idx]['cells'][col_idx]['text'] = text

        logger.debug(f"Built {num_rows} rows from cells dict")
        return rows_data

    except Exception as e:
        # Best-effort: a malformed payload yields "no table data" for the
        # caller instead of aborting the whole page render.
        logger.error(f"Error building rows from cells dict: {e}")
        return []
|
||||
|
||||
def _draw_table_element_direct(
|
||||
self,
|
||||
pdf_canvas: canvas.Canvas,
|
||||
@@ -2250,28 +2301,34 @@ class PDFGeneratorService:
|
||||
page_height: Page height for coordinate transformation
|
||||
"""
|
||||
try:
|
||||
# Get table HTML content
|
||||
# Get table data - can be TableData object or dict from JSON
|
||||
rows_data = None
|
||||
|
||||
if isinstance(element.content, TableData):
|
||||
# Direct TableData object - convert to HTML then parse
|
||||
html_content = element.content.to_html()
|
||||
parser = HTMLTableParser()
|
||||
parser.feed(html_content)
|
||||
if parser.tables and parser.tables[0]['rows']:
|
||||
rows_data = parser.tables[0]['rows']
|
||||
elif isinstance(element.content, dict):
|
||||
html_content = element.content.get('html', str(element.content))
|
||||
else:
|
||||
html_content = str(element.content)
|
||||
# Dict from JSON - check if it has cells structure (from Direct extraction)
|
||||
if 'cells' in element.content:
|
||||
# Build rows from cells structure directly (avoid HTML round-trip)
|
||||
rows_data = self._build_rows_from_cells_dict(element.content)
|
||||
elif 'html' in element.content:
|
||||
# Has HTML content - parse it
|
||||
html_content = element.content['html']
|
||||
parser = HTMLTableParser()
|
||||
parser.feed(html_content)
|
||||
if parser.tables and parser.tables[0]['rows']:
|
||||
rows_data = parser.tables[0]['rows']
|
||||
|
||||
if not html_content:
|
||||
logger.warning(f"No HTML content for table {element.element_id}")
|
||||
if not rows_data:
|
||||
logger.warning(f"No table data for {element.element_id}")
|
||||
return
|
||||
|
||||
# Parse HTML
|
||||
parser = HTMLTableParser()
|
||||
parser.feed(html_content)
|
||||
|
||||
if not parser.tables or not parser.tables[0]['rows']:
|
||||
logger.warning(f"No table data parsed for {element.element_id}")
|
||||
return
|
||||
|
||||
table_data = parser.tables[0]
|
||||
rows = table_data['rows']
|
||||
rows = rows_data
|
||||
|
||||
# Get bbox
|
||||
bbox = element.bbox
|
||||
@@ -2304,23 +2361,14 @@ class PDFGeneratorService:
|
||||
col_widths = element.metadata['column_widths']
|
||||
logger.debug(f"Using extracted column widths: {col_widths}")
|
||||
|
||||
# Use original row heights from extraction if available
|
||||
# Row heights must match the number of data rows exactly
|
||||
row_heights_list = None
|
||||
if element.metadata and 'row_heights' in element.metadata:
|
||||
extracted_row_heights = element.metadata['row_heights']
|
||||
num_data_rows = len(table_content)
|
||||
num_height_rows = len(extracted_row_heights)
|
||||
# NOTE: Don't use rowHeights from extraction - it causes content overlap
|
||||
# The extracted row heights are based on cell boundaries, not text content height.
|
||||
# When text wraps or uses different font sizes, the heights don't match.
|
||||
# Let ReportLab auto-calculate row heights based on content, then use scaling
|
||||
# to fit within the bbox (same approach as old commit ba8ddf2b).
|
||||
|
||||
if num_height_rows == num_data_rows:
|
||||
row_heights_list = extracted_row_heights
|
||||
logger.debug(f"Using extracted row heights ({num_height_rows} rows): {row_heights_list}")
|
||||
else:
|
||||
# Row counts don't match - this can happen with merged cells or empty rows
|
||||
logger.warning(f"Row height mismatch: {num_height_rows} heights for {num_data_rows} data rows, falling back to auto-sizing")
|
||||
|
||||
# Create table with both column widths and row heights for accurate sizing
|
||||
t = Table(table_content, colWidths=col_widths, rowHeights=row_heights_list)
|
||||
# Create table without rowHeights - let ReportLab auto-calculate
|
||||
t = Table(table_content, colWidths=col_widths)
|
||||
|
||||
# Apply style with minimal padding to reduce table extension
|
||||
# Use Chinese font to support special characters (℃, μm, ≦, ×, Ω, etc.)
|
||||
|
||||
Reference in New Issue
Block a user