From 79cffe6da0852d5d675abb07c580e2b5fc899348 Mon Sep 17 00:00:00 2001 From: egg Date: Wed, 26 Nov 2025 12:29:46 +0800 Subject: [PATCH] fix: resolve Direct track PDF regression issues MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add _is_likely_chart() to detect charts misclassified as tables - High empty cell ratio (>70%) indicates chart grid - Axis label patterns (numbers, °C, %, Time, Temperature) - Multi-line cells with axis text - Add _build_rows_from_cells_dict() to handle JSON table content - Properly parse cells structure from Direct extraction - Avoid HTML round-trip conversion issues - Remove rowHeights parameter from Table() to fix content overlap - Let ReportLab auto-calculate row heights based on content - Use scaling to fit within bbox Fixes edit.pdf table overlap and chart misclassification issues. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- .../app/services/direct_extraction_engine.py | 88 ++++++++++++++ backend/app/services/pdf_generator_service.py | 112 +++++++++++++----- 2 files changed, 168 insertions(+), 32 deletions(-) diff --git a/backend/app/services/direct_extraction_engine.py b/backend/app/services/direct_extraction_engine.py index d0b23b9..0aad35c 100644 --- a/backend/app/services/direct_extraction_engine.py +++ b/backend/app/services/direct_extraction_engine.py @@ -690,6 +690,89 @@ class DirectExtractionEngine: # Default to paragraph for longer text, text for shorter return ElementType.PARAGRAPH if len(text) > 150 else ElementType.TEXT + def _is_likely_chart(self, data: list, table) -> bool: + """ + Detect if a "table" detected by find_tables() is actually a chart/graph. + + Charts often get misclassified as tables because they have grid lines. + Characteristics of a chart misclassified as table: + 1. High percentage of empty cells (>70%, or >50% combined with axis-label cells) + 2. Content patterns that look like axis labels (numbers, units like °C, %, etc.) + 3. 
Single cell contains multi-line text with chart-like patterns + 4. Cell content contains typical chart axis patterns + + Args: + data: Extracted table data (list of lists) + table: PyMuPDF table object + + Returns: + True if the table is likely a chart + """ + if not data: + return False + + # Count total cells and empty cells + total_cells = 0 + empty_cells = 0 + multi_line_cells = 0 + axis_pattern_cells = 0 + + # Patterns that suggest chart axis labels + import re + axis_patterns = [ + r'^-?\d+$', # Simple numbers (axis ticks) + r'^-?\d+\.?\d*$', # Decimal numbers + r'°[CF]', # Temperature units + r'%$', # Percentage + r'\bppm\b', # Parts per million + r'\bmin\b', # Minutes + r'\bsec\b', # Seconds + r'\bTime\b', # Time axis label + r'\bTemperature\b', # Temperature axis label + r'[Aa]xis', # Axis label + ] + + for row in data: + for cell in row: + total_cells += 1 + cell_text = str(cell).strip() if cell else "" + + if not cell_text: + empty_cells += 1 + else: + # Check for multi-line content + if '\n' in cell_text: + multi_line_cells += 1 + + # Check for axis patterns + for pattern in axis_patterns: + if re.search(pattern, cell_text, re.IGNORECASE): + axis_pattern_cells += 1 + break + + # Calculate metrics + empty_ratio = empty_cells / total_cells if total_cells > 0 else 0 + + # Decision criteria for chart detection: + # 1. Very high empty cell ratio (>70%) suggests it's a chart grid + if empty_ratio > 0.7: + logger.debug(f"Chart detection: high empty ratio {empty_ratio:.2f} (>70%)") + return True + + # 2. High empty ratio + axis patterns suggests chart + if empty_ratio > 0.5 and axis_pattern_cells >= 3: + logger.debug(f"Chart detection: empty ratio {empty_ratio:.2f} + {axis_pattern_cells} axis patterns") + return True + + # 3. 
Multi-line cell with axis patterns in first cell (often chart legend text) + if multi_line_cells >= 1 and axis_pattern_cells >= 2: + first_cell = str(data[0][0]).strip() if data and data[0] else "" + if '\n' in first_cell and len(first_cell.split('\n')) >= 5: + logger.debug(f"Chart detection: first cell has {len(first_cell.split(chr(10)))} lines with axis patterns") + return True + + return False + def _process_native_table(self, table, page_num: int, counter: int) -> Optional[DocumentElement]: """Process a natively detected table""" try: @@ -698,6 +781,11 @@ class DirectExtractionEngine: if not data or len(data) < self.min_table_rows: return None + # Check if this "table" is actually a chart (misclassified by find_tables) + if self._is_likely_chart(data, table): + logger.info(f"Skipping table_{page_num}_{counter} - detected as chart (not table)") + return None + # Get table bounding box bbox_data = table.bbox bbox = BoundingBox( diff --git a/backend/app/services/pdf_generator_service.py b/backend/app/services/pdf_generator_service.py index 5e2e462..73596b7 100644 --- a/backend/app/services/pdf_generator_service.py +++ b/backend/app/services/pdf_generator_service.py @@ -2235,6 +2235,57 @@ class PDFGeneratorService: except Exception as e: logger.error(f"Failed to draw text element {element.element_id}: {e}") + def _build_rows_from_cells_dict(self, content: dict) -> list: + """ + Build row structure from cells dict (from Direct extraction JSON). + + The cells structure from Direct extraction: + { + "rows": 6, + "cols": 2, + "cells": [ + {"row": 0, "col": 0, "content": "..."}, + {"row": 0, "col": 1, "content": "..."}, + ... + ] + } + + Returns format compatible with HTMLTableParser output: + [ + {"cells": [{"text": "..."}, {"text": "..."}]}, # row 0 + {"cells": [{"text": "..."}, {"text": "..."}]}, # row 1 + ... 
+ ] + """ + try: + num_rows = content.get('rows', 0) + num_cols = content.get('cols', 0) + cells = content.get('cells', []) + + if not cells or num_rows == 0 or num_cols == 0: + return [] + + # Initialize rows structure + rows_data = [] + for _ in range(num_rows): + rows_data.append({'cells': [{'text': ''} for _ in range(num_cols)]}) + + # Fill in cell content + for cell in cells: + row_idx = cell.get('row', 0) + col_idx = cell.get('col', 0) + cell_content = cell.get('content', '') + + if 0 <= row_idx < num_rows and 0 <= col_idx < num_cols: + rows_data[row_idx]['cells'][col_idx]['text'] = str(cell_content) if cell_content else '' + + logger.debug(f"Built {num_rows} rows from cells dict") + return rows_data + + except Exception as e: + logger.error(f"Error building rows from cells dict: {e}") + return [] + def _draw_table_element_direct( self, pdf_canvas: canvas.Canvas, @@ -2250,28 +2301,34 @@ class PDFGeneratorService: page_height: Page height for coordinate transformation """ try: - # Get table HTML content + # Get table data - can be TableData object or dict from JSON + rows_data = None + if isinstance(element.content, TableData): + # Direct TableData object - convert to HTML then parse html_content = element.content.to_html() + parser = HTMLTableParser() + parser.feed(html_content) + if parser.tables and parser.tables[0]['rows']: + rows_data = parser.tables[0]['rows'] elif isinstance(element.content, dict): - html_content = element.content.get('html', str(element.content)) - else: - html_content = str(element.content) + # Dict from JSON - check if it has cells structure (from Direct extraction) + if 'cells' in element.content: + # Build rows from cells structure directly (avoid HTML round-trip) + rows_data = self._build_rows_from_cells_dict(element.content) + elif 'html' in element.content: + # Has HTML content - parse it + html_content = element.content['html'] + parser = HTMLTableParser() + parser.feed(html_content) + if parser.tables and 
parser.tables[0]['rows']: + rows_data = parser.tables[0]['rows'] - if not html_content: - logger.warning(f"No HTML content for table {element.element_id}") + if not rows_data: + logger.warning(f"No table data for {element.element_id}") return - # Parse HTML - parser = HTMLTableParser() - parser.feed(html_content) - - if not parser.tables or not parser.tables[0]['rows']: - logger.warning(f"No table data parsed for {element.element_id}") - return - - table_data = parser.tables[0] - rows = table_data['rows'] + rows = rows_data # Get bbox bbox = element.bbox @@ -2304,23 +2361,14 @@ class PDFGeneratorService: col_widths = element.metadata['column_widths'] logger.debug(f"Using extracted column widths: {col_widths}") - # Use original row heights from extraction if available - # Row heights must match the number of data rows exactly - row_heights_list = None - if element.metadata and 'row_heights' in element.metadata: - extracted_row_heights = element.metadata['row_heights'] - num_data_rows = len(table_content) - num_height_rows = len(extracted_row_heights) + # NOTE: Don't use rowHeights from extraction - it causes content overlap + # The extracted row heights are based on cell boundaries, not text content height. + # When text wraps or uses different font sizes, the heights don't match. + # Let ReportLab auto-calculate row heights based on content, then use scaling + # to fit within the bbox (same approach as old commit ba8ddf2b). 
- if num_height_rows == num_data_rows: - row_heights_list = extracted_row_heights - logger.debug(f"Using extracted row heights ({num_height_rows} rows): {row_heights_list}") - else: - # Row counts don't match - this can happen with merged cells or empty rows - logger.warning(f"Row height mismatch: {num_height_rows} heights for {num_data_rows} data rows, falling back to auto-sizing") - - # Create table with both column widths and row heights for accurate sizing - t = Table(table_content, colWidths=col_widths, rowHeights=row_heights_list) + # Create table without rowHeights - let ReportLab auto-calculate + t = Table(table_content, colWidths=col_widths) # Apply style with minimal padding to reduce table extension # Use Chinese font to support special characters (℃, μm, ≦, ×, Ω, etc.)