fix: resolve Direct track PDF regression issues

- Add _is_likely_chart() to detect charts misclassified as tables
  - High empty cell ratio (>70%) indicates chart grid
  - Axis label patterns (numbers, °C, %, Time, Temperature)
  - Multi-line cells with axis text
- Add _build_rows_from_cells_dict() to handle JSON table content
  - Properly parse cells structure from Direct extraction
  - Avoid HTML round-trip conversion issues
- Remove rowHeights parameter from Table() to fix content overlap
  - Let ReportLab auto-calculate row heights based on content
  - Use scaling to fit within bbox

Fixes edit.pdf table overlap and chart misclassification issues.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-26 12:29:46 +08:00
parent 1afdb822c3
commit 79cffe6da0
2 changed files with 168 additions and 32 deletions
--- a/backend/app/services/direct_extraction_engine.py
+++ b/backend/app/services/direct_extraction_engine.py
@@ -690,6 +690,89 @@ class DirectExtractionEngine:
        # Default to paragraph for longer text, text for shorter
        return ElementType.PARAGRAPH if len(text) > 150 else ElementType.TEXT

+    def _is_likely_chart(self, data: list, table) -> bool:
+        """
+        Detect if a "table" detected by find_tables() is actually a chart/graph.
+
+        Charts often get misclassified as tables because they have grid lines.
+        Characteristics of a chart misclassified as table:
+        1. High percentage of empty cells (>70% on its own, or >50% combined with 3+ axis-label cells)
+        2. Content patterns that look like axis labels (numbers, units like °C, %, etc.)
+        3. First cell contains multi-line text (5 or more lines) while at least 2 cells match axis patterns
+        4. Cell content contains typical chart axis patterns
+
+        Args:
+            data: Extracted table data (list of lists)
+            table: PyMuPDF table object
+
+        Returns:
+            True if the table is likely a chart
+        """
+        if not data:
+            return False
+
+        # Count total cells and empty cells
+        total_cells = 0
+        empty_cells = 0
+        multi_line_cells = 0
+        axis_pattern_cells = 0
+
+        # Patterns that suggest chart axis labels
+        import re
+        axis_patterns = [
+            r'^-?\d+$',           # Simple numbers (axis ticks)
+            r'^-?\d+\.?\d*$',     # Decimal numbers
+            r'°[CF]',             # Temperature units
+            r'%$',                # Percentage
+            r'\bppm\b',           # Parts per million
+            r'\bmin\b',           # Minutes
+            r'\bsec\b',           # Seconds
+            r'\bTime\b',          # Time axis label
+            r'\bTemperature\b',   # Temperature axis label
+            r'[Aa]xis',           # Axis label
+        ]
+
+        for row in data:
+            for cell in row:
+                total_cells += 1
+                cell_text = str(cell).strip() if cell else ""
+
+                if not cell_text:
+                    empty_cells += 1
+                else:
+                    # Check for multi-line content
+                    if '\n' in cell_text:
+                        multi_line_cells += 1
+
+                    # Check for axis patterns
+                    for pattern in axis_patterns:
+                        if re.search(pattern, cell_text, re.IGNORECASE):
+                            axis_pattern_cells += 1
+                            break
+
+        # Calculate metrics
+        empty_ratio = empty_cells / total_cells if total_cells > 0 else 0
+
+        # Decision criteria for chart detection:
+        # 1. Very high empty cell ratio (>70%) suggests it's a chart grid
+        if empty_ratio > 0.7:
+            logger.debug(f"Chart detection: high empty ratio {empty_ratio:.2f} (>70%)")
+            return True
+
+        # 2. High empty ratio + axis patterns suggests chart
+        if empty_ratio > 0.5 and axis_pattern_cells >= 3:
+            logger.debug(f"Chart detection: empty ratio {empty_ratio:.2f} + {axis_pattern_cells} axis patterns")
+            return True
+
+        # 3. First cell spans 5+ lines and at least 2 cells match axis patterns (often chart legend text)
+        if multi_line_cells >= 1 and axis_pattern_cells >= 2:
+            first_cell = str(data[0][0]).strip() if data and data[0] else ""
+            if '\n' in first_cell and len(first_cell.split('\n')) >= 5:
+                logger.debug(f"Chart detection: first cell has {len(first_cell.split(chr(10)))} lines with axis patterns")
+                return True
+
+        return False
+
    def _process_native_table(self, table, page_num: int, counter: int) -> Optional[DocumentElement]:
        """Process a natively detected table"""
        try:
@@ -698,6 +781,11 @@ class DirectExtractionEngine:
            if not data or len(data) < self.min_table_rows:
                return None

+            # Check if this "table" is actually a chart (misclassified by find_tables)
+            if self._is_likely_chart(data, table):
+                logger.info(f"Skipping table_{page_num}_{counter} - detected as chart (not table)")
+                return None
+
            # Get table bounding box
            bbox_data = table.bbox
            bbox = BoundingBox(