From 79cffe6da0852d5d675abb07c580e2b5fc899348 Mon Sep 17 00:00:00 2001
From: egg <lin4637lin4637@gmail.com>
Date: Wed, 26 Nov 2025 12:29:46 +0800
Subject: [PATCH] fix: resolve Direct track PDF regression issues
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Add _is_likely_chart() to detect charts misclassified as tables
  - High empty cell ratio (>70%) indicates chart grid
  - Axis label patterns (numbers, °C, %, Time, Temperature)
  - Multi-line cells with axis text

- Add _build_rows_from_cells_dict() to handle JSON table content
  - Properly parse cells structure from Direct extraction
  - Avoid HTML round-trip conversion issues

- Remove rowHeights parameter from Table() to fix content overlap
  - Let ReportLab auto-calculate row heights based on content
  - Use scaling to fit within bbox

Fixes edit.pdf table overlap and chart misclassification issues.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
---
 .../app/services/direct_extraction_engine.py  |  88 ++++++++++++++
 backend/app/services/pdf_generator_service.py | 112 +++++++++++++-----
 2 files changed, 168 insertions(+), 32 deletions(-)

diff --git a/backend/app/services/direct_extraction_engine.py b/backend/app/services/direct_extraction_engine.py
index d0b23b9..0aad35c 100644
--- a/backend/app/services/direct_extraction_engine.py
+++ b/backend/app/services/direct_extraction_engine.py
@@ -690,6 +690,89 @@ class DirectExtractionEngine:
         # Default to paragraph for longer text, text for shorter
         return ElementType.PARAGRAPH if len(text) > 150 else ElementType.TEXT
 
+    def _is_likely_chart(self, data: list, table) -> bool:
+        """
+        Detect if a "table" detected by find_tables() is actually a chart/graph.
+
+        Charts often get misclassified as tables because they have grid lines.
+        Characteristics of a chart misclassified as table:
+        1. High percentage of empty cells (>70%, or >50% together with 3+ axis-like cells)
+        2. Content patterns that look like axis labels (numbers, units like °C, %, etc.)
+        3. Single cell contains multi-line text with chart-like patterns
+        4. Cell content contains typical chart axis patterns
+
+        Args:
+            data: Extracted table data (list of lists)
+            table: PyMuPDF table object
+
+        Returns:
+            True if the table is likely a chart
+        """
+        if not data:
+            return False
+
+        # Count total cells and empty cells
+        total_cells = 0
+        empty_cells = 0
+        multi_line_cells = 0
+        axis_pattern_cells = 0
+
+        # Patterns that suggest chart axis labels
+        import re
+        axis_patterns = [
+            r'^-?\d+$',           # Simple numbers (axis ticks)
+            r'^-?\d+\.?\d*$',     # Decimal numbers
+            r'°[CF]',             # Temperature units
+            r'%$',                # Percentage
+            r'\bppm\b',           # Parts per million
+            r'\bmin\b',           # Minutes
+            r'\bsec\b',           # Seconds
+            r'\bTime\b',          # Time axis label
+            r'\bTemperature\b',   # Temperature axis label
+            r'[Aa]xis',           # Axis label
+        ]
+
+        for row in data:
+            for cell in row:
+                total_cells += 1
+                cell_text = str(cell).strip() if cell else ""
+
+                if not cell_text:
+                    empty_cells += 1
+                else:
+                    # Check for multi-line content
+                    if '\n' in cell_text:
+                        multi_line_cells += 1
+
+                    # Check for axis patterns
+                    for pattern in axis_patterns:
+                        if re.search(pattern, cell_text, re.IGNORECASE):
+                            axis_pattern_cells += 1
+                            break
+
+        # Calculate metrics
+        empty_ratio = empty_cells / total_cells if total_cells > 0 else 0
+
+        # Decision criteria for chart detection:
+        # 1. Very high empty cell ratio (>70%) suggests it's a chart grid
+        if empty_ratio > 0.7:
+            logger.debug(f"Chart detection: high empty ratio {empty_ratio:.2f} (>70%)")
+            return True
+
+        # 2. High empty ratio + axis patterns suggests chart
+        if empty_ratio > 0.5 and axis_pattern_cells >= 3:
+            logger.debug(f"Chart detection: empty ratio {empty_ratio:.2f} + {axis_pattern_cells} axis patterns")
+            return True
+
+        # 3. First cell spans 5+ lines and 2+ cells in the table match axis patterns (often chart legend text)
+        if multi_line_cells >= 1 and axis_pattern_cells >= 2:
+            first_cell = str(data[0][0]).strip() if data and data[0] else ""
+            if '\n' in first_cell and len(first_cell.split('\n')) >= 5:
+                logger.debug(f"Chart detection: first cell has {len(first_cell.split(chr(10)))} lines with axis patterns")
+                return True
+
+        return False
+
     def _process_native_table(self, table, page_num: int, counter: int) -> Optional[DocumentElement]:
         """Process a natively detected table"""
         try:
@@ -698,6 +781,11 @@ class DirectExtractionEngine:
             if not data or len(data) < self.min_table_rows:
                 return None
 
+            # Check if this "table" is actually a chart (misclassified by find_tables)
+            if self._is_likely_chart(data, table):
+                logger.info(f"Skipping table_{page_num}_{counter} - detected as chart (not table)")
+                return None
+
             # Get table bounding box
             bbox_data = table.bbox
             bbox = BoundingBox(
diff --git a/backend/app/services/pdf_generator_service.py b/backend/app/services/pdf_generator_service.py
index 5e2e462..73596b7 100644
--- a/backend/app/services/pdf_generator_service.py
+++ b/backend/app/services/pdf_generator_service.py
@@ -2235,6 +2235,57 @@ class PDFGeneratorService:
         except Exception as e:
             logger.error(f"Failed to draw text element {element.element_id}: {e}")
 
+    def _build_rows_from_cells_dict(self, content: dict) -> list:
+        """
+        Build row structure from cells dict (from Direct extraction JSON).
+
+        The cells structure from Direct extraction:
+        {
+            "rows": 6,
+            "cols": 2,
+            "cells": [
+                {"row": 0, "col": 0, "content": "..."},
+                {"row": 0, "col": 1, "content": "..."},
+                ...
+            ]
+        }
+
+        Returns format compatible with HTMLTableParser output:
+        [
+            {"cells": [{"text": "..."}, {"text": "..."}]},  # row 0
+            {"cells": [{"text": "..."}, {"text": "..."}]},  # row 1
+            ...
+        ]
+        """
+        try:
+            num_rows = content.get('rows', 0)
+            num_cols = content.get('cols', 0)
+            cells = content.get('cells', [])
+
+            if not cells or num_rows == 0 or num_cols == 0:
+                return []
+
+            # Initialize rows structure
+            rows_data = []
+            for _ in range(num_rows):
+                rows_data.append({'cells': [{'text': ''} for _ in range(num_cols)]})
+
+            # Fill in cell content
+            for cell in cells:
+                row_idx = cell.get('row', 0)
+                col_idx = cell.get('col', 0)
+                cell_content = cell.get('content', '')
+
+                if 0 <= row_idx < num_rows and 0 <= col_idx < num_cols:
+                    rows_data[row_idx]['cells'][col_idx]['text'] = str(cell_content) if cell_content else ''
+
+            logger.debug(f"Built {num_rows} rows from cells dict")
+            return rows_data
+
+        except Exception as e:
+            logger.error(f"Error building rows from cells dict: {e}")
+            return []
+
     def _draw_table_element_direct(
         self,
         pdf_canvas: canvas.Canvas,
@@ -2250,28 +2301,34 @@ class PDFGeneratorService:
             page_height: Page height for coordinate transformation
         """
         try:
-            # Get table HTML content
+            # Get table data - can be TableData object or dict from JSON
+            rows_data = None
+
             if isinstance(element.content, TableData):
+                # Direct TableData object - convert to HTML then parse
                 html_content = element.content.to_html()
+                parser = HTMLTableParser()
+                parser.feed(html_content)
+                if parser.tables and parser.tables[0]['rows']:
+                    rows_data = parser.tables[0]['rows']
             elif isinstance(element.content, dict):
-                html_content = element.content.get('html', str(element.content))
-            else:
-                html_content = str(element.content)
+                # Dict from JSON - check if it has cells structure (from Direct extraction)
+                if 'cells' in element.content:
+                    # Build rows from cells structure directly (avoid HTML round-trip)
+                    rows_data = self._build_rows_from_cells_dict(element.content)
+                elif 'html' in element.content:
+                    # Has HTML content - parse it
+                    html_content = element.content['html']
+                    parser = HTMLTableParser()
+                    parser.feed(html_content)
+                    if parser.tables and parser.tables[0]['rows']:
+                        rows_data = parser.tables[0]['rows']
 
-            if not html_content:
-                logger.warning(f"No HTML content for table {element.element_id}")
+            if not rows_data:
+                logger.warning(f"No table data for {element.element_id}")
                 return
 
-            # Parse HTML
-            parser = HTMLTableParser()
-            parser.feed(html_content)
-
-            if not parser.tables or not parser.tables[0]['rows']:
-                logger.warning(f"No table data parsed for {element.element_id}")
-                return
-
-            table_data = parser.tables[0]
-            rows = table_data['rows']
+            rows = rows_data
 
             # Get bbox
             bbox = element.bbox
@@ -2304,23 +2361,14 @@ class PDFGeneratorService:
                 col_widths = element.metadata['column_widths']
                 logger.debug(f"Using extracted column widths: {col_widths}")
 
-            # Use original row heights from extraction if available
-            # Row heights must match the number of data rows exactly
-            row_heights_list = None
-            if element.metadata and 'row_heights' in element.metadata:
-                extracted_row_heights = element.metadata['row_heights']
-                num_data_rows = len(table_content)
-                num_height_rows = len(extracted_row_heights)
+            # NOTE: Don't use rowHeights from extraction - it causes content overlap
+            # The extracted row heights are based on cell boundaries, not text content height.
+            # When text wraps or uses different font sizes, the heights don't match.
+            # Let ReportLab auto-calculate row heights based on content, then use scaling
+            # to fit within the bbox (same approach as old commit ba8ddf2b).
 
-                if num_height_rows == num_data_rows:
-                    row_heights_list = extracted_row_heights
-                    logger.debug(f"Using extracted row heights ({num_height_rows} rows): {row_heights_list}")
-                else:
-                    # Row counts don't match - this can happen with merged cells or empty rows
-                    logger.warning(f"Row height mismatch: {num_height_rows} heights for {num_data_rows} data rows, falling back to auto-sizing")
-
-            # Create table with both column widths and row heights for accurate sizing
-            t = Table(table_content, colWidths=col_widths, rowHeights=row_heights_list)
+            # Create table without rowHeights - let ReportLab auto-calculate
+            t = Table(table_content, colWidths=col_widths)
 
             # Apply style with minimal padding to reduce table extension
             # Use Chinese font to support special characters (℃, μm, ≦, ×, Ω, etc.)