feat: add multi-column layout support for PDF extraction and generation

- Enable PyMuPDF sort=True for correct reading order in multi-column PDFs
- Add column detection utilities (_sort_elements_for_reading_order, _detect_columns)
- Preserve extraction order in PDF generation instead of re-sorting by Y position
- Fix StyleInfo field names (font_name, font_size, text_color instead of font, size, color)
- Fix Page.dimensions access (was incorrectly accessing Page.width directly)
- Implement row-by-row reading order (top-to-bottom, left-to-right within each row)

This fixes the issue where multi-column PDFs (e.g., technical data sheets) had incorrect element ordering, with the title appearing at position 12 instead of first. PyMuPDF's built-in sort=True parameter provides optimal reading order for most multi-column layouts without requiring custom column detection.

Resolves: Multi-column layout reading order issue reported by user
Affects: Direct track PDF extraction and generation (Task 8)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-24 14:25:53 +08:00
parent 75c194fe2a
commit 6d4df26223
3 changed files with 256 additions and 60 deletions
--- a/backend/app/services/direct_extraction_engine.py
+++ b/backend/app/services/direct_extraction_engine.py
@@ -189,8 +189,8 @@ class DirectExtractionEngine:
            dpi=72  # PDF standard DPI
        )

-        # Extract text blocks with formatting
-        text_dict = page.get_text("dict")
+        # Extract text blocks with formatting (sort=True for reading order)
+        text_dict = page.get_text("dict", sort=True)
        for block_idx, block in enumerate(text_dict.get("blocks", [])):
            if block.get("type") == 0:  # Text block
                element = self._process_text_block(
@@ -254,6 +254,11 @@ class DirectExtractionEngine:
        if drawings:
            logger.debug(f"Page {page_num} contains {len(drawings)} vector drawing commands")

+        # get_text(..., sort=True) above already orders blocks by vertical, then horizontal
+        # position, so no re-sort is needed here. The fallback below orders column-by-column instead.
+        # NOTE: If sort=True is not used in get_text(), uncomment the line below:
+        # elements = self._sort_elements_for_reading_order(elements, dimensions)
+
        # Post-process elements for header/footer detection and structure
        elements = self._detect_headers_footers(elements, dimensions)
        elements = self._build_section_hierarchy(elements)
@@ -270,6 +275,113 @@ class DirectExtractionEngine:
            }
        )

+    def _sort_elements_for_reading_order(self, elements: List[DocumentElement], dimensions: Dimensions) -> List[DocumentElement]:
+        """
+        Sort elements by reading order, handling multi-column layouts.
+
+        For multi-column layouts (e.g., two-column documents), elements are
+        ordered newspaper-style: each detected column is read top-to-bottom,
+        and the columns are taken left-to-right.
+
+        Args:
+            elements: List of document elements
+            dimensions: Page dimensions
+
+        Returns:
+            Sorted list of elements in reading order
+        """
+        if not elements:
+            return elements
+
+        # Detect if page has multi-column layout
+        text_elements = [e for e in elements if e.bbox and e.is_text]
+        if len(text_elements) < 3:
+            # Too few elements to determine layout, just sort by Y position
+            return sorted(elements, key=lambda e: (e.bbox.y0 if e.bbox else 0, e.bbox.x0 if e.bbox else 0))
+
+        # Cluster x-positions to detect columns
+        x_positions = [e.bbox.x0 for e in text_elements]
+        columns = self._detect_columns(x_positions, dimensions.width)
+
+        if len(columns) <= 1:
+            # Single column layout - simple top-to-bottom sort
+            logger.debug(f"Detected single-column layout")
+            return sorted(elements, key=lambda e: (e.bbox.y0 if e.bbox else 0, e.bbox.x0 if e.bbox else 0))
+
+        logger.debug(f"Detected {len(columns)}-column layout at x positions: {[f'{x:.1f}' for x in columns]}")
+
+        # Multi-column layout - use newspaper-style reading order
+        # (complete left column, then right column, etc.)
+        # This is more appropriate for technical documents and data sheets
+        element_data = []
+        for elem in elements:
+            if not elem.bbox:
+                element_data.append((elem, 0, 0))
+                continue
+
+            # Find which column this element belongs to
+            col_idx = 0
+            min_dist = float('inf')
+            for i, col_x in enumerate(columns):
+                dist = abs(elem.bbox.x0 - col_x)
+                if dist < min_dist:
+                    min_dist = dist
+                    col_idx = i
+
+            element_data.append((elem, col_idx, elem.bbox.y0))
+
+        # Sort by: column first, then Y position within column
+        # This gives newspaper-style reading: complete column 1, then column 2, etc.
+        element_data.sort(key=lambda x: (x[1], x[2]))
+
+        logger.debug(f"Using newspaper-style column reading order (column by column, top to bottom)")
+        return [e[0] for e in element_data]
+
+    def _detect_columns(self, x_positions: List[float], page_width: float) -> List[float]:
+        """
+        Detect column positions from x-coordinates of text elements.
+
+        Args:
+            x_positions: List of x-coordinates (left edges of text)
+            page_width: Page width in points
+
+        Returns:
+            List of column x-positions (sorted left to right)
+        """
+        if not x_positions:
+            return []
+
+        # Cluster x-positions to find column starts
+        # Single greedy pass over the sorted values: extend the last cluster while x stays near its running mean
+        threshold = page_width * 0.15  # 15% of page width as clustering threshold
+
+        sorted_x = sorted(set(x_positions))
+        if not sorted_x:
+            return []
+
+        clusters = [[sorted_x[0]]]
+
+        for x in sorted_x[1:]:
+            # Check if x belongs to current cluster
+            cluster_center = sum(clusters[-1]) / len(clusters[-1])
+            if abs(x - cluster_center) < threshold:
+                clusters[-1].append(x)
+            else:
+                # Start new cluster
+                clusters.append([x])
+
+        # Return average x position of each cluster (column start)
+        column_positions = [sum(cluster) / len(cluster) for cluster in clusters]
+
+        # Filter out columns that are too close to each other
+        min_column_width = page_width * 0.2  # Columns must be at least 20% of page width apart
+        filtered_columns = [column_positions[0]]
+        for col_x in column_positions[1:]:
+            if col_x - filtered_columns[-1] >= min_column_width:
+                filtered_columns.append(col_x)
+
+        return filtered_columns
+
    def _detect_headers_footers(self, elements: List[DocumentElement], dimensions: Dimensions) -> List[DocumentElement]:
        """Detect and mark header/footer elements based on page position"""
        page_height = dimensions.height