feat: add multi-column layout support for PDF extraction and generation

- Enable PyMuPDF sort=True for correct reading order in multi-column PDFs - Add column detection utilities (_sort_elements_for_reading_order, _detect_columns) - Preserve extraction order in PDF generation instead of re-sorting by Y position - Fix StyleInfo field names (font_name, font_size, text_color instead of font, size, color) - Fix Page.dimensions access (was incorrectly accessing Page.width directly) - Implement row-by-row reading order (top-to-bottom, left-to-right within each row) This fixes the issue where multi-column PDFs (e.g., technical data sheets) had incorrect element ordering, with title appearing at position 12 instead of first. PyMuPDF's built-in sort=True parameter provides optimal reading order for most multi-column layouts without requiring custom column detection. Resolves: Multi-column layout reading order issue reported by user Affects: Direct track PDF extraction and generation (Task 8) 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-24 14:25:53 +08:00
parent 75c194fe2a
commit 6d4df26223
3 changed files with 256 additions and 60 deletions
--- a/backend/app/services/pdf_generator_service.py
+++ b/backend/app/services/pdf_generator_service.py
@@ -256,15 +256,21 @@ class PDFGeneratorService:
            # Extract style attributes
            if hasattr(style_info, '__dict__'):
                # StyleInfo object
-                font_family = getattr(style_info, 'font', None)
-                font_size = getattr(style_info, 'size', default_size)
-                color = getattr(style_info, 'color', None)
+                font_family = getattr(style_info, 'font_name', None)
+                font_size = getattr(style_info, 'font_size', default_size)
+                color = getattr(style_info, 'text_color', None)
+                font_weight = getattr(style_info, 'font_weight', 'normal')
+                font_style = getattr(style_info, 'font_style', 'normal')
+                # Legacy flags support
                flags = getattr(style_info, 'flags', 0)
            elif isinstance(style_info, dict):
                # Dictionary
-                font_family = style_info.get('font')
-                font_size = style_info.get('size', default_size)
-                color = style_info.get('color')
+                font_family = style_info.get('font_name')
+                font_size = style_info.get('font_size', default_size)
+                color = style_info.get('text_color')
+                font_weight = style_info.get('font_weight', 'normal')
+                font_style = style_info.get('font_style', 'normal')
+                # Legacy flags support
                flags = style_info.get('flags', 0)
            else:
                # Unknown format, use defaults
@@ -275,10 +281,12 @@ class PDFGeneratorService:
            # Map font name
            base_font = self._map_font(font_family) if font_family else 'Helvetica'

+            # Determine bold and italic from font_weight/font_style (preferred) or flags (legacy)
+            is_bold = font_weight == 'bold' if font_weight else bool(flags & self.STYLE_FLAG_BOLD)
+            is_italic = font_style == 'italic' if font_style else bool(flags & self.STYLE_FLAG_ITALIC)
+
            # Apply bold/italic modifiers
-            if flags:
-                is_bold = bool(flags & self.STYLE_FLAG_BOLD)
-                is_italic = bool(flags & self.STYLE_FLAG_ITALIC)
+            if is_bold or is_italic:

                if is_bold and is_italic:
                    # Try bold-italic variant
@@ -315,8 +323,20 @@ class PDFGeneratorService:
                c.setFont('Helvetica', actual_size)

            # Apply color
-            if color:
+            rgb_color = None
+            if hasattr(style_info, 'get_rgb_color'):
+                # Use StyleInfo method if available
+                rgb_color = style_info.get_rgb_color()
+            elif color is not None:
+                # Parse from extracted color value
                r, g, b = self._parse_color(color)
+                rgb_color = (r, g, b)
+
+            if rgb_color:
+                # text_color is in 0-255 range, convert to 0-1 for ReportLab
+                r, g, b = rgb_color
+                if any(v > 1 for v in [r, g, b]):
+                    r, g, b = r/255.0, g/255.0, b/255.0
                c.setFillColorRGB(r, g, b)
            else:
                c.setFillColorRGB(0, 0, 0)  # Default black
@@ -603,8 +623,8 @@ class PDFGeneratorService:
                return False

            first_page = unified_doc.pages[0]
-            page_width = first_page.width
-            page_height = first_page.height
+            page_width = first_page.dimensions.width
+            page_height = first_page.dimensions.height

            logger.info(f"Page dimensions: {page_width} x {page_height}")

@@ -650,22 +670,36 @@ class PDFGeneratorService:
                           f"{len(table_elements)} tables, {len(image_elements)} images, "
                           f"{len(list_elements)} list items")

-                # Draw in layers: images → tables → lists → text
+                # Use original element order from extraction engine
+                # The extraction engine has already sorted elements by reading order,
+                # handling multi-column layouts correctly (top-to-bottom, left-to-right)
+                all_elements = []

-                # 1. Draw images
-                for img_elem in image_elements:
-                    self._draw_image_element_direct(pdf_canvas, img_elem, page_height, output_path.parent)
+                # Preserve original order by iterating through page.elements
+                for elem in page.elements:
+                    if elem in image_elements:
+                        all_elements.append(('image', elem))
+                    elif elem in table_elements:
+                        all_elements.append(('table', elem))
+                    elif elem in list_elements:
+                        all_elements.append(('list', elem))
+                    elif elem in text_elements:
+                        all_elements.append(('text', elem))

-                # 2. Draw tables
-                for table_elem in table_elements:
-                    self._draw_table_element_direct(pdf_canvas, table_elem, page_height)
+                logger.debug(f"Drawing {len(all_elements)} elements in extraction order (preserves multi-column reading order)")

-                # 3. Draw lists with sequential numbering
-                self._draw_list_elements_direct(pdf_canvas, list_elements, page_height)
-
-                # 4. Draw text with line breaks and styling
-                for text_elem in text_elements:
-                    self._draw_text_element_direct(pdf_canvas, text_elem, page_height)
+                # Draw elements in document order
+                for elem_type, elem in all_elements:
+                    if elem_type == 'image':
+                        self._draw_image_element_direct(pdf_canvas, elem, page_height, output_path.parent)
+                    elif elem_type == 'table':
+                        self._draw_table_element_direct(pdf_canvas, elem, page_height)
+                    elif elem_type == 'list':
+                        # Lists need special handling for sequential numbering
+                        # For now, draw individually (may lose list context)
+                        self._draw_text_element_direct(pdf_canvas, elem, page_height)
+                    elif elem_type == 'text':
+                        self._draw_text_element_direct(pdf_canvas, elem, page_height)

            # Save PDF
            pdf_canvas.save()
@@ -1688,7 +1722,8 @@ class PDFGeneratorService:
        spans: List['DocumentElement'],
        line_x: float,
        line_y: float,
-        default_font_size: float
+        default_font_size: float,
+        max_width: float = None
    ) -> float:
        """
        Draw text with inline span styling (mixed styles within a line).
@@ -1699,39 +1734,64 @@ class PDFGeneratorService:
            line_x: Starting X position
            line_y: Y position
            default_font_size: Default font size if span has none
+            max_width: Maximum width available (for scaling if needed)

        Returns:
            Total width of drawn text
        """
-        x_pos = line_x
+        if not spans:
+            return 0
+
+        # First pass: calculate total width with original sizes
        total_width = 0
+        span_data = []  # Store (span, text, font, size) for rendering

        for span in spans:
            span_text = span.get_text()
            if not span_text:
                continue

-            # Apply span-specific styling
+            # Apply span-specific styling to get font and size
            if span.style:
                self._apply_text_style(pdf_canvas, span.style, default_size=default_font_size)
            else:
-                # Fallback to default font
                font_name = self.font_name if self.font_registered else 'Helvetica'
                pdf_canvas.setFont(font_name, default_font_size)

-            # Get current font for width calculation
            current_font = pdf_canvas._fontname
            current_size = pdf_canvas._fontsize

+            # Calculate span width
+            span_width = pdf_canvas.stringWidth(span_text, current_font, current_size)
+            total_width += span_width
+
+            span_data.append((span, span_text, current_font, current_size, span_width))
+
+        # Calculate scale factor if needed
+        scale_factor = 1.0
+        if max_width and total_width > max_width:
+            scale_factor = (max_width / total_width) * 0.95  # 95% to leave margin
+            logger.debug(f"Scaling spans: total_width={total_width:.1f}pt > max_width={max_width:.1f}pt, scale={scale_factor:.2f}")
+
+        # Second pass: draw spans with scaling
+        x_pos = line_x
+
+        for span, span_text, font_name, original_size, span_width in span_data:
+            # Apply scaled font size
+            scaled_size = original_size * scale_factor
+            scaled_size = max(scaled_size, 3)  # Minimum 3pt
+
+            # Set font with scaled size
+            pdf_canvas.setFont(font_name, scaled_size)
+
            # Draw this span
            pdf_canvas.drawString(x_pos, line_y, span_text)

-            # Calculate width and advance position
-            span_width = pdf_canvas.stringWidth(span_text, current_font, current_size)
-            x_pos += span_width
-            total_width += span_width
+            # Calculate actual width with scaled size and advance position
+            actual_width = pdf_canvas.stringWidth(span_text, font_name, scaled_size)
+            x_pos += actual_width

-        return total_width
+        return total_width * scale_factor

    def _draw_text_element_direct(
        self,
@@ -1908,9 +1968,10 @@ class PDFGeneratorService:
                    # Multi-line span support would require more complex line breaking logic
                    if i == 0:  # Only render spans on first line for now
                        total_width = self._draw_text_with_spans(
-                            pdf_canvas, element.children, line_x, line_y, font_size
+                            pdf_canvas, element.children, line_x, line_y, font_size,
+                            max_width=available_width
                        )
-                        logger.debug(f"Drew {len(element.children)} spans, total width={total_width:.1f}pt")
+                        logger.debug(f"Drew {len(element.children)} spans, total width={total_width:.1f}pt, max_width={available_width:.1f}pt")
                    # Skip rendering on subsequent lines (text already drawn via spans)
                else:
                    # Normal single-style rendering