test

2025-12-04 18:00:37 +08:00
parent 9437387ef1
commit 8265be1741
22 changed files with 2672 additions and 196 deletions
--- a/backend/app/services/pdf_generator_service.py
+++ b/backend/app/services/pdf_generator_service.py
@@ -3371,18 +3371,21 @@ class PDFGeneratorService:
            "rows": 6,
            "cols": 2,
            "cells": [
-                {"row": 0, "col": 0, "content": "..."},
+                {"row": 0, "col": 0, "content": "...", "row_span": 1, "col_span": 2},
                {"row": 0, "col": 1, "content": "..."},
                ...
            ]
        }

-        Returns format compatible with HTMLTableParser output:
+        Returns format compatible with HTMLTableParser output (with colspan/rowspan/col):
        [
-            {"cells": [{"text": "..."}, {"text": "..."}]},  # row 0
-            {"cells": [{"text": "..."}, {"text": "..."}]},  # row 1
+            {"cells": [{"text": "...", "colspan": 1, "rowspan": 1, "col": 0}, ...]},
+            {"cells": [{"text": "...", "colspan": 1, "rowspan": 1, "col": 0}, ...]},
            ...
        ]
+
+        Note: This returns actual cells per row with their absolute column positions.
+        The table renderer uses 'col' to place cells correctly in the grid.
        """
        try:
            num_rows = content.get('rows', 0)
@@ -3392,21 +3395,39 @@ class PDFGeneratorService:
            if not cells or num_rows == 0 or num_cols == 0:
                return []

-            # Initialize rows structure
-            rows_data = []
-            for _ in range(num_rows):
-                rows_data.append({'cells': [{'text': ''} for _ in range(num_cols)]})
-
-            # Fill in cell content
+            # Group cells by row
+            cells_by_row = {}
            for cell in cells:
                row_idx = cell.get('row', 0)
-                col_idx = cell.get('col', 0)
-                cell_content = cell.get('content', '')
+                if row_idx not in cells_by_row:
+                    cells_by_row[row_idx] = []
+                cells_by_row[row_idx].append(cell)

-                if 0 <= row_idx < num_rows and 0 <= col_idx < num_cols:
-                    rows_data[row_idx]['cells'][col_idx]['text'] = str(cell_content) if cell_content else ''
+            # Sort cells within each row by column
+            for row_idx in cells_by_row:
+                cells_by_row[row_idx].sort(key=lambda c: c.get('col', 0))

-            logger.debug(f"Built {num_rows} rows from cells dict")
+            # Build rows structure with colspan/rowspan info and absolute col position
+            rows_data = []
+            for row_idx in range(num_rows):
+                row_cells = []
+                if row_idx in cells_by_row:
+                    for cell in cells_by_row[row_idx]:
+                        cell_content = cell.get('content', '')
+                        row_span = cell.get('row_span', 1) or 1
+                        col_span = cell.get('col_span', 1) or 1
+                        col_idx = cell.get('col', 0)
+
+                        row_cells.append({
+                            'text': str(cell_content) if cell_content else '',
+                            'rowspan': row_span,
+                            'colspan': col_span,
+                            'col': col_idx  # Absolute column position
+                        })
+
+                rows_data.append({'cells': row_cells})
+
+            logger.debug(f"Built {num_rows} rows from cells dict with span info")
            return rows_data

        except Exception as e:
@@ -3471,19 +3492,115 @@ class PDFGeneratorService:
            table_width = bbox.x1 - bbox.x0
            table_height = bbox.y1 - bbox.y0

-            # Build table data for ReportLab
-            table_content = []
-            for row in rows:
-                row_data = [cell['text'].strip() for cell in row['cells']]
-                table_content.append(row_data)
-
            # Create table
            from reportlab.platypus import Table, TableStyle
            from reportlab.lib import colors

-            # Determine number of rows and columns for cell_boxes calculation
+            # Determine grid size from rows structure
+            # Note: cells may carry a 'col' key giving their absolute column (from Direct extraction)
+            # or may simply be listed in sequential order (from HTML parsing)
            num_rows = len(rows)
-            max_cols = max(len(row['cells']) for row in rows) if rows else 0
+
+            # Check if cells have absolute column positions
+            has_absolute_cols = any(
+                'col' in cell
+                for row in rows
+                for cell in row['cells']
+            )
+
+            # Calculate actual number of columns
+            max_cols = 0
+            if has_absolute_cols:
+                # Use absolute col positions + colspan to find max column
+                for row in rows:
+                    for cell in row['cells']:
+                        col = cell.get('col', 0)
+                        colspan = cell.get('colspan', 1)
+                        max_cols = max(max_cols, col + colspan)
+            else:
+                # Sequential cells: sum up colspans
+                for row in rows:
+                    col_pos = 0
+                    for cell in row['cells']:
+                        colspan = cell.get('colspan', 1)
+                        col_pos += colspan
+                    max_cols = max(max_cols, col_pos)
+
+            # Build table data for ReportLab with proper grid structure
+            # ReportLab needs a full grid with placeholders for spanned cells
+            # and SPAN commands to merge them
+            table_content = []
+            span_commands = []
+            covered = set()  # Track cells covered by spans
+
+            # First pass: mark covered cells and collect SPAN commands
+            for row_idx, row in enumerate(rows):
+                if has_absolute_cols:
+                    # Use absolute column positions
+                    for cell in row['cells']:
+                        col_pos = cell.get('col', 0)
+                        colspan = cell.get('colspan', 1)
+                        rowspan = cell.get('rowspan', 1)
+
+                        # Mark cells covered by this span
+                        if colspan > 1 or rowspan > 1:
+                            for r in range(row_idx, row_idx + rowspan):
+                                for c in range(col_pos, col_pos + colspan):
+                                    if (r, c) != (row_idx, col_pos):
+                                        covered.add((r, c))
+                            # Add SPAN command for ReportLab
+                            span_commands.append((
+                                'SPAN',
+                                (col_pos, row_idx),
+                                (col_pos + colspan - 1, row_idx + rowspan - 1)
+                            ))
+                else:
+                    # Sequential positioning
+                    col_pos = 0
+                    for cell in row['cells']:
+                        while (row_idx, col_pos) in covered:
+                            col_pos += 1
+
+                        colspan = cell.get('colspan', 1)
+                        rowspan = cell.get('rowspan', 1)
+
+                        if colspan > 1 or rowspan > 1:
+                            for r in range(row_idx, row_idx + rowspan):
+                                for c in range(col_pos, col_pos + colspan):
+                                    if (r, c) != (row_idx, col_pos):
+                                        covered.add((r, c))
+                            span_commands.append((
+                                'SPAN',
+                                (col_pos, row_idx),
+                                (col_pos + colspan - 1, row_idx + rowspan - 1)
+                            ))
+                        col_pos += colspan
+
+            # Second pass: build content grid
+            for row_idx in range(num_rows):
+                row_data = [''] * max_cols
+
+                if row_idx < len(rows):
+                    if has_absolute_cols:
+                        # Place cells at their absolute positions
+                        for cell in rows[row_idx]['cells']:
+                            col_pos = cell.get('col', 0)
+                            if col_pos < max_cols:
+                                row_data[col_pos] = cell['text'].strip()
+                    else:
+                        # Sequential placement
+                        col_pos = 0
+                        for cell in rows[row_idx]['cells']:
+                            while col_pos < max_cols and (row_idx, col_pos) in covered:
+                                col_pos += 1
+                            if col_pos < max_cols:
+                                row_data[col_pos] = cell['text'].strip()
+                                colspan = cell.get('colspan', 1)
+                                col_pos += colspan
+
+                table_content.append(row_data)
+
+            logger.debug(f"Built table grid: {num_rows} rows × {max_cols} cols, {len(span_commands)} span commands (absolute_cols={has_absolute_cols})")

            # Use original column widths from extraction if available
            # Otherwise try to compute from cell_boxes (from PP-StructureV3)
@@ -3517,7 +3634,7 @@ class PDFGeneratorService:
            # Apply style with minimal padding to reduce table extension
            # Use Chinese font to support special characters (℃, μm, ≦, ×, Ω, etc.)
            font_for_table = self.font_name if self.font_registered else 'Helvetica'
-            style = TableStyle([
+            style_commands = [
                ('GRID', (0, 0), (-1, -1), 0.5, colors.grey),
                ('FONTNAME', (0, 0), (-1, -1), font_for_table),
                ('FONTSIZE', (0, 0), (-1, -1), 8),
@@ -3529,7 +3646,13 @@ class PDFGeneratorService:
                ('BOTTOMPADDING', (0, 0), (-1, -1), 0),
                ('LEFTPADDING', (0, 0), (-1, -1), 1),
                ('RIGHTPADDING', (0, 0), (-1, -1), 1),
-            ])
+            ]
+            # Add span commands for merged cells
+            style_commands.extend(span_commands)
+            if span_commands:
+                logger.info(f"Applied {len(span_commands)} SPAN commands for merged cells")
+
+            style = TableStyle(style_commands)
            t.setStyle(style)

            # Use canvas scaling as fallback to fit table within bbox
@@ -4350,33 +4473,100 @@ class PDFGeneratorService:
                        # Replace newlines with <br/>
                        safe_content = safe_content.replace('\n', '<br/>')

-                        # Calculate font size from bbox height, but keep minimum 10pt
-                        font_size = max(box_height * 0.7, 10)
-                        font_size = min(font_size, 24)  # Cap at 24pt
+                        # Get original font size from style info
+                        style_info = elem.get('style', {})
+                        original_font_size = style_info.get('font_size', 12.0)

-                        # Create style for this element
-                        elem_style = ParagraphStyle(
-                            f'elem_{id(elem)}',
-                            parent=base_style,
-                            fontSize=font_size,
-                            leading=font_size * 1.2,
+                        # Detect vertical text (Y-axis labels, etc.)
+                        # Vertical text has aspect_ratio (height/width) > 2 and multiple characters
+                        is_vertical_text = (
+                            box_height > box_width * 2 and
+                            len(content.strip()) > 1
                        )

-                        # Create paragraph
-                        para = Paragraph(safe_content, elem_style)
+                        if is_vertical_text:
+                            # For vertical text, cap the original font size at 90% of the box width, then rotate
+                            font_size = min(original_font_size, box_width * 0.9)
+                            font_size = max(font_size, 6)  # Minimum 6pt

-                        # Calculate available width and height
-                        available_width = box_width
-                        available_height = box_height * 2  # Allow overflow
+                            # Save canvas state for rotation
+                            pdf_canvas.saveState()

-                        # Wrap the paragraph
-                        para_width, para_height = para.wrap(available_width, available_height)
+                            # Convert to PDF coordinates
+                            pdf_y_center = current_page_height - (y0 + y1) / 2
+                            x_center = (x0 + x1) / 2

-                        # Convert to PDF coordinates (y from bottom)
-                        pdf_y = current_page_height - y0 - para_height
+                            # Translate to the bbox center and rotate; restoreState() below undoes both
+                            pdf_canvas.translate(x_center, pdf_y_center)
+                            pdf_canvas.rotate(90)

-                        # Draw the paragraph
-                        para.drawOn(pdf_canvas, x0, pdf_y)
+                            # Set font and draw text centered
+                            pdf_canvas.setFont(
+                                self.font_name if self.font_registered else 'Helvetica',
+                                font_size
+                            )
+                            # Draw text at origin (since we translated to center)
+                            text_width = pdf_canvas.stringWidth(
+                                safe_content.replace('&amp;', '&').replace('&lt;', '<').replace('&gt;', '>'),
+                                self.font_name if self.font_registered else 'Helvetica',
+                                font_size
+                            )
+                            pdf_canvas.drawString(-text_width / 2, -font_size / 3,
+                                safe_content.replace('&amp;', '&').replace('&lt;', '<').replace('&gt;', '>'))
+
+                            pdf_canvas.restoreState()
+                        else:
+                            # For horizontal text, dynamically fit text within bbox
+                            # Start with original font size and reduce until text fits
+                            MIN_FONT_SIZE = 6
+                            MAX_FONT_SIZE = 14
+
+                            if original_font_size > 0:
+                                start_font_size = min(original_font_size, MAX_FONT_SIZE)
+                            else:
+                                start_font_size = min(box_height * 0.7, MAX_FONT_SIZE)
+
+                            font_size = max(start_font_size, MIN_FONT_SIZE)
+
+                            # Try progressively smaller font sizes until text fits
+                            para = None
+                            para_height = box_height + 1  # Start with height > box to enter loop
+
+                            while font_size >= MIN_FONT_SIZE and para_height > box_height:
+                                elem_style = ParagraphStyle(
+                                    f'elem_{id(elem)}_{font_size}',
+                                    parent=base_style,
+                                    fontSize=font_size,
+                                    leading=font_size * 1.15,  # Tighter leading
+                                )
+
+                                para = Paragraph(safe_content, elem_style)
+                                para_width, para_height = para.wrap(box_width, box_height * 3)
+
+                                if para_height <= box_height:
+                                    break  # Text fits!
+
+                                font_size -= 0.5  # Reduce font size and try again
+
+                            # Ensure minimum font size
+                            if font_size < MIN_FONT_SIZE:
+                                font_size = MIN_FONT_SIZE
+                                elem_style = ParagraphStyle(
+                                    f'elem_{id(elem)}_min',
+                                    parent=base_style,
+                                    fontSize=font_size,
+                                    leading=font_size * 1.15,
+                                )
+                                para = Paragraph(safe_content, elem_style)
+                                para_width, para_height = para.wrap(box_width, box_height * 3)
+
+                            # Convert to PDF coordinates (y from bottom)
+                            # Cap the height used for the y-offset at the bbox height (this moves the anchor only; the drawn text is not clipped)
+                            actual_height = min(para_height, box_height)
+                            pdf_y = current_page_height - y0 - actual_height
+
+                            # Draw the paragraph
+                            para.drawOn(pdf_canvas, x0, pdf_y)

            # Save PDF
            pdf_canvas.save()
@@ -4451,13 +4641,47 @@ class PDFGeneratorService:
            pdf_y_bottom = page_height - ty1
            pdf_canvas.rect(tx0, pdf_y_bottom, table_width, table_height, stroke=1, fill=0)

-            # Step 2: Draw cell borders using cell_boxes
+            # Step 2: Get or calculate cell boxes
            cell_boxes = metadata.get('cell_boxes', [])
-            if cell_boxes:
-                # Normalize cell boxes for grid alignment
-                if hasattr(self, '_normalize_cell_boxes_to_grid'):
-                    cell_boxes = self._normalize_cell_boxes_to_grid(cell_boxes)

+            # If no cell_boxes, calculate from column_widths and row_heights
+            if not cell_boxes:
+                column_widths = metadata.get('column_widths', [])
+                row_heights = metadata.get('row_heights', [])
+
+                if column_widths and row_heights:
+                    # Calculate cell positions from widths and heights
+                    cell_boxes = []
+                    rows = content.get('rows', len(row_heights)) if isinstance(content, dict) else len(row_heights)
+                    cols = content.get('cols', len(column_widths)) if isinstance(content, dict) else len(column_widths)
+
+                    # Calculate cumulative positions
+                    x_positions = [tx0]
+                    for w in column_widths[:cols]:
+                        x_positions.append(x_positions[-1] + w)
+
+                    y_positions = [ty0]
+                    for h in row_heights[:rows]:
+                        y_positions.append(y_positions[-1] + h)
+
+                    # Create cell boxes for each cell (row-major order)
+                    for row_idx in range(rows):
+                        for col_idx in range(cols):
+                            if col_idx < len(x_positions) - 1 and row_idx < len(y_positions) - 1:
+                                cx0 = x_positions[col_idx]
+                                cy0 = y_positions[row_idx]
+                                cx1 = x_positions[col_idx + 1]
+                                cy1 = y_positions[row_idx + 1]
+                                cell_boxes.append([cx0, cy0, cx1, cy1])
+
+                    logger.debug(f"Calculated {len(cell_boxes)} cell boxes from {cols} cols x {rows} rows")
+
+            # Normalize cell boxes for grid alignment
+            if cell_boxes and hasattr(self, '_normalize_cell_boxes_to_grid'):
+                cell_boxes = self._normalize_cell_boxes_to_grid(cell_boxes)
+
+            # Draw cell borders
+            if cell_boxes:
                pdf_canvas.setLineWidth(0.5)
                for box in cell_boxes:
                    if len(box) >= 4: