fix: improve PDF layout generation for Direct track

Key fixes:
- Skip large vector_graphics charts (>50% page coverage) that cover text
- Fix font fallback to use NotoSansSC for CJK support instead of Helvetica
- Improve translated table rendering with dynamic font sizing
- Add merged cell (row_span/col_span) support for reflow tables
- Skip text elements inside table bboxes to avoid duplication

Archive openspec proposal: fix-pdf-table-rendering

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-12-03 14:55:00 +08:00
parent 08adf3d01d
commit 1b5c7f39a8
5 changed files with 405 additions and 111 deletions
--- a/backend/app/services/pdf_generator_service.py
+++ b/backend/app/services/pdf_generator_service.py
@@ -354,9 +354,13 @@ class PDFGeneratorService:
        elif 'courier' in font_lower:
            return 'Courier'

-        # Default fallback
-        logger.debug(f"Font '{font_name}' not found in mapping, using Helvetica")
-        return 'Helvetica'
+        # Default fallback - use NotoSansSC for CJK support if registered
+        if self.font_registered:
+            logger.debug(f"Font '{font_name}' not found in mapping, using {self.font_name} for CJK support")
+            return self.font_name
+        else:
+            logger.debug(f"Font '{font_name}' not found in mapping, using Helvetica")
+            return 'Helvetica'

    def _apply_text_style(self, c: canvas.Canvas, style_info, default_size: float = 12):
        """
@@ -866,6 +870,23 @@ class PDFGeneratorService:
                        ElementType.IMAGE, ElementType.FIGURE,
                        ElementType.CHART, ElementType.DIAGRAM, ElementType.LOGO, ElementType.STAMP
                    ]:
+                        # Skip large vector_graphics charts in Direct track
+                        # These are visual decorations (borders, lines, frames) that would cover text
+                        # PyMuPDF extracts both vector graphics as images AND text layer separately
+                        if element.type == ElementType.CHART and element.bbox:
+                            content = element.content
+                            is_vector_graphics = (
+                                isinstance(content, dict) and
+                                content.get('source') == 'vector_graphics'
+                            )
+                            if is_vector_graphics:
+                                elem_area = (element.bbox.x1 - element.bbox.x0) * (element.bbox.y1 - element.bbox.y0)
+                                coverage_ratio = elem_area / page_area if page_area > 0 else 0
+                                if coverage_ratio > 0.5:
+                                    logger.info(f"Skipping large vector_graphics chart {element.element_id} "
+                                              f"(covers {coverage_ratio*100:.1f}% of page) - text provides actual content")
+                                    continue
+
                        image_elements.append(element)
                        # Only add real images to exclusion regions, NOT charts/diagrams
                        # Charts often have large bounding boxes that include text labels
@@ -3704,64 +3725,103 @@ class PDFGeneratorService:

    def _create_reflow_table(self, table_data: Dict, styles: Dict) -> Optional[Table]:
        """
-        Create a Platypus Table for reflow mode.
+        Create a Platypus Table for reflow mode with merged cell support.

        Args:
-            table_data: Table element dictionary with 'rows' or 'cells'
+            table_data: Table element dictionary with 'content' containing 'cells'
            styles: Style dictionary

        Returns:
            Platypus Table object or None
        """
        try:
-            # Get content - cells might be inside 'content' dict
+            # Get content - cells are inside 'content' dict
            content = table_data.get('content', {})
-            if isinstance(content, dict):
-                rows_data = content.get('rows', []) if isinstance(content.get('rows'), list) else []
-                cells = content.get('cells', [])
-            else:
-                rows_data = table_data.get('rows', [])
-                cells = table_data.get('cells', [])
-
-            if not rows_data and cells:
-                # Group cells by row - support both 'row'/'col' and 'row_index'/'col_index' keys
-                row_map = {}
-                for cell in cells:
-                    row_idx = cell.get('row', cell.get('row_index', 0))
-                    if row_idx not in row_map:
-                        row_map[row_idx] = []
-                    row_map[row_idx].append(cell)
-                # Sort and create rows
-                rows_data = []
-                for row_idx in sorted(row_map.keys()):
-                    row_cells = sorted(row_map[row_idx], key=lambda c: c.get('col', c.get('col_index', 0)))
-                    rows_data.append({'cells': row_cells})
-
-            if not rows_data:
+            if not isinstance(content, dict):
                return None

-            # Build table data
+            cells = content.get('cells', [])
+            if not cells:
+                return None
+
+            # Determine grid dimensions
+            num_rows = content.get('rows', 0)
+            num_cols = content.get('cols', 0)
+
+            if num_rows == 0 or num_cols == 0:
+                # Calculate from cells
+                for cell in cells:
+                    row = cell.get('row', cell.get('row_index', 0))
+                    col = cell.get('col', cell.get('col_index', 0))
+                    row_span = cell.get('row_span', 1)
+                    col_span = cell.get('col_span', 1)
+                    num_rows = max(num_rows, row + row_span)
+                    num_cols = max(num_cols, col + col_span)
+
+            if num_rows == 0 or num_cols == 0:
+                return None
+
+            # Initialize grid with empty strings
+            grid = [['' for _ in range(num_cols)] for _ in range(num_rows)]
+            # Track which cells are covered by spans
+            covered = [[False for _ in range(num_cols)] for _ in range(num_rows)]
+            # Track span commands
+            span_commands = []
+
+            # Fill grid with cell content
+            for cell in cells:
+                row = cell.get('row', cell.get('row_index', 0))
+                col = cell.get('col', cell.get('col_index', 0))
+                row_span = cell.get('row_span', 1)
+                col_span = cell.get('col_span', 1)
+
+                # Get cell text
+                text = cell.get('content', cell.get('text', ''))
+                if not isinstance(text, str):
+                    text = str(text) if text else ''
+
+                # Escape HTML special characters
+                text = text.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
+
+                # Place content in the top-left cell of the span
+                if 0 <= row < num_rows and 0 <= col < num_cols:
+                    grid[row][col] = text
+
+                    # Mark covered cells for spans
+                    if row_span > 1 or col_span > 1:
+                        # Add SPAN command
+                        span_commands.append((
+                            'SPAN',
+                            (col, row),
+                            (col + col_span - 1, row + row_span - 1)
+                        ))
+                        # Mark cells as covered
+                        for r in range(row, min(row + row_span, num_rows)):
+                            for c in range(col, min(col + col_span, num_cols)):
+                                if r != row or c != col:
+                                    covered[r][c] = True
+
+            # Build table data with Paragraphs
            data = []
-            for row in rows_data:
+            for row_idx in range(num_rows):
                row_data = []
-                row_cells = row.get('cells', [])
-                for cell in row_cells:
-                    # Support both 'text' and 'content' keys
-                    text = cell.get('text', cell.get('content', ''))
-                    if not isinstance(text, str):
-                        text = str(text) if text else ''
-                    # Escape HTML special characters
-                    text = text.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
-                    row_data.append(Paragraph(text, styles['TableCell']))
-                if row_data:
-                    data.append(row_data)
+                for col_idx in range(num_cols):
+                    if covered[row_idx][col_idx]:
+                        # Empty cell for covered spans
+                        row_data.append('')
+                    else:
+                        text = grid[row_idx][col_idx]
+                        row_data.append(Paragraph(text, styles['TableCell']))
+                data.append(row_data)

            if not data:
                return None

            # Create table
            table = Table(data)
-            table.setStyle(TableStyle([
+
+            # Build style commands
+            style_commands = [
                ('GRID', (0, 0), (-1, -1), 0.5, colors.black),
                ('VALIGN', (0, 0), (-1, -1), 'TOP'),
                ('LEFTPADDING', (0, 0), (-1, -1), 6),
@@ -3769,11 +3829,20 @@ class PDFGeneratorService:
                ('TOPPADDING', (0, 0), (-1, -1), 4),
                ('BOTTOMPADDING', (0, 0), (-1, -1), 4),
                ('BACKGROUND', (0, 0), (-1, 0), colors.lightgrey),  # Header row
-            ]))
+            ]
+
+            # Add span commands
+            style_commands.extend(span_commands)
+
+            table.setStyle(TableStyle(style_commands))
+
+            logger.info(f"[REFLOW TABLE] Created table with {num_rows}x{num_cols} cells, {len(span_commands)} spans")
            return table

        except Exception as e:
            logger.error(f"Failed to create reflow table: {e}")
+            import traceback
+            traceback.print_exc()
            return None

    def _embed_image_reflow(
@@ -4189,8 +4258,31 @@ class PDFGeneratorService:

                pdf_canvas.setPageSize((current_page_width, current_page_height))

-                # Process elements
+                # Process elements:
+                # - Tables: draw borders + translated cell text (with dynamic font sizing)
+                # - Text elements: draw at original positions, SKIP if inside table bbox
+                # - Images: draw at original positions
                elements = page_data.get('elements', [])
+
+                # Collect table bboxes to skip text elements inside tables
+                table_bboxes = []
+                for elem in elements:
+                    if elem.get('type') in ('table', 'Table'):
+                        elem_bbox = elem.get('bbox', {})
+                        if elem_bbox:
+                            table_bboxes.append(elem_bbox)
+
+                def is_inside_table(text_bbox):
+                    """Check if text bbox is inside any table bbox."""
+                    margin = 5
+                    for tb in table_bboxes:
+                        if (text_bbox.get('x0', 0) >= tb.get('x0', 0) - margin and
+                            text_bbox.get('y0', 0) >= tb.get('y0', 0) - margin and
+                            text_bbox.get('x1', 0) <= tb.get('x1', 0) + margin and
+                            text_bbox.get('y1', 0) <= tb.get('y1', 0) + margin):
+                            return True
+                    return False
+
                for elem in elements:
                    elem_type = elem.get('type', 'text')
                    content = elem.get('content', '')
@@ -4211,6 +4303,22 @@ class PDFGeneratorService:

                    # Handle different element types
                    if elem_type in ('image', 'figure', 'Image', 'Figure', 'chart', 'Chart'):
+                        # Skip large vector_graphics charts - they're visual decorations that cover text
+                        if elem_type in ('chart', 'Chart'):
+                            elem_content = elem.get('content', {})
+                            is_vector_graphics = (
+                                isinstance(elem_content, dict) and
+                                elem_content.get('source') == 'vector_graphics'
+                            )
+                            if is_vector_graphics:
+                                page_area = current_page_width * current_page_height
+                                elem_area = box_width * box_height
+                                coverage_ratio = elem_area / page_area if page_area > 0 else 0
+                                if coverage_ratio > 0.5:
+                                    logger.info(f"Skipping large vector_graphics chart "
+                                              f"(covers {coverage_ratio*100:.1f}% of page)")
+                                    continue
+
                        # Draw image
                        img = self._embed_image_reflow(elem, image_dir)
                        if img:
@@ -4229,6 +4337,11 @@ class PDFGeneratorService:
                        )

                    elif isinstance(content, str) and content.strip():
+                        # Skip text elements inside table bboxes
+                        # (Table cells are rendered by _draw_translated_table with dynamic font sizing)
+                        if is_inside_table(bbox):
+                            continue
+
                        # Text element - use Paragraph for word wrapping
                        # Escape special characters
                        safe_content = content.replace('&', '&amp;')
@@ -4290,106 +4403,140 @@ class PDFGeneratorService:
        image_dir: Path
    ):
        """
-        Draw a table with translated content using Platypus Table.
+        Draw a table with translated content.

-        Supports adaptive column widths and text wrapping within cells.
+        Approach:
+        1. Draw cell borders using cell_boxes from metadata
+        2. Render translated text in each cell with dynamic font sizing
+        3. Draw embedded images at their original positions
+
+        Text is rendered with dynamic font sizing to fit within cells.
+        Minimum font size is 6pt for readability.

        Args:
            pdf_canvas: ReportLab canvas
-            elem: Table element dict
+            elem: Table element dict with metadata containing cell_boxes
            page_height: Page height for coordinate transformation
            image_dir: Directory containing images
        """
-        from reportlab.platypus import Table, TableStyle, Paragraph
-        from reportlab.lib.styles import ParagraphStyle
        from reportlab.lib import colors
+        from reportlab.lib.styles import ParagraphStyle
+        from reportlab.platypus import Paragraph
+
+        MIN_FONT_SIZE = 6  # Minimum font size for readability

        try:
            content = elem.get('content', {})
            bbox = elem.get('bbox', {})
+            metadata = elem.get('metadata', {})

            if not bbox:
                return

-            x0 = bbox.get('x0', 0)
-            y0 = bbox.get('y0', 0)
-            x1 = bbox.get('x1', 0)
-            y1 = bbox.get('y1', 0)
-            table_width = x1 - x0
-            table_height = y1 - y0
-
-            # Parse table content
-            if isinstance(content, dict):
-                rows = content.get('rows', [])
-                cells = content.get('cells', [])
+            # Get table bounding box
+            if isinstance(bbox, dict):
+                tx0 = bbox.get('x0', 0)
+                ty0 = bbox.get('y0', 0)
+                tx1 = bbox.get('x1', 0)
+                ty1 = bbox.get('y1', 0)
            else:
-                return
+                tx0, ty0, tx1, ty1 = bbox[:4] if len(bbox) >= 4 else (0, 0, 0, 0)

-            if not rows and not cells:
-                return
+            table_width = tx1 - tx0
+            table_height = ty1 - ty0

-            # Build table data
-            table_data = []
+            # Step 1: Draw outer table border
+            pdf_canvas.setStrokeColor(colors.black)
+            pdf_canvas.setLineWidth(1.0)
+            pdf_y_bottom = page_height - ty1
+            pdf_canvas.rect(tx0, pdf_y_bottom, table_width, table_height, stroke=1, fill=0)

-            if rows:
-                for row in rows:
-                    row_cells = row if isinstance(row, list) else row.get('cells', [])
-                    row_data = []
-                    for cell in row_cells:
-                        if isinstance(cell, str):
-                            cell_text = cell
-                        elif isinstance(cell, dict):
-                            cell_text = cell.get('content', cell.get('text', ''))
-                        else:
-                            cell_text = str(cell) if cell else ''
+            # Step 2: Draw cell borders using cell_boxes
+            cell_boxes = metadata.get('cell_boxes', [])
+            if cell_boxes:
+                # Normalize cell boxes for grid alignment
+                if hasattr(self, '_normalize_cell_boxes_to_grid'):
+                    cell_boxes = self._normalize_cell_boxes_to_grid(cell_boxes)

-                        # Create paragraph for text wrapping
-                        safe_text = str(cell_text).replace('&', '&amp;')
-                        safe_text = safe_text.replace('<', '&lt;').replace('>', '&gt;')
+                pdf_canvas.setLineWidth(0.5)
+                for box in cell_boxes:
+                    if len(box) >= 4:
+                        cx0, cy0, cx1, cy1 = box[:4]
+                        cell_width = cx1 - cx0
+                        cell_height = cy1 - cy0
+                        pdf_cell_y = page_height - cy1
+                        pdf_canvas.rect(cx0, pdf_cell_y, cell_width, cell_height, stroke=1, fill=0)

-                        cell_style = ParagraphStyle(
-                            f'cell_{id(cell)}',
-                            fontName=self.font_name if self.font_registered else 'Helvetica',
-                            fontSize=9,
-                            leading=11,
-                            wordWrap='CJK',
-                        )
-                        para = Paragraph(safe_text, cell_style)
-                        row_data.append(para)
+            # Step 3: Render translated text in each cell
+            cells = content.get('cells', []) if isinstance(content, dict) else []
+            font_name = self.font_name if self.font_registered else 'Helvetica'

-                    if row_data:
-                        table_data.append(row_data)
+            for i, cell in enumerate(cells):
+                cell_text = cell.get('content', cell.get('text', ''))
+                if not cell_text or not cell_text.strip():
+                    continue

-            if not table_data:
-                return
+                # Get cell bounding box by index
+                if i >= len(cell_boxes):
+                    continue

-            # Calculate column widths
-            num_cols = max(len(row) for row in table_data) if table_data else 1
-            col_width = table_width / num_cols if num_cols > 0 else table_width
+                cx0, cy0, cx1, cy1 = cell_boxes[i][:4]
+                cell_width = cx1 - cx0
+                cell_height = cy1 - cy0

-            # Create table
-            table = Table(table_data, colWidths=[col_width] * num_cols)
+                # Skip tiny cells
+                if cell_width < 10 or cell_height < 10:
+                    continue

-            # Apply table style
-            table.setStyle(TableStyle([
-                ('GRID', (0, 0), (-1, -1), 0.5, colors.black),
-                ('VALIGN', (0, 0), (-1, -1), 'TOP'),
-                ('LEFTPADDING', (0, 0), (-1, -1), 4),
-                ('RIGHTPADDING', (0, 0), (-1, -1), 4),
-                ('TOPPADDING', (0, 0), (-1, -1), 2),
-                ('BOTTOMPADDING', (0, 0), (-1, -1), 2),
-            ]))
+                # Prepare text (escape HTML special chars)
+                safe_text = str(cell_text).replace('&', '&amp;')
+                safe_text = safe_text.replace('<', '&lt;').replace('>', '&gt;')
+                safe_text = safe_text.replace('\n', '<br/>')

-            # Wrap and draw table
-            t_width, t_height = table.wrap(table_width, table_height * 2)
+                # Dynamic font sizing: start at 10pt, shrink until text fits
+                padding = 3
+                available_width = cell_width - padding * 2
+                available_height = cell_height - padding * 2

-            # Convert to PDF coordinates
-            pdf_y = page_height - y0 - t_height
+                if available_width <= 0 or available_height <= 0:
+                    continue

-            table.drawOn(pdf_canvas, x0, pdf_y)
+                # Try font sizes from 10pt down to MIN_FONT_SIZE
+                for font_size in range(10, MIN_FONT_SIZE - 1, -1):
+                    cell_style = ParagraphStyle(
+                        f'cell_{i}_{font_size}',
+                        fontName=font_name,
+                        fontSize=font_size,
+                        leading=font_size * 1.15,
+                        wordWrap='CJK',
+                    )
+                    para = Paragraph(safe_text, cell_style)
+                    para_width, para_height = para.wrap(available_width, available_height * 10)
+
+                    if para_height <= available_height:
+                        break  # Text fits at this font size
+
+                # Draw text (anchored at the cell's top-left, inset by padding)
+                text_x = cx0 + padding
+                # Calculate vertical position (top-aligned within cell)
+                text_y = page_height - cy0 - padding - min(para_height, available_height)
+
+                para.drawOn(pdf_canvas, text_x, text_y)
+
+            logger.info(f"[TRANSLATED TABLE] Drew table with {len(cell_boxes)} borders, {len(cells)} cells")
+
+            # Step 4: Draw embedded images
+            embedded_images = metadata.get('embedded_images', [])
+            if embedded_images and image_dir:
+                for emb_img in embedded_images:
+                    self._draw_embedded_image(
+                        pdf_canvas, emb_img, page_height, image_dir, 1.0, 1.0
+                    )

        except Exception as e:
            logger.error(f"Failed to draw translated table: {e}")
+            import traceback
+            traceback.print_exc()


 # Singleton instance