feat: add table detection options and scan artifact removal

- Add TableDetectionSelector component for wired/wireless/region detection
- Add CV-based table line detector module (disabled due to poor performance)
- Add scan artifact removal preprocessing step (removes faint horizontal lines)
- Add PreprocessingConfig schema with remove_scan_artifacts option
- Update frontend PreprocessingSettings with scan artifact toggle
- Integrate table detection config into ProcessingPage
- Archive extract-table-cell-boxes proposal

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-30 13:21:50 +08:00
parent f5a2c8a750
commit 95ae1f1bdb
17 changed files with 1906 additions and 344 deletions
--- a/backend/app/services/pdf_generator_service.py
+++ b/backend/app/services/pdf_generator_service.py
@@ -447,7 +447,8 @@ class PDFGeneratorService:
                            'text': text_content,
                            'bbox': bbox_polygon,
                            'confidence': element.confidence or 1.0,
-                            'page': page_num
+                            'page': page_num,
+                            'element_type': element.type.value  # Include element type for styling
                        }

                        # Include style information if available (for Direct track)
@@ -466,13 +467,24 @@ class PDFGeneratorService:
                    else:
                        html_content = str(element.content)

-                    layout_elements.append({
+                    table_element = {
                        'type': 'table',
                        'content': html_content,
                        'bbox': [element.bbox.x0, element.bbox.y0,
                                element.bbox.x1, element.bbox.y1],
                        'page': page_num - 1  # layout uses 0-based
-                    })
+                    }
+
+                    # Preserve cell_boxes and embedded_images from metadata
+                    # These are extracted by PP-StructureV3 and used for accurate table rendering
+                    if element.metadata:
+                        if 'cell_boxes' in element.metadata:
+                            table_element['cell_boxes'] = element.metadata['cell_boxes']
+                            table_element['cell_boxes_source'] = element.metadata.get('cell_boxes_source', 'metadata')
+                        if 'embedded_images' in element.metadata:
+                            table_element['embedded_images'] = element.metadata['embedded_images']
+
+                    layout_elements.append(table_element)

                    # Add bbox to images_metadata for text overlap filtering
                    # (no actual image file, just bbox for filtering)
@@ -484,10 +496,10 @@ class PDFGeneratorService:
                        'element_id': element.element_id
                    })

-                # Handle image/visual elements
+                # Handle image/visual elements (including stamps/seals)
                elif element.is_visual or element.type in [
                    ElementType.IMAGE, ElementType.FIGURE, ElementType.CHART,
-                    ElementType.DIAGRAM, ElementType.LOGO
+                    ElementType.DIAGRAM, ElementType.LOGO, ElementType.STAMP
                ]:
                    # Get image path using fallback logic
                    image_path = self._get_image_path(element)
@@ -729,13 +741,13 @@ class PDFGeneratorService:
                        regions_to_avoid.append(element)  # Tables are exclusion regions
                    elif element.is_visual or element.type in [
                        ElementType.IMAGE, ElementType.FIGURE,
-                        ElementType.CHART, ElementType.DIAGRAM, ElementType.LOGO
+                        ElementType.CHART, ElementType.DIAGRAM, ElementType.LOGO, ElementType.STAMP
                    ]:
                        image_elements.append(element)
                        # Only add real images to exclusion regions, NOT charts/diagrams
                        # Charts often have large bounding boxes that include text labels
                        # which should be rendered as selectable text on top
-                        if element.type in [ElementType.IMAGE, ElementType.FIGURE, ElementType.LOGO]:
+                        if element.type in [ElementType.IMAGE, ElementType.FIGURE, ElementType.LOGO, ElementType.STAMP]:
                            regions_to_avoid.append(element)
                    elif element.type == ElementType.LIST_ITEM:
                        list_elements.append(element)
@@ -934,11 +946,14 @@ class PDFGeneratorService:
            # Create PDF canvas with initial page size (will be updated per page)
            pdf_canvas = canvas.Canvas(str(output_path), pagesize=(target_width, target_height))

-            # Filter text regions to avoid overlap with tables/images
-            regions_to_avoid = images_metadata
+            # LAYERED RENDERING: Exclude tables from regions_to_avoid
+            # Text inside tables will be rendered at raw OCR positions (via GapFillingService)
+            # while table borders are drawn separately using cell_boxes
+            # Only avoid overlap with actual images/figures/charts
+            regions_to_avoid = [img for img in images_metadata if img.get('type') != 'table']
            table_count = len([img for img in images_metadata if img.get('type') == 'table'])

-            logger.info(f"過濾文字區域: {len(regions_to_avoid)} 個區域需要避免 (含 {table_count} 個表格)")
+            logger.info(f"過濾文字區域: {len(regions_to_avoid)} 個區域需要避免 (不含表格), {table_count} 個表格使用分層渲染")

            filtered_text_regions = self._filter_text_in_regions(text_regions, regions_to_avoid)

@@ -1042,7 +1057,8 @@ class PDFGeneratorService:
                for table_elem in page_table_regions:
                    self.draw_table_region(
                        pdf_canvas, table_elem, images_metadata,
-                        current_target_h, current_scale_w, current_scale_h
+                        current_target_h, current_scale_w, current_scale_h,
+                        result_dir=json_parent_dir
                    )

                # 3. Draw text (top layer)
@@ -1542,8 +1558,8 @@ class PDFGeneratorService:
            logger.info(f"[文字] '{text[:30]}' → PDF位置: ({pdf_x:.1f}, {pdf_y:.1f}), 字體:{font_size:.1f}pt, 寬x高:{bbox_width:.0f}x{bbox_height:.0f}, 行數:{num_lines}")

            # Set font with track-specific styling
-            # Note: OCR track has no StyleInfo (extracted from images), so no advanced formatting
            style_info = region.get('style')
+            element_type = region.get('element_type', 'text')
            is_direct_track = (self.current_processing_track == ProcessingTrack.DIRECT or
                               self.current_processing_track == ProcessingTrack.HYBRID)

@@ -1555,9 +1571,25 @@ class PDFGeneratorService:
                font_size = pdf_canvas._fontsize
                logger.debug(f"Applied Direct track style: font={font_name}, size={font_size}")
            else:
-                # OCR track or no style: Use simple font selection
+                # OCR track or no style: Use simple font selection with element-type based styling
                font_name = self.font_name if self.font_registered else 'Helvetica'
-                pdf_canvas.setFont(font_name, font_size)
+
+                # Apply element-type specific styling (for OCR track)
+                if element_type == 'title':
+                    # Titles: use larger, bold font
+                    font_size = min(font_size * 1.3, 36)  # 30% larger, max 36pt
+                    pdf_canvas.setFont(font_name, font_size)
+                    logger.debug(f"Applied title style: size={font_size:.1f}")
+                elif element_type == 'header':
+                    # Headers: slightly larger
+                    font_size = min(font_size * 1.15, 24)  # 15% larger, max 24pt
+                    pdf_canvas.setFont(font_name, font_size)
+                elif element_type == 'caption':
+                    # Captions: slightly smaller, italic if available
+                    font_size = max(font_size * 0.9, 6)  # 10% smaller, min 6pt
+                    pdf_canvas.setFont(font_name, font_size)
+                else:
+                    pdf_canvas.setFont(font_name, font_size)

            # Handle line breaks (split text by newlines)
            # OCR track: simple left-aligned rendering
@@ -1726,7 +1758,8 @@ class PDFGeneratorService:
        images_metadata: List[Dict],
        page_height: float,
        scale_w: float = 1.0,
-        scale_h: float = 1.0
+        scale_h: float = 1.0,
+        result_dir: Optional[Path] = None
    ):
        """
        Draw a table region by parsing HTML and rebuilding with ReportLab Table
@@ -1738,13 +1771,27 @@ class PDFGeneratorService:
            page_height: Height of page
            scale_w: Scale factor for X coordinates (PDF width / OCR width)
            scale_h: Scale factor for Y coordinates (PDF height / OCR height)
+            result_dir: Directory containing result files (for embedded images)
        """
        try:
            html_content = table_element.get('content', '')
            if not html_content:
                return

-            # Parse HTML to extract table structure
+            # Try to use cell_boxes for direct rendering first (more accurate)
+            cell_boxes = table_element.get('cell_boxes', [])
+            if cell_boxes:
+                logger.info(f"[TABLE] Using cell_boxes direct rendering ({len(cell_boxes)} cells)")
+                success = self._draw_table_with_cell_boxes(
+                    pdf_canvas, table_element, page_height,
+                    scale_w, scale_h, result_dir
+                )
+                if success:
+                    return  # Successfully rendered with cell_boxes
+
+                logger.info("[TABLE] Falling back to ReportLab Table")
+
+            # Fallback: Parse HTML to extract table structure and use ReportLab Table
            parser = HTMLTableParser()
            parser.feed(html_content)

@@ -1901,14 +1948,18 @@ class PDFGeneratorService:
                logger.info(f"[TABLE] Using cell_boxes col widths (scaled)")
            else:
                col_widths = [table_width / max_cols] * max_cols
-                logger.info(f"[TABLE] Using equal distribution col widths")
+                logger.info(f"[TABLE] Using equal distribution col widths: {table_width/max_cols:.1f} each")

-            # Row heights are used optionally (ReportLab can auto-size)
-            row_heights = None
+            # Row heights - ALWAYS use to ensure table fits bbox properly
+            # Use computed heights from cell_boxes, or uniform distribution as fallback
            if computed_row_heights:
                # Scale row_heights to PDF coordinates
                row_heights = [h * scale_h for h in computed_row_heights]
-                logger.debug(f"[TABLE] Cell_boxes row heights available (scaled)")
+                logger.info(f"[TABLE] Using cell_boxes row heights (scaled)")
+            else:
+                # Uniform distribution based on table bbox - ensures table fills its allocated space
+                row_heights = [table_height / num_rows] * num_rows
+                logger.info(f"[TABLE] Using uniform row heights: {table_height/num_rows:.1f} each")

            # Create ReportLab Table
            # Use smaller font to fit content with auto-wrap
@@ -1932,12 +1983,10 @@ class PDFGeneratorService:
                        escaped_text = cell_text.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
                        reportlab_data[row_idx][col_idx] = Paragraph(escaped_text, cell_style)

-            # Create table with computed col widths
-            # Note: We don't use row_heights even when available from cell_boxes because:
-            # 1. ReportLab's auto-sizing handles content overflow better
-            # 2. Fixed heights can cause text clipping when content exceeds cell size
-            # 3. The col_widths from cell_boxes provide the main layout benefit
-            table = Table(reportlab_data, colWidths=col_widths)
+            # Create table with col widths and row heights
+            # Always use row_heights to ensure table fits bbox properly
+            table = Table(reportlab_data, colWidths=col_widths, rowHeights=row_heights)
+            logger.info(f"[TABLE] Created with {len(col_widths)} cols, {len(row_heights)} rows")

            # Apply table style
            style = TableStyle([
@@ -1974,26 +2023,303 @@ class PDFGeneratorService:
            scale_y = table_height / actual_height if actual_height > table_height else 1.0
            scale_factor = min(scale_x, scale_y)  # Use smaller scale to fit both dimensions

+            # Calculate the table top position in PDF coordinates
+            # ReportLab uses bottom-left origin, so we need to position from TOP
+            pdf_y_top = page_height - ocr_y_top  # Top of table in PDF coords
+
+            # Calculate the actual bottom position based on scaled height
+            # Table should be positioned so its TOP aligns with the bbox top
+            scaled_height = actual_height * scale_factor
+            pdf_y_bottom = pdf_y_top - scaled_height  # Bottom of scaled table
+
+            logger.info(f"[表格] PDF座標: top={pdf_y_top:.0f}, bottom={pdf_y_bottom:.0f}, scaled_height={scaled_height:.0f}")
+
            if scale_factor < 1.0:
                logger.info(f"[表格] 縮放比例: {scale_factor:.2f} (需要縮小以適應 bbox)")
                # Apply scaling transformation
                pdf_canvas.saveState()
-                pdf_canvas.translate(pdf_x, pdf_y)
+                pdf_canvas.translate(pdf_x, pdf_y_bottom)
                pdf_canvas.scale(scale_factor, scale_factor)
                # Draw at origin since we've already translated
                table.drawOn(pdf_canvas, 0, 0)
                pdf_canvas.restoreState()
            else:
                # Draw table at position without scaling
-                table.drawOn(pdf_canvas, pdf_x, pdf_y)
+                # pdf_y should be the bottom of the table
+                table.drawOn(pdf_canvas, pdf_x, pdf_y_bottom)

-            logger.info(f"Drew table at ({pdf_x:.0f}, {pdf_y:.0f}) size {table_width:.0f}x{table_height:.0f} with {len(rows)} rows")
+            logger.info(f"Drew table at ({pdf_x:.0f}, {pdf_y_bottom:.0f}) size {table_width:.0f}x{scaled_height:.0f} with {len(rows)} rows")
+
+            # Draw embedded images (images detected inside the table region)
+            embedded_images = table_element.get('embedded_images', [])
+            if embedded_images and result_dir:
+                logger.info(f"[TABLE] Drawing {len(embedded_images)} embedded images")
+                for emb_img in embedded_images:
+                    self._draw_embedded_image(
+                        pdf_canvas, emb_img, page_height, result_dir, scale_w, scale_h
+                    )

        except Exception as e:
            logger.warning(f"Failed to draw table region: {e}")
            import traceback
            traceback.print_exc()

+    def _draw_embedded_image(
+        self,
+        pdf_canvas: canvas.Canvas,
+        emb_img: Dict,
+        page_height: float,
+        result_dir: Path,
+        scale_w: float = 1.0,
+        scale_h: float = 1.0
+    ):
+        """
+        Draw an embedded image inside a table region.
+
+        Best-effort: an empty 'saved_path' returns silently, a missing file
+        or bbox is logged as a warning and skipped, and any exception raised
+        while drawing is logged as a warning instead of propagating.
+
+        Args:
+            pdf_canvas: ReportLab canvas object
+            emb_img: Dict read for 'saved_path' and 'bbox' ([x0, y0, x1, y1])
+            page_height: Height of page, used to flip Y to bottom-left origin
+            result_dir: Directory that 'saved_path' is resolved against
+            scale_w: Scale factor for X coordinates
+            scale_h: Scale factor for Y coordinates
+        """
+        try:
+            # Get image path
+            saved_path = emb_img.get('saved_path', '')
+            if not saved_path:
+                return
+
+            # Construct full path; if saved_path (as given) is not found under
+            # result_dir, retry with just its file name directly in result_dir
+            image_path = result_dir / saved_path
+            if not image_path.exists():
+                image_path = result_dir / Path(saved_path).name
+
+            if not image_path.exists():
+                logger.warning(f"Embedded image not found: {saved_path}")
+                return
+
+            # Get bbox from embedded image data
+            bbox = emb_img.get('bbox', [])
+            if not bbox or len(bbox) < 4:
+                logger.warning(f"No bbox for embedded image: {saved_path}")
+                return
+
+            # Calculate position (bbox is [x0, y0, x1, y1])
+            x0, y0, x1, y1 = bbox[0], bbox[1], bbox[2], bbox[3]
+
+            # Apply scaling
+            x0_scaled = x0 * scale_w
+            y0_scaled = y0 * scale_h
+            x1_scaled = x1 * scale_w
+            y1_scaled = y1 * scale_h
+
+            width = x1_scaled - x0_scaled
+            height = y1_scaled - y0_scaled
+
+            # Transform Y coordinate (ReportLab uses bottom-left origin)
+            # y1 is the bottom edge in top-left coordinates, so it becomes the
+            # lower-left anchor that drawImage expects
+            pdf_x = x0_scaled
+            pdf_y = page_height - y1_scaled
+
+            # Draw the image
+            from reportlab.lib.utils import ImageReader
+            img_reader = ImageReader(str(image_path))
+            pdf_canvas.drawImage(
+                img_reader, pdf_x, pdf_y, width, height,
+                preserveAspectRatio=True, mask='auto'
+            )
+
+            logger.info(f"Drew embedded image at ({pdf_x:.0f}, {pdf_y:.0f}) size {width:.0f}x{height:.0f}")
+
+        except Exception as e:
+            # Best-effort: a failed image must not abort the rest of the page
+            logger.warning(f"Failed to draw embedded image: {e}")
+
+    def _normalize_cell_boxes_to_grid(
+        self,
+        cell_boxes: List[List[float]],
+        threshold: float = 10.0
+    ) -> List[List[float]]:
+        """
+        Normalize cell boxes to create a proper aligned grid.
+
+        Groups nearby coordinates and snaps them to a common value,
+        eliminating the 2-11 pixel variations that cause skewed tables.
+        X coordinates and Y coordinates are clustered independently; each
+        cluster is replaced by the average of its members.
+
+        NOTE(review): left and right edges (and likewise top and bottom
+        edges) are clustered in one shared pool, so a cell narrower or
+        shorter than `threshold` can have both edges snapped to the same
+        value - confirm this is acceptable for the expected inputs.
+
+        Args:
+            cell_boxes: List of cell bboxes [[x1,y1,x2,y2], ...]
+            threshold: Maximum distance to consider coordinates as "same line"
+
+        Returns:
+            Normalized cell_boxes with aligned coordinates. When fewer than
+            two boxes are given, the input is returned unchanged.
+        """
+        if not cell_boxes or len(cell_boxes) < 2:
+            return cell_boxes
+
+        # Collect all X and Y coordinates
+        x_coords = []  # (value, box_idx, is_x1)
+        y_coords = []  # (value, box_idx, is_y1)
+
+        for i, box in enumerate(cell_boxes):
+            x1, y1, x2, y2 = box[0], box[1], box[2], box[3]
+            x_coords.append((x1, i, True))   # x1 (left)
+            x_coords.append((x2, i, False))  # x2 (right)
+            y_coords.append((y1, i, True))   # y1 (top)
+            y_coords.append((y2, i, False))  # y2 (bottom)
+
+        def cluster_and_normalize(coords, threshold):
+            """Cluster nearby coordinates and return mapping to normalized values."""
+            if not coords:
+                return {}
+
+            # Sort by value
+            sorted_coords = sorted(coords, key=lambda x: x[0])
+
+            # Cluster nearby values
+            # Each value is compared with the previous value in sorted order
+            # (not with the start of the cluster), so a run of closely spaced
+            # values can chain into one cluster wider than `threshold`.
+            clusters = []
+            current_cluster = [sorted_coords[0]]
+
+            for coord in sorted_coords[1:]:
+                if coord[0] - current_cluster[-1][0] <= threshold:
+                    current_cluster.append(coord)
+                else:
+                    clusters.append(current_cluster)
+                    current_cluster = [coord]
+            clusters.append(current_cluster)
+
+            # Create mapping: (box_idx, is_first) -> normalized value
+            mapping = {}
+            for cluster in clusters:
+                # Use average of cluster as normalized value
+                avg_value = sum(c[0] for c in cluster) / len(cluster)
+                for _, box_idx, is_first in cluster:
+                    mapping[(box_idx, is_first)] = avg_value
+
+            return mapping
+
+        x_mapping = cluster_and_normalize(x_coords, threshold)
+        y_mapping = cluster_and_normalize(y_coords, threshold)
+
+        # Create normalized cell boxes
+        # Every (box_idx, flag) key was inserted above, so the .get() defaults
+        # below only act as a safety net
+        normalized_boxes = []
+        for i, box in enumerate(cell_boxes):
+            x1_norm = x_mapping.get((i, True), box[0])
+            x2_norm = x_mapping.get((i, False), box[2])
+            y1_norm = y_mapping.get((i, True), box[1])
+            y2_norm = y_mapping.get((i, False), box[3])
+            normalized_boxes.append([x1_norm, y1_norm, x2_norm, y2_norm])
+
+        logger.debug(f"[TABLE] Normalized {len(cell_boxes)} cell boxes to grid")
+        return normalized_boxes
+
+    def _draw_table_with_cell_boxes(
+        self,
+        pdf_canvas: canvas.Canvas,
+        table_element: Dict,
+        page_height: float,
+        scale_w: float = 1.0,
+        scale_h: float = 1.0,
+        result_dir: Optional[Path] = None
+    ):
+        """
+        Draw table borders using cell_boxes for accurate positioning.
+
+        LAYERED RENDERING APPROACH:
+        - This method ONLY draws cell borders and embedded images
+        - Text is rendered separately using raw OCR positions (via GapFillingService)
+        - This decouples visual structure (borders) from content (text)
+
+        FALLBACK: If cell_boxes are incomplete, always draws the outer table
+        border using the table's bbox to ensure table boundaries are visible.
+
+        Args:
+            pdf_canvas: ReportLab canvas object
+            table_element: Table element dict with cell_boxes
+            page_height: Height of page in PDF coordinates
+            scale_w: Scale factor for X coordinates
+            scale_h: Scale factor for Y coordinates
+            result_dir: Directory containing result files (for embedded images)
+
+        Returns:
+            True when drawing completed (including the case where there are
+            no cell_boxes and only the outer border was attempted), False if
+            an exception was raised while drawing.
+        """
+        try:
+            cell_boxes = table_element.get('cell_boxes', [])
+
+            # Always draw outer table border first (fallback for incomplete cell_boxes)
+            table_bbox = table_element.get('bbox', [])
+            # For a dict bbox, len() counts its keys rather than coordinates
+            if table_bbox and len(table_bbox) >= 4:
+                # Handle different bbox formats (list or dict)
+                if isinstance(table_bbox, dict):
+                    tx1 = float(table_bbox.get('x0', 0))
+                    ty1 = float(table_bbox.get('y0', 0))
+                    tx2 = float(table_bbox.get('x1', 0))
+                    ty2 = float(table_bbox.get('y1', 0))
+                else:
+                    tx1, ty1, tx2, ty2 = table_bbox[:4]
+
+                # Apply scaling
+                tx1_scaled = tx1 * scale_w
+                ty1_scaled = ty1 * scale_h
+                tx2_scaled = tx2 * scale_w
+                ty2_scaled = ty2 * scale_h
+
+                table_width = tx2_scaled - tx1_scaled
+                table_height = ty2_scaled - ty1_scaled
+
+                # Transform Y coordinate (PDF uses bottom-left origin)
+                pdf_x = tx1_scaled
+                pdf_y = page_height - ty2_scaled  # Bottom of table in PDF coords
+
+                # Draw outer table border (slightly thicker for visibility)
+                pdf_canvas.setStrokeColor(colors.black)
+                pdf_canvas.setLineWidth(1.0)
+                pdf_canvas.rect(pdf_x, pdf_y, table_width, table_height, stroke=1, fill=0)
+                logger.info(f"[TABLE] Drew outer table border at [{int(tx1)},{int(ty1)},{int(tx2)},{int(ty2)}]")
+
+            # NOTE(review): the caller visible in this diff only invokes this
+            # method when cell_boxes is non-empty, so this branch appears to
+            # serve other/future callers - confirm
+            if not cell_boxes:
+                logger.warning("[TABLE] No cell_boxes available, only outer border drawn")
+                # Still draw embedded images even without cell borders
+                embedded_images = table_element.get('embedded_images', [])
+                if embedded_images and result_dir:
+                    for emb_img in embedded_images:
+                        self._draw_embedded_image(
+                            pdf_canvas, emb_img, page_height, result_dir, scale_w, scale_h
+                        )
+                return True  # Outer border drawn successfully
+
+            # Normalize cell boxes to create aligned grid
+            cell_boxes = self._normalize_cell_boxes_to_grid(cell_boxes)
+
+            logger.info(f"[TABLE] Drawing {len(cell_boxes)} cell borders (layered mode, grid-aligned)")
+
+            # Draw each cell border
+            for box in cell_boxes:
+                x1, y1, x2, y2 = box[0], box[1], box[2], box[3]
+
+                # Apply scaling
+                x1_scaled = x1 * scale_w
+                y1_scaled = y1 * scale_h
+                x2_scaled = x2 * scale_w
+                y2_scaled = y2 * scale_h
+
+                cell_width = x2_scaled - x1_scaled
+                cell_height = y2_scaled - y1_scaled
+
+                # Transform Y coordinate (PDF uses bottom-left origin)
+                pdf_x = x1_scaled
+                pdf_y = page_height - y2_scaled  # Bottom of cell in PDF coords
+
+                # Draw cell border only (no fill, no text)
+                pdf_canvas.setStrokeColor(colors.black)
+                pdf_canvas.setLineWidth(0.5)
+                pdf_canvas.rect(pdf_x, pdf_y, cell_width, cell_height, stroke=1, fill=0)
+
+            logger.info(f"[TABLE] Drew {len(cell_boxes)} cell borders")
+
+            # Draw embedded images
+            embedded_images = table_element.get('embedded_images', [])
+            if embedded_images and result_dir:
+                logger.info(f"[TABLE] Drawing {len(embedded_images)} embedded images")
+                for emb_img in embedded_images:
+                    self._draw_embedded_image(
+                        pdf_canvas, emb_img, page_height, result_dir, scale_w, scale_h
+                    )
+
+            return True
+
+        except Exception as e:
+            # Returning False lets the caller fall back to another renderer
+            logger.warning(f"[TABLE] Failed to draw cell borders: {e}")
+            import traceback
+            traceback.print_exc()
+            return False
+
    def draw_image_region(
        self,
        pdf_canvas: canvas.Canvas,
@@ -2923,12 +3249,29 @@ class PDFGeneratorService:
            from reportlab.platypus import Table, TableStyle
            from reportlab.lib import colors

+            # Determine number of rows and columns for cell_boxes calculation
+            num_rows = len(rows)
+            max_cols = max(len(row['cells']) for row in rows) if rows else 0
+
            # Use original column widths from extraction if available
-            # Otherwise let ReportLab auto-calculate
+            # Otherwise try to compute from cell_boxes (from PP-StructureV3)
            col_widths = None
            if element.metadata and 'column_widths' in element.metadata:
                col_widths = element.metadata['column_widths']
                logger.debug(f"Using extracted column widths: {col_widths}")
+            elif element.metadata and 'cell_boxes' in element.metadata:
+                # Use cell_boxes from PP-StructureV3 for accurate column/row sizing
+                cell_boxes = element.metadata['cell_boxes']
+                cell_boxes_source = element.metadata.get('cell_boxes_source', 'unknown')
+                table_bbox_list = [bbox.x0, bbox.y0, bbox.x1, bbox.y1]
+                logger.info(f"[TABLE] Using {len(cell_boxes)} cell boxes from {cell_boxes_source}")
+
+                computed_col_widths, computed_row_heights = self._compute_table_grid_from_cell_boxes(
+                    cell_boxes, table_bbox_list, num_rows, max_cols
+                )
+                if computed_col_widths:
+                    col_widths = computed_col_widths
+                    logger.info(f"[TABLE] Computed {len(col_widths)} column widths from cell_boxes")

            # NOTE: Don't use rowHeights from extraction - it causes content overlap
            # The extracted row heights are based on cell boundaries, not text content height.