# OCR/backend/app/services/pdf_table_renderer.py

"""
PDF Table Renderer - Handles table rendering for PDF generation.

This module provides unified table rendering capabilities extracted from
PDFGeneratorService, supporting multiple input formats:
- HTML tables
- Cell boxes (layered approach)
- Cells dictionary (Direct track)
- TableData objects
"""

import logging
from dataclasses import dataclass, field
from html.parser import HTMLParser
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, Union

from reportlab.lib import colors
from reportlab.lib.enums import TA_CENTER, TA_LEFT, TA_RIGHT
from reportlab.lib.styles import ParagraphStyle
from reportlab.lib.utils import ImageReader
from reportlab.platypus import Paragraph, Table, TableStyle

logger = logging.getLogger(__name__)


# ============================================================================
# Configuration
# ============================================================================

@dataclass
class TableRenderConfig:
    """
    Configuration for table rendering.

    A plain settings container that TableRenderer reads through
    ``self.config``. Every field has a default, so ``TableRenderConfig()``
    is a complete configuration on its own.
    """
    # Font applied to cell text. font_size is the fixed size used for the
    # Paragraph-based tables; min/max bound the search performed by
    # TableRenderer._fit_text_to_cell (largest size is tried first).
    font_name: str = "Helvetica"
    font_size: int = 8
    min_font_size: int = 6
    max_font_size: int = 10

    # Padding options
    # Passed as LEFT/RIGHT/TOP/BOTTOMPADDING in TableRenderer._render_parsed_table.
    # NOTE(review): _render_with_dimensions uses its own hard-coded padding
    # and does not read these values.
    left_padding: int = 2
    right_padding: int = 2
    top_padding: int = 2
    bottom_padding: int = 2

    # Border options
    # Used both for the GRID table style and for canvas stroke color/width
    # when cell rectangles are drawn directly.
    border_color: Any = colors.black
    border_width: float = 0.5

    # Alignment
    # Forwarded as the ALIGN / VALIGN table style values.
    horizontal_align: str = "CENTER"
    vertical_align: str = "MIDDLE"

    # Header styling
    # NOTE(review): no method visible in this module reads this value —
    # confirm whether header shading is applied elsewhere.
    header_background: Any = colors.lightgrey

    # Grid normalization threshold
    # Default clustering distance for TableRenderer.normalize_cell_boxes_to_grid.
    grid_threshold: float = 10.0

    # Merged cells threshold
    # Distance below which neighbouring boundaries are merged in
    # TableRenderer.compute_grid_from_cell_boxes.
    merge_boundary_threshold: float = 5.0


# ============================================================================
# HTML Table Parser
# ============================================================================

class HTMLTableParser(HTMLParser):
    """
    Parse HTML table structure for rendering.

    Extracts table rows, cells, and merged cell information (colspan/rowspan)
    from HTML table markup.

    After ``feed()``, ``self.tables`` holds one dict per closed ``<table>``:
    ``{'rows': [{'cells': [{'text', 'is_header', 'colspan', 'rowspan'}]}]}``.
    Only cells closed with ``</td>``/``</th>`` inside a closed ``<tr>`` of a
    closed ``<table>`` are recorded. Nested tables are not tracked separately:
    an inner ``<table>`` replaces the table currently being collected.
    """

    def __init__(self):
        super().__init__()
        self.tables: List[Dict[str, Any]] = []
        self.current_table: Optional[Dict[str, Any]] = None
        self.current_row: Optional[Dict[str, Any]] = None
        self.current_cell: Optional[Dict[str, Any]] = None
        self.in_cell = False

    @staticmethod
    def _parse_span(raw: Optional[str]) -> int:
        """Return a colspan/rowspan value as an int >= 1.

        Missing, valueless (``<td colspan>``), non-numeric, zero, or negative
        values all fall back to 1 so that one malformed attribute cannot abort
        parsing of the whole table.
        """
        try:
            span = int(raw)
        except (TypeError, ValueError):
            return 1
        return span if span >= 1 else 1

    def handle_starttag(self, tag: str, attrs: List[Tuple[str, Optional[str]]]):
        """Open a table, row, or cell; other tags are ignored."""
        if tag == 'table':
            self.current_table = {'rows': []}
        elif tag == 'tr':
            self.current_row = {'cells': []}
        elif tag in ('td', 'th'):
            # Extract colspan and rowspan attributes (values may be None for
            # attributes written without a value).
            attrs_dict = dict(attrs)
            self.current_cell = {
                'text': '',
                'is_header': tag == 'th',
                'colspan': self._parse_span(attrs_dict.get('colspan')),
                'rowspan': self._parse_span(attrs_dict.get('rowspan'))
            }
            self.in_cell = True

    def handle_endtag(self, tag: str):
        """Close the innermost open table, row, or cell and store it in its parent."""
        if tag == 'table' and self.current_table is not None:
            self.tables.append(self.current_table)
            self.current_table = None
        elif tag == 'tr' and self.current_row is not None:
            if self.current_table is not None:
                self.current_table['rows'].append(self.current_row)
            self.current_row = None
        elif tag in ('td', 'th') and self.current_cell is not None:
            if self.current_row is not None:
                self.current_row['cells'].append(self.current_cell)
            self.current_cell = None
            self.in_cell = False

    def handle_data(self, data: str):
        """Append text to the open cell; text outside cells is discarded."""
        if self.in_cell and self.current_cell is not None:
            self.current_cell['text'] += data


# ============================================================================
# Table Renderer
# ============================================================================

class TableRenderer:
    """
    Unified table rendering engine for PDF generation.

    Supports multiple input formats and rendering modes:
    - HTML table parsing and rendering
    - Cell boxes rendering (layered approach)
    - Direct track cells dictionary
    - Translated content with dynamic font sizing
    """

    def __init__(self, config: Optional[TableRenderConfig] = None):
        """
        Initialize TableRenderer with configuration.

        Args:
            config: TableRenderConfig instance (uses defaults if None)
        """
        self.config = config or TableRenderConfig()

    def render_from_html(
        self,
        pdf_canvas,
        html_content: str,
        table_bbox: Tuple[float, float, float, float],
        page_height: float,
        scale_w: float = 1.0,
        scale_h: float = 1.0
    ) -> bool:
        """
        Parse HTML and render table to PDF canvas.

        Args:
            pdf_canvas: ReportLab canvas
            html_content: HTML table string
            table_bbox: (x0, y0, x1, y1) bounding box
            page_height: PDF page height for Y coordinate flip
            scale_w: Horizontal scale factor
            scale_h: Vertical scale factor

        Returns:
            True if successful, False otherwise
        """
        try:
            # Parse HTML
            parser = HTMLTableParser()
            parser.feed(html_content)

            if not parser.tables:
                logger.warning("No tables found in HTML content")
                return False

            table_data = parser.tables[0]
            return self._render_parsed_table(
                pdf_canvas, table_data, table_bbox, page_height, scale_w, scale_h
            )

        except Exception as e:
            logger.error(f"HTML table rendering failed: {e}")
            import traceback
            traceback.print_exc()
            return False

    def render_from_cells_dict(
        self,
        pdf_canvas,
        cells_dict: Dict,
        table_bbox: Tuple[float, float, float, float],
        page_height: float,
        cell_boxes: Optional[List] = None
    ) -> bool:
        """
        Render table from Direct track cell structure.

        Args:
            pdf_canvas: ReportLab canvas
            cells_dict: Dict with 'rows', 'cols', 'cells' keys
            table_bbox: (x0, y0, x1, y1) bounding box
            page_height: PDF page height
            cell_boxes: Optional precomputed cell boxes

        Returns:
            True if successful, False otherwise
        """
        try:
            # Convert cells dict to row format
            rows = self._build_rows_from_cells_dict(cells_dict)

            if not rows:
                logger.warning("No rows built from cells dict")
                return False

            # Build table data structure
            table_data = {'rows': rows}

            # Calculate dimensions
            x0, y0, x1, y1 = table_bbox
            table_width = (x1 - x0)
            table_height = (y1 - y0)

            # Determine grid dimensions
            num_rows = cells_dict.get('rows', len(rows))
            num_cols = cells_dict.get('cols',
                max(len(row['cells']) for row in rows) if rows else 1
            )

            # Calculate column widths and row heights
            if cell_boxes:
                col_widths, row_heights = self.compute_grid_from_cell_boxes(
                    cell_boxes, table_bbox, num_rows, num_cols
                )
            else:
                col_widths = [table_width / num_cols] * num_cols
                row_heights = [table_height / num_rows] * num_rows

            return self._render_with_dimensions(
                pdf_canvas, table_data, table_bbox, page_height,
                col_widths, row_heights
            )

        except Exception as e:
            logger.error(f"Cells dict rendering failed: {e}")
            import traceback
            traceback.print_exc()
            return False

    def render_cell_borders(
        self,
        pdf_canvas,
        cell_boxes: List[List[float]],
        table_bbox: Tuple[float, float, float, float],
        page_height: float,
        embedded_images: Optional[List] = None,
        output_dir: Optional[Path] = None
    ) -> bool:
        """
        Render table cell borders only (layered approach).

        This renders only the cell borders, not the text content.
        Text is typically rendered separately by GapFillingService.

        Args:
            pdf_canvas: ReportLab canvas
            cell_boxes: List of [x0, y0, x1, y1] for each cell
            table_bbox: Table bounding box
            page_height: PDF page height
            embedded_images: Optional list of images within cells
            output_dir: Directory for image files

        Returns:
            True if successful, False otherwise
        """
        try:
            if not cell_boxes:
                # Draw outer border only
                return self._draw_table_border(
                    pdf_canvas, table_bbox, page_height
                )

            # Normalize cell boxes to grid
            normalized_boxes = self.normalize_cell_boxes_to_grid(cell_boxes)

            # Draw each cell border
            pdf_canvas.saveState()
            pdf_canvas.setStrokeColor(self.config.border_color)
            pdf_canvas.setLineWidth(self.config.border_width)

            for box in normalized_boxes:
                if box is None:
                    continue

                x0, y0, x1, y1 = box
                # Convert to PDF coordinates (flip Y)
                pdf_x0 = x0
                pdf_y0 = page_height - y1
                pdf_x1 = x1
                pdf_y1 = page_height - y0

                # Draw cell rectangle
                pdf_canvas.rect(pdf_x0, pdf_y0, pdf_x1 - pdf_x0, pdf_y1 - pdf_y0)

            pdf_canvas.restoreState()

            # Draw embedded images if any
            if embedded_images and output_dir:
                for img_info in embedded_images:
                    self._draw_embedded_image(
                        pdf_canvas, img_info, page_height, output_dir
                    )

            return True

        except Exception as e:
            logger.error(f"Cell borders rendering failed: {e}")
            import traceback
            traceback.print_exc()
            return False

    def render_with_translated_text(
        self,
        pdf_canvas,
        cells: List[Dict],
        cell_boxes: List,
        table_bbox: Tuple[float, float, float, float],
        page_height: float
    ) -> bool:
        """
        Render table with translated content and dynamic font sizing.

        Args:
            pdf_canvas: ReportLab canvas
            cells: List of cell dicts with 'translated_content'
            cell_boxes: List of cell bounding boxes
            table_bbox: Table bounding box
            page_height: PDF page height

        Returns:
            True if successful, False otherwise
        """
        try:
            # Draw outer border
            self._draw_table_border(pdf_canvas, table_bbox, page_height)

            # Normalize cell boxes
            if cell_boxes:
                normalized_boxes = self.normalize_cell_boxes_to_grid(cell_boxes)
            else:
                logger.warning("No cell boxes for translated table")
                return False

            pdf_canvas.saveState()
            pdf_canvas.setStrokeColor(self.config.border_color)
            pdf_canvas.setLineWidth(self.config.border_width)

            # Draw cell borders
            for box in normalized_boxes:
                if box is None:
                    continue
                x0, y0, x1, y1 = box
                pdf_y0 = page_height - y1
                pdf_canvas.rect(x0, pdf_y0, x1 - x0, y1 - y0)

            pdf_canvas.restoreState()

            # Render text in cells with dynamic font sizing
            for i, cell in enumerate(cells):
                if i >= len(normalized_boxes):
                    break

                box = normalized_boxes[i]
                if box is None:
                    continue

                translated_text = cell.get('translated_content', '')
                if not translated_text:
                    continue

                x0, y0, x1, y1 = box
                cell_width = x1 - x0
                cell_height = y1 - y0

                # Find appropriate font size
                font_size = self._fit_text_to_cell(
                    pdf_canvas, translated_text, cell_width, cell_height
                )

                # Render centered text
                pdf_canvas.setFont(self.config.font_name, font_size)

                # Calculate text position (centered)
                text_width = pdf_canvas.stringWidth(translated_text, self.config.font_name, font_size)
                text_x = x0 + (cell_width - text_width) / 2
                text_y = page_height - y0 - cell_height / 2 - font_size / 3

                pdf_canvas.drawString(text_x, text_y, translated_text)

            return True

        except Exception as e:
            logger.error(f"Translated table rendering failed: {e}")
            import traceback
            traceback.print_exc()
            return False

    # =========================================================================
    # Grid and Cell Box Helpers
    # =========================================================================

    def compute_grid_from_cell_boxes(
        self,
        cell_boxes: List,
        table_bbox: Tuple[float, float, float, float],
        num_rows: int,
        num_cols: int
    ) -> Tuple[Optional[List[float]], Optional[List[float]]]:
        """
        Calculate column widths and row heights from cell bounding boxes.

        Args:
            cell_boxes: List of [x0, y0, x1, y1] for each cell
            table_bbox: Table bounding box
            num_rows: Expected number of rows
            num_cols: Expected number of columns

        Returns:
            Tuple of (col_widths, row_heights) or (None, None) on failure
        """
        try:
            if not cell_boxes:
                return None, None

            # Filter valid boxes
            valid_boxes = [b for b in cell_boxes if b is not None and len(b) >= 4]
            if not valid_boxes:
                return None, None

            # Extract unique X and Y boundaries
            x_boundaries = set()
            y_boundaries = set()

            for box in valid_boxes:
                x0, y0, x1, y1 = box[:4]
                x_boundaries.add(round(x0, 1))
                x_boundaries.add(round(x1, 1))
                y_boundaries.add(round(y0, 1))
                y_boundaries.add(round(y1, 1))

            # Sort boundaries
            x_sorted = sorted(x_boundaries)
            y_sorted = sorted(y_boundaries)

            # Merge nearby boundaries
            x_merged = self._merge_boundaries(x_sorted, self.config.merge_boundary_threshold)
            y_merged = self._merge_boundaries(y_sorted, self.config.merge_boundary_threshold)

            # Calculate widths and heights
            col_widths = []
            for i in range(len(x_merged) - 1):
                col_widths.append(x_merged[i + 1] - x_merged[i])

            row_heights = []
            for i in range(len(y_merged) - 1):
                row_heights.append(y_merged[i + 1] - y_merged[i])

            # Validate against expected dimensions (allow for merged cells)
            tolerance = max(num_cols, num_rows) // 2 + 1
            if abs(len(col_widths) - num_cols) > tolerance:
                logger.debug(f"Column count mismatch: {len(col_widths)} vs {num_cols}")
            if abs(len(row_heights) - num_rows) > tolerance:
                logger.debug(f"Row count mismatch: {len(row_heights)} vs {num_rows}")

            return col_widths if col_widths else None, row_heights if row_heights else None

        except Exception as e:
            logger.error(f"Grid computation failed: {e}")
            return None, None

    def normalize_cell_boxes_to_grid(
        self,
        cell_boxes: List,
        threshold: Optional[float] = None
    ) -> List:
        """
        Snap cell boxes to aligned grid to eliminate coordinate variations.

        Args:
            cell_boxes: List of [x0, y0, x1, y1] for each cell
            threshold: Clustering threshold (uses config default if None)

        Returns:
            Normalized cell boxes
        """
        threshold = threshold or self.config.grid_threshold

        if not cell_boxes:
            return []

        try:
            # Collect all coordinates
            all_x = []
            all_y = []

            for box in cell_boxes:
                if box is None or len(box) < 4:
                    continue
                x0, y0, x1, y1 = box[:4]
                all_x.extend([x0, x1])
                all_y.extend([y0, y1])

            if not all_x or not all_y:
                return cell_boxes

            # Cluster and normalize X coordinates
            x_clusters = self._cluster_values(sorted(all_x), threshold)
            y_clusters = self._cluster_values(sorted(all_y), threshold)

            # Build mapping
            x_map = {v: avg for avg, values in x_clusters for v in values}
            y_map = {v: avg for avg, values in y_clusters for v in values}

            # Normalize boxes
            normalized = []
            for box in cell_boxes:
                if box is None or len(box) < 4:
                    normalized.append(box)
                    continue

                x0, y0, x1, y1 = box[:4]
                normalized.append([
                    x_map.get(x0, x0),
                    y_map.get(y0, y0),
                    x_map.get(x1, x1),
                    y_map.get(y1, y1)
                ])

            return normalized

        except Exception as e:
            logger.error(f"Cell box normalization failed: {e}")
            return cell_boxes

    # =========================================================================
    # Private Helper Methods
    # =========================================================================

    def _render_parsed_table(
        self,
        pdf_canvas,
        table_data: Dict,
        table_bbox: Tuple[float, float, float, float],
        page_height: float,
        scale_w: float = 1.0,
        scale_h: float = 1.0
    ) -> bool:
        """Render a parsed table structure."""
        rows = table_data.get('rows', [])
        if not rows:
            return False

        # Build grid content
        num_rows = len(rows)
        num_cols = max(len(row.get('cells', [])) for row in rows)

        # Track occupied cells for rowspan handling
        occupied = [[False] * num_cols for _ in range(num_rows)]

        grid = []
        span_commands = []

        for row_idx, row in enumerate(rows):
            grid_row = [''] * num_cols
            col_idx = 0

            for cell in row.get('cells', []):
                # Skip occupied cells
                while col_idx < num_cols and occupied[row_idx][col_idx]:
                    col_idx += 1

                if col_idx >= num_cols:
                    break

                text = cell.get('text', '').strip()
                colspan = cell.get('colspan', 1)
                rowspan = cell.get('rowspan', 1)

                # Place cell content
                grid_row[col_idx] = text

                # Mark occupied cells and build SPAN command
                if colspan > 1 or rowspan > 1:
                    end_col = min(col_idx + colspan - 1, num_cols - 1)
                    end_row = min(row_idx + rowspan - 1, num_rows - 1)
                    span_commands.append(
                        ('SPAN', (col_idx, row_idx), (end_col, end_row))
                    )

                    for r in range(row_idx, end_row + 1):
                        for c in range(col_idx, end_col + 1):
                            if r < num_rows and c < num_cols:
                                occupied[r][c] = True
                else:
                    occupied[row_idx][col_idx] = True

                col_idx += colspan

            grid.append(grid_row)

        # Calculate dimensions
        x0, y0, x1, y1 = table_bbox
        table_width = (x1 - x0) * scale_w
        table_height = (y1 - y0) * scale_h

        col_widths = [table_width / num_cols] * num_cols
        row_heights = [table_height / num_rows] * num_rows

        # Create paragraph style
        style = ParagraphStyle(
            'TableCell',
            fontName=self.config.font_name,
            fontSize=self.config.font_size,
            alignment=TA_CENTER,
            leading=self.config.font_size * 1.2
        )

        # Convert to Paragraph objects
        para_grid = []
        for row in grid:
            para_row = []
            for cell in row:
                if cell:
                    para_row.append(Paragraph(cell, style))
                else:
                    para_row.append('')
            para_grid.append(para_row)

        # Build TableStyle
        table_style_commands = [
            ('GRID', (0, 0), (-1, -1), self.config.border_width, self.config.border_color),
            ('VALIGN', (0, 0), (-1, -1), self.config.vertical_align),
            ('ALIGN', (0, 0), (-1, -1), self.config.horizontal_align),
            ('LEFTPADDING', (0, 0), (-1, -1), self.config.left_padding),
            ('RIGHTPADDING', (0, 0), (-1, -1), self.config.right_padding),
            ('TOPPADDING', (0, 0), (-1, -1), self.config.top_padding),
            ('BOTTOMPADDING', (0, 0), (-1, -1), self.config.bottom_padding),
            ('FONTNAME', (0, 0), (-1, -1), self.config.font_name),
            ('FONTSIZE', (0, 0), (-1, -1), self.config.font_size),
        ]
        table_style_commands.extend(span_commands)

        # Create and draw table
        table = Table(para_grid, colWidths=col_widths, rowHeights=row_heights)
        table.setStyle(TableStyle(table_style_commands))

        # Position and draw
        pdf_x = x0
        pdf_y = page_height - y1  # Flip Y

        table.wrapOn(pdf_canvas, table_width, table_height)
        table.drawOn(pdf_canvas, pdf_x, pdf_y)

        return True

    def _render_with_dimensions(
        self,
        pdf_canvas,
        table_data: Dict,
        table_bbox: Tuple[float, float, float, float],
        page_height: float,
        col_widths: List[float],
        row_heights: List[float]
    ) -> bool:
        """Render table with specified dimensions."""
        rows = table_data.get('rows', [])
        if not rows:
            return False

        num_rows = len(rows)
        num_cols = max(len(row.get('cells', [])) for row in rows)

        # Adjust widths/heights if needed
        if len(col_widths) != num_cols:
            x0, y0, x1, y1 = table_bbox
            col_widths = [(x1 - x0) / num_cols] * num_cols
        if len(row_heights) != num_rows:
            x0, y0, x1, y1 = table_bbox
            row_heights = [(y1 - y0) / num_rows] * num_rows

        # Build grid with proper positioning
        grid = []
        span_commands = []
        occupied = [[False] * num_cols for _ in range(num_rows)]

        for row_idx, row in enumerate(rows):
            grid_row = [''] * num_cols

            for cell in row.get('cells', []):
                # Get column position
                col_idx = cell.get('col', 0)

                # Skip if out of bounds or occupied
                while col_idx < num_cols and occupied[row_idx][col_idx]:
                    col_idx += 1
                if col_idx >= num_cols:
                    continue

                text = cell.get('text', '').strip()
                colspan = cell.get('colspan', 1)
                rowspan = cell.get('rowspan', 1)

                grid_row[col_idx] = text

                if colspan > 1 or rowspan > 1:
                    end_col = min(col_idx + colspan - 1, num_cols - 1)
                    end_row = min(row_idx + rowspan - 1, num_rows - 1)
                    span_commands.append(
                        ('SPAN', (col_idx, row_idx), (end_col, end_row))
                    )
                    for r in range(row_idx, end_row + 1):
                        for c in range(col_idx, end_col + 1):
                            if r < num_rows and c < num_cols:
                                occupied[r][c] = True
                else:
                    occupied[row_idx][col_idx] = True

            grid.append(grid_row)

        # Create style and table
        style = ParagraphStyle(
            'TableCell',
            fontName=self.config.font_name,
            fontSize=self.config.font_size,
            alignment=TA_CENTER
        )

        para_grid = []
        for row in grid:
            para_row = [Paragraph(cell, style) if cell else '' for cell in row]
            para_grid.append(para_row)

        table_style_commands = [
            ('GRID', (0, 0), (-1, -1), self.config.border_width, self.config.border_color),
            ('VALIGN', (0, 0), (-1, -1), self.config.vertical_align),
            ('LEFTPADDING', (0, 0), (-1, -1), 0),
            ('RIGHTPADDING', (0, 0), (-1, -1), 0),
            ('TOPPADDING', (0, 0), (-1, -1), 0),
            ('BOTTOMPADDING', (0, 0), (-1, -1), 1),
        ]
        table_style_commands.extend(span_commands)

        table = Table(para_grid, colWidths=col_widths, rowHeights=row_heights)
        table.setStyle(TableStyle(table_style_commands))

        x0, y0, x1, y1 = table_bbox
        pdf_x = x0
        pdf_y = page_height - y1

        table.wrapOn(pdf_canvas, x1 - x0, y1 - y0)
        table.drawOn(pdf_canvas, pdf_x, pdf_y)

        return True

    def _build_rows_from_cells_dict(self, cells_dict: Dict) -> List[Dict]:
        """Convert Direct track cell structure to row format."""
        cells = cells_dict.get('cells', [])
        if not cells:
            return []

        num_rows = cells_dict.get('rows', 0)
        num_cols = cells_dict.get('cols', 0)

        # Group cells by row
        rows_data = {}
        for cell in cells:
            row_idx = cell.get('row', 0)
            if row_idx not in rows_data:
                rows_data[row_idx] = []
            rows_data[row_idx].append(cell)

        # Build row list
        rows = []
        for row_idx in range(num_rows):
            row_cells = rows_data.get(row_idx, [])

            # Sort by column
            row_cells.sort(key=lambda c: c.get('col', 0))

            formatted_cells = []
            for cell in row_cells:
                content = cell.get('content', '')
                if isinstance(content, list):
                    content = '\n'.join(str(c) for c in content)

                formatted_cells.append({
                    'text': str(content) if content else '',
                    'colspan': cell.get('col_span', 1),
                    'rowspan': cell.get('row_span', 1),
                    'col': cell.get('col', 0),
                    'is_header': cell.get('is_header', False)
                })

            rows.append({'cells': formatted_cells})

        return rows

    def _draw_table_border(
        self,
        pdf_canvas,
        table_bbox: Tuple[float, float, float, float],
        page_height: float
    ) -> bool:
        """Draw outer table border."""
        try:
            x0, y0, x1, y1 = table_bbox
            pdf_y0 = page_height - y1
            pdf_y1 = page_height - y0

            pdf_canvas.saveState()
            pdf_canvas.setStrokeColor(self.config.border_color)
            pdf_canvas.setLineWidth(self.config.border_width)
            pdf_canvas.rect(x0, pdf_y0, x1 - x0, pdf_y1 - pdf_y0)
            pdf_canvas.restoreState()

            return True
        except Exception as e:
            logger.error(f"Failed to draw table border: {e}")
            return False

    def _draw_embedded_image(
        self,
        pdf_canvas,
        img_info: Dict,
        page_height: float,
        output_dir: Path
    ) -> bool:
        """Draw an image embedded within a table cell."""
        try:
            img_path = img_info.get('path')
            if not img_path:
                return False

            # Resolve path
            if not Path(img_path).is_absolute():
                img_path = output_dir / img_path

            if not Path(img_path).exists():
                logger.warning(f"Embedded image not found: {img_path}")
                return False

            bbox = img_info.get('bbox', {})
            x0 = bbox.get('x0', 0)
            y0 = bbox.get('y0', 0)
            width = bbox.get('width', 100)
            height = bbox.get('height', 100)

            # Flip Y coordinate
            pdf_y = page_height - y0 - height

            # Draw image
            img = ImageReader(str(img_path))
            pdf_canvas.drawImage(img, x0, pdf_y, width, height)

            return True

        except Exception as e:
            logger.error(f"Failed to draw embedded image: {e}")
            return False

    def _fit_text_to_cell(
        self,
        pdf_canvas,
        text: str,
        cell_width: float,
        cell_height: float
    ) -> int:
        """Find font size that fits text in cell."""
        for size in range(self.config.max_font_size, self.config.min_font_size - 1, -1):
            text_width = pdf_canvas.stringWidth(text, self.config.font_name, size)
            if text_width <= cell_width - 6:  # 3pt padding each side
                return size
        return self.config.min_font_size

    def _merge_boundaries(self, values: List[float], threshold: float) -> List[float]:
        """Merge nearby boundary values."""
        if not values:
            return []

        merged = [values[0]]
        for v in values[1:]:
            if abs(v - merged[-1]) > threshold:
                merged.append(v)

        return merged

    def _cluster_values(self, values: List[float], threshold: float) -> List[Tuple[float, List[float]]]:
        """Cluster nearby values and return (average, members) pairs."""
        if not values:
            return []

        clusters = []
        current_cluster = [values[0]]

        for v in values[1:]:
            if abs(v - current_cluster[-1]) <= threshold:
                current_cluster.append(v)
            else:
                avg = sum(current_cluster) / len(current_cluster)
                clusters.append((avg, current_cluster))
                current_cluster = [v]

        if current_cluster:
            avg = sum(current_cluster) / len(current_cluster)
            clusters.append((avg, current_cluster))

        return clusters