# OCR/backend/app/services/pdf_table_renderer.py

"""
PDF Table Renderer - Handles table rendering for PDF generation.

This module provides unified table rendering capabilities extracted from
PDFGeneratorService, supporting multiple input formats:
- HTML tables
- Cell boxes (layered approach)
- Cells dictionary (Direct track)
- TableData objects
"""

import logging
from dataclasses import dataclass, field
from html.parser import HTMLParser
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, Union

from reportlab.lib import colors
from reportlab.lib.enums import TA_CENTER, TA_LEFT, TA_RIGHT
from reportlab.lib.styles import ParagraphStyle
from reportlab.lib.utils import ImageReader
from reportlab.platypus import Paragraph, Table, TableStyle

logger = logging.getLogger(__name__)


# ============================================================================
# Cell Box Grid Inferrer
# ============================================================================

class CellBoxGridInferrer:
    """
    Derive a table grid from raw cell bounding boxes.

    Box edges are clustered along the Y axis (row boundaries) and the X axis
    (column boundaries), and every box is assigned to the grid slot that
    contains its center point. HTML colspan/rowspan information is not used.
    """

    def __init__(
        self,
        row_threshold: float = 15.0,
        col_threshold: float = 15.0
    ):
        """
        Remember the clustering thresholds.

        Args:
            row_threshold: Largest gap between consecutive Y values that are
                still merged into one row boundary
            col_threshold: Largest gap between consecutive X values that are
                still merged into one column boundary
        """
        self.row_threshold = row_threshold
        self.col_threshold = col_threshold

    def infer_grid(
        self,
        cell_boxes: List[List[float]]
    ) -> Optional[Dict]:
        """
        Build the grid description for a set of cell boxes.

        Args:
            cell_boxes: List of [x0, y0, x1, y1] coordinates; entries that are
                None or have fewer than four values are ignored

        Returns:
            Dict with 'grid', 'num_rows', 'num_cols', 'row_boundaries',
            'col_boundaries', 'row_heights' and 'col_widths', or None when
            there is nothing usable, fewer than two boundaries are found on
            either axis, or an error occurs
        """
        if not cell_boxes:
            return None

        try:
            usable = [
                box for box in cell_boxes
                if box is not None and len(box) >= 4
            ]
            if not usable:
                return None

            # Edge coordinates of every usable box
            tops = [box[1] for box in usable]
            bottoms = [box[3] for box in usable]
            lefts = [box[0] for box in usable]
            rights = [box[2] for box in usable]

            # Distinct edges, clustered into row / column boundaries
            row_edges = self._cluster_to_boundaries(
                sorted(set(tops + bottoms)), self.row_threshold
            )
            col_edges = self._cluster_to_boundaries(
                sorted(set(lefts + rights)), self.col_threshold
            )

            # At least two boundaries per axis are needed to form one cell
            if len(row_edges) < 2 or len(col_edges) < 2:
                return None

            num_rows = len(row_edges) - 1
            num_cols = len(col_edges) - 1

            # Place each box by its center; a later box landing on the same
            # (row, col) replaces the earlier one
            grid = {}
            for position, box in enumerate(usable):
                left, top, right, bottom = box[:4]
                row = self._find_position((top + bottom) / 2, row_edges)
                col = self._find_position((left + right) / 2, col_edges)
                if row is None or col is None:
                    continue
                grid[(row, col)] = {
                    'bbox': box,
                    'index': position,
                    'content': ''
                }

            # Distances between consecutive boundaries
            row_heights = [
                lower - upper
                for upper, lower in zip(row_edges, row_edges[1:])
            ]
            col_widths = [
                far - near
                for near, far in zip(col_edges, col_edges[1:])
            ]

            return {
                'grid': grid,
                'num_rows': num_rows,
                'num_cols': num_cols,
                'row_boundaries': row_edges,
                'col_boundaries': col_edges,
                'row_heights': row_heights,
                'col_widths': col_widths
            }

        except Exception as e:
            logger.error(f"Grid inference failed: {e}")
            return None

    def _cluster_to_boundaries(
        self,
        values: List[float],
        threshold: float
    ) -> List[float]:
        """
        Group neighbouring values and return one boundary per group.

        A value joins the current group when it lies within ``threshold`` of
        the group's most recent member; otherwise it starts a new group.

        Args:
            values: Sorted list of coordinate values
            threshold: Clustering threshold

        Returns:
            List with the mean of every group, in input order
        """
        if not values:
            return []

        clusters = [[values[0]]]
        for value in values[1:]:
            if value - clusters[-1][-1] <= threshold:
                clusters[-1].append(value)
            else:
                clusters.append([value])

        return [sum(members) / len(members) for members in clusters]

    def _find_position(
        self,
        value: float,
        boundaries: List[float]
    ) -> Optional[int]:
        """
        Locate the interval of ``boundaries`` that a value belongs to.

        Args:
            value: Coordinate value
            boundaries: List of boundary values

        Returns:
            Index of the first interval containing the value (inclusive on
            both ends); failing that, the first interval whose midpoint is
            closer to the value than the interval's own length; otherwise None
        """
        spans = list(zip(boundaries, boundaries[1:]))

        # Exact containment first
        for index, (low, high) in enumerate(spans):
            if low <= value <= high:
                return index

        # Tolerant fallback for values just outside the boundaries
        for index, (low, high) in enumerate(spans):
            if abs(value - (low + high) / 2) < (high - low):
                return index

        return None


def extract_cell_contents_from_html(html: str) -> List[str]:
    """
    Collect the stripped text of every cell of the first table in ``html``.

    Cells are returned in document order (row by row, cell by cell).

    Args:
        html: HTML table string

    Returns:
        List of text strings, one per cell; empty when no table is found or
        parsing fails
    """
    try:
        table_parser = HTMLTableParser()
        table_parser.feed(html)

        if not table_parser.tables:
            return []

        # Only the first parsed table contributes content
        first_table = table_parser.tables[0]
        return [
            cell.get('text', '').strip()
            for row in first_table.get('rows', [])
            for cell in row.get('cells', [])
        ]

    except Exception as e:
        logger.error(f"HTML content extraction failed: {e}")
        return []


def map_content_to_grid(
    grid: Dict[Tuple[int, int], Dict],
    contents: List[str],
    num_rows: int,
    num_cols: int
) -> Dict[Tuple[int, int], Dict]:
    """
    Assign HTML cell texts to the populated grid slots in row-major order.

    The grid is updated in place: the n-th populated slot (scanning rows top
    to bottom, columns left to right) receives the n-th text. Slots left over
    once the texts run out get an empty string.

    Args:
        grid: Dict mapping (row, col) to cell info
        contents: List of text contents from HTML
        num_rows: Number of rows in grid
        num_cols: Number of columns in grid

    Returns:
        The same grid object, with 'content' set on every visited slot
    """
    total = len(contents)
    mapped = 0

    for r in range(num_rows):
        for c in range(num_cols):
            key = (r, c)
            if key not in grid:
                continue

            if mapped < total:
                grid[key]['content'] = contents[mapped]
                mapped += 1
            else:
                grid[key]['content'] = ''

    # Texts that found no slot are only reported, not raised
    if mapped < total:
        logger.debug(
            f"Content mismatch: {len(contents)} HTML cells, "
            f"only {mapped} mapped to {len(grid)} grid cells"
        )

    return grid


# ============================================================================
# Configuration
# ============================================================================

@dataclass
class TableRenderConfig:
    """
    Configuration for table rendering.

    TableRenderer stores an instance as ``self.config`` and reads its fields
    while drawing borders and text and while aligning cell boxes.
    """
    # Font name handed to the canvas (setFont / stringWidth) for cell text.
    font_name: str = "Helvetica"
    # Base font size for cell text.
    font_size: int = 8
    # NOTE(review): presumably the bounds for dynamic font fitting; the
    # consumer is not visible in this part of the file, so confirm.
    min_font_size: int = 6
    max_font_size: int = 10

    # Padding options
    # left/right padding reduce the width available to cell text in
    # render_from_cellboxes_grid; use of top/bottom is not visible here.
    left_padding: int = 2
    right_padding: int = 2
    top_padding: int = 2
    bottom_padding: int = 2

    # Border options
    # Applied through canvas.setStrokeColor / setLineWidth before borders
    # are drawn.
    border_color: Any = colors.black
    border_width: float = 0.5

    # Alignment
    # NOTE(review): not read by the code visible here; presumably consumed by
    # the platypus Table path. Confirm.
    horizontal_align: str = "CENTER"
    vertical_align: str = "MIDDLE"

    # Header styling
    # NOTE(review): presumably the background for header (<th>) cells; confirm.
    header_background: Any = colors.lightgrey

    # Grid normalization threshold
    # Default clustering threshold used by normalize_cell_boxes_to_grid.
    grid_threshold: float = 10.0

    # Merged cells threshold
    # Passed to _merge_boundaries by compute_grid_from_cell_boxes.
    merge_boundary_threshold: float = 5.0


# ============================================================================
# HTML Table Parser
# ============================================================================

class HTMLTableParser(HTMLParser):
    """
    Parse HTML table structure for rendering.

    Extracts table rows, cells, and merged cell information (colspan/rowspan)
    from HTML table markup. After ``feed()``, ``tables`` holds one dict per
    closed table: ``{'rows': [{'cells': [cell, ...]}, ...]}`` where each cell
    is ``{'text', 'is_header', 'colspan', 'rowspan'}``.

    The parser is tolerant of imperfect markup:
    - invalid, missing-value, zero or negative colspan/rowspan values are
      treated as 1 instead of aborting the parse;
    - omitted ``</td>``/``</th>``/``</tr>`` end tags are implied by the next
      cell, the next row or the end of the table.

    Nested tables are not supported: an inner ``<table>`` replaces the table
    being built.
    """

    def __init__(self):
        super().__init__()
        self.tables: List[Dict[str, Any]] = []
        self.current_table: Optional[Dict[str, Any]] = None
        self.current_row: Optional[Dict[str, Any]] = None
        self.current_cell: Optional[Dict[str, Any]] = None
        self.in_cell = False

    @staticmethod
    def _parse_span(raw_value: Any) -> int:
        """Convert a colspan/rowspan attribute value to an int >= 1."""
        try:
            span = int(raw_value)
        except (TypeError, ValueError):
            # None (attribute without value), '' or non-numeric text
            return 1
        return max(span, 1)

    def _close_cell(self) -> None:
        """Finish the open cell, attaching it to the open row if there is one."""
        if self.current_cell is not None and self.current_row is not None:
            self.current_row['cells'].append(self.current_cell)
        self.current_cell = None
        self.in_cell = False

    def _close_row(self) -> None:
        """Finish the open row (and its open cell), attaching it to the open table."""
        self._close_cell()
        if self.current_row is not None and self.current_table is not None:
            self.current_table['rows'].append(self.current_row)
        self.current_row = None

    def handle_starttag(self, tag: str, attrs: List[Tuple[str, Optional[str]]]):
        if tag == 'table':
            # A new table replaces whatever was being built; any unfinished
            # row/cell of the previous table is dropped with it.
            self.current_table = {'rows': []}
            self.current_row = None
            self.current_cell = None
            self.in_cell = False
        elif tag == 'tr':
            # A new row implies the end of the previous row
            self._close_row()
            self.current_row = {'cells': []}
        elif tag in ('td', 'th'):
            # A new cell implies the end of the previous cell
            self._close_cell()
            attrs_dict = dict(attrs)
            self.current_cell = {
                'text': '',
                'is_header': tag == 'th',
                'colspan': self._parse_span(attrs_dict.get('colspan', 1)),
                'rowspan': self._parse_span(attrs_dict.get('rowspan', 1))
            }
            self.in_cell = True

    def handle_endtag(self, tag: str):
        if tag == 'table':
            if self.current_table is not None:
                # Flush a row/cell whose end tags were omitted
                self._close_row()
                self.tables.append(self.current_table)
                self.current_table = None
        elif tag == 'tr':
            if self.current_row is not None:
                self._close_row()
        elif tag in ('td', 'th'):
            if self.current_cell is not None:
                self._close_cell()

    def handle_data(self, data: str):
        # Text is accumulated only while a cell is open
        if self.in_cell and self.current_cell is not None:
            self.current_cell['text'] += data


# ============================================================================
# Table Renderer
# ============================================================================

class TableRenderer:
    """
    Unified table rendering engine for PDF generation.

    Supports multiple input formats and rendering modes:
    - HTML table parsing and rendering
    - Cell boxes rendering (layered approach)
    - Direct track cells dictionary
    - Translated content with dynamic font sizing
    """

    def __init__(self, config: Optional[TableRenderConfig] = None):
        """
        Initialize TableRenderer with configuration.

        Args:
            config: TableRenderConfig instance (uses defaults if None)
        """
        self.config = config or TableRenderConfig()

    def render_from_html(
        self,
        pdf_canvas,
        html_content: str,
        table_bbox: Tuple[float, float, float, float],
        page_height: float,
        scale_w: float = 1.0,
        scale_h: float = 1.0
    ) -> bool:
        """
        Parse HTML and render table to PDF canvas.

        Args:
            pdf_canvas: ReportLab canvas
            html_content: HTML table string
            table_bbox: (x0, y0, x1, y1) bounding box
            page_height: PDF page height for Y coordinate flip
            scale_w: Horizontal scale factor
            scale_h: Vertical scale factor

        Returns:
            True if successful, False otherwise
        """
        try:
            # Parse HTML
            parser = HTMLTableParser()
            parser.feed(html_content)

            if not parser.tables:
                logger.warning("No tables found in HTML content")
                return False

            table_data = parser.tables[0]
            return self._render_parsed_table(
                pdf_canvas, table_data, table_bbox, page_height, scale_w, scale_h
            )

        except Exception as e:
            logger.error(f"HTML table rendering failed: {e}")
            import traceback
            traceback.print_exc()
            return False

    def render_from_cells_dict(
        self,
        pdf_canvas,
        cells_dict: Dict,
        table_bbox: Tuple[float, float, float, float],
        page_height: float,
        cell_boxes: Optional[List] = None
    ) -> bool:
        """
        Render table from Direct track cell structure.

        Args:
            pdf_canvas: ReportLab canvas
            cells_dict: Dict with 'rows', 'cols', 'cells' keys
            table_bbox: (x0, y0, x1, y1) bounding box
            page_height: PDF page height
            cell_boxes: Optional precomputed cell boxes

        Returns:
            True if successful, False otherwise
        """
        try:
            # Convert cells dict to row format
            rows = self._build_rows_from_cells_dict(cells_dict)

            if not rows:
                logger.warning("No rows built from cells dict")
                return False

            # Build table data structure
            table_data = {'rows': rows}

            # Calculate dimensions
            x0, y0, x1, y1 = table_bbox
            table_width = (x1 - x0)
            table_height = (y1 - y0)

            # Determine grid dimensions
            num_rows = cells_dict.get('rows', len(rows))
            num_cols = cells_dict.get('cols',
                max(len(row['cells']) for row in rows) if rows else 1
            )

            # Calculate column widths and row heights
            if cell_boxes:
                col_widths, row_heights = self.compute_grid_from_cell_boxes(
                    cell_boxes, table_bbox, num_rows, num_cols
                )
            else:
                col_widths = [table_width / num_cols] * num_cols
                row_heights = [table_height / num_rows] * num_rows

            return self._render_with_dimensions(
                pdf_canvas, table_data, table_bbox, page_height,
                col_widths, row_heights
            )

        except Exception as e:
            logger.error(f"Cells dict rendering failed: {e}")
            import traceback
            traceback.print_exc()
            return False

    def render_cell_borders(
        self,
        pdf_canvas,
        cell_boxes: List[List[float]],
        table_bbox: Tuple[float, float, float, float],
        page_height: float,
        embedded_images: Optional[List] = None,
        output_dir: Optional[Path] = None
    ) -> bool:
        """
        Render table cell borders only (layered approach).

        This renders only the cell borders, not the text content.
        Text is typically rendered separately by GapFillingService.

        Args:
            pdf_canvas: ReportLab canvas
            cell_boxes: List of [x0, y0, x1, y1] for each cell
            table_bbox: Table bounding box
            page_height: PDF page height
            embedded_images: Optional list of images within cells
            output_dir: Directory for image files

        Returns:
            True if successful, False otherwise
        """
        try:
            if not cell_boxes:
                # Draw outer border only
                return self._draw_table_border(
                    pdf_canvas, table_bbox, page_height
                )

            # Normalize cell boxes to grid
            normalized_boxes = self.normalize_cell_boxes_to_grid(cell_boxes)

            # Draw each cell border
            pdf_canvas.saveState()
            pdf_canvas.setStrokeColor(self.config.border_color)
            pdf_canvas.setLineWidth(self.config.border_width)

            for box in normalized_boxes:
                if box is None:
                    continue

                x0, y0, x1, y1 = box
                # Convert to PDF coordinates (flip Y)
                pdf_x0 = x0
                pdf_y0 = page_height - y1
                pdf_x1 = x1
                pdf_y1 = page_height - y0

                # Draw cell rectangle
                pdf_canvas.rect(pdf_x0, pdf_y0, pdf_x1 - pdf_x0, pdf_y1 - pdf_y0)

            pdf_canvas.restoreState()

            # Draw embedded images if any
            if embedded_images and output_dir:
                for img_info in embedded_images:
                    self._draw_embedded_image(
                        pdf_canvas, img_info, page_height, output_dir
                    )

            return True

        except Exception as e:
            logger.error(f"Cell borders rendering failed: {e}")
            import traceback
            traceback.print_exc()
            return False

    def render_with_translated_text(
        self,
        pdf_canvas,
        cells: List[Dict],
        cell_boxes: List,
        table_bbox: Tuple[float, float, float, float],
        page_height: float
    ) -> bool:
        """
        Render table with translated content and dynamic font sizing.

        Args:
            pdf_canvas: ReportLab canvas
            cells: List of cell dicts with 'translated_content'
            cell_boxes: List of cell bounding boxes
            table_bbox: Table bounding box
            page_height: PDF page height

        Returns:
            True if successful, False otherwise
        """
        try:
            # Draw outer border
            self._draw_table_border(pdf_canvas, table_bbox, page_height)

            # Normalize cell boxes
            if cell_boxes:
                normalized_boxes = self.normalize_cell_boxes_to_grid(cell_boxes)
            else:
                logger.warning("No cell boxes for translated table")
                return False

            pdf_canvas.saveState()
            pdf_canvas.setStrokeColor(self.config.border_color)
            pdf_canvas.setLineWidth(self.config.border_width)

            # Draw cell borders
            for box in normalized_boxes:
                if box is None:
                    continue
                x0, y0, x1, y1 = box
                pdf_y0 = page_height - y1
                pdf_canvas.rect(x0, pdf_y0, x1 - x0, y1 - y0)

            pdf_canvas.restoreState()

            # Render text in cells with dynamic font sizing
            for i, cell in enumerate(cells):
                if i >= len(normalized_boxes):
                    break

                box = normalized_boxes[i]
                if box is None:
                    continue

                translated_text = cell.get('translated_content', '')
                if not translated_text:
                    continue

                x0, y0, x1, y1 = box
                cell_width = x1 - x0
                cell_height = y1 - y0

                # Find appropriate font size
                font_size = self._fit_text_to_cell(
                    pdf_canvas, translated_text, cell_width, cell_height
                )

                # Render centered text
                pdf_canvas.setFont(self.config.font_name, font_size)

                # Calculate text position (centered)
                text_width = pdf_canvas.stringWidth(translated_text, self.config.font_name, font_size)
                text_x = x0 + (cell_width - text_width) / 2
                text_y = page_height - y0 - cell_height / 2 - font_size / 3

                pdf_canvas.drawString(text_x, text_y, translated_text)

            return True

        except Exception as e:
            logger.error(f"Translated table rendering failed: {e}")
            import traceback
            traceback.print_exc()
            return False

    def render_from_cellboxes_grid(
        self,
        pdf_canvas,
        cell_boxes: List[List[float]],
        html_content: str,
        table_bbox: Tuple[float, float, float, float],
        page_height: float,
        scale_w: float = 1.0,
        scale_h: float = 1.0,
        row_threshold: float = 15.0,
        col_threshold: float = 15.0
    ) -> bool:
        """
        Render table using cell_boxes as the primary structure source.

        This method infers grid structure from cell_boxes coordinates and
        maps HTML content to cells, regardless of HTML colspan/rowspan.

        Args:
            pdf_canvas: ReportLab canvas
            cell_boxes: List of [x0, y0, x1, y1] for each cell
            html_content: HTML table string (for text content)
            table_bbox: Table bounding box
            page_height: PDF page height
            scale_w: Horizontal scale factor
            scale_h: Vertical scale factor
            row_threshold: Y-coordinate threshold for row clustering
            col_threshold: X-coordinate threshold for column clustering

        Returns:
            True if successful, False otherwise
        """
        try:
            if not cell_boxes:
                logger.debug("No cell_boxes provided for grid rendering")
                return False

            # Infer grid structure from cell_boxes
            inferrer = CellBoxGridInferrer(
                row_threshold=row_threshold,
                col_threshold=col_threshold
            )
            grid_info = inferrer.infer_grid(cell_boxes)

            if not grid_info:
                logger.debug("Failed to infer grid from cell_boxes")
                return False

            grid = grid_info['grid']
            num_rows = grid_info['num_rows']
            num_cols = grid_info['num_cols']
            row_boundaries = grid_info['row_boundaries']
            col_boundaries = grid_info['col_boundaries']

            logger.info(
                f"[TABLE] CellBoxes grid inferred: {num_rows} rows x {num_cols} cols "
                f"from {len(cell_boxes)} cell_boxes"
            )

            # Extract content from HTML
            if html_content:
                contents = extract_cell_contents_from_html(html_content)
                grid = map_content_to_grid(grid, contents, num_rows, num_cols)
                logger.debug(f"[TABLE] Mapped {len(contents)} HTML cells to grid")

            # Apply scale factors to boundaries
            scaled_row_boundaries = [y * scale_h for y in row_boundaries]
            scaled_col_boundaries = [x * scale_w for x in col_boundaries]

            # Draw cell borders and content
            pdf_canvas.saveState()
            pdf_canvas.setStrokeColor(self.config.border_color)
            pdf_canvas.setLineWidth(self.config.border_width)

            # Create paragraph style for text
            style = ParagraphStyle(
                'CellBoxCell',
                fontName=self.config.font_name,
                fontSize=self.config.font_size,
                alignment=TA_CENTER,
                leading=self.config.font_size * 1.2
            )

            for row in range(num_rows):
                for col in range(num_cols):
                    # Calculate cell boundaries
                    x0 = scaled_col_boundaries[col]
                    x1 = scaled_col_boundaries[col + 1] if col + 1 < len(scaled_col_boundaries) else x0 + 50
                    y0 = scaled_row_boundaries[row]
                    y1 = scaled_row_boundaries[row + 1] if row + 1 < len(scaled_row_boundaries) else y0 + 20

                    # Convert to PDF coordinates (flip Y)
                    pdf_x0 = x0
                    pdf_y0 = page_height - y1
                    pdf_x1 = x1
                    pdf_y1 = page_height - y0

                    cell_width = pdf_x1 - pdf_x0
                    cell_height = pdf_y1 - pdf_y0

                    # Draw cell border
                    pdf_canvas.rect(pdf_x0, pdf_y0, cell_width, cell_height)

                    # Draw text if cell exists in grid
                    if (row, col) in grid:
                        cell_content = grid[(row, col)].get('content', '')
                        if cell_content:
                            # Calculate text position with padding
                            text_x = pdf_x0 + self.config.left_padding
                            text_y = pdf_y0 + cell_height - self.config.top_padding - self.config.font_size

                            # Fit text to cell
                            available_width = cell_width - self.config.left_padding - self.config.right_padding
                            font_size = self._fit_text_to_cell(
                                pdf_canvas, cell_content, available_width, cell_height
                            )

                            # Draw centered text
                            pdf_canvas.setFont(self.config.font_name, font_size)
                            text_width = pdf_canvas.stringWidth(
                                cell_content, self.config.font_name, font_size
                            )

                            # Center horizontally
                            text_x = pdf_x0 + (cell_width - text_width) / 2
                            # Center vertically
                            text_y = pdf_y0 + (cell_height - font_size) / 2

                            pdf_canvas.drawString(text_x, text_y, cell_content)

            pdf_canvas.restoreState()

            logger.info(f"[TABLE] Successfully rendered {num_rows}x{num_cols} table from cell_boxes")
            return True

        except Exception as e:
            logger.error(f"CellBoxes grid rendering failed: {e}")
            import traceback
            traceback.print_exc()
            return False

    # =========================================================================
    # Grid and Cell Box Helpers
    # =========================================================================

    def compute_grid_from_cell_boxes(
        self,
        cell_boxes: List,
        table_bbox: Tuple[float, float, float, float],
        num_rows: int,
        num_cols: int
    ) -> Tuple[Optional[List[float]], Optional[List[float]]]:
        """
        Calculate column widths and row heights from cell bounding boxes.

        Args:
            cell_boxes: List of [x0, y0, x1, y1] for each cell
            table_bbox: Table bounding box
            num_rows: Expected number of rows
            num_cols: Expected number of columns

        Returns:
            Tuple of (col_widths, row_heights) or (None, None) on failure
        """
        try:
            if not cell_boxes:
                return None, None

            # Filter valid boxes
            valid_boxes = [b for b in cell_boxes if b is not None and len(b) >= 4]
            if not valid_boxes:
                return None, None

            # Extract unique X and Y boundaries
            x_boundaries = set()
            y_boundaries = set()

            for box in valid_boxes:
                x0, y0, x1, y1 = box[:4]
                x_boundaries.add(round(x0, 1))
                x_boundaries.add(round(x1, 1))
                y_boundaries.add(round(y0, 1))
                y_boundaries.add(round(y1, 1))

            # Sort boundaries
            x_sorted = sorted(x_boundaries)
            y_sorted = sorted(y_boundaries)

            # Merge nearby boundaries
            x_merged = self._merge_boundaries(x_sorted, self.config.merge_boundary_threshold)
            y_merged = self._merge_boundaries(y_sorted, self.config.merge_boundary_threshold)

            # Calculate widths and heights
            col_widths = []
            for i in range(len(x_merged) - 1):
                col_widths.append(x_merged[i + 1] - x_merged[i])

            row_heights = []
            for i in range(len(y_merged) - 1):
                row_heights.append(y_merged[i + 1] - y_merged[i])

            # Validate against expected dimensions (allow for merged cells)
            tolerance = max(num_cols, num_rows) // 2 + 1
            if abs(len(col_widths) - num_cols) > tolerance:
                logger.debug(f"Column count mismatch: {len(col_widths)} vs {num_cols}")
            if abs(len(row_heights) - num_rows) > tolerance:
                logger.debug(f"Row count mismatch: {len(row_heights)} vs {num_rows}")

            return col_widths if col_widths else None, row_heights if row_heights else None

        except Exception as e:
            logger.error(f"Grid computation failed: {e}")
            return None, None

    def normalize_cell_boxes_to_grid(
        self,
        cell_boxes: List,
        threshold: Optional[float] = None
    ) -> List:
        """
        Snap cell boxes to aligned grid to eliminate coordinate variations.

        Args:
            cell_boxes: List of [x0, y0, x1, y1] for each cell
            threshold: Clustering threshold (uses config default if None)

        Returns:
            Normalized cell boxes
        """
        threshold = threshold or self.config.grid_threshold

        if not cell_boxes:
            return []

        try:
            # Collect all coordinates
            all_x = []
            all_y = []

            for box in cell_boxes:
                if box is None or len(box) < 4:
                    continue
                x0, y0, x1, y1 = box[:4]
                all_x.extend([x0, x1])
                all_y.extend([y0, y1])

            if not all_x or not all_y:
                return cell_boxes

            # Cluster and normalize X coordinates
            x_clusters = self._cluster_values(sorted(all_x), threshold)
            y_clusters = self._cluster_values(sorted(all_y), threshold)

            # Build mapping
            x_map = {v: avg for avg, values in x_clusters for v in values}
            y_map = {v: avg for avg, values in y_clusters for v in values}

            # Normalize boxes
            normalized = []
            for box in cell_boxes:
                if box is None or len(box) < 4:
                    normalized.append(box)
                    continue

                x0, y0, x1, y1 = box[:4]
                normalized.append([
                    x_map.get(x0, x0),
                    y_map.get(y0, y0),
                    x_map.get(x1, x1),
                    y_map.get(y1, y1)
                ])

            return normalized

        except Exception as e:
            logger.error(f"Cell box normalization failed: {e}")
            return cell_boxes

    # =========================================================================
    # Private Helper Methods
    # =========================================================================

    def _render_parsed_table(
        self,
        pdf_canvas,
        table_data: Dict,
        table_bbox: Tuple[float, float, float, float],
        page_height: float,
        scale_w: float = 1.0,
        scale_h: float = 1.0
    ) -> bool:
        """Render a parsed table structure."""
        rows = table_data.get('rows', [])
        if not rows:
            return False

        # Build grid content
        num_rows = len(rows)
        num_cols = max(len(row.get('cells', [])) for row in rows)

        # Track occupied cells for rowspan handling
        occupied = [[False] * num_cols for _ in range(num_rows)]

        grid = []
        span_commands = []

        for row_idx, row in enumerate(rows):
            grid_row = [''] * num_cols
            col_idx = 0

            for cell in row.get('cells', []):
                # Skip occupied cells
                while col_idx < num_cols and occupied[row_idx][col_idx]:
                    col_idx += 1

                if col_idx >= num_cols:
                    break

                text = cell.get('text', '').strip()
                colspan = cell.get('colspan', 1)
                rowspan = cell.get('rowspan', 1)

                # Place cell content
                grid_row[col_idx] = text

                # Mark occupied cells and build SPAN command
                if colspan > 1 or rowspan > 1:
                    end_col = min(col_idx + colspan - 1, num_cols - 1)
                    end_row = min(row_idx + rowspan - 1, num_rows - 1)
                    span_commands.append(
                        ('SPAN', (col_idx, row_idx), (end_col, end_row))
                    )

                    for r in range(row_idx, end_row + 1):
                        for c in range(col_idx, end_col + 1):
                            if r < num_rows and c < num_cols:
                                occupied[r][c] = True
                else:
                    occupied[row_idx][col_idx] = True

                col_idx += colspan

            grid.append(grid_row)

        # Calculate dimensions
        x0, y0, x1, y1 = table_bbox
        table_width = (x1 - x0) * scale_w
        table_height = (y1 - y0) * scale_h

        col_widths = [table_width / num_cols] * num_cols
        row_heights = [table_height / num_rows] * num_rows

        # Create paragraph style
        style = ParagraphStyle(
            'TableCell',
            fontName=self.config.font_name,
            fontSize=self.config.font_size,
            alignment=TA_CENTER,
            leading=self.config.font_size * 1.2
        )

        # Convert to Paragraph objects
        para_grid = []
        for row in grid:
            para_row = []
            for cell in row:
                if cell:
                    para_row.append(Paragraph(cell, style))
                else:
                    para_row.append('')
            para_grid.append(para_row)

        # Build TableStyle
        table_style_commands = [
            ('GRID', (0, 0), (-1, -1), self.config.border_width, self.config.border_color),
            ('VALIGN', (0, 0), (-1, -1), self.config.vertical_align),
            ('ALIGN', (0, 0), (-1, -1), self.config.horizontal_align),
            ('LEFTPADDING', (0, 0), (-1, -1), self.config.left_padding),
            ('RIGHTPADDING', (0, 0), (-1, -1), self.config.right_padding),
            ('TOPPADDING', (0, 0), (-1, -1), self.config.top_padding),
            ('BOTTOMPADDING', (0, 0), (-1, -1), self.config.bottom_padding),
            ('FONTNAME', (0, 0), (-1, -1), self.config.font_name),
            ('FONTSIZE', (0, 0), (-1, -1), self.config.font_size),
        ]
        table_style_commands.extend(span_commands)

        # Create and draw table
        table = Table(para_grid, colWidths=col_widths, rowHeights=row_heights)
        table.setStyle(TableStyle(table_style_commands))

        # Position and draw
        pdf_x = x0
        pdf_y = page_height - y1  # Flip Y

        table.wrapOn(pdf_canvas, table_width, table_height)
        table.drawOn(pdf_canvas, pdf_x, pdf_y)

        return True

    def _render_with_dimensions(
        self,
        pdf_canvas,
        table_data: Dict,
        table_bbox: Tuple[float, float, float, float],
        page_height: float,
        col_widths: List[float],
        row_heights: List[float]
    ) -> bool:
        """Render table with specified dimensions."""
        rows = table_data.get('rows', [])
        if not rows:
            return False

        num_rows = len(rows)
        num_cols = max(len(row.get('cells', [])) for row in rows)

        # Adjust widths/heights if needed
        if len(col_widths) != num_cols:
            x0, y0, x1, y1 = table_bbox
            col_widths = [(x1 - x0) / num_cols] * num_cols
        if len(row_heights) != num_rows:
            x0, y0, x1, y1 = table_bbox
            row_heights = [(y1 - y0) / num_rows] * num_rows

        # Build grid with proper positioning
        grid = []
        span_commands = []
        occupied = [[False] * num_cols for _ in range(num_rows)]

        for row_idx, row in enumerate(rows):
            grid_row = [''] * num_cols

            for cell in row.get('cells', []):
                # Get column position
                col_idx = cell.get('col', 0)

                # Skip if out of bounds or occupied
                while col_idx < num_cols and occupied[row_idx][col_idx]:
                    col_idx += 1
                if col_idx >= num_cols:
                    continue

                text = cell.get('text', '').strip()
                colspan = cell.get('colspan', 1)
                rowspan = cell.get('rowspan', 1)

                grid_row[col_idx] = text

                if colspan > 1 or rowspan > 1:
                    end_col = min(col_idx + colspan - 1, num_cols - 1)
                    end_row = min(row_idx + rowspan - 1, num_rows - 1)
                    span_commands.append(
                        ('SPAN', (col_idx, row_idx), (end_col, end_row))
                    )
                    for r in range(row_idx, end_row + 1):
                        for c in range(col_idx, end_col + 1):
                            if r < num_rows and c < num_cols:
                                occupied[r][c] = True
                else:
                    occupied[row_idx][col_idx] = True

            grid.append(grid_row)

        # Create style and table
        style = ParagraphStyle(
            'TableCell',
            fontName=self.config.font_name,
            fontSize=self.config.font_size,
            alignment=TA_CENTER
        )

        para_grid = []
        for row in grid:
            para_row = [Paragraph(cell, style) if cell else '' for cell in row]
            para_grid.append(para_row)

        table_style_commands = [
            ('GRID', (0, 0), (-1, -1), self.config.border_width, self.config.border_color),
            ('VALIGN', (0, 0), (-1, -1), self.config.vertical_align),
            ('LEFTPADDING', (0, 0), (-1, -1), 0),
            ('RIGHTPADDING', (0, 0), (-1, -1), 0),
            ('TOPPADDING', (0, 0), (-1, -1), 0),
            ('BOTTOMPADDING', (0, 0), (-1, -1), 1),
        ]
        table_style_commands.extend(span_commands)

        table = Table(para_grid, colWidths=col_widths, rowHeights=row_heights)
        table.setStyle(TableStyle(table_style_commands))

        x0, y0, x1, y1 = table_bbox
        pdf_x = x0
        pdf_y = page_height - y1

        table.wrapOn(pdf_canvas, x1 - x0, y1 - y0)
        table.drawOn(pdf_canvas, pdf_x, pdf_y)

        return True

    def _build_rows_from_cells_dict(self, cells_dict: Dict) -> List[Dict]:
        """Convert Direct track cell structure to row format."""
        cells = cells_dict.get('cells', [])
        if not cells:
            return []

        num_rows = cells_dict.get('rows', 0)
        num_cols = cells_dict.get('cols', 0)

        # Group cells by row
        rows_data = {}
        for cell in cells:
            row_idx = cell.get('row', 0)
            if row_idx not in rows_data:
                rows_data[row_idx] = []
            rows_data[row_idx].append(cell)

        # Build row list
        rows = []
        for row_idx in range(num_rows):
            row_cells = rows_data.get(row_idx, [])

            # Sort by column
            row_cells.sort(key=lambda c: c.get('col', 0))

            formatted_cells = []
            for cell in row_cells:
                content = cell.get('content', '')
                if isinstance(content, list):
                    content = '\n'.join(str(c) for c in content)

                formatted_cells.append({
                    'text': str(content) if content else '',
                    'colspan': cell.get('col_span', 1),
                    'rowspan': cell.get('row_span', 1),
                    'col': cell.get('col', 0),
                    'is_header': cell.get('is_header', False)
                })

            rows.append({'cells': formatted_cells})

        return rows

    def _draw_table_border(
        self,
        pdf_canvas,
        table_bbox: Tuple[float, float, float, float],
        page_height: float
    ) -> bool:
        """Draw outer table border."""
        try:
            x0, y0, x1, y1 = table_bbox
            pdf_y0 = page_height - y1
            pdf_y1 = page_height - y0

            pdf_canvas.saveState()
            pdf_canvas.setStrokeColor(self.config.border_color)
            pdf_canvas.setLineWidth(self.config.border_width)
            pdf_canvas.rect(x0, pdf_y0, x1 - x0, pdf_y1 - pdf_y0)
            pdf_canvas.restoreState()

            return True
        except Exception as e:
            logger.error(f"Failed to draw table border: {e}")
            return False

    def _draw_embedded_image(
        self,
        pdf_canvas,
        img_info: Dict,
        page_height: float,
        output_dir: Path
    ) -> bool:
        """Draw an image embedded within a table cell."""
        try:
            img_path = img_info.get('path')
            if not img_path:
                return False

            # Resolve path
            if not Path(img_path).is_absolute():
                img_path = output_dir / img_path

            if not Path(img_path).exists():
                logger.warning(f"Embedded image not found: {img_path}")
                return False

            bbox = img_info.get('bbox', {})
            x0 = bbox.get('x0', 0)
            y0 = bbox.get('y0', 0)
            width = bbox.get('width', 100)
            height = bbox.get('height', 100)

            # Flip Y coordinate
            pdf_y = page_height - y0 - height

            # Draw image
            img = ImageReader(str(img_path))
            pdf_canvas.drawImage(img, x0, pdf_y, width, height)

            return True

        except Exception as e:
            logger.error(f"Failed to draw embedded image: {e}")
            return False

    def _fit_text_to_cell(
        self,
        pdf_canvas,
        text: str,
        cell_width: float,
        cell_height: float
    ) -> int:
        """Find font size that fits text in cell."""
        for size in range(self.config.max_font_size, self.config.min_font_size - 1, -1):
            text_width = pdf_canvas.stringWidth(text, self.config.font_name, size)
            if text_width <= cell_width - 6:  # 3pt padding each side
                return size
        return self.config.min_font_size

    def _merge_boundaries(self, values: List[float], threshold: float) -> List[float]:
        """Merge nearby boundary values."""
        if not values:
            return []

        merged = [values[0]]
        for v in values[1:]:
            if abs(v - merged[-1]) > threshold:
                merged.append(v)

        return merged

    def _cluster_values(self, values: List[float], threshold: float) -> List[Tuple[float, List[float]]]:
        """Cluster nearby values and return (average, members) pairs."""
        if not values:
            return []

        clusters = []
        current_cluster = [values[0]]

        for v in values[1:]:
            if abs(v - current_cluster[-1]) <= threshold:
                current_cluster.append(v)
            else:
                avg = sum(current_cluster) / len(current_cluster)
                clusters.append((avg, current_cluster))
                current_cluster = [v]

        if current_cluster:
            avg = sum(current_cluster) / len(current_cluster)
            clusters.append((avg, current_cluster))

        return clusters