"""
PDF Table Renderer - Handles table rendering for PDF generation.

This module provides unified table rendering capabilities extracted from
PDFGeneratorService, supporting multiple input formats:
- HTML tables
- Cell boxes (layered approach)
- Cells dictionary (Direct track)
- TableData objects
"""

import logging
from dataclasses import dataclass, field
from html.parser import HTMLParser
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, Union
from xml.sax.saxutils import escape as xml_escape

from reportlab.lib import colors
from reportlab.lib.enums import TA_CENTER, TA_LEFT, TA_RIGHT
from reportlab.lib.styles import ParagraphStyle
from reportlab.lib.utils import ImageReader
from reportlab.platypus import Paragraph, Table, TableStyle

logger = logging.getLogger(__name__)


# ============================================================================
# Configuration
# ============================================================================

@dataclass
class TableRenderConfig:
    """Configuration for table rendering."""

    font_name: str = "Helvetica"
    font_size: int = 8
    min_font_size: int = 6
    max_font_size: int = 10

    # Padding options
    left_padding: int = 2
    right_padding: int = 2
    top_padding: int = 2
    bottom_padding: int = 2

    # Border options
    border_color: Any = colors.black
    border_width: float = 0.5

    # Alignment
    horizontal_align: str = "CENTER"
    vertical_align: str = "MIDDLE"

    # Header styling
    header_background: Any = colors.lightgrey

    # Grid normalization threshold
    grid_threshold: float = 10.0

    # Merged cells threshold
    merge_boundary_threshold: float = 5.0


# ============================================================================
# HTML Table Parser
# ============================================================================

class HTMLTableParser(HTMLParser):
    """
    Parse HTML table structure for rendering.

    Extracts table rows, cells, and merged cell information (colspan/rowspan)
    from HTML table markup. Finished tables are collected in ``self.tables``
    as ``{'rows': [{'cells': [{'text', 'is_header', 'colspan', 'rowspan'}]}]}``.
    """

    def __init__(self):
        super().__init__()
        self.tables = []
        self.current_table = None
        self.current_row = None
        self.current_cell = None
        self.in_cell = False

    @staticmethod
    def _parse_span(value: Any) -> int:
        """Convert a colspan/rowspan attribute to an int >= 1.

        Valueless attributes (``None``), non-numeric strings and
        non-positive numbers all fall back to 1 instead of raising.
        """
        try:
            span = int(value)
        except (TypeError, ValueError):
            return 1
        return span if span > 0 else 1

    def _close_cell(self) -> None:
        """Finish the open cell (if any) and attach it to the open row."""
        if self.current_cell is not None:
            if self.current_row is not None:
                self.current_row['cells'].append(self.current_cell)
            self.current_cell = None
        self.in_cell = False

    def _close_row(self) -> None:
        """Finish the open row (if any), closing a dangling cell first."""
        self._close_cell()
        if self.current_row is not None:
            if self.current_table is not None:
                self.current_table['rows'].append(self.current_row)
            self.current_row = None

    def handle_starttag(self, tag: str, attrs: List[Tuple[str, Optional[str]]]):
        if tag == 'table':
            # A new table discards any row/cell still open, so a nested
            # table never receives fragments of its parent.
            self.current_table = {'rows': []}
            self.current_row = None
            self.current_cell = None
            self.in_cell = False
        elif tag == 'tr':
            # HTML allows </tr> to be omitted: a new row ends the previous one.
            self._close_row()
            self.current_row = {'cells': []}
        elif tag in ('td', 'th'):
            # HTML allows </td>/</th> to be omitted: a new cell ends the previous one.
            self._close_cell()
            attrs_dict = dict(attrs)
            self.current_cell = {
                'text': '',
                'is_header': tag == 'th',
                'colspan': self._parse_span(attrs_dict.get('colspan', 1)),
                'rowspan': self._parse_span(attrs_dict.get('rowspan', 1)),
            }
            self.in_cell = True

    def handle_endtag(self, tag: str):
        if tag == 'table':
            if self.current_table is not None:
                self._close_row()
                self.tables.append(self.current_table)
                self.current_table = None
        elif tag == 'tr':
            self._close_row()
        elif tag in ('td', 'th'):
            self._close_cell()

    def handle_data(self, data: str):
        if self.in_cell and self.current_cell is not None:
            self.current_cell['text'] += data


# ============================================================================
# Table Renderer
# ============================================================================

class TableRenderer:
    """
    Unified table rendering engine for PDF generation.

    Supports multiple input formats and rendering modes:
    - HTML table parsing and rendering
    - Cell boxes rendering (layered approach)
    - Direct track cells dictionary
    - Translated content with dynamic font sizing
    """

    def __init__(self, config: Optional[TableRenderConfig] = None):
        """
        Initialize TableRenderer with configuration.

        Args:
            config: TableRenderConfig instance (uses defaults if None)
        """
        self.config = config or TableRenderConfig()

    def render_from_html(
        self,
        pdf_canvas,
        html_content: str,
        table_bbox: Tuple[float, float, float, float],
        page_height: float,
        scale_w: float = 1.0,
        scale_h: float = 1.0
    ) -> bool:
        """
        Parse HTML and render table to PDF canvas.

        Only the first table found in ``html_content`` is rendered.

        Args:
            pdf_canvas: ReportLab canvas
            html_content: HTML table string
            table_bbox: (x0, y0, x1, y1) bounding box
            page_height: PDF page height for Y coordinate flip
            scale_w: Horizontal scale factor
            scale_h: Vertical scale factor

        Returns:
            True if successful, False otherwise
        """
        try:
            parser = HTMLTableParser()
            parser.feed(html_content)

            if not parser.tables:
                logger.warning("No tables found in HTML content")
                return False

            return self._render_parsed_table(
                pdf_canvas, parser.tables[0], table_bbox, page_height,
                scale_w, scale_h
            )
        except Exception as e:
            # logger.exception records the traceback through the logging
            # system instead of printing it to stderr.
            logger.exception("HTML table rendering failed: %s", e)
            return False

    def render_from_cells_dict(
        self,
        pdf_canvas,
        cells_dict: Dict,
        table_bbox: Tuple[float, float, float, float],
        page_height: float,
        cell_boxes: Optional[List] = None
    ) -> bool:
        """
        Render table from Direct track cell structure.

        Args:
            pdf_canvas: ReportLab canvas
            cells_dict: Dict with 'rows', 'cols', 'cells' keys
            table_bbox: (x0, y0, x1, y1) bounding box
            page_height: PDF page height
            cell_boxes: Optional precomputed cell boxes

        Returns:
            True if successful, False otherwise
        """
        try:
            rows = self._build_rows_from_cells_dict(cells_dict)
            if not rows:
                logger.warning("No rows built from cells dict")
                return False

            table_data = {'rows': rows}

            x0, y0, x1, y1 = table_bbox
            table_width = x1 - x0
            table_height = y1 - y0

            # A missing, None or zero 'rows'/'cols' falls back to what the
            # built rows imply, so the divisions below can never be by zero.
            num_rows = cells_dict.get('rows') or len(rows)
            num_cols = cells_dict.get('cols') or max(
                (len(row['cells']) for row in rows), default=0
            )
            num_cols = max(num_cols, 1)

            col_widths = None
            row_heights = None
            if cell_boxes:
                col_widths, row_heights = self.compute_grid_from_cell_boxes(
                    cell_boxes, table_bbox, num_rows, num_cols
                )

            # Grid computation returns None on failure; use an even split then.
            if not col_widths:
                col_widths = [table_width / num_cols] * num_cols
            if not row_heights:
                row_heights = [table_height / num_rows] * num_rows

            return self._render_with_dimensions(
                pdf_canvas, table_data, table_bbox, page_height,
                col_widths, row_heights
            )
        except Exception as e:
            logger.exception("Cells dict rendering failed: %s", e)
            return False

    def render_cell_borders(
        self,
        pdf_canvas,
        cell_boxes: List[List[float]],
        table_bbox: Tuple[float, float, float, float],
        page_height: float,
        embedded_images: Optional[List] = None,
        output_dir: Optional[Path] = None
    ) -> bool:
        """
        Render table cell borders only (layered approach).

        This renders only the cell borders, not the text content.
        Text is typically rendered separately by GapFillingService.

        Args:
            pdf_canvas: ReportLab canvas
            cell_boxes: List of [x0, y0, x1, y1] for each cell
            table_bbox: Table bounding box
            page_height: PDF page height
            embedded_images: Optional list of images within cells
            output_dir: Directory for image files

        Returns:
            True if successful, False otherwise
        """
        try:
            if not cell_boxes:
                # Without cell geometry only the outer border can be drawn.
                return self._draw_table_border(
                    pdf_canvas, table_bbox, page_height
                )

            normalized_boxes = self.normalize_cell_boxes_to_grid(cell_boxes)
            self._stroke_cell_boxes(pdf_canvas, normalized_boxes, page_height)

            # Images are only drawn when there is a directory to resolve them in.
            if embedded_images and output_dir:
                for img_info in embedded_images:
                    self._draw_embedded_image(
                        pdf_canvas, img_info, page_height, output_dir
                    )

            return True
        except Exception as e:
            logger.exception("Cell borders rendering failed: %s", e)
            return False

    def render_with_translated_text(
        self,
        pdf_canvas,
        cells: List[Dict],
        cell_boxes: List,
        table_bbox: Tuple[float, float, float, float],
        page_height: float
    ) -> bool:
        """
        Render table with translated content and dynamic font sizing.

        ``cells[i]`` is drawn into ``cell_boxes[i]``; surplus cells or boxes
        are ignored. Each text is drawn on a single line, centered in its box.

        Args:
            pdf_canvas: ReportLab canvas
            cells: List of cell dicts with 'translated_content'
            cell_boxes: List of cell bounding boxes
            table_bbox: Table bounding box
            page_height: PDF page height

        Returns:
            True if successful, False otherwise
        """
        try:
            self._draw_table_border(pdf_canvas, table_bbox, page_height)

            if not cell_boxes:
                logger.warning("No cell boxes for translated table")
                return False

            normalized_boxes = self.normalize_cell_boxes_to_grid(cell_boxes)
            self._stroke_cell_boxes(pdf_canvas, normalized_boxes, page_height)

            font_name = self.config.font_name
            # Save/restore so the font chosen here does not leak to the caller.
            pdf_canvas.saveState()
            try:
                for cell, box in zip(cells, normalized_boxes):
                    if box is None or len(box) < 4:
                        continue

                    translated_text = cell.get('translated_content', '')
                    if not translated_text:
                        continue

                    x0, y0, x1, y1 = box[:4]
                    cell_width = x1 - x0
                    cell_height = y1 - y0

                    font_size = self._fit_text_to_cell(
                        pdf_canvas, translated_text, cell_width, cell_height
                    )
                    pdf_canvas.setFont(font_name, font_size)

                    text_width = pdf_canvas.stringWidth(
                        translated_text, font_name, font_size
                    )
                    text_x = x0 + (cell_width - text_width) / 2
                    # Vertical middle of the flipped box, nudged down by a
                    # third of the font size to approximate the baseline.
                    text_y = page_height - y0 - cell_height / 2 - font_size / 3

                    pdf_canvas.drawString(text_x, text_y, translated_text)
            finally:
                pdf_canvas.restoreState()

            return True
        except Exception as e:
            logger.exception("Translated table rendering failed: %s", e)
            return False

    # =========================================================================
    # Grid and Cell Box Helpers
    # =========================================================================

    def compute_grid_from_cell_boxes(
        self,
        cell_boxes: List,
        table_bbox: Tuple[float, float, float, float],
        num_rows: int,
        num_cols: int
    ) -> Tuple[Optional[List[float]], Optional[List[float]]]:
        """
        Calculate column widths and row heights from cell bounding boxes.

        Box edges are rounded to 0.1, de-duplicated, and edges closer than
        ``config.merge_boundary_threshold`` are merged; the gaps between the
        remaining edges are the widths/heights. ``num_rows``/``num_cols`` are
        only used to log a debug message on a large mismatch.

        Args:
            cell_boxes: List of [x0, y0, x1, y1] for each cell
            table_bbox: Table bounding box (currently not consulted)
            num_rows: Expected number of rows
            num_cols: Expected number of columns

        Returns:
            Tuple of (col_widths, row_heights) or (None, None) on failure
        """
        try:
            if not cell_boxes:
                return None, None

            valid_boxes = [b for b in cell_boxes if b is not None and len(b) >= 4]
            if not valid_boxes:
                return None, None

            x_boundaries = set()
            y_boundaries = set()
            for box in valid_boxes:
                x0, y0, x1, y1 = box[:4]
                x_boundaries.update((round(x0, 1), round(x1, 1)))
                y_boundaries.update((round(y0, 1), round(y1, 1)))

            merge_threshold = self.config.merge_boundary_threshold
            x_merged = self._merge_boundaries(sorted(x_boundaries), merge_threshold)
            y_merged = self._merge_boundaries(sorted(y_boundaries), merge_threshold)

            col_widths = [right - left for left, right in zip(x_merged, x_merged[1:])]
            row_heights = [bottom - top for top, bottom in zip(y_merged, y_merged[1:])]

            # Validate against expected dimensions (allow for merged cells)
            tolerance = max(num_cols, num_rows) // 2 + 1
            if abs(len(col_widths) - num_cols) > tolerance:
                logger.debug(f"Column count mismatch: {len(col_widths)} vs {num_cols}")
            if abs(len(row_heights) - num_rows) > tolerance:
                logger.debug(f"Row count mismatch: {len(row_heights)} vs {num_rows}")

            return (
                col_widths if col_widths else None,
                row_heights if row_heights else None,
            )
        except Exception as e:
            logger.error(f"Grid computation failed: {e}")
            return None, None

    def normalize_cell_boxes_to_grid(
        self,
        cell_boxes: List,
        threshold: Optional[float] = None
    ) -> List:
        """
        Snap cell boxes to aligned grid to eliminate coordinate variations.

        Coordinates that fall in the same cluster are replaced by the cluster
        average. Entries that are None or have fewer than four values are
        passed through unchanged, so the result keeps the input's indexing.

        Args:
            cell_boxes: List of [x0, y0, x1, y1] for each cell
            threshold: Clustering threshold (uses config default if None)

        Returns:
            Normalized cell boxes (the input itself if normalization fails)
        """
        # Compare against None so an explicit 0 (exact-match clustering only)
        # is honoured rather than replaced by the config default.
        if threshold is None:
            threshold = self.config.grid_threshold

        if not cell_boxes:
            return []

        try:
            all_x = []
            all_y = []
            for box in cell_boxes:
                if box is None or len(box) < 4:
                    continue
                x0, y0, x1, y1 = box[:4]
                all_x.extend([x0, x1])
                all_y.extend([y0, y1])

            if not all_x or not all_y:
                return cell_boxes

            x_clusters = self._cluster_values(sorted(all_x), threshold)
            y_clusters = self._cluster_values(sorted(all_y), threshold)

            # Map every raw coordinate to the average of its cluster.
            x_map = {v: avg for avg, values in x_clusters for v in values}
            y_map = {v: avg for avg, values in y_clusters for v in values}

            normalized = []
            for box in cell_boxes:
                if box is None or len(box) < 4:
                    normalized.append(box)
                    continue
                x0, y0, x1, y1 = box[:4]
                normalized.append([
                    x_map.get(x0, x0),
                    y_map.get(y0, y0),
                    x_map.get(x1, x1),
                    y_map.get(y1, y1)
                ])

            return normalized
        except Exception as e:
            logger.error(f"Cell box normalization failed: {e}")
            return cell_boxes

    # =========================================================================
    # Private Helper Methods
    # =========================================================================

    @staticmethod
    def _cell_span(cell: Dict, key: str) -> int:
        """Return a cell's span for *key*; missing, None or non-positive means 1."""
        value = cell.get(key) or 1
        return value if value > 0 else 1

    def _count_grid_columns(self, rows: List[Dict]) -> int:
        """Return the grid width implied by sequentially placed, spanning cells.

        Mirrors the placement done in ``_render_parsed_table``: cells fill a
        row left to right, skipping columns still covered by a rowspan from
        an earlier row, and each cell advances by its colspan.
        """
        widest = 0
        covered: Dict[int, int] = {}  # column -> rows (incl. current) still covered
        for row in rows:
            widest = max(widest, max(covered, default=-1) + 1)
            covered_next = {c: n - 1 for c, n in covered.items() if n > 1}
            col = 0
            for cell in row.get('cells', []):
                while col in covered:
                    col += 1
                colspan = self._cell_span(cell, 'colspan')
                rowspan = self._cell_span(cell, 'rowspan')
                if rowspan > 1:
                    for c in range(col, col + colspan):
                        covered_next[c] = max(covered_next.get(c, 0), rowspan - 1)
                col += colspan
            widest = max(widest, col)
            covered = covered_next
        return widest

    @staticmethod
    def _mark_span(
        occupied: List[List[bool]],
        span_commands: List[Tuple],
        row_idx: int,
        col_idx: int,
        colspan: int,
        rowspan: int
    ) -> None:
        """Mark the grid slots a cell covers and record a SPAN command if merged.

        Spans are clamped to the grid so an oversized span cannot index
        outside ``occupied``.
        """
        if colspan > 1 or rowspan > 1:
            end_col = min(col_idx + colspan - 1, len(occupied[0]) - 1)
            end_row = min(row_idx + rowspan - 1, len(occupied) - 1)
            span_commands.append(
                ('SPAN', (col_idx, row_idx), (end_col, end_row))
            )
            for r in range(row_idx, end_row + 1):
                for c in range(col_idx, end_col + 1):
                    occupied[r][c] = True
        else:
            occupied[row_idx][col_idx] = True

    @staticmethod
    def _to_paragraph_grid(grid: List[List[str]], style: ParagraphStyle) -> List[List[Any]]:
        """Wrap non-empty cell texts in Paragraphs; empty cells stay ''.

        Paragraph parses its text as XML-like markup, so literal '&', '<'
        and '>' in cell text are escaped first.
        """
        return [
            [Paragraph(xml_escape(text), style) if text else '' for text in row]
            for row in grid
        ]

    def _stroke_cell_boxes(self, pdf_canvas, boxes: List, page_height: float) -> None:
        """Stroke a rectangle for every usable box, flipping Y to PDF space.

        Entries that are None or have fewer than four values are skipped.
        The canvas state is restored even if drawing raises.
        """
        pdf_canvas.saveState()
        try:
            pdf_canvas.setStrokeColor(self.config.border_color)
            pdf_canvas.setLineWidth(self.config.border_width)
            for box in boxes:
                if box is None or len(box) < 4:
                    continue
                x0, y0, x1, y1 = box[:4]
                # Input Y grows downward; PDF Y grows upward.
                pdf_canvas.rect(x0, page_height - y1, x1 - x0, y1 - y0)
        finally:
            pdf_canvas.restoreState()

    def _render_parsed_table(
        self,
        pdf_canvas,
        table_data: Dict,
        table_bbox: Tuple[float, float, float, float],
        page_height: float,
        scale_w: float = 1.0,
        scale_h: float = 1.0
    ) -> bool:
        """Render a parsed table structure.

        Cells are placed left to right in each row, honouring colspan and
        rowspan; columns and rows share the (scaled) bbox evenly.

        Returns:
            True when the table was drawn, False when there is nothing to draw.
        """
        rows = table_data.get('rows', [])
        if not rows:
            return False

        num_rows = len(rows)
        # Count columns from the spans, not from the number of <td> elements,
        # so rows made of merged cells do not shrink the grid.
        num_cols = self._count_grid_columns(rows)
        if num_cols == 0:
            return False

        # Track occupied cells for rowspan handling
        occupied = [[False] * num_cols for _ in range(num_rows)]
        grid = []
        span_commands = []

        for row_idx, row in enumerate(rows):
            grid_row = [''] * num_cols
            col_idx = 0

            for cell in row.get('cells', []):
                # Skip slots already covered by an earlier merged cell
                while col_idx < num_cols and occupied[row_idx][col_idx]:
                    col_idx += 1
                if col_idx >= num_cols:
                    break

                colspan = self._cell_span(cell, 'colspan')
                rowspan = self._cell_span(cell, 'rowspan')

                grid_row[col_idx] = cell.get('text', '').strip()
                self._mark_span(
                    occupied, span_commands, row_idx, col_idx, colspan, rowspan
                )
                col_idx += colspan

            grid.append(grid_row)

        x0, y0, x1, y1 = table_bbox
        table_width = (x1 - x0) * scale_w
        table_height = (y1 - y0) * scale_h

        col_widths = [table_width / num_cols] * num_cols
        row_heights = [table_height / num_rows] * num_rows

        style = ParagraphStyle(
            'TableCell',
            fontName=self.config.font_name,
            fontSize=self.config.font_size,
            alignment=TA_CENTER,
            leading=self.config.font_size * 1.2
        )
        para_grid = self._to_paragraph_grid(grid, style)

        table_style_commands = [
            ('GRID', (0, 0), (-1, -1), self.config.border_width, self.config.border_color),
            ('VALIGN', (0, 0), (-1, -1), self.config.vertical_align),
            ('ALIGN', (0, 0), (-1, -1), self.config.horizontal_align),
            ('LEFTPADDING', (0, 0), (-1, -1), self.config.left_padding),
            ('RIGHTPADDING', (0, 0), (-1, -1), self.config.right_padding),
            ('TOPPADDING', (0, 0), (-1, -1), self.config.top_padding),
            ('BOTTOMPADDING', (0, 0), (-1, -1), self.config.bottom_padding),
            ('FONTNAME', (0, 0), (-1, -1), self.config.font_name),
            ('FONTSIZE', (0, 0), (-1, -1), self.config.font_size),
        ]
        table_style_commands.extend(span_commands)

        table = Table(para_grid, colWidths=col_widths, rowHeights=row_heights)
        table.setStyle(TableStyle(table_style_commands))

        # drawOn anchors the table's bottom-left corner, hence the flip on y1.
        pdf_x = x0
        pdf_y = page_height - y1

        table.wrapOn(pdf_canvas, table_width, table_height)
        table.drawOn(pdf_canvas, pdf_x, pdf_y)

        return True

    def _render_with_dimensions(
        self,
        pdf_canvas,
        table_data: Dict,
        table_bbox: Tuple[float, float, float, float],
        page_height: float,
        col_widths: Optional[List[float]],
        row_heights: Optional[List[float]]
    ) -> bool:
        """Render table with specified dimensions.

        Cells are placed at their 'col' index (shifted right past slots that
        are already taken). ``col_widths``/``row_heights`` are used only when
        their length matches the grid; otherwise, or when None/empty, the
        bbox is divided evenly.

        Returns:
            True when the table was drawn, False when there is nothing to draw.
        """
        rows = table_data.get('rows', [])
        if not rows:
            return False

        num_rows = len(rows)
        # The grid must be wide enough both for the busiest row and for the
        # right-most explicitly positioned (possibly spanning) cell.
        num_cols = 0
        for row in rows:
            row_cells = row.get('cells', [])
            num_cols = max(num_cols, len(row_cells))
            for cell in row_cells:
                right_edge = (cell.get('col') or 0) + self._cell_span(cell, 'colspan')
                num_cols = max(num_cols, right_edge)
        if num_cols == 0:
            return False

        x0, y0, x1, y1 = table_bbox
        if not col_widths or len(col_widths) != num_cols:
            col_widths = [(x1 - x0) / num_cols] * num_cols
        if not row_heights or len(row_heights) != num_rows:
            row_heights = [(y1 - y0) / num_rows] * num_rows

        grid = []
        span_commands = []
        occupied = [[False] * num_cols for _ in range(num_rows)]

        for row_idx, row in enumerate(rows):
            grid_row = [''] * num_cols

            for cell in row.get('cells', []):
                col_idx = cell.get('col') or 0

                # Move right past taken slots; drop the cell if none is left
                while col_idx < num_cols and occupied[row_idx][col_idx]:
                    col_idx += 1
                if col_idx >= num_cols:
                    continue

                colspan = self._cell_span(cell, 'colspan')
                rowspan = self._cell_span(cell, 'rowspan')

                grid_row[col_idx] = cell.get('text', '').strip()
                self._mark_span(
                    occupied, span_commands, row_idx, col_idx, colspan, rowspan
                )

            grid.append(grid_row)

        style = ParagraphStyle(
            'TableCell',
            fontName=self.config.font_name,
            fontSize=self.config.font_size,
            alignment=TA_CENTER
        )
        para_grid = self._to_paragraph_grid(grid, style)

        table_style_commands = [
            ('GRID', (0, 0), (-1, -1), self.config.border_width, self.config.border_color),
            ('VALIGN', (0, 0), (-1, -1), self.config.vertical_align),
            ('LEFTPADDING', (0, 0), (-1, -1), 0),
            ('RIGHTPADDING', (0, 0), (-1, -1), 0),
            ('TOPPADDING', (0, 0), (-1, -1), 0),
            ('BOTTOMPADDING', (0, 0), (-1, -1), 1),
        ]
        table_style_commands.extend(span_commands)

        table = Table(para_grid, colWidths=col_widths, rowHeights=row_heights)
        table.setStyle(TableStyle(table_style_commands))

        pdf_x = x0
        pdf_y = page_height - y1

        table.wrapOn(pdf_canvas, x1 - x0, y1 - y0)
        table.drawOn(pdf_canvas, pdf_x, pdf_y)

        return True

    def _build_rows_from_cells_dict(self, cells_dict: Dict) -> List[Dict]:
        """Convert Direct track cell structure to row format.

        Reads 'row', 'col', 'content', 'col_span', 'row_span' and 'is_header'
        from each entry of ``cells_dict['cells']``. List content is joined
        with newlines. One row dict is produced per row index, with its cells
        ordered by column; rows without cells get an empty 'cells' list.

        Returns:
            List of ``{'cells': [...]}`` dicts, or [] when there are no cells.
        """
        cells = cells_dict.get('cells', [])
        if not cells:
            return []

        rows_data: Dict[int, List[Dict]] = {}
        for cell in cells:
            rows_data.setdefault(cell.get('row') or 0, []).append(cell)

        # Without a usable 'rows' count, size the table from the cells
        # themselves instead of producing no rows at all.
        num_rows = cells_dict.get('rows') or (max(rows_data) + 1)

        rows = []
        for row_idx in range(num_rows):
            row_cells = sorted(
                rows_data.get(row_idx, []), key=lambda c: c.get('col') or 0
            )

            formatted_cells = []
            for cell in row_cells:
                content = cell.get('content', '')
                if isinstance(content, list):
                    content = '\n'.join(str(c) for c in content)

                formatted_cells.append({
                    'text': str(content) if content else '',
                    'colspan': cell.get('col_span') or 1,
                    'rowspan': cell.get('row_span') or 1,
                    'col': cell.get('col') or 0,
                    'is_header': cell.get('is_header', False)
                })

            rows.append({'cells': formatted_cells})

        return rows

    def _draw_table_border(
        self,
        pdf_canvas,
        table_bbox: Tuple[float, float, float, float],
        page_height: float
    ) -> bool:
        """Draw outer table border.

        Returns:
            True if the rectangle was drawn, False if drawing raised.
        """
        try:
            x0, y0, x1, y1 = table_bbox
            pdf_y0 = page_height - y1
            pdf_y1 = page_height - y0

            pdf_canvas.saveState()
            try:
                pdf_canvas.setStrokeColor(self.config.border_color)
                pdf_canvas.setLineWidth(self.config.border_width)
                pdf_canvas.rect(x0, pdf_y0, x1 - x0, pdf_y1 - pdf_y0)
            finally:
                # Keep the canvas state stack balanced even on failure.
                pdf_canvas.restoreState()

            return True
        except Exception as e:
            logger.error(f"Failed to draw table border: {e}")
            return False

    def _draw_embedded_image(
        self,
        pdf_canvas,
        img_info: Dict,
        page_height: float,
        output_dir: Path
    ) -> bool:
        """Draw an image embedded within a table cell.

        ``img_info`` supplies 'path' (relative paths are resolved against
        ``output_dir``) and 'bbox' with 'x0', 'y0', 'width', 'height'
        (defaults 0, 0, 100, 100).

        Returns:
            True if the image was drawn, False if it is missing or drawing failed.
        """
        try:
            img_path = img_info.get('path')
            if not img_path:
                return False

            if not Path(img_path).is_absolute():
                img_path = output_dir / img_path

            if not Path(img_path).exists():
                logger.warning(f"Embedded image not found: {img_path}")
                return False

            bbox = img_info.get('bbox', {})
            x0 = bbox.get('x0', 0)
            y0 = bbox.get('y0', 0)
            width = bbox.get('width', 100)
            height = bbox.get('height', 100)

            # y0 is the image's top edge in top-down coordinates; drawImage
            # expects the bottom-left corner in PDF coordinates.
            pdf_y = page_height - y0 - height

            img = ImageReader(str(img_path))
            pdf_canvas.drawImage(img, x0, pdf_y, width, height)

            return True
        except Exception as e:
            logger.error(f"Failed to draw embedded image: {e}")
            return False

    def _fit_text_to_cell(
        self,
        pdf_canvas,
        text: str,
        cell_width: float,
        cell_height: float
    ) -> int:
        """Find font size that fits text in cell.

        Tries sizes from ``config.max_font_size`` down to
        ``config.min_font_size`` and returns the first whose single-line
        width fits; falls back to the minimum size. Only the width is
        checked; ``cell_height`` is currently not consulted.
        """
        for size in range(self.config.max_font_size, self.config.min_font_size - 1, -1):
            text_width = pdf_canvas.stringWidth(text, self.config.font_name, size)
            if text_width <= cell_width - 6:  # 3pt padding each side
                return size
        return self.config.min_font_size

    def _merge_boundaries(self, values: List[float], threshold: float) -> List[float]:
        """Merge nearby boundary values.

        ``values`` must be sorted ascending. A value is kept only if it is
        more than ``threshold`` away from the previously kept value.
        """
        if not values:
            return []

        merged = [values[0]]
        for v in values[1:]:
            if abs(v - merged[-1]) > threshold:
                merged.append(v)
        return merged

    def _cluster_values(self, values: List[float], threshold: float) -> List[Tuple[float, List[float]]]:
        """Cluster nearby values and return (average, members) pairs.

        ``values`` must be sorted ascending. A value joins the current
        cluster when it is within ``threshold`` of that cluster's last member.
        """
        if not values:
            return []

        clusters = []
        current_cluster = [values[0]]

        for v in values[1:]:
            if abs(v - current_cluster[-1]) <= threshold:
                current_cluster.append(v)
            else:
                avg = sum(current_cluster) / len(current_cluster)
                clusters.append((avg, current_cluster))
                current_cluster = [v]

        if current_cluster:
            avg = sum(current_cluster) / len(current_cluster)
            clusters.append((avg, current_cluster))

        return clusters