"""
PDF Table Renderer - Handles table rendering for PDF generation.

This module provides unified table rendering capabilities extracted from
PDFGeneratorService, supporting multiple input formats:
- HTML tables
- Cell boxes (layered approach)
- Cells dictionary (Direct track)
- TableData objects
"""

import logging
from dataclasses import dataclass, field
from html.parser import HTMLParser
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, Union
from xml.sax.saxutils import escape as _xml_escape

from reportlab.lib import colors
from reportlab.lib.enums import TA_CENTER, TA_LEFT, TA_RIGHT
from reportlab.lib.styles import ParagraphStyle
from reportlab.lib.utils import ImageReader
from reportlab.platypus import Paragraph, Table, TableStyle

logger = logging.getLogger(__name__)


def _to_paragraph_markup(text: str, keep_line_breaks: bool = False) -> str:
    """
    Convert plain cell text into markup that is safe for ``Paragraph``.

    ``Paragraph`` interprets its text as XML-like markup, so literal ``&``,
    ``<`` and ``>`` characters must be escaped before being handed over.

    Args:
        text: Plain (already unescaped) cell text
        keep_line_breaks: When True, newline characters are turned into
            ``<br/>`` so that multi-line content stays on separate lines

    Returns:
        Escaped markup string
    """
    markup = _xml_escape(text)
    if keep_line_breaks:
        markup = markup.replace('\n', '<br/>')
    return markup


# ============================================================================
# Cell Box Grid Inferrer
# ============================================================================

class CellBoxGridInferrer:
    """
    Infer table grid structure from cell_boxes coordinates.

    This class clusters cell_boxes by Y-coordinate (rows) and X-coordinate
    (columns) to determine the grid structure, regardless of HTML
    colspan/rowspan.
    """

    def __init__(
        self,
        row_threshold: float = 15.0,
        col_threshold: float = 15.0
    ):
        """
        Initialize grid inferrer.

        Args:
            row_threshold: Y-coordinate threshold for row clustering
            col_threshold: X-coordinate threshold for column clustering
        """
        self.row_threshold = row_threshold
        self.col_threshold = col_threshold

    def infer_grid(
        self,
        cell_boxes: List[List[float]]
    ) -> Optional[Dict]:
        """
        Infer grid structure from cell_boxes.

        Args:
            cell_boxes: List of [x0, y0, x1, y1] coordinates

        Returns:
            Dict with 'grid', 'num_rows', 'num_cols', 'row_boundaries',
            'col_boundaries', 'row_heights', 'col_widths',
            or None if inference fails
        """
        if not cell_boxes:
            return None

        try:
            # Filter valid boxes
            valid_boxes = [
                b for b in cell_boxes
                if b is not None and len(b) >= 4
            ]
            if not valid_boxes:
                return None

            # Cluster the distinct Y edges (y0/y1) to determine rows
            all_y = sorted({y for b in valid_boxes for y in (b[1], b[3])})
            y_boundaries = self._cluster_to_boundaries(all_y, self.row_threshold)

            # Cluster the distinct X edges (x0/x1) to determine columns
            all_x = sorted({x for b in valid_boxes for x in (b[0], b[2])})
            x_boundaries = self._cluster_to_boundaries(all_x, self.col_threshold)

            # At least two boundaries per axis are needed to form one cell
            if len(y_boundaries) < 2 or len(x_boundaries) < 2:
                return None

            num_rows = len(y_boundaries) - 1
            num_cols = len(x_boundaries) - 1

            # Build grid: map (row, col) -> cell_box info.
            # A box is assigned to the grid cell that contains its center;
            # when several boxes land in the same cell the last one wins.
            grid = {}
            for idx, box in enumerate(valid_boxes):
                x0, y0, x1, y1 = box[:4]

                row = self._find_position((y0 + y1) / 2, y_boundaries)
                col = self._find_position((x0 + x1) / 2, x_boundaries)

                if row is not None and col is not None:
                    grid[(row, col)] = {
                        'bbox': box,
                        'index': idx,
                        'content': ''
                    }

            # Calculate row heights and column widths
            row_heights = [
                y_boundaries[i + 1] - y_boundaries[i]
                for i in range(num_rows)
            ]
            col_widths = [
                x_boundaries[i + 1] - x_boundaries[i]
                for i in range(num_cols)
            ]

            return {
                'grid': grid,
                'num_rows': num_rows,
                'num_cols': num_cols,
                'row_boundaries': y_boundaries,
                'col_boundaries': x_boundaries,
                'row_heights': row_heights,
                'col_widths': col_widths
            }

        except Exception as e:
            logger.error(f"Grid inference failed: {e}")
            return None

    def _cluster_to_boundaries(
        self,
        values: List[float],
        threshold: float
    ) -> List[float]:
        """
        Cluster nearby values and return representative boundaries.

        Consecutive values whose gap is at most ``threshold`` belong to the
        same cluster; each cluster is represented by its average.

        Args:
            values: Sorted list of coordinate values
            threshold: Clustering threshold

        Returns:
            List of boundary values (cluster representatives)
        """
        if not values:
            return []

        boundaries = [values[0]]
        current_cluster = [values[0]]

        for v in values[1:]:
            if v - current_cluster[-1] <= threshold:
                current_cluster.append(v)
            else:
                # Finish current cluster, use average as boundary
                boundaries[-1] = sum(current_cluster) / len(current_cluster)
                boundaries.append(v)
                current_cluster = [v]

        # Finish last cluster
        if current_cluster:
            boundaries[-1] = sum(current_cluster) / len(current_cluster)

        return boundaries

    def _find_position(
        self,
        value: float,
        boundaries: List[float]
    ) -> Optional[int]:
        """
        Find which interval a value falls into.

        Args:
            value: Coordinate value
            boundaries: List of boundary values

        Returns:
            Index of interval, or None if out of bounds
        """
        # Exact containment first (a value on a shared edge goes to the
        # lower interval because both ends are inclusive)
        for i in range(len(boundaries) - 1):
            if boundaries[i] <= value <= boundaries[i + 1]:
                return i

        # Otherwise accept the first interval whose midpoint is closer than
        # one interval width
        for i in range(len(boundaries) - 1):
            mid = (boundaries[i] + boundaries[i + 1]) / 2
            if abs(value - mid) < (boundaries[i + 1] - boundaries[i]):
                return i

        return None


def extract_cell_contents_from_html(html: str) -> List[str]:
    """
    Extract cell text contents from HTML in reading order.

    Only the first table found in the HTML is used.

    Args:
        html: HTML table string

    Returns:
        List of text strings, one per cell (empty list on failure)
    """
    try:
        parser = HTMLTableParser()
        parser.feed(html)

        if not parser.tables:
            return []

        return [
            cell.get('text', '').strip()
            for row in parser.tables[0].get('rows', [])
            for cell in row.get('cells', [])
        ]

    except Exception as e:
        logger.error(f"HTML content extraction failed: {e}")
        return []


def map_content_to_grid(
    grid: Dict[Tuple[int, int], Dict],
    contents: List[str],
    num_rows: int,
    num_cols: int
) -> Dict[Tuple[int, int], Dict]:
    """
    Map extracted content to grid cells row by row.

    The grid is updated in place and also returned.

    Args:
        grid: Dict mapping (row, col) to cell info
        contents: List of text contents from HTML
        num_rows: Number of rows in grid
        num_cols: Number of columns in grid

    Returns:
        Updated grid with content assigned
    """
    content_idx = 0

    for row in range(num_rows):
        for col in range(num_cols):
            if (row, col) not in grid:
                continue
            if content_idx < len(contents):
                grid[(row, col)]['content'] = contents[content_idx]
                content_idx += 1
            else:
                # Ran out of HTML cells: leave remaining grid cells empty
                grid[(row, col)]['content'] = ''

    # Log if there's a significant mismatch
    if content_idx < len(contents):
        logger.debug(
            f"Content mismatch: {len(contents)} HTML cells, "
            f"only {content_idx} mapped to {len(grid)} grid cells"
        )

    return grid


# ============================================================================
# Configuration
# ============================================================================

@dataclass
class TableRenderConfig:
    """Configuration for table rendering."""
    font_name: str = "Helvetica"
    font_size: int = 8
    min_font_size: int = 6
    max_font_size: int = 10

    # Padding options
    left_padding: int = 2
    right_padding: int = 2
    top_padding: int = 2
    bottom_padding: int = 2

    # Border options
    border_color: Any = colors.black
    border_width: float = 0.5

    # Alignment
    horizontal_align: str = "CENTER"
    vertical_align: str = "MIDDLE"

    # Header styling
    header_background: Any = colors.lightgrey

    # Grid normalization threshold
    grid_threshold: float = 10.0

    # Merged cells threshold
    merge_boundary_threshold: float = 5.0


# ============================================================================
# HTML Table Parser
# ============================================================================

class HTMLTableParser(HTMLParser):
    """
    Parse HTML table structure for rendering.

    Extracts table rows, cells, and merged cell information
    (colspan/rowspan) from HTML table markup.
    """

    def __init__(self):
        super().__init__()
        self.tables = []
        self.current_table = None
        self.current_row = None
        self.current_cell = None
        self.in_cell = False

    @staticmethod
    def _parse_span(raw: Any) -> int:
        """
        Parse a colspan/rowspan attribute value.

        Missing values, valueless attributes (None), non-numeric strings and
        values below 1 all fall back to 1 so that one malformed attribute
        does not abort parsing of the whole table.
        """
        try:
            return max(int(raw), 1)
        except (TypeError, ValueError):
            return 1

    def handle_starttag(self, tag: str, attrs: List[Tuple[str, Optional[str]]]):
        if tag == 'table':
            self.current_table = {'rows': []}
        elif tag == 'tr':
            self.current_row = {'cells': []}
        elif tag in ('td', 'th'):
            # Extract colspan and rowspan attributes
            attrs_dict = dict(attrs)
            self.current_cell = {
                'text': '',
                'is_header': tag == 'th',
                'colspan': self._parse_span(attrs_dict.get('colspan', 1)),
                'rowspan': self._parse_span(attrs_dict.get('rowspan', 1))
            }
            self.in_cell = True

    def handle_endtag(self, tag: str):
        if tag == 'table' and self.current_table:
            self.tables.append(self.current_table)
            self.current_table = None
        elif tag == 'tr' and self.current_row:
            if self.current_table:
                self.current_table['rows'].append(self.current_row)
            self.current_row = None
        elif tag in ('td', 'th') and self.current_cell:
            if self.current_row:
                self.current_row['cells'].append(self.current_cell)
            self.current_cell = None
            self.in_cell = False

    def handle_data(self, data: str):
        if self.in_cell and self.current_cell is not None:
            self.current_cell['text'] += data


# ============================================================================
# Table Renderer
# ============================================================================

class TableRenderer:
    """
    Unified table rendering engine for PDF generation.

    Supports multiple input formats and rendering modes:
    - HTML table parsing and rendering
    - Cell boxes rendering (layered approach)
    - Direct track cells dictionary
    - Translated content with dynamic font sizing
    """

    def __init__(self, config: Optional[TableRenderConfig] = None):
        """
        Initialize TableRenderer with configuration.

        Args:
            config: TableRenderConfig instance (uses defaults if None)
        """
        self.config = config or TableRenderConfig()

    def render_from_html(
        self,
        pdf_canvas,
        html_content: str,
        table_bbox: Tuple[float, float, float, float],
        page_height: float,
        scale_w: float = 1.0,
        scale_h: float = 1.0
    ) -> bool:
        """
        Parse HTML and render table to PDF canvas.

        Only the first table found in the HTML is rendered.

        Args:
            pdf_canvas: ReportLab canvas
            html_content: HTML table string
            table_bbox: (x0, y0, x1, y1) bounding box
            page_height: PDF page height for Y coordinate flip
            scale_w: Horizontal scale factor
            scale_h: Vertical scale factor

        Returns:
            True if successful, False otherwise
        """
        try:
            # Parse HTML
            parser = HTMLTableParser()
            parser.feed(html_content)

            if not parser.tables:
                logger.warning("No tables found in HTML content")
                return False

            return self._render_parsed_table(
                pdf_canvas, parser.tables[0], table_bbox, page_height,
                scale_w, scale_h
            )

        except Exception as e:
            logger.error(f"HTML table rendering failed: {e}", exc_info=True)
            return False

    def render_from_cells_dict(
        self,
        pdf_canvas,
        cells_dict: Dict,
        table_bbox: Tuple[float, float, float, float],
        page_height: float,
        cell_boxes: Optional[List] = None
    ) -> bool:
        """
        Render table from Direct track cell structure.

        Args:
            pdf_canvas: ReportLab canvas
            cells_dict: Dict with 'rows', 'cols', 'cells' keys
            table_bbox: (x0, y0, x1, y1) bounding box
            page_height: PDF page height
            cell_boxes: Optional precomputed cell boxes

        Returns:
            True if successful, False otherwise
        """
        try:
            # Convert cells dict to row format
            rows = self._build_rows_from_cells_dict(cells_dict)

            if not rows:
                logger.warning("No rows built from cells dict")
                return False

            # Build table data structure
            table_data = {'rows': rows}

            # Calculate dimensions
            x0, y0, x1, y1 = table_bbox
            table_width = (x1 - x0)
            table_height = (y1 - y0)

            # Determine grid dimensions; a missing or zero declaration falls
            # back to what was actually built (never 0, to avoid dividing
            # by zero below)
            num_rows = cells_dict.get('rows') or len(rows)
            num_cols = (
                cells_dict.get('cols')
                or max((len(row['cells']) for row in rows), default=0)
                or 1
            )

            # Calculate column widths and row heights
            col_widths = None
            row_heights = None
            if cell_boxes:
                col_widths, row_heights = self.compute_grid_from_cell_boxes(
                    cell_boxes, table_bbox, num_rows, num_cols
                )

            # Uniform grid when no boxes were given or inference failed
            if not col_widths:
                col_widths = [table_width / num_cols] * num_cols
            if not row_heights:
                row_heights = [table_height / num_rows] * num_rows

            return self._render_with_dimensions(
                pdf_canvas, table_data, table_bbox, page_height,
                col_widths, row_heights
            )

        except Exception as e:
            logger.error(f"Cells dict rendering failed: {e}", exc_info=True)
            return False

    def render_cell_borders(
        self,
        pdf_canvas,
        cell_boxes: List[List[float]],
        table_bbox: Tuple[float, float, float, float],
        page_height: float,
        embedded_images: Optional[List] = None,
        output_dir: Optional[Path] = None
    ) -> bool:
        """
        Render table cell borders only (layered approach).

        This renders only the cell borders, not the text content.
        Text is typically rendered separately by GapFillingService.

        Args:
            pdf_canvas: ReportLab canvas
            cell_boxes: List of [x0, y0, x1, y1] for each cell
            table_bbox: Table bounding box
            page_height: PDF page height
            embedded_images: Optional list of images within cells
            output_dir: Directory for image files

        Returns:
            True if successful, False otherwise
        """
        try:
            if not cell_boxes:
                # Draw outer border only
                return self._draw_table_border(
                    pdf_canvas, table_bbox, page_height
                )

            # Normalize cell boxes to grid
            normalized_boxes = self.normalize_cell_boxes_to_grid(cell_boxes)

            # Draw each cell border
            pdf_canvas.saveState()
            try:
                pdf_canvas.setStrokeColor(self.config.border_color)
                pdf_canvas.setLineWidth(self.config.border_width)

                for box in normalized_boxes:
                    # Normalization passes invalid boxes through untouched
                    if box is None or len(box) < 4:
                        continue

                    x0, y0, x1, y1 = box[:4]

                    # Convert to PDF coordinates (flip Y)
                    pdf_x0 = x0
                    pdf_y0 = page_height - y1
                    pdf_x1 = x1
                    pdf_y1 = page_height - y0

                    # Draw cell rectangle
                    pdf_canvas.rect(pdf_x0, pdf_y0, pdf_x1 - pdf_x0, pdf_y1 - pdf_y0)
            finally:
                # Keep the canvas state stack balanced even if drawing fails
                pdf_canvas.restoreState()

            # Draw embedded images if any
            if embedded_images and output_dir:
                for img_info in embedded_images:
                    self._draw_embedded_image(
                        pdf_canvas, img_info, page_height, output_dir
                    )

            return True

        except Exception as e:
            logger.error(f"Cell borders rendering failed: {e}", exc_info=True)
            return False

    def render_with_translated_text(
        self,
        pdf_canvas,
        cells: List[Dict],
        cell_boxes: List,
        table_bbox: Tuple[float, float, float, float],
        page_height: float
    ) -> bool:
        """
        Render table with translated content and dynamic font sizing.

        ``cells[i]`` is drawn inside ``cell_boxes[i]``; surplus cells are
        ignored.

        Args:
            pdf_canvas: ReportLab canvas
            cells: List of cell dicts with 'translated_content'
            cell_boxes: List of cell bounding boxes
            table_bbox: Table bounding box
            page_height: PDF page height

        Returns:
            True if successful, False otherwise
        """
        try:
            # Draw outer border
            self._draw_table_border(pdf_canvas, table_bbox, page_height)

            # Normalize cell boxes
            if cell_boxes:
                normalized_boxes = self.normalize_cell_boxes_to_grid(cell_boxes)
            else:
                logger.warning("No cell boxes for translated table")
                return False

            pdf_canvas.saveState()
            try:
                pdf_canvas.setStrokeColor(self.config.border_color)
                pdf_canvas.setLineWidth(self.config.border_width)

                # Draw cell borders
                for box in normalized_boxes:
                    if box is None or len(box) < 4:
                        continue
                    x0, y0, x1, y1 = box[:4]
                    pdf_y0 = page_height - y1
                    pdf_canvas.rect(x0, pdf_y0, x1 - x0, y1 - y0)

                # Render text in cells with dynamic font sizing
                for cell, box in zip(cells, normalized_boxes):
                    if box is None or len(box) < 4:
                        continue

                    translated_text = cell.get('translated_content', '')
                    if not translated_text:
                        continue

                    x0, y0, x1, y1 = box[:4]
                    cell_width = x1 - x0
                    cell_height = y1 - y0

                    # Find appropriate font size
                    font_size = self._fit_text_to_cell(
                        pdf_canvas, translated_text, cell_width, cell_height
                    )

                    # Render centered text
                    pdf_canvas.setFont(self.config.font_name, font_size)

                    # Calculate text position (centered)
                    text_width = pdf_canvas.stringWidth(
                        translated_text, self.config.font_name, font_size
                    )
                    text_x = x0 + (cell_width - text_width) / 2
                    # font_size / 3 approximates the baseline offset needed
                    # to center the glyphs vertically
                    text_y = page_height - y0 - cell_height / 2 - font_size / 3

                    pdf_canvas.drawString(text_x, text_y, translated_text)
            finally:
                pdf_canvas.restoreState()

            return True

        except Exception as e:
            logger.error(f"Translated table rendering failed: {e}", exc_info=True)
            return False

    def render_from_cellboxes_grid(
        self,
        pdf_canvas,
        cell_boxes: List[List[float]],
        html_content: str,
        table_bbox: Tuple[float, float, float, float],
        page_height: float,
        scale_w: float = 1.0,
        scale_h: float = 1.0,
        row_threshold: float = 15.0,
        col_threshold: float = 15.0
    ) -> bool:
        """
        Render table using cell_boxes as the primary structure source.

        This method infers grid structure from cell_boxes coordinates and
        maps HTML content to cells, regardless of HTML colspan/rowspan.

        Args:
            pdf_canvas: ReportLab canvas
            cell_boxes: List of [x0, y0, x1, y1] for each cell
            html_content: HTML table string (for text content)
            table_bbox: Table bounding box
            page_height: PDF page height
            scale_w: Horizontal scale factor
            scale_h: Vertical scale factor
            row_threshold: Y-coordinate threshold for row clustering
            col_threshold: X-coordinate threshold for column clustering

        Returns:
            True if successful, False otherwise
        """
        try:
            if not cell_boxes:
                logger.debug("No cell_boxes provided for grid rendering")
                return False

            # Infer grid structure from cell_boxes
            inferrer = CellBoxGridInferrer(
                row_threshold=row_threshold,
                col_threshold=col_threshold
            )
            grid_info = inferrer.infer_grid(cell_boxes)

            if not grid_info:
                logger.debug("Failed to infer grid from cell_boxes")
                return False

            grid = grid_info['grid']
            num_rows = grid_info['num_rows']
            num_cols = grid_info['num_cols']
            row_boundaries = grid_info['row_boundaries']
            col_boundaries = grid_info['col_boundaries']

            logger.info(
                f"[TABLE] CellBoxes grid inferred: {num_rows} rows x {num_cols} cols "
                f"from {len(cell_boxes)} cell_boxes"
            )

            # Extract content from HTML
            if html_content:
                contents = extract_cell_contents_from_html(html_content)
                grid = map_content_to_grid(grid, contents, num_rows, num_cols)
                logger.debug(f"[TABLE] Mapped {len(contents)} HTML cells to grid")

            # Apply scale factors to boundaries
            scaled_row_boundaries = [y * scale_h for y in row_boundaries]
            scaled_col_boundaries = [x * scale_w for x in col_boundaries]

            # Draw cell borders and content
            pdf_canvas.saveState()
            try:
                pdf_canvas.setStrokeColor(self.config.border_color)
                pdf_canvas.setLineWidth(self.config.border_width)

                for row in range(num_rows):
                    for col in range(num_cols):
                        # Cell boundaries (there is always one more boundary
                        # than there are rows/columns)
                        x0 = scaled_col_boundaries[col]
                        x1 = scaled_col_boundaries[col + 1]
                        y0 = scaled_row_boundaries[row]
                        y1 = scaled_row_boundaries[row + 1]

                        # Convert to PDF coordinates (flip Y)
                        pdf_x0 = x0
                        pdf_y0 = page_height - y1
                        pdf_x1 = x1
                        pdf_y1 = page_height - y0

                        cell_width = pdf_x1 - pdf_x0
                        cell_height = pdf_y1 - pdf_y0

                        # Draw cell border
                        pdf_canvas.rect(pdf_x0, pdf_y0, cell_width, cell_height)

                        # Draw text if cell exists in grid
                        if (row, col) not in grid:
                            continue
                        cell_content = grid[(row, col)].get('content', '')
                        if not cell_content:
                            continue

                        # Fit text to cell
                        available_width = (
                            cell_width
                            - self.config.left_padding
                            - self.config.right_padding
                        )
                        font_size = self._fit_text_to_cell(
                            pdf_canvas, cell_content, available_width, cell_height
                        )

                        # Draw centered text
                        pdf_canvas.setFont(self.config.font_name, font_size)
                        text_width = pdf_canvas.stringWidth(
                            cell_content, self.config.font_name, font_size
                        )

                        # Center horizontally
                        text_x = pdf_x0 + (cell_width - text_width) / 2
                        # Center vertically
                        text_y = pdf_y0 + (cell_height - font_size) / 2

                        pdf_canvas.drawString(text_x, text_y, cell_content)
            finally:
                pdf_canvas.restoreState()

            logger.info(f"[TABLE] Successfully rendered {num_rows}x{num_cols} table from cell_boxes")
            return True

        except Exception as e:
            logger.error(f"CellBoxes grid rendering failed: {e}", exc_info=True)
            return False

    # =========================================================================
    # Grid and Cell Box Helpers
    # =========================================================================

    def compute_grid_from_cell_boxes(
        self,
        cell_boxes: List,
        table_bbox: Tuple[float, float, float, float],
        num_rows: int,
        num_cols: int
    ) -> Tuple[Optional[List[float]], Optional[List[float]]]:
        """
        Calculate column widths and row heights from cell bounding boxes.

        Args:
            cell_boxes: List of [x0, y0, x1, y1] for each cell
            table_bbox: Table bounding box (currently not used)
            num_rows: Expected number of rows (used for diagnostics only)
            num_cols: Expected number of columns (used for diagnostics only)

        Returns:
            Tuple of (col_widths, row_heights) or (None, None) on failure
        """
        try:
            if not cell_boxes:
                return None, None

            # Filter valid boxes
            valid_boxes = [b for b in cell_boxes if b is not None and len(b) >= 4]

            if not valid_boxes:
                return None, None

            # Extract unique X and Y boundaries
            x_boundaries = set()
            y_boundaries = set()

            for box in valid_boxes:
                x0, y0, x1, y1 = box[:4]
                x_boundaries.add(round(x0, 1))
                x_boundaries.add(round(x1, 1))
                y_boundaries.add(round(y0, 1))
                y_boundaries.add(round(y1, 1))

            # Sort, then merge nearby boundaries
            x_merged = self._merge_boundaries(
                sorted(x_boundaries), self.config.merge_boundary_threshold
            )
            y_merged = self._merge_boundaries(
                sorted(y_boundaries), self.config.merge_boundary_threshold
            )

            # Widths/heights are the gaps between consecutive boundaries
            col_widths = [hi - lo for lo, hi in zip(x_merged, x_merged[1:])]
            row_heights = [hi - lo for lo, hi in zip(y_merged, y_merged[1:])]

            # Validate against expected dimensions (allow for merged cells)
            tolerance = max(num_cols, num_rows) // 2 + 1
            if abs(len(col_widths) - num_cols) > tolerance:
                logger.debug(f"Column count mismatch: {len(col_widths)} vs {num_cols}")
            if abs(len(row_heights) - num_rows) > tolerance:
                logger.debug(f"Row count mismatch: {len(row_heights)} vs {num_rows}")

            return col_widths if col_widths else None, row_heights if row_heights else None

        except Exception as e:
            logger.error(f"Grid computation failed: {e}")
            return None, None

    def normalize_cell_boxes_to_grid(
        self,
        cell_boxes: List,
        threshold: Optional[float] = None
    ) -> List:
        """
        Snap cell boxes to aligned grid to eliminate coordinate variations.

        Invalid entries (None or fewer than 4 coordinates) are passed through
        unchanged so that indices stay aligned with the input. On failure the
        input list itself is returned.

        Args:
            cell_boxes: List of [x0, y0, x1, y1] for each cell
            threshold: Clustering threshold (uses config default if None)

        Returns:
            Normalized cell boxes
        """
        # Explicit None check: a caller-supplied 0 is a valid threshold
        if threshold is None:
            threshold = self.config.grid_threshold

        if not cell_boxes:
            return []

        try:
            # Collect all coordinates
            all_x = []
            all_y = []

            for box in cell_boxes:
                if box is None or len(box) < 4:
                    continue
                x0, y0, x1, y1 = box[:4]
                all_x.extend([x0, x1])
                all_y.extend([y0, y1])

            if not all_x or not all_y:
                return cell_boxes

            # Cluster and normalize coordinates
            x_clusters = self._cluster_values(sorted(all_x), threshold)
            y_clusters = self._cluster_values(sorted(all_y), threshold)

            # Build mapping: raw coordinate -> cluster average
            x_map = {v: avg for avg, values in x_clusters for v in values}
            y_map = {v: avg for avg, values in y_clusters for v in values}

            # Normalize boxes
            normalized = []
            for box in cell_boxes:
                if box is None or len(box) < 4:
                    normalized.append(box)
                    continue

                x0, y0, x1, y1 = box[:4]
                normalized.append([
                    x_map.get(x0, x0),
                    y_map.get(y0, y0),
                    x_map.get(x1, x1),
                    y_map.get(y1, y1)
                ])

            return normalized

        except Exception as e:
            logger.error(f"Cell box normalization failed: {e}")
            return cell_boxes

    # =========================================================================
    # Private Helper Methods
    # =========================================================================

    def _render_parsed_table(
        self,
        pdf_canvas,
        table_data: Dict,
        table_bbox: Tuple[float, float, float, float],
        page_height: float,
        scale_w: float = 1.0,
        scale_h: float = 1.0
    ) -> bool:
        """
        Render a parsed table structure.

        Cells are placed left to right in each row, skipping grid positions
        already covered by a rowspan/colspan from an earlier cell. Columns
        and rows are distributed uniformly over the (scaled) bounding box.

        Args:
            pdf_canvas: ReportLab canvas
            table_data: Dict with a 'rows' list as produced by HTMLTableParser
            table_bbox: (x0, y0, x1, y1) bounding box
            page_height: PDF page height for Y coordinate flip
            scale_w: Horizontal scale factor applied to the table width
            scale_h: Vertical scale factor applied to the table height

        Returns:
            True if the table was drawn, False if there was nothing to draw
        """
        rows = table_data.get('rows', [])
        if not rows:
            return False

        # Build grid content.
        # The column count is the widest row measured in grid columns, i.e.
        # counting each cell's colspan; counting cells alone would drop the
        # trailing cells of rows that contain merged cells.
        num_rows = len(rows)
        num_cols = max(
            sum(max(cell.get('colspan', 1), 1) for cell in row.get('cells', []))
            for row in rows
        )
        if num_cols == 0:
            return False

        # Track occupied cells for rowspan handling
        occupied = [[False] * num_cols for _ in range(num_rows)]

        grid = []
        span_commands = []

        for row_idx, row in enumerate(rows):
            grid_row = [''] * num_cols
            col_idx = 0

            for cell in row.get('cells', []):
                # Skip occupied cells
                while col_idx < num_cols and occupied[row_idx][col_idx]:
                    col_idx += 1

                if col_idx >= num_cols:
                    break

                text = cell.get('text', '').strip()
                colspan = cell.get('colspan', 1)
                rowspan = cell.get('rowspan', 1)

                # Place cell content
                grid_row[col_idx] = text

                # Mark occupied cells and build SPAN command
                if colspan > 1 or rowspan > 1:
                    end_col = min(col_idx + colspan - 1, num_cols - 1)
                    end_row = min(row_idx + rowspan - 1, num_rows - 1)
                    span_commands.append(
                        ('SPAN', (col_idx, row_idx), (end_col, end_row))
                    )

                    for r in range(row_idx, end_row + 1):
                        for c in range(col_idx, end_col + 1):
                            if r < num_rows and c < num_cols:
                                occupied[r][c] = True
                else:
                    occupied[row_idx][col_idx] = True

                col_idx += colspan

            grid.append(grid_row)

        # Calculate dimensions
        x0, y0, x1, y1 = table_bbox
        table_width = (x1 - x0) * scale_w
        table_height = (y1 - y0) * scale_h

        col_widths = [table_width / num_cols] * num_cols
        row_heights = [table_height / num_rows] * num_rows

        # Create paragraph style
        style = ParagraphStyle(
            'TableCell',
            fontName=self.config.font_name,
            fontSize=self.config.font_size,
            alignment=TA_CENTER,
            leading=self.config.font_size * 1.2
        )

        # Convert to Paragraph objects. Cell text is plain text, so it is
        # escaped before Paragraph interprets it as markup.
        para_grid = [
            [
                Paragraph(_to_paragraph_markup(cell), style) if cell else ''
                for cell in grid_row
            ]
            for grid_row in grid
        ]

        # Build TableStyle
        table_style_commands = [
            ('GRID', (0, 0), (-1, -1), self.config.border_width, self.config.border_color),
            ('VALIGN', (0, 0), (-1, -1), self.config.vertical_align),
            ('ALIGN', (0, 0), (-1, -1), self.config.horizontal_align),
            ('LEFTPADDING', (0, 0), (-1, -1), self.config.left_padding),
            ('RIGHTPADDING', (0, 0), (-1, -1), self.config.right_padding),
            ('TOPPADDING', (0, 0), (-1, -1), self.config.top_padding),
            ('BOTTOMPADDING', (0, 0), (-1, -1), self.config.bottom_padding),
            ('FONTNAME', (0, 0), (-1, -1), self.config.font_name),
            ('FONTSIZE', (0, 0), (-1, -1), self.config.font_size),
        ]
        table_style_commands.extend(span_commands)

        # Create and draw table
        table = Table(para_grid, colWidths=col_widths, rowHeights=row_heights)
        table.setStyle(TableStyle(table_style_commands))

        # Position and draw
        pdf_x = x0
        pdf_y = page_height - y1  # Flip Y

        table.wrapOn(pdf_canvas, table_width, table_height)
        table.drawOn(pdf_canvas, pdf_x, pdf_y)

        return True

    def _render_with_dimensions(
        self,
        pdf_canvas,
        table_data: Dict,
        table_bbox: Tuple[float, float, float, float],
        page_height: float,
        col_widths: List[float],
        row_heights: List[float]
    ) -> bool:
        """
        Render table with specified dimensions.

        Each cell is placed at its explicit 'col' index (shifted right past
        positions already covered by a span). If the supplied widths/heights
        do not match the grid size, a uniform grid over ``table_bbox`` is
        used instead.

        Args:
            pdf_canvas: ReportLab canvas
            table_data: Dict with a 'rows' list; cells carry 'text', 'col',
                'colspan' and 'rowspan'
            table_bbox: (x0, y0, x1, y1) bounding box
            page_height: PDF page height for Y coordinate flip
            col_widths: Column widths (may be None or of the wrong length)
            row_heights: Row heights (may be None or of the wrong length)

        Returns:
            True if the table was drawn, False if there was nothing to draw
        """
        rows = table_data.get('rows', [])
        if not rows:
            return False

        num_rows = len(rows)

        # The grid must be wide enough for both the number of cells in a row
        # and the right-most column any cell reaches (col + colspan);
        # otherwise cells of sparse rows would be dropped.
        num_cols = 0
        for row in rows:
            row_cells = row.get('cells', [])
            num_cols = max(num_cols, len(row_cells))
            for cell in row_cells:
                reach = cell.get('col', 0) + max(cell.get('colspan', 1), 1)
                num_cols = max(num_cols, reach)
        if num_cols == 0:
            return False

        # Adjust widths/heights if needed
        x0, y0, x1, y1 = table_bbox
        if not col_widths or len(col_widths) != num_cols:
            col_widths = [(x1 - x0) / num_cols] * num_cols
        if not row_heights or len(row_heights) != num_rows:
            row_heights = [(y1 - y0) / num_rows] * num_rows

        # Build grid with proper positioning
        grid = []
        span_commands = []
        occupied = [[False] * num_cols for _ in range(num_rows)]

        for row_idx, row in enumerate(rows):
            grid_row = [''] * num_cols

            for cell in row.get('cells', []):
                # Get column position
                col_idx = cell.get('col', 0)

                # Skip if out of bounds or occupied
                while col_idx < num_cols and occupied[row_idx][col_idx]:
                    col_idx += 1

                if col_idx >= num_cols:
                    continue

                text = cell.get('text', '').strip()
                colspan = cell.get('colspan', 1)
                rowspan = cell.get('rowspan', 1)

                grid_row[col_idx] = text

                if colspan > 1 or rowspan > 1:
                    end_col = min(col_idx + colspan - 1, num_cols - 1)
                    end_row = min(row_idx + rowspan - 1, num_rows - 1)
                    span_commands.append(
                        ('SPAN', (col_idx, row_idx), (end_col, end_row))
                    )

                    for r in range(row_idx, end_row + 1):
                        for c in range(col_idx, end_col + 1):
                            if r < num_rows and c < num_cols:
                                occupied[r][c] = True
                else:
                    occupied[row_idx][col_idx] = True

            grid.append(grid_row)

        # Create style and table
        style = ParagraphStyle(
            'TableCell',
            fontName=self.config.font_name,
            fontSize=self.config.font_size,
            alignment=TA_CENTER
        )

        # Escape plain text for Paragraph and keep explicit line breaks
        # (list content is joined with '\n' when the rows are built)
        para_grid = [
            [
                Paragraph(_to_paragraph_markup(cell, keep_line_breaks=True), style)
                if cell else ''
                for cell in grid_row
            ]
            for grid_row in grid
        ]

        table_style_commands = [
            ('GRID', (0, 0), (-1, -1), self.config.border_width, self.config.border_color),
            ('VALIGN', (0, 0), (-1, -1), self.config.vertical_align),
            ('LEFTPADDING', (0, 0), (-1, -1), 0),
            ('RIGHTPADDING', (0, 0), (-1, -1), 0),
            ('TOPPADDING', (0, 0), (-1, -1), 0),
            ('BOTTOMPADDING', (0, 0), (-1, -1), 1),
        ]
        table_style_commands.extend(span_commands)

        table = Table(para_grid, colWidths=col_widths, rowHeights=row_heights)
        table.setStyle(TableStyle(table_style_commands))

        pdf_x = x0
        pdf_y = page_height - y1  # Flip Y

        table.wrapOn(pdf_canvas, x1 - x0, y1 - y0)
        table.drawOn(pdf_canvas, pdf_x, pdf_y)

        return True

    def _build_rows_from_cells_dict(self, cells_dict: Dict) -> List[Dict]:
        """
        Convert Direct track cell structure to row format.

        Args:
            cells_dict: Dict with a 'cells' list (each cell has 'row', 'col',
                'content' and optional 'row_span', 'col_span', 'is_header')
                and an optional 'rows' count

        Returns:
            List of ``{'cells': [...]}`` dicts, one per row, with the cells of
            each row sorted by column. Empty list when there are no cells.
        """
        cells = cells_dict.get('cells', [])
        if not cells:
            return []

        # Group cells by row
        rows_data: Dict[int, List[Dict]] = {}
        for cell in cells:
            rows_data.setdefault(cell.get('row', 0), []).append(cell)

        # Use the declared row count; when it is missing (or zero) derive it
        # from the highest row index so the cells are not silently discarded
        num_rows = cells_dict.get('rows') or 0
        if num_rows <= 0:
            num_rows = max(rows_data) + 1

        # Build row list
        rows = []
        for row_idx in range(num_rows):
            # Sort by column
            row_cells = sorted(
                rows_data.get(row_idx, []),
                key=lambda c: c.get('col', 0)
            )

            formatted_cells = []
            for cell in row_cells:
                content = cell.get('content', '')
                if isinstance(content, list):
                    content = '\n'.join(str(c) for c in content)

                formatted_cells.append({
                    # Only None means "no content"; values such as 0 must
                    # still be rendered
                    'text': '' if content is None else str(content),
                    'colspan': cell.get('col_span', 1),
                    'rowspan': cell.get('row_span', 1),
                    'col': cell.get('col', 0),
                    'is_header': cell.get('is_header', False)
                })

            rows.append({'cells': formatted_cells})

        return rows

    def _draw_table_border(
        self,
        pdf_canvas,
        table_bbox: Tuple[float, float, float, float],
        page_height: float
    ) -> bool:
        """
        Draw outer table border.

        Args:
            pdf_canvas: ReportLab canvas
            table_bbox: (x0, y0, x1, y1) bounding box
            page_height: PDF page height for Y coordinate flip

        Returns:
            True if successful, False otherwise
        """
        try:
            x0, y0, x1, y1 = table_bbox
            pdf_y0 = page_height - y1
            pdf_y1 = page_height - y0

            pdf_canvas.saveState()
            try:
                pdf_canvas.setStrokeColor(self.config.border_color)
                pdf_canvas.setLineWidth(self.config.border_width)
                pdf_canvas.rect(x0, pdf_y0, x1 - x0, pdf_y1 - pdf_y0)
            finally:
                pdf_canvas.restoreState()

            return True
        except Exception as e:
            logger.error(f"Failed to draw table border: {e}")
            return False

    def _draw_embedded_image(
        self,
        pdf_canvas,
        img_info: Dict,
        page_height: float,
        output_dir: Path
    ) -> bool:
        """
        Draw an image embedded within a table cell.

        Args:
            pdf_canvas: ReportLab canvas
            img_info: Dict with 'path' and a 'bbox' dict holding 'x0', 'y0',
                'width' and 'height'
            page_height: PDF page height for Y coordinate flip
            output_dir: Base directory for relative image paths

        Returns:
            True if the image was drawn, False otherwise
        """
        try:
            img_path = img_info.get('path')
            if not img_path:
                return False

            # Resolve path
            if not Path(img_path).is_absolute():
                img_path = output_dir / img_path

            if not Path(img_path).exists():
                logger.warning(f"Embedded image not found: {img_path}")
                return False

            bbox = img_info.get('bbox', {})
            x0 = bbox.get('x0', 0)
            y0 = bbox.get('y0', 0)
            width = bbox.get('width', 100)
            height = bbox.get('height', 100)

            # Flip Y coordinate
            pdf_y = page_height - y0 - height

            # Draw image
            img = ImageReader(str(img_path))
            pdf_canvas.drawImage(img, x0, pdf_y, width, height)

            return True

        except Exception as e:
            logger.error(f"Failed to draw embedded image: {e}")
            return False

    def _fit_text_to_cell(
        self,
        pdf_canvas,
        text: str,
        cell_width: float,
        cell_height: float
    ) -> int:
        """
        Find font size that fits text in cell.

        Sizes are tried from ``max_font_size`` down to ``min_font_size``; the
        first one whose rendered width fits the cell (minus 3pt padding on
        each side) and whose height does not exceed the cell height is used.

        Args:
            pdf_canvas: ReportLab canvas (used to measure text width)
            text: Text to fit
            cell_width: Available cell width
            cell_height: Available cell height

        Returns:
            Chosen font size; ``min_font_size`` if nothing fits
        """
        for size in range(self.config.max_font_size, self.config.min_font_size - 1, -1):
            if size > cell_height:
                # Glyphs taller than the cell would spill over its borders
                continue
            text_width = pdf_canvas.stringWidth(text, self.config.font_name, size)
            if text_width <= cell_width - 6:  # 3pt padding each side
                return size
        return self.config.min_font_size

    def _merge_boundaries(self, values: List[float], threshold: float) -> List[float]:
        """
        Merge nearby boundary values.

        A value is kept only if it is more than ``threshold`` away from the
        previously kept value, so each group is represented by its first
        member.
        """
        if not values:
            return []

        merged = [values[0]]
        for v in values[1:]:
            if abs(v - merged[-1]) > threshold:
                merged.append(v)

        return merged

    def _cluster_values(self, values: List[float], threshold: float) -> List[Tuple[float, List[float]]]:
        """
        Cluster nearby values and return (average, members) pairs.

        ``values`` must be sorted; a value joins the current cluster when it
        is within ``threshold`` of the cluster's last member.
        """
        if not values:
            return []

        clusters = []
        current_cluster = [values[0]]

        for v in values[1:]:
            if abs(v - current_cluster[-1]) <= threshold:
                current_cluster.append(v)
            else:
                avg = sum(current_cluster) / len(current_cluster)
                clusters.append((avg, current_cluster))
                current_cluster = [v]

        if current_cluster:
            avg = sum(current_cluster) / len(current_cluster)
            clusters.append((avg, current_cluster))

        return clusters