Backup commit before executing remove-unused-code proposal. This includes all pending changes and new features. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
1309 lines
43 KiB
Python
1309 lines
43 KiB
Python
"""
|
|
PDF Table Renderer - Handles table rendering for PDF generation.
|
|
|
|
This module provides unified table rendering capabilities extracted from
|
|
PDFGeneratorService, supporting multiple input formats:
|
|
- HTML tables
|
|
- Cell boxes (layered approach)
|
|
- Cells dictionary (Direct track)
|
|
- TableData objects
|
|
"""
|
|
|
|
import logging
|
|
from dataclasses import dataclass, field
|
|
from html.parser import HTMLParser
|
|
from pathlib import Path
|
|
from typing import Any, Dict, List, Optional, Tuple, Union
|
|
|
|
from reportlab.lib import colors
|
|
from reportlab.lib.enums import TA_CENTER, TA_LEFT, TA_RIGHT
|
|
from reportlab.lib.styles import ParagraphStyle
|
|
from reportlab.lib.utils import ImageReader
|
|
from reportlab.platypus import Paragraph, Table, TableStyle
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
# ============================================================================
|
|
# Cell Box Grid Inferrer
|
|
# ============================================================================
|
|
|
|
class CellBoxGridInferrer:
    """
    Infer table grid structure from cell_boxes coordinates.

    Box edges are clustered along Y (rows) and X (columns); the mean of each
    cluster becomes a grid boundary and every box is placed in the grid cell
    that contains its centre, regardless of HTML colspan/rowspan.
    """

    def __init__(
        self,
        row_threshold: float = 15.0,
        col_threshold: float = 15.0
    ):
        """
        Initialize grid inferrer.

        Args:
            row_threshold: Y-coordinate threshold for row clustering
            col_threshold: X-coordinate threshold for column clustering
        """
        self.row_threshold = row_threshold
        self.col_threshold = col_threshold

    def infer_grid(
        self,
        cell_boxes: List[List[float]]
    ) -> Optional[Dict]:
        """
        Infer grid structure from cell_boxes.

        Args:
            cell_boxes: List of [x0, y0, x1, y1] coordinates

        Returns:
            Dict with 'grid', 'num_rows', 'num_cols', 'row_boundaries',
            'col_boundaries', 'row_heights' and 'col_widths', or None if
            inference fails (no usable boxes, fewer than two boundaries on
            an axis, or an unexpected error, which is logged).
        """
        if not cell_boxes:
            return None

        try:
            # Entries that are None or have fewer than four values are ignored.
            valid_boxes = [
                b for b in cell_boxes
                if b is not None and len(b) >= 4
            ]
            if not valid_boxes:
                return None

            # Every top/bottom edge is a row-boundary candidate, every
            # left/right edge a column-boundary candidate.
            all_y = sorted({edge for b in valid_boxes for edge in (b[1], b[3])})
            all_x = sorted({edge for b in valid_boxes for edge in (b[0], b[2])})

            y_boundaries = self._cluster_to_boundaries(all_y, self.row_threshold)
            x_boundaries = self._cluster_to_boundaries(all_x, self.col_threshold)

            if len(y_boundaries) < 2 or len(x_boundaries) < 2:
                return None

            num_rows = len(y_boundaries) - 1
            num_cols = len(x_boundaries) - 1

            # Build grid: map (row, col) -> cell_box info. A later box that
            # lands on an already used (row, col) replaces the earlier one.
            grid = {}
            for idx, box in enumerate(valid_boxes):
                x0, y0, x1, y1 = box[:4]

                row = self._find_position((y0 + y1) / 2, y_boundaries)
                col = self._find_position((x0 + x1) / 2, x_boundaries)

                if row is not None and col is not None:
                    grid[(row, col)] = {
                        'bbox': box,
                        'index': idx,
                        'content': ''
                    }

            row_heights = [
                lower - upper
                for upper, lower in zip(y_boundaries, y_boundaries[1:])
            ]
            col_widths = [
                right - left
                for left, right in zip(x_boundaries, x_boundaries[1:])
            ]

            return {
                'grid': grid,
                'num_rows': num_rows,
                'num_cols': num_cols,
                'row_boundaries': y_boundaries,
                'col_boundaries': x_boundaries,
                'row_heights': row_heights,
                'col_widths': col_widths
            }

        except Exception as e:
            logger.error(f"Grid inference failed: {e}")
            return None

    def _cluster_to_boundaries(
        self,
        values: List[float],
        threshold: float
    ) -> List[float]:
        """
        Cluster nearby values and return representative boundaries.

        A value joins the current cluster when it lies within ``threshold``
        of the cluster's most recent member; otherwise it starts a new one.

        Args:
            values: Sorted list of coordinate values
            threshold: Clustering threshold

        Returns:
            List of boundary values (the mean of each cluster)
        """
        if not values:
            return []

        clusters = [[values[0]]]
        for v in values[1:]:
            if v - clusters[-1][-1] <= threshold:
                clusters[-1].append(v)
            else:
                clusters.append([v])

        return [sum(cluster) / len(cluster) for cluster in clusters]

    def _find_position(
        self,
        value: float,
        boundaries: List[float]
    ) -> Optional[int]:
        """
        Find which interval a value falls into.

        Args:
            value: Coordinate value
            boundaries: List of boundary values

        Returns:
            Index of the interval containing ``value``. If none contains it,
            the index of the nearest interval whose midpoint is less than one
            interval width away, or None if no interval is that close.
        """
        intervals = list(zip(boundaries, boundaries[1:]))

        for i, (lo, hi) in enumerate(intervals):
            if lo <= value <= hi:
                return i

        # The value lies outside every interval (boundaries are cluster means,
        # so a degenerate box centre can fall just outside the grid). Choose
        # the *nearest* tolerant interval; taking the first tolerant one would
        # let a wide early interval capture values that sit next to a later one.
        best_index = None
        best_gap = None
        for i, (lo, hi) in enumerate(intervals):
            if abs(value - (lo + hi) / 2) >= hi - lo:
                continue
            gap = lo - value if value < lo else value - hi
            if best_gap is None or gap < best_gap:
                best_index, best_gap = i, gap

        return best_index
|
|
|
|
|
|
def extract_cell_contents_from_html(html: str) -> List[str]:
    """
    Collect the stripped text of every cell of the first HTML table.

    Cells are returned in reading order (row by row, left to right).

    Args:
        html: HTML table string

    Returns:
        One stripped text string per cell; an empty list when no table is
        found or when parsing fails (the failure is logged).
    """
    try:
        table_parser = HTMLTableParser()
        table_parser.feed(html)

        parsed_tables = table_parser.tables
        if not parsed_tables:
            return []

        # Only the first table in the markup is considered.
        first_table = parsed_tables[0]
        return [
            cell.get('text', '').strip()
            for table_row in first_table.get('rows', [])
            for cell in table_row.get('cells', [])
        ]

    except Exception as e:
        logger.error(f"HTML content extraction failed: {e}")
        return []
|
|
|
|
|
|
def map_content_to_grid(
    grid: Dict[Tuple[int, int], Dict],
    contents: List[str],
    num_rows: int,
    num_cols: int
) -> Dict[Tuple[int, int], Dict]:
    """
    Assign HTML cell texts to the occupied grid cells in row-major order.

    The grid is modified in place. Occupied cells left over once the
    contents run out get an empty string.

    Args:
        grid: Dict mapping (row, col) to cell info
        contents: List of text contents from HTML
        num_rows: Number of rows in grid
        num_cols: Number of columns in grid

    Returns:
        The same grid object with 'content' filled in
    """
    total = len(contents)
    used = 0

    positions = [(r, c) for r in range(num_rows) for c in range(num_cols)]
    for key in positions:
        if key not in grid:
            continue
        if used >= total:
            grid[key]['content'] = ''
            continue
        grid[key]['content'] = contents[used]
        used += 1

    # Leftover HTML cells mean the two structures disagree; note it for debugging.
    if used < total:
        logger.debug(
            f"Content mismatch: {len(contents)} HTML cells, "
            f"only {used} mapped to {len(grid)} grid cells"
        )

    return grid
|
|
|
|
|
|
# ============================================================================
|
|
# Configuration
|
|
# ============================================================================
|
|
|
|
@dataclass
class TableRenderConfig:
    """Tunable settings shared by all table rendering paths."""

    # --- Text ---------------------------------------------------------------
    font_name: str = "Helvetica"
    font_size: int = 8
    min_font_size: int = 6
    max_font_size: int = 10

    # --- Cell padding -------------------------------------------------------
    left_padding: int = 2
    right_padding: int = 2
    top_padding: int = 2
    bottom_padding: int = 2

    # --- Borders ------------------------------------------------------------
    border_color: Any = colors.black
    border_width: float = 0.5

    # --- Alignment (passed to ALIGN / VALIGN table style commands) ----------
    horizontal_align: str = "CENTER"
    vertical_align: str = "MIDDLE"

    # --- Header styling -----------------------------------------------------
    header_background: Any = colors.lightgrey

    # --- Clustering threshold used when snapping cell boxes to a grid -------
    grid_threshold: float = 10.0

    # --- Threshold used when merging nearby cell-box boundaries -------------
    merge_boundary_threshold: float = 5.0
|
|
|
|
|
|
# ============================================================================
|
|
# HTML Table Parser
|
|
# ============================================================================
|
|
|
|
class HTMLTableParser(HTMLParser):
    """
    Parse HTML table structure for rendering.

    Extracts table rows, cells, and merged cell information (colspan/rowspan)
    from HTML table markup. Finished tables accumulate in ``self.tables`` as
    ``{'rows': [{'cells': [{'text', 'is_header', 'colspan', 'rowspan'}]}]}``.
    """

    def __init__(self):
        super().__init__()
        self.tables = []
        self.current_table = None
        self.current_row = None
        self.current_cell = None
        self.in_cell = False

    @staticmethod
    def _parse_span(raw) -> int:
        """
        Convert a colspan/rowspan attribute value to an int >= 1.

        Missing, valueless (``<td colspan>``), empty or non-numeric values
        fall back to 1 instead of raising, so one malformed attribute does
        not abort parsing of the whole table.
        """
        try:
            return max(int(raw), 1)
        except (TypeError, ValueError):
            return 1

    def handle_starttag(self, tag: str, attrs: List[Tuple[str, str]]):
        """Open a new table, row or cell record for the given start tag."""
        if tag == 'table':
            self.current_table = {'rows': []}
        elif tag == 'tr':
            self.current_row = {'cells': []}
        elif tag in ('td', 'th'):
            attrs_dict = dict(attrs)
            self.current_cell = {
                'text': '',
                'is_header': tag == 'th',
                'colspan': self._parse_span(attrs_dict.get('colspan', 1)),
                'rowspan': self._parse_span(attrs_dict.get('rowspan', 1))
            }
            self.in_cell = True

    def handle_endtag(self, tag: str):
        """Close the current cell, row or table and attach it to its parent."""
        if tag == 'table' and self.current_table:
            self.tables.append(self.current_table)
            self.current_table = None
        elif tag == 'tr' and self.current_row:
            if self.current_table:
                self.current_table['rows'].append(self.current_row)
            self.current_row = None
        elif tag in ('td', 'th') and self.current_cell:
            if self.current_row:
                self.current_row['cells'].append(self.current_cell)
            self.current_cell = None
            self.in_cell = False

    def handle_data(self, data: str):
        """Append character data to the text of the cell being parsed."""
        if self.in_cell and self.current_cell is not None:
            self.current_cell['text'] += data
|
|
|
|
|
|
# ============================================================================
|
|
# Table Renderer
|
|
# ============================================================================
|
|
|
|
class TableRenderer:
|
|
"""
|
|
Unified table rendering engine for PDF generation.
|
|
|
|
Supports multiple input formats and rendering modes:
|
|
- HTML table parsing and rendering
|
|
- Cell boxes rendering (layered approach)
|
|
- Direct track cells dictionary
|
|
- Translated content with dynamic font sizing
|
|
"""
|
|
|
|
def __init__(self, config: Optional[TableRenderConfig] = None):
    """
    Store the rendering configuration for this renderer.

    Args:
        config: TableRenderConfig instance; a default-constructed one is
            used when nothing (or a falsy value) is supplied
    """
    if not config:
        config = TableRenderConfig()
    self.config = config
|
|
|
|
def render_from_html(
    self,
    pdf_canvas,
    html_content: str,
    table_bbox: Tuple[float, float, float, float],
    page_height: float,
    scale_w: float = 1.0,
    scale_h: float = 1.0
) -> bool:
    """
    Render the first table found in an HTML string onto the canvas.

    Args:
        pdf_canvas: ReportLab canvas
        html_content: HTML table string
        table_bbox: (x0, y0, x1, y1) bounding box
        page_height: PDF page height for Y coordinate flip
        scale_w: Horizontal scale factor
        scale_h: Vertical scale factor

    Returns:
        True if successful, False when no table is found or rendering
        raises (the error is logged and its traceback printed)
    """
    try:
        html_parser = HTMLTableParser()
        html_parser.feed(html_content)

        found_tables = html_parser.tables
        if found_tables:
            # Only the first table in the markup is rendered.
            return self._render_parsed_table(
                pdf_canvas, found_tables[0], table_bbox, page_height, scale_w, scale_h
            )

        logger.warning("No tables found in HTML content")
        return False

    except Exception as e:
        logger.error(f"HTML table rendering failed: {e}")
        import traceback
        traceback.print_exc()
        return False
|
|
|
|
def render_from_cells_dict(
    self,
    pdf_canvas,
    cells_dict: Dict,
    table_bbox: Tuple[float, float, float, float],
    page_height: float,
    cell_boxes: Optional[List] = None
) -> bool:
    """
    Render table from Direct track cell structure.

    Args:
        pdf_canvas: ReportLab canvas
        cells_dict: Dict with 'rows', 'cols', 'cells' keys
        table_bbox: (x0, y0, x1, y1) bounding box
        page_height: PDF page height
        cell_boxes: Optional precomputed cell boxes used to derive column
            widths and row heights

    Returns:
        True if successful, False when no rows can be built or rendering
        raises (the error is logged and its traceback printed)
    """
    try:
        rows = self._build_rows_from_cells_dict(cells_dict)
        if not rows:
            logger.warning("No rows built from cells dict")
            return False

        table_data = {'rows': rows}

        x0, y0, x1, y1 = table_bbox
        table_width = x1 - x0
        table_height = y1 - y0

        # Prefer the declared grid size; fall back to what was actually built.
        num_rows = cells_dict.get('rows', len(rows))
        num_cols = cells_dict.get(
            'cols',
            max(len(row['cells']) for row in rows)
        )

        col_widths = None
        row_heights = None
        if cell_boxes:
            col_widths, row_heights = self.compute_grid_from_cell_boxes(
                cell_boxes, table_bbox, num_rows, num_cols
            )

        # compute_grid_from_cell_boxes reports failure as None (for either
        # axis); use a uniform split of the bounding box in that case rather
        # than handing None to the renderer.
        if not col_widths:
            col_widths = [table_width / num_cols] * num_cols
        if not row_heights:
            row_heights = [table_height / num_rows] * num_rows

        return self._render_with_dimensions(
            pdf_canvas, table_data, table_bbox, page_height,
            col_widths, row_heights
        )

    except Exception as e:
        logger.error(f"Cells dict rendering failed: {e}")
        import traceback
        traceback.print_exc()
        return False
|
|
|
|
def render_cell_borders(
    self,
    pdf_canvas,
    cell_boxes: List[List[float]],
    table_bbox: Tuple[float, float, float, float],
    page_height: float,
    embedded_images: Optional[List] = None,
    output_dir: Optional[Path] = None
) -> bool:
    """
    Render table cell borders only (layered approach).

    This renders only the cell borders, not the text content; text is
    expected to be drawn separately.

    Args:
        pdf_canvas: ReportLab canvas
        cell_boxes: List of [x0, y0, x1, y1] for each cell
        table_bbox: Table bounding box
        page_height: PDF page height
        embedded_images: Optional list of images within cells
        output_dir: Directory for image files

    Returns:
        True if successful, False otherwise (the error is logged and its
        traceback printed)
    """
    try:
        if not cell_boxes:
            # Without cell boxes only the outer border can be drawn.
            return self._draw_table_border(
                pdf_canvas, table_bbox, page_height
            )

        normalized_boxes = self.normalize_cell_boxes_to_grid(cell_boxes)

        pdf_canvas.saveState()
        try:
            pdf_canvas.setStrokeColor(self.config.border_color)
            pdf_canvas.setLineWidth(self.config.border_width)

            for box in normalized_boxes:
                # Normalization passes malformed entries through unchanged,
                # so skip anything that is not at least [x0, y0, x1, y1].
                if box is None or len(box) < 4:
                    continue

                x0, y0, x1, y1 = box[:4]
                # Convert to PDF coordinates (flip Y)
                pdf_y0 = page_height - y1
                pdf_y1 = page_height - y0

                pdf_canvas.rect(x0, pdf_y0, x1 - x0, pdf_y1 - pdf_y0)
        finally:
            # Always balance saveState so an error cannot leak stroke
            # settings into whatever is drawn next on the canvas.
            pdf_canvas.restoreState()

        if embedded_images and output_dir:
            for img_info in embedded_images:
                self._draw_embedded_image(
                    pdf_canvas, img_info, page_height, output_dir
                )

        return True

    except Exception as e:
        logger.error(f"Cell borders rendering failed: {e}")
        import traceback
        traceback.print_exc()
        return False
|
|
|
|
def render_with_translated_text(
    self,
    pdf_canvas,
    cells: List[Dict],
    cell_boxes: List,
    table_bbox: Tuple[float, float, float, float],
    page_height: float
) -> bool:
    """
    Render table with translated content and dynamic font sizing.

    The outer border is drawn first, then one rectangle per cell box, then
    each cell's 'translated_content' centred in the box with the same index.

    Args:
        pdf_canvas: ReportLab canvas
        cells: List of cell dicts with 'translated_content'
        cell_boxes: List of cell bounding boxes
        table_bbox: Table bounding box
        page_height: PDF page height

    Returns:
        True if successful, False when there are no cell boxes or rendering
        raises (the error is logged and its traceback printed)
    """
    try:
        self._draw_table_border(pdf_canvas, table_bbox, page_height)

        if not cell_boxes:
            logger.warning("No cell boxes for translated table")
            return False

        normalized_boxes = self.normalize_cell_boxes_to_grid(cell_boxes)

        pdf_canvas.saveState()
        try:
            pdf_canvas.setStrokeColor(self.config.border_color)
            pdf_canvas.setLineWidth(self.config.border_width)

            for box in normalized_boxes:
                # Normalization passes malformed entries through unchanged.
                if box is None or len(box) < 4:
                    continue
                x0, y0, x1, y1 = box[:4]
                pdf_canvas.rect(x0, page_height - y1, x1 - x0, y1 - y0)
        finally:
            # Keep saveState/restoreState balanced even if drawing fails.
            pdf_canvas.restoreState()

        # Cells and boxes are paired by index; surplus cells are ignored.
        for cell, box in zip(cells, normalized_boxes):
            if box is None or len(box) < 4:
                continue

            translated_text = cell.get('translated_content', '')
            if not translated_text:
                continue

            x0, y0, x1, y1 = box[:4]
            cell_width = x1 - x0
            cell_height = y1 - y0

            font_size = self._fit_text_to_cell(
                pdf_canvas, translated_text, cell_width, cell_height
            )
            pdf_canvas.setFont(self.config.font_name, font_size)

            text_width = pdf_canvas.stringWidth(
                translated_text, self.config.font_name, font_size
            )
            text_x = x0 + (cell_width - text_width) / 2
            # Vertical centre of the cell, nudged down by a third of the font
            # size so the baseline-anchored text sits visually in the middle.
            text_y = page_height - y0 - cell_height / 2 - font_size / 3

            pdf_canvas.drawString(text_x, text_y, translated_text)

        return True

    except Exception as e:
        logger.error(f"Translated table rendering failed: {e}")
        import traceback
        traceback.print_exc()
        return False
|
|
|
|
def render_from_cellboxes_grid(
    self,
    pdf_canvas,
    cell_boxes: List[List[float]],
    html_content: str,
    table_bbox: Tuple[float, float, float, float],
    page_height: float,
    scale_w: float = 1.0,
    scale_h: float = 1.0,
    row_threshold: float = 15.0,
    col_threshold: float = 15.0
) -> bool:
    """
    Render table using cell_boxes as the primary structure source.

    The grid structure is inferred from cell_boxes coordinates and the HTML
    cell texts are mapped onto it in reading order, regardless of HTML
    colspan/rowspan. Every grid cell gets a border; cells with content get
    their text centred.

    Args:
        pdf_canvas: ReportLab canvas
        cell_boxes: List of [x0, y0, x1, y1] for each cell
        html_content: HTML table string (for text content)
        table_bbox: Table bounding box (not used by this method)
        page_height: PDF page height
        scale_w: Horizontal scale factor
        scale_h: Vertical scale factor
        row_threshold: Y-coordinate threshold for row clustering
        col_threshold: X-coordinate threshold for column clustering

    Returns:
        True if successful, False when no grid can be inferred or rendering
        raises (the error is logged and its traceback printed)
    """
    try:
        if not cell_boxes:
            logger.debug("No cell_boxes provided for grid rendering")
            return False

        inferrer = CellBoxGridInferrer(
            row_threshold=row_threshold,
            col_threshold=col_threshold
        )
        grid_info = inferrer.infer_grid(cell_boxes)

        if not grid_info:
            logger.debug("Failed to infer grid from cell_boxes")
            return False

        grid = grid_info['grid']
        num_rows = grid_info['num_rows']
        num_cols = grid_info['num_cols']
        row_boundaries = grid_info['row_boundaries']
        col_boundaries = grid_info['col_boundaries']

        logger.info(
            f"[TABLE] CellBoxes grid inferred: {num_rows} rows x {num_cols} cols "
            f"from {len(cell_boxes)} cell_boxes"
        )

        if html_content:
            contents = extract_cell_contents_from_html(html_content)
            grid = map_content_to_grid(grid, contents, num_rows, num_cols)
            logger.debug(f"[TABLE] Mapped {len(contents)} HTML cells to grid")

        scaled_row_boundaries = [y * scale_h for y in row_boundaries]
        scaled_col_boundaries = [x * scale_w for x in col_boundaries]

        horizontal_padding = self.config.left_padding + self.config.right_padding

        pdf_canvas.saveState()
        try:
            pdf_canvas.setStrokeColor(self.config.border_color)
            pdf_canvas.setLineWidth(self.config.border_width)

            for row in range(num_rows):
                # infer_grid reports one more boundary than rows/columns,
                # so index + 1 is always valid here.
                y0 = scaled_row_boundaries[row]
                y1 = scaled_row_boundaries[row + 1]

                for col in range(num_cols):
                    x0 = scaled_col_boundaries[col]
                    x1 = scaled_col_boundaries[col + 1]

                    # Convert to PDF coordinates (flip Y)
                    pdf_x0 = x0
                    pdf_y0 = page_height - y1
                    pdf_y1 = page_height - y0

                    cell_width = x1 - x0
                    cell_height = pdf_y1 - pdf_y0

                    pdf_canvas.rect(pdf_x0, pdf_y0, cell_width, cell_height)

                    if (row, col) not in grid:
                        continue
                    cell_content = grid[(row, col)].get('content', '')
                    if not cell_content:
                        continue

                    # Fit against the padded width, then centre in the full cell.
                    font_size = self._fit_text_to_cell(
                        pdf_canvas, cell_content,
                        cell_width - horizontal_padding, cell_height
                    )
                    pdf_canvas.setFont(self.config.font_name, font_size)
                    text_width = pdf_canvas.stringWidth(
                        cell_content, self.config.font_name, font_size
                    )

                    text_x = pdf_x0 + (cell_width - text_width) / 2
                    text_y = pdf_y0 + (cell_height - font_size) / 2

                    pdf_canvas.drawString(text_x, text_y, cell_content)
        finally:
            # Keep saveState/restoreState balanced even if drawing fails.
            pdf_canvas.restoreState()

        logger.info(f"[TABLE] Successfully rendered {num_rows}x{num_cols} table from cell_boxes")
        return True

    except Exception as e:
        logger.error(f"CellBoxes grid rendering failed: {e}")
        import traceback
        traceback.print_exc()
        return False
|
|
|
|
# =========================================================================
|
|
# Grid and Cell Box Helpers
|
|
# =========================================================================
|
|
|
|
def compute_grid_from_cell_boxes(
    self,
    cell_boxes: List,
    table_bbox: Tuple[float, float, float, float],
    num_rows: int,
    num_cols: int
) -> Tuple[Optional[List[float]], Optional[List[float]]]:
    """
    Derive column widths and row heights from cell bounding boxes.

    Box edges are rounded to one decimal, de-duplicated, sorted and merged
    with the configured boundary threshold; consecutive merged edges give
    the widths and heights.

    Args:
        cell_boxes: List of [x0, y0, x1, y1] for each cell
        table_bbox: Table bounding box (not used by this method)
        num_rows: Expected number of rows (only used for a debug log)
        num_cols: Expected number of columns (only used for a debug log)

    Returns:
        Tuple of (col_widths, row_heights); either element is None when it
        would be empty, and (None, None) is returned on failure
    """
    try:
        if not cell_boxes:
            return None, None

        usable = [b for b in cell_boxes if b is not None and len(b) >= 4]
        if not usable:
            return None, None

        # Collect the distinct rounded edges on each axis.
        x_edges_seen = set()
        y_edges_seen = set()
        for left, top, right, bottom in (b[:4] for b in usable):
            x_edges_seen.update((round(left, 1), round(right, 1)))
            y_edges_seen.update((round(top, 1), round(bottom, 1)))

        x_ordered = sorted(x_edges_seen)
        y_ordered = sorted(y_edges_seen)

        x_edges = self._merge_boundaries(x_ordered, self.config.merge_boundary_threshold)
        y_edges = self._merge_boundaries(y_ordered, self.config.merge_boundary_threshold)

        col_widths = [x_edges[i + 1] - x_edges[i] for i in range(len(x_edges) - 1)]
        row_heights = [y_edges[i + 1] - y_edges[i] for i in range(len(y_edges) - 1)]

        # Merged cells legitimately change the counts, so a mismatch beyond
        # the tolerance is only logged, never treated as an error.
        tolerance = max(num_cols, num_rows) // 2 + 1
        if abs(len(col_widths) - num_cols) > tolerance:
            logger.debug(f"Column count mismatch: {len(col_widths)} vs {num_cols}")
        if abs(len(row_heights) - num_rows) > tolerance:
            logger.debug(f"Row count mismatch: {len(row_heights)} vs {num_rows}")

        return (col_widths or None), (row_heights or None)

    except Exception as e:
        logger.error(f"Grid computation failed: {e}")
        return None, None
|
|
|
|
def normalize_cell_boxes_to_grid(
    self,
    cell_boxes: List,
    threshold: Optional[float] = None
) -> List:
    """
    Snap cell boxes to aligned grid to eliminate coordinate variations.

    All X and all Y coordinates are clustered separately and every
    coordinate is replaced by the representative value of its cluster.

    Args:
        cell_boxes: List of [x0, y0, x1, y1] for each cell
        threshold: Clustering threshold (uses config default if None).
            An explicit 0 is honoured rather than replaced by the default.

    Returns:
        Normalized cell boxes in the original order. Entries that are None
        or have fewer than four values are passed through unchanged; the
        input list itself is returned if nothing usable is found or
        normalization fails.
    """
    # Compare against None: `threshold or default` would silently discard
    # an explicit 0 / 0.0 threshold.
    if threshold is None:
        threshold = self.config.grid_threshold

    if not cell_boxes:
        return []

    try:
        all_x = []
        all_y = []
        for box in cell_boxes:
            if box is None or len(box) < 4:
                continue
            x0, y0, x1, y1 = box[:4]
            all_x.extend((x0, x1))
            all_y.extend((y0, y1))

        if not all_x or not all_y:
            return cell_boxes

        # Each cluster is consumed as an (average, member_values) pair.
        x_clusters = self._cluster_values(sorted(all_x), threshold)
        y_clusters = self._cluster_values(sorted(all_y), threshold)

        # Map every raw coordinate to the average of its cluster.
        x_map = {v: avg for avg, values in x_clusters for v in values}
        y_map = {v: avg for avg, values in y_clusters for v in values}

        normalized = []
        for box in cell_boxes:
            if box is None or len(box) < 4:
                normalized.append(box)
                continue

            x0, y0, x1, y1 = box[:4]
            normalized.append([
                x_map.get(x0, x0),
                y_map.get(y0, y0),
                x_map.get(x1, x1),
                y_map.get(y1, y1)
            ])

        return normalized

    except Exception as e:
        logger.error(f"Cell box normalization failed: {e}")
        return cell_boxes
|
|
|
|
# =========================================================================
|
|
# Private Helper Methods
|
|
# =========================================================================
|
|
|
|
def _render_parsed_table(
    self,
    pdf_canvas,
    table_data: Dict,
    table_bbox: Tuple[float, float, float, float],
    page_height: float,
    scale_w: float = 1.0,
    scale_h: float = 1.0
) -> bool:
    """
    Render a parsed table structure as a ReportLab Table.

    Cells are laid out left to right, skipping positions already covered by
    an earlier colspan/rowspan. Columns and rows share the (scaled) bounding
    box equally.

    Args:
        pdf_canvas: ReportLab canvas
        table_data: Dict with a 'rows' list; each row has a 'cells' list of
            dicts carrying 'text', 'colspan' and 'rowspan'
        table_bbox: (x0, y0, x1, y1) bounding box
        page_height: PDF page height for Y coordinate flip
        scale_w: Horizontal scale factor applied to the table width
        scale_h: Vertical scale factor applied to the table height

    Returns:
        True once the table has been drawn, False when there are no rows or
        no cells
    """
    # Function-scope import: the module's import block does not provide it.
    from xml.sax.saxutils import escape

    rows = table_data.get('rows', [])
    if not rows:
        return False

    num_rows = len(rows)
    num_cols = max(len(row.get('cells', [])) for row in rows)
    if num_cols == 0:
        # Nothing to lay out; also avoids dividing the width by zero below.
        logger.warning("Parsed table has no cells; nothing to render")
        return False

    # Track occupied cells for colspan/rowspan handling
    occupied = [[False] * num_cols for _ in range(num_rows)]

    grid = []
    span_commands = []

    for row_idx, row in enumerate(rows):
        grid_row = [''] * num_cols
        col_idx = 0

        for cell in row.get('cells', []):
            # Skip positions taken by a span from an earlier cell
            while col_idx < num_cols and occupied[row_idx][col_idx]:
                col_idx += 1

            if col_idx >= num_cols:
                break

            text = cell.get('text', '').strip()
            # A span below 1 is meaningless and would stall or rewind col_idx.
            colspan = max(1, cell.get('colspan', 1))
            rowspan = max(1, cell.get('rowspan', 1))

            grid_row[col_idx] = text

            # Clamp the span to the grid so SPAN never points outside it.
            end_col = min(col_idx + colspan - 1, num_cols - 1)
            end_row = min(row_idx + rowspan - 1, num_rows - 1)

            if colspan > 1 or rowspan > 1:
                span_commands.append(
                    ('SPAN', (col_idx, row_idx), (end_col, end_row))
                )

            for r in range(row_idx, end_row + 1):
                for c in range(col_idx, end_col + 1):
                    occupied[r][c] = True

            col_idx += colspan

        grid.append(grid_row)

    # Calculate dimensions
    x0, y0, x1, y1 = table_bbox
    table_width = (x1 - x0) * scale_w
    table_height = (y1 - y0) * scale_h

    col_widths = [table_width / num_cols] * num_cols
    row_heights = [table_height / num_rows] * num_rows

    style = ParagraphStyle(
        'TableCell',
        fontName=self.config.font_name,
        fontSize=self.config.font_size,
        alignment=TA_CENTER,
        leading=self.config.font_size * 1.2
    )

    # Paragraph interprets its text as markup, while the cell text here is
    # plain text, so '&', '<' and '>' must be escaped to render literally.
    para_grid = [
        [Paragraph(escape(cell), style) if cell else '' for cell in grid_row]
        for grid_row in grid
    ]

    table_style_commands = [
        ('GRID', (0, 0), (-1, -1), self.config.border_width, self.config.border_color),
        ('VALIGN', (0, 0), (-1, -1), self.config.vertical_align),
        ('ALIGN', (0, 0), (-1, -1), self.config.horizontal_align),
        ('LEFTPADDING', (0, 0), (-1, -1), self.config.left_padding),
        ('RIGHTPADDING', (0, 0), (-1, -1), self.config.right_padding),
        ('TOPPADDING', (0, 0), (-1, -1), self.config.top_padding),
        ('BOTTOMPADDING', (0, 0), (-1, -1), self.config.bottom_padding),
        ('FONTNAME', (0, 0), (-1, -1), self.config.font_name),
        ('FONTSIZE', (0, 0), (-1, -1), self.config.font_size),
    ]
    table_style_commands.extend(span_commands)

    table = Table(para_grid, colWidths=col_widths, rowHeights=row_heights)
    table.setStyle(TableStyle(table_style_commands))

    # Position and draw: the table is anchored at its lower-left corner.
    pdf_x = x0
    pdf_y = page_height - y1  # Flip Y

    table.wrapOn(pdf_canvas, table_width, table_height)
    table.drawOn(pdf_canvas, pdf_x, pdf_y)

    return True
|
|
|
|
def _render_with_dimensions(
    self,
    pdf_canvas,
    table_data: Dict,
    table_bbox: Tuple[float, float, float, float],
    page_height: float,
    col_widths: List[float],
    row_heights: List[float]
) -> bool:
    """
    Render table with specified dimensions.

    Each cell is placed at its own 'col' index (moved right past positions
    already covered by an earlier span). Widths or heights that are missing
    or do not match the grid size are replaced by an equal split of the
    bounding box.

    Args:
        pdf_canvas: ReportLab canvas
        table_data: Dict with a 'rows' list; each row has a 'cells' list of
            dicts carrying 'text', 'col', 'colspan' and 'rowspan'
        table_bbox: (x0, y0, x1, y1) bounding box
        page_height: PDF page height for Y coordinate flip
        col_widths: Column widths (may be None or of the wrong length)
        row_heights: Row heights (may be None or of the wrong length)

    Returns:
        True once the table has been drawn, False when there are no rows or
        no cells
    """
    rows = table_data.get('rows', [])
    if not rows:
        return False

    num_rows = len(rows)
    num_cols = max(len(row.get('cells', [])) for row in rows)
    if num_cols == 0:
        # Nothing to lay out; also avoids dividing the width by zero below.
        logger.warning("Table rows contain no cells; nothing to render")
        return False

    x0, y0, x1, y1 = table_bbox

    # Fall back to a uniform split when the supplied dimensions are absent
    # (None/empty) or do not match the grid that is actually rendered.
    if not col_widths or len(col_widths) != num_cols:
        col_widths = [(x1 - x0) / num_cols] * num_cols
    if not row_heights or len(row_heights) != num_rows:
        row_heights = [(y1 - y0) / num_rows] * num_rows

    grid = []
    span_commands = []
    occupied = [[False] * num_cols for _ in range(num_rows)]

    for row_idx, row in enumerate(rows):
        grid_row = [''] * num_cols

        for cell in row.get('cells', []):
            col_idx = cell.get('col', 0)

            # Move right past occupied positions; drop the cell if none is left
            while col_idx < num_cols and occupied[row_idx][col_idx]:
                col_idx += 1
            if col_idx >= num_cols:
                continue

            text = cell.get('text', '').strip()
            # A span below 1 is meaningless and would yield an inverted SPAN.
            colspan = max(1, cell.get('colspan', 1))
            rowspan = max(1, cell.get('rowspan', 1))

            grid_row[col_idx] = text

            # Clamp the span to the grid so SPAN never points outside it.
            end_col = min(col_idx + colspan - 1, num_cols - 1)
            end_row = min(row_idx + rowspan - 1, num_rows - 1)

            if colspan > 1 or rowspan > 1:
                span_commands.append(
                    ('SPAN', (col_idx, row_idx), (end_col, end_row))
                )

            for r in range(row_idx, end_row + 1):
                for c in range(col_idx, end_col + 1):
                    occupied[r][c] = True

        grid.append(grid_row)

    style = ParagraphStyle(
        'TableCell',
        fontName=self.config.font_name,
        fontSize=self.config.font_size,
        alignment=TA_CENTER
    )

    para_grid = [
        [Paragraph(cell, style) if cell else '' for cell in grid_row]
        for grid_row in grid
    ]

    # Padding is fixed here rather than taken from the config.
    table_style_commands = [
        ('GRID', (0, 0), (-1, -1), self.config.border_width, self.config.border_color),
        ('VALIGN', (0, 0), (-1, -1), self.config.vertical_align),
        ('LEFTPADDING', (0, 0), (-1, -1), 0),
        ('RIGHTPADDING', (0, 0), (-1, -1), 0),
        ('TOPPADDING', (0, 0), (-1, -1), 0),
        ('BOTTOMPADDING', (0, 0), (-1, -1), 1),
    ]
    table_style_commands.extend(span_commands)

    table = Table(para_grid, colWidths=col_widths, rowHeights=row_heights)
    table.setStyle(TableStyle(table_style_commands))

    # The table is anchored at its lower-left corner (Y flipped).
    pdf_x = x0
    pdf_y = page_height - y1

    table.wrapOn(pdf_canvas, x1 - x0, y1 - y0)
    table.drawOn(pdf_canvas, pdf_x, pdf_y)

    return True
|
|
|
|
def _build_rows_from_cells_dict(self, cells_dict: Dict) -> List[Dict]:
|
|
"""Convert Direct track cell structure to row format."""
|
|
cells = cells_dict.get('cells', [])
|
|
if not cells:
|
|
return []
|
|
|
|
num_rows = cells_dict.get('rows', 0)
|
|
num_cols = cells_dict.get('cols', 0)
|
|
|
|
# Group cells by row
|
|
rows_data = {}
|
|
for cell in cells:
|
|
row_idx = cell.get('row', 0)
|
|
if row_idx not in rows_data:
|
|
rows_data[row_idx] = []
|
|
rows_data[row_idx].append(cell)
|
|
|
|
# Build row list
|
|
rows = []
|
|
for row_idx in range(num_rows):
|
|
row_cells = rows_data.get(row_idx, [])
|
|
|
|
# Sort by column
|
|
row_cells.sort(key=lambda c: c.get('col', 0))
|
|
|
|
formatted_cells = []
|
|
for cell in row_cells:
|
|
content = cell.get('content', '')
|
|
if isinstance(content, list):
|
|
content = '\n'.join(str(c) for c in content)
|
|
|
|
formatted_cells.append({
|
|
'text': str(content) if content else '',
|
|
'colspan': cell.get('col_span', 1),
|
|
'rowspan': cell.get('row_span', 1),
|
|
'col': cell.get('col', 0),
|
|
'is_header': cell.get('is_header', False)
|
|
})
|
|
|
|
rows.append({'cells': formatted_cells})
|
|
|
|
return rows
|
|
|
|
def _draw_table_border(
|
|
self,
|
|
pdf_canvas,
|
|
table_bbox: Tuple[float, float, float, float],
|
|
page_height: float
|
|
) -> bool:
|
|
"""Draw outer table border."""
|
|
try:
|
|
x0, y0, x1, y1 = table_bbox
|
|
pdf_y0 = page_height - y1
|
|
pdf_y1 = page_height - y0
|
|
|
|
pdf_canvas.saveState()
|
|
pdf_canvas.setStrokeColor(self.config.border_color)
|
|
pdf_canvas.setLineWidth(self.config.border_width)
|
|
pdf_canvas.rect(x0, pdf_y0, x1 - x0, pdf_y1 - pdf_y0)
|
|
pdf_canvas.restoreState()
|
|
|
|
return True
|
|
except Exception as e:
|
|
logger.error(f"Failed to draw table border: {e}")
|
|
return False
|
|
|
|
def _draw_embedded_image(
|
|
self,
|
|
pdf_canvas,
|
|
img_info: Dict,
|
|
page_height: float,
|
|
output_dir: Path
|
|
) -> bool:
|
|
"""Draw an image embedded within a table cell."""
|
|
try:
|
|
img_path = img_info.get('path')
|
|
if not img_path:
|
|
return False
|
|
|
|
# Resolve path
|
|
if not Path(img_path).is_absolute():
|
|
img_path = output_dir / img_path
|
|
|
|
if not Path(img_path).exists():
|
|
logger.warning(f"Embedded image not found: {img_path}")
|
|
return False
|
|
|
|
bbox = img_info.get('bbox', {})
|
|
x0 = bbox.get('x0', 0)
|
|
y0 = bbox.get('y0', 0)
|
|
width = bbox.get('width', 100)
|
|
height = bbox.get('height', 100)
|
|
|
|
# Flip Y coordinate
|
|
pdf_y = page_height - y0 - height
|
|
|
|
# Draw image
|
|
img = ImageReader(str(img_path))
|
|
pdf_canvas.drawImage(img, x0, pdf_y, width, height)
|
|
|
|
return True
|
|
|
|
except Exception as e:
|
|
logger.error(f"Failed to draw embedded image: {e}")
|
|
return False
|
|
|
|
def _fit_text_to_cell(
|
|
self,
|
|
pdf_canvas,
|
|
text: str,
|
|
cell_width: float,
|
|
cell_height: float
|
|
) -> int:
|
|
"""Find font size that fits text in cell."""
|
|
for size in range(self.config.max_font_size, self.config.min_font_size - 1, -1):
|
|
text_width = pdf_canvas.stringWidth(text, self.config.font_name, size)
|
|
if text_width <= cell_width - 6: # 3pt padding each side
|
|
return size
|
|
return self.config.min_font_size
|
|
|
|
def _merge_boundaries(self, values: List[float], threshold: float) -> List[float]:
|
|
"""Merge nearby boundary values."""
|
|
if not values:
|
|
return []
|
|
|
|
merged = [values[0]]
|
|
for v in values[1:]:
|
|
if abs(v - merged[-1]) > threshold:
|
|
merged.append(v)
|
|
|
|
return merged
|
|
|
|
def _cluster_values(self, values: List[float], threshold: float) -> List[Tuple[float, List[float]]]:
|
|
"""Cluster nearby values and return (average, members) pairs."""
|
|
if not values:
|
|
return []
|
|
|
|
clusters = []
|
|
current_cluster = [values[0]]
|
|
|
|
for v in values[1:]:
|
|
if abs(v - current_cluster[-1]) <= threshold:
|
|
current_cluster.append(v)
|
|
else:
|
|
avg = sum(current_cluster) / len(current_cluster)
|
|
clusters.append((avg, current_cluster))
|
|
current_cluster = [v]
|
|
|
|
if current_cluster:
|
|
avg = sum(current_cluster) / len(current_cluster)
|
|
clusters.append((avg, current_cluster))
|
|
|
|
return clusters
|