chore: backup before code cleanup
Backup commit before executing remove-unused-code proposal. This includes all pending changes and new features. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -24,6 +24,256 @@ from reportlab.platypus import Paragraph, Table, TableStyle
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Cell Box Grid Inferrer
|
||||
# ============================================================================
|
||||
|
||||
class CellBoxGridInferrer:
|
||||
"""
|
||||
Infer table grid structure from cell_boxes coordinates.
|
||||
|
||||
This class clusters cell_boxes by Y-coordinate (rows) and X-coordinate (columns)
|
||||
to determine the grid structure, regardless of HTML colspan/rowspan.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
row_threshold: float = 15.0,
|
||||
col_threshold: float = 15.0
|
||||
):
|
||||
"""
|
||||
Initialize grid inferrer.
|
||||
|
||||
Args:
|
||||
row_threshold: Y-coordinate threshold for row clustering
|
||||
col_threshold: X-coordinate threshold for column clustering
|
||||
"""
|
||||
self.row_threshold = row_threshold
|
||||
self.col_threshold = col_threshold
|
||||
|
||||
def infer_grid(
|
||||
self,
|
||||
cell_boxes: List[List[float]]
|
||||
) -> Optional[Dict]:
|
||||
"""
|
||||
Infer grid structure from cell_boxes.
|
||||
|
||||
Args:
|
||||
cell_boxes: List of [x0, y0, x1, y1] coordinates
|
||||
|
||||
Returns:
|
||||
Dict with 'grid', 'num_rows', 'num_cols', 'row_boundaries', 'col_boundaries'
|
||||
or None if inference fails
|
||||
"""
|
||||
if not cell_boxes or len(cell_boxes) < 1:
|
||||
return None
|
||||
|
||||
try:
|
||||
# Filter valid boxes
|
||||
valid_boxes = [
|
||||
b for b in cell_boxes
|
||||
if b is not None and len(b) >= 4
|
||||
]
|
||||
if not valid_boxes:
|
||||
return None
|
||||
|
||||
# Extract Y and X boundaries from all cells
|
||||
y_mins = [b[1] for b in valid_boxes] # y0
|
||||
y_maxs = [b[3] for b in valid_boxes] # y1
|
||||
x_mins = [b[0] for b in valid_boxes] # x0
|
||||
x_maxs = [b[2] for b in valid_boxes] # x1
|
||||
|
||||
# Cluster Y values to determine rows
|
||||
all_y = sorted(set(y_mins + y_maxs))
|
||||
y_boundaries = self._cluster_to_boundaries(all_y, self.row_threshold)
|
||||
|
||||
# Cluster X values to determine columns
|
||||
all_x = sorted(set(x_mins + x_maxs))
|
||||
x_boundaries = self._cluster_to_boundaries(all_x, self.col_threshold)
|
||||
|
||||
if len(y_boundaries) < 2 or len(x_boundaries) < 2:
|
||||
return None
|
||||
|
||||
num_rows = len(y_boundaries) - 1
|
||||
num_cols = len(x_boundaries) - 1
|
||||
|
||||
# Build grid: map (row, col) -> cell_box info
|
||||
grid = {}
|
||||
for idx, box in enumerate(valid_boxes):
|
||||
x0, y0, x1, y1 = box[:4]
|
||||
|
||||
# Find row by y_center
|
||||
y_center = (y0 + y1) / 2
|
||||
row = self._find_position(y_center, y_boundaries)
|
||||
|
||||
# Find col by x_center
|
||||
x_center = (x0 + x1) / 2
|
||||
col = self._find_position(x_center, x_boundaries)
|
||||
|
||||
if row is not None and col is not None:
|
||||
grid[(row, col)] = {
|
||||
'bbox': box,
|
||||
'index': idx,
|
||||
'content': ''
|
||||
}
|
||||
|
||||
# Calculate row heights and column widths
|
||||
row_heights = [
|
||||
y_boundaries[i + 1] - y_boundaries[i]
|
||||
for i in range(num_rows)
|
||||
]
|
||||
col_widths = [
|
||||
x_boundaries[i + 1] - x_boundaries[i]
|
||||
for i in range(num_cols)
|
||||
]
|
||||
|
||||
return {
|
||||
'grid': grid,
|
||||
'num_rows': num_rows,
|
||||
'num_cols': num_cols,
|
||||
'row_boundaries': y_boundaries,
|
||||
'col_boundaries': x_boundaries,
|
||||
'row_heights': row_heights,
|
||||
'col_widths': col_widths
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Grid inference failed: {e}")
|
||||
return None
|
||||
|
||||
def _cluster_to_boundaries(
|
||||
self,
|
||||
values: List[float],
|
||||
threshold: float
|
||||
) -> List[float]:
|
||||
"""
|
||||
Cluster nearby values and return representative boundaries.
|
||||
|
||||
Args:
|
||||
values: Sorted list of coordinate values
|
||||
threshold: Clustering threshold
|
||||
|
||||
Returns:
|
||||
List of boundary values (cluster representatives)
|
||||
"""
|
||||
if not values:
|
||||
return []
|
||||
|
||||
boundaries = [values[0]]
|
||||
current_cluster = [values[0]]
|
||||
|
||||
for v in values[1:]:
|
||||
if v - current_cluster[-1] <= threshold:
|
||||
current_cluster.append(v)
|
||||
else:
|
||||
# Finish current cluster, use average as boundary
|
||||
boundaries[-1] = sum(current_cluster) / len(current_cluster)
|
||||
boundaries.append(v)
|
||||
current_cluster = [v]
|
||||
|
||||
# Finish last cluster
|
||||
if current_cluster:
|
||||
boundaries[-1] = sum(current_cluster) / len(current_cluster)
|
||||
|
||||
return boundaries
|
||||
|
||||
def _find_position(
|
||||
self,
|
||||
value: float,
|
||||
boundaries: List[float]
|
||||
) -> Optional[int]:
|
||||
"""
|
||||
Find which interval a value falls into.
|
||||
|
||||
Args:
|
||||
value: Coordinate value
|
||||
boundaries: List of boundary values
|
||||
|
||||
Returns:
|
||||
Index of interval, or None if out of bounds
|
||||
"""
|
||||
for i in range(len(boundaries) - 1):
|
||||
if boundaries[i] <= value <= boundaries[i + 1]:
|
||||
return i
|
||||
|
||||
# Check if close to any boundary
|
||||
for i in range(len(boundaries) - 1):
|
||||
mid = (boundaries[i] + boundaries[i + 1]) / 2
|
||||
if abs(value - mid) < (boundaries[i + 1] - boundaries[i]):
|
||||
return i
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def extract_cell_contents_from_html(html: str) -> List[str]:
    """
    Collect the text of every cell of the first parsed table.

    The HTML is fed to an HTMLTableParser and the cells of its first table
    are flattened row by row, in the order the parser reports them.

    Args:
        html: HTML table string

    Returns:
        List of whitespace-stripped cell texts, one per cell. An empty list
        is returned when no table is found or when parsing raises.
    """
    try:
        table_parser = HTMLTableParser()
        table_parser.feed(html)

        tables = table_parser.tables
        if not tables:
            return []

        # Only the first table is considered; missing keys act as empty.
        return [
            cell.get('text', '').strip()
            for table_row in tables[0].get('rows', [])
            for cell in table_row.get('cells', [])
        ]

    except Exception as e:
        logger.error(f"HTML content extraction failed: {e}")
        return []
|
||||
|
||||
|
||||
def map_content_to_grid(
    grid: Dict[Tuple[int, int], Dict],
    contents: List[str],
    num_rows: int,
    num_cols: int
) -> Dict[Tuple[int, int], Dict]:
    """
    Assign HTML cell texts to the occupied grid cells in row-major order.

    Positions missing from ``grid`` are skipped and do not consume a content
    entry. Occupied cells left over once ``contents`` is exhausted receive an
    empty string. The grid is modified in place.

    Args:
        grid: Dict mapping (row, col) to cell info
        contents: List of text contents from HTML
        num_rows: Number of rows in grid
        num_cols: Number of columns in grid

    Returns:
        The same grid object, with each visited cell's 'content' set
    """
    total = len(contents)

    # Occupied positions, top-to-bottom then left-to-right.
    occupied = [
        (r, c)
        for r in range(num_rows)
        for c in range(num_cols)
        if (r, c) in grid
    ]

    for position, key in enumerate(occupied):
        grid[key]['content'] = contents[position] if position < total else ''

    # Number of content entries that actually found a cell.
    consumed = min(len(occupied), total)

    # Report HTML cells that had no grid cell to go to.
    if consumed < total:
        logger.debug(
            f"Content mismatch: {len(contents)} HTML cells, "
            f"only {consumed} mapped to {len(grid)} grid cells"
        )

    return grid
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Configuration
|
||||
# ============================================================================
|
||||
@@ -405,6 +655,147 @@ class TableRenderer:
|
||||
traceback.print_exc()
|
||||
return False
|
||||
|
||||
def render_from_cellboxes_grid(
    self,
    pdf_canvas,
    cell_boxes: List[List[float]],
    html_content: str,
    table_bbox: Tuple[float, float, float, float],
    page_height: float,
    scale_w: float = 1.0,
    scale_h: float = 1.0,
    row_threshold: float = 15.0,
    col_threshold: float = 15.0
) -> bool:
    """
    Render table using cell_boxes as the primary structure source.

    This method infers grid structure from cell_boxes coordinates and
    maps HTML content to cells, regardless of HTML colspan/rowspan. A
    rectangle is drawn for every (row, col) of the inferred grid; text is
    drawn, centred in its cell, only for cells that received content.

    Args:
        pdf_canvas: ReportLab canvas
        cell_boxes: List of [x0, y0, x1, y1] for each cell
        html_content: HTML table string (for text content); when empty,
            only the borders are drawn
        table_bbox: Table bounding box (accepted for interface
            compatibility; not used by this method)
        page_height: PDF page height, used to flip the Y axis
        scale_w: Horizontal scale factor applied to column boundaries
        scale_h: Vertical scale factor applied to row boundaries
        row_threshold: Y-coordinate threshold for row clustering
        col_threshold: X-coordinate threshold for column clustering

    Returns:
        True if successful, False otherwise (no cell_boxes, grid inference
        failed, or an exception was raised while rendering)
    """
    try:
        if not cell_boxes:
            logger.debug("No cell_boxes provided for grid rendering")
            return False

        # Infer grid structure from cell_boxes
        inferrer = CellBoxGridInferrer(
            row_threshold=row_threshold,
            col_threshold=col_threshold
        )
        grid_info = inferrer.infer_grid(cell_boxes)

        if not grid_info:
            logger.debug("Failed to infer grid from cell_boxes")
            return False

        grid = grid_info['grid']
        num_rows = grid_info['num_rows']
        num_cols = grid_info['num_cols']

        logger.info(
            f"[TABLE] CellBoxes grid inferred: {num_rows} rows x {num_cols} cols "
            f"from {len(cell_boxes)} cell_boxes"
        )

        # Extract content from HTML
        if html_content:
            contents = extract_cell_contents_from_html(html_content)
            grid = map_content_to_grid(grid, contents, num_rows, num_cols)
            logger.debug(f"[TABLE] Mapped {len(contents)} HTML cells to grid")

        # Apply scale factors to boundaries
        scaled_rows = [y * scale_h for y in grid_info['row_boundaries']]
        scaled_cols = [x * scale_w for x in grid_info['col_boundaries']]

        font_name = self.config.font_name
        horizontal_padding = self.config.left_padding + self.config.right_padding

        pdf_canvas.saveState()
        try:
            pdf_canvas.setStrokeColor(self.config.border_color)
            pdf_canvas.setLineWidth(self.config.border_width)

            # Consecutive boundary pairs delimit the rows and columns; the
            # inferrer guarantees one more boundary than rows/columns.
            for row, (y0, y1) in enumerate(zip(scaled_rows, scaled_rows[1:])):
                # Convert to PDF coordinates (flip Y): the lower edge in
                # PDF space comes from the larger source Y value.
                pdf_y0 = page_height - y1
                cell_height = y1 - y0

                for col, (x0, x1) in enumerate(zip(scaled_cols, scaled_cols[1:])):
                    cell_width = x1 - x0

                    # Draw cell border
                    pdf_canvas.rect(x0, pdf_y0, cell_width, cell_height)

                    # Draw text if cell exists in grid and has content
                    cell = grid.get((row, col))
                    cell_content = cell.get('content', '') if cell else ''
                    if not cell_content:
                        continue

                    # NOTE(review): _fit_text_to_cell is defined elsewhere;
                    # presumably it returns a font size that fits the text
                    # into the given width/height - confirm.
                    font_size = self._fit_text_to_cell(
                        pdf_canvas,
                        cell_content,
                        cell_width - horizontal_padding,
                        cell_height
                    )

                    pdf_canvas.setFont(font_name, font_size)
                    text_width = pdf_canvas.stringWidth(
                        cell_content, font_name, font_size
                    )

                    # Centre the text horizontally and vertically
                    text_x = x0 + (cell_width - text_width) / 2
                    text_y = pdf_y0 + (cell_height - font_size) / 2
                    pdf_canvas.drawString(text_x, text_y, cell_content)
        finally:
            # Always balance saveState(), even when drawing fails, so the
            # caller's canvas state stack is left as it was found.
            pdf_canvas.restoreState()

        logger.info(f"[TABLE] Successfully rendered {num_rows}x{num_cols} table from cell_boxes")
        return True

    except Exception as e:
        logger.error(f"CellBoxes grid rendering failed: {e}")
        import traceback
        traceback.print_exc()
        return False
|
||||
|
||||
# =========================================================================
|
||||
# Grid and Cell Box Helpers
|
||||
# =========================================================================
|
||||
|
||||
Reference in New Issue
Block a user