chore: backup before code cleanup

Backup commit before executing remove-unused-code proposal.
This includes all pending changes and new features.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
egg
2025-12-11 11:55:39 +08:00
parent eff9b0bcd5
commit 940a406dce
58 changed files with 8226 additions and 175 deletions

View File

@@ -24,6 +24,256 @@ from reportlab.platypus import Paragraph, Table, TableStyle
logger = logging.getLogger(__name__)
# ============================================================================
# Cell Box Grid Inferrer
# ============================================================================
class CellBoxGridInferrer:
    """
    Infer table grid structure from cell_boxes coordinates.

    The edges of all cell boxes are clustered along the Y axis (row
    boundaries) and along the X axis (column boundaries). Each box is then
    assigned to the grid slot that contains its center point, regardless of
    any HTML colspan/rowspan information.
    """

    def __init__(
        self,
        row_threshold: float = 15.0,
        col_threshold: float = 15.0
    ):
        """
        Initialize grid inferrer.

        Args:
            row_threshold: Y-coordinate threshold for row clustering
            col_threshold: X-coordinate threshold for column clustering
        """
        self.row_threshold = row_threshold
        self.col_threshold = col_threshold

    def infer_grid(
        self,
        cell_boxes: List[List[float]]
    ) -> Optional[Dict]:
        """
        Infer grid structure from cell_boxes.

        Args:
            cell_boxes: List of [x0, y0, x1, y1] coordinates. Entries that are
                None or have fewer than four values are ignored.

        Returns:
            Dict with 'grid', 'num_rows', 'num_cols', 'row_boundaries',
            'col_boundaries', 'row_heights' and 'col_widths', or None if
            inference fails (no usable boxes, fewer than two boundaries on
            either axis, or an unexpected error).

            'grid' maps (row, col) to {'bbox', 'index', 'content'}. 'index' is
            the position of the box in the filtered list of valid boxes, and
            'content' starts out empty. When several boxes land in the same
            slot, the last one wins.
        """
        if not cell_boxes:
            return None

        try:
            # Filter valid boxes
            valid_boxes = [
                b for b in cell_boxes
                if b is not None and len(b) >= 4
            ]
            if not valid_boxes:
                return None

            # Collect every top/bottom edge and every left/right edge
            all_y = sorted({b[1] for b in valid_boxes} | {b[3] for b in valid_boxes})
            all_x = sorted({b[0] for b in valid_boxes} | {b[2] for b in valid_boxes})

            # Cluster edges into row and column boundaries
            y_boundaries = self._cluster_to_boundaries(all_y, self.row_threshold)
            x_boundaries = self._cluster_to_boundaries(all_x, self.col_threshold)

            # At least two boundaries per axis are needed to form one interval
            if len(y_boundaries) < 2 or len(x_boundaries) < 2:
                return None

            num_rows = len(y_boundaries) - 1
            num_cols = len(x_boundaries) - 1

            # Build grid: map (row, col) -> cell_box info
            grid = {}
            for idx, box in enumerate(valid_boxes):
                x0, y0, x1, y1 = box[:4]

                # Locate the slot by the center point of the box
                row = self._find_position((y0 + y1) / 2, y_boundaries)
                col = self._find_position((x0 + x1) / 2, x_boundaries)

                if row is not None and col is not None:
                    grid[(row, col)] = {
                        'bbox': box,
                        'index': idx,
                        'content': ''
                    }

            # Calculate row heights and column widths
            row_heights = [
                y_boundaries[i + 1] - y_boundaries[i]
                for i in range(num_rows)
            ]
            col_widths = [
                x_boundaries[i + 1] - x_boundaries[i]
                for i in range(num_cols)
            ]

            return {
                'grid': grid,
                'num_rows': num_rows,
                'num_cols': num_cols,
                'row_boundaries': y_boundaries,
                'col_boundaries': x_boundaries,
                'row_heights': row_heights,
                'col_widths': col_widths
            }
        except Exception as e:
            # Best-effort: callers treat None as "could not infer a grid"
            logger.error(f"Grid inference failed: {e}")
            return None

    def _cluster_to_boundaries(
        self,
        values: List[float],
        threshold: float
    ) -> List[float]:
        """
        Cluster nearby values and return representative boundaries.

        A value joins the current cluster when it is within ``threshold`` of
        the previous value in that cluster (so clusters can chain), otherwise
        it starts a new cluster. Each cluster is represented by its mean.

        Args:
            values: Sorted (ascending) list of coordinate values
            threshold: Clustering threshold

        Returns:
            List of boundary values (cluster representatives)
        """
        if not values:
            return []

        boundaries = [values[0]]
        current_cluster = [values[0]]

        for v in values[1:]:
            if v - current_cluster[-1] <= threshold:
                current_cluster.append(v)
            else:
                # Finish current cluster, use average as boundary
                boundaries[-1] = sum(current_cluster) / len(current_cluster)
                boundaries.append(v)
                current_cluster = [v]

        # Finish last cluster
        boundaries[-1] = sum(current_cluster) / len(current_cluster)
        return boundaries

    def _find_position(
        self,
        value: float,
        boundaries: List[float]
    ) -> Optional[int]:
        """
        Find which interval a value falls into.

        A value that lies inside an interval (edges included) maps to the
        first such interval. A value outside every interval maps to the
        nearest interval, provided it is closer to that interval's midpoint
        than the interval's width; otherwise it is out of bounds.

        Args:
            value: Coordinate value
            boundaries: List of boundary values

        Returns:
            Index of interval, or None if out of bounds
        """
        intervals = list(zip(boundaries, boundaries[1:]))

        for i, (lo, hi) in enumerate(intervals):
            if lo <= value <= hi:
                return i

        if not intervals:
            return None

        def gap(i: int) -> float:
            # Distance from the value to the closest edge of interval i
            lo, hi = intervals[i]
            return max(lo - value, value - hi)

        # Pick the nearest interval first, so a wide interval far away cannot
        # capture a value that sits right next to a different interval.
        nearest = min(range(len(intervals)), key=gap)
        lo, hi = intervals[nearest]
        if abs(value - (lo + hi) / 2) < (hi - lo):
            return nearest
        return None
def extract_cell_contents_from_html(html: str) -> List[str]:
    """
    Collect the text of every cell of the first table found in an HTML string.

    The HTML is fed to an HTMLTableParser; cells are visited row by row, in
    the order the parser reports them, and each text is stripped of
    surrounding whitespace.

    Args:
        html: HTML table string

    Returns:
        List of text strings, one per cell. Empty when the parser finds no
        table or when any error occurs during extraction.
    """
    try:
        table_parser = HTMLTableParser()
        table_parser.feed(html)

        tables = table_parser.tables
        if not tables:
            return []

        # Only the first parsed table is used
        first_table = tables[0]
        return [
            cell.get('text', '').strip()
            for row in first_table.get('rows', [])
            for cell in row.get('cells', [])
        ]
    except Exception as e:
        logger.error(f"HTML content extraction failed: {e}")
        return []
def map_content_to_grid(
    grid: Dict[Tuple[int, int], Dict],
    contents: List[str],
    num_rows: int,
    num_cols: int
) -> Dict[Tuple[int, int], Dict]:
    """
    Assign HTML cell texts to the occupied grid slots in row-major order.

    Slots are visited row by row, left to right. Every (row, col) present in
    ``grid`` receives the next unused entry of ``contents``; once the
    contents run out, remaining slots get an empty string. The grid is
    modified in place.

    Args:
        grid: Dict mapping (row, col) to cell info
        contents: List of text contents from HTML
        num_rows: Number of rows in grid
        num_cols: Number of columns in grid

    Returns:
        The same grid object, with 'content' set on each visited cell
    """
    total = len(contents)
    used = 0

    for r in range(num_rows):
        for c in range(num_cols):
            slot = (r, c)
            if slot not in grid:
                continue
            if used < total:
                grid[slot]['content'] = contents[used]
                used += 1
            else:
                grid[slot]['content'] = ''

    # Leftover HTML cells mean the grid had fewer slots than the HTML had cells
    if used < total:
        logger.debug(
            f"Content mismatch: {total} HTML cells, "
            f"only {used} mapped to {len(grid)} grid cells"
        )

    return grid
# ============================================================================
# Configuration
# ============================================================================
@@ -405,6 +655,147 @@ class TableRenderer:
traceback.print_exc()
return False
def render_from_cellboxes_grid(
    self,
    pdf_canvas,
    cell_boxes: List[List[float]],
    html_content: str,
    table_bbox: Tuple[float, float, float, float],
    page_height: float,
    scale_w: float = 1.0,
    scale_h: float = 1.0,
    row_threshold: float = 15.0,
    col_threshold: float = 15.0
) -> bool:
    """
    Render table using cell_boxes as the primary structure source.

    This method infers grid structure from cell_boxes coordinates and
    maps HTML content to cells, regardless of HTML colspan/rowspan. A border
    is drawn for every (row, col) slot of the inferred grid; text is drawn
    only for slots that hold a cell with non-empty content.

    Args:
        pdf_canvas: ReportLab canvas
        cell_boxes: List of [x0, y0, x1, y1] for each cell
        html_content: HTML table string (for text content)
        table_bbox: Table bounding box (not used by this method)
        page_height: PDF page height, used to flip Y coordinates
        scale_w: Horizontal scale factor
        scale_h: Vertical scale factor
        row_threshold: Y-coordinate threshold for row clustering
        col_threshold: X-coordinate threshold for column clustering

    Returns:
        True if successful, False otherwise (no cell_boxes, grid inference
        failed, or an error occurred while drawing)
    """
    try:
        if not cell_boxes:
            logger.debug("No cell_boxes provided for grid rendering")
            return False

        # Infer grid structure from cell_boxes
        inferrer = CellBoxGridInferrer(
            row_threshold=row_threshold,
            col_threshold=col_threshold
        )
        grid_info = inferrer.infer_grid(cell_boxes)
        if not grid_info:
            logger.debug("Failed to infer grid from cell_boxes")
            return False

        grid = grid_info['grid']
        num_rows = grid_info['num_rows']
        num_cols = grid_info['num_cols']
        row_boundaries = grid_info['row_boundaries']
        col_boundaries = grid_info['col_boundaries']

        logger.info(
            f"[TABLE] CellBoxes grid inferred: {num_rows} rows x {num_cols} cols "
            f"from {len(cell_boxes)} cell_boxes"
        )

        # Extract content from HTML
        if html_content:
            contents = extract_cell_contents_from_html(html_content)
            grid = map_content_to_grid(grid, contents, num_rows, num_cols)
            logger.debug(f"[TABLE] Mapped {len(contents)} HTML cells to grid")

        # Apply scale factors to boundaries
        scaled_row_boundaries = [y * scale_h for y in row_boundaries]
        scaled_col_boundaries = [x * scale_w for x in col_boundaries]

        font_name = self.config.font_name

        # Draw cell borders and content. The state is restored in `finally`
        # so a failure while drawing cannot leave the canvas with an
        # unbalanced graphics-state stack for whatever draws next.
        pdf_canvas.saveState()
        try:
            pdf_canvas.setStrokeColor(self.config.border_color)
            pdf_canvas.setLineWidth(self.config.border_width)

            for row in range(num_rows):
                # The grid has num_rows + 1 row boundaries, so row + 1 is valid
                y0 = scaled_row_boundaries[row]
                y1 = scaled_row_boundaries[row + 1]

                for col in range(num_cols):
                    x0 = scaled_col_boundaries[col]
                    x1 = scaled_col_boundaries[col + 1]

                    # Convert to PDF coordinates (flip Y)
                    pdf_x0 = x0
                    pdf_y0 = page_height - y1
                    pdf_x1 = x1
                    pdf_y1 = page_height - y0

                    cell_width = pdf_x1 - pdf_x0
                    cell_height = pdf_y1 - pdf_y0

                    # Draw cell border
                    pdf_canvas.rect(pdf_x0, pdf_y0, cell_width, cell_height)

                    # Draw text if cell exists in grid and has content
                    if (row, col) not in grid:
                        continue
                    cell_content = grid[(row, col)].get('content', '')
                    if not cell_content:
                        continue

                    # Fit text to the cell width left after horizontal padding
                    available_width = (
                        cell_width
                        - self.config.left_padding
                        - self.config.right_padding
                    )
                    font_size = self._fit_text_to_cell(
                        pdf_canvas, cell_content, available_width, cell_height
                    )

                    pdf_canvas.setFont(font_name, font_size)
                    text_width = pdf_canvas.stringWidth(
                        cell_content, font_name, font_size
                    )

                    # Center horizontally and vertically within the cell
                    text_x = pdf_x0 + (cell_width - text_width) / 2
                    text_y = pdf_y0 + (cell_height - font_size) / 2

                    pdf_canvas.drawString(text_x, text_y, cell_content)
        finally:
            pdf_canvas.restoreState()

        logger.info(f"[TABLE] Successfully rendered {num_rows}x{num_cols} table from cell_boxes")
        return True

    except Exception as e:
        logger.error(f"CellBoxes grid rendering failed: {e}")
        import traceback
        traceback.print_exc()
        return False
# =========================================================================
# Grid and Cell Box Helpers
# =========================================================================