feat: implement table cell boxes extraction with SLANeXt

Implements Phases 1-3 of the extract-table-cell-boxes proposal:

- Add enable_table_cell_boxes_extraction config option
- Implement lazy-loaded SLANeXt model caching in PPStructureEnhanced
- Add _extract_cell_boxes_with_slanet() method for direct model invocation
- Supplement PPStructureV3 table processing with SLANeXt cell boxes
- Add _compute_table_grid_from_cell_boxes() for column width calculation
- Modify draw_table_region() to use cell_boxes for accurate layout

Key features:
- Auto-detect table type (wired/wireless) using PP-LCNet classifier
- Convert 8-point polygon bbox to 4-point rectangle
- Graceful fallback to equal distribution when cell_boxes unavailable
- Proper coordinate transformation with scaling support

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
egg
2025-11-28 12:20:32 +08:00
parent 801ee9c4b6
commit 715805b3b8
3 changed files with 379 additions and 7 deletions

View File

@@ -1605,6 +1605,120 @@ class PDFGeneratorService:
except Exception as e:
logger.warning(f"Failed to draw text region '{text[:20]}...': {e}")
def _compute_table_grid_from_cell_boxes(
    self,
    cell_boxes: List[List[float]],
    table_bbox: List[float],
    num_rows: int,
    num_cols: int,
    merge_tolerance: float = 5.0,
) -> Tuple[Optional[List[float]], Optional[List[float]]]:
    """
    Compute column widths and row heights from cell bounding boxes.

    The left/right and top/bottom edges of every cell box are projected onto
    the X and Y axes (relative to the table origin), near-duplicate edges are
    merged, and the gaps between consecutive edges become the column widths
    and row heights. This gives a more accurate layout than distributing the
    table extent uniformly.

    Args:
        cell_boxes: List of cell bboxes [[x1,y1,x2,y2], ...]. Boxes with
            fewer than four values are ignored; extra values are dropped.
        table_bbox: Table bounding box [x1,y1,x2,y2] in the same coordinate
            space as the cell boxes.
        num_rows: Number of rows in the table
        num_cols: Number of columns in the table
        merge_tolerance: Edges closer together than this distance (same
            units as the boxes) are treated as one grid line. The same
            distance decides whether the outermost edge is close enough to
            the table border to stand in for it.

    Returns:
        Tuple of (col_widths, row_heights) in the units of the input boxes,
        or (None, None) if fewer than two boxes are given, the detected grid
        differs from the expected one by more than one row/column, or the
        calculation fails.
    """
    if not cell_boxes or len(cell_boxes) < 2:
        return None, None

    def _track_sizes(edges, extent):
        """Turn the raw cell edges of one axis into consecutive track sizes."""
        ordered = sorted(edges)
        if not ordered:
            return []
        # Make sure the grid reaches the table borders when no cell edge
        # lies close enough to them.
        if ordered[0] > merge_tolerance:
            ordered.insert(0, 0)
        if ordered[-1] < extent - merge_tolerance:
            ordered.append(extent)
        # Collapse edges that are nearly coincident; the first edge of each
        # cluster is kept, so clusters cannot drift.
        merged = [ordered[0]]
        for edge in ordered[1:]:
            if edge - merged[-1] > merge_tolerance:
                merged.append(edge)
        return [end - start for start, end in zip(merged, merged[1:])]

    def _fit_to_count(sizes, expected, extent):
        """Pad or trim sizes to `expected` entries while keeping their span."""
        if len(sizes) == expected:
            return list(sizes)
        span = sum(sizes)
        fitted = list(sizes[:max(expected, 0)])
        while len(fitted) < expected:
            fitted.append(fitted[-1] if fitted else extent / expected)
        # Duplicating or dropping a track changes the total size, which
        # would make the table overflow or shrink. Rescale so the tracks
        # still cover the span that was actually measured.
        total = sum(fitted)
        if span > 0 and total > 0:
            fitted = [size * span / total for size in fitted]
        return fitted

    try:
        table_x1, table_y1, table_x2, table_y2 = table_bbox
        table_width = table_x2 - table_x1
        table_height = table_y2 - table_y1

        # Collect all unique X and Y edges, relative to the table origin
        x_edges = set()
        y_edges = set()
        for box in cell_boxes:
            if len(box) >= 4:
                x1, y1, x2, y2 = box[:4]
                x_edges.update((x1 - table_x1, x2 - table_x1))
                y_edges.update((y1 - table_y1, y2 - table_y1))

        col_widths = _track_sizes(x_edges, table_width)
        row_heights = _track_sizes(y_edges, table_height)

        # Validate: number of columns/rows should match expected
        if len(col_widths) == num_cols and len(row_heights) == num_rows:
            logger.info(f"[TABLE] Cell boxes grid: {num_cols} cols, {num_rows} rows")
            logger.debug(f"[TABLE] Col widths from cell_boxes: {[f'{w:.1f}' for w in col_widths]}")
            logger.debug(f"[TABLE] Row heights from cell_boxes: {[f'{h:.1f}' for h in row_heights]}")
            return col_widths, row_heights

        # Grid doesn't match, might be due to merged cells
        logger.debug(
            f"[TABLE] Cell boxes grid mismatch: "
            f"got {len(col_widths)}x{len(row_heights)}, expected {num_cols}x{num_rows}"
        )
        # Still usable when each axis is off by at most one track
        if abs(len(col_widths) - num_cols) <= 1 and abs(len(row_heights) - num_rows) <= 1:
            return (
                _fit_to_count(col_widths, num_cols, table_width),
                _fit_to_count(row_heights, num_rows, table_height),
            )
        return None, None
    except Exception as e:
        # Deliberate best-effort: any malformed input (wrong bbox length,
        # non-numeric values, zero expected count) signals "no grid".
        logger.warning(f"[TABLE] Failed to compute grid from cell boxes: {e}")
        return None, None
def draw_table_region(
self,
pdf_canvas: canvas.Canvas,
@@ -1765,8 +1879,36 @@ class PDFGeneratorService:
reportlab_data.append(row_data)
# Calculate column widths (equal distribution)
col_widths = [table_width / max_cols] * max_cols
# Calculate column widths and row heights
# First, try to use cell_boxes if available for more accurate layout
cell_boxes = table_element.get('cell_boxes')
raw_table_bbox = [ocr_x_left_raw, ocr_y_top_raw, ocr_x_right_raw, ocr_y_bottom_raw]
computed_col_widths = None
computed_row_heights = None
if cell_boxes:
cell_boxes_source = table_element.get('cell_boxes_source', 'unknown')
logger.info(f"[TABLE] Using {len(cell_boxes)} cell boxes from {cell_boxes_source}")
computed_col_widths, computed_row_heights = self._compute_table_grid_from_cell_boxes(
cell_boxes, raw_table_bbox, num_rows, max_cols
)
# Use computed widths if available, otherwise fall back to equal distribution
if computed_col_widths:
# Scale col_widths to PDF coordinates
col_widths = [w * scale_w for w in computed_col_widths]
logger.info(f"[TABLE] Using cell_boxes col widths (scaled)")
else:
col_widths = [table_width / max_cols] * max_cols
logger.info(f"[TABLE] Using equal distribution col widths")
# Row heights are used optionally (ReportLab can auto-size)
row_heights = None
if computed_row_heights:
# Scale row_heights to PDF coordinates
row_heights = [h * scale_h for h in computed_row_heights]
logger.debug(f"[TABLE] Cell_boxes row heights available (scaled)")
# Create ReportLab Table
# Use smaller font to fit content with auto-wrap
@@ -1790,7 +1932,11 @@ class PDFGeneratorService:
escaped_text = cell_text.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
reportlab_data[row_idx][col_idx] = Paragraph(escaped_text, cell_style)
# Create table WITHOUT fixed row heights - let it auto-size based on content
# Create table with computed col widths
# Note: We don't use row_heights even when available from cell_boxes because:
# 1. ReportLab's auto-sizing handles content overflow better
# 2. Fixed heights can cause text clipping when content exceeds cell size
# 3. The col_widths from cell_boxes provide the main layout benefit
table = Table(reportlab_data, colWidths=col_widths)
# Apply table style