feat: implement table cell boxes extraction with SLANeXt

Phase 1-3 implementation of extract-table-cell-boxes proposal:

- Add enable_table_cell_boxes_extraction config option
- Implement lazy-loaded SLANeXt model caching in PPStructureEnhanced
- Add _extract_cell_boxes_with_slanet() method for direct model invocation
- Supplement PPStructureV3 table processing with SLANeXt cell boxes
- Add _compute_table_grid_from_cell_boxes() for column width calculation
- Modify draw_table_region() to use cell_boxes for accurate layout

Key features:
- Auto-detect table type (wired/wireless) using PP-LCNet classifier
- Convert 8-value polygon bbox (four x,y corners) to a 4-value axis-aligned rectangle [x1, y1, x2, y2]
- Graceful fallback to equal distribution when cell_boxes unavailable
- Proper coordinate transformation with scaling support

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
egg
2025-11-28 12:20:32 +08:00
parent 801ee9c4b6
commit 715805b3b8
3 changed files with 379 additions and 7 deletions

View File

@@ -80,6 +80,176 @@ class PPStructureEnhanced:
"""
self.structure_engine = structure_engine
# Lazy-loaded SLANeXt models for cell boxes extraction
# These are loaded on-demand when enable_table_cell_boxes_extraction is True
self._slanet_wired_model = None
self._slanet_wireless_model = None
self._table_cls_model = None
def _get_slanet_model(self, is_wired: bool = True):
    """
    Return the cached SLANeXt model for the requested table type, creating it on first use.

    The wired and wireless models are cached separately on the instance, so
    each is created at most once per instance (until the cache is cleared).

    Args:
        is_wired: Truthy selects the wired (bordered) model; falsy selects
            the wireless model.

    Returns:
        The model instance, or None when
        ``settings.enable_table_cell_boxes_extraction`` is falsy or when
        importing/creating the model raises.
    """
    if not settings.enable_table_cell_boxes_extraction:
        return None

    try:
        # Imported lazily so paddlex is only required when the feature is on.
        from paddlex import create_model

        if is_wired:
            if self._slanet_wired_model is not None:
                return self._slanet_wired_model
            # Fall back to the stock model name when none is configured.
            model_name = settings.wired_table_model_name or "SLANeXt_wired"
            logger.info(f"Loading SLANeXt wired model: {model_name}")
            self._slanet_wired_model = create_model(model_name)
            return self._slanet_wired_model

        if self._slanet_wireless_model is not None:
            return self._slanet_wireless_model
        model_name = settings.wireless_table_model_name or "SLANeXt_wireless"
        logger.info(f"Loading SLANeXt wireless model: {model_name}")
        self._slanet_wireless_model = create_model(model_name)
        return self._slanet_wireless_model
    except Exception as e:
        # Best-effort: callers treat None as "cell boxes unavailable".
        logger.error(f"Failed to load SLANeXt model: {e}")
        return None
def _get_table_classifier(self):
    """
    Return the cached table classification model, creating it on first use.

    Returns:
        The classifier instance, or None when
        ``settings.enable_table_cell_boxes_extraction`` is falsy or when
        importing/creating the model raises.
    """
    if not settings.enable_table_cell_boxes_extraction:
        return None

    try:
        # Imported lazily so paddlex is only required when the feature is on.
        from paddlex import create_model

        if self._table_cls_model is not None:
            return self._table_cls_model

        # Fall back to the stock classifier name when none is configured.
        model_name = settings.table_classification_model_name or "PP-LCNet_x1_0_table_cls"
        logger.info(f"Loading table classification model: {model_name}")
        self._table_cls_model = create_model(model_name)
        return self._table_cls_model
    except Exception as e:
        # Best-effort: a missing classifier is handled by the caller.
        logger.error(f"Failed to load table classifier: {e}")
        return None
def _extract_cell_boxes_with_slanet(
    self,
    table_image: np.ndarray,
    table_bbox: List[float],
    is_wired: Optional[bool] = None
) -> Optional[List[List[float]]]:
    """
    Extract cell bounding boxes by calling a SLANeXt model directly.

    Each box read from the model results' 'bbox' entry is reduced to an
    axis-aligned rectangle and shifted by the table's top-left corner
    (``table_bbox[0]``, ``table_bbox[1]``).

    Args:
        table_image: Cropped table image as numpy array (BGR format).
        table_bbox: Table bounding box in page coordinates [x1, y1, x2, y2];
            only the first two values are used, as the offset.
        is_wired: True for bordered tables, False for borderless ones.
            If None, the table classifier decides; when the classifier is
            unavailable, fails, or returns no label, wired is assumed.

    Returns:
        List of cell boxes [[x1, y1, x2, y2], ...] offset by the table
        origin, or None when the feature is disabled, no model is
        available, no boxes were produced, or any step raises.
    """
    if not settings.enable_table_cell_boxes_extraction:
        return None

    try:
        # Auto-detect table type if not specified
        if is_wired is None:
            classifier = self._get_table_classifier()
            if classifier:
                try:
                    cls_result = classifier.predict(table_image)
                    # Use the first result that carries a label.
                    for res in cls_result:
                        label_names = res.get('label_names', [])
                        if label_names:
                            is_wired = 'wired' in str(label_names[0]).lower()
                            logger.debug(f"Table classified as: {'wired' if is_wired else 'wireless'}")
                            break
                except Exception as e:
                    logger.warning(f"Table classification failed, defaulting to wired: {e}")
                    is_wired = True

            # Classifier unavailable, or it produced no label: default to
            # wired, matching the failure path above. Without this, None
            # would fall through and silently select the wireless model.
            if is_wired is None:
                is_wired = True

        # Get appropriate SLANeXt model
        model = self._get_slanet_model(is_wired=is_wired)
        if model is None:
            return None

        # Run SLANeXt prediction
        results = model.predict(table_image)

        # Extract cell boxes from result
        cell_boxes = []
        table_x, table_y = table_bbox[0], table_bbox[1]

        for result in results:
            # 'bbox' is expected to hold 8-value polygons
            # [[x1,y1,x2,y2,x3,y3,x4,y4], ...]; 4-value rectangles are
            # accepted as well.
            boxes = result.get('bbox', [])
            for box in boxes:
                # Boxes may arrive as numpy arrays rather than lists;
                # flatten them so they are not skipped by the type check.
                if isinstance(box, np.ndarray):
                    box = box.ravel().tolist()
                if not isinstance(box, (list, tuple)):
                    continue

                if len(box) >= 8:
                    # Polygon: take the enclosing axis-aligned rectangle.
                    xs = box[0:8:2]
                    ys = box[1:8:2]
                    x1, y1 = min(xs), min(ys)
                    x2, y2 = max(xs), max(ys)
                elif len(box) >= 4:
                    # Already a rectangle [x1, y1, x2, y2]
                    x1, y1, x2, y2 = box[:4]
                else:
                    continue

                # Shift from crop-relative to page coordinates.
                cell_boxes.append([
                    float(x1 + table_x),
                    float(y1 + table_y),
                    float(x2 + table_x),
                    float(y2 + table_y),
                ])

        logger.info(f"SLANeXt extracted {len(cell_boxes)} cell boxes (is_wired={is_wired})")
        return cell_boxes if cell_boxes else None

    except Exception as e:
        # Best-effort: callers fall back to layout without cell boxes.
        logger.error(f"Cell boxes extraction with SLANeXt failed: {e}")
        return None
def release_slanet_models(self):
    """
    Drop the cached SLANeXt and table-classifier models.

    Each loaded model reference is reset to None (so it is lazily
    re-created on next use), then a garbage collection pass is run and,
    when TORCH_AVAILABLE is set, the torch CUDA cache is emptied.
    """
    cached_models = (
        ("_slanet_wired_model", "Released SLANeXt wired model"),
        ("_slanet_wireless_model", "Released SLANeXt wireless model"),
        ("_table_cls_model", "Released table classifier model"),
    )
    for attr_name, released_msg in cached_models:
        if getattr(self, attr_name) is None:
            continue
        # Clearing the reference is what lets the model be collected.
        setattr(self, attr_name, None)
        logger.info(released_msg)

    gc.collect()
    if TORCH_AVAILABLE:
        torch.cuda.empty_cache()
def analyze_with_full_structure(
self,
image_path: Path,
@@ -372,9 +542,12 @@ class PPStructureEnhanced:
element['html'] = html_content
element['extracted_text'] = self._extract_text_from_html(html_content)
# 2. 【新增】提取 Cell 座標 (boxes)
# SLANet 回傳的格式通常是 [[x1, y1, x2, y2], ...]
# 2. 提取 Cell 座標 (boxes)
# 優先使用 PPStructureV3 返回的 boxes若無則調用 SLANeXt 補充
cell_boxes_extracted = False
if 'boxes' in res_data:
# PPStructureV3 returned cell boxes (unlikely in PaddleX 3.x)
cell_boxes = res_data['boxes']
logger.info(f"[TABLE] Found {len(cell_boxes)} cell boxes in res_data")
@@ -399,9 +572,54 @@ class PPStructureEnhanced:
# 將處理後的 Cell 座標存入 element
element['cell_boxes'] = processed_cells
element['raw_cell_boxes'] = cell_boxes
element['cell_boxes_source'] = 'ppstructure'
logger.info(f"[TABLE] Processed {len(processed_cells)} cell boxes with table offset ({table_x}, {table_y})")
else:
logger.info(f"[TABLE] No 'boxes' key in res_data. Available: {list(res_data.keys()) if res_data else 'empty'}")
cell_boxes_extracted = True
# Supplement with direct SLANeXt call if PPStructureV3 didn't provide boxes
if not cell_boxes_extracted and source_image_path and bbox != [0, 0, 0, 0]:
logger.info(f"[TABLE] No boxes from PPStructureV3, attempting SLANeXt extraction...")
try:
# Load source image and crop table region
source_img = Image.open(source_image_path)
source_array = np.array(source_img)
# Crop table region (bbox is in original image coordinates)
x1, y1, x2, y2 = [int(round(c)) for c in bbox]
# Ensure coordinates are within image bounds
h, w = source_array.shape[:2]
x1, y1 = max(0, x1), max(0, y1)
x2, y2 = min(w, x2), min(h, y2)
if x2 > x1 and y2 > y1:
table_crop = source_array[y1:y2, x1:x2]
# Convert RGB to BGR for SLANeXt
if len(table_crop.shape) == 3 and table_crop.shape[2] == 3:
table_crop_bgr = table_crop[:, :, ::-1]
else:
table_crop_bgr = table_crop
# Extract cell boxes using SLANeXt
slanet_boxes = self._extract_cell_boxes_with_slanet(
table_crop_bgr,
bbox, # Pass original bbox for coordinate offset
is_wired=None # Auto-detect
)
if slanet_boxes:
element['cell_boxes'] = slanet_boxes
element['cell_boxes_source'] = 'slanet'
cell_boxes_extracted = True
logger.info(f"[TABLE] SLANeXt extracted {len(slanet_boxes)} cell boxes")
else:
logger.warning(f"[TABLE] Invalid crop region: ({x1},{y1})-({x2},{y2})")
except Exception as e:
logger.error(f"[TABLE] SLANeXt extraction failed: {e}")
if not cell_boxes_extracted:
logger.info(f"[TABLE] No cell boxes available. PPStructureV3 keys: {list(res_data.keys()) if res_data else 'empty'}")
# Special handling for images/figures
elif mapped_type in [ElementType.IMAGE, ElementType.FIGURE]: