feat: extract cell_box_list from table_res_list

Analysis of pp_demo showed that PPStructureV3 returns a table_res_list whose
entries contain a cell_box_list, which was previously ignored. Changes in this commit:

- Extract table_res_list from PPStructureV3 result alongside parsing_res_list
- Add table_res_list parameter to _process_parsing_res_list()
- Prioritize cell_box_list from table_res_list over SLANeXt extraction
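- Match tables by comparing the first 100 characters of their HTML; if no entry matches, use the first table_res_list entry with a non-empty cell_box_list and remove it from the list to avoid reuse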

Priority order for cell boxes:
1. table_res_list.cell_box_list (native, already in absolute coordinates)
2. res_data['boxes'] (unlikely to be present in PaddleX 3.x)
3. Direct SLANeXt model call (fallback)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
egg
2025-11-28 12:41:18 +08:00
parent 5ddccbf5a2
commit f5a2c8a750

View File

@@ -316,29 +316,35 @@ class PPStructureEnhanced:
# Process each page result
for page_idx, page_result in enumerate(results):
# Try to access parsing_res_list (the complete structure)
# Try to access parsing_res_list and table_res_list (the complete structure)
parsing_res_list = None
table_res_list = None
result_dict = None
# Method 1: Direct access to json attribute (check both top-level and res)
if hasattr(page_result, 'json'):
result_json = page_result.json
if isinstance(result_json, dict):
result_dict = result_json
# Check top-level
if 'parsing_res_list' in result_json:
parsing_res_list = result_json['parsing_res_list']
logger.info(f"Found parsing_res_list at top level with {len(parsing_res_list)} elements")
# Check inside 'res' (new structure in paddlex)
elif 'res' in result_json and isinstance(result_json['res'], dict):
result_dict = result_json['res']
if 'parsing_res_list' in result_json['res']:
parsing_res_list = result_json['res']['parsing_res_list']
logger.info(f"Found parsing_res_list inside 'res' with {len(parsing_res_list)} elements")
# Method 2: Try direct dict access (LayoutParsingResultV2 inherits from dict)
elif isinstance(page_result, dict):
result_dict = page_result
if 'parsing_res_list' in page_result:
parsing_res_list = page_result['parsing_res_list']
logger.info(f"Found parsing_res_list via dict access with {len(parsing_res_list)} elements")
elif 'res' in page_result and isinstance(page_result['res'], dict):
result_dict = page_result['res']
if 'parsing_res_list' in page_result['res']:
parsing_res_list = page_result['res']['parsing_res_list']
logger.info(f"Found parsing_res_list inside page_result['res'] with {len(parsing_res_list)} elements")
@@ -347,6 +353,8 @@ class PPStructureEnhanced:
elif hasattr(page_result, 'parsing_res_list'):
parsing_res_list = page_result.parsing_res_list
logger.info(f"Found parsing_res_list attribute with {len(parsing_res_list)} elements")
if hasattr(page_result, '__dict__'):
result_dict = page_result.__dict__
# Method 4: Check if result has to_dict method
elif hasattr(page_result, 'to_dict'):
@@ -355,14 +363,25 @@ class PPStructureEnhanced:
parsing_res_list = result_dict['parsing_res_list']
logger.info(f"Found parsing_res_list in to_dict with {len(parsing_res_list)} elements")
elif 'res' in result_dict and isinstance(result_dict['res'], dict):
if 'parsing_res_list' in result_dict['res']:
parsing_res_list = result_dict['res']['parsing_res_list']
result_dict = result_dict['res']
if 'parsing_res_list' in result_dict:
parsing_res_list = result_dict['parsing_res_list']
logger.info(f"Found parsing_res_list in to_dict['res'] with {len(parsing_res_list)} elements")
# Extract table_res_list which contains cell_box_list
if result_dict:
if 'table_res_list' in result_dict:
table_res_list = result_dict['table_res_list']
logger.info(f"Found table_res_list with {len(table_res_list)} tables")
for i, tbl in enumerate(table_res_list):
if 'cell_box_list' in tbl:
logger.info(f" Table {i}: {len(tbl['cell_box_list'])} cell boxes")
# Process parsing_res_list if found
if parsing_res_list:
elements = self._process_parsing_res_list(
parsing_res_list, current_page, output_dir, image_path, scaling_info
parsing_res_list, current_page, output_dir, image_path, scaling_info,
table_res_list=table_res_list # Pass table_res_list for cell_box_list
)
all_elements.extend(elements)
@@ -426,7 +445,8 @@ class PPStructureEnhanced:
current_page: int,
output_dir: Optional[Path],
source_image_path: Optional[Path] = None,
scaling_info: Optional['ScalingInfo'] = None
scaling_info: Optional['ScalingInfo'] = None,
table_res_list: Optional[List[Dict]] = None
) -> List[Dict[str, Any]]:
"""
Process parsing_res_list to extract all elements.
@@ -437,6 +457,7 @@ class PPStructureEnhanced:
current_page: Current page number
output_dir: Optional output directory
source_image_path: Path to source image for cropping image regions
table_res_list: Optional list of table results containing cell_box_list
Returns:
List of processed elements with normalized structure
@@ -543,11 +564,42 @@ class PPStructureEnhanced:
element['extracted_text'] = self._extract_text_from_html(html_content)
# 2. 提取 Cell 座標 (boxes)
# 優先使用 PPStructureV3 返回的 boxes若無則調用 SLANeXt 補充
# 優先順序: table_res_list > res_data['boxes'] > SLANeXt 補充
cell_boxes_extracted = False
if 'boxes' in res_data:
# PPStructureV3 returned cell boxes (unlikely in PaddleX 3.x)
# First, try to get cell_box_list from table_res_list (pp_demo style)
if table_res_list and not cell_boxes_extracted:
# Match table by HTML content or find closest bbox
for tbl_res in table_res_list:
if 'cell_box_list' in tbl_res and tbl_res['cell_box_list']:
# Check if HTML matches
tbl_html = tbl_res.get('pred_html', '')
if html_content and tbl_html:
# Simple check: if both have same structure
if tbl_html[:100] == html_content[:100]:
cell_boxes = tbl_res['cell_box_list']
# cell_box_list is already in absolute coordinates
element['cell_boxes'] = [[float(c) for c in box] for box in cell_boxes]
element['cell_boxes_source'] = 'table_res_list'
cell_boxes_extracted = True
logger.info(f"[TABLE] Found {len(cell_boxes)} cell boxes from table_res_list (HTML match)")
break
# If no HTML match, use first available table_res with cell_box_list
if not cell_boxes_extracted:
for tbl_res in table_res_list:
if 'cell_box_list' in tbl_res and tbl_res['cell_box_list']:
cell_boxes = tbl_res['cell_box_list']
element['cell_boxes'] = [[float(c) for c in box] for box in cell_boxes]
element['cell_boxes_source'] = 'table_res_list'
cell_boxes_extracted = True
logger.info(f"[TABLE] Found {len(cell_boxes)} cell boxes from table_res_list (first available)")
# Remove used table_res to avoid reuse
table_res_list.remove(tbl_res)
break
if not cell_boxes_extracted and 'boxes' in res_data:
# PPStructureV3 returned cell boxes in res (unlikely in PaddleX 3.x)
cell_boxes = res_data['boxes']
logger.info(f"[TABLE] Found {len(cell_boxes)} cell boxes in res_data")