feat: extract cell_box_list from table_res_list
Based on pp_demo analysis, PPStructureV3 returns a table_res_list whose entries contain a cell_box_list; this data was previously ignored.

This commit:
- Extracts table_res_list from the PPStructureV3 result alongside parsing_res_list
- Adds a table_res_list parameter to _process_parsing_res_list()
- Prioritizes cell_box_list from table_res_list over SLANeXt extraction
- Matches tables by HTML content (comparing the leading characters of pred_html), or falls back to the first available entry that has a cell_box_list

Priority order for cell boxes:
1. table_res_list.cell_box_list (native, already in absolute coordinates)
2. res_data['boxes'] (unlikely in PaddleX 3.x)
3. Direct SLANeXt model call (fallback)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -316,29 +316,35 @@ class PPStructureEnhanced:
|
||||
|
||||
# Process each page result
|
||||
for page_idx, page_result in enumerate(results):
|
||||
# Try to access parsing_res_list (the complete structure)
|
||||
# Try to access parsing_res_list and table_res_list (the complete structure)
|
||||
parsing_res_list = None
|
||||
table_res_list = None
|
||||
result_dict = None
|
||||
|
||||
# Method 1: Direct access to json attribute (check both top-level and res)
|
||||
if hasattr(page_result, 'json'):
|
||||
result_json = page_result.json
|
||||
if isinstance(result_json, dict):
|
||||
result_dict = result_json
|
||||
# Check top-level
|
||||
if 'parsing_res_list' in result_json:
|
||||
parsing_res_list = result_json['parsing_res_list']
|
||||
logger.info(f"Found parsing_res_list at top level with {len(parsing_res_list)} elements")
|
||||
# Check inside 'res' (new structure in paddlex)
|
||||
elif 'res' in result_json and isinstance(result_json['res'], dict):
|
||||
result_dict = result_json['res']
|
||||
if 'parsing_res_list' in result_json['res']:
|
||||
parsing_res_list = result_json['res']['parsing_res_list']
|
||||
logger.info(f"Found parsing_res_list inside 'res' with {len(parsing_res_list)} elements")
|
||||
|
||||
# Method 2: Try direct dict access (LayoutParsingResultV2 inherits from dict)
|
||||
elif isinstance(page_result, dict):
|
||||
result_dict = page_result
|
||||
if 'parsing_res_list' in page_result:
|
||||
parsing_res_list = page_result['parsing_res_list']
|
||||
logger.info(f"Found parsing_res_list via dict access with {len(parsing_res_list)} elements")
|
||||
elif 'res' in page_result and isinstance(page_result['res'], dict):
|
||||
result_dict = page_result['res']
|
||||
if 'parsing_res_list' in page_result['res']:
|
||||
parsing_res_list = page_result['res']['parsing_res_list']
|
||||
logger.info(f"Found parsing_res_list inside page_result['res'] with {len(parsing_res_list)} elements")
|
||||
@@ -347,6 +353,8 @@ class PPStructureEnhanced:
|
||||
elif hasattr(page_result, 'parsing_res_list'):
|
||||
parsing_res_list = page_result.parsing_res_list
|
||||
logger.info(f"Found parsing_res_list attribute with {len(parsing_res_list)} elements")
|
||||
if hasattr(page_result, '__dict__'):
|
||||
result_dict = page_result.__dict__
|
||||
|
||||
# Method 4: Check if result has to_dict method
|
||||
elif hasattr(page_result, 'to_dict'):
|
||||
@@ -355,14 +363,25 @@ class PPStructureEnhanced:
|
||||
parsing_res_list = result_dict['parsing_res_list']
|
||||
logger.info(f"Found parsing_res_list in to_dict with {len(parsing_res_list)} elements")
|
||||
elif 'res' in result_dict and isinstance(result_dict['res'], dict):
|
||||
if 'parsing_res_list' in result_dict['res']:
|
||||
parsing_res_list = result_dict['res']['parsing_res_list']
|
||||
result_dict = result_dict['res']
|
||||
if 'parsing_res_list' in result_dict:
|
||||
parsing_res_list = result_dict['parsing_res_list']
|
||||
logger.info(f"Found parsing_res_list in to_dict['res'] with {len(parsing_res_list)} elements")
|
||||
|
||||
# Extract table_res_list which contains cell_box_list
|
||||
if result_dict:
|
||||
if 'table_res_list' in result_dict:
|
||||
table_res_list = result_dict['table_res_list']
|
||||
logger.info(f"Found table_res_list with {len(table_res_list)} tables")
|
||||
for i, tbl in enumerate(table_res_list):
|
||||
if 'cell_box_list' in tbl:
|
||||
logger.info(f" Table {i}: {len(tbl['cell_box_list'])} cell boxes")
|
||||
|
||||
# Process parsing_res_list if found
|
||||
if parsing_res_list:
|
||||
elements = self._process_parsing_res_list(
|
||||
parsing_res_list, current_page, output_dir, image_path, scaling_info
|
||||
parsing_res_list, current_page, output_dir, image_path, scaling_info,
|
||||
table_res_list=table_res_list # Pass table_res_list for cell_box_list
|
||||
)
|
||||
all_elements.extend(elements)
|
||||
|
||||
@@ -426,7 +445,8 @@ class PPStructureEnhanced:
|
||||
current_page: int,
|
||||
output_dir: Optional[Path],
|
||||
source_image_path: Optional[Path] = None,
|
||||
scaling_info: Optional['ScalingInfo'] = None
|
||||
scaling_info: Optional['ScalingInfo'] = None,
|
||||
table_res_list: Optional[List[Dict]] = None
|
||||
) -> List[Dict[str, Any]]:
|
||||
"""
|
||||
Process parsing_res_list to extract all elements.
|
||||
@@ -437,6 +457,7 @@ class PPStructureEnhanced:
|
||||
current_page: Current page number
|
||||
output_dir: Optional output directory
|
||||
source_image_path: Path to source image for cropping image regions
|
||||
table_res_list: Optional list of table results containing cell_box_list
|
||||
|
||||
Returns:
|
||||
List of processed elements with normalized structure
|
||||
@@ -543,11 +564,42 @@ class PPStructureEnhanced:
|
||||
element['extracted_text'] = self._extract_text_from_html(html_content)
|
||||
|
||||
# 2. 提取 Cell 座標 (boxes)
|
||||
# 優先使用 PPStructureV3 返回的 boxes,若無則調用 SLANeXt 補充
|
||||
# 優先順序: table_res_list > res_data['boxes'] > SLANeXt 補充
|
||||
cell_boxes_extracted = False
|
||||
|
||||
if 'boxes' in res_data:
|
||||
# PPStructureV3 returned cell boxes (unlikely in PaddleX 3.x)
|
||||
# First, try to get cell_box_list from table_res_list (pp_demo style)
|
||||
if table_res_list and not cell_boxes_extracted:
|
||||
# Match table by HTML content or find closest bbox
|
||||
for tbl_res in table_res_list:
|
||||
if 'cell_box_list' in tbl_res and tbl_res['cell_box_list']:
|
||||
# Check if HTML matches
|
||||
tbl_html = tbl_res.get('pred_html', '')
|
||||
if html_content and tbl_html:
|
||||
# Simple check: if both have same structure
|
||||
if tbl_html[:100] == html_content[:100]:
|
||||
cell_boxes = tbl_res['cell_box_list']
|
||||
# cell_box_list is already in absolute coordinates
|
||||
element['cell_boxes'] = [[float(c) for c in box] for box in cell_boxes]
|
||||
element['cell_boxes_source'] = 'table_res_list'
|
||||
cell_boxes_extracted = True
|
||||
logger.info(f"[TABLE] Found {len(cell_boxes)} cell boxes from table_res_list (HTML match)")
|
||||
break
|
||||
|
||||
# If no HTML match, use first available table_res with cell_box_list
|
||||
if not cell_boxes_extracted:
|
||||
for tbl_res in table_res_list:
|
||||
if 'cell_box_list' in tbl_res and tbl_res['cell_box_list']:
|
||||
cell_boxes = tbl_res['cell_box_list']
|
||||
element['cell_boxes'] = [[float(c) for c in box] for box in cell_boxes]
|
||||
element['cell_boxes_source'] = 'table_res_list'
|
||||
cell_boxes_extracted = True
|
||||
logger.info(f"[TABLE] Found {len(cell_boxes)} cell boxes from table_res_list (first available)")
|
||||
# Remove used table_res to avoid reuse
|
||||
table_res_list.remove(tbl_res)
|
||||
break
|
||||
|
||||
if not cell_boxes_extracted and 'boxes' in res_data:
|
||||
# PPStructureV3 returned cell boxes in res (unlikely in PaddleX 3.x)
|
||||
cell_boxes = res_data['boxes']
|
||||
logger.info(f"[TABLE] Found {len(cell_boxes)} cell boxes in res_data")
|
||||
|
||||
|
||||
Reference in New Issue
Block a user