feat: extract cell_box_list from table_res_list
Based on pp_demo analysis, PPStructureV3 returns table_res_list containing cell_box_list which was previously ignored.

This commit:
- Extract table_res_list from PPStructureV3 result alongside parsing_res_list
- Add table_res_list parameter to _process_parsing_res_list()
- Prioritize cell_box_list from table_res_list over SLANeXt extraction
- Match tables by HTML content or use first available

Priority order for cell boxes:
1. table_res_list.cell_box_list (native, already absolute coords)
2. res_data['boxes'] (unlikely in PaddleX 3.x)
3. Direct SLANeXt model call (fallback)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -316,29 +316,35 @@ class PPStructureEnhanced:
|
|||||||
|
|
||||||
# Process each page result
|
# Process each page result
|
||||||
for page_idx, page_result in enumerate(results):
|
for page_idx, page_result in enumerate(results):
|
||||||
# Try to access parsing_res_list (the complete structure)
|
# Try to access parsing_res_list and table_res_list (the complete structure)
|
||||||
parsing_res_list = None
|
parsing_res_list = None
|
||||||
|
table_res_list = None
|
||||||
|
result_dict = None
|
||||||
|
|
||||||
# Method 1: Direct access to json attribute (check both top-level and res)
|
# Method 1: Direct access to json attribute (check both top-level and res)
|
||||||
if hasattr(page_result, 'json'):
|
if hasattr(page_result, 'json'):
|
||||||
result_json = page_result.json
|
result_json = page_result.json
|
||||||
if isinstance(result_json, dict):
|
if isinstance(result_json, dict):
|
||||||
|
result_dict = result_json
|
||||||
# Check top-level
|
# Check top-level
|
||||||
if 'parsing_res_list' in result_json:
|
if 'parsing_res_list' in result_json:
|
||||||
parsing_res_list = result_json['parsing_res_list']
|
parsing_res_list = result_json['parsing_res_list']
|
||||||
logger.info(f"Found parsing_res_list at top level with {len(parsing_res_list)} elements")
|
logger.info(f"Found parsing_res_list at top level with {len(parsing_res_list)} elements")
|
||||||
# Check inside 'res' (new structure in paddlex)
|
# Check inside 'res' (new structure in paddlex)
|
||||||
elif 'res' in result_json and isinstance(result_json['res'], dict):
|
elif 'res' in result_json and isinstance(result_json['res'], dict):
|
||||||
|
result_dict = result_json['res']
|
||||||
if 'parsing_res_list' in result_json['res']:
|
if 'parsing_res_list' in result_json['res']:
|
||||||
parsing_res_list = result_json['res']['parsing_res_list']
|
parsing_res_list = result_json['res']['parsing_res_list']
|
||||||
logger.info(f"Found parsing_res_list inside 'res' with {len(parsing_res_list)} elements")
|
logger.info(f"Found parsing_res_list inside 'res' with {len(parsing_res_list)} elements")
|
||||||
|
|
||||||
# Method 2: Try direct dict access (LayoutParsingResultV2 inherits from dict)
|
# Method 2: Try direct dict access (LayoutParsingResultV2 inherits from dict)
|
||||||
elif isinstance(page_result, dict):
|
elif isinstance(page_result, dict):
|
||||||
|
result_dict = page_result
|
||||||
if 'parsing_res_list' in page_result:
|
if 'parsing_res_list' in page_result:
|
||||||
parsing_res_list = page_result['parsing_res_list']
|
parsing_res_list = page_result['parsing_res_list']
|
||||||
logger.info(f"Found parsing_res_list via dict access with {len(parsing_res_list)} elements")
|
logger.info(f"Found parsing_res_list via dict access with {len(parsing_res_list)} elements")
|
||||||
elif 'res' in page_result and isinstance(page_result['res'], dict):
|
elif 'res' in page_result and isinstance(page_result['res'], dict):
|
||||||
|
result_dict = page_result['res']
|
||||||
if 'parsing_res_list' in page_result['res']:
|
if 'parsing_res_list' in page_result['res']:
|
||||||
parsing_res_list = page_result['res']['parsing_res_list']
|
parsing_res_list = page_result['res']['parsing_res_list']
|
||||||
logger.info(f"Found parsing_res_list inside page_result['res'] with {len(parsing_res_list)} elements")
|
logger.info(f"Found parsing_res_list inside page_result['res'] with {len(parsing_res_list)} elements")
|
||||||
@@ -347,6 +353,8 @@ class PPStructureEnhanced:
|
|||||||
elif hasattr(page_result, 'parsing_res_list'):
|
elif hasattr(page_result, 'parsing_res_list'):
|
||||||
parsing_res_list = page_result.parsing_res_list
|
parsing_res_list = page_result.parsing_res_list
|
||||||
logger.info(f"Found parsing_res_list attribute with {len(parsing_res_list)} elements")
|
logger.info(f"Found parsing_res_list attribute with {len(parsing_res_list)} elements")
|
||||||
|
if hasattr(page_result, '__dict__'):
|
||||||
|
result_dict = page_result.__dict__
|
||||||
|
|
||||||
# Method 4: Check if result has to_dict method
|
# Method 4: Check if result has to_dict method
|
||||||
elif hasattr(page_result, 'to_dict'):
|
elif hasattr(page_result, 'to_dict'):
|
||||||
@@ -355,14 +363,25 @@ class PPStructureEnhanced:
|
|||||||
parsing_res_list = result_dict['parsing_res_list']
|
parsing_res_list = result_dict['parsing_res_list']
|
||||||
logger.info(f"Found parsing_res_list in to_dict with {len(parsing_res_list)} elements")
|
logger.info(f"Found parsing_res_list in to_dict with {len(parsing_res_list)} elements")
|
||||||
elif 'res' in result_dict and isinstance(result_dict['res'], dict):
|
elif 'res' in result_dict and isinstance(result_dict['res'], dict):
|
||||||
if 'parsing_res_list' in result_dict['res']:
|
result_dict = result_dict['res']
|
||||||
parsing_res_list = result_dict['res']['parsing_res_list']
|
if 'parsing_res_list' in result_dict:
|
||||||
|
parsing_res_list = result_dict['parsing_res_list']
|
||||||
logger.info(f"Found parsing_res_list in to_dict['res'] with {len(parsing_res_list)} elements")
|
logger.info(f"Found parsing_res_list in to_dict['res'] with {len(parsing_res_list)} elements")
|
||||||
|
|
||||||
|
# Extract table_res_list which contains cell_box_list
|
||||||
|
if result_dict:
|
||||||
|
if 'table_res_list' in result_dict:
|
||||||
|
table_res_list = result_dict['table_res_list']
|
||||||
|
logger.info(f"Found table_res_list with {len(table_res_list)} tables")
|
||||||
|
for i, tbl in enumerate(table_res_list):
|
||||||
|
if 'cell_box_list' in tbl:
|
||||||
|
logger.info(f" Table {i}: {len(tbl['cell_box_list'])} cell boxes")
|
||||||
|
|
||||||
# Process parsing_res_list if found
|
# Process parsing_res_list if found
|
||||||
if parsing_res_list:
|
if parsing_res_list:
|
||||||
elements = self._process_parsing_res_list(
|
elements = self._process_parsing_res_list(
|
||||||
parsing_res_list, current_page, output_dir, image_path, scaling_info
|
parsing_res_list, current_page, output_dir, image_path, scaling_info,
|
||||||
|
table_res_list=table_res_list # Pass table_res_list for cell_box_list
|
||||||
)
|
)
|
||||||
all_elements.extend(elements)
|
all_elements.extend(elements)
|
||||||
|
|
||||||
@@ -426,7 +445,8 @@ class PPStructureEnhanced:
|
|||||||
current_page: int,
|
current_page: int,
|
||||||
output_dir: Optional[Path],
|
output_dir: Optional[Path],
|
||||||
source_image_path: Optional[Path] = None,
|
source_image_path: Optional[Path] = None,
|
||||||
scaling_info: Optional['ScalingInfo'] = None
|
scaling_info: Optional['ScalingInfo'] = None,
|
||||||
|
table_res_list: Optional[List[Dict]] = None
|
||||||
) -> List[Dict[str, Any]]:
|
) -> List[Dict[str, Any]]:
|
||||||
"""
|
"""
|
||||||
Process parsing_res_list to extract all elements.
|
Process parsing_res_list to extract all elements.
|
||||||
@@ -437,6 +457,7 @@ class PPStructureEnhanced:
|
|||||||
current_page: Current page number
|
current_page: Current page number
|
||||||
output_dir: Optional output directory
|
output_dir: Optional output directory
|
||||||
source_image_path: Path to source image for cropping image regions
|
source_image_path: Path to source image for cropping image regions
|
||||||
|
table_res_list: Optional list of table results containing cell_box_list
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
List of processed elements with normalized structure
|
List of processed elements with normalized structure
|
||||||
@@ -543,11 +564,42 @@ class PPStructureEnhanced:
|
|||||||
element['extracted_text'] = self._extract_text_from_html(html_content)
|
element['extracted_text'] = self._extract_text_from_html(html_content)
|
||||||
|
|
||||||
# 2. 提取 Cell 座標 (boxes)
|
# 2. 提取 Cell 座標 (boxes)
|
||||||
# 優先使用 PPStructureV3 返回的 boxes,若無則調用 SLANeXt 補充
|
# 優先順序: table_res_list > res_data['boxes'] > SLANeXt 補充
|
||||||
cell_boxes_extracted = False
|
cell_boxes_extracted = False
|
||||||
|
|
||||||
if 'boxes' in res_data:
|
# First, try to get cell_box_list from table_res_list (pp_demo style)
|
||||||
# PPStructureV3 returned cell boxes (unlikely in PaddleX 3.x)
|
if table_res_list and not cell_boxes_extracted:
|
||||||
|
# Match table by HTML content or find closest bbox
|
||||||
|
for tbl_res in table_res_list:
|
||||||
|
if 'cell_box_list' in tbl_res and tbl_res['cell_box_list']:
|
||||||
|
# Check if HTML matches
|
||||||
|
tbl_html = tbl_res.get('pred_html', '')
|
||||||
|
if html_content and tbl_html:
|
||||||
|
# Simple check: if both have same structure
|
||||||
|
if tbl_html[:100] == html_content[:100]:
|
||||||
|
cell_boxes = tbl_res['cell_box_list']
|
||||||
|
# cell_box_list is already in absolute coordinates
|
||||||
|
element['cell_boxes'] = [[float(c) for c in box] for box in cell_boxes]
|
||||||
|
element['cell_boxes_source'] = 'table_res_list'
|
||||||
|
cell_boxes_extracted = True
|
||||||
|
logger.info(f"[TABLE] Found {len(cell_boxes)} cell boxes from table_res_list (HTML match)")
|
||||||
|
break
|
||||||
|
|
||||||
|
# If no HTML match, use first available table_res with cell_box_list
|
||||||
|
if not cell_boxes_extracted:
|
||||||
|
for tbl_res in table_res_list:
|
||||||
|
if 'cell_box_list' in tbl_res and tbl_res['cell_box_list']:
|
||||||
|
cell_boxes = tbl_res['cell_box_list']
|
||||||
|
element['cell_boxes'] = [[float(c) for c in box] for box in cell_boxes]
|
||||||
|
element['cell_boxes_source'] = 'table_res_list'
|
||||||
|
cell_boxes_extracted = True
|
||||||
|
logger.info(f"[TABLE] Found {len(cell_boxes)} cell boxes from table_res_list (first available)")
|
||||||
|
# Remove used table_res to avoid reuse
|
||||||
|
table_res_list.remove(tbl_res)
|
||||||
|
break
|
||||||
|
|
||||||
|
if not cell_boxes_extracted and 'boxes' in res_data:
|
||||||
|
# PPStructureV3 returned cell boxes in res (unlikely in PaddleX 3.x)
|
||||||
cell_boxes = res_data['boxes']
|
cell_boxes = res_data['boxes']
|
||||||
logger.info(f"[TABLE] Found {len(cell_boxes)} cell boxes in res_data")
|
logger.info(f"[TABLE] Found {len(cell_boxes)} cell boxes in res_data")
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user