From f5a2c8a75090eceefaf24cd2ebd7ae5f44702eee Mon Sep 17 00:00:00 2001 From: egg Date: Fri, 28 Nov 2025 12:41:18 +0800 Subject: [PATCH] feat: extract cell_box_list from table_res_list MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Based on pp_demo analysis, PPStructureV3 returns table_res_list containing cell_box_list which was previously ignored. This commit: - Extract table_res_list from PPStructureV3 result alongside parsing_res_list - Add table_res_list parameter to _process_parsing_res_list() - Prioritize cell_box_list from table_res_list over SLANeXt extraction - Match tables by HTML content or use first available Priority order for cell boxes: 1. table_res_list.cell_box_list (native, already absolute coords) 2. res_data['boxes'] (unlikely in PaddleX 3.x) 3. Direct SLANeXt model call (fallback) 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- backend/app/services/pp_structure_enhanced.py | 68 ++++++++++++++++--- 1 file changed, 60 insertions(+), 8 deletions(-) diff --git a/backend/app/services/pp_structure_enhanced.py b/backend/app/services/pp_structure_enhanced.py index c0cfed3..583faf1 100644 --- a/backend/app/services/pp_structure_enhanced.py +++ b/backend/app/services/pp_structure_enhanced.py @@ -316,29 +316,35 @@ class PPStructureEnhanced: # Process each page result for page_idx, page_result in enumerate(results): - # Try to access parsing_res_list (the complete structure) + # Try to access parsing_res_list and table_res_list (the complete structure) parsing_res_list = None + table_res_list = None + result_dict = None # Method 1: Direct access to json attribute (check both top-level and res) if hasattr(page_result, 'json'): result_json = page_result.json if isinstance(result_json, dict): + result_dict = result_json # Check top-level if 'parsing_res_list' in result_json: parsing_res_list = result_json['parsing_res_list'] logger.info(f"Found parsing_res_list at top level with {len(parsing_res_list)} elements") # Check inside 'res' (new structure in paddlex) elif 'res' in result_json and isinstance(result_json['res'], dict): + result_dict = result_json['res'] if 'parsing_res_list' in result_json['res']: parsing_res_list = result_json['res']['parsing_res_list'] logger.info(f"Found parsing_res_list inside 'res' with {len(parsing_res_list)} elements") # Method 2: Try direct dict access (LayoutParsingResultV2 inherits from dict) elif isinstance(page_result, dict): + result_dict = page_result if 'parsing_res_list' in page_result: parsing_res_list = page_result['parsing_res_list'] logger.info(f"Found parsing_res_list via dict access with {len(parsing_res_list)} elements") elif 'res' in page_result and isinstance(page_result['res'], dict): + result_dict = page_result['res'] if 'parsing_res_list' in page_result['res']: parsing_res_list = page_result['res']['parsing_res_list'] logger.info(f"Found parsing_res_list inside page_result['res'] with {len(parsing_res_list)} elements") @@ -347,6 +353,8 @@ class PPStructureEnhanced: elif hasattr(page_result, 'parsing_res_list'): parsing_res_list = page_result.parsing_res_list logger.info(f"Found parsing_res_list attribute with {len(parsing_res_list)} elements") + if hasattr(page_result, '__dict__'): + result_dict = page_result.__dict__ # Method 4: Check if result has to_dict method elif hasattr(page_result, 'to_dict'): @@ -355,14 +363,25 @@ class PPStructureEnhanced: parsing_res_list = result_dict['parsing_res_list'] logger.info(f"Found parsing_res_list in to_dict with {len(parsing_res_list)} elements") elif 'res' in result_dict and isinstance(result_dict['res'], dict): - if 'parsing_res_list' in result_dict['res']: - parsing_res_list = result_dict['res']['parsing_res_list'] + result_dict = result_dict['res'] + if 'parsing_res_list' in result_dict: + parsing_res_list = result_dict['parsing_res_list'] logger.info(f"Found parsing_res_list in to_dict['res'] with {len(parsing_res_list)} elements") + # Extract table_res_list which contains cell_box_list + if result_dict: + if 'table_res_list' in result_dict: + table_res_list = result_dict['table_res_list'] + logger.info(f"Found table_res_list with {len(table_res_list)} tables") + for i, tbl in enumerate(table_res_list): + if 'cell_box_list' in tbl: + logger.info(f" Table {i}: {len(tbl['cell_box_list'])} cell boxes") + # Process parsing_res_list if found if parsing_res_list: elements = self._process_parsing_res_list( - parsing_res_list, current_page, output_dir, image_path, scaling_info + parsing_res_list, current_page, output_dir, image_path, scaling_info, + table_res_list=table_res_list # Pass table_res_list for cell_box_list ) all_elements.extend(elements) @@ -426,7 +445,8 @@ class PPStructureEnhanced: current_page: int, output_dir: Optional[Path], source_image_path: Optional[Path] = None, - scaling_info: Optional['ScalingInfo'] = None + scaling_info: Optional['ScalingInfo'] = None, + table_res_list: Optional[List[Dict]] = None ) -> List[Dict[str, Any]]: """ Process parsing_res_list to extract all elements. @@ -437,6 +457,7 @@ class PPStructureEnhanced: current_page: Current page number output_dir: Optional output directory source_image_path: Path to source image for cropping image regions + table_res_list: Optional list of table results containing cell_box_list Returns: List of processed elements with normalized structure @@ -543,11 +564,42 @@ class PPStructureEnhanced: element['extracted_text'] = self._extract_text_from_html(html_content) # 2. 提取 Cell 座標 (boxes) - # 優先使用 PPStructureV3 返回的 boxes,若無則調用 SLANeXt 補充 + # 優先順序: table_res_list > res_data['boxes'] > SLANeXt 補充 cell_boxes_extracted = False - if 'boxes' in res_data: - # PPStructureV3 returned cell boxes (unlikely in PaddleX 3.x) + # First, try to get cell_box_list from table_res_list (pp_demo style) + if table_res_list and not cell_boxes_extracted: + # Match table by HTML content or find closest bbox + for tbl_res in table_res_list: + if 'cell_box_list' in tbl_res and tbl_res['cell_box_list']: + # Check if HTML matches + tbl_html = tbl_res.get('pred_html', '') + if html_content and tbl_html: + # Simple check: if both have same structure + if tbl_html[:100] == html_content[:100]: + cell_boxes = tbl_res['cell_box_list'] + # cell_box_list is already in absolute coordinates + element['cell_boxes'] = [[float(c) for c in box] for box in cell_boxes] + element['cell_boxes_source'] = 'table_res_list' + cell_boxes_extracted = True + logger.info(f"[TABLE] Found {len(cell_boxes)} cell boxes from table_res_list (HTML match)") + break + + # If no HTML match, use first available table_res with cell_box_list + if not cell_boxes_extracted: + for tbl_res in table_res_list: + if 'cell_box_list' in tbl_res and tbl_res['cell_box_list']: + cell_boxes = tbl_res['cell_box_list'] + element['cell_boxes'] = [[float(c) for c in box] for box in cell_boxes] + element['cell_boxes_source'] = 'table_res_list' + cell_boxes_extracted = True + logger.info(f"[TABLE] Found {len(cell_boxes)} cell boxes from table_res_list (first available)") + # Remove used table_res to avoid reuse + table_res_list.remove(tbl_res) + break + + if not cell_boxes_extracted and 'boxes' in res_data: + # PPStructureV3 returned cell boxes in res (unlikely in PaddleX 3.x) cell_boxes = res_data['boxes'] logger.info(f"[TABLE] Found {len(cell_boxes)} cell boxes in res_data")