From f5a2c8a75090eceefaf24cd2ebd7ae5f44702eee Mon Sep 17 00:00:00 2001
From: egg <lin4637lin4637@gmail.com>
Date: Fri, 28 Nov 2025 12:41:18 +0800
Subject: [PATCH] feat: extract cell_box_list from table_res_list
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Analysis of pp_demo shows that PPStructureV3 returns a table_res_list whose
entries contain cell_box_list, which was previously ignored. This commit:

- Extracts table_res_list from the PPStructureV3 result alongside parsing_res_list
- Adds a table_res_list parameter to _process_parsing_res_list()
- Prioritizes cell_box_list from table_res_list over SLANeXt extraction
- Matches tables by the first 100 characters of their HTML, falling back to
  the first available entry that has a cell_box_list

Priority order for cell boxes:
1. table_res_list.cell_box_list (native, already absolute coords)
2. res_data['boxes'] (unlikely in PaddleX 3.x)
3. Direct SLANeXt model call (fallback)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
---
 backend/app/services/pp_structure_enhanced.py | 68 ++++++++++++++++---
 1 file changed, 60 insertions(+), 8 deletions(-)

diff --git a/backend/app/services/pp_structure_enhanced.py b/backend/app/services/pp_structure_enhanced.py
index c0cfed3..583faf1 100644
--- a/backend/app/services/pp_structure_enhanced.py
+++ b/backend/app/services/pp_structure_enhanced.py
@@ -316,29 +316,35 @@ class PPStructureEnhanced:
 
             # Process each page result
             for page_idx, page_result in enumerate(results):
-                # Try to access parsing_res_list (the complete structure)
+                # Try to access parsing_res_list and table_res_list (the complete structure)
                 parsing_res_list = None
+                table_res_list = None
+                result_dict = None
 
                 # Method 1: Direct access to json attribute (check both top-level and res)
                 if hasattr(page_result, 'json'):
                     result_json = page_result.json
                     if isinstance(result_json, dict):
+                        result_dict = result_json
                         # Check top-level
                         if 'parsing_res_list' in result_json:
                             parsing_res_list = result_json['parsing_res_list']
                             logger.info(f"Found parsing_res_list at top level with {len(parsing_res_list)} elements")
                         # Check inside 'res' (new structure in paddlex)
                         elif 'res' in result_json and isinstance(result_json['res'], dict):
+                            result_dict = result_json['res']
                             if 'parsing_res_list' in result_json['res']:
                                 parsing_res_list = result_json['res']['parsing_res_list']
                                 logger.info(f"Found parsing_res_list inside 'res' with {len(parsing_res_list)} elements")
 
                 # Method 2: Try direct dict access (LayoutParsingResultV2 inherits from dict)
                 elif isinstance(page_result, dict):
+                    result_dict = page_result
                     if 'parsing_res_list' in page_result:
                         parsing_res_list = page_result['parsing_res_list']
                         logger.info(f"Found parsing_res_list via dict access with {len(parsing_res_list)} elements")
                     elif 'res' in page_result and isinstance(page_result['res'], dict):
+                        result_dict = page_result['res']
                         if 'parsing_res_list' in page_result['res']:
                             parsing_res_list = page_result['res']['parsing_res_list']
                             logger.info(f"Found parsing_res_list inside page_result['res'] with {len(parsing_res_list)} elements")
@@ -347,6 +353,8 @@ class PPStructureEnhanced:
                 elif hasattr(page_result, 'parsing_res_list'):
                     parsing_res_list = page_result.parsing_res_list
                     logger.info(f"Found parsing_res_list attribute with {len(parsing_res_list)} elements")
+                    if hasattr(page_result, '__dict__'):
+                        result_dict = page_result.__dict__
 
                 # Method 4: Check if result has to_dict method
                 elif hasattr(page_result, 'to_dict'):
@@ -355,14 +363,25 @@ class PPStructureEnhanced:
                         parsing_res_list = result_dict['parsing_res_list']
                         logger.info(f"Found parsing_res_list in to_dict with {len(parsing_res_list)} elements")
                     elif 'res' in result_dict and isinstance(result_dict['res'], dict):
-                        if 'parsing_res_list' in result_dict['res']:
-                            parsing_res_list = result_dict['res']['parsing_res_list']
+                        result_dict = result_dict['res']
+                        if 'parsing_res_list' in result_dict:
+                            parsing_res_list = result_dict['parsing_res_list']
                             logger.info(f"Found parsing_res_list in to_dict['res'] with {len(parsing_res_list)} elements")
 
+                # Extract table_res_list which contains cell_box_list
+                if result_dict:
+                    if 'table_res_list' in result_dict:
+                        table_res_list = result_dict['table_res_list']
+                        logger.info(f"Found table_res_list with {len(table_res_list)} tables")
+                        for i, tbl in enumerate(table_res_list):
+                            if 'cell_box_list' in tbl:
+                                logger.info(f"  Table {i}: {len(tbl['cell_box_list'])} cell boxes")
+
                 # Process parsing_res_list if found
                 if parsing_res_list:
                     elements = self._process_parsing_res_list(
-                        parsing_res_list, current_page, output_dir, image_path, scaling_info
+                        parsing_res_list, current_page, output_dir, image_path, scaling_info,
+                        table_res_list=table_res_list  # Pass table_res_list for cell_box_list
                     )
                     all_elements.extend(elements)
 
@@ -426,7 +445,8 @@ class PPStructureEnhanced:
         current_page: int,
         output_dir: Optional[Path],
         source_image_path: Optional[Path] = None,
-        scaling_info: Optional['ScalingInfo'] = None
+        scaling_info: Optional['ScalingInfo'] = None,
+        table_res_list: Optional[List[Dict]] = None
     ) -> List[Dict[str, Any]]:
         """
         Process parsing_res_list to extract all elements.
@@ -437,6 +457,7 @@ class PPStructureEnhanced:
             current_page: Current page number
             output_dir: Optional output directory
             source_image_path: Path to source image for cropping image regions
+            table_res_list: Optional list of table results containing cell_box_list
 
         Returns:
             List of processed elements with normalized structure
@@ -543,11 +564,42 @@ class PPStructureEnhanced:
                     element['extracted_text'] = self._extract_text_from_html(html_content)
 
                 # 2. 提取 Cell 座標 (boxes)
-                # 優先使用 PPStructureV3 返回的 boxes，若無則調用 SLANeXt 補充
+                # 優先順序: table_res_list > res_data['boxes'] > SLANeXt 補充
                 cell_boxes_extracted = False
 
-                if 'boxes' in res_data:
-                    # PPStructureV3 returned cell boxes (unlikely in PaddleX 3.x)
+                # First, try to get cell_box_list from table_res_list (pp_demo style)
+                if table_res_list and not cell_boxes_extracted:
+                    # Match table by comparing the first 100 characters of its HTML
+                    for tbl_res in table_res_list:
+                        if 'cell_box_list' in tbl_res and tbl_res['cell_box_list']:
+                            # Check if HTML matches
+                            tbl_html = tbl_res.get('pred_html', '')
+                            if html_content and tbl_html:
+                                # Simple check: if both have same structure
+                                if tbl_html[:100] == html_content[:100]:
+                                    cell_boxes = tbl_res['cell_box_list']
+                                    # cell_box_list is already in absolute coordinates
+                                    element['cell_boxes'] = [[float(c) for c in box] for box in cell_boxes]
+                                    element['cell_boxes_source'] = 'table_res_list'
+                                    cell_boxes_extracted = True
+                                    logger.info(f"[TABLE] Found {len(cell_boxes)} cell boxes from table_res_list (HTML match)")
+                                    break
+
+                    # If no HTML match, use first available table_res with cell_box_list
+                    if not cell_boxes_extracted:
+                        for tbl_res in table_res_list:
+                            if 'cell_box_list' in tbl_res and tbl_res['cell_box_list']:
+                                cell_boxes = tbl_res['cell_box_list']
+                                element['cell_boxes'] = [[float(c) for c in box] for box in cell_boxes]
+                                element['cell_boxes_source'] = 'table_res_list'
+                                cell_boxes_extracted = True
+                                logger.info(f"[TABLE] Found {len(cell_boxes)} cell boxes from table_res_list (first available)")
+                                # Remove used table_res to avoid reuse
+                                table_res_list.remove(tbl_res)
+                                break
+
+                if not cell_boxes_extracted and 'boxes' in res_data:
+                    # PPStructureV3 returned cell boxes in res (unlikely in PaddleX 3.x)
                     cell_boxes = res_data['boxes']
                     logger.info(f"[TABLE] Found {len(cell_boxes)} cell boxes in res_data")