feat: create extract-table-cell-boxes proposal and archive old proposal

- Archive the unify-image-scaling proposal to archive/2025-11-28
- Create a new extract-table-cell-boxes proposal for supplementing PPStructureV3 with direct SLANeXt model calls to extract table cell bounding boxes
- Add debug logging to pp_structure_enhanced.py for the table cell boxes investigation
- Discovered that the PPStructureV3 high-level API filters out cell bbox data, but paddlex.create_model() can invoke the underlying models directly

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-28 12:15:06 +08:00
parent dda9621e17
commit 801ee9c4b6
7 changed files with 393 additions and 4 deletions
--- a/backend/app/services/pp_structure_enhanced.py
+++ b/backend/app/services/pp_structure_enhanced.py
@@ -355,14 +355,54 @@ class PPStructureEnhanced:

            # Special handling for tables
            if mapped_type == ElementType.TABLE:
-                # Use HTML content from content-based detection or extract from 'res'
-                html_content = html_table_content  # From content-based detection
-                if not html_content and 'res' in item and isinstance(item['res'], dict):
-                    html_content = item['res'].get('html', '')
+                # 1. 提取 HTML (原有邏輯)
+                html_content = html_table_content
+                res_data = {}
+
+                # 獲取 res 字典 (包含 html 和 boxes)
+                if 'res' in item and isinstance(item['res'], dict):
+                    res_data = item['res']
+                    logger.info(f"[TABLE] Found 'res' dict with keys: {list(res_data.keys())}")
+                    if not html_content:
+                        html_content = res_data.get('html', '')
+                else:
+                    logger.info(f"[TABLE] No 'res' key in item. Available keys: {list(item.keys())}")
+
                if html_content:
                    element['html'] = html_content
                    element['extracted_text'] = self._extract_text_from_html(html_content)

+                # 2. 【新增】提取 Cell 座標 (boxes)
+                # SLANet 回傳的格式通常是 [[x1, y1, x2, y2], ...]
+                if 'boxes' in res_data:
+                    cell_boxes = res_data['boxes']
+                    logger.info(f"[TABLE] Found {len(cell_boxes)} cell boxes in res_data")
+
+                    # 獲取表格自身的偏移量 (用於將 Cell 的相對座標轉為絕對座標)
+                    table_x, table_y = 0, 0
+                    if len(bbox) >= 2:  # bbox is [x1, y1, x2, y2]
+                        table_x, table_y = bbox[0], bbox[1]
+
+                    processed_cells = []
+                    for cell_box in cell_boxes:
+                        # 確保格式正確
+                        if isinstance(cell_box, (list, tuple)) and len(cell_box) >= 4:
+                            # 轉換為絕對座標: Cell x + 表格 x
+                            abs_cell_box = [
+                                cell_box[0] + table_x,
+                                cell_box[1] + table_y,
+                                cell_box[2] + table_x,
+                                cell_box[3] + table_y
+                            ]
+                            processed_cells.append(abs_cell_box)
+
+                    # 將處理後的 Cell 座標存入 element
+                    element['cell_boxes'] = processed_cells
+                    element['raw_cell_boxes'] = cell_boxes
+                    logger.info(f"[TABLE] Processed {len(processed_cells)} cell boxes with table offset ({table_x}, {table_y})")
+                else:
+                    logger.info(f"[TABLE] No 'boxes' key in res_data. Available: {list(res_data.keys()) if res_data else 'empty'}")
+
            # Special handling for images/figures
            elif mapped_type in [ElementType.IMAGE, ElementType.FIGURE]:
                # Save image if path provided