fix: improve PP-StructureV3 structure preservation for complex diagrams

- Fix parsing_res_list field mapping (block_label, block_content, block_bbox)
- Add fine-grained PP-StructureV3 configuration parameters
- Lower detection thresholds (0.5→0.2) for more sensitive element detection
- Use 'small' merge mode instead of default to minimize bbox merging
- Add layout_nms, unclip_ratio, text_det thresholds for better control
- Result: Doubled element detection from 6 to 12 elements on complex diagrams

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-25 08:53:37 +08:00
parent 4325d024a7
commit a659e7ae00
3 changed files with 73 additions and 17 deletions
--- a/backend/app/services/pp_structure_enhanced.py
+++ b/backend/app/services/pp_structure_enhanced.py
@@ -34,6 +34,7 @@ class PPStructureEnhanced:
    # Mapping from PP-StructureV3 types to our ElementType
    ELEMENT_TYPE_MAPPING = {
        'title': ElementType.TITLE,
+        'paragraph_title': ElementType.TITLE,  # PP-StructureV3 block_label
        'text': ElementType.TEXT,
        'paragraph': ElementType.PARAGRAPH,
        'figure': ElementType.FIGURE,
@@ -107,24 +108,45 @@ class PPStructureEnhanced:
                # Try to access parsing_res_list (the complete structure)
                parsing_res_list = None

-                # Method 1: Direct access to json attribute
+                # Method 1: Direct access to json attribute (check both top-level and res)
                if hasattr(page_result, 'json'):
                    result_json = page_result.json
-                    if isinstance(result_json, dict) and 'parsing_res_list' in result_json:
-                        parsing_res_list = result_json['parsing_res_list']
-                        logger.info(f"Found parsing_res_list with {len(parsing_res_list)} elements")
+                    if isinstance(result_json, dict):
+                        # Check top-level
+                        if 'parsing_res_list' in result_json:
+                            parsing_res_list = result_json['parsing_res_list']
+                            logger.info(f"Found parsing_res_list at top level with {len(parsing_res_list)} elements")
+                        # Check inside 'res' (new structure in paddlex)
+                        elif 'res' in result_json and isinstance(result_json['res'], dict):
+                            if 'parsing_res_list' in result_json['res']:
+                                parsing_res_list = result_json['res']['parsing_res_list']
+                                logger.info(f"Found parsing_res_list inside 'res' with {len(parsing_res_list)} elements")

-                # Method 2: Try to access as attribute
+                # Method 2: Try direct dict access (LayoutParsingResultV2 inherits from dict)
+                elif isinstance(page_result, dict):
+                    if 'parsing_res_list' in page_result:
+                        parsing_res_list = page_result['parsing_res_list']
+                        logger.info(f"Found parsing_res_list via dict access with {len(parsing_res_list)} elements")
+                    elif 'res' in page_result and isinstance(page_result['res'], dict):
+                        if 'parsing_res_list' in page_result['res']:
+                            parsing_res_list = page_result['res']['parsing_res_list']
+                            logger.info(f"Found parsing_res_list inside page_result['res'] with {len(parsing_res_list)} elements")
+
+                # Method 3: Try to access as attribute
                elif hasattr(page_result, 'parsing_res_list'):
                    parsing_res_list = page_result.parsing_res_list
                    logger.info(f"Found parsing_res_list attribute with {len(parsing_res_list)} elements")

-                # Method 3: Check if result has to_dict method
+                # Method 4: Check if result has to_dict method
                elif hasattr(page_result, 'to_dict'):
                    result_dict = page_result.to_dict()
                    if 'parsing_res_list' in result_dict:
                        parsing_res_list = result_dict['parsing_res_list']
                        logger.info(f"Found parsing_res_list in to_dict with {len(parsing_res_list)} elements")
+                    elif 'res' in result_dict and isinstance(result_dict['res'], dict):
+                        if 'parsing_res_list' in result_dict['res']:
+                            parsing_res_list = result_dict['res']['parsing_res_list']
+                            logger.info(f"Found parsing_res_list in to_dict['res'] with {len(parsing_res_list)} elements")

                # Process parsing_res_list if found
                if parsing_res_list:
@@ -207,27 +229,41 @@ class PPStructureEnhanced:
        elements = []

        for idx, item in enumerate(parsing_res_list):
-            # Extract element type
-            element_type = item.get('type', 'text').lower()
+            # Debug: log the structure of the first item
+            if idx == 0:
+                logger.info(f"First parsing_res_list item structure: {list(item.keys()) if isinstance(item, dict) else type(item)}")
+                logger.info(f"First parsing_res_list item sample: {str(item)[:500]}")
+
+            # Extract element type (check both 'type' and 'block_label')
+            element_type = item.get('type', '') or item.get('block_label', 'text')
+            element_type = element_type.lower()
            mapped_type = self.ELEMENT_TYPE_MAPPING.get(
                element_type, ElementType.TEXT
            )

-            # Extract bbox (layout_bbox has the precise coordinates)
-            layout_bbox = item.get('layout_bbox', [])
-            if not layout_bbox and 'bbox' in item:
-                layout_bbox = item['bbox']
+            # Extract bbox (check multiple possible keys)
+            layout_bbox = (
+                item.get('layout_bbox', []) or
+                item.get('block_bbox', []) or
+                item.get('bbox', [])
+            )

            # Ensure bbox has 4 values
            if len(layout_bbox) >= 4:
                bbox = layout_bbox[:4]  # [x1, y1, x2, y2]
            else:
                bbox = [0, 0, 0, 0]  # Default if bbox missing
+                logger.warning(f"Element {idx} has invalid bbox: {layout_bbox}")

-            # Extract content
-            content = item.get('content', '')
+            # Extract content (check multiple possible keys)
+            content = (
+                item.get('content', '') or
+                item.get('block_content', '') or
+                ''
+            )
+
+            # Additional fallback for content in 'res' field
            if not content and 'res' in item:
-                # Some elements have content in 'res' field
                res = item.get('res', {})
                if isinstance(res, dict):
                    content = res.get('content', '') or res.get('text', '')