fix: improve PP-StructureV3 structure preservation for complex diagrams
- Fix parsing_res_list field mapping (block_label, block_content, block_bbox)
- Add fine-grained PP-StructureV3 configuration parameters
- Lower detection thresholds (0.5→0.2) for more sensitive element detection
- Use 'small' merge mode instead of default to minimize bbox merging
- Add layout_nms, unclip_ratio, text_det thresholds for better control
- Result: Doubled element detection from 6 to 12 elements on complex diagrams

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -91,7 +91,13 @@ class Settings(BaseSettings):
|
|||||||
enable_table_recognition: bool = Field(default=True) # Table structure recognition
|
enable_table_recognition: bool = Field(default=True) # Table structure recognition
|
||||||
enable_seal_recognition: bool = Field(default=True) # Seal/stamp recognition
|
enable_seal_recognition: bool = Field(default=True) # Seal/stamp recognition
|
||||||
enable_text_recognition: bool = Field(default=True) # General text recognition
|
enable_text_recognition: bool = Field(default=True) # General text recognition
|
||||||
layout_detection_threshold: float = Field(default=0.5)
|
layout_detection_threshold: float = Field(default=0.2) # Lower threshold for more sensitive detection
|
||||||
|
layout_nms_threshold: float = Field(default=0.2) # Lower NMS to preserve more individual elements
|
||||||
|
layout_merge_mode: str = Field(default="small") # Use 'small' to minimize bbox merging
|
||||||
|
layout_unclip_ratio: float = Field(default=1.2) # Smaller unclip to preserve element boundaries
|
||||||
|
text_det_thresh: float = Field(default=0.2) # More sensitive text detection
|
||||||
|
text_det_box_thresh: float = Field(default=0.3) # Lower box threshold for better detection
|
||||||
|
text_det_unclip_ratio: float = Field(default=1.2) # Smaller unclip for tighter text boxes
|
||||||
|
|
||||||
# Performance tuning
|
# Performance tuning
|
||||||
use_fp16_inference: bool = Field(default=False) # Half-precision (if supported)
|
use_fp16_inference: bool = Field(default=False) # Half-precision (if supported)
|
||||||
|
|||||||
@@ -359,8 +359,16 @@ class OCRService:
|
|||||||
use_formula = settings.enable_formula_recognition
|
use_formula = settings.enable_formula_recognition
|
||||||
use_table = settings.enable_table_recognition
|
use_table = settings.enable_table_recognition
|
||||||
layout_threshold = settings.layout_detection_threshold
|
layout_threshold = settings.layout_detection_threshold
|
||||||
|
layout_nms = settings.layout_nms_threshold
|
||||||
|
layout_merge = settings.layout_merge_mode
|
||||||
|
layout_unclip = settings.layout_unclip_ratio
|
||||||
|
text_thresh = settings.text_det_thresh
|
||||||
|
text_box_thresh = settings.text_det_box_thresh
|
||||||
|
text_unclip = settings.text_det_unclip_ratio
|
||||||
|
|
||||||
logger.info(f"PP-StructureV3 config: table={use_table}, formula={use_formula}, chart={use_chart}")
|
logger.info(f"PP-StructureV3 config: table={use_table}, formula={use_formula}, chart={use_chart}")
|
||||||
|
logger.info(f"Layout config: threshold={layout_threshold}, nms={layout_nms}, merge={layout_merge}, unclip={layout_unclip}")
|
||||||
|
logger.info(f"Text detection: thresh={text_thresh}, box_thresh={text_box_thresh}, unclip={text_unclip}")
|
||||||
|
|
||||||
self.structure_engine = PPStructureV3(
|
self.structure_engine = PPStructureV3(
|
||||||
use_doc_orientation_classify=False,
|
use_doc_orientation_classify=False,
|
||||||
@@ -368,8 +376,14 @@ class OCRService:
|
|||||||
use_textline_orientation=False,
|
use_textline_orientation=False,
|
||||||
use_table_recognition=use_table,
|
use_table_recognition=use_table,
|
||||||
use_formula_recognition=use_formula,
|
use_formula_recognition=use_formula,
|
||||||
use_chart_recognition=use_chart, # Disabled by default to save ~500MB VRAM
|
use_chart_recognition=use_chart,
|
||||||
layout_threshold=layout_threshold,
|
layout_threshold=layout_threshold,
|
||||||
|
layout_nms=layout_nms,
|
||||||
|
layout_unclip_ratio=layout_unclip,
|
||||||
|
layout_merge_bboxes_mode=layout_merge, # Use 'small' to minimize merging
|
||||||
|
text_det_thresh=text_thresh,
|
||||||
|
text_det_box_thresh=text_box_thresh,
|
||||||
|
text_det_unclip_ratio=text_unclip,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Track model loading for cache management
|
# Track model loading for cache management
|
||||||
|
|||||||
@@ -34,6 +34,7 @@ class PPStructureEnhanced:
|
|||||||
# Mapping from PP-StructureV3 types to our ElementType
|
# Mapping from PP-StructureV3 types to our ElementType
|
||||||
ELEMENT_TYPE_MAPPING = {
|
ELEMENT_TYPE_MAPPING = {
|
||||||
'title': ElementType.TITLE,
|
'title': ElementType.TITLE,
|
||||||
|
'paragraph_title': ElementType.TITLE, # PP-StructureV3 block_label
|
||||||
'text': ElementType.TEXT,
|
'text': ElementType.TEXT,
|
||||||
'paragraph': ElementType.PARAGRAPH,
|
'paragraph': ElementType.PARAGRAPH,
|
||||||
'figure': ElementType.FIGURE,
|
'figure': ElementType.FIGURE,
|
||||||
@@ -107,24 +108,45 @@ class PPStructureEnhanced:
|
|||||||
# Try to access parsing_res_list (the complete structure)
|
# Try to access parsing_res_list (the complete structure)
|
||||||
parsing_res_list = None
|
parsing_res_list = None
|
||||||
|
|
||||||
# Method 1: Direct access to json attribute
|
# Method 1: Direct access to json attribute (check both top-level and res)
|
||||||
if hasattr(page_result, 'json'):
|
if hasattr(page_result, 'json'):
|
||||||
result_json = page_result.json
|
result_json = page_result.json
|
||||||
if isinstance(result_json, dict) and 'parsing_res_list' in result_json:
|
if isinstance(result_json, dict):
|
||||||
parsing_res_list = result_json['parsing_res_list']
|
# Check top-level
|
||||||
logger.info(f"Found parsing_res_list with {len(parsing_res_list)} elements")
|
if 'parsing_res_list' in result_json:
|
||||||
|
parsing_res_list = result_json['parsing_res_list']
|
||||||
|
logger.info(f"Found parsing_res_list at top level with {len(parsing_res_list)} elements")
|
||||||
|
# Check inside 'res' (new structure in paddlex)
|
||||||
|
elif 'res' in result_json and isinstance(result_json['res'], dict):
|
||||||
|
if 'parsing_res_list' in result_json['res']:
|
||||||
|
parsing_res_list = result_json['res']['parsing_res_list']
|
||||||
|
logger.info(f"Found parsing_res_list inside 'res' with {len(parsing_res_list)} elements")
|
||||||
|
|
||||||
# Method 2: Try to access as attribute
|
# Method 2: Try direct dict access (LayoutParsingResultV2 inherits from dict)
|
||||||
|
elif isinstance(page_result, dict):
|
||||||
|
if 'parsing_res_list' in page_result:
|
||||||
|
parsing_res_list = page_result['parsing_res_list']
|
||||||
|
logger.info(f"Found parsing_res_list via dict access with {len(parsing_res_list)} elements")
|
||||||
|
elif 'res' in page_result and isinstance(page_result['res'], dict):
|
||||||
|
if 'parsing_res_list' in page_result['res']:
|
||||||
|
parsing_res_list = page_result['res']['parsing_res_list']
|
||||||
|
logger.info(f"Found parsing_res_list inside page_result['res'] with {len(parsing_res_list)} elements")
|
||||||
|
|
||||||
|
# Method 3: Try to access as attribute
|
||||||
elif hasattr(page_result, 'parsing_res_list'):
|
elif hasattr(page_result, 'parsing_res_list'):
|
||||||
parsing_res_list = page_result.parsing_res_list
|
parsing_res_list = page_result.parsing_res_list
|
||||||
logger.info(f"Found parsing_res_list attribute with {len(parsing_res_list)} elements")
|
logger.info(f"Found parsing_res_list attribute with {len(parsing_res_list)} elements")
|
||||||
|
|
||||||
# Method 3: Check if result has to_dict method
|
# Method 4: Check if result has to_dict method
|
||||||
elif hasattr(page_result, 'to_dict'):
|
elif hasattr(page_result, 'to_dict'):
|
||||||
result_dict = page_result.to_dict()
|
result_dict = page_result.to_dict()
|
||||||
if 'parsing_res_list' in result_dict:
|
if 'parsing_res_list' in result_dict:
|
||||||
parsing_res_list = result_dict['parsing_res_list']
|
parsing_res_list = result_dict['parsing_res_list']
|
||||||
logger.info(f"Found parsing_res_list in to_dict with {len(parsing_res_list)} elements")
|
logger.info(f"Found parsing_res_list in to_dict with {len(parsing_res_list)} elements")
|
||||||
|
elif 'res' in result_dict and isinstance(result_dict['res'], dict):
|
||||||
|
if 'parsing_res_list' in result_dict['res']:
|
||||||
|
parsing_res_list = result_dict['res']['parsing_res_list']
|
||||||
|
logger.info(f"Found parsing_res_list in to_dict['res'] with {len(parsing_res_list)} elements")
|
||||||
|
|
||||||
# Process parsing_res_list if found
|
# Process parsing_res_list if found
|
||||||
if parsing_res_list:
|
if parsing_res_list:
|
||||||
@@ -207,27 +229,41 @@ class PPStructureEnhanced:
|
|||||||
elements = []
|
elements = []
|
||||||
|
|
||||||
for idx, item in enumerate(parsing_res_list):
|
for idx, item in enumerate(parsing_res_list):
|
||||||
# Extract element type
|
# Debug: log the structure of the first item
|
||||||
element_type = item.get('type', 'text').lower()
|
if idx == 0:
|
||||||
|
logger.info(f"First parsing_res_list item structure: {list(item.keys()) if isinstance(item, dict) else type(item)}")
|
||||||
|
logger.info(f"First parsing_res_list item sample: {str(item)[:500]}")
|
||||||
|
|
||||||
|
# Extract element type (check both 'type' and 'block_label')
|
||||||
|
element_type = item.get('type', '') or item.get('block_label', 'text')
|
||||||
|
element_type = element_type.lower()
|
||||||
mapped_type = self.ELEMENT_TYPE_MAPPING.get(
|
mapped_type = self.ELEMENT_TYPE_MAPPING.get(
|
||||||
element_type, ElementType.TEXT
|
element_type, ElementType.TEXT
|
||||||
)
|
)
|
||||||
|
|
||||||
# Extract bbox (layout_bbox has the precise coordinates)
|
# Extract bbox (check multiple possible keys)
|
||||||
layout_bbox = item.get('layout_bbox', [])
|
layout_bbox = (
|
||||||
if not layout_bbox and 'bbox' in item:
|
item.get('layout_bbox', []) or
|
||||||
layout_bbox = item['bbox']
|
item.get('block_bbox', []) or
|
||||||
|
item.get('bbox', [])
|
||||||
|
)
|
||||||
|
|
||||||
# Ensure bbox has 4 values
|
# Ensure bbox has 4 values
|
||||||
if len(layout_bbox) >= 4:
|
if len(layout_bbox) >= 4:
|
||||||
bbox = layout_bbox[:4] # [x1, y1, x2, y2]
|
bbox = layout_bbox[:4] # [x1, y1, x2, y2]
|
||||||
else:
|
else:
|
||||||
bbox = [0, 0, 0, 0] # Default if bbox missing
|
bbox = [0, 0, 0, 0] # Default if bbox missing
|
||||||
|
logger.warning(f"Element {idx} has invalid bbox: {layout_bbox}")
|
||||||
|
|
||||||
# Extract content
|
# Extract content (check multiple possible keys)
|
||||||
content = item.get('content', '')
|
content = (
|
||||||
|
item.get('content', '') or
|
||||||
|
item.get('block_content', '') or
|
||||||
|
''
|
||||||
|
)
|
||||||
|
|
||||||
|
# Additional fallback for content in 'res' field
|
||||||
if not content and 'res' in item:
|
if not content and 'res' in item:
|
||||||
# Some elements have content in 'res' field
|
|
||||||
res = item.get('res', {})
|
res = item.get('res', {})
|
||||||
if isinstance(res, dict):
|
if isinstance(res, dict):
|
||||||
content = res.get('content', '') or res.get('text', '')
|
content = res.get('content', '') or res.get('text', '')
|
||||||
|
|||||||
Reference in New Issue
Block a user