diff --git a/backend/app/core/config.py b/backend/app/core/config.py index b076de1..20c6722 100644 --- a/backend/app/core/config.py +++ b/backend/app/core/config.py @@ -91,7 +91,13 @@ class Settings(BaseSettings): enable_table_recognition: bool = Field(default=True) # Table structure recognition enable_seal_recognition: bool = Field(default=True) # Seal/stamp recognition enable_text_recognition: bool = Field(default=True) # General text recognition - layout_detection_threshold: float = Field(default=0.5) + layout_detection_threshold: float = Field(default=0.2) # Lower threshold for more sensitive detection + layout_nms_threshold: float = Field(default=0.2) # Meant to keep more individual elements. NOTE(review): this value is passed to PPStructureV3(layout_nms=...), which the PaddleOCR docs describe as a bool on/off switch for NMS, not a threshold; 0.2 would just be truthy. TODO confirm + layout_merge_mode: str = Field(default="small") # 'small' keeps the smaller inner box when layout boxes overlap (per PaddleOCR docs: 'large' keeps the outer box, 'union' keeps both) + layout_unclip_ratio: float = Field(default=1.2) # Smaller unclip to preserve element boundaries + text_det_thresh: float = Field(default=0.2) # More sensitive text detection + text_det_box_thresh: float = Field(default=0.3) # Lower box threshold for better detection + text_det_unclip_ratio: float = Field(default=1.2) # Smaller unclip for tighter text boxes # Performance tuning use_fp16_inference: bool = Field(default=False) # Half-precision (if supported) diff --git a/backend/app/services/ocr_service.py b/backend/app/services/ocr_service.py index db0b881..d93d786 100644 --- a/backend/app/services/ocr_service.py +++ b/backend/app/services/ocr_service.py @@ -359,8 +359,16 @@ class OCRService: use_formula = settings.enable_formula_recognition use_table = settings.enable_table_recognition layout_threshold = settings.layout_detection_threshold + layout_nms = settings.layout_nms_threshold + layout_merge = settings.layout_merge_mode + layout_unclip = settings.layout_unclip_ratio + text_thresh = settings.text_det_thresh + text_box_thresh = settings.text_det_box_thresh + text_unclip = settings.text_det_unclip_ratio logger.info(f"PP-StructureV3 config: table={use_table}, 
formula={use_formula}, chart={use_chart}") + logger.info(f"Layout config: threshold={layout_threshold}, nms={layout_nms}, merge={layout_merge}, unclip={layout_unclip}") + logger.info(f"Text detection: thresh={text_thresh}, box_thresh={text_box_thresh}, unclip={text_unclip}") self.structure_engine = PPStructureV3( use_doc_orientation_classify=False, @@ -368,8 +376,14 @@ class OCRService: use_textline_orientation=False, use_table_recognition=use_table, use_formula_recognition=use_formula, - use_chart_recognition=use_chart, # Disabled by default to save ~500MB VRAM + use_chart_recognition=use_chart, layout_threshold=layout_threshold, + layout_nms=layout_nms, + layout_unclip_ratio=layout_unclip, + layout_merge_bboxes_mode=layout_merge, # Use 'small' to minimize merging + text_det_thresh=text_thresh, + text_det_box_thresh=text_box_thresh, + text_det_unclip_ratio=text_unclip, ) # Track model loading for cache management diff --git a/backend/app/services/pp_structure_enhanced.py b/backend/app/services/pp_structure_enhanced.py index 5d9fedf..d1f00ea 100644 --- a/backend/app/services/pp_structure_enhanced.py +++ b/backend/app/services/pp_structure_enhanced.py @@ -34,6 +34,7 @@ class PPStructureEnhanced: # Mapping from PP-StructureV3 types to our ElementType ELEMENT_TYPE_MAPPING = { 'title': ElementType.TITLE, + 'paragraph_title': ElementType.TITLE, # PP-StructureV3 block_label 'text': ElementType.TEXT, 'paragraph': ElementType.PARAGRAPH, 'figure': ElementType.FIGURE, @@ -107,24 +108,45 @@ class PPStructureEnhanced: # Try to access parsing_res_list (the complete structure) parsing_res_list = None - # Method 1: Direct access to json attribute + # Method 1: Direct access to json attribute (check both top-level and res) if hasattr(page_result, 'json'): result_json = page_result.json - if isinstance(result_json, dict) and 'parsing_res_list' in result_json: - parsing_res_list = result_json['parsing_res_list'] - logger.info(f"Found parsing_res_list with {len(parsing_res_list)} 
elements") + if isinstance(result_json, dict): + # Check top-level + if 'parsing_res_list' in result_json: + parsing_res_list = result_json['parsing_res_list'] + logger.info(f"Found parsing_res_list at top level with {len(parsing_res_list)} elements") + # Check inside 'res' (new structure in paddlex) + elif 'res' in result_json and isinstance(result_json['res'], dict): + if 'parsing_res_list' in result_json['res']: + parsing_res_list = result_json['res']['parsing_res_list'] + logger.info(f"Found parsing_res_list inside 'res' with {len(parsing_res_list)} elements") - # Method 2: Try to access as attribute + # Method 2: Try direct dict access (LayoutParsingResultV2 inherits from dict) + elif isinstance(page_result, dict): + if 'parsing_res_list' in page_result: + parsing_res_list = page_result['parsing_res_list'] + logger.info(f"Found parsing_res_list via dict access with {len(parsing_res_list)} elements") + elif 'res' in page_result and isinstance(page_result['res'], dict): + if 'parsing_res_list' in page_result['res']: + parsing_res_list = page_result['res']['parsing_res_list'] + logger.info(f"Found parsing_res_list inside page_result['res'] with {len(parsing_res_list)} elements") + + # Method 3: Try to access as attribute elif hasattr(page_result, 'parsing_res_list'): parsing_res_list = page_result.parsing_res_list logger.info(f"Found parsing_res_list attribute with {len(parsing_res_list)} elements") - # Method 3: Check if result has to_dict method + # Method 4: Check if result has to_dict method elif hasattr(page_result, 'to_dict'): result_dict = page_result.to_dict() if 'parsing_res_list' in result_dict: parsing_res_list = result_dict['parsing_res_list'] logger.info(f"Found parsing_res_list in to_dict with {len(parsing_res_list)} elements") + elif 'res' in result_dict and isinstance(result_dict['res'], dict): + if 'parsing_res_list' in result_dict['res']: + parsing_res_list = result_dict['res']['parsing_res_list'] + logger.info(f"Found parsing_res_list in 
to_dict['res'] with {len(parsing_res_list)} elements") # Process parsing_res_list if found if parsing_res_list: @@ -207,27 +229,41 @@ class PPStructureEnhanced: elements = [] for idx, item in enumerate(parsing_res_list): - # Extract element type - element_type = item.get('type', 'text').lower() + # Debug: log the structure of the first item + if idx == 0: + logger.info(f"First parsing_res_list item structure: {list(item.keys()) if isinstance(item, dict) else type(item)}") + logger.info(f"First parsing_res_list item sample: {str(item)[:500]}") + + # Extract element type (check both 'type' and 'block_label') + element_type = item.get('type', '') or item.get('block_label', 'text') + element_type = element_type.lower() mapped_type = self.ELEMENT_TYPE_MAPPING.get( element_type, ElementType.TEXT ) - # Extract bbox (layout_bbox has the precise coordinates) - layout_bbox = item.get('layout_bbox', []) - if not layout_bbox and 'bbox' in item: - layout_bbox = item['bbox'] + # Extract bbox (check multiple possible keys) + layout_bbox = ( + item.get('layout_bbox', []) or + item.get('block_bbox', []) or + item.get('bbox', []) + ) # Ensure bbox has 4 values if len(layout_bbox) >= 4: bbox = layout_bbox[:4] # [x1, y1, x2, y2] else: bbox = [0, 0, 0, 0] # Default if bbox missing + logger.warning(f"Element {idx} has invalid bbox: {layout_bbox}") - # Extract content - content = item.get('content', '') + # Extract content (check multiple possible keys) + content = ( + item.get('content', '') or + item.get('block_content', '') or + '' + ) + + # Additional fallback for content in 'res' field if not content and 'res' in item: - # Some elements have content in 'res' field res = item.get('res', {}) if isinstance(res, dict): content = res.get('content', '') or res.get('text', '')