""" Enhanced PP-StructureV3 processing with full element extraction This module provides enhanced PP-StructureV3 processing that extracts all 23 element types with their bbox coordinates and reading order. """ import logging from pathlib import Path from typing import Dict, List, Optional, Tuple, Any import json import gc # Optional torch import for additional GPU memory management try: import torch TORCH_AVAILABLE = True except ImportError: TORCH_AVAILABLE = False import paddle from paddleocr import PPStructureV3 from PIL import Image import numpy as np from app.models.unified_document import ElementType from app.core.config import settings from app.services.memory_manager import prediction_context logger = logging.getLogger(__name__) class PPStructureEnhanced: """ Enhanced PP-StructureV3 processor that extracts all available element types and structure information from parsing_res_list. """ # Mapping from PP-StructureV3 types to our ElementType ELEMENT_TYPE_MAPPING = { 'title': ElementType.TITLE, 'paragraph_title': ElementType.TITLE, # PP-StructureV3 block_label 'text': ElementType.TEXT, 'paragraph': ElementType.PARAGRAPH, 'figure': ElementType.FIGURE, 'figure_caption': ElementType.CAPTION, 'table': ElementType.TABLE, 'table_caption': ElementType.TABLE_CAPTION, 'header': ElementType.HEADER, 'footer': ElementType.FOOTER, 'reference': ElementType.REFERENCE, 'equation': ElementType.EQUATION, 'formula': ElementType.FORMULA, 'list-item': ElementType.LIST_ITEM, 'list': ElementType.LIST, 'code': ElementType.CODE, 'footnote': ElementType.FOOTNOTE, 'page-number': ElementType.PAGE_NUMBER, 'watermark': ElementType.WATERMARK, 'signature': ElementType.SIGNATURE, 'stamp': ElementType.STAMP, 'logo': ElementType.LOGO, 'barcode': ElementType.BARCODE, 'qr-code': ElementType.QR_CODE, # Default fallback 'image': ElementType.IMAGE, 'chart': ElementType.CHART, 'diagram': ElementType.DIAGRAM, } def __init__(self, structure_engine: PPStructureV3): """ Initialize with existing 
def analyze_with_full_structure(
    self,
    image_path: Path,
    output_dir: Optional[Path] = None,
    current_page: int = 0,
    preprocessed_image: Optional[Image.Image] = None
) -> Dict[str, Any]:
    """
    Analyze document with full PP-StructureV3 capabilities.

    Args:
        image_path: Path to original image file (used for cropping extracted images)
        output_dir: Optional output directory for saving extracted content
        current_page: Current page number (0-based)
        preprocessed_image: Optional preprocessed PIL Image for layout detection.
            If provided, this is used for PP-Structure prediction, but the
            original image_path is still passed on for cropping images.

    Returns:
        Dictionary that always contains the same keys, on success and on failure:
        - elements: List of all detected elements with types and bbox
        - total_elements: Number of entries in ``elements``
        - reading_order: Reading order indices
        - images: Elements typed IMAGE or FIGURE
        - tables: Elements typed TABLE
        - element_types: Per-type element counts
        - has_parsing_res_list: True if any page result exposed parsing_res_list
        - error: Only present on failure; human-readable reason

    This method does not raise: every exception is logged and converted into
    an empty result carrying an ``error`` entry.
    """
    try:
        logger.info(f"Enhanced PP-StructureV3 analysis on {image_path.name}")
        if preprocessed_image is not None:
            logger.info("Using preprocessed image for layout detection")

        # Perform structure analysis with semaphore control.
        # This prevents OOM errors from multiple simultaneous predictions.
        with prediction_context(timeout=settings.service_acquire_timeout_seconds) as acquired:
            if not acquired:
                logger.error("Failed to acquire prediction slot (timeout), returning empty result")
                return self._empty_analysis_result('Prediction slot timeout')

            # Use preprocessed image if provided, otherwise use original path
            if preprocessed_image is not None:
                # Convert PIL to numpy array (BGR format for PP-Structure)
                predict_input = np.array(preprocessed_image)
                if predict_input.ndim == 3 and predict_input.shape[2] == 3:
                    # Reversing the channel axis yields a negative-stride view;
                    # copy it so the engine receives a contiguous BGR buffer.
                    predict_input = np.ascontiguousarray(predict_input[:, :, ::-1])
                raw_results = self.structure_engine.predict(predict_input)
            else:
                raw_results = self.structure_engine.predict(str(image_path))

            # Materialize while the slot is still held so that a lazily
            # evaluated result (generator) cannot run inference after release.
            results = list(raw_results)

        all_elements: List[Dict[str, Any]] = []
        all_images: List[Dict[str, Any]] = []
        all_tables: List[Dict[str, Any]] = []
        # Tracked across pages; also well-defined when `results` is empty.
        found_parsing_res_list = False

        # Process each page result
        for page_result in results:
            parsing_res_list = self._locate_parsing_res_list(page_result)
            if parsing_res_list is not None:
                found_parsing_res_list = True

            if parsing_res_list:
                elements = self._process_parsing_res_list(
                    parsing_res_list, current_page, output_dir, image_path
                )
                all_elements.extend(elements)

                # Extract tables and images from elements
                for elem in elements:
                    if elem['type'] == ElementType.TABLE:
                        all_tables.append(elem)
                    elif elem['type'] in [ElementType.IMAGE, ElementType.FIGURE]:
                        all_images.append(elem)
            else:
                # Fallback to markdown if parsing_res_list not available (or empty)
                logger.warning("parsing_res_list not found, falling back to markdown")
                elements = self._process_markdown_fallback(
                    page_result, current_page, output_dir
                )
                all_elements.extend(elements)

        # Create reading order based on element positions
        reading_order = self._determine_reading_order(all_elements)

        return {
            'elements': all_elements,
            'total_elements': len(all_elements),
            'reading_order': reading_order,
            'tables': all_tables,
            'images': all_images,
            'element_types': self._count_element_types(all_elements),
            'has_parsing_res_list': found_parsing_res_list
        }

    except Exception as e:
        # Top-level boundary: route the traceback through logging instead of stderr.
        logger.error(f"Enhanced PP-StructureV3 analysis error: {e}", exc_info=True)

        # Clean up GPU memory on error (best effort)
        try:
            if TORCH_AVAILABLE and torch.cuda.is_available():
                torch.cuda.empty_cache()
                torch.cuda.synchronize()

            if paddle.device.is_compiled_with_cuda():
                paddle.device.cuda.empty_cache()

            gc.collect()
        except Exception as cleanup_error:
            # Cleanup failures must not mask the original error.
            logger.debug(f"GPU memory cleanup failed: {cleanup_error}")

        return self._empty_analysis_result(str(e))


def _empty_analysis_result(self, error: str) -> Dict[str, Any]:
    """Build the empty result returned on failure, with the full key set plus ``error``."""
    return {
        'elements': [],
        'total_elements': 0,
        'reading_order': [],
        'tables': [],
        'images': [],
        'element_types': {},
        'has_parsing_res_list': False,
        'error': error
    }


def _locate_parsing_res_list(self, page_result: Any) -> Optional[List[Any]]:
    """
    Find parsing_res_list on a single page result, trying each known location in turn.

    Locations are tried in this order, each one only if the previous yielded nothing:
    the ``json`` attribute, the result itself as a dict, a ``parsing_res_list``
    attribute, and finally the return value of ``to_dict()``. For dict-like
    sources both the top level and a nested ``'res'`` dict are checked.

    Args:
        page_result: One item from the structure engine's prediction output

    Returns:
        The parsing_res_list value (possibly empty), or None if no location has one
    """
    def from_mapping(container: Any, top_label: str, nested_label: str) -> Optional[List[Any]]:
        # Look at the top level first, then inside 'res'.
        if not isinstance(container, dict):
            return None
        found = container.get('parsing_res_list')
        if found is not None:
            logger.info(f"Found parsing_res_list {top_label} with {len(found)} elements")
            return found
        nested = container.get('res')
        if isinstance(nested, dict) and nested.get('parsing_res_list') is not None:
            found = nested['parsing_res_list']
            logger.info(f"Found parsing_res_list {nested_label} with {len(found)} elements")
            return found
        return None

    # Method 1: json attribute (check both top-level and 'res')
    found = from_mapping(getattr(page_result, 'json', None), "at top level", "inside 'res'")
    if found is not None:
        return found

    # Method 2: direct dict access (the result object may itself be a dict)
    found = from_mapping(page_result, "via dict access", "inside page_result['res']")
    if found is not None:
        return found

    # Method 3: plain attribute
    found = getattr(page_result, 'parsing_res_list', None)
    if found is not None:
        logger.info(f"Found parsing_res_list attribute with {len(found)} elements")
        return found

    # Method 4: to_dict() is only called when every cheaper location failed
    to_dict = getattr(page_result, 'to_dict', None)
    if callable(to_dict):
        return from_mapping(to_dict(), "in to_dict", "in to_dict['res']")

    return None
current_page: int, output_dir: Optional[Path], source_image_path: Optional[Path] = None ) -> List[Dict[str, Any]]: """ Process parsing_res_list to extract all elements. Args: parsing_res_list: List of parsed elements from PP-StructureV3 current_page: Current page number output_dir: Optional output directory source_image_path: Path to source image for cropping image regions Returns: List of processed elements with normalized structure """ elements = [] for idx, item in enumerate(parsing_res_list): # Debug: log the structure of the first item if idx == 0: logger.info(f"First parsing_res_list item structure: {list(item.keys()) if isinstance(item, dict) else type(item)}") logger.info(f"First parsing_res_list item sample: {str(item)[:500]}") # Extract element type (check both 'type' and 'block_label') element_type = item.get('type', '') or item.get('block_label', 'text') element_type = element_type.lower() mapped_type = self.ELEMENT_TYPE_MAPPING.get( element_type, ElementType.TEXT ) # Extract bbox (check multiple possible keys) layout_bbox = ( item.get('layout_bbox', []) or item.get('block_bbox', []) or item.get('bbox', []) ) # Ensure bbox has 4 values if len(layout_bbox) >= 4: bbox = layout_bbox[:4] # [x1, y1, x2, y2] else: bbox = [0, 0, 0, 0] # Default if bbox missing logger.warning(f"Element {idx} has invalid bbox: {layout_bbox}") # Extract content (check multiple possible keys) content = ( item.get('content', '') or item.get('block_content', '') or '' ) # Additional fallback for content in 'res' field if not content and 'res' in item: res = item.get('res', {}) if isinstance(res, dict): content = res.get('content', '') or res.get('text', '') elif isinstance(res, str): content = res # Content-based HTML table detection: PP-StructureV3 sometimes # classifies tables as 'text' but returns HTML table content html_table_content = None if content and '