def _extract_from_traditional_ocr(self, ocr_results: Dict[str, Any]) -> List[Page]:
    """
    Extract pages from traditional OCR results (process_file_traditional).

    This handles the structure where text_regions and images_metadata are at
    the top level of ocr_results, not nested inside layout_data. Tables are
    still read from ocr_results['layout_data']['tables'] when layout_data is
    a dict.

    Args:
        ocr_results: OCR result dict. Keys read here: 'text_regions',
            'images_metadata', 'layout_data' (only its 'tables' entry),
            'ocr_dimensions' (a single {'width', 'height'} dict applied to
            every page, or a list of {'page', 'width', 'height'} dicts) and
            'total_pages'.

    Returns:
        One Page per page number from 1 up to the larger of the highest page
        number that has an element and the declared 'total_pages'. Pages
        without elements are returned with an empty element list; pages
        without known dimensions get width and height 0.
    """
    elements_by_page: Dict[Any, List[Any]] = {}

    def collect(items, convert):
        # Group converted elements under the item's 'page' (default 1).
        # The page key is registered even when conversion yields nothing,
        # so such a page still counts towards the highest page number.
        for item in items or []:
            page_elements = elements_by_page.setdefault(item.get('page', 1), [])
            element = convert(item)
            if element:
                page_elements.append(element)

    # Order matters: text regions first, then images, then tables.
    collect(ocr_results.get('text_regions'), self._convert_text_region)
    collect(ocr_results.get('images_metadata'), self._convert_image_metadata)

    layout_data = ocr_results.get('layout_data')
    if isinstance(layout_data, dict):
        collect(layout_data.get('tables'), self._convert_table_data)

    ocr_dimensions = ocr_results.get('ocr_dimensions', [])

    def page_size(page_num):
        # Return (width, height) for a page, or (0, 0) when unknown.
        if isinstance(ocr_dimensions, dict):
            # Single page format: {'width': W, 'height': H}
            dims = ocr_dimensions
        elif isinstance(ocr_dimensions, list):
            # Multi-page format: [{'page': 1, 'width': W, 'height': H}, ...]
            dims = next(
                (
                    d for d in ocr_dimensions
                    if isinstance(d, dict) and d.get('page') == page_num
                ),
                None,
            )
        else:
            dims = None

        if not dims:
            return 0, 0
        return dims.get('width', 0), dims.get('height', 0)

    # Honour the declared page count even when elements exist; otherwise
    # trailing pages without any element would be dropped from the result.
    declared_pages = ocr_results.get('total_pages', 1)
    if not isinstance(declared_pages, int):
        declared_pages = 1
    last_page = max(max(elements_by_page, default=0), declared_pages)

    pages = []
    for page_num in range(1, last_page + 1):
        elements = elements_by_page.get(page_num, [])
        page_width, page_height = page_size(page_num)

        # Determine reading order based on position
        reading_order = self._calculate_reading_order(elements)

        pages.append(
            Page(
                page_number=page_num,
                dimensions=Dimensions(
                    width=page_width,
                    height=page_height
                ),
                elements=elements,
                metadata={'reading_order': reading_order}
            )
        )

    return pages