fix: resolve OCR track converter data structure mismatch
**Problem**: OCR track was producing empty output files (0 pages, 0 elements)
despite successful OCR extraction (27 text regions detected).
**Root Causes**:
1. Converter expected `text_regions` inside `layout_data`, but
`process_file_traditional` returns it at the top level
2. Converter expected `ocr_dimensions` to be a list, but single-page
documents return it as a dict `{'width': W, 'height': H}`
**Solution**:
- Add `_extract_from_traditional_ocr()` method to handle top-level
`text_regions` structure from `process_file_traditional`
- Handle both dict (single-page) and list (multi-page) formats for
`ocr_dimensions`
- Update `_extract_pages()` to check for `text_regions` key before
`layout_data` key
**Verification**:
- Before: img1.png → 0 pages, 0 elements, 0 characters
- After: img1.png → 1 page, 27 elements, 278 characters
- Output files now properly generated (JSON: 13KB, MD: 498B, PDF: 23KB)
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -84,7 +84,7 @@ class OCRToUnifiedConverter:
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Error converting OCR results: {e}")
|
logger.error(f"Error converting OCR results: {e}")
|
||||||
import traceback
|
import traceback
|
||||||
traceback.print_exc()
|
logger.error(f"Traceback: {traceback.format_exc()}")
|
||||||
|
|
||||||
# Return minimal document with error
|
# Return minimal document with error
|
||||||
return UnifiedDocument(
|
return UnifiedDocument(
|
||||||
@@ -127,6 +127,9 @@ class OCRToUnifiedConverter:
|
|||||||
# Check if we have enhanced results from PPStructureEnhanced
|
# Check if we have enhanced results from PPStructureEnhanced
|
||||||
if 'enhanced_results' in ocr_results:
|
if 'enhanced_results' in ocr_results:
|
||||||
pages = self._extract_from_enhanced_results(ocr_results['enhanced_results'])
|
pages = self._extract_from_enhanced_results(ocr_results['enhanced_results'])
|
||||||
|
# Check for traditional OCR results with text_regions at top level (from process_file_traditional)
|
||||||
|
elif 'text_regions' in ocr_results:
|
||||||
|
pages = self._extract_from_traditional_ocr(ocr_results)
|
||||||
# Check for traditional layout_data structure
|
# Check for traditional layout_data structure
|
||||||
elif 'layout_data' in ocr_results:
|
elif 'layout_data' in ocr_results:
|
||||||
pages = self._extract_from_layout_data(ocr_results['layout_data'])
|
pages = self._extract_from_layout_data(ocr_results['layout_data'])
|
||||||
@@ -237,6 +240,96 @@ class OCRToUnifiedConverter:
|
|||||||
|
|
||||||
return pages
|
return pages
|
||||||
|
|
||||||
|
def _extract_from_traditional_ocr(self, ocr_results: Dict[str, Any]) -> List[Page]:
    """
    Extract pages from traditional OCR results (process_file_traditional).

    This handles the structure where text_regions and images_metadata are at
    the top level of ocr_results, not nested inside layout_data.

    Args:
        ocr_results: Result dict. Keys read here are 'text_regions',
            'images_metadata', 'layout_data' (only its 'tables' list, and
            only when 'layout_data' is a dict), 'ocr_dimensions' and
            'total_pages'. Every item is assigned to the page given by its
            'page' key, defaulting to page 1.

    Returns:
        One Page for every page number from 1 up to the larger of the
        highest page number seen on any item and 'total_pages', so pages
        without any detected element are still present in the output.
        Page dimensions are 0 x 0 when none can be found for a page.
    """
    # Pair every source list with the converter for its items so that the
    # grouping logic is written once instead of once per element kind.
    sources = [
        (ocr_results.get('text_regions', []), self._convert_text_region),
        (ocr_results.get('images_metadata', []), self._convert_image_metadata),
    ]
    layout_data = ocr_results.get('layout_data')
    if isinstance(layout_data, dict):
        sources.append((layout_data.get('tables', []), self._convert_table_data))

    # Group converted elements by page number.
    elements_by_page = {}
    for items, convert in sources:
        for item in items:
            # Register the page before converting: a page whose items all
            # fail to convert must still count towards the page range.
            page_elements = elements_by_page.setdefault(item.get('page', 1), [])
            element = convert(item)
            if element:
                page_elements.append(element)

    ocr_dimensions = ocr_results.get('ocr_dimensions', [])

    def page_size(page_num):
        """Return (width, height) for page_num, or (0, 0) if unavailable."""
        if isinstance(ocr_dimensions, dict):
            # Single page format: {'width': W, 'height': H}
            dims = ocr_dimensions
        elif isinstance(ocr_dimensions, list):
            # Multi-page format: [{'page': 1, 'width': W, 'height': H}, ...]
            dims = next(
                (d for d in ocr_dimensions
                 if isinstance(d, dict) and d.get('page') == page_num),
                None,
            )
        else:
            dims = None
        if not dims:
            return 0, 0
        return dims.get('width', 0), dims.get('height', 0)

    # The declared page count is a lower bound for the page range; relying
    # only on the highest page that has elements would silently drop
    # trailing pages on which nothing was detected.
    declared_pages = ocr_results.get('total_pages', 1)
    if not isinstance(declared_pages, int):
        # Unusable declared count: fall back to the default of one page.
        declared_pages = 1
    last_page = max(max(elements_by_page, default=0), declared_pages)

    # Create pages
    pages = []
    for page_num in range(1, last_page + 1):
        elements = elements_by_page.get(page_num, [])
        page_width, page_height = page_size(page_num)

        # Determine reading order based on position
        reading_order = self._calculate_reading_order(elements)

        pages.append(Page(
            page_number=page_num,
            dimensions=Dimensions(width=page_width, height=page_height),
            elements=elements,
            metadata={'reading_order': reading_order},
        ))

    return pages
|
||||||
|
|
||||||
def _convert_pp3_element(
|
def _convert_pp3_element(
|
||||||
self,
|
self,
|
||||||
elem_data: Dict[str, Any],
|
elem_data: Dict[str, Any],
|
||||||
|
|||||||
Reference in New Issue
Block a user