fix: resolve OCR track converter data structure mismatch
**Problem**: OCR track was producing empty output files (0 pages, 0 elements)
despite successful OCR extraction (27 text regions detected).
**Root Causes**:
1. Converter expected `text_regions` to be nested inside `layout_data`, but
`process_file_traditional` returns it at the top level of the result dict
2. Converter expected `ocr_dimensions` to be a list, but single-page
documents return it as dict `{'width': W, 'height': H}`
**Solution**:
- Add `_extract_from_traditional_ocr()` method to handle top-level
`text_regions` structure from `process_file_traditional`
- Handle both dict (single-page) and list (multi-page) formats for
`ocr_dimensions`
- Update `_extract_pages()` to check for the `text_regions` key before the
`layout_data` key
**Verification**:
- Before: img1.png → 0 pages, 0 elements, 0 characters
- After: img1.png → 1 page, 27 elements, 278 characters
- Output files now properly generated (JSON: 13KB, MD: 498B, PDF: 23KB)
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -84,7 +84,7 @@ class OCRToUnifiedConverter:
|
||||
except Exception as e:
|
||||
logger.error(f"Error converting OCR results: {e}")
|
||||
import traceback
|
||||
traceback.print_exc()
|
||||
logger.error(f"Traceback: {traceback.format_exc()}")
|
||||
|
||||
# Return minimal document with error
|
||||
return UnifiedDocument(
|
||||
@@ -127,6 +127,9 @@ class OCRToUnifiedConverter:
|
||||
# Check if we have enhanced results from PPStructureEnhanced
|
||||
if 'enhanced_results' in ocr_results:
|
||||
pages = self._extract_from_enhanced_results(ocr_results['enhanced_results'])
|
||||
# Check for traditional OCR results with text_regions at top level (from process_file_traditional)
|
||||
elif 'text_regions' in ocr_results:
|
||||
pages = self._extract_from_traditional_ocr(ocr_results)
|
||||
# Check for traditional layout_data structure
|
||||
elif 'layout_data' in ocr_results:
|
||||
pages = self._extract_from_layout_data(ocr_results['layout_data'])
|
||||
@@ -237,6 +240,96 @@ class OCRToUnifiedConverter:
|
||||
|
||||
return pages
|
||||
|
||||
def _extract_from_traditional_ocr(self, ocr_results: Dict[str, Any]) -> List[Page]:
    """
    Extract pages from traditional OCR results (process_file_traditional).

    This handles the structure where text_regions and images_metadata are at
    the top level of ocr_results, not nested inside layout_data.

    Args:
        ocr_results: OCR result dict. The keys read here are
            'text_regions', 'images_metadata', 'ocr_dimensions',
            'total_pages' and, for tables only, 'layout_data' (used when it
            is a dict containing a 'tables' list).

    Returns:
        One Page per page number from 1 up to the larger of 'total_pages'
        and the highest page number referenced by any region, image or
        table, so trailing pages without detected elements are kept.
    """
    text_regions = ocr_results.get('text_regions', [])
    ocr_dimensions = ocr_results.get('ocr_dimensions', [])
    total_pages = ocr_results.get('total_pages', 1)

    # Group converted elements by the page number each source item declares
    # (items without a 'page' key are assigned to page 1).
    elements_by_page = {}

    def collect(items, convert):
        for item in items:
            # The page bucket is created even when conversion yields nothing,
            # so a page that only has unconvertible items still counts.
            bucket = elements_by_page.setdefault(item.get('page', 1), [])
            element = convert(item)
            if element:
                bucket.append(element)

    collect(text_regions, self._convert_text_region)
    collect(ocr_results.get('images_metadata', []), self._convert_image_metadata)

    # Tables are still nested inside layout_data when it is available.
    layout_data = ocr_results.get('layout_data')
    if isinstance(layout_data, dict):
        collect(layout_data.get('tables', []), self._convert_table_data)

    # Normalise ocr_dimensions once instead of re-scanning it for every page.
    # Dict  -> single-page format {'width': W, 'height': H}, used for all pages.
    # List  -> multi-page format [{'page': 1, 'width': W, 'height': H}, ...].
    # Other -> no dimensions available; width/height default to 0.
    shared_dims = ocr_dimensions if isinstance(ocr_dimensions, dict) else None
    dims_by_page = {}
    if isinstance(ocr_dimensions, list):
        for entry in ocr_dimensions:
            if isinstance(entry, dict):
                # setdefault keeps the first entry for a page, matching a
                # first-match linear search.
                dims_by_page.setdefault(entry.get('page'), entry)

    # Honour the declared page count even when the last pages are empty;
    # previously total_pages was ignored as soon as any element existed,
    # which silently dropped trailing blank pages.
    declared_pages = total_pages if isinstance(total_pages, int) else 1
    last_page = max([declared_pages, *elements_by_page])

    pages = []
    for page_num in range(1, last_page + 1):
        elements = elements_by_page.get(page_num, [])

        if shared_dims is not None:
            dims = shared_dims
        else:
            dims = dims_by_page.get(page_num, {})

        # Determine reading order based on position
        reading_order = self._calculate_reading_order(elements)

        pages.append(
            Page(
                page_number=page_num,
                dimensions=Dimensions(
                    width=dims.get('width', 0),
                    height=dims.get('height', 0),
                ),
                elements=elements,
                metadata={'reading_order': reading_order},
            )
        )

    return pages
|
||||
|
||||
def _convert_pp3_element(
|
||||
self,
|
||||
elem_data: Dict[str, Any],
|
||||
|
||||
Reference in New Issue
Block a user