fix: resolve OCR track converter data structure mismatch

**Problem**: OCR track was producing empty output files (0 pages, 0 elements)
despite successful OCR extraction (27 text regions detected).

**Root Causes**:
1. Converter expected `text_regions` inside `layout_data`, but
   `process_file_traditional` returns it at the top level
2. Converter expected `ocr_dimensions` to be a list, but single-page
   documents return it as a dict `{'width': W, 'height': H}`

**Solution**:
- Add `_extract_from_traditional_ocr()` method to handle top-level
  `text_regions` structure from `process_file_traditional`
- Handle both dict (single-page) and list (multi-page) formats for
  `ocr_dimensions`
- Update `_extract_pages()` to check for `text_regions` key before
  `layout_data` key

**Verification**:
- Before: img1.png → 0 pages, 0 elements, 0 characters
- After: img1.png → 1 page, 27 elements, 278 characters
- Output files now properly generated (JSON: 13KB, MD: 498B, PDF: 23KB)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
egg
2025-11-20 17:51:18 +08:00
parent 2ecd022d6b
commit e23aaacd84

View File

@@ -84,7 +84,7 @@ class OCRToUnifiedConverter:
except Exception as e: except Exception as e:
logger.error(f"Error converting OCR results: {e}") logger.error(f"Error converting OCR results: {e}")
import traceback import traceback
traceback.print_exc() logger.error(f"Traceback: {traceback.format_exc()}")
# Return minimal document with error # Return minimal document with error
return UnifiedDocument( return UnifiedDocument(
@@ -127,6 +127,9 @@ class OCRToUnifiedConverter:
# Check if we have enhanced results from PPStructureEnhanced # Check if we have enhanced results from PPStructureEnhanced
if 'enhanced_results' in ocr_results: if 'enhanced_results' in ocr_results:
pages = self._extract_from_enhanced_results(ocr_results['enhanced_results']) pages = self._extract_from_enhanced_results(ocr_results['enhanced_results'])
# Check for traditional OCR results with text_regions at top level (from process_file_traditional)
elif 'text_regions' in ocr_results:
pages = self._extract_from_traditional_ocr(ocr_results)
# Check for traditional layout_data structure # Check for traditional layout_data structure
elif 'layout_data' in ocr_results: elif 'layout_data' in ocr_results:
pages = self._extract_from_layout_data(ocr_results['layout_data']) pages = self._extract_from_layout_data(ocr_results['layout_data'])
@@ -237,6 +240,96 @@ class OCRToUnifiedConverter:
return pages return pages
def _extract_from_traditional_ocr(self, ocr_results: Dict[str, Any]) -> List[Page]:
    """
    Extract pages from traditional OCR results (process_file_traditional).

    Handles the structure where ``text_regions`` and ``images_metadata`` sit
    at the top level of ``ocr_results`` rather than nested inside
    ``layout_data``.

    Args:
        ocr_results: OCR result mapping. Keys read here: ``text_regions``,
            ``images_metadata``, ``ocr_dimensions`` (either a single
            ``{'width': W, 'height': H}`` dict or a list of per-page dicts
            that carry a ``page`` key), ``total_pages`` and, when present
            and a dict, ``layout_data['tables']``.

    Returns:
        One ``Page`` for every page number from 1 up to the larger of
        ``total_pages`` and the highest page number referenced by any
        element. Pages without elements are emitted empty; pages whose
        dimensions are unknown get width and height 0.
    """
    elements_by_page: Dict[Any, List[Any]] = {}

    def collect(items, convert):
        # Group converted elements under the page number each item reports
        # (defaulting to page 1). The bucket is created even when the
        # conversion yields nothing, so the page itself is still emitted.
        for item in items or []:
            bucket = elements_by_page.setdefault(item.get('page', 1), [])
            element = convert(item)
            if element:
                bucket.append(element)

    collect(ocr_results.get('text_regions', []), self._convert_text_region)
    collect(ocr_results.get('images_metadata', []), self._convert_image_metadata)

    # Tables are only available through layout_data, when it is a dict.
    layout_data = ocr_results.get('layout_data')
    if isinstance(layout_data, dict):
        collect(layout_data.get('tables', []), self._convert_table_data)

    # Page dimensions arrive either as one dict shared by every page
    # (single-page format) or as a list of per-page dicts (multi-page
    # format). Index the list once instead of scanning it for each page;
    # setdefault keeps the first entry for a page, like a linear search.
    ocr_dimensions = ocr_results.get('ocr_dimensions', [])
    shared_dims = ocr_dimensions if isinstance(ocr_dimensions, dict) else None
    dims_by_page: Dict[Any, Dict[str, Any]] = {}
    if isinstance(ocr_dimensions, list):
        for entry in ocr_dimensions:
            if isinstance(entry, dict):
                dims_by_page.setdefault(entry.get('page'), entry)

    # Honour the declared page count even when the trailing pages produced
    # no elements; otherwise blank pages at the end of a document would be
    # dropped and the output page count would disagree with total_pages.
    total_pages = ocr_results.get('total_pages', 1)
    declared_pages = total_pages if isinstance(total_pages, int) else 0
    last_page = max([declared_pages, *elements_by_page])

    pages = []
    for page_num in range(1, last_page + 1):
        elements = elements_by_page.get(page_num, [])

        if shared_dims is not None:
            dims = shared_dims
        else:
            # Unknown page (or unsupported ocr_dimensions type): fall back
            # to zero-sized dimensions.
            dims = dims_by_page.get(page_num, {})

        pages.append(
            Page(
                page_number=page_num,
                dimensions=Dimensions(
                    width=dims.get('width', 0),
                    height=dims.get('height', 0),
                ),
                elements=elements,
                # Reading order is derived from element positions.
                metadata={'reading_order': self._calculate_reading_order(elements)},
            )
        )

    return pages
def _convert_pp3_element( def _convert_pp3_element(
self, self,
elem_data: Dict[str, Any], elem_data: Dict[str, Any],