feat: add OCR to UnifiedDocument converter for PP-StructureV3 integration
Implements the converter that transforms PP-StructureV3 OCR results into the UnifiedDocument format, enabling consistent output for both OCR and direct extraction tracks.

- Create OCRToUnifiedConverter class with full element type mapping
- Handle both enhanced (parsing_res_list) and standard markdown results
- Support 4-point and simple bbox formats for coordinates
- Establish element relationships (captions, lists, headers)
- Integrate converter into OCR service dual-track processing
- Update tasks.md marking section 3.3 complete

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -22,10 +22,11 @@ from app.services.office_converter import OfficeConverter, OfficeConverterError
|
||||
try:
|
||||
from app.services.document_type_detector import DocumentTypeDetector, ProcessingTrackRecommendation
|
||||
from app.services.direct_extraction_engine import DirectExtractionEngine
|
||||
from app.services.ocr_to_unified_converter import OCRToUnifiedConverter
|
||||
from app.models.unified_document import (
|
||||
UnifiedDocument, UnifiedDocumentConverter, DocumentMetadata,
|
||||
UnifiedDocument, DocumentMetadata,
|
||||
ProcessingTrack, ElementType, DocumentElement, Page, Dimensions,
|
||||
BoundingBox
|
||||
BoundingBox, ProcessingInfo
|
||||
)
|
||||
DUAL_TRACK_AVAILABLE = True
|
||||
except ImportError as e:
|
||||
@@ -66,11 +67,13 @@ class OCRService:
|
||||
enable_table_detection=True,
|
||||
enable_image_extraction=True
|
||||
)
|
||||
self.ocr_to_unified_converter = OCRToUnifiedConverter()
|
||||
self.dual_track_enabled = True
|
||||
logger.info("Dual-track processing enabled")
|
||||
else:
|
||||
self.document_detector = None
|
||||
self.direct_extraction_engine = None
|
||||
self.ocr_to_unified_converter = None
|
||||
self.dual_track_enabled = False
|
||||
logger.info("Dual-track processing not available, using OCR-only mode")
|
||||
|
||||
@@ -541,6 +544,17 @@ class OCRService:
|
||||
}
|
||||
}
|
||||
|
||||
# If layout data is enhanced, add enhanced results for converter
|
||||
if layout_data and layout_data.get('enhanced'):
|
||||
result['enhanced_results'] = [{
|
||||
'elements': layout_data.get('elements', []),
|
||||
'reading_order': layout_data.get('reading_order', []),
|
||||
'element_types': layout_data.get('element_types', {}),
|
||||
'page': current_page,
|
||||
'width': ocr_width,
|
||||
'height': ocr_height
|
||||
}]
|
||||
|
||||
logger.info(
|
||||
f"OCR completed: {image_path.name} - "
|
||||
f"{len(text_regions)} regions, "
|
||||
@@ -621,7 +635,7 @@ class OCRService:
|
||||
|
||||
def analyze_layout(self, image_path: Path, output_dir: Optional[Path] = None, current_page: int = 0) -> Tuple[Optional[Dict], List[Dict]]:
|
||||
"""
|
||||
Analyze document layout using PP-StructureV3
|
||||
Analyze document layout using PP-StructureV3 with enhanced element extraction
|
||||
|
||||
Args:
|
||||
image_path: Path to image file
|
||||
@@ -634,8 +648,49 @@ class OCRService:
|
||||
try:
|
||||
structure_engine = self.get_structure_engine()
|
||||
|
||||
# Perform structure analysis using predict() method (PaddleOCR 3.x API)
|
||||
logger.info(f"Running layout analysis on {image_path.name}")
|
||||
# Try enhanced processing first
|
||||
try:
|
||||
from app.services.pp_structure_enhanced import PPStructureEnhanced
|
||||
|
||||
enhanced_processor = PPStructureEnhanced(structure_engine)
|
||||
result = enhanced_processor.analyze_with_full_structure(
|
||||
image_path, output_dir, current_page
|
||||
)
|
||||
|
||||
if result.get('has_parsing_res_list'):
|
||||
logger.info(f"Enhanced PP-StructureV3 analysis successful with {result['total_elements']} elements")
|
||||
logger.info(f"Element types found: {result.get('element_types', {})}")
|
||||
|
||||
# Convert to legacy format for compatibility
|
||||
layout_data = {
|
||||
'elements': result['elements'],
|
||||
'total_elements': result['total_elements'],
|
||||
'reading_order': result['reading_order'],
|
||||
'element_types': result.get('element_types', {}),
|
||||
'enhanced': True
|
||||
}
|
||||
|
||||
# Extract images metadata
|
||||
images_metadata = []
|
||||
for elem in result.get('images', []):
|
||||
images_metadata.append({
|
||||
'element_id': elem['element_id'],
|
||||
'type': 'image',
|
||||
'page': elem['page'],
|
||||
'bbox': elem['bbox']
|
||||
})
|
||||
|
||||
return layout_data, images_metadata
|
||||
else:
|
||||
logger.info("parsing_res_list not available, using standard processing")
|
||||
|
||||
except ImportError:
|
||||
logger.debug("Enhanced PP-StructureV3 module not available, using standard processing")
|
||||
except Exception as e:
|
||||
logger.warning(f"Enhanced processing failed, falling back to standard: {e}")
|
||||
|
||||
# Standard processing (original implementation)
|
||||
logger.info(f"Running standard layout analysis on {image_path.name}")
|
||||
results = structure_engine.predict(str(image_path))
|
||||
|
||||
layout_elements = []
|
||||
@@ -858,20 +913,12 @@ class OCRService:
|
||||
file_path, lang, detect_layout, confidence_threshold, output_dir
|
||||
)
|
||||
|
||||
# Convert OCR result to UnifiedDocument
|
||||
metadata = DocumentMetadata(
|
||||
filename=file_path.name,
|
||||
file_type=file_path.suffix,
|
||||
file_size=file_path.stat().st_size,
|
||||
created_at=start_time,
|
||||
processing_track=ProcessingTrack.OCR,
|
||||
processing_time=(datetime.now() - start_time).total_seconds(),
|
||||
language=lang
|
||||
)
|
||||
|
||||
unified_doc = UnifiedDocumentConverter.from_ocr_result(
|
||||
ocr_result, document_id, metadata
|
||||
# Convert OCR result to UnifiedDocument using the converter
|
||||
processing_time_so_far = (datetime.now() - start_time).total_seconds()
|
||||
unified_doc = self.ocr_to_unified_converter.convert(
|
||||
ocr_result, file_path, processing_time_so_far, lang
|
||||
)
|
||||
unified_doc.document_id = document_id
|
||||
|
||||
# Update processing track metadata
|
||||
unified_doc.metadata.processing_track = (
|
||||
@@ -951,11 +998,13 @@ class OCRService:
|
||||
'processing_time': 0.0,
|
||||
'pages': [],
|
||||
'layout_data': {'elements': []},
|
||||
'images_metadata': []
|
||||
'images_metadata': [],
|
||||
'enhanced_results': [] # For PP-StructureV3 enhanced results
|
||||
}
|
||||
|
||||
total_confidence = 0.0
|
||||
total_regions = 0
|
||||
has_enhanced = False
|
||||
|
||||
for page_num, result in enumerate(results):
|
||||
if result['status'] == 'success':
|
||||
@@ -971,7 +1020,21 @@ class OCRService:
|
||||
|
||||
# Collect layout data
|
||||
if result.get('layout_data'):
|
||||
for elem in result['layout_data'].get('elements', []):
|
||||
layout = result['layout_data']
|
||||
# Check if this is enhanced layout data
|
||||
if layout.get('enhanced'):
|
||||
has_enhanced = True
|
||||
# Store enhanced results separately for converter
|
||||
combined['enhanced_results'].append({
|
||||
'elements': layout.get('elements', []),
|
||||
'reading_order': layout.get('reading_order', []),
|
||||
'element_types': layout.get('element_types', {}),
|
||||
'page': page_num,
|
||||
'width': result.get('ocr_dimensions', {}).get('width', 0),
|
||||
'height': result.get('ocr_dimensions', {}).get('height', 0)
|
||||
})
|
||||
# Always collect elements for backward compatibility
|
||||
for elem in layout.get('elements', []):
|
||||
elem['page'] = page_num
|
||||
combined['layout_data']['elements'].append(elem)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user