feat: add OCR-to-UnifiedDocument converter for PP-StructureV3 integration

Implements the converter that transforms PP-StructureV3 OCR results into
the UnifiedDocument format, enabling consistent output for both OCR and
direct extraction tracks.

- Create OCRToUnifiedConverter class with full element type mapping
- Handle both enhanced (parsing_res_list) and standard markdown results
- Support 4-point and simple bbox formats for coordinates
- Establish element relationships (captions, lists, headers)
- Integrate converter into OCR service dual-track processing
- Update tasks.md, marking section 3.3 as complete

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
egg
2025-11-19 08:05:20 +08:00
parent 062cb1f423
commit a3a6fbe58b
4 changed files with 1172 additions and 29 deletions

View File

@@ -22,10 +22,11 @@ from app.services.office_converter import OfficeConverter, OfficeConverterError
try:
from app.services.document_type_detector import DocumentTypeDetector, ProcessingTrackRecommendation
from app.services.direct_extraction_engine import DirectExtractionEngine
from app.services.ocr_to_unified_converter import OCRToUnifiedConverter
from app.models.unified_document import (
UnifiedDocument, UnifiedDocumentConverter, DocumentMetadata,
UnifiedDocument, DocumentMetadata,
ProcessingTrack, ElementType, DocumentElement, Page, Dimensions,
BoundingBox
BoundingBox, ProcessingInfo
)
DUAL_TRACK_AVAILABLE = True
except ImportError as e:
@@ -66,11 +67,13 @@ class OCRService:
enable_table_detection=True,
enable_image_extraction=True
)
self.ocr_to_unified_converter = OCRToUnifiedConverter()
self.dual_track_enabled = True
logger.info("Dual-track processing enabled")
else:
self.document_detector = None
self.direct_extraction_engine = None
self.ocr_to_unified_converter = None
self.dual_track_enabled = False
logger.info("Dual-track processing not available, using OCR-only mode")
@@ -541,6 +544,17 @@ class OCRService:
}
}
# If layout data is enhanced, add enhanced results for converter
if layout_data and layout_data.get('enhanced'):
result['enhanced_results'] = [{
'elements': layout_data.get('elements', []),
'reading_order': layout_data.get('reading_order', []),
'element_types': layout_data.get('element_types', {}),
'page': current_page,
'width': ocr_width,
'height': ocr_height
}]
logger.info(
f"OCR completed: {image_path.name} - "
f"{len(text_regions)} regions, "
@@ -621,7 +635,7 @@ class OCRService:
def analyze_layout(self, image_path: Path, output_dir: Optional[Path] = None, current_page: int = 0) -> Tuple[Optional[Dict], List[Dict]]:
"""
Analyze document layout using PP-StructureV3
Analyze document layout using PP-StructureV3 with enhanced element extraction
Args:
image_path: Path to image file
@@ -634,8 +648,49 @@ class OCRService:
try:
structure_engine = self.get_structure_engine()
# Perform structure analysis using predict() method (PaddleOCR 3.x API)
logger.info(f"Running layout analysis on {image_path.name}")
# Try enhanced processing first
try:
from app.services.pp_structure_enhanced import PPStructureEnhanced
enhanced_processor = PPStructureEnhanced(structure_engine)
result = enhanced_processor.analyze_with_full_structure(
image_path, output_dir, current_page
)
if result.get('has_parsing_res_list'):
logger.info(f"Enhanced PP-StructureV3 analysis successful with {result['total_elements']} elements")
logger.info(f"Element types found: {result.get('element_types', {})}")
# Convert to legacy format for compatibility
layout_data = {
'elements': result['elements'],
'total_elements': result['total_elements'],
'reading_order': result['reading_order'],
'element_types': result.get('element_types', {}),
'enhanced': True
}
# Extract images metadata
images_metadata = []
for elem in result.get('images', []):
images_metadata.append({
'element_id': elem['element_id'],
'type': 'image',
'page': elem['page'],
'bbox': elem['bbox']
})
return layout_data, images_metadata
else:
logger.info("parsing_res_list not available, using standard processing")
except ImportError:
logger.debug("Enhanced PP-StructureV3 module not available, using standard processing")
except Exception as e:
logger.warning(f"Enhanced processing failed, falling back to standard: {e}")
# Standard processing (original implementation)
logger.info(f"Running standard layout analysis on {image_path.name}")
results = structure_engine.predict(str(image_path))
layout_elements = []
@@ -858,20 +913,12 @@ class OCRService:
file_path, lang, detect_layout, confidence_threshold, output_dir
)
# Convert OCR result to UnifiedDocument
metadata = DocumentMetadata(
filename=file_path.name,
file_type=file_path.suffix,
file_size=file_path.stat().st_size,
created_at=start_time,
processing_track=ProcessingTrack.OCR,
processing_time=(datetime.now() - start_time).total_seconds(),
language=lang
)
unified_doc = UnifiedDocumentConverter.from_ocr_result(
ocr_result, document_id, metadata
# Convert OCR result to UnifiedDocument using the converter
processing_time_so_far = (datetime.now() - start_time).total_seconds()
unified_doc = self.ocr_to_unified_converter.convert(
ocr_result, file_path, processing_time_so_far, lang
)
unified_doc.document_id = document_id
# Update processing track metadata
unified_doc.metadata.processing_track = (
@@ -951,11 +998,13 @@ class OCRService:
'processing_time': 0.0,
'pages': [],
'layout_data': {'elements': []},
'images_metadata': []
'images_metadata': [],
'enhanced_results': [] # For PP-StructureV3 enhanced results
}
total_confidence = 0.0
total_regions = 0
has_enhanced = False
for page_num, result in enumerate(results):
if result['status'] == 'success':
@@ -971,7 +1020,21 @@ class OCRService:
# Collect layout data
if result.get('layout_data'):
for elem in result['layout_data'].get('elements', []):
layout = result['layout_data']
# Check if this is enhanced layout data
if layout.get('enhanced'):
has_enhanced = True
# Store enhanced results separately for converter
combined['enhanced_results'].append({
'elements': layout.get('elements', []),
'reading_order': layout.get('reading_order', []),
'element_types': layout.get('element_types', {}),
'page': page_num,
'width': result.get('ocr_dimensions', {}).get('width', 0),
'height': result.get('ocr_dimensions', {}).get('height', 0)
})
# Always collect elements for backward compatibility
for elem in layout.get('elements', []):
elem['page'] = page_num
combined['layout_data']['elements'].append(elem)