fix: improve OCR track multi-line text rendering and HTML table detection
Multi-line text rendering (pdf_generator_service.py):
- Calculate font size by dividing bbox height by the number of lines
- Start the Y coordinate from the bbox TOP instead of the bottom
- Use non_empty_lines for proper line positioning

HTML table detection:
- pp_structure_enhanced.py: Detect HTML tables in 'text' type content and reclassify to TABLE when a `<table` tag is found
- pdf_generator_service.py: Content-based reclassification from TEXT to TABLE during UnifiedDocument parsing
- ocr_to_unified_converter.py: Fall back to checking the 'content' field for HTML tables when the 'html' field is empty

Known issue: OCR processing still has quality issues that need further investigation and fixes.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -286,6 +286,15 @@ class PPStructureEnhanced:
|
||||
elif isinstance(res, str):
|
||||
content = res
|
||||
|
||||
# Content-based HTML table detection: PP-StructureV3 sometimes
|
||||
# classifies tables as 'text' but returns HTML table content
|
||||
html_table_content = None
|
||||
if content and '<table' in content.lower():
|
||||
if mapped_type == ElementType.TEXT or element_type == 'text':
|
||||
logger.info(f"Element {idx}: Detected HTML table content in 'text' type, reclassifying to TABLE")
|
||||
mapped_type = ElementType.TABLE
|
||||
html_table_content = content # Store for later use
|
||||
|
||||
# Create element
|
||||
element = {
|
||||
'element_id': f"pp3_{current_page}_{idx}",
|
||||
@@ -300,12 +309,13 @@ class PPStructureEnhanced:
|
||||
|
||||
# Special handling for tables
|
||||
if mapped_type == ElementType.TABLE:
|
||||
# Extract table structure if available
|
||||
if 'res' in item and isinstance(item['res'], dict):
|
||||
# Use HTML content from content-based detection or extract from 'res'
|
||||
html_content = html_table_content # From content-based detection
|
||||
if not html_content and 'res' in item and isinstance(item['res'], dict):
|
||||
html_content = item['res'].get('html', '')
|
||||
if html_content:
|
||||
element['html'] = html_content
|
||||
element['extracted_text'] = self._extract_text_from_html(html_content)
|
||||
if html_content:
|
||||
element['html'] = html_content
|
||||
element['extracted_text'] = self._extract_text_from_html(html_content)
|
||||
|
||||
# Special handling for images/figures
|
||||
elif mapped_type in [ElementType.IMAGE, ElementType.FIGURE]:
|
||||
|
||||
Reference in New Issue
Block a user