fix: improve OCR track multi-line text rendering and HTML table detection

Multi-line text rendering (pdf_generator_service.py):
- Calculate font size by dividing bbox height by number of lines
- Start Y coordinate from bbox TOP instead of bottom
- Use non_empty_lines for proper line positioning

HTML table detection:
- pp_structure_enhanced.py: Detect HTML tables in 'text'-type content
  and reclassify as TABLE when a '<table' tag is found
- pdf_generator_service.py: Content-based reclassification from TEXT
  to TABLE during UnifiedDocument parsing
- ocr_to_unified_converter.py: Fall back to checking the 'content' field
  for HTML tables when the 'html' field is empty

Known issue: OCR processing still has quality issues that need further
investigation and fixes.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
egg
2025-11-26 16:09:31 +08:00
parent 19bd5fd609
commit fa9b542b06
3 changed files with 50 additions and 16 deletions

View File

@@ -286,6 +286,15 @@ class PPStructureEnhanced:
elif isinstance(res, str):
content = res
# Content-based HTML table detection: PP-StructureV3 sometimes
# classifies tables as 'text' but returns HTML table content
html_table_content = None
if content and '<table' in content.lower():
if mapped_type == ElementType.TEXT or element_type == 'text':
logger.info(f"Element {idx}: Detected HTML table content in 'text' type, reclassifying to TABLE")
mapped_type = ElementType.TABLE
html_table_content = content # Store for later use
# Create element
element = {
'element_id': f"pp3_{current_page}_{idx}",
@@ -300,12 +309,13 @@ class PPStructureEnhanced:
# Special handling for tables
if mapped_type == ElementType.TABLE:
# Extract table structure if available
if 'res' in item and isinstance(item['res'], dict):
# Use HTML content from content-based detection or extract from 'res'
html_content = html_table_content # From content-based detection
if not html_content and 'res' in item and isinstance(item['res'], dict):
html_content = item['res'].get('html', '')
if html_content:
element['html'] = html_content
element['extracted_text'] = self._extract_text_from_html(html_content)
if html_content:
element['html'] = html_content
element['extracted_text'] = self._extract_text_from_html(html_content)
# Special handling for images/figures
elif mapped_type in [ElementType.IMAGE, ElementType.FIGURE]: