fix: improve OCR track multi-line text rendering and HTML table detection
Multi-line text rendering (pdf_generator_service.py):
- Calculate font size by dividing bbox height by the number of lines
- Start Y coordinate from bbox TOP instead of bottom
- Use non_empty_lines for proper line positioning

HTML table detection:
- pp_structure_enhanced.py: Detect HTML tables in 'text' type content and reclassify to TABLE when a `<table` tag is found
- pdf_generator_service.py: Content-based reclassification from TEXT to TABLE during UnifiedDocument parsing
- ocr_to_unified_converter.py: Fall back to checking the 'content' field for HTML tables when the 'html' field is empty

Known issue: OCR processing still has quality issues that need further investigation and fixes.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -543,6 +543,13 @@ class OCRToUnifiedConverter:
|
|||||||
html = elem_data.get('html', '')
|
html = elem_data.get('html', '')
|
||||||
extracted_text = elem_data.get('extracted_text', '')
|
extracted_text = elem_data.get('extracted_text', '')
|
||||||
|
|
||||||
|
# Fallback: check content field for HTML table if html field is empty
|
||||||
|
if not html:
|
||||||
|
content = elem_data.get('content', '')
|
||||||
|
if isinstance(content, str) and '<table' in content.lower():
|
||||||
|
html = content
|
||||||
|
logger.debug("Using content field as HTML table source")
|
||||||
|
|
||||||
# Try to parse HTML to get rows and columns
|
# Try to parse HTML to get rows and columns
|
||||||
rows = 0
|
rows = 0
|
||||||
cols = 0
|
cols = 0
|
||||||
@@ -558,6 +565,10 @@ class OCRToUnifiedConverter:
|
|||||||
first_row = html[:first_row_end]
|
first_row = html[:first_row_end]
|
||||||
cols = first_row.count('<td') + first_row.count('<th')
|
cols = first_row.count('<td') + first_row.count('<th')
|
||||||
|
|
||||||
|
# Return None if no valid table data found
|
||||||
|
if rows == 0 and cols == 0 and not extracted_text:
|
||||||
|
return None
|
||||||
|
|
||||||
# Note: TableData uses 'cols' not 'columns'
|
# Note: TableData uses 'cols' not 'columns'
|
||||||
# HTML content can be stored as caption or in element metadata
|
# HTML content can be stored as caption or in element metadata
|
||||||
return TableData(
|
return TableData(
|
||||||
|
|||||||
@@ -1418,17 +1418,27 @@ class PDFGeneratorService:
|
|||||||
bbox_height = abs(scaled_y_bottom - scaled_y_top)
|
bbox_height = abs(scaled_y_bottom - scaled_y_top)
|
||||||
|
|
||||||
# Calculate font size using heuristics
|
# Calculate font size using heuristics
|
||||||
# Font size is typically 70-90% of bbox height
|
# For multi-line text, divide bbox height by number of lines
|
||||||
# Testing shows 0.75 works well for most cases
|
lines = text.split('\n')
|
||||||
font_size = bbox_height * 0.75
|
non_empty_lines = [l for l in lines if l.strip()]
|
||||||
|
num_lines = max(len(non_empty_lines), 1)
|
||||||
|
|
||||||
|
# Font size = bbox_height / num_lines * factor
|
||||||
|
# Use 0.8 factor to leave room for line spacing
|
||||||
|
font_size = (bbox_height / num_lines) * 0.8
|
||||||
font_size = max(min(font_size, 72), 4) # Clamp between 4pt and 72pt
|
font_size = max(min(font_size, 72), 4) # Clamp between 4pt and 72pt
|
||||||
|
|
||||||
|
logger.debug(f"Text has {num_lines} non-empty lines, bbox_height={bbox_height:.1f}, calculated font_size={font_size:.1f}")
|
||||||
|
|
||||||
# Transform coordinates: OCR (top-left origin) → PDF (bottom-left origin)
|
# Transform coordinates: OCR (top-left origin) → PDF (bottom-left origin)
|
||||||
# CRITICAL: Y-axis flip!
|
# CRITICAL: Y-axis flip!
|
||||||
|
# For multi-line text, start from TOP of bbox and go downward
|
||||||
pdf_x = scaled_x_left
|
pdf_x = scaled_x_left
|
||||||
pdf_y = page_height - scaled_y_bottom # Flip Y-axis using bottom coordinate
|
pdf_y_top = page_height - scaled_y_top # Top of bbox in PDF coordinates
|
||||||
|
# Adjust for font baseline: first line starts below the top edge
|
||||||
|
pdf_y = pdf_y_top - font_size # Start first line one font size below top
|
||||||
|
|
||||||
logger.info(f"[文字] '{text[:30]}' → PDF位置: ({pdf_x:.1f}, {pdf_y:.1f}), 字體:{font_size:.1f}pt, 寬x高:{bbox_width:.0f}x{bbox_height:.0f}")
|
logger.info(f"[文字] '{text[:30]}' → PDF位置: ({pdf_x:.1f}, {pdf_y:.1f}), 字體:{font_size:.1f}pt, 寬x高:{bbox_width:.0f}x{bbox_height:.0f}, 行數:{num_lines}")
|
||||||
|
|
||||||
# Set font with track-specific styling
|
# Set font with track-specific styling
|
||||||
# Note: OCR track has no StyleInfo (extracted from images), so no advanced formatting
|
# Note: OCR track has no StyleInfo (extracted from images), so no advanced formatting
|
||||||
@@ -1450,14 +1460,11 @@ class PDFGeneratorService:
|
|||||||
|
|
||||||
# Handle line breaks (split text by newlines)
|
# Handle line breaks (split text by newlines)
|
||||||
# OCR track: simple left-aligned rendering
|
# OCR track: simple left-aligned rendering
|
||||||
lines = text.split('\n')
|
# Note: non_empty_lines was already calculated above for font sizing
|
||||||
line_height = font_size * 1.2 # 120% of font size for line spacing
|
line_height = font_size * 1.2 # 120% of font size for line spacing
|
||||||
|
|
||||||
# Draw each line (left-aligned for OCR track)
|
# Draw each non-empty line (using proper line index for positioning)
|
||||||
for i, line in enumerate(lines):
|
for i, line in enumerate(non_empty_lines):
|
||||||
if not line.strip():
|
|
||||||
continue # Skip empty lines
|
|
||||||
|
|
||||||
line_y = pdf_y - (i * line_height)
|
line_y = pdf_y - (i * line_height)
|
||||||
|
|
||||||
# Calculate text width to prevent overflow
|
# Calculate text width to prevent overflow
|
||||||
@@ -1902,6 +1909,12 @@ class PDFGeneratorService:
|
|||||||
elem_type = ElementType.TEXT
|
elem_type = ElementType.TEXT
|
||||||
logger.warning(f"Unknown element type '{type_str}', falling back to TEXT")
|
logger.warning(f"Unknown element type '{type_str}', falling back to TEXT")
|
||||||
|
|
||||||
|
# Content-based HTML table detection: reclassify text elements with HTML table content
|
||||||
|
content = elem_dict.get('content', '')
|
||||||
|
if elem_type == ElementType.TEXT and isinstance(content, str) and '<table' in content.lower():
|
||||||
|
logger.info(f"Reclassifying element from TEXT to TABLE due to HTML table content")
|
||||||
|
elem_type = ElementType.TABLE
|
||||||
|
|
||||||
# Parse bounding box
|
# Parse bounding box
|
||||||
bbox_dict = elem_dict.get('bbox', {})
|
bbox_dict = elem_dict.get('bbox', {})
|
||||||
bbox = BoundingBox(
|
bbox = BoundingBox(
|
||||||
|
|||||||
@@ -286,6 +286,15 @@ class PPStructureEnhanced:
|
|||||||
elif isinstance(res, str):
|
elif isinstance(res, str):
|
||||||
content = res
|
content = res
|
||||||
|
|
||||||
|
# Content-based HTML table detection: PP-StructureV3 sometimes
|
||||||
|
# classifies tables as 'text' but returns HTML table content
|
||||||
|
html_table_content = None
|
||||||
|
if content and '<table' in content.lower():
|
||||||
|
if mapped_type == ElementType.TEXT or element_type == 'text':
|
||||||
|
logger.info(f"Element {idx}: Detected HTML table content in 'text' type, reclassifying to TABLE")
|
||||||
|
mapped_type = ElementType.TABLE
|
||||||
|
html_table_content = content # Store for later use
|
||||||
|
|
||||||
# Create element
|
# Create element
|
||||||
element = {
|
element = {
|
||||||
'element_id': f"pp3_{current_page}_{idx}",
|
'element_id': f"pp3_{current_page}_{idx}",
|
||||||
@@ -300,12 +309,13 @@ class PPStructureEnhanced:
|
|||||||
|
|
||||||
# Special handling for tables
|
# Special handling for tables
|
||||||
if mapped_type == ElementType.TABLE:
|
if mapped_type == ElementType.TABLE:
|
||||||
# Extract table structure if available
|
# Use HTML content from content-based detection or extract from 'res'
|
||||||
if 'res' in item and isinstance(item['res'], dict):
|
html_content = html_table_content # From content-based detection
|
||||||
|
if not html_content and 'res' in item and isinstance(item['res'], dict):
|
||||||
html_content = item['res'].get('html', '')
|
html_content = item['res'].get('html', '')
|
||||||
if html_content:
|
if html_content:
|
||||||
element['html'] = html_content
|
element['html'] = html_content
|
||||||
element['extracted_text'] = self._extract_text_from_html(html_content)
|
element['extracted_text'] = self._extract_text_from_html(html_content)
|
||||||
|
|
||||||
# Special handling for images/figures
|
# Special handling for images/figures
|
||||||
elif mapped_type in [ElementType.IMAGE, ElementType.FIGURE]:
|
elif mapped_type in [ElementType.IMAGE, ElementType.FIGURE]:
|
||||||
|
|||||||
Reference in New Issue
Block a user