fix: improve OCR track multi-line text rendering and HTML table detection
Multi-line text rendering (pdf_generator_service.py):
- Calculate font size by dividing bbox height by number of lines
- Start Y coordinate from bbox TOP instead of bottom
- Use non_empty_lines for proper line positioning

HTML table detection:
- pp_structure_enhanced.py: Detect HTML tables in 'text' type content and reclassify to TABLE when a <table tag is found
- pdf_generator_service.py: Content-based reclassification from TEXT to TABLE during UnifiedDocument parsing
- ocr_to_unified_converter.py: Fall back to checking the 'content' field for HTML tables when the 'html' field is empty

Known issue: OCR processing still has quality issues that need further investigation and fixes.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -543,6 +543,13 @@ class OCRToUnifiedConverter:
|
||||
html = elem_data.get('html', '')
|
||||
extracted_text = elem_data.get('extracted_text', '')
|
||||
|
||||
# Fallback: check content field for HTML table if html field is empty
|
||||
if not html:
|
||||
content = elem_data.get('content', '')
|
||||
if isinstance(content, str) and '<table' in content.lower():
|
||||
html = content
|
||||
logger.debug("Using content field as HTML table source")
|
||||
|
||||
# Try to parse HTML to get rows and columns
|
||||
rows = 0
|
||||
cols = 0
|
||||
@@ -558,6 +565,10 @@ class OCRToUnifiedConverter:
|
||||
first_row = html[:first_row_end]
|
||||
cols = first_row.count('<td') + first_row.count('<th')
|
||||
|
||||
# Return None if no valid table data found
|
||||
if rows == 0 and cols == 0 and not extracted_text:
|
||||
return None
|
||||
|
||||
# Note: TableData uses 'cols' not 'columns'
|
||||
# HTML content can be stored as caption or in element metadata
|
||||
return TableData(
|
||||
|
||||
@@ -1418,17 +1418,27 @@ class PDFGeneratorService:
|
||||
bbox_height = abs(scaled_y_bottom - scaled_y_top)
|
||||
|
||||
# Calculate font size using heuristics
|
||||
# Font size is typically 70-90% of bbox height
|
||||
# Testing shows 0.75 works well for most cases
|
||||
font_size = bbox_height * 0.75
|
||||
# For multi-line text, divide bbox height by number of lines
|
||||
lines = text.split('\n')
|
||||
non_empty_lines = [l for l in lines if l.strip()]
|
||||
num_lines = max(len(non_empty_lines), 1)
|
||||
|
||||
# Font size = bbox_height / num_lines * factor
|
||||
# Use 0.8 factor to leave room for line spacing
|
||||
font_size = (bbox_height / num_lines) * 0.8
|
||||
font_size = max(min(font_size, 72), 4) # Clamp between 4pt and 72pt
|
||||
|
||||
logger.debug(f"Text has {num_lines} non-empty lines, bbox_height={bbox_height:.1f}, calculated font_size={font_size:.1f}")
|
||||
|
||||
# Transform coordinates: OCR (top-left origin) → PDF (bottom-left origin)
|
||||
# CRITICAL: Y-axis flip!
|
||||
# For multi-line text, start from TOP of bbox and go downward
|
||||
pdf_x = scaled_x_left
|
||||
pdf_y = page_height - scaled_y_bottom # Flip Y-axis using bottom coordinate
|
||||
pdf_y_top = page_height - scaled_y_top # Top of bbox in PDF coordinates
|
||||
# Adjust for font baseline: first line starts below the top edge
|
||||
pdf_y = pdf_y_top - font_size # Start first line one font size below top
|
||||
|
||||
logger.info(f"[文字] '{text[:30]}' → PDF位置: ({pdf_x:.1f}, {pdf_y:.1f}), 字體:{font_size:.1f}pt, 寬x高:{bbox_width:.0f}x{bbox_height:.0f}")
|
||||
logger.info(f"[文字] '{text[:30]}' → PDF位置: ({pdf_x:.1f}, {pdf_y:.1f}), 字體:{font_size:.1f}pt, 寬x高:{bbox_width:.0f}x{bbox_height:.0f}, 行數:{num_lines}")
|
||||
|
||||
# Set font with track-specific styling
|
||||
# Note: OCR track has no StyleInfo (extracted from images), so no advanced formatting
|
||||
@@ -1450,14 +1460,11 @@ class PDFGeneratorService:
|
||||
|
||||
# Handle line breaks (split text by newlines)
|
||||
# OCR track: simple left-aligned rendering
|
||||
lines = text.split('\n')
|
||||
# Note: non_empty_lines was already calculated above for font sizing
|
||||
line_height = font_size * 1.2 # 120% of font size for line spacing
|
||||
|
||||
# Draw each line (left-aligned for OCR track)
|
||||
for i, line in enumerate(lines):
|
||||
if not line.strip():
|
||||
continue # Skip empty lines
|
||||
|
||||
# Draw each non-empty line (using proper line index for positioning)
|
||||
for i, line in enumerate(non_empty_lines):
|
||||
line_y = pdf_y - (i * line_height)
|
||||
|
||||
# Calculate text width to prevent overflow
|
||||
@@ -1902,6 +1909,12 @@ class PDFGeneratorService:
|
||||
elem_type = ElementType.TEXT
|
||||
logger.warning(f"Unknown element type '{type_str}', falling back to TEXT")
|
||||
|
||||
# Content-based HTML table detection: reclassify text elements with HTML table content
|
||||
content = elem_dict.get('content', '')
|
||||
if elem_type == ElementType.TEXT and isinstance(content, str) and '<table' in content.lower():
|
||||
logger.info(f"Reclassifying element from TEXT to TABLE due to HTML table content")
|
||||
elem_type = ElementType.TABLE
|
||||
|
||||
# Parse bounding box
|
||||
bbox_dict = elem_dict.get('bbox', {})
|
||||
bbox = BoundingBox(
|
||||
|
||||
@@ -286,6 +286,15 @@ class PPStructureEnhanced:
|
||||
elif isinstance(res, str):
|
||||
content = res
|
||||
|
||||
# Content-based HTML table detection: PP-StructureV3 sometimes
|
||||
# classifies tables as 'text' but returns HTML table content
|
||||
html_table_content = None
|
||||
if content and '<table' in content.lower():
|
||||
if mapped_type == ElementType.TEXT or element_type == 'text':
|
||||
logger.info(f"Element {idx}: Detected HTML table content in 'text' type, reclassifying to TABLE")
|
||||
mapped_type = ElementType.TABLE
|
||||
html_table_content = content # Store for later use
|
||||
|
||||
# Create element
|
||||
element = {
|
||||
'element_id': f"pp3_{current_page}_{idx}",
|
||||
@@ -300,8 +309,9 @@ class PPStructureEnhanced:
|
||||
|
||||
# Special handling for tables
|
||||
if mapped_type == ElementType.TABLE:
|
||||
# Extract table structure if available
|
||||
if 'res' in item and isinstance(item['res'], dict):
|
||||
# Use HTML content from content-based detection or extract from 'res'
|
||||
html_content = html_table_content # From content-based detection
|
||||
if not html_content and 'res' in item and isinstance(item['res'], dict):
|
||||
html_content = item['res'].get('html', '')
|
||||
if html_content:
|
||||
element['html'] = html_content
|
||||
|
||||
Reference in New Issue
Block a user