fix: improve OCR track multi-line text rendering and HTML table detection
Multi-line text rendering (pdf_generator_service.py):
- Calculate font size by dividing bbox height by the number of lines
- Start Y coordinate from bbox TOP instead of bottom
- Use non_empty_lines for proper line positioning

HTML table detection:
- pp_structure_enhanced.py: Detect HTML tables in 'text' type content and reclassify to TABLE when a `<table` tag is found
- pdf_generator_service.py: Content-based reclassification from TEXT to TABLE during UnifiedDocument parsing
- ocr_to_unified_converter.py: Fall back to checking the 'content' field for HTML tables when the 'html' field is empty

Known issue: OCR processing still has quality issues that need further investigation and fixes.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -1418,17 +1418,27 @@ class PDFGeneratorService:
|
||||
bbox_height = abs(scaled_y_bottom - scaled_y_top)
|
||||
|
||||
# Calculate font size using heuristics
|
||||
# Font size is typically 70-90% of bbox height
|
||||
# Testing shows 0.75 works well for most cases
|
||||
font_size = bbox_height * 0.75
|
||||
# For multi-line text, divide bbox height by number of lines
|
||||
lines = text.split('\n')
|
||||
non_empty_lines = [l for l in lines if l.strip()]
|
||||
num_lines = max(len(non_empty_lines), 1)
|
||||
|
||||
# Font size = bbox_height / num_lines * factor
|
||||
# Use 0.8 factor to leave room for line spacing
|
||||
font_size = (bbox_height / num_lines) * 0.8
|
||||
font_size = max(min(font_size, 72), 4) # Clamp between 4pt and 72pt
|
||||
|
||||
logger.debug(f"Text has {num_lines} non-empty lines, bbox_height={bbox_height:.1f}, calculated font_size={font_size:.1f}")
|
||||
|
||||
# Transform coordinates: OCR (top-left origin) → PDF (bottom-left origin)
|
||||
# CRITICAL: Y-axis flip!
|
||||
# For multi-line text, start from TOP of bbox and go downward
|
||||
pdf_x = scaled_x_left
|
||||
pdf_y = page_height - scaled_y_bottom # Flip Y-axis using bottom coordinate
|
||||
pdf_y_top = page_height - scaled_y_top # Top of bbox in PDF coordinates
|
||||
# Adjust for font baseline: first line starts below the top edge
|
||||
pdf_y = pdf_y_top - font_size # Start first line one font size below top
|
||||
|
||||
logger.info(f"[文字] '{text[:30]}' → PDF位置: ({pdf_x:.1f}, {pdf_y:.1f}), 字體:{font_size:.1f}pt, 寬x高:{bbox_width:.0f}x{bbox_height:.0f}")
|
||||
logger.info(f"[文字] '{text[:30]}' → PDF位置: ({pdf_x:.1f}, {pdf_y:.1f}), 字體:{font_size:.1f}pt, 寬x高:{bbox_width:.0f}x{bbox_height:.0f}, 行數:{num_lines}")
|
||||
|
||||
# Set font with track-specific styling
|
||||
# Note: OCR track has no StyleInfo (extracted from images), so no advanced formatting
|
||||
@@ -1450,14 +1460,11 @@ class PDFGeneratorService:
|
||||
|
||||
# Handle line breaks (split text by newlines)
|
||||
# OCR track: simple left-aligned rendering
|
||||
lines = text.split('\n')
|
||||
# Note: non_empty_lines was already calculated above for font sizing
|
||||
line_height = font_size * 1.2 # 120% of font size for line spacing
|
||||
|
||||
# Draw each line (left-aligned for OCR track)
|
||||
for i, line in enumerate(lines):
|
||||
if not line.strip():
|
||||
continue # Skip empty lines
|
||||
|
||||
# Draw each non-empty line (using proper line index for positioning)
|
||||
for i, line in enumerate(non_empty_lines):
|
||||
line_y = pdf_y - (i * line_height)
|
||||
|
||||
# Calculate text width to prevent overflow
|
||||
@@ -1902,6 +1909,12 @@ class PDFGeneratorService:
|
||||
elem_type = ElementType.TEXT
|
||||
logger.warning(f"Unknown element type '{type_str}', falling back to TEXT")
|
||||
|
||||
# Content-based HTML table detection: reclassify text elements with HTML table content
|
||||
content = elem_dict.get('content', '')
|
||||
if elem_type == ElementType.TEXT and isinstance(content, str) and '<table' in content.lower():
|
||||
logger.info(f"Reclassifying element from TEXT to TABLE due to HTML table content")
|
||||
elem_type = ElementType.TABLE
|
||||
|
||||
# Parse bounding box
|
||||
bbox_dict = elem_dict.get('bbox', {})
|
||||
bbox = BoundingBox(
|
||||
|
||||
Reference in New Issue
Block a user