fix: improve OCR track multi-line text rendering and HTML table detection

Multi-line text rendering (pdf_generator_service.py):
- Calculate font size by dividing bbox height by the number of non-empty
  lines (scaled by 0.8 for line spacing, clamped to 4-72pt)
- Start the first line one font size below the bbox TOP instead of at the
  bbox bottom
- Iterate over non_empty_lines when drawing so that skipped blank lines
  no longer affect line positioning

HTML table detection:
- pp_structure_enhanced.py: Detect HTML tables in 'text' type content
  and reclassify to TABLE when a <table tag is found
- pdf_generator_service.py: Content-based reclassification from TEXT
  to TABLE during UnifiedDocument parsing
- ocr_to_unified_converter.py: Fall back to checking the 'content' field
  for HTML tables when the 'html' field is empty

Known issue: OCR processing still has quality problems that need further
investigation and fixes.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
egg
2025-11-26 16:09:31 +08:00
parent 19bd5fd609
commit fa9b542b06
3 changed files with 50 additions and 16 deletions

View File

@@ -1418,17 +1418,27 @@ class PDFGeneratorService:
bbox_height = abs(scaled_y_bottom - scaled_y_top)
# Calculate font size using heuristics
# Font size is typically 70-90% of bbox height
# Testing shows 0.75 works well for most cases
font_size = bbox_height * 0.75
# For multi-line text, divide bbox height by number of lines
lines = text.split('\n')
non_empty_lines = [l for l in lines if l.strip()]
num_lines = max(len(non_empty_lines), 1)
# Font size = bbox_height / num_lines * factor
# Use 0.8 factor to leave room for line spacing
font_size = (bbox_height / num_lines) * 0.8
font_size = max(min(font_size, 72), 4) # Clamp between 4pt and 72pt
logger.debug(f"Text has {num_lines} non-empty lines, bbox_height={bbox_height:.1f}, calculated font_size={font_size:.1f}")
# Transform coordinates: OCR (top-left origin) → PDF (bottom-left origin)
# CRITICAL: Y-axis flip!
# For multi-line text, start from TOP of bbox and go downward
pdf_x = scaled_x_left
pdf_y = page_height - scaled_y_bottom # Flip Y-axis using bottom coordinate
pdf_y_top = page_height - scaled_y_top # Top of bbox in PDF coordinates
# Adjust for font baseline: first line starts below the top edge
pdf_y = pdf_y_top - font_size # Start first line one font size below top
logger.info(f"[文字] '{text[:30]}' → PDF位置: ({pdf_x:.1f}, {pdf_y:.1f}), 字體:{font_size:.1f}pt, 寬x高:{bbox_width:.0f}x{bbox_height:.0f}")
logger.info(f"[文字] '{text[:30]}' → PDF位置: ({pdf_x:.1f}, {pdf_y:.1f}), 字體:{font_size:.1f}pt, 寬x高:{bbox_width:.0f}x{bbox_height:.0f}, 行數:{num_lines}")
# Set font with track-specific styling
# Note: OCR track has no StyleInfo (extracted from images), so no advanced formatting
@@ -1450,14 +1460,11 @@ class PDFGeneratorService:
# Handle line breaks (split text by newlines)
# OCR track: simple left-aligned rendering
lines = text.split('\n')
# Note: non_empty_lines was already calculated above for font sizing
line_height = font_size * 1.2 # 120% of font size for line spacing
# Draw each line (left-aligned for OCR track)
for i, line in enumerate(lines):
if not line.strip():
continue # Skip empty lines
# Draw each non-empty line (using proper line index for positioning)
for i, line in enumerate(non_empty_lines):
line_y = pdf_y - (i * line_height)
# Calculate text width to prevent overflow
@@ -1902,6 +1909,12 @@ class PDFGeneratorService:
elem_type = ElementType.TEXT
logger.warning(f"Unknown element type '{type_str}', falling back to TEXT")
# Content-based HTML table detection: reclassify text elements with HTML table content
content = elem_dict.get('content', '')
if elem_type == ElementType.TEXT and isinstance(content, str) and '<table' in content.lower():
logger.info(f"Reclassifying element from TEXT to TABLE due to HTML table content")
elem_type = ElementType.TABLE
# Parse bounding box
bbox_dict = elem_dict.get('bbox', {})
bbox = BoundingBox(