fix: improve OCR track multi-line text rendering and HTML table detection

Multi-line text rendering (pdf_generator_service.py):
- Calculate font size by dividing bbox height by number of lines
- Start Y coordinate from bbox TOP instead of bottom
- Use non_empty_lines for proper line positioning

HTML table detection:
- pp_structure_enhanced.py: Detect HTML tables in 'text' type content and reclassify to TABLE when a <table tag is found
- pdf_generator_service.py: Content-based reclassification from TEXT to TABLE during UnifiedDocument parsing
- ocr_to_unified_converter.py: Fall back to checking the 'content' field for HTML tables when the 'html' field is empty

Known issue: OCR processing still has quality issues that need further investigation and fixes.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-26 16:09:31 +08:00
parent 19bd5fd609
commit fa9b542b06
3 changed files with 50 additions and 16 deletions
--- a/backend/app/services/pdf_generator_service.py
+++ b/backend/app/services/pdf_generator_service.py
@@ -1418,17 +1418,27 @@ class PDFGeneratorService:
            bbox_height = abs(scaled_y_bottom - scaled_y_top)

            # Calculate font size using heuristics
-            # Font size is typically 70-90% of bbox height
-            # Testing shows 0.75 works well for most cases
-            font_size = bbox_height * 0.75
+            # For multi-line text, divide bbox height by number of lines
+            lines = text.split('\n')
+            non_empty_lines = [l for l in lines if l.strip()]
+            num_lines = max(len(non_empty_lines), 1)
+
+            # Font size = bbox_height / num_lines * factor
+            # Use 0.8 factor to leave room for line spacing
+            font_size = (bbox_height / num_lines) * 0.8
            font_size = max(min(font_size, 72), 4)  # Clamp between 4pt and 72pt

+            logger.debug(f"Text has {num_lines} non-empty lines, bbox_height={bbox_height:.1f}, calculated font_size={font_size:.1f}")
+
            # Transform coordinates: OCR (top-left origin) → PDF (bottom-left origin)
            # CRITICAL: Y-axis flip!
+            # For multi-line text, start from TOP of bbox and go downward
            pdf_x = scaled_x_left
-            pdf_y = page_height - scaled_y_bottom  # Flip Y-axis using bottom coordinate
+            pdf_y_top = page_height - scaled_y_top  # Top of bbox in PDF coordinates
+            # Adjust for font baseline: first line starts below the top edge
+            pdf_y = pdf_y_top - font_size  # Start first line one font size below top

-            logger.info(f"[文字] '{text[:30]}' → PDF位置: ({pdf_x:.1f}, {pdf_y:.1f}), 字體:{font_size:.1f}pt, 寬x高:{bbox_width:.0f}x{bbox_height:.0f}")
+            logger.info(f"[文字] '{text[:30]}' → PDF位置: ({pdf_x:.1f}, {pdf_y:.1f}), 字體:{font_size:.1f}pt, 寬x高:{bbox_width:.0f}x{bbox_height:.0f}, 行數:{num_lines}")

            # Set font with track-specific styling
            # Note: OCR track has no StyleInfo (extracted from images), so no advanced formatting
@@ -1450,14 +1460,11 @@ class PDFGeneratorService:

            # Handle line breaks (split text by newlines)
            # OCR track: simple left-aligned rendering
-            lines = text.split('\n')
+            # Note: non_empty_lines was already calculated above for font sizing
            line_height = font_size * 1.2  # 120% of font size for line spacing

-            # Draw each line (left-aligned for OCR track)
-            for i, line in enumerate(lines):
-                if not line.strip():
-                    continue  # Skip empty lines
-
+            # Draw each non-empty line (using proper line index for positioning)
+            for i, line in enumerate(non_empty_lines):
                line_y = pdf_y - (i * line_height)

                # Calculate text width to prevent overflow
@@ -1902,6 +1909,12 @@ class PDFGeneratorService:
                elem_type = ElementType.TEXT
                logger.warning(f"Unknown element type '{type_str}', falling back to TEXT")

+            # Content-based HTML table detection: reclassify text elements with HTML table content
+            content = elem_dict.get('content', '')
+            if elem_type == ElementType.TEXT and isinstance(content, str) and '<table' in content.lower():
+                logger.info(f"Reclassifying element from TEXT to TABLE due to HTML table content")
+                elem_type = ElementType.TABLE
+
            # Parse bounding box
            bbox_dict = elem_dict.get('bbox', {})
            bbox = BoundingBox(