From fa9b542b06c6ffb6fb8902ef6560c965a507f00d Mon Sep 17 00:00:00 2001
From: egg <lin4637lin4637@gmail.com>
Date: Wed, 26 Nov 2025 16:09:31 +0800
Subject: [PATCH] fix: improve OCR track multi-line text rendering and HTML
 table detection
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Multi-line text rendering (pdf_generator_service.py):
- Calculate font size as (bbox height / number of non-empty lines) * 0.8
- Start the first line one font size below the bbox TOP instead of at
  the bbox bottom
- Iterate over non_empty_lines so that skipped blank lines no longer
  leave gaps in line positioning

HTML table detection:
- pp_structure_enhanced.py: Detect HTML tables in 'text'-type content
  and reclassify to TABLE when a '<table' tag is found
- pdf_generator_service.py: Content-based reclassification from TEXT
  to TABLE during UnifiedDocument parsing
- ocr_to_unified_converter.py: Fall back to the 'content' field for
  HTML tables when the 'html' field is empty; return None when no rows,
  columns, or extracted text are found

Known issue: OCR processing still has quality issues that need further
investigation and fixes.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
---
 .../app/services/ocr_to_unified_converter.py  | 11 ++++++
 backend/app/services/pdf_generator_service.py | 35 +++++++++++++------
 backend/app/services/pp_structure_enhanced.py | 20 ++++++++---
 3 files changed, 50 insertions(+), 16 deletions(-)

diff --git a/backend/app/services/ocr_to_unified_converter.py b/backend/app/services/ocr_to_unified_converter.py
index b94874d..721eed4 100644
--- a/backend/app/services/ocr_to_unified_converter.py
+++ b/backend/app/services/ocr_to_unified_converter.py
@@ -543,6 +543,13 @@ class OCRToUnifiedConverter:
             html = elem_data.get('html', '')
             extracted_text = elem_data.get('extracted_text', '')
 
+            # Fallback: check content field for HTML table if html field is empty
+            if not html:
+                content = elem_data.get('content', '')
+                if isinstance(content, str) and '<table' in content.lower():
+                    html = content
+                    logger.debug("Using content field as HTML table source")
+
             # Try to parse HTML to get rows and columns
             rows = 0
             cols = 0
@@ -558,6 +565,10 @@ class OCRToUnifiedConverter:
                         first_row = html[:first_row_end]
                         cols = first_row.count('<td') + first_row.count('<th')
 
+            # Return None if no valid table data found
+            if rows == 0 and cols == 0 and not extracted_text:
+                return None
+
             # Note: TableData uses 'cols' not 'columns'
             # HTML content can be stored as caption or in element metadata
             return TableData(
diff --git a/backend/app/services/pdf_generator_service.py b/backend/app/services/pdf_generator_service.py
index 83ddc43..0d3371f 100644
--- a/backend/app/services/pdf_generator_service.py
+++ b/backend/app/services/pdf_generator_service.py
@@ -1418,17 +1418,27 @@ class PDFGeneratorService:
             bbox_height = abs(scaled_y_bottom - scaled_y_top)
 
             # Calculate font size using heuristics
-            # Font size is typically 70-90% of bbox height
-            # Testing shows 0.75 works well for most cases
-            font_size = bbox_height * 0.75
+            # For multi-line text, divide bbox height by number of lines
+            lines = text.split('\n')
+            non_empty_lines = [l for l in lines if l.strip()]
+            num_lines = max(len(non_empty_lines), 1)
+
+            # Font size = bbox_height / num_lines * factor
+            # Use 0.8 factor to leave room for line spacing
+            font_size = (bbox_height / num_lines) * 0.8
             font_size = max(min(font_size, 72), 4)  # Clamp between 4pt and 72pt
 
+            logger.debug(f"Text has {num_lines} non-empty lines, bbox_height={bbox_height:.1f}, calculated font_size={font_size:.1f}")
+
             # Transform coordinates: OCR (top-left origin) → PDF (bottom-left origin)
             # CRITICAL: Y-axis flip!
+            # For multi-line text, start from TOP of bbox and go downward
             pdf_x = scaled_x_left
-            pdf_y = page_height - scaled_y_bottom  # Flip Y-axis using bottom coordinate
+            pdf_y_top = page_height - scaled_y_top  # Top of bbox in PDF coordinates
+            # Adjust for font baseline: first line starts below the top edge
+            pdf_y = pdf_y_top - font_size  # Start first line one font size below top
 
-            logger.info(f"[文字] '{text[:30]}' → PDF位置: ({pdf_x:.1f}, {pdf_y:.1f}), 字體:{font_size:.1f}pt, 寬x高:{bbox_width:.0f}x{bbox_height:.0f}")
+            logger.info(f"[文字] '{text[:30]}' → PDF位置: ({pdf_x:.1f}, {pdf_y:.1f}), 字體:{font_size:.1f}pt, 寬x高:{bbox_width:.0f}x{bbox_height:.0f}, 行數:{num_lines}")
 
             # Set font with track-specific styling
             # Note: OCR track has no StyleInfo (extracted from images), so no advanced formatting
@@ -1450,14 +1460,11 @@ class PDFGeneratorService:
 
             # Handle line breaks (split text by newlines)
             # OCR track: simple left-aligned rendering
-            lines = text.split('\n')
+            # Note: non_empty_lines was already calculated above for font sizing
             line_height = font_size * 1.2  # 120% of font size for line spacing
 
-            # Draw each line (left-aligned for OCR track)
-            for i, line in enumerate(lines):
-                if not line.strip():
-                    continue  # Skip empty lines
-
+            # Draw each non-empty line (using proper line index for positioning)
+            for i, line in enumerate(non_empty_lines):
                 line_y = pdf_y - (i * line_height)
 
                 # Calculate text width to prevent overflow
@@ -1902,6 +1909,12 @@ class PDFGeneratorService:
                 elem_type = ElementType.TEXT
                 logger.warning(f"Unknown element type '{type_str}', falling back to TEXT")
 
+            # Content-based HTML table detection: reclassify text elements with HTML table content
+            content = elem_dict.get('content', '')
+            if elem_type == ElementType.TEXT and isinstance(content, str) and '<table' in content.lower():
+                logger.info(f"Reclassifying element from TEXT to TABLE due to HTML table content")
+                elem_type = ElementType.TABLE
+
             # Parse bounding box
             bbox_dict = elem_dict.get('bbox', {})
             bbox = BoundingBox(
diff --git a/backend/app/services/pp_structure_enhanced.py b/backend/app/services/pp_structure_enhanced.py
index 4331c38..98382fc 100644
--- a/backend/app/services/pp_structure_enhanced.py
+++ b/backend/app/services/pp_structure_enhanced.py
@@ -286,6 +286,15 @@ class PPStructureEnhanced:
                 elif isinstance(res, str):
                     content = res
 
+            # Content-based HTML table detection: PP-StructureV3 sometimes
+            # classifies tables as 'text' but returns HTML table content
+            html_table_content = None
+            if content and '<table' in content.lower():
+                if mapped_type == ElementType.TEXT or element_type == 'text':
+                    logger.info(f"Element {idx}: Detected HTML table content in 'text' type, reclassifying to TABLE")
+                    mapped_type = ElementType.TABLE
+                    html_table_content = content  # Store for later use
+
             # Create element
             element = {
                 'element_id': f"pp3_{current_page}_{idx}",
@@ -300,12 +309,13 @@ class PPStructureEnhanced:
 
             # Special handling for tables
             if mapped_type == ElementType.TABLE:
-                # Extract table structure if available
-                if 'res' in item and isinstance(item['res'], dict):
+                # Use HTML content from content-based detection or extract from 'res'
+                html_content = html_table_content  # From content-based detection
+                if not html_content and 'res' in item and isinstance(item['res'], dict):
                     html_content = item['res'].get('html', '')
-                    if html_content:
-                        element['html'] = html_content
-                        element['extracted_text'] = self._extract_text_from_html(html_content)
+                if html_content:
+                    element['html'] = html_content
+                    element['extracted_text'] = self._extract_text_from_html(html_content)
 
             # Special handling for images/figures
             elif mapped_type in [ElementType.IMAGE, ElementType.FIGURE]: