From fa9b542b06c6ffb6fb8902ef6560c965a507f00d Mon Sep 17 00:00:00 2001 From: egg Date: Wed, 26 Nov 2025 16:09:31 +0800 Subject: [PATCH] fix: improve OCR track multi-line text rendering and HTML table detection MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Multi-line text rendering (pdf_generator_service.py): - Calculate font size by dividing bbox height by number of lines - Start Y coordinate from bbox TOP instead of bottom - Use non_empty_lines for proper line positioning HTML table detection: - pp_structure_enhanced.py: Detect HTML tables in 'text' type content and reclassify to TABLE when an HTML table is found --- .../app/services/ocr_to_unified_converter.py | 11 ++++++ backend/app/services/pdf_generator_service.py | 35 +++++++++++++------ backend/app/services/pp_structure_enhanced.py | 20 ++++++++--- 3 files changed, 50 insertions(+), 16 deletions(-) diff --git a/backend/app/services/ocr_to_unified_converter.py b/backend/app/services/ocr_to_unified_converter.py index b94874d..721eed4 100644 --- a/backend/app/services/ocr_to_unified_converter.py +++ b/backend/app/services/ocr_to_unified_converter.py @@ -543,6 +543,13 @@ class OCRToUnifiedConverter: html = elem_data.get('html', '') extracted_text = elem_data.get('extracted_text', '') + # Fallback: check content field for HTML table if html field is empty + if not html: + content = elem_data.get('content', '') + if isinstance(content, str) and '