fix: enable text selection in Direct track PDF output
Root causes: 1. generate_layout_pdf() didn't properly route UnifiedDocument JSON to Direct track rendering - added format detection and JSON-to- UnifiedDocument conversion 2. Chart elements with page-spanning bboxes (e.g., chart_1_44 covering entire page) caused all text to be filtered by _is_element_inside_regions - Fix: only IMAGE/FIGURE/LOGO are exclusion regions, not CHART/DIAGRAM 3. Fixed UnifiedDocument constructor call (removed invalid params) 4. Fixed method name typo (generate_pdf_from_unified_document → generate_from_unified_document) 5. Fixed variable name typo in _draw_image_element_direct logging Result: edit3.pdf text extraction changed from 0 chars to 773 chars Note: Chinese chars render as 'I' due to CJK font encoding - separate issue to be addressed when implementing translation feature. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -29,7 +29,8 @@ from app.core.config import settings
|
||||
try:
|
||||
from app.models.unified_document import (
|
||||
UnifiedDocument, DocumentElement, ElementType,
|
||||
BoundingBox, TableData, ProcessingTrack
|
||||
BoundingBox, TableData, ProcessingTrack,
|
||||
DocumentMetadata, Dimensions, Page, StyleInfo
|
||||
)
|
||||
UNIFIED_DOCUMENT_AVAILABLE = True
|
||||
except ImportError:
|
||||
@@ -731,7 +732,11 @@ class PDFGeneratorService:
|
||||
ElementType.CHART, ElementType.DIAGRAM, ElementType.LOGO
|
||||
]:
|
||||
image_elements.append(element)
|
||||
regions_to_avoid.append(element) # Images are exclusion regions
|
||||
# Only add real images to exclusion regions, NOT charts/diagrams
|
||||
# Charts often have large bounding boxes that include text labels
|
||||
# which should be rendered as selectable text on top
|
||||
if element.type in [ElementType.IMAGE, ElementType.FIGURE, ElementType.LOGO]:
|
||||
regions_to_avoid.append(element)
|
||||
elif element.type == ElementType.LIST_ITEM:
|
||||
list_elements.append(element)
|
||||
elif self._is_list_item_fallback(element):
|
||||
@@ -1757,13 +1762,30 @@ class PDFGeneratorService:
|
||||
if not ocr_data:
|
||||
return False
|
||||
|
||||
# Use internal generation with pre-loaded data
|
||||
return self._generate_pdf_from_data(
|
||||
ocr_data=ocr_data,
|
||||
output_path=output_path,
|
||||
source_file_path=source_file_path,
|
||||
json_parent_dir=json_path.parent
|
||||
)
|
||||
# Check if this is new UnifiedDocument format (has 'pages' with elements)
|
||||
# vs old OCR format (has 'text_regions')
|
||||
if 'pages' in ocr_data and isinstance(ocr_data.get('pages'), list):
|
||||
# New UnifiedDocument format - convert and use Direct track rendering
|
||||
logger.info("Detected UnifiedDocument JSON format, using Direct track rendering")
|
||||
unified_doc = self._json_to_unified_document(ocr_data, json_path.parent)
|
||||
if unified_doc:
|
||||
return self.generate_from_unified_document(
|
||||
unified_doc=unified_doc,
|
||||
output_path=output_path,
|
||||
source_file_path=source_file_path
|
||||
)
|
||||
else:
|
||||
logger.error("Failed to convert JSON to UnifiedDocument")
|
||||
return False
|
||||
else:
|
||||
# Old OCR format - use legacy generation
|
||||
logger.info("Detected legacy OCR JSON format, using OCR track rendering")
|
||||
return self._generate_pdf_from_data(
|
||||
ocr_data=ocr_data,
|
||||
output_path=output_path,
|
||||
source_file_path=source_file_path,
|
||||
json_parent_dir=json_path.parent
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to generate PDF: {e}")
|
||||
@@ -1771,6 +1793,163 @@ class PDFGeneratorService:
|
||||
traceback.print_exc()
|
||||
return False
|
||||
|
||||
def _json_to_unified_document(self, json_data: Dict, result_dir: Path) -> Optional['UnifiedDocument']:
|
||||
"""
|
||||
Convert JSON dict to UnifiedDocument object.
|
||||
|
||||
Args:
|
||||
json_data: Loaded JSON dictionary in UnifiedDocument format
|
||||
result_dir: Directory containing image files
|
||||
|
||||
Returns:
|
||||
UnifiedDocument object or None if conversion fails
|
||||
"""
|
||||
try:
|
||||
from datetime import datetime
|
||||
|
||||
# Parse metadata
|
||||
metadata_dict = json_data.get('metadata', {})
|
||||
|
||||
# Parse processing track
|
||||
track_str = metadata_dict.get('processing_track', 'direct')
|
||||
try:
|
||||
processing_track = ProcessingTrack(track_str)
|
||||
except ValueError:
|
||||
processing_track = ProcessingTrack.DIRECT
|
||||
|
||||
# Create DocumentMetadata
|
||||
metadata = DocumentMetadata(
|
||||
filename=metadata_dict.get('filename', ''),
|
||||
file_type=metadata_dict.get('file_type', 'pdf'),
|
||||
file_size=metadata_dict.get('file_size', 0),
|
||||
created_at=datetime.fromisoformat(metadata_dict.get('created_at', datetime.now().isoformat()).replace('Z', '+00:00')),
|
||||
processing_track=processing_track,
|
||||
processing_time=metadata_dict.get('processing_time', 0),
|
||||
language=metadata_dict.get('language'),
|
||||
title=metadata_dict.get('title'),
|
||||
author=metadata_dict.get('author'),
|
||||
subject=metadata_dict.get('subject'),
|
||||
keywords=metadata_dict.get('keywords'),
|
||||
producer=metadata_dict.get('producer'),
|
||||
creator=metadata_dict.get('creator'),
|
||||
creation_date=datetime.fromisoformat(metadata_dict['creation_date'].replace('Z', '+00:00')) if metadata_dict.get('creation_date') else None,
|
||||
modification_date=datetime.fromisoformat(metadata_dict['modification_date'].replace('Z', '+00:00')) if metadata_dict.get('modification_date') else None,
|
||||
)
|
||||
|
||||
# Parse pages
|
||||
pages = []
|
||||
for page_dict in json_data.get('pages', []):
|
||||
# Parse page dimensions
|
||||
dims = page_dict.get('dimensions', {})
|
||||
if not dims:
|
||||
# Fallback dimensions
|
||||
dims = {'width': 595.32, 'height': 841.92}
|
||||
dimensions = Dimensions(
|
||||
width=dims.get('width', 595.32),
|
||||
height=dims.get('height', 841.92),
|
||||
dpi=dims.get('dpi')
|
||||
)
|
||||
|
||||
# Parse elements
|
||||
elements = []
|
||||
for elem_dict in page_dict.get('elements', []):
|
||||
element = self._json_to_document_element(elem_dict)
|
||||
if element:
|
||||
elements.append(element)
|
||||
|
||||
page = Page(
|
||||
page_number=page_dict.get('page_number', 1),
|
||||
dimensions=dimensions,
|
||||
elements=elements,
|
||||
metadata=page_dict.get('metadata', {})
|
||||
)
|
||||
pages.append(page)
|
||||
|
||||
# Create UnifiedDocument
|
||||
unified_doc = UnifiedDocument(
|
||||
document_id=json_data.get('document_id', ''),
|
||||
metadata=metadata,
|
||||
pages=pages,
|
||||
processing_errors=json_data.get('processing_errors', [])
|
||||
)
|
||||
|
||||
logger.info(f"Converted JSON to UnifiedDocument: {len(pages)} pages, track={processing_track.value}")
|
||||
return unified_doc
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to convert JSON to UnifiedDocument: {e}")
|
||||
import traceback
|
||||
traceback.print_exc()
|
||||
return None
|
||||
|
||||
def _json_to_document_element(self, elem_dict: Dict) -> Optional['DocumentElement']:
|
||||
"""
|
||||
Convert JSON dict to DocumentElement.
|
||||
|
||||
Args:
|
||||
elem_dict: Element dictionary from JSON
|
||||
|
||||
Returns:
|
||||
DocumentElement or None if conversion fails
|
||||
"""
|
||||
try:
|
||||
# Parse element type
|
||||
type_str = elem_dict.get('type', 'text')
|
||||
try:
|
||||
elem_type = ElementType(type_str)
|
||||
except ValueError:
|
||||
# Fallback to TEXT for unknown types
|
||||
elem_type = ElementType.TEXT
|
||||
logger.warning(f"Unknown element type '{type_str}', falling back to TEXT")
|
||||
|
||||
# Parse bounding box
|
||||
bbox_dict = elem_dict.get('bbox', {})
|
||||
bbox = BoundingBox(
|
||||
x0=bbox_dict.get('x0', 0),
|
||||
y0=bbox_dict.get('y0', 0),
|
||||
x1=bbox_dict.get('x1', 0),
|
||||
y1=bbox_dict.get('y1', 0)
|
||||
)
|
||||
|
||||
# Parse style if present
|
||||
style = None
|
||||
if 'style' in elem_dict and elem_dict['style']:
|
||||
style_dict = elem_dict['style']
|
||||
style = StyleInfo(
|
||||
font_name=style_dict.get('font_name'),
|
||||
font_size=style_dict.get('font_size'),
|
||||
font_weight=style_dict.get('font_weight'),
|
||||
font_style=style_dict.get('font_style'),
|
||||
text_color=style_dict.get('text_color'),
|
||||
bg_color=style_dict.get('bg_color') or style_dict.get('background_color'),
|
||||
alignment=style_dict.get('alignment'),
|
||||
)
|
||||
|
||||
# Parse children (spans)
|
||||
children = []
|
||||
for child_dict in elem_dict.get('children', []):
|
||||
child = self._json_to_document_element(child_dict)
|
||||
if child:
|
||||
children.append(child)
|
||||
|
||||
# Create element
|
||||
element = DocumentElement(
|
||||
element_id=elem_dict.get('element_id', ''),
|
||||
type=elem_type,
|
||||
content=elem_dict.get('content', ''),
|
||||
bbox=bbox,
|
||||
confidence=elem_dict.get('confidence'),
|
||||
style=style,
|
||||
metadata=elem_dict.get('metadata', {}),
|
||||
children=children
|
||||
)
|
||||
|
||||
return element
|
||||
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to convert element: {e}")
|
||||
return None
|
||||
|
||||
def _is_list_item_fallback(self, element: 'DocumentElement') -> bool:
|
||||
"""
|
||||
Fallback detection for list items not marked with ElementType.LIST_ITEM.
|
||||
@@ -2474,7 +2653,7 @@ class PDFGeneratorService:
|
||||
preserveAspectRatio=True
|
||||
)
|
||||
|
||||
logger.debug(f"Drew image: {image_path} (from: {original_path_str})")
|
||||
logger.debug(f"Drew image: {image_path} (from: {image_path_str})")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to draw image element {element.element_id}: {e}")
|
||||
|
||||
Reference in New Issue
Block a user