fix: enable text selection in Direct track PDF output
Root causes:
1. generate_layout_pdf() didn't properly route UnifiedDocument JSON to Direct track rendering - added format detection and JSON-to-UnifiedDocument conversion.
2. Chart elements with page-spanning bboxes (e.g., chart_1_44 covering the entire page) caused all text to be filtered out by _is_element_inside_regions - Fix: only IMAGE/FIGURE/LOGO are exclusion regions, not CHART/DIAGRAM.
3. Fixed UnifiedDocument constructor call (removed invalid params).
4. Fixed method name typo (generate_pdf_from_unified_document → generate_from_unified_document).
5. Fixed variable name typo in _draw_image_element_direct logging.
Result: edit3.pdf text extraction changed from 0 chars to 773 chars.
Note: Chinese chars render as 'I' due to CJK font encoding - a separate issue to be addressed when implementing the translation feature.
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -29,7 +29,8 @@ from app.core.config import settings
|
|||||||
try:
|
try:
|
||||||
from app.models.unified_document import (
|
from app.models.unified_document import (
|
||||||
UnifiedDocument, DocumentElement, ElementType,
|
UnifiedDocument, DocumentElement, ElementType,
|
||||||
BoundingBox, TableData, ProcessingTrack
|
BoundingBox, TableData, ProcessingTrack,
|
||||||
|
DocumentMetadata, Dimensions, Page, StyleInfo
|
||||||
)
|
)
|
||||||
UNIFIED_DOCUMENT_AVAILABLE = True
|
UNIFIED_DOCUMENT_AVAILABLE = True
|
||||||
except ImportError:
|
except ImportError:
|
||||||
@@ -731,7 +732,11 @@ class PDFGeneratorService:
|
|||||||
ElementType.CHART, ElementType.DIAGRAM, ElementType.LOGO
|
ElementType.CHART, ElementType.DIAGRAM, ElementType.LOGO
|
||||||
]:
|
]:
|
||||||
image_elements.append(element)
|
image_elements.append(element)
|
||||||
regions_to_avoid.append(element) # Images are exclusion regions
|
# Only add real images to exclusion regions, NOT charts/diagrams
|
||||||
|
# Charts often have large bounding boxes that include text labels
|
||||||
|
# which should be rendered as selectable text on top
|
||||||
|
if element.type in [ElementType.IMAGE, ElementType.FIGURE, ElementType.LOGO]:
|
||||||
|
regions_to_avoid.append(element)
|
||||||
elif element.type == ElementType.LIST_ITEM:
|
elif element.type == ElementType.LIST_ITEM:
|
||||||
list_elements.append(element)
|
list_elements.append(element)
|
||||||
elif self._is_list_item_fallback(element):
|
elif self._is_list_item_fallback(element):
|
||||||
@@ -1757,13 +1762,30 @@ class PDFGeneratorService:
|
|||||||
if not ocr_data:
|
if not ocr_data:
|
||||||
return False
|
return False
|
||||||
|
|
||||||
# Use internal generation with pre-loaded data
|
# Check if this is new UnifiedDocument format (has 'pages' with elements)
|
||||||
return self._generate_pdf_from_data(
|
# vs old OCR format (has 'text_regions')
|
||||||
ocr_data=ocr_data,
|
if 'pages' in ocr_data and isinstance(ocr_data.get('pages'), list):
|
||||||
output_path=output_path,
|
# New UnifiedDocument format - convert and use Direct track rendering
|
||||||
source_file_path=source_file_path,
|
logger.info("Detected UnifiedDocument JSON format, using Direct track rendering")
|
||||||
json_parent_dir=json_path.parent
|
unified_doc = self._json_to_unified_document(ocr_data, json_path.parent)
|
||||||
)
|
if unified_doc:
|
||||||
|
return self.generate_from_unified_document(
|
||||||
|
unified_doc=unified_doc,
|
||||||
|
output_path=output_path,
|
||||||
|
source_file_path=source_file_path
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
logger.error("Failed to convert JSON to UnifiedDocument")
|
||||||
|
return False
|
||||||
|
else:
|
||||||
|
# Old OCR format - use legacy generation
|
||||||
|
logger.info("Detected legacy OCR JSON format, using OCR track rendering")
|
||||||
|
return self._generate_pdf_from_data(
|
||||||
|
ocr_data=ocr_data,
|
||||||
|
output_path=output_path,
|
||||||
|
source_file_path=source_file_path,
|
||||||
|
json_parent_dir=json_path.parent
|
||||||
|
)
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Failed to generate PDF: {e}")
|
logger.error(f"Failed to generate PDF: {e}")
|
||||||
@@ -1771,6 +1793,163 @@ class PDFGeneratorService:
|
|||||||
traceback.print_exc()
|
traceback.print_exc()
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
def _json_to_unified_document(self, json_data: Dict, result_dir: Path) -> Optional['UnifiedDocument']:
    """
    Convert JSON dict to UnifiedDocument object.

    Optional fields that are missing, ``null`` or (for timestamps)
    unparseable are replaced by defaults instead of aborting the whole
    conversion, so one bad metadata value does not prevent PDF generation.

    Args:
        json_data: Loaded JSON dictionary in UnifiedDocument format
        result_dir: Directory containing image files. Currently not read by
            this method; kept so the signature stays stable for callers.

    Returns:
        UnifiedDocument object or None if conversion fails
    """
    from datetime import datetime

    def _parse_iso(value):
        """Parse an ISO-8601 string (a trailing 'Z' is accepted); None if unusable."""
        if not isinstance(value, str) or not value:
            return None
        try:
            # fromisoformat() on older Python versions rejects the 'Z' suffix,
            # so rewrite it as an explicit UTC offset first.
            return datetime.fromisoformat(value.replace('Z', '+00:00'))
        except ValueError:
            logger.warning(f"Ignoring unparseable timestamp in document metadata: {value!r}")
            return None

    try:
        # `or {}` also covers an explicit JSON null, which .get()'s default does not.
        metadata_dict = json_data.get('metadata') or {}

        # Parse processing track; unknown values fall back to DIRECT.
        track_str = metadata_dict.get('processing_track', 'direct')
        try:
            processing_track = ProcessingTrack(track_str)
        except ValueError:
            processing_track = ProcessingTrack.DIRECT

        # Create DocumentMetadata
        metadata = DocumentMetadata(
            filename=metadata_dict.get('filename', ''),
            file_type=metadata_dict.get('file_type', 'pdf'),
            file_size=metadata_dict.get('file_size', 0),
            # Missing/invalid created_at falls back to "now", matching the
            # original default for an absent key.
            created_at=_parse_iso(metadata_dict.get('created_at')) or datetime.now(),
            processing_track=processing_track,
            processing_time=metadata_dict.get('processing_time', 0),
            language=metadata_dict.get('language'),
            title=metadata_dict.get('title'),
            author=metadata_dict.get('author'),
            subject=metadata_dict.get('subject'),
            keywords=metadata_dict.get('keywords'),
            producer=metadata_dict.get('producer'),
            creator=metadata_dict.get('creator'),
            creation_date=_parse_iso(metadata_dict.get('creation_date')),
            modification_date=_parse_iso(metadata_dict.get('modification_date')),
        )

        # Parse pages
        pages = []
        for page_dict in json_data.get('pages') or []:
            # Fallback dimensions are used when the block or a value is absent/null.
            # NOTE(review): 595.32 x 841.92 looks like A4 in PDF points - confirm.
            dims = page_dict.get('dimensions') or {}
            dimensions = Dimensions(
                width=dims.get('width') or 595.32,
                height=dims.get('height') or 841.92,
                dpi=dims.get('dpi'),
            )

            # Parse elements; ones that fail to convert come back as None and are skipped.
            elements = []
            for elem_dict in page_dict.get('elements') or []:
                element = self._json_to_document_element(elem_dict)
                if element:
                    elements.append(element)

            pages.append(Page(
                page_number=page_dict.get('page_number', 1),
                dimensions=dimensions,
                elements=elements,
                metadata=page_dict.get('metadata') or {},
            ))

        # Create UnifiedDocument
        unified_doc = UnifiedDocument(
            document_id=json_data.get('document_id', ''),
            metadata=metadata,
            pages=pages,
            processing_errors=json_data.get('processing_errors') or [],
        )

        logger.info(f"Converted JSON to UnifiedDocument: {len(pages)} pages, track={processing_track.value}")
        return unified_doc

    except Exception as e:
        # Boundary handler: callers treat None as "conversion failed".
        # logger.exception records the traceback through the logging system.
        logger.exception(f"Failed to convert JSON to UnifiedDocument: {e}")
        return None
|
||||||
|
|
||||||
|
def _json_to_document_element(self, elem_dict: Dict) -> Optional['DocumentElement']:
    """
    Convert JSON dict to DocumentElement.

    Child dictionaries under ``children`` are converted recursively with
    this same method. Optional containers (``bbox``, ``style``,
    ``children``, ``metadata``) may be missing or ``null``; both cases are
    treated as empty rather than causing the element to be dropped.

    Args:
        elem_dict: Element dictionary from JSON

    Returns:
        DocumentElement or None if conversion fails
    """
    try:
        # Parse element type
        type_str = elem_dict.get('type', 'text')
        try:
            elem_type = ElementType(type_str)
        except ValueError:
            # Fallback to TEXT for unknown types
            elem_type = ElementType.TEXT
            logger.warning(f"Unknown element type '{type_str}', falling back to TEXT")

        # Parse bounding box; missing coordinates default to 0.
        # `or {}` also covers an explicit JSON null.
        bbox_dict = elem_dict.get('bbox') or {}
        bbox = BoundingBox(
            x0=bbox_dict.get('x0', 0),
            y0=bbox_dict.get('y0', 0),
            x1=bbox_dict.get('x1', 0),
            y1=bbox_dict.get('y1', 0),
        )

        # Parse style if present
        style = None
        style_dict = elem_dict.get('style')
        if style_dict:
            style = StyleInfo(
                font_name=style_dict.get('font_name'),
                font_size=style_dict.get('font_size'),
                font_weight=style_dict.get('font_weight'),
                font_style=style_dict.get('font_style'),
                text_color=style_dict.get('text_color'),
                # Accept either key spelling for the background colour.
                bg_color=style_dict.get('bg_color') or style_dict.get('background_color'),
                alignment=style_dict.get('alignment'),
            )

        # Parse children (spans); children that fail to convert are skipped.
        children = []
        for child_dict in elem_dict.get('children') or []:
            child = self._json_to_document_element(child_dict)
            if child:
                children.append(child)

        # Create element
        return DocumentElement(
            element_id=elem_dict.get('element_id', ''),
            type=elem_type,
            content=elem_dict.get('content', ''),
            bbox=bbox,
            confidence=elem_dict.get('confidence'),
            style=style,
            metadata=elem_dict.get('metadata') or {},
            children=children,
        )

    except Exception as e:
        # Best-effort: a single bad element is logged and skipped by the caller.
        logger.warning(f"Failed to convert element: {e}")
        return None
|
||||||
|
|
||||||
def _is_list_item_fallback(self, element: 'DocumentElement') -> bool:
|
def _is_list_item_fallback(self, element: 'DocumentElement') -> bool:
|
||||||
"""
|
"""
|
||||||
Fallback detection for list items not marked with ElementType.LIST_ITEM.
|
Fallback detection for list items not marked with ElementType.LIST_ITEM.
|
||||||
@@ -2474,7 +2653,7 @@ class PDFGeneratorService:
|
|||||||
preserveAspectRatio=True
|
preserveAspectRatio=True
|
||||||
)
|
)
|
||||||
|
|
||||||
logger.debug(f"Drew image: {image_path} (from: {original_path_str})")
|
logger.debug(f"Drew image: {image_path} (from: {image_path_str})")
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Failed to draw image element {element.element_id}: {e}")
|
logger.error(f"Failed to draw image element {element.element_id}: {e}")
|
||||||
|
|||||||
Reference in New Issue
Block a user