fix: enable text selection in Direct track PDF output

Root causes:
1. generate_layout_pdf() didn't properly route UnifiedDocument JSON
   to Direct track rendering - added format detection and JSON-to-
   UnifiedDocument conversion
2. Chart elements with page-spanning bboxes (e.g., chart_1_44 covering
   the entire page) caused all text to be filtered out by
   _is_element_inside_regions.
   Fix: only IMAGE/FIGURE/LOGO are exclusion regions, not CHART/DIAGRAM
3. Fixed UnifiedDocument constructor call (removed invalid params)
4. Fixed method name typo (generate_pdf_from_unified_document →
   generate_from_unified_document)
5. Fixed variable name typo in _draw_image_element_direct logging

Result: edit3.pdf text extraction changed from 0 chars to 773 chars

Note: Chinese chars render as 'I' due to CJK font encoding - separate
issue to be addressed when implementing translation feature.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
egg
2025-11-26 14:49:40 +08:00
parent 5c561f4203
commit 19bd5fd609

View File

@@ -29,7 +29,8 @@ from app.core.config import settings
try:
from app.models.unified_document import (
UnifiedDocument, DocumentElement, ElementType,
BoundingBox, TableData, ProcessingTrack
BoundingBox, TableData, ProcessingTrack,
DocumentMetadata, Dimensions, Page, StyleInfo
)
UNIFIED_DOCUMENT_AVAILABLE = True
except ImportError:
@@ -731,7 +732,11 @@ class PDFGeneratorService:
ElementType.CHART, ElementType.DIAGRAM, ElementType.LOGO
]:
image_elements.append(element)
regions_to_avoid.append(element) # Images are exclusion regions
# Only add real images to exclusion regions, NOT charts/diagrams
# Charts often have large bounding boxes that include text labels
# which should be rendered as selectable text on top
if element.type in [ElementType.IMAGE, ElementType.FIGURE, ElementType.LOGO]:
regions_to_avoid.append(element)
elif element.type == ElementType.LIST_ITEM:
list_elements.append(element)
elif self._is_list_item_fallback(element):
@@ -1757,7 +1762,24 @@ class PDFGeneratorService:
if not ocr_data:
return False
# Use internal generation with pre-loaded data
# Check if this is new UnifiedDocument format (has 'pages' with elements)
# vs old OCR format (has 'text_regions')
if 'pages' in ocr_data and isinstance(ocr_data.get('pages'), list):
# New UnifiedDocument format - convert and use Direct track rendering
logger.info("Detected UnifiedDocument JSON format, using Direct track rendering")
unified_doc = self._json_to_unified_document(ocr_data, json_path.parent)
if unified_doc:
return self.generate_from_unified_document(
unified_doc=unified_doc,
output_path=output_path,
source_file_path=source_file_path
)
else:
logger.error("Failed to convert JSON to UnifiedDocument")
return False
else:
# Old OCR format - use legacy generation
logger.info("Detected legacy OCR JSON format, using OCR track rendering")
return self._generate_pdf_from_data(
ocr_data=ocr_data,
output_path=output_path,
@@ -1771,6 +1793,163 @@ class PDFGeneratorService:
traceback.print_exc()
return False
def _json_to_unified_document(self, json_data: Dict, result_dir: Path) -> Optional['UnifiedDocument']:
    """
    Convert a JSON dict in UnifiedDocument format to a UnifiedDocument object.

    Keys that are missing and keys that are present but null are treated
    the same way: the documented fallback value is used. Unparseable
    timestamps in the metadata are logged and replaced by their fallback
    instead of aborting the whole conversion.

    Args:
        json_data: Loaded JSON dictionary in UnifiedDocument format
        result_dir: Directory containing image files (not read by the
            conversion itself; kept for interface compatibility)

    Returns:
        UnifiedDocument object or None if conversion fails
    """
    # Fallback page size used when a page carries no usable dimensions.
    default_width = 595.32
    default_height = 841.92

    try:
        from datetime import datetime

        def parse_iso(value, default=None):
            """Parse an ISO-8601 timestamp, accepting a trailing 'Z'; return default if unusable."""
            if not value:
                return default
            try:
                # fromisoformat() only accepts 'Z' on newer Python versions,
                # so normalise it to an explicit UTC offset first.
                return datetime.fromisoformat(str(value).replace('Z', '+00:00'))
            except ValueError:
                logger.warning(f"Ignoring unparseable timestamp in metadata: {value!r}")
                return default

        # Parse metadata ("or {}" also covers an explicit JSON null)
        metadata_dict = json_data.get('metadata') or {}

        # Parse processing track; unknown or null values fall back to DIRECT
        track_str = metadata_dict.get('processing_track', 'direct')
        try:
            processing_track = ProcessingTrack(track_str)
        except ValueError:
            processing_track = ProcessingTrack.DIRECT

        # Create DocumentMetadata
        metadata = DocumentMetadata(
            filename=metadata_dict.get('filename', ''),
            file_type=metadata_dict.get('file_type', 'pdf'),
            file_size=metadata_dict.get('file_size', 0),
            created_at=parse_iso(metadata_dict.get('created_at'), datetime.now()),
            processing_track=processing_track,
            processing_time=metadata_dict.get('processing_time', 0),
            language=metadata_dict.get('language'),
            title=metadata_dict.get('title'),
            author=metadata_dict.get('author'),
            subject=metadata_dict.get('subject'),
            keywords=metadata_dict.get('keywords'),
            producer=metadata_dict.get('producer'),
            creator=metadata_dict.get('creator'),
            creation_date=parse_iso(metadata_dict.get('creation_date')),
            modification_date=parse_iso(metadata_dict.get('modification_date')),
        )

        # Parse pages
        pages = []
        for position, page_dict in enumerate(json_data.get('pages') or [], start=1):
            # Parse page dimensions, falling back per field when absent/null
            dims = page_dict.get('dimensions') or {}
            dimensions = Dimensions(
                width=dims.get('width') or default_width,
                height=dims.get('height') or default_height,
                dpi=dims.get('dpi'),
            )

            # Parse elements; elements that fail to convert are skipped
            elements = []
            for elem_dict in page_dict.get('elements') or []:
                element = self._json_to_document_element(elem_dict)
                if element:
                    elements.append(element)

            page = Page(
                # Without an explicit number, use the 1-based position so
                # that pages do not all collapse onto page 1.
                page_number=page_dict.get('page_number', position),
                dimensions=dimensions,
                elements=elements,
                metadata=page_dict.get('metadata') or {},
            )
            pages.append(page)

        # Create UnifiedDocument
        unified_doc = UnifiedDocument(
            document_id=json_data.get('document_id', ''),
            metadata=metadata,
            pages=pages,
            processing_errors=json_data.get('processing_errors') or [],
        )

        logger.info(f"Converted JSON to UnifiedDocument: {len(pages)} pages, track={processing_track.value}")
        return unified_doc

    except Exception as e:
        # logger.exception records the traceback through the logging
        # system instead of printing it to stderr.
        logger.exception(f"Failed to convert JSON to UnifiedDocument: {e}")
        return None
def _json_to_document_element(self, elem_dict: Dict) -> Optional['DocumentElement']:
    """
    Convert a JSON dict to a DocumentElement, recursing into its children.

    The 'bbox', 'children' and 'metadata' keys are treated the same
    whether they are missing or explicitly null, so a null value no
    longer causes the element to be dropped.

    Args:
        elem_dict: Element dictionary from JSON

    Returns:
        DocumentElement or None if conversion fails
    """
    try:
        # Parse element type
        type_str = elem_dict.get('type', 'text')
        try:
            elem_type = ElementType(type_str)
        except ValueError:
            # Fallback to TEXT for unknown types
            elem_type = ElementType.TEXT
            logger.warning(f"Unknown element type '{type_str}', falling back to TEXT")

        # Parse bounding box ("or {}" also covers an explicit JSON null)
        bbox_dict = elem_dict.get('bbox') or {}
        bbox = BoundingBox(
            x0=bbox_dict.get('x0', 0),
            y0=bbox_dict.get('y0', 0),
            x1=bbox_dict.get('x1', 0),
            y1=bbox_dict.get('y1', 0),
        )

        # Parse style if present
        style = None
        style_dict = elem_dict.get('style')
        if style_dict:
            style = StyleInfo(
                font_name=style_dict.get('font_name'),
                font_size=style_dict.get('font_size'),
                font_weight=style_dict.get('font_weight'),
                font_style=style_dict.get('font_style'),
                text_color=style_dict.get('text_color'),
                # Accept either spelling of the background colour key
                bg_color=style_dict.get('bg_color') or style_dict.get('background_color'),
                alignment=style_dict.get('alignment'),
            )

        # Parse children (spans); children that fail to convert are skipped
        children = []
        for child_dict in elem_dict.get('children') or []:
            child = self._json_to_document_element(child_dict)
            if child:
                children.append(child)

        # Create element
        return DocumentElement(
            element_id=elem_dict.get('element_id', ''),
            type=elem_type,
            content=elem_dict.get('content', ''),
            bbox=bbox,
            confidence=elem_dict.get('confidence'),
            style=style,
            metadata=elem_dict.get('metadata') or {},
            children=children,
        )

    except Exception as e:
        # Best effort: one bad element must not abort the whole page
        logger.warning(f"Failed to convert element: {e}")
        return None
def _is_list_item_fallback(self, element: 'DocumentElement') -> bool:
"""
Fallback detection for list items not marked with ElementType.LIST_ITEM.
@@ -2474,7 +2653,7 @@ class PDFGeneratorService:
preserveAspectRatio=True
)
logger.debug(f"Drew image: {image_path} (from: {original_path_str})")
logger.debug(f"Drew image: {image_path} (from: {image_path_str})")
except Exception as e:
logger.error(f"Failed to draw image element {element.element_id}: {e}")