fix: enable text selection in Direct track PDF output

Root causes: 1. generate_layout_pdf() didn't properly route UnifiedDocument JSON to Direct track rendering - added format detection and JSON-to-UnifiedDocument conversion 2. Chart elements with page-spanning bboxes (e.g., chart_1_44 covering entire page) caused all text to be filtered by _is_element_inside_regions - Fix: only IMAGE/FIGURE/LOGO are exclusion regions, not CHART/DIAGRAM 3. Fixed UnifiedDocument constructor call (removed invalid params) 4. Fixed method name typo (generate_pdf_from_unified_document → generate_from_unified_document) 5. Fixed variable name typo in _draw_image_element_direct logging Result: edit3.pdf text extraction changed from 0 chars to 773 chars Note: Chinese chars render as 'I' due to CJK font encoding - separate issue to be addressed when implementing translation feature. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-26 14:49:40 +08:00
parent 5c561f4203
commit 19bd5fd609
1 changed files with 189 additions and 10 deletions
--- a/backend/app/services/pdf_generator_service.py
+++ b/backend/app/services/pdf_generator_service.py
@@ -29,7 +29,8 @@ from app.core.config import settings
 try:
    from app.models.unified_document import (
        UnifiedDocument, DocumentElement, ElementType,
-        BoundingBox, TableData, ProcessingTrack
+        BoundingBox, TableData, ProcessingTrack,
        DocumentMetadata, Dimensions, Page, StyleInfo
    )
    UNIFIED_DOCUMENT_AVAILABLE = True
 except ImportError:
@@ -731,7 +732,11 @@ class PDFGeneratorService:
                        ElementType.CHART, ElementType.DIAGRAM, ElementType.LOGO
                    ]:
                        image_elements.append(element)
-                        regions_to_avoid.append(element)  # Images are exclusion regions
+                        # Only add real images to exclusion regions, NOT charts/diagrams
                        # Charts often have large bounding boxes that include text labels
                        # which should be rendered as selectable text on top
                        if element.type in [ElementType.IMAGE, ElementType.FIGURE, ElementType.LOGO]:
                            regions_to_avoid.append(element)
                    elif element.type == ElementType.LIST_ITEM:
                        list_elements.append(element)
                    elif self._is_list_item_fallback(element):
@@ -1757,13 +1762,30 @@ class PDFGeneratorService:
            if not ocr_data:
                return False
-            # Use internal generation with pre-loaded data
+            # Check if this is new UnifiedDocument format (has 'pages' with elements)
-            return self._generate_pdf_from_data(
+            # vs old OCR format (has 'text_regions')
-                ocr_data=ocr_data,
+            if 'pages' in ocr_data and isinstance(ocr_data.get('pages'), list):
-                output_path=output_path,
+                # New UnifiedDocument format - convert and use Direct track rendering
-                source_file_path=source_file_path,
+                logger.info("Detected UnifiedDocument JSON format, using Direct track rendering")
-                json_parent_dir=json_path.parent
+                unified_doc = self._json_to_unified_document(ocr_data, json_path.parent)
-            )
+                if unified_doc:
                    return self.generate_from_unified_document(
                        unified_doc=unified_doc,
                        output_path=output_path,
                        source_file_path=source_file_path
                    )
                else:
                    logger.error("Failed to convert JSON to UnifiedDocument")
                    return False
            else:
                # Old OCR format - use legacy generation
                logger.info("Detected legacy OCR JSON format, using OCR track rendering")
                return self._generate_pdf_from_data(
                    ocr_data=ocr_data,
                    output_path=output_path,
                    source_file_path=source_file_path,
                    json_parent_dir=json_path.parent
                )
        except Exception as e:
            logger.error(f"Failed to generate PDF: {e}")
@@ -1771,6 +1793,163 @@ class PDFGeneratorService:
            traceback.print_exc()
            return False
    def _json_to_unified_document(self, json_data: Dict, result_dir: Path) -> Optional['UnifiedDocument']:
        """
        Convert JSON dict to UnifiedDocument object.

        Timestamps are parsed leniently: a missing, null, non-string or
        malformed value falls back to a default instead of aborting the
        whole conversion.

        Args:
            json_data: Loaded JSON dictionary in UnifiedDocument format
            result_dir: Directory containing image files (not referenced by
                the conversion itself; kept for interface compatibility)

        Returns:
            UnifiedDocument object or None if conversion fails
        """
        try:
            from datetime import datetime

            def parse_timestamp(raw, default=None):
                """Parse an ISO-8601 string (optionally 'Z'-suffixed), or return default."""
                if not isinstance(raw, str) or not raw:
                    return default
                try:
                    # fromisoformat() on older Pythons rejects a trailing 'Z',
                    # so normalise it to an explicit UTC offset first.
                    return datetime.fromisoformat(raw.replace('Z', '+00:00'))
                except ValueError:
                    logger.warning(f"Ignoring unparseable timestamp '{raw}'")
                    return default

            # Parse metadata ("metadata": null is treated like a missing key)
            metadata_dict = json_data.get('metadata') or {}

            # Parse processing track; unknown values fall back to DIRECT
            track_str = metadata_dict.get('processing_track', 'direct')
            try:
                processing_track = ProcessingTrack(track_str)
            except ValueError:
                processing_track = ProcessingTrack.DIRECT

            # Create DocumentMetadata
            metadata = DocumentMetadata(
                filename=metadata_dict.get('filename', ''),
                file_type=metadata_dict.get('file_type', 'pdf'),
                file_size=metadata_dict.get('file_size', 0),
                # created_at defaults to "now" when absent or unusable
                created_at=parse_timestamp(metadata_dict.get('created_at'), default=datetime.now()),
                processing_track=processing_track,
                processing_time=metadata_dict.get('processing_time', 0),
                language=metadata_dict.get('language'),
                title=metadata_dict.get('title'),
                author=metadata_dict.get('author'),
                subject=metadata_dict.get('subject'),
                keywords=metadata_dict.get('keywords'),
                producer=metadata_dict.get('producer'),
                creator=metadata_dict.get('creator'),
                creation_date=parse_timestamp(metadata_dict.get('creation_date')),
                modification_date=parse_timestamp(metadata_dict.get('modification_date')),
            )

            # Parse pages
            pages = []
            for page_dict in json_data.get('pages', []):
                # Parse page dimensions
                dims = page_dict.get('dimensions', {})
                if not dims:
                    # Fallback dimensions
                    dims = {'width': 595.32, 'height': 841.92}

                dimensions = Dimensions(
                    width=dims.get('width', 595.32),
                    height=dims.get('height', 841.92),
                    dpi=dims.get('dpi')
                )

                # Parse elements; elements that fail to convert are skipped
                elements = []
                for elem_dict in page_dict.get('elements', []):
                    element = self._json_to_document_element(elem_dict)
                    if element:
                        elements.append(element)

                page = Page(
                    page_number=page_dict.get('page_number', 1),
                    dimensions=dimensions,
                    elements=elements,
                    metadata=page_dict.get('metadata', {})
                )
                pages.append(page)

            # Create UnifiedDocument
            unified_doc = UnifiedDocument(
                document_id=json_data.get('document_id', ''),
                metadata=metadata,
                pages=pages,
                processing_errors=json_data.get('processing_errors', [])
            )

            logger.info(f"Converted JSON to UnifiedDocument: {len(pages)} pages, track={processing_track.value}")
            return unified_doc

        except Exception as e:
            # Conversion is best-effort: report the failure and let the caller decide
            logger.error(f"Failed to convert JSON to UnifiedDocument: {e}")
            import traceback
            traceback.print_exc()
            return None
    def _json_to_document_element(self, elem_dict: Dict) -> Optional['DocumentElement']:
        """
        Convert JSON dict to DocumentElement.

        Child elements are converted recursively; children that fail to
        convert are skipped. A null "children" value is treated as no
        children rather than causing the whole element to be dropped.

        Args:
            elem_dict: Element dictionary from JSON

        Returns:
            DocumentElement or None if conversion fails
        """
        try:
            # Parse element type
            type_str = elem_dict.get('type', 'text')
            try:
                elem_type = ElementType(type_str)
            except ValueError:
                # Fallback to TEXT for unknown types
                elem_type = ElementType.TEXT
                logger.warning(f"Unknown element type '{type_str}', falling back to TEXT")

            # Parse bounding box; missing coordinates default to 0
            bbox_dict = elem_dict.get('bbox', {})
            bbox = BoundingBox(
                x0=bbox_dict.get('x0', 0),
                y0=bbox_dict.get('y0', 0),
                x1=bbox_dict.get('x1', 0),
                y1=bbox_dict.get('y1', 0)
            )

            # Parse style if present
            style = None
            if 'style' in elem_dict and elem_dict['style']:
                style_dict = elem_dict['style']
                style = StyleInfo(
                    font_name=style_dict.get('font_name'),
                    font_size=style_dict.get('font_size'),
                    font_weight=style_dict.get('font_weight'),
                    font_style=style_dict.get('font_style'),
                    text_color=style_dict.get('text_color'),
                    # Accept either key spelling for the background colour
                    bg_color=style_dict.get('bg_color') or style_dict.get('background_color'),
                    alignment=style_dict.get('alignment'),
                )

            # Parse children (spans). `or []` covers "children": null, which
            # would otherwise raise TypeError in the loop and drop this element.
            children = []
            for child_dict in elem_dict.get('children') or []:
                child = self._json_to_document_element(child_dict)
                if child:
                    children.append(child)

            # Create element
            element = DocumentElement(
                element_id=elem_dict.get('element_id', ''),
                type=elem_type,
                content=elem_dict.get('content', ''),
                bbox=bbox,
                confidence=elem_dict.get('confidence'),
                style=style,
                metadata=elem_dict.get('metadata', {}),
                children=children
            )
            return element

        except Exception as e:
            # A single bad element must not abort the page; the caller skips None
            logger.warning(f"Failed to convert element: {e}")
            return None
    def _is_list_item_fallback(self, element: 'DocumentElement') -> bool:
        """
        Fallback detection for list items not marked with ElementType.LIST_ITEM.
@@ -2474,7 +2653,7 @@ class PDFGeneratorService:
                preserveAspectRatio=True
            )
-            logger.debug(f"Drew image: {image_path} (from: {original_path_str})")
+            logger.debug(f"Drew image: {image_path} (from: {image_path_str})")
        except Exception as e:
            logger.error(f"Failed to draw image element {element.element_id}: {e}")