diff --git a/backend/app/services/pdf_generator_service.py b/backend/app/services/pdf_generator_service.py
index 8b2588e..83ddc43 100644
--- a/backend/app/services/pdf_generator_service.py
+++ b/backend/app/services/pdf_generator_service.py
@@ -29,7 +29,8 @@ from app.core.config import settings
 try:
     from app.models.unified_document import (
         UnifiedDocument, DocumentElement, ElementType,
-        BoundingBox, TableData, ProcessingTrack
+        BoundingBox, TableData, ProcessingTrack,
+        DocumentMetadata, Dimensions, Page, StyleInfo
     )
     UNIFIED_DOCUMENT_AVAILABLE = True
 except ImportError:
@@ -731,7 +732,11 @@ class PDFGeneratorService:
                     ElementType.CHART, ElementType.DIAGRAM, ElementType.LOGO
                 ]:
                     image_elements.append(element)
-                    regions_to_avoid.append(element)  # Images are exclusion regions
+                    # Only add real images to exclusion regions, NOT charts/diagrams
+                    # Charts often have large bounding boxes that include text labels
+                    # which should be rendered as selectable text on top
+                    if element.type in [ElementType.IMAGE, ElementType.FIGURE, ElementType.LOGO]:
+                        regions_to_avoid.append(element)
                 elif element.type == ElementType.LIST_ITEM:
                     list_elements.append(element)
                 elif self._is_list_item_fallback(element):
@@ -1757,13 +1762,30 @@ class PDFGeneratorService:
             if not ocr_data:
                 return False
 
-            # Use internal generation with pre-loaded data
-            return self._generate_pdf_from_data(
-                ocr_data=ocr_data,
-                output_path=output_path,
-                source_file_path=source_file_path,
-                json_parent_dir=json_path.parent
-            )
+            # Check if this is new UnifiedDocument format (has 'pages' with elements)
+            # vs old OCR format (has 'text_regions')
+            if 'pages' in ocr_data and isinstance(ocr_data.get('pages'), list):
+                # New UnifiedDocument format - convert and use Direct track rendering
+                logger.info("Detected UnifiedDocument JSON format, using Direct track rendering")
+                unified_doc = self._json_to_unified_document(ocr_data, json_path.parent)
+                if unified_doc:
+                    return self.generate_from_unified_document(
+                        unified_doc=unified_doc,
+                        output_path=output_path,
+                        source_file_path=source_file_path
+                    )
+                else:
+                    logger.error("Failed to convert JSON to UnifiedDocument")
+                    return False
+            else:
+                # Old OCR format - use legacy generation
+                logger.info("Detected legacy OCR JSON format, using OCR track rendering")
+                return self._generate_pdf_from_data(
+                    ocr_data=ocr_data,
+                    output_path=output_path,
+                    source_file_path=source_file_path,
+                    json_parent_dir=json_path.parent
+                )
 
         except Exception as e:
             logger.error(f"Failed to generate PDF: {e}")
@@ -1771,6 +1793,184 @@ class PDFGeneratorService:
             traceback.print_exc()
             return False
 
+    def _json_to_unified_document(self, json_data: Dict, result_dir: Path) -> Optional['UnifiedDocument']:
+        """
+        Convert JSON dict to UnifiedDocument object.
+
+        Null or malformed optional fields (metadata, timestamps) fall back
+        to defaults instead of aborting the whole conversion.
+
+        Args:
+            json_data: Loaded JSON dictionary in UnifiedDocument format
+            result_dir: Directory containing image files (not referenced by
+                the conversion itself)
+
+        Returns:
+            UnifiedDocument object or None if conversion fails
+        """
+        try:
+            from datetime import datetime
+
+            def parse_iso(value):
+                """Parse an ISO-8601 string; None if missing or malformed."""
+                if not isinstance(value, str) or not value:
+                    return None
+                try:
+                    # fromisoformat() rejects a trailing 'Z' before Python 3.11
+                    return datetime.fromisoformat(value.replace('Z', '+00:00'))
+                except ValueError:
+                    return None
+
+            # Parse metadata ('or {}' also covers an explicit JSON null)
+            metadata_dict = json_data.get('metadata') or {}
+
+            # Parse processing track
+            track_str = metadata_dict.get('processing_track', 'direct')
+            try:
+                processing_track = ProcessingTrack(track_str)
+            except ValueError:
+                processing_track = ProcessingTrack.DIRECT
+
+            # Create DocumentMetadata
+            metadata = DocumentMetadata(
+                filename=metadata_dict.get('filename', ''),
+                file_type=metadata_dict.get('file_type', 'pdf'),
+                file_size=metadata_dict.get('file_size', 0),
+                created_at=parse_iso(metadata_dict.get('created_at')) or datetime.now(),
+                processing_track=processing_track,
+                processing_time=metadata_dict.get('processing_time', 0),
+                language=metadata_dict.get('language'),
+                title=metadata_dict.get('title'),
+                author=metadata_dict.get('author'),
+                subject=metadata_dict.get('subject'),
+                keywords=metadata_dict.get('keywords'),
+                producer=metadata_dict.get('producer'),
+                creator=metadata_dict.get('creator'),
+                creation_date=parse_iso(metadata_dict.get('creation_date')),
+                modification_date=parse_iso(metadata_dict.get('modification_date')),
+            )
+
+            # Parse pages
+            pages = []
+            for page_dict in json_data.get('pages', []):
+                # Parse page dimensions
+                dims = page_dict.get('dimensions', {})
+                if not dims:
+                    # Fallback dimensions
+                    dims = {'width': 595.32, 'height': 841.92}
+                dimensions = Dimensions(
+                    width=dims.get('width', 595.32),
+                    height=dims.get('height', 841.92),
+                    dpi=dims.get('dpi')
+                )
+
+                # Parse elements
+                elements = []
+                for elem_dict in page_dict.get('elements') or []:
+                    element = self._json_to_document_element(elem_dict)
+                    if element:
+                        elements.append(element)
+
+                page = Page(
+                    page_number=page_dict.get('page_number', 1),
+                    dimensions=dimensions,
+                    elements=elements,
+                    metadata=page_dict.get('metadata') or {}
+                )
+                pages.append(page)
+
+            # Create UnifiedDocument
+            unified_doc = UnifiedDocument(
+                document_id=json_data.get('document_id', ''),
+                metadata=metadata,
+                pages=pages,
+                processing_errors=json_data.get('processing_errors') or []
+            )
+
+            logger.info(f"Converted JSON to UnifiedDocument: {len(pages)} pages, track={processing_track.value}")
+            return unified_doc
+
+        except Exception as e:
+            logger.error(f"Failed to convert JSON to UnifiedDocument: {e}")
+            import traceback
+            traceback.print_exc()
+            return None
+
+    def _json_to_document_element(self, elem_dict: Dict) -> Optional['DocumentElement']:
+        """
+        Convert JSON dict to DocumentElement.
+
+        Child elements are converted recursively; children that fail to
+        convert are dropped. Null 'bbox', 'children' or 'metadata' values are
+        treated the same as missing keys.
+
+        Args:
+            elem_dict: Element dictionary from JSON
+
+        Returns:
+            DocumentElement or None if conversion fails
+        """
+        try:
+            # Parse element type
+            type_str = elem_dict.get('type', 'text')
+            try:
+                elem_type = ElementType(type_str)
+            except ValueError:
+                # Fallback to TEXT for unknown types
+                elem_type = ElementType.TEXT
+                logger.warning(f"Unknown element type '{type_str}', falling back to TEXT")
+
+            # Parse bounding box (missing coordinates default to 0)
+            bbox_dict = elem_dict.get('bbox') or {}
+            bbox = BoundingBox(
+                x0=bbox_dict.get('x0', 0),
+                y0=bbox_dict.get('y0', 0),
+                x1=bbox_dict.get('x1', 0),
+                y1=bbox_dict.get('y1', 0)
+            )
+
+            # Parse style if present
+            style = None
+            style_dict = elem_dict.get('style')
+            if style_dict:
+                style = StyleInfo(
+                    font_name=style_dict.get('font_name'),
+                    font_size=style_dict.get('font_size'),
+                    font_weight=style_dict.get('font_weight'),
+                    font_style=style_dict.get('font_style'),
+                    text_color=style_dict.get('text_color'),
+                    # Accept either key spelling for the background color
+                    bg_color=style_dict.get('bg_color') or style_dict.get('background_color'),
+                    alignment=style_dict.get('alignment'),
+                )
+
+            # Parse children (spans)
+            children = []
+            for child_dict in elem_dict.get('children') or []:
+                child = self._json_to_document_element(child_dict)
+                if child:
+                    children.append(child)
+
+            # Create element
+            element = DocumentElement(
+                element_id=elem_dict.get('element_id', ''),
+                type=elem_type,
+                # NOTE(review): content is passed through unchanged, including
+                # any table payloads - confirm the renderer accepts raw JSON there
+                content=elem_dict.get('content', ''),
+                bbox=bbox,
+                confidence=elem_dict.get('confidence'),
+                style=style,
+                metadata=elem_dict.get('metadata') or {},
+                children=children
+            )
+
+            return element
+
+        except Exception as e:
+            logger.warning(f"Failed to convert element: {e}")
+            return None
+
     def _is_list_item_fallback(self, element: 'DocumentElement') -> bool:
         """
         Fallback detection for list items not marked with ElementType.LIST_ITEM.
@@ -2474,7 +2674,7 @@ class PDFGeneratorService:
                 preserveAspectRatio=True
             )
 
-            logger.debug(f"Drew image: {image_path} (from: {original_path_str})")
+            logger.debug(f"Drew image: {image_path} (from: {image_path_str})")
 
         except Exception as e:
             logger.error(f"Failed to draw image element {element.element_id}: {e}")