diff --git a/backend/app/services/pdf_generator_service.py b/backend/app/services/pdf_generator_service.py index 0bb5220..14d9a10 100644 --- a/backend/app/services/pdf_generator_service.py +++ b/backend/app/services/pdf_generator_service.py @@ -5,6 +5,7 @@ Generates PDF files that preserve the original document layout using OCR JSON da import json import logging +import re from pathlib import Path from typing import Dict, List, Optional, Tuple, Union from datetime import datetime @@ -1562,10 +1563,41 @@ class PDFGeneratorService: font_name = self.font_name if self.font_registered else 'Helvetica' pdf_canvas.setFont(font_name, font_size) + # Detect list items and extract list properties + is_list_item = (element.type == ElementType.LIST_ITEM) + list_level = element.metadata.get('list_level', 0) if element.metadata else 0 + list_type = None # 'ordered' or 'unordered' + list_marker = "" # The bullet or number prefix + + if is_list_item: + # Determine list type based on text content + text_stripped = text_content.lstrip() + # Check for ordered list (starts with digit) + if re.match(r'^\d+[\.\)]\s', text_stripped): + list_type = 'ordered' + # Extract the number + match = re.match(r'^(\d+)[\.\)]\s', text_stripped) + if match: + list_marker = match.group(1) + ". 
" + # Remove the marker from text content + text_content = text_stripped[len(match.group(0)):] + # Check for unordered list (starts with bullet) + elif re.match(r'^[•·▪▫◦‣⁃]\s', text_stripped): + list_type = 'unordered' + list_marker = "• " # Use standard bullet + # Remove the original marker from text content + text_content = re.sub(r'^[•·▪▫◦‣⁃]\s', '', text_stripped) + # Get indentation from metadata (in points) indent = element.metadata.get('indent', 0) if element.metadata else 0 first_line_indent = element.metadata.get('first_line_indent', indent) if element.metadata else indent + # Apply list indentation (20pt per level) + if is_list_item: + list_indent = list_level * 20 # 20pt per level + indent += list_indent + first_line_indent += list_indent + # Get paragraph spacing # spacing_before: Applied by adjusting starting Y position (pdf_y) # spacing_after: Recorded for debugging; in Direct track with fixed bbox, @@ -1595,8 +1627,13 @@ class PDFGeneratorService: # Calculate line indentation line_indent = first_line_indent if i == 0 else indent + # Prepend list marker to first line + rendered_line = line + if is_list_item and i == 0 and list_marker: + rendered_line = list_marker + line + # Calculate text width - text_width = pdf_canvas.stringWidth(line, font_name, current_font_size) + text_width = pdf_canvas.stringWidth(rendered_line, font_name, current_font_size) available_width = bbox_width - line_indent # Scale font if needed @@ -1605,7 +1642,7 @@ class PDFGeneratorService: scaled_size = current_font_size * scale_factor * 0.95 scaled_size = max(scaled_size, 3) pdf_canvas.setFont(font_name, scaled_size) - text_width = pdf_canvas.stringWidth(line, font_name, scaled_size) + text_width = pdf_canvas.stringWidth(rendered_line, font_name, scaled_size) current_font_size = scaled_size # Calculate X position based on alignment @@ -1617,7 +1654,7 @@ class PDFGeneratorService: line_x = pdf_x + bbox_width - text_width elif alignment == 'justify' and i < len(lines) - 1: # 
Justify: distribute extra space between words (except last line) - words = line.split() + words = rendered_line.split() if len(words) > 1: total_word_width = sum(pdf_canvas.stringWidth(word, font_name, current_font_size) for word in words) extra_space = available_width - total_word_width @@ -1637,7 +1674,7 @@ class PDFGeneratorService: # else: left alignment uses line_x as-is # Draw the line at calculated position - pdf_canvas.drawString(line_x, line_y, line) + pdf_canvas.drawString(line_x, line_y, rendered_line) # Reset font size for next line if text_width > available_width: @@ -1649,8 +1686,9 @@ class PDFGeneratorService: # Note: spacing_after is inherent in element positioning (bbox-based layout) # If text is shorter than bbox, the remaining space acts as implicit spacing + list_info = f", list={list_type}, level={list_level}" if is_list_item else "" logger.debug(f"Drew text element: {text_content[:30]}... " - f"({len(lines)} lines, align={alignment}, indent={indent}, " + f"({len(lines)} lines, align={alignment}, indent={indent}{list_info}, " f"spacing_before={paragraph_spacing_before}, spacing_after={paragraph_spacing_after}, " f"actual_height={actual_text_height:.1f}, bbox_bottom_margin={bbox_bottom_margin:.1f})") diff --git a/openspec/changes/pdf-layout-restoration/tasks.md b/openspec/changes/pdf-layout-restoration/tasks.md index 308c144..150f78e 100644 --- a/openspec/changes/pdf-layout-restoration/tasks.md +++ b/openspec/changes/pdf-layout-restoration/tasks.md @@ -97,15 +97,15 @@ - [x] 5.3.4 Justify alignment with word spacing distribution - [x] 5.3.5 OCR track: left-aligned only (no StyleInfo available) -### 6. List Formatting -- [ ] 6.1 Detect list elements - - [ ] 6.1.1 Identify list items from metadata - - [ ] 6.1.2 Determine list type (ordered/unordered) - - [ ] 6.1.3 Extract indent level -- [ ] 6.2 Render lists with proper formatting - - [ ] 6.2.1 Add bullets/numbers - - [ ] 6.2.2 Apply indentation - - [ ] 6.2.3 Maintain list spacing +### 6. 
List Formatting (Direct track only) +- [x] 6.1 Detect list elements from Direct track + - [x] 6.1.1 Identify LIST_ITEM elements (element.type == ElementType.LIST_ITEM) + - [x] 6.1.2 Determine list type via regex (ordered: ^\d+[\.\)]\s, unordered: ^[•·▪▫◦‣⁃]\s) + - [x] 6.1.3 Extract indent level from metadata (list_level, lines 1567-1598) +- [x] 6.2 Render lists with proper formatting + - [x] 6.2.1 Add bullets/numbers as list markers (lines 1571-1588, prepended to first line) + - [x] 6.2.2 Apply indentation (20pt per level, lines 1594-1598) + - [x] 6.2.3 Maintain list spacing (inherent in bbox-based layout, spacing_before/after) ### 7. Span-Level Rendering (Advanced) - [ ] 7.1 Extract span information from Direct track