feat: implement Phase 3 list formatting for Direct track
Add comprehensive list rendering with automatic detection and formatting:

**Task 6.1: List Element Detection**
- Detect LIST_ITEM elements by type (`element.type == ElementType.LIST_ITEM`)
- Extract `list_level` from element metadata (lines 1566-1567)
- Determine list type via regex pattern matching:
  - Ordered lists: `^\d+[\.\)]\s` (e.g., "1. ", "2) ")
  - Unordered lists: `^[•·▪▫◦‣⁃]\s` (various bullet symbols)
- Parse and extract list markers from text content (lines 1571-1588)

**Task 6.2: List Rendering**
- Add list markers to first line of each item:
  - Ordered: Preserve the original number, normalized to the "N. " form (e.g., "2) " is rendered as "2. ")
  - Unordered: Standardize to bullet "• "
- Remove original markers from text content
- Apply list indentation: 20pt per nesting level (lines 1594-1598)
- Combine list indent with existing paragraph indent
- List spacing: Inherited from bbox-based layout (spacing_before/after)

**Implementation Details**
- Lines 1565-1598: List detection and indentation logic
- Lines 1629-1632: Prepend list marker to first line (`rendered_line`)
- Lines 1635-1676: Update all text width calculations to use `rendered_line`
- Lines 1688-1692: Enhanced logging with list type and level

**Technical Notes**
- Direct track only (OCR track has no list metadata)
- Integrates with existing alignment and indentation system
- Preserves line breaks and multi-line list items
- Works with all text alignment modes (left/center/right/justify)

**Modified Files**
- backend/app/services/pdf_generator_service.py
  - Added `import re` for regex pattern matching
  - Lines 1565-1598: List detection and indentation
  - Lines 1629-1676: List marker rendering
  - Lines 1688-1692: Enhanced debug logging
- openspec/changes/pdf-layout-restoration/tasks.md
  - Marked Task 6.1 (all subtasks) as completed
  - Marked Task 6.2 (all subtasks) as completed
  - Added implementation line references

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -5,6 +5,7 @@ Generates PDF files that preserve the original document layout using OCR JSON da
|
||||
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Optional, Tuple, Union
|
||||
from datetime import datetime
|
||||
@@ -1562,10 +1563,41 @@ class PDFGeneratorService:
|
||||
font_name = self.font_name if self.font_registered else 'Helvetica'
|
||||
pdf_canvas.setFont(font_name, font_size)
|
||||
|
||||
# Detect list items and extract list properties
|
||||
is_list_item = (element.type == ElementType.LIST_ITEM)
|
||||
list_level = element.metadata.get('list_level', 0) if element.metadata else 0
|
||||
list_type = None # 'ordered' or 'unordered'
|
||||
list_marker = "" # The bullet or number prefix
|
||||
|
||||
if is_list_item:
|
||||
# Determine list type based on text content
|
||||
text_stripped = text_content.lstrip()
|
||||
# Check for ordered list (one or more digits followed by "." or ")" and whitespace)
|
||||
if re.match(r'^\d+[\.\)]\s', text_stripped):
|
||||
list_type = 'ordered'
|
||||
# Extract the number
|
||||
match = re.match(r'^(\d+)[\.\)]\s', text_stripped)
|
||||
if match:
|
||||
list_marker = match.group(1) + ". "
|
||||
# Remove the marker from text content
|
||||
text_content = text_stripped[len(match.group(0)):]
|
||||
# Check for unordered list (starts with bullet)
|
||||
elif re.match(r'^[•·▪▫◦‣⁃]\s', text_stripped):
|
||||
list_type = 'unordered'
|
||||
list_marker = "• " # Use standard bullet
|
||||
# Remove the original marker from text content
|
||||
text_content = re.sub(r'^[•·▪▫◦‣⁃]\s', '', text_stripped)
|
||||
|
||||
# Get indentation from metadata (in points)
|
||||
indent = element.metadata.get('indent', 0) if element.metadata else 0
|
||||
first_line_indent = element.metadata.get('first_line_indent', indent) if element.metadata else indent
|
||||
|
||||
# Apply list indentation (20pt per level)
|
||||
if is_list_item:
|
||||
list_indent = list_level * 20 # 20pt per level
|
||||
indent += list_indent
|
||||
first_line_indent += list_indent
|
||||
|
||||
# Get paragraph spacing
|
||||
# spacing_before: Applied by adjusting starting Y position (pdf_y)
|
||||
# spacing_after: Recorded for debugging; in Direct track with fixed bbox,
|
||||
@@ -1595,8 +1627,13 @@ class PDFGeneratorService:
|
||||
# Calculate line indentation
|
||||
line_indent = first_line_indent if i == 0 else indent
|
||||
|
||||
# Prepend list marker to first line
|
||||
rendered_line = line
|
||||
if is_list_item and i == 0 and list_marker:
|
||||
rendered_line = list_marker + line
|
||||
|
||||
# Calculate text width
|
||||
text_width = pdf_canvas.stringWidth(line, font_name, current_font_size)
|
||||
text_width = pdf_canvas.stringWidth(rendered_line, font_name, current_font_size)
|
||||
available_width = bbox_width - line_indent
|
||||
|
||||
# Scale font if needed
|
||||
@@ -1605,7 +1642,7 @@ class PDFGeneratorService:
|
||||
scaled_size = current_font_size * scale_factor * 0.95
|
||||
scaled_size = max(scaled_size, 3)
|
||||
pdf_canvas.setFont(font_name, scaled_size)
|
||||
text_width = pdf_canvas.stringWidth(line, font_name, scaled_size)
|
||||
text_width = pdf_canvas.stringWidth(rendered_line, font_name, scaled_size)
|
||||
current_font_size = scaled_size
|
||||
|
||||
# Calculate X position based on alignment
|
||||
@@ -1617,7 +1654,7 @@ class PDFGeneratorService:
|
||||
line_x = pdf_x + bbox_width - text_width
|
||||
elif alignment == 'justify' and i < len(lines) - 1:
|
||||
# Justify: distribute extra space between words (except last line)
|
||||
words = line.split()
|
||||
words = rendered_line.split()
|
||||
if len(words) > 1:
|
||||
total_word_width = sum(pdf_canvas.stringWidth(word, font_name, current_font_size) for word in words)
|
||||
extra_space = available_width - total_word_width
|
||||
@@ -1637,7 +1674,7 @@ class PDFGeneratorService:
|
||||
# else: left alignment uses line_x as-is
|
||||
|
||||
# Draw the line at calculated position
|
||||
pdf_canvas.drawString(line_x, line_y, line)
|
||||
pdf_canvas.drawString(line_x, line_y, rendered_line)
|
||||
|
||||
# Reset font size for next line
|
||||
if text_width > available_width:
|
||||
@@ -1649,8 +1686,9 @@ class PDFGeneratorService:
|
||||
|
||||
# Note: spacing_after is inherent in element positioning (bbox-based layout)
|
||||
# If text is shorter than bbox, the remaining space acts as implicit spacing
|
||||
list_info = f", list={list_type}, level={list_level}" if is_list_item else ""
|
||||
logger.debug(f"Drew text element: {text_content[:30]}... "
|
||||
f"({len(lines)} lines, align={alignment}, indent={indent}, "
|
||||
f"({len(lines)} lines, align={alignment}, indent={indent}{list_info}, "
|
||||
f"spacing_before={paragraph_spacing_before}, spacing_after={paragraph_spacing_after}, "
|
||||
f"actual_height={actual_text_height:.1f}, bbox_bottom_margin={bbox_bottom_margin:.1f})")
|
||||
|
||||
|
||||
Reference in New Issue
Block a user