fix: properly implement list formatting with sequential numbering and grouping
Fix critical issues in Task 6 list formatting implementation: **Issue 1: LIST_ITEM Elements Not Rendered** - Problem: LIST_ITEM type not included in is_text property - Fix: Separate list_elements from text_elements (lines 626, 636-637) - Impact: List items were completely ignored in rendering **Issue 2: Missing Sequential Numbering** - Problem: Each list item independently parsed its own number - Fix: Implement _draw_list_elements_direct method (lines 1523-1610) - Groups list items by proximity (max_gap=30pt) and level - Maintains list_counter across items for sequential numbering - Starts from original number in first item **Issue 3: Unreliable List Type Detection** - Problem: Regex-based detection per item, not per list - Fix: Detect type from first item in group, apply to all items - Store computed marker in metadata (_list_marker, _list_type) - Ensures consistency across entire list **Issue 4: Insufficient List Spacing Control** - Problem: No grouping logic, relied solely on bbox positions - Fix: Proximity-based grouping with 30pt max gap threshold - Groups consecutive items into lists - Separates lists when gap exceeds threshold or level changes **Technical Implementation** New method: _draw_list_elements_direct (lines 1523-1610) - Sort items by position (y0, x0) - Group by proximity and level - Detect list type from first item - Assign sequential markers - Store in metadata for _draw_text_element_direct Updated: _draw_text_element_direct (lines 1662-1677) - Use pre-computed _list_marker from metadata - Simplified marker removal (just clean original markers) - No longer needs to maintain counter per-item Updated: _generate_direct_track_pdf (lines 622-663) - Separate list_elements collection - Call _draw_list_elements_direct before text rendering - Updated logging to show list item count **Modified Files** - backend/app/services/pdf_generator_service.py - Lines 626, 636-637: Separate list_elements - Lines 644-646: Updated logging - Lines 658-659: Add list rendering layer - Lines 1523-1610: New _draw_list_elements_direct method - Lines 1662-1677: Simplified list detection in _draw_text_element_direct - openspec/changes/pdf-layout-restoration/tasks.md - Updated Task 6.1 subtasks with accurate implementation details - Updated Task 6.2 subtasks with grouping and numbering logic 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -623,6 +623,7 @@ class PDFGeneratorService:
|
||||
text_elements = []
|
||||
table_elements = []
|
||||
image_elements = []
|
||||
list_elements = []
|
||||
|
||||
for element in page.elements:
|
||||
if element.type == ElementType.TABLE:
|
||||
@@ -632,6 +633,8 @@ class PDFGeneratorService:
|
||||
ElementType.CHART, ElementType.DIAGRAM
|
||||
]:
|
||||
image_elements.append(element)
|
||||
elif element.type == ElementType.LIST_ITEM:
|
||||
list_elements.append(element)
|
||||
elif element.is_text or element.type in [
|
||||
ElementType.TEXT, ElementType.TITLE, ElementType.HEADER,
|
||||
ElementType.FOOTER, ElementType.PARAGRAPH
|
||||
@@ -639,9 +642,10 @@ class PDFGeneratorService:
|
||||
text_elements.append(element)
|
||||
|
||||
logger.info(f"Page {page_idx + 1}: {len(text_elements)} text, "
|
||||
f"{len(table_elements)} tables, {len(image_elements)} images")
|
||||
f"{len(table_elements)} tables, {len(image_elements)} images, "
|
||||
f"{len(list_elements)} list items")
|
||||
|
||||
# Draw in layers: images → tables → text
|
||||
# Draw in layers: images → tables → lists → text
|
||||
|
||||
# 1. Draw images
|
||||
for img_elem in image_elements:
|
||||
@@ -651,7 +655,10 @@ class PDFGeneratorService:
|
||||
for table_elem in table_elements:
|
||||
self._draw_table_element_direct(pdf_canvas, table_elem, page_height)
|
||||
|
||||
# 3. Draw text with line breaks and styling
|
||||
# 3. Draw lists with sequential numbering
|
||||
self._draw_list_elements_direct(pdf_canvas, list_elements, page_height)
|
||||
|
||||
# 4. Draw text with line breaks and styling
|
||||
for text_elem in text_elements:
|
||||
self._draw_text_element_direct(pdf_canvas, text_elem, page_height)
|
||||
|
||||
@@ -1513,6 +1520,95 @@ class PDFGeneratorService:
|
||||
traceback.print_exc()
|
||||
return False
|
||||
|
||||
def _draw_list_elements_direct(
|
||||
self,
|
||||
pdf_canvas: canvas.Canvas,
|
||||
list_elements: List['DocumentElement'],
|
||||
page_height: float
|
||||
):
|
||||
"""
|
||||
Draw list elements with proper sequential numbering and formatting.
|
||||
|
||||
This method processes all list items on a page, groups them into lists,
|
||||
and assigns proper sequential numbering to ordered lists.
|
||||
|
||||
Args:
|
||||
pdf_canvas: ReportLab canvas object
|
||||
list_elements: List of LIST_ITEM elements
|
||||
page_height: Page height for coordinate transformation
|
||||
"""
|
||||
if not list_elements:
|
||||
return
|
||||
|
||||
# Sort list items by position (top to bottom, left to right)
|
||||
sorted_items = sorted(list_elements, key=lambda e: (e.bbox.y0, e.bbox.x0))
|
||||
|
||||
# Group list items into lists based on proximity and level
|
||||
list_groups = []
|
||||
current_group = []
|
||||
prev_y = None
|
||||
prev_level = None
|
||||
max_gap = 30 # Maximum vertical gap between items in same list (in points)
|
||||
|
||||
for item in sorted_items:
|
||||
level = item.metadata.get('list_level', 0) if item.metadata else 0
|
||||
y_pos = item.bbox.y0
|
||||
|
||||
# Check if this item belongs to current group
|
||||
if current_group and prev_y is not None:
|
||||
gap = abs(y_pos - prev_y)
|
||||
# Start new group if gap is too large or level changed significantly
|
||||
if gap > max_gap or (prev_level is not None and level != prev_level):
|
||||
list_groups.append(current_group)
|
||||
current_group = []
|
||||
|
||||
current_group.append(item)
|
||||
prev_y = y_pos
|
||||
prev_level = level
|
||||
|
||||
if current_group:
|
||||
list_groups.append(current_group)
|
||||
|
||||
# Process each list group
|
||||
for group in list_groups:
|
||||
# Detect list type from first item
|
||||
first_item = group[0]
|
||||
text_content = first_item.get_text()
|
||||
text_stripped = text_content.lstrip()
|
||||
|
||||
list_type = None
|
||||
list_counter = 1
|
||||
|
||||
# Determine list type
|
||||
if re.match(r'^\d+[\.\)]\s', text_stripped):
|
||||
list_type = 'ordered'
|
||||
# Extract starting number
|
||||
match = re.match(r'^(\d+)[\.\)]\s', text_stripped)
|
||||
if match:
|
||||
list_counter = int(match.group(1))
|
||||
elif re.match(r'^[•·▪▫◦‣⁃]\s', text_stripped):
|
||||
list_type = 'unordered'
|
||||
|
||||
# Draw each item in the group
|
||||
for item in group:
|
||||
# Prepare list marker based on type
|
||||
if list_type == 'ordered':
|
||||
list_marker = f"{list_counter}. "
|
||||
list_counter += 1
|
||||
elif list_type == 'unordered':
|
||||
list_marker = "• "
|
||||
else:
|
||||
list_marker = "" # No marker if type unknown
|
||||
|
||||
# Store list marker in item metadata for _draw_text_element_direct
|
||||
if not item.metadata:
|
||||
item.metadata = {}
|
||||
item.metadata['_list_marker'] = list_marker
|
||||
item.metadata['_list_type'] = list_type
|
||||
|
||||
# Draw the list item using text element renderer
|
||||
self._draw_text_element_direct(pdf_canvas, item, page_height)
|
||||
|
||||
def _draw_text_element_direct(
|
||||
self,
|
||||
pdf_canvas: canvas.Canvas,
|
||||
@@ -1566,27 +1662,19 @@ class PDFGeneratorService:
|
||||
# Detect list items and extract list properties
|
||||
is_list_item = (element.type == ElementType.LIST_ITEM)
|
||||
list_level = element.metadata.get('list_level', 0) if element.metadata else 0
|
||||
list_type = None # 'ordered' or 'unordered'
|
||||
list_marker = "" # The bullet or number prefix
|
||||
|
||||
if is_list_item:
|
||||
# Determine list type based on text content
|
||||
# Get pre-computed list marker from metadata (set by _draw_list_elements_direct)
|
||||
list_marker = element.metadata.get('_list_marker', '') if element.metadata else ''
|
||||
list_type = element.metadata.get('_list_type') if element.metadata else None
|
||||
|
||||
# If no pre-computed marker, remove original marker from text
|
||||
if is_list_item and list_marker:
|
||||
# Remove original marker from text content
|
||||
text_stripped = text_content.lstrip()
|
||||
# Check for ordered list (starts with digit)
|
||||
if re.match(r'^\d+[\.\)]\s', text_stripped):
|
||||
list_type = 'ordered'
|
||||
# Extract the number
|
||||
match = re.match(r'^(\d+)[\.\)]\s', text_stripped)
|
||||
if match:
|
||||
list_marker = match.group(1) + ". "
|
||||
# Remove the marker from text content
|
||||
text_content = text_stripped[len(match.group(0)):]
|
||||
# Check for unordered list (starts with bullet)
|
||||
elif re.match(r'^[•·▪▫◦‣⁃]\s', text_stripped):
|
||||
list_type = 'unordered'
|
||||
list_marker = "• " # Use standard bullet
|
||||
# Remove the original marker from text content
|
||||
text_content = re.sub(r'^[•·▪▫◦‣⁃]\s', '', text_stripped)
|
||||
# Remove ordered list marker
|
||||
text_content = re.sub(r'^\d+[\.\)]\s', '', text_stripped)
|
||||
# Remove unordered list marker
|
||||
text_content = re.sub(r'^[•·▪▫◦‣⁃]\s', '', text_content)
|
||||
|
||||
# Get indentation from metadata (in points)
|
||||
indent = element.metadata.get('indent', 0) if element.metadata else 0
|
||||
|
||||
Reference in New Issue
Block a user