From 1ec186f680829558e5d5aede1885911233c80d19 Mon Sep 17 00:00:00 2001 From: egg Date: Mon, 24 Nov 2025 09:59:00 +0800 Subject: [PATCH] fix: properly implement list formatting with sequential numbering and grouping MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix critical issues in Task 6 list formatting implementation: **Issue 1: LIST_ITEM Elements Not Rendered** - Problem: LIST_ITEM type not included in is_text property - Fix: Separate list_elements from text_elements (lines 626, 636-637) - Impact: List items were completely ignored in rendering **Issue 2: Missing Sequential Numbering** - Problem: Each list item independently parsed its own number - Fix: Implement _draw_list_elements_direct method (lines 1523-1610) - Groups list items by proximity (max_gap=30pt) and level - Maintains list_counter across items for sequential numbering - Starts from original number in first item **Issue 3: Unreliable List Type Detection** - Problem: Regex-based detection per item, not per list - Fix: Detect type from first item in group, apply to all items - Store computed marker in metadata (_list_marker, _list_type) - Ensures consistency across entire list **Issue 4: Insufficient List Spacing Control** - Problem: No grouping logic, relied solely on bbox positions - Fix: Proximity-based grouping with 30pt max gap threshold - Groups consecutive items into lists - Separates lists when gap exceeds threshold or level changes **Technical Implementation** New method: _draw_list_elements_direct (lines 1523-1610) - Sort items by position (y0, x0) - Group by proximity and level - Detect list type from first item - Assign sequential markers - Store in metadata for _draw_text_element_direct Updated: _draw_text_element_direct (lines 1662-1677) - Use pre-computed _list_marker from metadata - Simplified marker removal (just clean original markers) - No longer needs to maintain counter per-item Updated: _generate_direct_track_pdf (lines 622-663) - Separate list_elements collection - Call _draw_list_elements_direct before text rendering - Updated logging to show list item count **Modified Files** - backend/app/services/pdf_generator_service.py - Lines 626, 636-637: Separate list_elements - Lines 644-646: Updated logging - Lines 658-659: Add list rendering layer - Lines 1523-1610: New _draw_list_elements_direct method - Lines 1662-1677: Simplified list detection in _draw_text_element_direct - openspec/changes/pdf-layout-restoration/tasks.md - Updated Task 6.1 subtasks with accurate implementation details - Updated Task 6.2 subtasks with grouping and numbering logic 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- backend/app/services/pdf_generator_service.py | 132 +++++++++++++++--- .../changes/pdf-layout-restoration/tasks.md | 15 +- 2 files changed, 119 insertions(+), 28 deletions(-) diff --git a/backend/app/services/pdf_generator_service.py b/backend/app/services/pdf_generator_service.py index 14d9a10..22884f3 100644 --- a/backend/app/services/pdf_generator_service.py +++ b/backend/app/services/pdf_generator_service.py @@ -623,6 +623,7 @@ class PDFGeneratorService: text_elements = [] table_elements = [] image_elements = [] + list_elements = [] for element in page.elements: if element.type == ElementType.TABLE: @@ -632,6 +633,8 @@ class PDFGeneratorService: ElementType.CHART, ElementType.DIAGRAM ]: image_elements.append(element) + elif element.type == ElementType.LIST_ITEM: + list_elements.append(element) elif element.is_text or element.type in [ ElementType.TEXT, ElementType.TITLE, ElementType.HEADER, ElementType.FOOTER, ElementType.PARAGRAPH @@ -639,9 +642,10 @@ class PDFGeneratorService: text_elements.append(element) logger.info(f"Page {page_idx + 1}: {len(text_elements)} text, " - f"{len(table_elements)} tables, {len(image_elements)} images") + f"{len(table_elements)} tables, {len(image_elements)} images, " + f"{len(list_elements)} list items") - # Draw in layers: images → tables → text + # Draw in layers: images → tables → lists → text # 1. Draw images for img_elem in image_elements: @@ -651,7 +655,10 @@ class PDFGeneratorService: for table_elem in table_elements: self._draw_table_element_direct(pdf_canvas, table_elem, page_height) - # 3. Draw text with line breaks and styling + # 3. Draw lists with sequential numbering + self._draw_list_elements_direct(pdf_canvas, list_elements, page_height) + + # 4. Draw text with line breaks and styling for text_elem in text_elements: self._draw_text_element_direct(pdf_canvas, text_elem, page_height) @@ -1513,6 +1520,95 @@ class PDFGeneratorService: traceback.print_exc() return False + def _draw_list_elements_direct( + self, + pdf_canvas: canvas.Canvas, + list_elements: List['DocumentElement'], + page_height: float + ): + """ + Draw list elements with proper sequential numbering and formatting. + + This method processes all list items on a page, groups them into lists, + and assigns proper sequential numbering to ordered lists. + + Args: + pdf_canvas: ReportLab canvas object + list_elements: List of LIST_ITEM elements + page_height: Page height for coordinate transformation + """ + if not list_elements: + return + + # Sort list items by position (top to bottom, left to right) + sorted_items = sorted(list_elements, key=lambda e: (e.bbox.y0, e.bbox.x0)) + + # Group list items into lists based on proximity and level + list_groups = [] + current_group = [] + prev_y = None + prev_level = None + max_gap = 30 # Maximum vertical gap between items in same list (in points) + + for item in sorted_items: + level = item.metadata.get('list_level', 0) if item.metadata else 0 + y_pos = item.bbox.y0 + + # Check if this item belongs to current group + if current_group and prev_y is not None: + gap = abs(y_pos - prev_y) + # Start new group if gap is too large or level changed significantly + if gap > max_gap or (prev_level is not None and level != prev_level): + list_groups.append(current_group) + current_group = [] + + current_group.append(item) + prev_y = y_pos + prev_level = level + + if current_group: + list_groups.append(current_group) + + # Process each list group + for group in list_groups: + # Detect list type from first item + first_item = group[0] + text_content = first_item.get_text() + text_stripped = text_content.lstrip() + + list_type = None + list_counter = 1 + + # Determine list type + if re.match(r'^\d+[\.\)]\s', text_stripped): + list_type = 'ordered' + # Extract starting number + match = re.match(r'^(\d+)[\.\)]\s', text_stripped) + if match: + list_counter = int(match.group(1)) + elif re.match(r'^[•·▪▫◦‣⁃]\s', text_stripped): + list_type = 'unordered' + + # Draw each item in the group + for item in group: + # Prepare list marker based on type + if list_type == 'ordered': + list_marker = f"{list_counter}. " + list_counter += 1 + elif list_type == 'unordered': + list_marker = "• " + else: + list_marker = "" # No marker if type unknown + + # Store list marker in item metadata for _draw_text_element_direct + if not item.metadata: + item.metadata = {} + item.metadata['_list_marker'] = list_marker + item.metadata['_list_type'] = list_type + + # Draw the list item using text element renderer + self._draw_text_element_direct(pdf_canvas, item, page_height) + def _draw_text_element_direct( self, pdf_canvas: canvas.Canvas, @@ -1566,27 +1662,19 @@ class PDFGeneratorService: # Detect list items and extract list properties is_list_item = (element.type == ElementType.LIST_ITEM) list_level = element.metadata.get('list_level', 0) if element.metadata else 0 - list_type = None # 'ordered' or 'unordered' - list_marker = "" # The bullet or number prefix - if is_list_item: - # Determine list type based on text content + # Get pre-computed list marker from metadata (set by _draw_list_elements_direct) + list_marker = element.metadata.get('_list_marker', '') if element.metadata else '' + list_type = element.metadata.get('_list_type') if element.metadata else None + + # If no pre-computed marker, remove original marker from text + if is_list_item and list_marker: + # Remove original marker from text content text_stripped = text_content.lstrip() - # Check for ordered list (starts with digit) - if re.match(r'^\d+[\.\)]\s', text_stripped): - list_type = 'ordered' - # Extract the number - match = re.match(r'^(\d+)[\.\)]\s', text_stripped) - if match: - list_marker = match.group(1) + ". " - # Remove the marker from text content - text_content = text_stripped[len(match.group(0)):] - # Check for unordered list (starts with bullet) - elif re.match(r'^[•·▪▫◦‣⁃]\s', text_stripped): - list_type = 'unordered' - list_marker = "• " # Use standard bullet - # Remove the original marker from text content - text_content = re.sub(r'^[•·▪▫◦‣⁃]\s', '', text_stripped) + # Remove ordered list marker + text_content = re.sub(r'^\d+[\.\)]\s', '', text_stripped) + # Remove unordered list marker + text_content = re.sub(r'^[•·▪▫◦‣⁃]\s', '', text_content) # Get indentation from metadata (in points) indent = element.metadata.get('indent', 0) if element.metadata else 0 diff --git a/openspec/changes/pdf-layout-restoration/tasks.md b/openspec/changes/pdf-layout-restoration/tasks.md index 150f78e..6502284 100644 --- a/openspec/changes/pdf-layout-restoration/tasks.md +++ b/openspec/changes/pdf-layout-restoration/tasks.md @@ -99,13 +99,16 @@ ### 6. List Formatting (Direct track only) - [x] 6.1 Detect list elements from Direct track - - [x] 6.1.1 Identify LIST_ITEM elements (element.type == ElementType.LIST_ITEM) - - [x] 6.1.2 Determine list type via regex (ordered: ^\d+[\.\)], unordered: ^[•·▪▫◦‣⁃]) - - [x] 6.1.3 Extract indent level from metadata (list_level, lines 1567-1598) + - [x] 6.1.1 Identify LIST_ITEM elements (separate from text_elements, lines 636-637) + - [x] 6.1.2 Group list items by proximity and level (_draw_list_elements_direct, lines 1543-1570) + - [x] 6.1.3 Determine list type via regex on first item (ordered/unordered, lines 1582-1590) + - [x] 6.1.4 Extract indent level from metadata (list_level) - [x] 6.2 Render lists with proper formatting - - [x] 6.2.1 Add bullets/numbers as list markers (lines 1571-1588, prepended to first line) - - [x] 6.2.2 Apply indentation (20pt per level, lines 1594-1598) - - [x] 6.2.3 Maintain list spacing (inherent in bbox-based layout, spacing_before/after) + - [x] 6.2.1 Sequential numbering across list items (list_counter, lines 1593-1602) + - [x] 6.2.2 Add bullets/numbers as list markers (stored in _list_marker metadata, lines 1603-1607) + - [x] 6.2.3 Apply indentation (20pt per level, lines 1683-1687) + - [x] 6.2.4 Remove original markers from text content (lines 1671-1677) + - [x] 6.2.5 Maintain list spacing via proximity-based grouping (max_gap=30pt, lines 1551-1563) ### 7. Span-Level Rendering (Advanced) - [ ] 7.1 Extract span information from Direct track