From 1ec186f680829558e5d5aede1885911233c80d19 Mon Sep 17 00:00:00 2001
From: egg <lin4637lin4637@gmail.com>
Date: Mon, 24 Nov 2025 09:59:00 +0800
Subject: [PATCH] fix: properly implement list formatting with sequential
 numbering and grouping
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fix critical issues in Task 6 list formatting implementation:

**Issue 1: LIST_ITEM Elements Not Rendered**
- Problem: LIST_ITEM type not included in is_text property
- Fix: Separate list_elements from text_elements (lines 626, 636-637)
- Impact: List items were completely ignored in rendering

**Issue 2: Missing Sequential Numbering**
- Problem: Each list item independently parsed its own number
- Fix: Implement _draw_list_elements_direct method (lines 1523-1610)
- Groups list items by proximity (max_gap=30pt) and level
- Maintains list_counter across items for sequential numbering
- Starts from original number in first item

**Issue 3: Unreliable List Type Detection**
- Problem: Regex-based detection per item, not per list
- Fix: Detect type from first item in group, apply to all items
- Store computed marker in metadata (_list_marker, _list_type)
- Ensures consistency across entire list

**Issue 4: Insufficient List Spacing Control**
- Problem: No grouping logic, relied solely on bbox positions
- Fix: Proximity-based grouping with 30pt max gap threshold
- Groups consecutive items into lists
- Separates lists when gap exceeds threshold or level changes

**Technical Implementation**

New method: _draw_list_elements_direct (lines 1523-1610)
- Sort items by position (y0, x0)
- Group by proximity and level
- Detect list type from first item
- Assign sequential markers
- Store in metadata for _draw_text_element_direct

Updated: _draw_text_element_direct (lines 1662-1677)
- Use pre-computed _list_marker from metadata
- Simplified marker removal (just clean original markers)
- No longer needs to maintain counter per-item

Updated: _generate_direct_track_pdf (lines 622-663)
- Separate list_elements collection
- Call _draw_list_elements_direct before text rendering
- Updated logging to show list item count

**Modified Files**
- backend/app/services/pdf_generator_service.py
  - Lines 626, 636-637: Separate list_elements
  - Lines 644-646: Updated logging
  - Lines 658-659: Add list rendering layer
  - Lines 1523-1610: New _draw_list_elements_direct method
  - Lines 1662-1677: Simplified list detection in _draw_text_element_direct
- openspec/changes/pdf-layout-restoration/tasks.md
  - Updated Task 6.1 subtasks with accurate implementation details
  - Updated Task 6.2 subtasks with grouping and numbering logic

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
---
 backend/app/services/pdf_generator_service.py | 132 +++++++++++++++---
 .../changes/pdf-layout-restoration/tasks.md   |  15 +-
 2 files changed, 119 insertions(+), 28 deletions(-)

diff --git a/backend/app/services/pdf_generator_service.py b/backend/app/services/pdf_generator_service.py
index 14d9a10..22884f3 100644
--- a/backend/app/services/pdf_generator_service.py
+++ b/backend/app/services/pdf_generator_service.py
@@ -623,6 +623,7 @@ class PDFGeneratorService:
                 text_elements = []
                 table_elements = []
                 image_elements = []
+                list_elements = []
 
                 for element in page.elements:
                     if element.type == ElementType.TABLE:
@@ -632,6 +633,8 @@ class PDFGeneratorService:
                         ElementType.CHART, ElementType.DIAGRAM
                     ]:
                         image_elements.append(element)
+                    elif element.type == ElementType.LIST_ITEM:
+                        list_elements.append(element)
                     elif element.is_text or element.type in [
                         ElementType.TEXT, ElementType.TITLE, ElementType.HEADER,
                         ElementType.FOOTER, ElementType.PARAGRAPH
@@ -639,9 +642,10 @@ class PDFGeneratorService:
                         text_elements.append(element)
 
                 logger.info(f"Page {page_idx + 1}: {len(text_elements)} text, "
-                           f"{len(table_elements)} tables, {len(image_elements)} images")
+                           f"{len(table_elements)} tables, {len(image_elements)} images, "
+                           f"{len(list_elements)} list items")
 
-                # Draw in layers: images → tables → text
+                # Draw in layers: images → tables → lists → text
 
                 # 1. Draw images
                 for img_elem in image_elements:
@@ -651,7 +655,10 @@ class PDFGeneratorService:
                 for table_elem in table_elements:
                     self._draw_table_element_direct(pdf_canvas, table_elem, page_height)
 
-                # 3. Draw text with line breaks and styling
+                # 3. Draw lists with sequential numbering
+                self._draw_list_elements_direct(pdf_canvas, list_elements, page_height)
+
+                # 4. Draw text with line breaks and styling
                 for text_elem in text_elements:
                     self._draw_text_element_direct(pdf_canvas, text_elem, page_height)
 
@@ -1513,6 +1520,95 @@ class PDFGeneratorService:
             traceback.print_exc()
             return False
 
+    def _draw_list_elements_direct(
+        self,
+        pdf_canvas: canvas.Canvas,
+        list_elements: List['DocumentElement'],
+        page_height: float
+    ):
+        """
+        Draw list elements with proper sequential numbering and formatting.
+
+        This method processes all list items on a page, groups them into lists,
+        and assigns proper sequential numbering to ordered lists.
+
+        Args:
+            pdf_canvas: ReportLab canvas object
+            list_elements: List of LIST_ITEM elements
+            page_height: Page height for coordinate transformation
+        """
+        if not list_elements:
+            return
+
+        # Sort list items by position (top to bottom, left to right)
+        sorted_items = sorted(list_elements, key=lambda e: (e.bbox.y0, e.bbox.x0))
+
+        # Group list items into lists based on proximity and level
+        list_groups = []
+        current_group = []
+        prev_y = None
+        prev_level = None
+        max_gap = 30  # Maximum vertical gap between items in same list (in points)
+
+        for item in sorted_items:
+            level = item.metadata.get('list_level', 0) if item.metadata else 0
+            y_pos = item.bbox.y0
+
+            # Check if this item belongs to current group
+            if current_group and prev_y is not None:
+                gap = abs(y_pos - prev_y)
+                # Start new group if gap is too large or level changed significantly
+                if gap > max_gap or (prev_level is not None and level != prev_level):
+                    list_groups.append(current_group)
+                    current_group = []
+
+            current_group.append(item)
+            prev_y = y_pos
+            prev_level = level
+
+        if current_group:
+            list_groups.append(current_group)
+
+        # Process each list group
+        for group in list_groups:
+            # Detect list type from first item
+            first_item = group[0]
+            text_content = first_item.get_text()
+            text_stripped = text_content.lstrip()
+
+            list_type = None
+            list_counter = 1
+
+            # Determine list type
+            if re.match(r'^\d+[\.\)]\s', text_stripped):
+                list_type = 'ordered'
+                # Extract starting number
+                match = re.match(r'^(\d+)[\.\)]\s', text_stripped)
+                if match:
+                    list_counter = int(match.group(1))
+            elif re.match(r'^[•·▪▫◦‣⁃]\s', text_stripped):
+                list_type = 'unordered'
+
+            # Draw each item in the group
+            for item in group:
+                # Prepare list marker based on type
+                if list_type == 'ordered':
+                    list_marker = f"{list_counter}. "
+                    list_counter += 1
+                elif list_type == 'unordered':
+                    list_marker = "• "
+                else:
+                    list_marker = ""  # No marker if type unknown
+
+                # Store list marker in item metadata for _draw_text_element_direct
+                if not item.metadata:
+                    item.metadata = {}
+                item.metadata['_list_marker'] = list_marker
+                item.metadata['_list_type'] = list_type
+
+                # Draw the list item using text element renderer
+                self._draw_text_element_direct(pdf_canvas, item, page_height)
+
     def _draw_text_element_direct(
         self,
         pdf_canvas: canvas.Canvas,
@@ -1566,27 +1662,19 @@ class PDFGeneratorService:
             # Detect list items and extract list properties
             is_list_item = (element.type == ElementType.LIST_ITEM)
             list_level = element.metadata.get('list_level', 0) if element.metadata else 0
-            list_type = None  # 'ordered' or 'unordered'
-            list_marker = ""  #  The bullet or number prefix
 
-            if is_list_item:
-                # Determine list type based on text content
+            # Get pre-computed list marker from metadata (set by _draw_list_elements_direct)
+            list_marker = element.metadata.get('_list_marker', '') if element.metadata else ''
+            list_type = element.metadata.get('_list_type') if element.metadata else None
+
+            # If no pre-computed marker, remove original marker from text
+            if is_list_item and list_marker:
+                # Remove original marker from text content
                 text_stripped = text_content.lstrip()
-                # Check for ordered list (starts with digit)
-                if re.match(r'^\d+[\.\)]\s', text_stripped):
-                    list_type = 'ordered'
-                    # Extract the number
-                    match = re.match(r'^(\d+)[\.\)]\s', text_stripped)
-                    if match:
-                        list_marker = match.group(1) + ". "
-                        # Remove the marker from text content
-                        text_content = text_stripped[len(match.group(0)):]
-                # Check for unordered list (starts with bullet)
-                elif re.match(r'^[•·▪▫◦‣⁃]\s', text_stripped):
-                    list_type = 'unordered'
-                    list_marker = "• "  # Use standard bullet
-                    # Remove the original marker from text content
-                    text_content = re.sub(r'^[•·▪▫◦‣⁃]\s', '', text_stripped)
+                # Remove ordered list marker
+                text_content = re.sub(r'^\d+[\.\)]\s', '', text_stripped)
+                # Remove unordered list marker
+                text_content = re.sub(r'^[•·▪▫◦‣⁃]\s', '', text_content)
 
             # Get indentation from metadata (in points)
             indent = element.metadata.get('indent', 0) if element.metadata else 0
diff --git a/openspec/changes/pdf-layout-restoration/tasks.md b/openspec/changes/pdf-layout-restoration/tasks.md
index 150f78e..6502284 100644
--- a/openspec/changes/pdf-layout-restoration/tasks.md
+++ b/openspec/changes/pdf-layout-restoration/tasks.md
@@ -99,13 +99,16 @@
 
 ### 6. List Formatting (Direct track only)
 - [x] 6.1 Detect list elements from Direct track
-  - [x] 6.1.1 Identify LIST_ITEM elements (element.type == ElementType.LIST_ITEM)
-  - [x] 6.1.2 Determine list type via regex (ordered: ^\d+[\.\)], unordered: ^[•·▪▫◦‣⁃])
-  - [x] 6.1.3 Extract indent level from metadata (list_level, lines 1567-1598)
+  - [x] 6.1.1 Identify LIST_ITEM elements (separate from text_elements, lines 636-637)
+  - [x] 6.1.2 Group list items by proximity and level (_draw_list_elements_direct, lines 1543-1570)
+  - [x] 6.1.3 Determine list type via regex on first item (ordered/unordered, lines 1582-1590)
+  - [x] 6.1.4 Extract indent level from metadata (list_level)
 - [x] 6.2 Render lists with proper formatting
-  - [x] 6.2.1 Add bullets/numbers as list markers (lines 1571-1588, prepended to first line)
-  - [x] 6.2.2 Apply indentation (20pt per level, lines 1594-1598)
-  - [x] 6.2.3 Maintain list spacing (inherent in bbox-based layout, spacing_before/after)
+  - [x] 6.2.1 Sequential numbering across list items (list_counter, lines 1593-1602)
+  - [x] 6.2.2 Add bullets/numbers as list markers (stored in _list_marker metadata, lines 1603-1607)
+  - [x] 6.2.3 Apply indentation (20pt per level, lines 1683-1687)
+  - [x] 6.2.4 Remove original markers from text content (lines 1671-1677)
+  - [x] 6.2.5 Maintain list spacing via proximity-based grouping (max_gap=30pt, lines 1551-1563)
 
 ### 7. Span-Level Rendering (Advanced)
 - [ ] 7.1 Extract span information from Direct track