feat: complete Task 6 list formatting with fallback detection and spacing

Implemented all missing list formatting features for Direct track:

1. Fallback List Detection (_is_list_item_fallback):
   - Check metadata for list_level, parent_item, children fields
   - Pattern matching on leading markers followed by whitespace: ordered (^\d+[\.\)]\s) and unordered (^[•·▪▫◦‣⁃\-\*]\s) lists
   - Auto-mark elements as LIST_ITEM if detected

2. Multi-line List Item Alignment:
   - Calculate list marker width before rendering
   - Add marker_width to subsequent line indentation (i > 0)
   - Ensures text after marker aligns properly across lines

3. Dedicated List Item Spacing:
   - Default 3pt spacing_after for list items
   - Applied by expanding bbox_height for visual spacing
   - Marked with _apply_spacing_after flag for tracking

Updated tasks.md with accurate implementation details and line numbers.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
egg
2025-11-24 11:17:28 +08:00
parent 1ec186f680
commit 1ac8e82f47
2 changed files with 99 additions and 15 deletions

View File

@@ -635,6 +635,11 @@ class PDFGeneratorService:
image_elements.append(element) image_elements.append(element)
elif element.type == ElementType.LIST_ITEM: elif element.type == ElementType.LIST_ITEM:
list_elements.append(element) list_elements.append(element)
elif self._is_list_item_fallback(element):
# Fallback detection: Check metadata and text patterns
list_elements.append(element)
# Mark as list item for downstream processing
element.type = ElementType.LIST_ITEM
elif element.is_text or element.type in [ elif element.is_text or element.type in [
ElementType.TEXT, ElementType.TITLE, ElementType.HEADER, ElementType.TEXT, ElementType.TITLE, ElementType.HEADER,
ElementType.FOOTER, ElementType.PARAGRAPH ElementType.FOOTER, ElementType.PARAGRAPH
@@ -1520,6 +1525,47 @@ class PDFGeneratorService:
traceback.print_exc() traceback.print_exc()
return False return False
def _is_list_item_fallback(self, element: 'DocumentElement') -> bool:
    """
    Fallback detection for list items not marked with ElementType.LIST_ITEM.

    Two signals are consulted, in order:

    1. Metadata: the presence of a ``list_level`` or ``parent_item`` key,
       or a non-empty ``children`` entry.
    2. Text pattern: the text (leading whitespace ignored) starts with an
       ordered marker (``1.`` / ``1)``) or a bullet character, followed
       by whitespace.

    Args:
        element: Document element to check

    Returns:
        True if element appears to be a list item
    """
    # Tables and graphical elements are never reclassified as list items.
    if element.type in (ElementType.TABLE, ElementType.IMAGE, ElementType.FIGURE,
                        ElementType.CHART, ElementType.DIAGRAM):
        return False

    # Explicit list metadata is the strongest signal and applies to any
    # remaining element type.
    metadata = element.metadata
    if metadata:
        if 'list_level' in metadata or 'parent_item' in metadata:
            return True
        # A 'children' key only counts when it actually holds children.
        if metadata.get('children'):
            return True

    # The text heuristic below is deliberately skipped for structural text:
    # numbered headings ("1. Introduction") and page-number footers
    # ("- 3 -") match the marker patterns but are not list items, and the
    # caller retypes any element for which this method returns True.
    if element.type in (ElementType.TITLE, ElementType.HEADER, ElementType.FOOTER):
        return False

    if not element.is_text:
        return False

    # Guard against a missing text payload before stripping.
    text = (element.get_text() or '').lstrip()

    # Ordered list pattern: starts with number followed by . or )
    if re.match(r'^\d+[\.\)]\s', text):
        return True

    # Unordered list pattern: starts with bullet character
    return re.match(r'^[•·▪▫◦‣⁃\-\*]\s', text) is not None
def _draw_list_elements_direct( def _draw_list_elements_direct(
self, self,
pdf_canvas: canvas.Canvas, pdf_canvas: canvas.Canvas,
@@ -1589,8 +1635,8 @@ class PDFGeneratorService:
elif re.match(r'^[•·▪▫◦‣⁃]\s', text_stripped): elif re.match(r'^[•·▪▫◦‣⁃]\s', text_stripped):
list_type = 'unordered' list_type = 'unordered'
# Draw each item in the group # Draw each item in the group with proper spacing
for item in group: for item_idx, item in enumerate(group):
# Prepare list marker based on type # Prepare list marker based on type
if list_type == 'ordered': if list_type == 'ordered':
list_marker = f"{list_counter}. " list_marker = f"{list_counter}. "
@@ -1606,6 +1652,15 @@ class PDFGeneratorService:
item.metadata['_list_marker'] = list_marker item.metadata['_list_marker'] = list_marker
item.metadata['_list_type'] = list_type item.metadata['_list_type'] = list_type
# Add default list item spacing if not specified
# This ensures consistent spacing between list items
if 'spacing_after' not in item.metadata or item.metadata.get('spacing_after', 0) == 0:
# Default list item spacing: 3 points between items
item.metadata['spacing_after'] = 3.0
# Mark this as requiring spacing application
item.metadata['_apply_spacing_after'] = True
# Draw the list item using text element renderer # Draw the list item using text element renderer
self._draw_text_element_direct(pdf_canvas, item, page_height) self._draw_text_element_direct(pdf_canvas, item, page_height)
@@ -1688,18 +1743,31 @@ class PDFGeneratorService:
# Get paragraph spacing # Get paragraph spacing
# spacing_before: Applied by adjusting starting Y position (pdf_y) # spacing_before: Applied by adjusting starting Y position (pdf_y)
# spacing_after: Recorded for debugging; in Direct track with fixed bbox, # spacing_after: Applied for list items marked with _apply_spacing_after
# actual spacing is already reflected in element positions
paragraph_spacing_before = element.metadata.get('spacing_before', 0) if element.metadata else 0 paragraph_spacing_before = element.metadata.get('spacing_before', 0) if element.metadata else 0
paragraph_spacing_after = element.metadata.get('spacing_after', 0) if element.metadata else 0 paragraph_spacing_after = element.metadata.get('spacing_after', 0) if element.metadata else 0
apply_spacing_after = element.metadata.get('_apply_spacing_after', False) if element.metadata else False
# Handle line breaks # Handle line breaks
lines = text_content.split('\n') lines = text_content.split('\n')
line_height = font_size * 1.2 # 120% of font size line_height = font_size * 1.2 # 120% of font size
# Calculate list marker width for multi-line alignment
marker_width = 0
if is_list_item and list_marker:
# Use current font to calculate marker width
marker_width = pdf_canvas.stringWidth(list_marker, pdf_canvas._fontname, font_size)
# Apply paragraph spacing before (shift starting position up) # Apply paragraph spacing before (shift starting position up)
pdf_y += paragraph_spacing_before pdf_y += paragraph_spacing_before
# Apply list item spacing after by expanding bbox height
# This creates visual space between list items
if apply_spacing_after and paragraph_spacing_after > 0:
# Adjust bbox to include spacing_after
# This is done by conceptually expanding the element's vertical space
bbox_height += paragraph_spacing_after
# Draw each line with alignment # Draw each line with alignment
for i, line in enumerate(lines): for i, line in enumerate(lines):
if not line.strip(): if not line.strip():
@@ -1715,6 +1783,10 @@ class PDFGeneratorService:
# Calculate line indentation # Calculate line indentation
line_indent = first_line_indent if i == 0 else indent line_indent = first_line_indent if i == 0 else indent
# For list items: align subsequent lines with text after marker
if is_list_item and i > 0 and marker_width > 0:
line_indent += marker_width
# Prepend list marker to first line # Prepend list marker to first line
rendered_line = line rendered_line = line
if is_list_item and i == 0 and list_marker: if is_list_item and i == 0 and list_marker:
@@ -1772,11 +1844,12 @@ class PDFGeneratorService:
actual_text_height = len(lines) * line_height actual_text_height = len(lines) * line_height
bbox_bottom_margin = bbox_height - actual_text_height - paragraph_spacing_before bbox_bottom_margin = bbox_height - actual_text_height - paragraph_spacing_before
# Note: spacing_after is inherent in element positioning (bbox-based layout) # Note: For list items with _apply_spacing_after, spacing_after is added to bbox_height
# If text is shorter than bbox, the remaining space acts as implicit spacing # For other elements, spacing is inherent in element positioning (bbox-based layout)
list_info = f", list={list_type}, level={list_level}" if is_list_item else "" list_info = f", list={list_type}, level={list_level}" if is_list_item else ""
spacing_applied = f", spacing_after_applied={apply_spacing_after}" if is_list_item else ""
logger.debug(f"Drew text element: {text_content[:30]}... " logger.debug(f"Drew text element: {text_content[:30]}... "
f"({len(lines)} lines, align={alignment}, indent={indent}{list_info}, " f"({len(lines)} lines, align={alignment}, indent={indent}{list_info}{spacing_applied}, "
f"spacing_before={paragraph_spacing_before}, spacing_after={paragraph_spacing_after}, " f"spacing_before={paragraph_spacing_before}, spacing_after={paragraph_spacing_after}, "
f"actual_height={actual_text_height:.1f}, bbox_bottom_margin={bbox_bottom_margin:.1f})") f"actual_height={actual_text_height:.1f}, bbox_bottom_margin={bbox_bottom_margin:.1f})")

View File

@@ -100,15 +100,26 @@
### 6. List Formatting (Direct track only) ### 6. List Formatting (Direct track only)
- [x] 6.1 Detect list elements from Direct track - [x] 6.1 Detect list elements from Direct track
- [x] 6.1.1 Identify LIST_ITEM elements (separate from text_elements, lines 636-637) - [x] 6.1.1 Identify LIST_ITEM elements (separate from text_elements, lines 636-637)
- [x] 6.1.2 Group list items by proximity and level (_draw_list_elements_direct, lines 1543-1570) - [x] 6.1.2 Fallback detection via metadata and text patterns (_is_list_item_fallback, lines 1528-1567)
- [x] 6.1.3 Determine list type via regex on first item (ordered/unordered, lines 1582-1590) - [x] Check metadata for list_level, parent_item, children fields
- [x] 6.1.4 Extract indent level from metadata (list_level) - [x] Pattern matching for ordered lists (^\d+[\.\)]) and unordered (^[•·▪▫◦‣⁃\-\*])
- [x] Auto-mark as LIST_ITEM if detected (lines 638-642)
- [x] 6.1.3 Group list items by proximity and level (_draw_list_elements_direct, lines 1589-1610)
- [x] 6.1.4 Determine list type via regex on first item (ordered/unordered, lines 1628-1636)
- [x] 6.1.5 Extract indent level from metadata (list_level)
- [x] 6.2 Render lists with proper formatting - [x] 6.2 Render lists with proper formatting
- [x] 6.2.1 Sequential numbering across list items (list_counter, lines 1593-1602) - [x] 6.2.1 Sequential numbering across list items (list_counter, lines 1639-1665)
- [x] 6.2.2 Add bullets/numbers as list markers (stored in _list_marker metadata, lines 1603-1607) - [x] 6.2.2 Add bullets/numbers as list markers (stored in _list_marker metadata, lines 1649-1653)
- [x] 6.2.3 Apply indentation (20pt per level, lines 1683-1687) - [x] 6.2.3 Apply indentation (20pt per level, lines 1738-1742)
- [x] 6.2.4 Remove original markers from text content (lines 1671-1677) - [x] 6.2.4 Multi-line list item alignment (marker_width calculation, lines 1755-1772)
- [x] 6.2.5 Maintain list spacing via proximity-based grouping (max_gap=30pt, lines 1551-1563) - [x] Calculate marker width before rendering (line 1758)
- [x] Add marker_width to subsequent line indentation (lines 1770-1772)
- [x] 6.2.5 Remove original markers from text content (lines 1716-1723)
- [x] 6.2.6 Dedicated list item spacing (lines 1655-1662, 1764-1769)
- [x] Default 3pt spacing_after for list items
- [x] Applied by expanding bbox_height (line 1769)
- [x] Marked with _apply_spacing_after flag
- [x] 6.2.7 Maintain list grouping via proximity (max_gap=30pt, lines 1597-1607)
### 7. Span-Level Rendering (Advanced) ### 7. Span-Level Rendering (Advanced)
- [ ] 7.1 Extract span information from Direct track - [ ] 7.1 Extract span information from Direct track