feat: implement Phase 3 list formatting for Direct track

Add comprehensive list rendering with automatic detection and formatting:

**Task 6.1: List Element Detection**
- Detect LIST_ITEM elements by type (element.type == ElementType.LIST_ITEM)
- Extract list_level from element metadata (lines 1566-1567)
- Determine list type via regex pattern matching:
  - Ordered lists: ^\d+[\.\)]\s (e.g., "1. ", "2) ")
  - Unordered lists: ^[•·▪▫◦‣⁃]\s (various bullet symbols)
- Parse and extract list markers from text content (lines 1571-1588)

**Task 6.2: List Rendering**
- Add list markers to first line of each item:
  - Ordered: Preserve the original number, with the delimiter normalized to ". " (e.g., "1. " stays "1. ", "2) " renders as "2. ")
  - Unordered: Standardize to bullet "• "
- Remove original markers from text content
- Apply list indentation: 20pt per nesting level (lines 1594-1598)
- Combine list indent with existing paragraph indent
- List spacing: Inherited from bbox-based layout (spacing_before/after)

**Implementation Details**
- Lines 1565-1598: List detection and indentation logic
- Lines 1629-1632: Prepend list marker to first line (rendered_line)
- Lines 1635-1676: Update all text width calculations to use rendered_line
- Lines 1688-1692: Enhanced logging with list type and level

**Technical Notes**
- Direct track only (OCR track has no list metadata)
- Integrates with existing alignment and indentation system
- Preserves line breaks and multi-line list items
- Works with all text alignment modes (left/center/right/justify)

**Modified Files**
- backend/app/services/pdf_generator_service.py
  - Added import re for regex pattern matching
  - Lines 1565-1598: List detection and indentation
  - Lines 1629-1676: List marker rendering
  - Lines 1688-1692: Enhanced debug logging
- openspec/changes/pdf-layout-restoration/tasks.md
  - Marked Task 6.1 (all subtasks) as completed
  - Marked Task 6.2 (all subtasks) as completed
  - Added implementation line references

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-24 09:54:15 +08:00
parent e1e97c54cf
commit ad879d48e5
2 changed files with 52 additions and 14 deletions
--- a/backend/app/services/pdf_generator_service.py
+++ b/backend/app/services/pdf_generator_service.py
@@ -5,6 +5,7 @@ Generates PDF files that preserve the original document layout using OCR JSON da

 import json
 import logging
+import re
 from pathlib import Path
 from typing import Dict, List, Optional, Tuple, Union
 from datetime import datetime
@@ -1562,10 +1563,41 @@ class PDFGeneratorService:
                font_name = self.font_name if self.font_registered else 'Helvetica'
                pdf_canvas.setFont(font_name, font_size)

+            # Detect list items and extract list properties
+            is_list_item = (element.type == ElementType.LIST_ITEM)
+            list_level = element.metadata.get('list_level', 0) if element.metadata else 0
+            list_type = None  # 'ordered' or 'unordered'
+            list_marker = ""  #  The bullet or number prefix
+
+            if is_list_item:
+                # Determine list type based on text content
+                text_stripped = text_content.lstrip()
+                # Check for ordered list (starts with digit)
+                if re.match(r'^\d+[\.\)]\s', text_stripped):
+                    list_type = 'ordered'
+                    # Extract the number
+                    match = re.match(r'^(\d+)[\.\)]\s', text_stripped)
+                    if match:
+                        list_marker = match.group(1) + ". "
+                        # Remove the marker from text content
+                        text_content = text_stripped[len(match.group(0)):]
+                # Check for unordered list (starts with bullet)
+                elif re.match(r'^[•·▪▫◦‣⁃]\s', text_stripped):
+                    list_type = 'unordered'
+                    list_marker = "• "  # Use standard bullet
+                    # Remove the original marker from text content
+                    text_content = re.sub(r'^[•·▪▫◦‣⁃]\s', '', text_stripped)
+
            # Get indentation from metadata (in points)
            indent = element.metadata.get('indent', 0) if element.metadata else 0
            first_line_indent = element.metadata.get('first_line_indent', indent) if element.metadata else indent

+            # Apply list indentation (20pt per level)
+            if is_list_item:
+                list_indent = list_level * 20  # 20pt per level
+                indent += list_indent
+                first_line_indent += list_indent
+
            # Get paragraph spacing
            # spacing_before: Applied by adjusting starting Y position (pdf_y)
            # spacing_after: Recorded for debugging; in Direct track with fixed bbox,
@@ -1595,8 +1627,13 @@ class PDFGeneratorService:
                # Calculate line indentation
                line_indent = first_line_indent if i == 0 else indent

+                # Prepend list marker to first line
+                rendered_line = line
+                if is_list_item and i == 0 and list_marker:
+                    rendered_line = list_marker + line
+
                # Calculate text width
-                text_width = pdf_canvas.stringWidth(line, font_name, current_font_size)
+                text_width = pdf_canvas.stringWidth(rendered_line, font_name, current_font_size)
                available_width = bbox_width - line_indent

                # Scale font if needed
@@ -1605,7 +1642,7 @@ class PDFGeneratorService:
                    scaled_size = current_font_size * scale_factor * 0.95
                    scaled_size = max(scaled_size, 3)
                    pdf_canvas.setFont(font_name, scaled_size)
-                    text_width = pdf_canvas.stringWidth(line, font_name, scaled_size)
+                    text_width = pdf_canvas.stringWidth(rendered_line, font_name, scaled_size)
                    current_font_size = scaled_size

                # Calculate X position based on alignment
@@ -1617,7 +1654,7 @@ class PDFGeneratorService:
                    line_x = pdf_x + bbox_width - text_width
                elif alignment == 'justify' and i < len(lines) - 1:
                    # Justify: distribute extra space between words (except last line)
-                    words = line.split()
+                    words = rendered_line.split()
                    if len(words) > 1:
                        total_word_width = sum(pdf_canvas.stringWidth(word, font_name, current_font_size) for word in words)
                        extra_space = available_width - total_word_width
@@ -1637,7 +1674,7 @@ class PDFGeneratorService:
                # else: left alignment uses line_x as-is

                # Draw the line at calculated position
-                pdf_canvas.drawString(line_x, line_y, line)
+                pdf_canvas.drawString(line_x, line_y, rendered_line)

                # Reset font size for next line
                if text_width > available_width:
@@ -1649,8 +1686,9 @@ class PDFGeneratorService:

            # Note: spacing_after is inherent in element positioning (bbox-based layout)
            # If text is shorter than bbox, the remaining space acts as implicit spacing
+            list_info = f", list={list_type}, level={list_level}" if is_list_item else ""
            logger.debug(f"Drew text element: {text_content[:30]}... "
-                        f"({len(lines)} lines, align={alignment}, indent={indent}, "
+                        f"({len(lines)} lines, align={alignment}, indent={indent}{list_info}, "
                        f"spacing_before={paragraph_spacing_before}, spacing_after={paragraph_spacing_after}, "
                        f"actual_height={actual_text_height:.1f}, bbox_bottom_margin={bbox_bottom_margin:.1f})")