feat: implement Task 7 span-level rendering for inline styling

Added support for preserving and rendering inline style variations
within text elements (e.g., bold/italic/color changes mid-line).

Span Extraction (direct_extraction_engine.py):
1. Parse PyMuPDF span data with font, size, flags, color per span
2. Create DocumentElement children for each span with StyleInfo
3. Store spans in element.children for downstream rendering
4. Extract span-specific bbox from PyMuPDF (lines 434-453)

Span Rendering (pdf_generator_service.py):
1. Implement _draw_text_with_spans() method (lines 1685-1734)
   - Iterate through span children
   - Apply per-span styling via _apply_text_style
   - Track X position and calculate widths
   - Return total rendered width
2. Integrate in _draw_text_element_direct() (lines 1822-1823, 1905-1914)
   - Check for element.children (has_spans flag)
   - Use span rendering for first line
   - Fall back to normal rendering for list items
3. Add span count to debug logging

Features:
- Inline font changes (Arial → Times → Courier)
- Inline size changes (12pt → 14pt → 10pt)
- Inline style changes (normal → bold → italic)
- Inline color changes (black → red → blue)

Limitations (future work):
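- Currently all spans are drawn on the element's first line only; for elements with spans, subsequent lines are skipped rather than drawn separately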
- Multi-line span support requires line breaking logic
- List items use single-style rendering (compatibility)

Direct track only (OCR track has no span information).

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
egg
2025-11-24 11:44:05 +08:00
parent b1de7616e4
commit 75c194fe2a
3 changed files with 113 additions and 13 deletions

View File

@@ -1682,6 +1682,57 @@ class PDFGeneratorService:
logger.debug(f"Adding {additional_spacing:.1f}pt spacing after list item {item.element_id} "
f"(actual_gap={actual_gap:.1f}pt, desired={desired_spacing_after:.1f}pt)")
def _draw_text_with_spans(
self,
pdf_canvas: canvas.Canvas,
spans: List['DocumentElement'],
line_x: float,
line_y: float,
default_font_size: float
) -> float:
"""
Draw text with inline span styling (mixed styles within a line).
Args:
pdf_canvas: ReportLab canvas object
spans: List of span DocumentElements
line_x: Starting X position
line_y: Y position
default_font_size: Default font size if span has none
Returns:
Total width of drawn text
"""
x_pos = line_x
total_width = 0
for span in spans:
span_text = span.get_text()
if not span_text:
continue
# Apply span-specific styling
if span.style:
self._apply_text_style(pdf_canvas, span.style, default_size=default_font_size)
else:
# Fallback to default font
font_name = self.font_name if self.font_registered else 'Helvetica'
pdf_canvas.setFont(font_name, default_font_size)
# Get current font for width calculation
current_font = pdf_canvas._fontname
current_size = pdf_canvas._fontsize
# Draw this span
pdf_canvas.drawString(x_pos, line_y, span_text)
# Calculate width and advance position
span_width = pdf_canvas.stringWidth(span_text, current_font, current_size)
x_pos += span_width
total_width += span_width
return total_width
def _draw_text_element_direct(
self,
pdf_canvas: canvas.Canvas,
@@ -1693,6 +1744,7 @@ class PDFGeneratorService:
Draw text element with Direct track rich formatting.
Handles line breaks, alignment, indentation, and applies StyleInfo.
Supports span-level inline styling if element has children.
Args:
pdf_canvas: ReportLab canvas object
@@ -1767,6 +1819,9 @@ class PDFGeneratorService:
paragraph_spacing_before = element.metadata.get('spacing_before', 0) if element.metadata else 0
paragraph_spacing_after = element.metadata.get('spacing_after', 0) if element.metadata else 0
# Check if element has span children for inline styling
has_spans = element.children and len(element.children) > 0
# Handle line breaks
lines = text_content.split('\n')
line_height = font_size * 1.2 # 120% of font size
@@ -1846,7 +1901,20 @@ class PDFGeneratorService:
# else: left alignment uses line_x as-is
# Draw the line at calculated position
pdf_canvas.drawString(line_x, line_y, rendered_line)
# Use span-level rendering if element has span children
if has_spans and not is_list_item:
# Render with inline span styling
# Note: Currently we render all spans on first line
# Multi-line span support would require more complex line breaking logic
if i == 0: # Only render spans on first line for now
total_width = self._draw_text_with_spans(
pdf_canvas, element.children, line_x, line_y, font_size
)
logger.debug(f"Drew {len(element.children)} spans, total width={total_width:.1f}pt")
# Skip rendering on subsequent lines (text already drawn via spans)
else:
# Normal single-style rendering
pdf_canvas.drawString(line_x, line_y, rendered_line)
# Reset font size for next line
if text_width > available_width:
@@ -1860,8 +1928,9 @@ class PDFGeneratorService:
# For other elements, spacing is inherent in element positioning (bbox-based layout)
list_info = f", list={list_type}, level={list_level}" if is_list_item else ""
y_offset_info = f", y_offset={y_offset:.1f}pt" if y_offset != 0 else ""
span_info = f", spans={len(element.children)}" if has_spans else ""
logger.debug(f"Drew text element: {text_content[:30]}... "
f"({len(lines)} lines, align={alignment}, indent={indent}{list_info}{y_offset_info}, "
f"({len(lines)} lines, align={alignment}, indent={indent}{list_info}{y_offset_info}{span_info}, "
f"spacing_before={paragraph_spacing_before}, spacing_after={paragraph_spacing_after}, "
f"actual_height={actual_text_height:.1f}, bbox_bottom_margin={bbox_bottom_margin:.1f})")