Files
OCR/backend/app/services/pdf_table_renderer.py
egg eff9b0bcd5 feat: refactor dual-track architecture (Phase 1-5)
## Backend Changes
- **Service Layer Refactoring**:
  - Add ProcessingOrchestrator for unified document processing
  - Add PDFTableRenderer for table rendering extraction
  - Add PDFFontManager for font management with CJK support
  - Add MemoryPolicyEngine (73% code reduction from MemoryGuard)

- **Bug Fixes**:
  - Fix Direct Track table row span calculation
  - Fix OCR Track image path handling
  - Add cell_boxes coordinate validation
  - Filter out small decorative images
  - Add covering image detection

## Frontend Changes
- **State Management**:
  - Add TaskStore for centralized task state management
  - Add localStorage persistence for recent tasks
  - Add processing state tracking

- **Type Consolidation**:
  - Merge shared types from api.ts to apiV2.ts
  - Update imports in authStore, uploadStore, ResultsTable, SettingsPage

- **Page Integration**:
  - Integrate TaskStore in ProcessingPage and TaskDetailPage
  - Update useTaskValidation hook with cache sync

## Testing
- Direct Track: edit.pdf (3 pages, 1.281s), edit3.pdf (2 pages, 0.203s)
- Cell boxes validation: 43 valid, 0 invalid
- Table merging: 12 merged cells verified

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-12-07 07:18:27 +08:00

918 lines
30 KiB
Python

"""
PDF Table Renderer - Handles table rendering for PDF generation.
This module provides unified table rendering capabilities extracted from
PDFGeneratorService, supporting multiple input formats:
- HTML tables
- Cell boxes (layered approach)
- Cells dictionary (Direct track)
- TableData objects
"""
import logging
from dataclasses import dataclass, field
from html.parser import HTMLParser
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, Union
from reportlab.lib import colors
from reportlab.lib.enums import TA_CENTER, TA_LEFT, TA_RIGHT
from reportlab.lib.styles import ParagraphStyle
from reportlab.lib.utils import ImageReader
from reportlab.platypus import Paragraph, Table, TableStyle
logger = logging.getLogger(__name__)
# ============================================================================
# Configuration
# ============================================================================
@dataclass
class TableRenderConfig:
    """
    Configuration for table rendering.

    A plain settings container consumed by TableRenderer. Every field has a
    default, so ``TableRenderConfig()`` yields a usable configuration.
    Field order is part of the generated ``__init__`` signature.
    """
    # Font used for cell paragraphs and for directly drawn (translated) text.
    font_name: str = "Helvetica"
    # Default font size for table cell paragraphs.
    font_size: int = 8
    # Inclusive bounds of the font-size search used when fitting translated
    # text into a cell (searched from max down to min).
    min_font_size: int = 6
    max_font_size: int = 10
    # Padding options (applied as LEFT/RIGHT/TOP/BOTTOMPADDING table style
    # commands when rendering parsed HTML tables)
    left_padding: int = 2
    right_padding: int = 2
    top_padding: int = 2
    bottom_padding: int = 2
    # Border options (stroke colour and line width for grid lines and
    # directly drawn cell rectangles)
    border_color: Any = colors.black
    border_width: float = 0.5
    # Alignment (passed as ALIGN / VALIGN table style values)
    horizontal_align: str = "CENTER"
    vertical_align: str = "MIDDLE"
    # Header styling
    # NOTE(review): not referenced by the rendering code in this module as
    # shown here - confirm whether another consumer reads it.
    header_background: Any = colors.lightgrey
    # Grid normalization threshold: coordinates closer than this are
    # clustered together when snapping cell boxes to a grid.
    grid_threshold: float = 10.0
    # Merged cells threshold: boundaries closer than this are merged when
    # deriving column widths / row heights from cell boxes.
    merge_boundary_threshold: float = 5.0
# ============================================================================
# HTML Table Parser
# ============================================================================
class HTMLTableParser(HTMLParser):
    """
    Parse HTML table structure for rendering.

    Extracts table rows, cells, and merged cell information (colspan/rowspan)
    from HTML table markup. After ``feed()`` the parsed tables are available
    in ``self.tables`` as a list of ``{'rows': [{'cells': [cell, ...]}, ...]}``
    dicts, where each cell is
    ``{'text': str, 'is_header': bool, 'colspan': int, 'rowspan': int}``.

    Nested tables are not supported: an inner ``<table>`` replaces the table
    currently being collected.
    """

    def __init__(self):
        super().__init__()
        self.tables = []
        self.current_table = None
        self.current_row = None
        self.current_cell = None
        self.in_cell = False

    @staticmethod
    def _parse_span(raw_value) -> int:
        """
        Convert a colspan/rowspan attribute value to an integer >= 1.

        Missing, valueless (``<td colspan>``), non-numeric or non-positive
        values all fall back to 1 so that a single malformed attribute
        cannot abort parsing of the whole table. (``rowspan="0"`` is
        simplified to 1 rather than "span to the end of the row group".)
        """
        try:
            span = int(str(raw_value).strip())
        except (TypeError, ValueError):
            return 1
        return span if span >= 1 else 1

    def handle_starttag(self, tag: str, attrs: List[Tuple[str, Optional[str]]]):
        """Open a table, row or cell; cells record their span attributes."""
        if tag == 'table':
            self.current_table = {'rows': []}
        elif tag == 'tr':
            self.current_row = {'cells': []}
        elif tag in ('td', 'th'):
            # Attribute values may be None for valueless attributes.
            attrs_dict = dict(attrs)
            self.current_cell = {
                'text': '',
                'is_header': tag == 'th',
                'colspan': self._parse_span(attrs_dict.get('colspan', 1)),
                'rowspan': self._parse_span(attrs_dict.get('rowspan', 1)),
            }
            self.in_cell = True

    def handle_endtag(self, tag: str):
        """Close the current cell/row/table and attach it to its parent."""
        if tag == 'table' and self.current_table is not None:
            self.tables.append(self.current_table)
            self.current_table = None
        elif tag == 'tr' and self.current_row is not None:
            # A row outside of any table is discarded.
            if self.current_table is not None:
                self.current_table['rows'].append(self.current_row)
            self.current_row = None
        elif tag in ('td', 'th') and self.current_cell is not None:
            # A cell outside of any row is discarded.
            if self.current_row is not None:
                self.current_row['cells'].append(self.current_cell)
            self.current_cell = None
            self.in_cell = False

    def handle_data(self, data: str):
        """Accumulate text content while inside a cell."""
        if self.in_cell and self.current_cell is not None:
            self.current_cell['text'] += data
# ============================================================================
# Table Renderer
# ============================================================================
class TableRenderer:
    """
    Unified table rendering engine for PDF generation.

    Supports multiple input formats and rendering modes:
    - HTML table parsing and rendering
    - Cell boxes rendering (layered approach)
    - Direct track cells dictionary
    - Translated content with dynamic font sizing

    All public ``render_*`` methods return True on success and False on
    failure; exceptions are logged (with traceback) instead of propagated.
    Input boxes use a top-left origin and are flipped against
    ``page_height`` before drawing on the canvas.
    """

    # Hard cap on the number of grid columns. Protects against absurd
    # colspan / column-index values in malformed input (HTML itself caps
    # colspan at 1000); cells beyond the cap are dropped.
    _MAX_COLUMNS = 1000

    def __init__(self, config: Optional["TableRenderConfig"] = None):
        """
        Initialize TableRenderer with configuration.

        Args:
            config: TableRenderConfig instance (uses defaults if None)
        """
        self.config = config if config is not None else TableRenderConfig()

    # =========================================================================
    # Public rendering entry points
    # =========================================================================
    def render_from_html(
        self,
        pdf_canvas,
        html_content: str,
        table_bbox: Tuple[float, float, float, float],
        page_height: float,
        scale_w: float = 1.0,
        scale_h: float = 1.0
    ) -> bool:
        """
        Parse HTML and render table to PDF canvas.

        Only the first table found in ``html_content`` is rendered.

        Args:
            pdf_canvas: ReportLab canvas
            html_content: HTML table string
            table_bbox: (x0, y0, x1, y1) bounding box
            page_height: PDF page height for Y coordinate flip
            scale_w: Horizontal scale factor
            scale_h: Vertical scale factor

        Returns:
            True if successful, False otherwise
        """
        try:
            parser = HTMLTableParser()
            parser.feed(html_content)
            if not parser.tables:
                logger.warning("No tables found in HTML content")
                return False
            return self._render_parsed_table(
                pdf_canvas, parser.tables[0], table_bbox, page_height,
                scale_w, scale_h
            )
        except Exception as e:
            # logger.exception keeps the traceback inside the logging system
            # instead of printing it straight to stderr.
            logger.exception(f"HTML table rendering failed: {e}")
            return False

    def render_from_cells_dict(
        self,
        pdf_canvas,
        cells_dict: Dict,
        table_bbox: Tuple[float, float, float, float],
        page_height: float,
        cell_boxes: Optional[List] = None
    ) -> bool:
        """
        Render table from Direct track cell structure.

        Args:
            pdf_canvas: ReportLab canvas
            cells_dict: Dict with 'rows', 'cols', 'cells' keys
            table_bbox: (x0, y0, x1, y1) bounding box
            page_height: PDF page height
            cell_boxes: Optional precomputed cell boxes

        Returns:
            True if successful, False otherwise
        """
        try:
            rows = self._build_rows_from_cells_dict(cells_dict)
            if not rows:
                logger.warning("No rows built from cells dict")
                return False
            table_data = {'rows': rows}

            x0, y0, x1, y1 = table_bbox
            table_width = x1 - x0
            table_height = y1 - y0

            # One row dict is built per grid row, so len(rows) is the row
            # count. The column count comes from the dict when usable,
            # otherwise from the widest row; never below 1 to avoid a
            # division by zero.
            num_rows = len(rows)
            num_cols = self._coerce_int(cells_dict.get('cols'), 0)
            if num_cols <= 0:
                num_cols = max((len(row['cells']) for row in rows), default=0)
            num_cols = max(1, num_cols)

            col_widths = None
            row_heights = None
            if cell_boxes:
                col_widths, row_heights = self.compute_grid_from_cell_boxes(
                    cell_boxes, table_bbox, num_rows, num_cols
                )
            # compute_grid_from_cell_boxes may return None for either axis;
            # fall back to a uniform grid for whatever is missing.
            if not col_widths:
                col_widths = [table_width / num_cols] * num_cols
            if not row_heights:
                row_heights = [table_height / num_rows] * num_rows

            return self._render_with_dimensions(
                pdf_canvas, table_data, table_bbox, page_height,
                col_widths, row_heights
            )
        except Exception as e:
            logger.exception(f"Cells dict rendering failed: {e}")
            return False

    def render_cell_borders(
        self,
        pdf_canvas,
        cell_boxes: List[List[float]],
        table_bbox: Tuple[float, float, float, float],
        page_height: float,
        embedded_images: Optional[List] = None,
        output_dir: Optional[Path] = None
    ) -> bool:
        """
        Render table cell borders only (layered approach).

        This renders only the cell borders, not the text content.
        Text is typically rendered separately by GapFillingService.

        Args:
            pdf_canvas: ReportLab canvas
            cell_boxes: List of [x0, y0, x1, y1] for each cell
            table_bbox: Table bounding box
            page_height: PDF page height
            embedded_images: Optional list of images within cells
            output_dir: Directory for image files

        Returns:
            True if successful, False otherwise
        """
        try:
            if not cell_boxes:
                # No cell geometry: draw the outer border only.
                return self._draw_table_border(
                    pdf_canvas, table_bbox, page_height
                )

            normalized_boxes = self.normalize_cell_boxes_to_grid(cell_boxes)
            self._stroke_cell_boxes(pdf_canvas, normalized_boxes, page_height)

            # Images are only drawn when a directory to resolve them is given.
            if embedded_images and output_dir:
                for img_info in embedded_images:
                    self._draw_embedded_image(
                        pdf_canvas, img_info, page_height, output_dir
                    )
            return True
        except Exception as e:
            logger.exception(f"Cell borders rendering failed: {e}")
            return False

    def render_with_translated_text(
        self,
        pdf_canvas,
        cells: List[Dict],
        cell_boxes: List,
        table_bbox: Tuple[float, float, float, float],
        page_height: float
    ) -> bool:
        """
        Render table with translated content and dynamic font sizing.

        ``cells[i]`` is drawn into ``cell_boxes[i]``; surplus cells or boxes
        are ignored.

        Args:
            pdf_canvas: ReportLab canvas
            cells: List of cell dicts with 'translated_content'
            cell_boxes: List of cell bounding boxes
            table_bbox: Table bounding box
            page_height: PDF page height

        Returns:
            True if successful, False otherwise
        """
        try:
            # The outer border is drawn even when cell boxes are missing.
            self._draw_table_border(pdf_canvas, table_bbox, page_height)

            if not cell_boxes:
                logger.warning("No cell boxes for translated table")
                return False
            normalized_boxes = self.normalize_cell_boxes_to_grid(cell_boxes)

            self._stroke_cell_boxes(pdf_canvas, normalized_boxes, page_height)

            font_name = self.config.font_name
            # Keep the font change local to this call.
            pdf_canvas.saveState()
            try:
                for cell, box in zip(cells, normalized_boxes):
                    if box is None or len(box) < 4:
                        continue
                    translated_text = cell.get('translated_content', '')
                    if not translated_text:
                        continue
                    translated_text = str(translated_text)

                    x0, y0, x1, y1 = box[:4]
                    cell_width = x1 - x0
                    cell_height = y1 - y0

                    font_size = self._fit_text_to_cell(
                        pdf_canvas, translated_text, cell_width, cell_height
                    )
                    pdf_canvas.setFont(font_name, font_size)

                    # Centre horizontally; vertically place the baseline a
                    # third of the font size below the cell's mid-line.
                    text_width = pdf_canvas.stringWidth(
                        translated_text, font_name, font_size
                    )
                    text_x = x0 + (cell_width - text_width) / 2
                    text_y = page_height - y0 - cell_height / 2 - font_size / 3
                    pdf_canvas.drawString(text_x, text_y, translated_text)
            finally:
                pdf_canvas.restoreState()
            return True
        except Exception as e:
            logger.exception(f"Translated table rendering failed: {e}")
            return False

    # =========================================================================
    # Grid and Cell Box Helpers
    # =========================================================================
    def compute_grid_from_cell_boxes(
        self,
        cell_boxes: List,
        table_bbox: Tuple[float, float, float, float],
        num_rows: int,
        num_cols: int
    ) -> Tuple[Optional[List[float]], Optional[List[float]]]:
        """
        Calculate column widths and row heights from cell bounding boxes.

        Box edges are rounded to 0.1, de-duplicated, and boundaries closer
        than ``config.merge_boundary_threshold`` are merged; the gaps between
        consecutive boundaries become the widths / heights. The expected
        row/column counts are only used for a debug log on large mismatches.

        Args:
            cell_boxes: List of [x0, y0, x1, y1] for each cell
            table_bbox: Table bounding box (currently not used in the math)
            num_rows: Expected number of rows
            num_cols: Expected number of columns

        Returns:
            Tuple of (col_widths, row_heights); either element (or both) is
            None when it could not be derived.
        """
        try:
            if not cell_boxes:
                return None, None
            valid_boxes = [b for b in cell_boxes if b is not None and len(b) >= 4]
            if not valid_boxes:
                return None, None

            x_boundaries = set()
            y_boundaries = set()
            for box in valid_boxes:
                x0, y0, x1, y1 = box[:4]
                x_boundaries.update((round(x0, 1), round(x1, 1)))
                y_boundaries.update((round(y0, 1), round(y1, 1)))

            merge_threshold = self.config.merge_boundary_threshold
            x_merged = self._merge_boundaries(sorted(x_boundaries), merge_threshold)
            y_merged = self._merge_boundaries(sorted(y_boundaries), merge_threshold)

            col_widths = [right - left for left, right in zip(x_merged, x_merged[1:])]
            row_heights = [bottom - top for top, bottom in zip(y_merged, y_merged[1:])]

            # Validate against expected dimensions (allow for merged cells)
            tolerance = max(num_cols, num_rows) // 2 + 1
            if abs(len(col_widths) - num_cols) > tolerance:
                logger.debug(f"Column count mismatch: {len(col_widths)} vs {num_cols}")
            if abs(len(row_heights) - num_rows) > tolerance:
                logger.debug(f"Row count mismatch: {len(row_heights)} vs {num_rows}")

            return col_widths or None, row_heights or None
        except Exception as e:
            logger.error(f"Grid computation failed: {e}")
            return None, None

    def normalize_cell_boxes_to_grid(
        self,
        cell_boxes: List,
        threshold: Optional[float] = None
    ) -> List:
        """
        Snap cell boxes to aligned grid to eliminate coordinate variations.

        Coordinates on each axis are clustered (neighbours within
        ``threshold`` join the same cluster) and replaced by their cluster
        average. Invalid entries (None or fewer than 4 values) are passed
        through unchanged so list positions stay aligned with the input.

        Args:
            cell_boxes: List of [x0, y0, x1, y1] for each cell
            threshold: Clustering threshold (uses config default if None;
                an explicit 0 is honoured and only merges identical values)

        Returns:
            Normalized cell boxes (the input list itself if normalization
            is not possible)
        """
        if threshold is None:
            threshold = self.config.grid_threshold
        if not cell_boxes:
            return []
        try:
            all_x = []
            all_y = []
            for box in cell_boxes:
                if box is None or len(box) < 4:
                    continue
                x0, y0, x1, y1 = box[:4]
                all_x.extend((x0, x1))
                all_y.extend((y0, y1))
            if not all_x or not all_y:
                return cell_boxes

            x_map = {
                member: avg
                for avg, members in self._cluster_values(sorted(all_x), threshold)
                for member in members
            }
            y_map = {
                member: avg
                for avg, members in self._cluster_values(sorted(all_y), threshold)
                for member in members
            }

            normalized = []
            for box in cell_boxes:
                if box is None or len(box) < 4:
                    normalized.append(box)
                    continue
                x0, y0, x1, y1 = box[:4]
                nx0, nx1 = x_map.get(x0, x0), x_map.get(x1, x1)
                ny0, ny1 = y_map.get(y0, y0), y_map.get(y1, y1)
                # A cell narrower/shorter than the threshold would have both
                # edges snapped onto the same value; keep its original
                # extent on that axis rather than collapsing it.
                if nx0 == nx1 and x0 != x1:
                    nx0, nx1 = x0, x1
                if ny0 == ny1 and y0 != y1:
                    ny0, ny1 = y0, y1
                normalized.append([nx0, ny0, nx1, ny1])
            return normalized
        except Exception as e:
            logger.error(f"Cell box normalization failed: {e}")
            return cell_boxes

    # =========================================================================
    # Private Helper Methods
    # =========================================================================
    @staticmethod
    def _coerce_int(value, default: int) -> int:
        """Return ``int(value)``, or ``default`` when it is not convertible."""
        try:
            return int(value)
        except (TypeError, ValueError):
            return default

    def _layout_grid(
        self,
        rows: List[Dict],
        explicit_cols: bool = False
    ) -> Tuple[List[List[str]], List[Tuple], int]:
        """
        Place row/cell dicts on a rectangular grid.

        Each cell starts at its 'col' value (``explicit_cols=True``) or right
        after the previous cell of the row (``explicit_cols=False``) and is
        moved right past positions already covered by earlier spans. The
        column count is derived from the placed cells *including* their
        colspans, so merged tables do not lose trailing cells.

        Args:
            rows: List of ``{'cells': [...]}`` dicts; cells carry 'text' and
                optionally 'colspan', 'rowspan', 'col'.
            explicit_cols: Honour each cell's 'col' key as start position.

        Returns:
            (grid, span_commands, num_cols) where grid is a num_rows x
            num_cols list of stripped strings and span_commands are
            ReportLab ``('SPAN', (col, row), (end_col, end_row))`` tuples.
        """
        num_rows = len(rows)
        occupied = set()
        placements = []
        num_cols = 0

        for row_idx, row in enumerate(rows):
            cursor = 0
            for cell in row.get('cells', []):
                if explicit_cols:
                    col_idx = max(0, self._coerce_int(cell.get('col'), 0))
                else:
                    col_idx = cursor
                while (row_idx, col_idx) in occupied:
                    col_idx += 1
                if col_idx >= self._MAX_COLUMNS:
                    continue

                colspan = max(1, self._coerce_int(cell.get('colspan'), 1))
                rowspan = max(1, self._coerce_int(cell.get('rowspan'), 1))
                # Clip spans to the grid (rows are fixed, columns capped).
                end_col = min(col_idx + colspan, self._MAX_COLUMNS) - 1
                end_row = min(row_idx + rowspan, num_rows) - 1

                for r in range(row_idx, end_row + 1):
                    for c in range(col_idx, end_col + 1):
                        occupied.add((r, c))

                text = str(cell.get('text', '') or '').strip()
                placements.append((row_idx, col_idx, end_row, end_col, text))
                num_cols = max(num_cols, end_col + 1)
                cursor = end_col + 1

        grid = [[''] * num_cols for _ in range(num_rows)]
        span_commands = []
        for row_idx, col_idx, end_row, end_col, text in placements:
            grid[row_idx][col_idx] = text
            if end_row > row_idx or end_col > col_idx:
                span_commands.append(
                    ('SPAN', (col_idx, row_idx), (end_col, end_row))
                )
        return grid, span_commands, num_cols

    def _build_paragraph_grid(
        self,
        grid: List[List[str]],
        style,
        keep_line_breaks: bool = False
    ) -> List[List[Any]]:
        """
        Convert a grid of strings to Paragraph objects ('' for empty cells).

        Paragraph text is parsed as XML-like markup, so '&', '<' and '>' in
        plain cell text must be escaped first. With ``keep_line_breaks`` the
        newlines are turned into ``<br/>``; otherwise Paragraph collapses
        them like any other whitespace.
        """
        from xml.sax.saxutils import escape

        para_grid = []
        for row in grid:
            para_row = []
            for text in row:
                if not text:
                    para_row.append('')
                    continue
                markup = escape(text)
                if keep_line_breaks:
                    markup = markup.replace('\n', '<br/>')
                para_row.append(Paragraph(markup, style))
            para_grid.append(para_row)
        return para_grid

    def _stroke_cell_boxes(self, pdf_canvas, boxes: List, page_height: float) -> None:
        """
        Stroke one rectangle per valid box using the configured border style.

        Boxes that are None or have fewer than 4 values are skipped. The
        canvas state is restored even if drawing raises.
        """
        pdf_canvas.saveState()
        try:
            pdf_canvas.setStrokeColor(self.config.border_color)
            pdf_canvas.setLineWidth(self.config.border_width)
            for box in boxes:
                if box is None or len(box) < 4:
                    continue
                x0, y0, x1, y1 = box[:4]
                # Flip Y: the box's bottom edge (y1) becomes the rect origin.
                pdf_canvas.rect(x0, page_height - y1, x1 - x0, y1 - y0)
        finally:
            pdf_canvas.restoreState()

    def _render_parsed_table(
        self,
        pdf_canvas,
        table_data: Dict,
        table_bbox: Tuple[float, float, float, float],
        page_height: float,
        scale_w: float = 1.0,
        scale_h: float = 1.0
    ) -> bool:
        """
        Render a parsed table structure on a uniform grid.

        Cells are laid out sequentially per row (HTML semantics); the table
        is drawn with its lower-left corner at ``(x0, page_height - y1)``.

        Returns:
            True if drawn, False when the table has no rows or no cells.
        """
        rows = table_data.get('rows', [])
        if not rows:
            return False

        grid, span_commands, num_cols = self._layout_grid(rows, explicit_cols=False)
        if num_cols == 0:
            return False
        num_rows = len(rows)

        x0, y0, x1, y1 = table_bbox
        table_width = (x1 - x0) * scale_w
        table_height = (y1 - y0) * scale_h
        col_widths = [table_width / num_cols] * num_cols
        row_heights = [table_height / num_rows] * num_rows

        style = ParagraphStyle(
            'TableCell',
            fontName=self.config.font_name,
            fontSize=self.config.font_size,
            alignment=TA_CENTER,
            leading=self.config.font_size * 1.2
        )
        para_grid = self._build_paragraph_grid(grid, style, keep_line_breaks=False)

        table_style_commands = [
            ('GRID', (0, 0), (-1, -1), self.config.border_width, self.config.border_color),
            ('VALIGN', (0, 0), (-1, -1), self.config.vertical_align),
            ('ALIGN', (0, 0), (-1, -1), self.config.horizontal_align),
            ('LEFTPADDING', (0, 0), (-1, -1), self.config.left_padding),
            ('RIGHTPADDING', (0, 0), (-1, -1), self.config.right_padding),
            ('TOPPADDING', (0, 0), (-1, -1), self.config.top_padding),
            ('BOTTOMPADDING', (0, 0), (-1, -1), self.config.bottom_padding),
            ('FONTNAME', (0, 0), (-1, -1), self.config.font_name),
            ('FONTSIZE', (0, 0), (-1, -1), self.config.font_size),
        ]
        table_style_commands.extend(span_commands)

        table = Table(para_grid, colWidths=col_widths, rowHeights=row_heights)
        table.setStyle(TableStyle(table_style_commands))
        table.wrapOn(pdf_canvas, table_width, table_height)
        table.drawOn(pdf_canvas, x0, page_height - y1)  # Flip Y
        return True

    def _render_with_dimensions(
        self,
        pdf_canvas,
        table_data: Dict,
        table_bbox: Tuple[float, float, float, float],
        page_height: float,
        col_widths: List[float],
        row_heights: List[float]
    ) -> bool:
        """
        Render table with specified dimensions.

        Cells are positioned by their 'col' key. Supplied widths/heights are
        used only when their length matches the laid-out grid; otherwise
        (including None) that axis falls back to a uniform split of the
        table bounding box.

        Returns:
            True if drawn, False when the table has no rows or no cells.
        """
        rows = table_data.get('rows', [])
        if not rows:
            return False

        grid, span_commands, num_cols = self._layout_grid(rows, explicit_cols=True)
        if num_cols == 0:
            return False
        num_rows = len(rows)

        x0, y0, x1, y1 = table_bbox
        if not col_widths or len(col_widths) != num_cols:
            col_widths = [(x1 - x0) / num_cols] * num_cols
        if not row_heights or len(row_heights) != num_rows:
            row_heights = [(y1 - y0) / num_rows] * num_rows

        style = ParagraphStyle(
            'TableCell',
            fontName=self.config.font_name,
            fontSize=self.config.font_size,
            alignment=TA_CENTER
        )
        # Multi-part cell content is joined with '\n' upstream; keep those
        # breaks visible in the rendered paragraph.
        para_grid = self._build_paragraph_grid(grid, style, keep_line_breaks=True)

        table_style_commands = [
            ('GRID', (0, 0), (-1, -1), self.config.border_width, self.config.border_color),
            ('VALIGN', (0, 0), (-1, -1), self.config.vertical_align),
            ('LEFTPADDING', (0, 0), (-1, -1), 0),
            ('RIGHTPADDING', (0, 0), (-1, -1), 0),
            ('TOPPADDING', (0, 0), (-1, -1), 0),
            ('BOTTOMPADDING', (0, 0), (-1, -1), 1),
        ]
        table_style_commands.extend(span_commands)

        table = Table(para_grid, colWidths=col_widths, rowHeights=row_heights)
        table.setStyle(TableStyle(table_style_commands))
        table.wrapOn(pdf_canvas, x1 - x0, y1 - y0)
        table.drawOn(pdf_canvas, x0, page_height - y1)
        return True

    def _build_rows_from_cells_dict(self, cells_dict: Dict) -> List[Dict]:
        """
        Convert Direct track cell structure to row format.

        Produces one ``{'cells': [...]}`` dict per grid row (rows without
        cells are kept as empty rows), with the cells of each row sorted by
        column. When 'rows' is missing or not positive, the row count is
        inferred from the highest 'row' index found in the cells.

        Returns:
            List of row dicts, or [] when there are no cells.
        """
        cells = cells_dict.get('cells', [])
        if not cells:
            return []

        rows_data = {}
        for cell in cells:
            rows_data.setdefault(cell.get('row', 0), []).append(cell)

        num_rows = self._coerce_int(cells_dict.get('rows'), 0)
        if num_rows <= 0:
            int_rows = [r for r in rows_data if isinstance(r, int)]
            num_rows = max(int_rows, default=-1) + 1

        rows = []
        for row_idx in range(num_rows):
            row_cells = sorted(
                rows_data.get(row_idx, []),
                key=lambda c: c.get('col') or 0
            )
            formatted_cells = []
            for cell in row_cells:
                content = cell.get('content', '')
                if isinstance(content, list):
                    content = '\n'.join(str(part) for part in content)
                formatted_cells.append({
                    # Only None means "no content"; 0 is a legitimate value.
                    'text': '' if content is None else str(content),
                    'colspan': cell.get('col_span', 1),
                    'rowspan': cell.get('row_span', 1),
                    'col': cell.get('col') or 0,
                    'is_header': cell.get('is_header', False)
                })
            rows.append({'cells': formatted_cells})
        return rows

    def _draw_table_border(
        self,
        pdf_canvas,
        table_bbox: Tuple[float, float, float, float],
        page_height: float
    ) -> bool:
        """
        Draw outer table border.

        Returns:
            True if drawn, False if drawing raised (the error is logged).
        """
        try:
            x0, y0, x1, y1 = table_bbox
            pdf_y0 = page_height - y1
            pdf_y1 = page_height - y0
            pdf_canvas.saveState()
            try:
                pdf_canvas.setStrokeColor(self.config.border_color)
                pdf_canvas.setLineWidth(self.config.border_width)
                pdf_canvas.rect(x0, pdf_y0, x1 - x0, pdf_y1 - pdf_y0)
            finally:
                pdf_canvas.restoreState()
            return True
        except Exception as e:
            logger.error(f"Failed to draw table border: {e}")
            return False

    def _draw_embedded_image(
        self,
        pdf_canvas,
        img_info: Dict,
        page_height: float,
        output_dir: Path
    ) -> bool:
        """
        Draw an image embedded within a table cell.

        Reads 'path' and 'bbox' (keys 'x0', 'y0', 'width', 'height') from
        ``img_info``; relative paths are resolved against ``output_dir``.

        Returns:
            True if drawn, False when the path is missing, the file does
            not exist, or drawing raised (the error is logged).
        """
        try:
            img_path = img_info.get('path')
            if not img_path:
                return False
            if not Path(img_path).is_absolute():
                img_path = output_dir / img_path
            if not Path(img_path).exists():
                logger.warning(f"Embedded image not found: {img_path}")
                return False

            bbox = img_info.get('bbox', {})
            x0 = bbox.get('x0', 0)
            y0 = bbox.get('y0', 0)
            width = bbox.get('width', 100)
            height = bbox.get('height', 100)

            # Flip Y coordinate: the image origin is its lower-left corner.
            pdf_y = page_height - y0 - height
            img = ImageReader(str(img_path))
            pdf_canvas.drawImage(img, x0, pdf_y, width, height)
            return True
        except Exception as e:
            logger.error(f"Failed to draw embedded image: {e}")
            return False

    def _fit_text_to_cell(
        self,
        pdf_canvas,
        text: str,
        cell_width: float,
        cell_height: float
    ) -> int:
        """
        Find font size that fits text in cell.

        Tries sizes from ``config.max_font_size`` down to
        ``config.min_font_size`` and returns the first one whose rendered
        width fits the cell (minus 3pt padding per side) and whose size does
        not exceed the cell height. Falls back to the minimum size.
        """
        font_name = self.config.font_name
        for size in range(self.config.max_font_size, self.config.min_font_size - 1, -1):
            if size > cell_height:
                continue
            text_width = pdf_canvas.stringWidth(text, font_name, size)
            if text_width <= cell_width - 6:  # 3pt padding each side
                return size
        return self.config.min_font_size

    def _merge_boundaries(self, values: List[float], threshold: float) -> List[float]:
        """
        Merge nearby boundary values.

        ``values`` must be sorted ascending. A value is kept only when it is
        more than ``threshold`` away from the last kept value, so each group
        of close values is represented by its first member.
        """
        if not values:
            return []
        merged = [values[0]]
        for v in values[1:]:
            if abs(v - merged[-1]) > threshold:
                merged.append(v)
        return merged

    def _cluster_values(self, values: List[float], threshold: float) -> List[Tuple[float, List[float]]]:
        """
        Cluster nearby values and return (average, members) pairs.

        ``values`` must be sorted ascending. A value joins the current
        cluster when it is within ``threshold`` of the cluster's most recent
        member, so clusters may chain beyond ``threshold`` in total extent.
        """
        if not values:
            return []
        clusters = []
        current_cluster = [values[0]]
        for v in values[1:]:
            if abs(v - current_cluster[-1]) <= threshold:
                current_cluster.append(v)
            else:
                clusters.append((sum(current_cluster) / len(current_cluster), current_cluster))
                current_cluster = [v]
        clusters.append((sum(current_cluster) / len(current_cluster), current_cluster))
        return clusters