feat: refactor dual-track architecture (Phase 1-5)
## Backend Changes

- **Service Layer Refactoring**:
  - Add ProcessingOrchestrator for unified document processing
  - Add PDFTableRenderer for table rendering extraction
  - Add PDFFontManager for font management with CJK support
  - Add MemoryPolicyEngine (73% code reduction from MemoryGuard)
- **Bug Fixes**:
  - Fix Direct Track table row span calculation
  - Fix OCR Track image path handling
  - Add cell_boxes coordinate validation
  - Filter out small decorative images
  - Add covering image detection

## Frontend Changes

- **State Management**:
  - Add TaskStore for centralized task state management
  - Add localStorage persistence for recent tasks
  - Add processing state tracking
- **Type Consolidation**:
  - Merge shared types from api.ts to apiV2.ts
  - Update imports in authStore, uploadStore, ResultsTable, SettingsPage
- **Page Integration**:
  - Integrate TaskStore in ProcessingPage and TaskDetailPage
  - Update useTaskValidation hook with cache sync

## Testing

- Direct Track: edit.pdf (3 pages, 1.281s), edit3.pdf (2 pages, 0.203s)
- Cell boxes validation: 43 valid, 0 invalid
- Table merging: 12 merged cells verified

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
917
backend/app/services/pdf_table_renderer.py
Normal file
917
backend/app/services/pdf_table_renderer.py
Normal file
@@ -0,0 +1,917 @@
|
||||
"""
|
||||
PDF Table Renderer - Handles table rendering for PDF generation.
|
||||
|
||||
This module provides unified table rendering capabilities extracted from
|
||||
PDFGeneratorService, supporting multiple input formats:
|
||||
- HTML tables
|
||||
- Cell boxes (layered approach)
|
||||
- Cells dictionary (Direct track)
|
||||
- TableData objects
|
||||
"""
|
||||
|
||||
import logging
|
||||
from dataclasses import dataclass, field
|
||||
from html.parser import HTMLParser
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional, Tuple, Union
|
||||
|
||||
from reportlab.lib import colors
|
||||
from reportlab.lib.enums import TA_CENTER, TA_LEFT, TA_RIGHT
|
||||
from reportlab.lib.styles import ParagraphStyle
|
||||
from reportlab.lib.utils import ImageReader
|
||||
from reportlab.platypus import Paragraph, Table, TableStyle
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Configuration
|
||||
# ============================================================================
|
||||
|
||||
@dataclass
class TableRenderConfig:
    """
    Rendering options read by the table renderer.

    Every attribute has a default, so ``TableRenderConfig()`` is a complete
    configuration. The declaration order below is also the positional
    order of the generated ``__init__``.
    """

    # --- Typography ---------------------------------------------------
    # Font used for cell text.
    font_name: str = "Helvetica"
    # Font size for regular (non fitted) cell text.
    font_size: int = 8
    # Inclusive bounds of the size search used when text is fitted to a cell.
    min_font_size: int = 6
    max_font_size: int = 10

    # --- Cell padding -------------------------------------------------
    left_padding: int = 2
    right_padding: int = 2
    top_padding: int = 2
    bottom_padding: int = 2

    # --- Borders ------------------------------------------------------
    # Stroke colour and line width for the grid and for cell rectangles.
    border_color: Any = colors.black
    border_width: float = 0.5

    # --- Alignment ----------------------------------------------------
    # Values are handed to the ALIGN / VALIGN table style commands.
    horizontal_align: str = "CENTER"
    vertical_align: str = "MIDDLE"

    # --- Header styling -----------------------------------------------
    header_background: Any = colors.lightgrey

    # --- Grid normalization -------------------------------------------
    # Coordinates closer than this are clustered onto one grid line when
    # cell boxes are snapped to a grid.
    grid_threshold: float = 10.0

    # --- Merged cells -------------------------------------------------
    # Boundaries closer than this are merged when column widths and row
    # heights are derived from cell boxes.
    merge_boundary_threshold: float = 5.0
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# HTML Table Parser
|
||||
# ============================================================================
|
||||
|
||||
class HTMLTableParser(HTMLParser):
    """
    Parse HTML table structure for rendering.

    Collects every ``<table>`` fed to the parser into ``self.tables``. Each
    table is ``{'rows': [...]}``, each row is ``{'cells': [...]}`` and each
    cell is a dict with the keys ``text``, ``is_header``, ``colspan`` and
    ``rowspan``.

    The parser tolerates real-world markup:
    - ``colspan`` / ``rowspan`` values that are missing, empty or not
      integers fall back to 1 instead of raising;
    - spans are clamped to ``1.._MAX_SPAN``;
    - ``<td>``, ``<th>`` and ``<tr>`` elements whose end tag is omitted are
      closed implicitly by the next cell, row or table boundary.
    """

    # Upper bound for span attributes, so a bogus value cannot blow up the
    # grid built by the renderer.
    _MAX_SPAN = 1000

    def __init__(self):
        super().__init__()
        self.tables = []
        self.current_table = None
        self.current_row = None
        self.current_cell = None
        self.in_cell = False

    @classmethod
    def _parse_span(cls, raw) -> int:
        """Return a span attribute as an int in ``1.._MAX_SPAN`` (1 if invalid)."""
        try:
            value = int(str(raw).strip())
        except (TypeError, ValueError):
            return 1
        return min(max(value, 1), cls._MAX_SPAN)

    def _close_cell(self) -> None:
        """Store the open cell (if any) in the open row."""
        if self.current_cell is None:
            return
        if self.current_row is not None:
            self.current_row['cells'].append(self.current_cell)
        self.current_cell = None
        self.in_cell = False

    def _close_row(self) -> None:
        """Store the open row (if any), closing its last cell first."""
        self._close_cell()
        if self.current_row is None:
            return
        if self.current_table is not None:
            self.current_table['rows'].append(self.current_row)
        self.current_row = None

    def handle_starttag(self, tag: str, attrs: List[Tuple[str, Optional[str]]]):
        """Open a table, row or cell; a new row/cell closes the previous one."""
        if tag == 'table':
            # Flush whatever is still open before the table is replaced.
            self._close_row()
            self.current_table = {'rows': []}
        elif tag == 'tr':
            self._close_row()
            self.current_row = {'cells': []}
        elif tag in ('td', 'th'):
            self._close_cell()
            attrs_dict = dict(attrs)
            self.current_cell = {
                'text': '',
                'is_header': tag == 'th',
                'colspan': self._parse_span(attrs_dict.get('colspan', 1)),
                'rowspan': self._parse_span(attrs_dict.get('rowspan', 1))
            }
            self.in_cell = True

    def handle_endtag(self, tag: str):
        """Close the matching table, row or cell."""
        if tag == 'table':
            self._close_row()
            if self.current_table is not None:
                self.tables.append(self.current_table)
                self.current_table = None
        elif tag == 'tr':
            self._close_row()
        elif tag in ('td', 'th'):
            self._close_cell()

    def handle_data(self, data: str):
        """Append character data to the cell that is currently open."""
        if self.in_cell and self.current_cell is not None:
            self.current_cell['text'] += data
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Table Renderer
|
||||
# ============================================================================
|
||||
|
||||
class TableRenderer:
    """
    Unified table rendering engine for PDF generation.

    Supports multiple input formats and rendering modes:
    - HTML table parsing and rendering (``render_from_html``)
    - Direct track cells dictionary (``render_from_cells_dict``)
    - Cell boxes, borders only (``render_cell_borders``)
    - Translated content with dynamic font sizing
      (``render_with_translated_text``)

    Box coordinates are flipped with ``page_height - y`` before drawing.
    The public ``render_*`` methods catch every exception, log it with its
    traceback and report the failure by returning ``False``.
    """

    # Upper bound applied to colspan/rowspan while laying out cells, so a
    # bogus span cannot allocate an enormous grid.
    _MAX_SPAN = 1000

    def __init__(self, config: "Optional[TableRenderConfig]" = None):
        """
        Initialize TableRenderer with configuration.

        Args:
            config: TableRenderConfig instance (uses defaults if None)
        """
        self.config = config or TableRenderConfig()

    # =========================================================================
    # Public rendering entry points
    # =========================================================================

    def render_from_html(
        self,
        pdf_canvas,
        html_content: str,
        table_bbox: Tuple[float, float, float, float],
        page_height: float,
        scale_w: float = 1.0,
        scale_h: float = 1.0
    ) -> bool:
        """
        Parse HTML and render the first table found to the PDF canvas.

        Args:
            pdf_canvas: ReportLab canvas
            html_content: HTML table string
            table_bbox: (x0, y0, x1, y1) bounding box
            page_height: PDF page height for Y coordinate flip
            scale_w: Horizontal scale factor applied to the table width
            scale_h: Vertical scale factor applied to the table height

        Returns:
            True if successful, False otherwise
        """
        try:
            parser = HTMLTableParser()
            parser.feed(html_content)
            # Flush text the parser may still be buffering.
            parser.close()

            if not parser.tables:
                logger.warning("No tables found in HTML content")
                return False

            # Only the first table of the fragment is rendered.
            return self._render_parsed_table(
                pdf_canvas, parser.tables[0], table_bbox, page_height,
                scale_w, scale_h
            )

        except Exception as e:
            # exc_info routes the traceback through logging instead of stderr.
            logger.error(f"HTML table rendering failed: {e}", exc_info=True)
            return False

    def render_from_cells_dict(
        self,
        pdf_canvas,
        cells_dict: Dict,
        table_bbox: Tuple[float, float, float, float],
        page_height: float,
        cell_boxes: Optional[List] = None
    ) -> bool:
        """
        Render table from Direct track cell structure.

        Column widths and row heights come from ``cell_boxes`` when they
        yield a grid matching the table; otherwise the bounding box is
        split evenly.

        Args:
            pdf_canvas: ReportLab canvas
            cells_dict: Dict with 'rows', 'cols', 'cells' keys
            table_bbox: (x0, y0, x1, y1) bounding box
            page_height: PDF page height
            cell_boxes: Optional precomputed cell boxes

        Returns:
            True if successful, False otherwise
        """
        try:
            rows = self._build_rows_from_cells_dict(cells_dict)
            if not rows:
                logger.warning("No rows built from cells dict")
                return False

            col_widths = None
            row_heights = None
            if cell_boxes:
                num_rows = cells_dict.get('rows') or len(rows)
                num_cols = cells_dict.get('cols') or max(
                    (len(row['cells']) for row in rows), default=1
                )
                # May return (None, None); _render_with_dimensions then
                # falls back to a uniform grid instead of failing.
                col_widths, row_heights = self.compute_grid_from_cell_boxes(
                    cell_boxes, table_bbox, num_rows, num_cols
                )

            return self._render_with_dimensions(
                pdf_canvas, {'rows': rows}, table_bbox, page_height,
                col_widths, row_heights
            )

        except Exception as e:
            logger.error(f"Cells dict rendering failed: {e}", exc_info=True)
            return False

    def render_cell_borders(
        self,
        pdf_canvas,
        cell_boxes: List[List[float]],
        table_bbox: Tuple[float, float, float, float],
        page_height: float,
        embedded_images: Optional[List] = None,
        output_dir: Optional[Path] = None
    ) -> bool:
        """
        Render table cell borders only (layered approach).

        Only borders (and optional embedded images) are drawn here; no cell
        text is rendered. Without cell boxes, just the outer table border
        is drawn.

        Args:
            pdf_canvas: ReportLab canvas
            cell_boxes: List of [x0, y0, x1, y1] for each cell
            table_bbox: Table bounding box
            page_height: PDF page height
            embedded_images: Optional list of images within cells
            output_dir: Directory used to resolve relative image paths

        Returns:
            True if successful, False otherwise
        """
        try:
            if not cell_boxes:
                return self._draw_table_border(
                    pdf_canvas, table_bbox, page_height
                )

            normalized_boxes = self.normalize_cell_boxes_to_grid(cell_boxes)
            self._stroke_cell_boxes(pdf_canvas, normalized_boxes, page_height)

            if embedded_images and output_dir:
                for img_info in embedded_images:
                    self._draw_embedded_image(
                        pdf_canvas, img_info, page_height, output_dir
                    )

            return True

        except Exception as e:
            logger.error(f"Cell borders rendering failed: {e}", exc_info=True)
            return False

    def render_with_translated_text(
        self,
        pdf_canvas,
        cells: List[Dict],
        cell_boxes: List,
        table_bbox: Tuple[float, float, float, float],
        page_height: float
    ) -> bool:
        """
        Render table with translated content and dynamic font sizing.

        ``cells[i]`` is drawn inside ``cell_boxes[i]``; surplus cells or
        boxes are ignored. Each text is drawn on a single line, centred in
        its box.

        Args:
            pdf_canvas: ReportLab canvas
            cells: List of cell dicts with 'translated_content'
            cell_boxes: List of cell bounding boxes
            table_bbox: Table bounding box
            page_height: PDF page height

        Returns:
            True if successful, False otherwise
        """
        try:
            self._draw_table_border(pdf_canvas, table_bbox, page_height)

            if not cell_boxes:
                logger.warning("No cell boxes for translated table")
                return False

            normalized_boxes = self.normalize_cell_boxes_to_grid(cell_boxes)
            self._stroke_cell_boxes(pdf_canvas, normalized_boxes, page_height)

            font_name = self.config.font_name
            # Keep the font change local to this table.
            pdf_canvas.saveState()
            try:
                for cell, box in zip(cells, normalized_boxes):
                    if box is None or len(box) < 4:
                        continue

                    translated_text = cell.get('translated_content', '')
                    if not translated_text:
                        continue

                    x0, y0, x1, y1 = box[:4]
                    cell_width = x1 - x0
                    cell_height = y1 - y0

                    font_size = self._fit_text_to_cell(
                        pdf_canvas, translated_text, cell_width, cell_height
                    )
                    pdf_canvas.setFont(font_name, font_size)

                    text_width = pdf_canvas.stringWidth(
                        translated_text, font_name, font_size
                    )
                    text_x = x0 + (cell_width - text_width) / 2
                    # Vertical centre of the flipped box, lowered by a third
                    # of the font size to approximate the baseline.
                    text_y = page_height - y0 - cell_height / 2 - font_size / 3

                    pdf_canvas.drawString(text_x, text_y, translated_text)
            finally:
                pdf_canvas.restoreState()

            return True

        except Exception as e:
            logger.error(f"Translated table rendering failed: {e}", exc_info=True)
            return False

    # =========================================================================
    # Grid and Cell Box Helpers
    # =========================================================================

    def compute_grid_from_cell_boxes(
        self,
        cell_boxes: List,
        table_bbox: Tuple[float, float, float, float],
        num_rows: int,
        num_cols: int
    ) -> Tuple[Optional[List[float]], Optional[List[float]]]:
        """
        Calculate column widths and row heights from cell bounding boxes.

        Box edges are rounded to one decimal, de-duplicated, and edges
        closer than ``config.merge_boundary_threshold`` are merged. The
        gaps between the remaining edges are the widths and heights.
        ``num_rows`` / ``num_cols`` are only used to log a mismatch.

        Args:
            cell_boxes: List of [x0, y0, x1, y1] for each cell
            table_bbox: Table bounding box (not used by the computation)
            num_rows: Expected number of rows
            num_cols: Expected number of columns

        Returns:
            Tuple of (col_widths, row_heights); either element is None when
            it could not be derived, and both are None on failure
        """
        try:
            valid_boxes = [
                b for b in (cell_boxes or []) if b is not None and len(b) >= 4
            ]
            if not valid_boxes:
                return None, None

            x_edges = {round(v, 1) for b in valid_boxes for v in (b[0], b[2])}
            y_edges = {round(v, 1) for b in valid_boxes for v in (b[1], b[3])}

            merge_threshold = self.config.merge_boundary_threshold
            x_merged = self._merge_boundaries(sorted(x_edges), merge_threshold)
            y_merged = self._merge_boundaries(sorted(y_edges), merge_threshold)

            col_widths = [hi - lo for lo, hi in zip(x_merged, x_merged[1:])]
            row_heights = [hi - lo for lo, hi in zip(y_merged, y_merged[1:])]

            # Merged cells hide boundaries, so only large deviations are noted.
            tolerance = max(num_cols, num_rows) // 2 + 1
            if abs(len(col_widths) - num_cols) > tolerance:
                logger.debug(f"Column count mismatch: {len(col_widths)} vs {num_cols}")
            if abs(len(row_heights) - num_rows) > tolerance:
                logger.debug(f"Row count mismatch: {len(row_heights)} vs {num_rows}")

            return (col_widths or None), (row_heights or None)

        except Exception as e:
            logger.error(f"Grid computation failed: {e}")
            return None, None

    def normalize_cell_boxes_to_grid(
        self,
        cell_boxes: List,
        threshold: Optional[float] = None
    ) -> List:
        """
        Snap cell boxes to aligned grid to eliminate coordinate variations.

        X and Y coordinates are clustered separately and every coordinate is
        replaced by the mean of its cluster.

        Args:
            cell_boxes: List of [x0, y0, x1, y1] for each cell
            threshold: Clustering threshold (uses config default if None;
                an explicit 0 is honoured and merges only equal values)

        Returns:
            Normalized cell boxes. Entries that are None or shorter than
            four values are passed through unchanged. The input list is
            returned as-is when nothing can be normalized or on failure.
        """
        if threshold is None:
            threshold = self.config.grid_threshold

        if not cell_boxes:
            return []

        try:
            usable = [
                box[:4] for box in cell_boxes
                if box is not None and len(box) >= 4
            ]
            if not usable:
                return cell_boxes

            xs = sorted(v for box in usable for v in (box[0], box[2]))
            ys = sorted(v for box in usable for v in (box[1], box[3]))

            x_map = {
                v: avg
                for avg, members in self._cluster_values(xs, threshold)
                for v in members
            }
            y_map = {
                v: avg
                for avg, members in self._cluster_values(ys, threshold)
                for v in members
            }

            normalized = []
            for box in cell_boxes:
                if box is None or len(box) < 4:
                    normalized.append(box)
                    continue
                x0, y0, x1, y1 = box[:4]
                normalized.append([
                    x_map.get(x0, x0),
                    y_map.get(y0, y0),
                    x_map.get(x1, x1),
                    y_map.get(y1, y1)
                ])
            return normalized

        except Exception as e:
            logger.error(f"Cell box normalization failed: {e}")
            return cell_boxes

    # =========================================================================
    # Private Helper Methods
    # =========================================================================

    @classmethod
    def _coerce_span(cls, value) -> int:
        """Return a span as an int in ``1.._MAX_SPAN`` (1 if not a number)."""
        try:
            span = int(value)
        except (TypeError, ValueError):
            return 1
        return min(max(span, 1), cls._MAX_SPAN)

    @staticmethod
    def _coerce_column(value) -> int:
        """Return a column index as a non-negative int (0 if not a number)."""
        try:
            return max(int(value), 0)
        except (TypeError, ValueError):
            return 0

    @staticmethod
    def _content_to_text(content) -> str:
        """Convert Direct track cell content to text, keeping numeric zero."""
        if isinstance(content, list):
            content = '\n'.join(str(item) for item in content)
        if content is None or content is False:
            return ''
        if not content and not isinstance(content, (int, float)):
            # Empty string or other empty container.
            return ''
        return str(content)

    def _layout_cells(
        self,
        rows: List[Dict],
        explicit_cols: bool
    ) -> Tuple[List[List[str]], List[Tuple], int]:
        """
        Place every cell of ``rows`` on a rectangular grid.

        The grid is as wide as the right-most column any cell reaches, so
        cells are not dropped when all rows contain merged cells. Row spans
        are clipped to the last row.

        Args:
            rows: List of ``{'cells': [...]}`` dicts; cells may carry
                'text', 'colspan', 'rowspan' and 'col'
            explicit_cols: When True a cell starts at its 'col' value,
                otherwise right after the previous cell of the row. In both
                modes positions covered by an earlier span are skipped.

        Returns:
            Tuple of (text grid, list of ReportLab SPAN commands, number of
            columns)
        """
        num_rows = len(rows)
        occupied = set()
        placed = []
        span_commands = []
        num_cols = 0

        for row_idx, row in enumerate(rows):
            cursor = 0
            for cell in row.get('cells', []):
                if explicit_cols:
                    col_idx = self._coerce_column(cell.get('col', 0))
                else:
                    col_idx = cursor
                while (row_idx, col_idx) in occupied:
                    col_idx += 1

                colspan = self._coerce_span(cell.get('colspan', 1))
                rowspan = self._coerce_span(cell.get('rowspan', 1))
                end_col = col_idx + colspan - 1
                end_row = min(row_idx + rowspan - 1, num_rows - 1)

                if end_col > col_idx or end_row > row_idx:
                    span_commands.append(
                        ('SPAN', (col_idx, row_idx), (end_col, end_row))
                    )
                for r in range(row_idx, end_row + 1):
                    for c in range(col_idx, end_col + 1):
                        occupied.add((r, c))

                text = str(cell.get('text', '') or '').strip()
                placed.append((row_idx, col_idx, text))
                num_cols = max(num_cols, end_col + 1)
                cursor = end_col + 1

        grid = [[''] * num_cols for _ in range(num_rows)]
        for r, c, text in placed:
            grid[r][c] = text
        return grid, span_commands, num_cols

    def _paragraph_grid(self, grid: List[List[str]], style) -> List[List]:
        """Wrap non-empty cell texts in Paragraphs, escaping markup characters."""
        # Paragraph parses its text as XML-like markup, so literal '<', '>'
        # and '&' coming from table data must be escaped first.
        from xml.sax.saxutils import escape

        return [
            [Paragraph(escape(text), style) if text else '' for text in row]
            for row in grid
        ]

    def _stroke_cell_boxes(self, pdf_canvas, boxes: List, page_height: float) -> None:
        """Stroke one rectangle per usable box with the configured border."""
        pdf_canvas.saveState()
        try:
            pdf_canvas.setStrokeColor(self.config.border_color)
            pdf_canvas.setLineWidth(self.config.border_width)
            for box in boxes:
                if box is None or len(box) < 4:
                    continue
                x0, y0, x1, y1 = box[:4]
                # Flip Y: the rectangle origin is the box's bottom edge.
                pdf_canvas.rect(x0, page_height - y1, x1 - x0, y1 - y0)
        finally:
            pdf_canvas.restoreState()

    def _render_parsed_table(
        self,
        pdf_canvas,
        table_data: Dict,
        table_bbox: Tuple[float, float, float, float],
        page_height: float,
        scale_w: float = 1.0,
        scale_h: float = 1.0
    ) -> bool:
        """
        Render a parsed table structure with evenly sized columns and rows.

        Args:
            pdf_canvas: ReportLab canvas
            table_data: ``{'rows': [{'cells': [...]}, ...]}`` as produced by
                HTMLTableParser
            table_bbox: (x0, y0, x1, y1) bounding box
            page_height: PDF page height for Y coordinate flip
            scale_w: Factor applied to the bounding box width
            scale_h: Factor applied to the bounding box height

        Returns:
            True when the table was drawn, False when there is nothing to draw
        """
        rows = table_data.get('rows', [])
        if not rows:
            return False

        grid, span_commands, num_cols = self._layout_cells(
            rows, explicit_cols=False
        )
        if num_cols == 0:
            logger.warning("Parsed table has no cells")
            return False
        num_rows = len(grid)

        x0, y0, x1, y1 = table_bbox
        table_width = (x1 - x0) * scale_w
        table_height = (y1 - y0) * scale_h
        col_widths = [table_width / num_cols] * num_cols
        row_heights = [table_height / num_rows] * num_rows

        style = ParagraphStyle(
            'TableCell',
            fontName=self.config.font_name,
            fontSize=self.config.font_size,
            alignment=TA_CENTER,
            leading=self.config.font_size * 1.2
        )

        whole = ((0, 0), (-1, -1))
        table_style_commands = [
            ('GRID', *whole, self.config.border_width, self.config.border_color),
            ('VALIGN', *whole, self.config.vertical_align),
            ('ALIGN', *whole, self.config.horizontal_align),
            ('LEFTPADDING', *whole, self.config.left_padding),
            ('RIGHTPADDING', *whole, self.config.right_padding),
            ('TOPPADDING', *whole, self.config.top_padding),
            ('BOTTOMPADDING', *whole, self.config.bottom_padding),
            ('FONTNAME', *whole, self.config.font_name),
            ('FONTSIZE', *whole, self.config.font_size),
        ]
        table_style_commands.extend(span_commands)

        table = Table(
            self._paragraph_grid(grid, style),
            colWidths=col_widths, rowHeights=row_heights
        )
        table.setStyle(TableStyle(table_style_commands))

        table.wrapOn(pdf_canvas, table_width, table_height)
        # Flip Y: drawOn positions the table by its bottom-left corner.
        table.drawOn(pdf_canvas, x0, page_height - y1)
        return True

    def _render_with_dimensions(
        self,
        pdf_canvas,
        table_data: Dict,
        table_bbox: Tuple[float, float, float, float],
        page_height: float,
        col_widths: Optional[List[float]],
        row_heights: Optional[List[float]]
    ) -> bool:
        """
        Render table with specified dimensions.

        Cells are placed at their 'col' position. Widths or heights that
        are missing, or whose count does not match the laid-out grid, are
        replaced by an even split of the bounding box.

        Args:
            pdf_canvas: ReportLab canvas
            table_data: ``{'rows': [{'cells': [...]}, ...]}``
            table_bbox: (x0, y0, x1, y1) bounding box
            page_height: PDF page height for Y coordinate flip
            col_widths: Column widths, or None for an even split
            row_heights: Row heights, or None for an even split

        Returns:
            True when the table was drawn, False when there is nothing to draw
        """
        rows = table_data.get('rows', [])
        if not rows:
            return False

        grid, span_commands, num_cols = self._layout_cells(
            rows, explicit_cols=True
        )
        if num_cols == 0:
            logger.warning("Table rows contain no cells")
            return False
        num_rows = len(grid)

        x0, y0, x1, y1 = table_bbox
        if not col_widths or len(col_widths) != num_cols:
            col_widths = [(x1 - x0) / num_cols] * num_cols
        if not row_heights or len(row_heights) != num_rows:
            row_heights = [(y1 - y0) / num_rows] * num_rows

        style = ParagraphStyle(
            'TableCell',
            fontName=self.config.font_name,
            fontSize=self.config.font_size,
            alignment=TA_CENTER
        )

        whole = ((0, 0), (-1, -1))
        table_style_commands = [
            ('GRID', *whole, self.config.border_width, self.config.border_color),
            ('VALIGN', *whole, self.config.vertical_align),
            ('LEFTPADDING', *whole, 0),
            ('RIGHTPADDING', *whole, 0),
            ('TOPPADDING', *whole, 0),
            ('BOTTOMPADDING', *whole, 1),
        ]
        table_style_commands.extend(span_commands)

        table = Table(
            self._paragraph_grid(grid, style),
            colWidths=col_widths, rowHeights=row_heights
        )
        table.setStyle(TableStyle(table_style_commands))

        table.wrapOn(pdf_canvas, x1 - x0, y1 - y0)
        table.drawOn(pdf_canvas, x0, page_height - y1)
        return True

    def _build_rows_from_cells_dict(self, cells_dict: Dict) -> List[Dict]:
        """
        Convert Direct track cell structure to row format.

        Args:
            cells_dict: Dict with a 'cells' list (each cell may carry 'row',
                'col', 'content', 'col_span', 'row_span', 'is_header') and
                an optional 'rows' count

        Returns:
            One ``{'cells': [...]}`` dict per row, cells sorted by column.
            When 'rows' is missing or not positive, the row count is
            inferred from the cells. Empty list when there are no cells.
        """
        cells = cells_dict.get('cells', [])
        if not cells:
            return []

        by_row = {}
        for cell in cells:
            by_row.setdefault(cell.get('row', 0), []).append(cell)

        num_rows = cells_dict.get('rows') or 0
        if num_rows <= 0:
            # No declared size: cover every cell including its row span.
            num_rows = max(
                cell.get('row', 0) + self._coerce_span(cell.get('row_span', 1))
                for cell in cells
            )

        rows = []
        for row_idx in range(num_rows):
            ordered = sorted(
                by_row.get(row_idx, []), key=lambda c: c.get('col', 0)
            )
            rows.append({
                'cells': [
                    {
                        'text': self._content_to_text(cell.get('content', '')),
                        'colspan': cell.get('col_span', 1),
                        'rowspan': cell.get('row_span', 1),
                        'col': cell.get('col', 0),
                        'is_header': cell.get('is_header', False)
                    }
                    for cell in ordered
                ]
            })
        return rows

    def _draw_table_border(
        self,
        pdf_canvas,
        table_bbox: Tuple[float, float, float, float],
        page_height: float
    ) -> bool:
        """
        Draw outer table border.

        Returns:
            True if the rectangle was drawn, False if drawing failed
        """
        try:
            x0, y0, x1, y1 = table_bbox
            pdf_canvas.saveState()
            try:
                pdf_canvas.setStrokeColor(self.config.border_color)
                pdf_canvas.setLineWidth(self.config.border_width)
                pdf_canvas.rect(x0, page_height - y1, x1 - x0, y1 - y0)
            finally:
                pdf_canvas.restoreState()
            return True
        except Exception as e:
            logger.error(f"Failed to draw table border: {e}")
            return False

    def _draw_embedded_image(
        self,
        pdf_canvas,
        img_info: Dict,
        page_height: float,
        output_dir: Path
    ) -> bool:
        """
        Draw an image embedded within a table cell.

        Args:
            pdf_canvas: ReportLab canvas
            img_info: Dict with 'path' and a 'bbox' dict holding 'x0', 'y0',
                'width' and 'height' (missing values default to 0, 0, 100,
                100)
            page_height: PDF page height for Y coordinate flip
            output_dir: Base directory for relative image paths

        Returns:
            True if the image was drawn, False if it is missing or failed
        """
        try:
            raw_path = img_info.get('path')
            if not raw_path:
                return False

            img_path = Path(raw_path)
            if not img_path.is_absolute():
                img_path = output_dir / img_path

            if not img_path.exists():
                logger.warning(f"Embedded image not found: {img_path}")
                return False

            bbox = img_info.get('bbox', {})
            x0 = bbox.get('x0', 0)
            y0 = bbox.get('y0', 0)
            width = bbox.get('width', 100)
            height = bbox.get('height', 100)

            # Flip Y: drawImage positions the image by its bottom-left corner.
            pdf_y = page_height - y0 - height
            pdf_canvas.drawImage(ImageReader(str(img_path)), x0, pdf_y, width, height)
            return True

        except Exception as e:
            logger.error(f"Failed to draw embedded image: {e}")
            return False

    def _fit_text_to_cell(
        self,
        pdf_canvas,
        text: str,
        cell_width: float,
        cell_height: float
    ) -> int:
        """
        Find font size that fits text in cell.

        Sizes are tried from ``config.max_font_size`` down to
        ``config.min_font_size``. A size fits when the text is no wider
        than the cell minus 3pt on each side and the size does not exceed
        the cell height.

        Returns:
            The largest fitting size, or ``config.min_font_size`` when none fits
        """
        font_name = self.config.font_name
        usable_width = cell_width - 6  # 3pt padding each side
        for size in range(self.config.max_font_size, self.config.min_font_size - 1, -1):
            if size > cell_height:
                continue
            if pdf_canvas.stringWidth(text, font_name, size) <= usable_width:
                return size
        return self.config.min_font_size

    def _merge_boundaries(self, values: List[float], threshold: float) -> List[float]:
        """
        Merge nearby boundary values.

        Walks ``values`` in order and keeps a value only when it is more
        than ``threshold`` away from the last kept one, so each run of close
        values is represented by its first member.
        """
        merged = []
        for value in values:
            if not merged or abs(value - merged[-1]) > threshold:
                merged.append(value)
        return merged

    def _cluster_values(self, values: List[float], threshold: float) -> List[Tuple[float, List[float]]]:
        """
        Cluster nearby values and return (average, members) pairs.

        A value joins the current cluster when it is within ``threshold`` of
        the cluster's most recent member; ``values`` is expected sorted.
        """
        groups = []
        for value in values:
            if groups and abs(value - groups[-1][-1]) <= threshold:
                groups[-1].append(value)
            else:
                groups.append([value])
        return [(sum(members) / len(members), members) for members in groups]
|
||||
Reference in New Issue
Block a user