## Backend Changes

- **Service Layer Refactoring**:
  - Add ProcessingOrchestrator for unified document processing
  - Add PDFTableRenderer for table rendering extraction
  - Add PDFFontManager for font management with CJK support
  - Add MemoryPolicyEngine (73% code reduction from MemoryGuard)
- **Bug Fixes**:
  - Fix Direct Track table row span calculation
  - Fix OCR Track image path handling
  - Add cell_boxes coordinate validation
  - Filter out small decorative images
  - Add covering image detection

## Frontend Changes

- **State Management**:
  - Add TaskStore for centralized task state management
  - Add localStorage persistence for recent tasks
  - Add processing state tracking
- **Type Consolidation**:
  - Merge shared types from api.ts to apiV2.ts
  - Update imports in authStore, uploadStore, ResultsTable, SettingsPage
- **Page Integration**:
  - Integrate TaskStore in ProcessingPage and TaskDetailPage
  - Update useTaskValidation hook with cache sync

## Testing

- Direct Track: edit.pdf (3 pages, 1.281s), edit3.pdf (2 pages, 0.203s)
- Cell boxes validation: 43 valid, 0 invalid
- Table merging: 12 merged cells verified

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
918 lines
30 KiB
Python
918 lines
30 KiB
Python
"""
|
|
PDF Table Renderer - Handles table rendering for PDF generation.
|
|
|
|
This module provides unified table rendering capabilities extracted from
|
|
PDFGeneratorService, supporting multiple input formats:
|
|
- HTML tables
|
|
- Cell boxes (layered approach)
|
|
- Cells dictionary (Direct track)
|
|
- TableData objects
|
|
"""
|
|
|
|
import logging
|
|
from dataclasses import dataclass, field
|
|
from html.parser import HTMLParser
|
|
from pathlib import Path
|
|
from typing import Any, Dict, List, Optional, Tuple, Union
|
|
|
|
from reportlab.lib import colors
|
|
from reportlab.lib.enums import TA_CENTER, TA_LEFT, TA_RIGHT
|
|
from reportlab.lib.styles import ParagraphStyle
|
|
from reportlab.lib.utils import ImageReader
|
|
from reportlab.platypus import Paragraph, Table, TableStyle
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
# ============================================================================
|
|
# Configuration
|
|
# ============================================================================
|
|
|
|
@dataclass
class TableRenderConfig:
    """Configuration for table rendering.

    Groups the font, padding, border, alignment and grid-snapping settings
    that TableRenderer reads while drawing a table.
    """
    # Font used for cell text. font_size is the default size; min_font_size
    # and max_font_size bound the range searched when text is fitted to a cell.
    font_name: str = "Helvetica"
    font_size: int = 8
    min_font_size: int = 6
    max_font_size: int = 10

    # Padding options (applied as LEFT/RIGHT/TOP/BOTTOMPADDING table styles;
    # presumably PDF points - confirm against callers)
    left_padding: int = 2
    right_padding: int = 2
    top_padding: int = 2
    bottom_padding: int = 2

    # Border options: stroke colour and line width for the grid and for
    # individually drawn cell rectangles
    border_color: Any = colors.black
    border_width: float = 0.5

    # Alignment (passed as the ALIGN / VALIGN table style values)
    horizontal_align: str = "CENTER"
    vertical_align: str = "MIDDLE"

    # Header styling
    # NOTE(review): not referenced by the rendering code visible in this file.
    header_background: Any = colors.lightgrey

    # Grid normalization threshold: coordinates closer than this are clustered
    # together when cell boxes are snapped to a grid
    grid_threshold: float = 10.0

    # Merged cells threshold: boundaries closer than this are merged when
    # column widths / row heights are derived from cell boxes
    merge_boundary_threshold: float = 5.0
|
|
|
|
|
|
# ============================================================================
|
|
# HTML Table Parser
|
|
# ============================================================================
|
|
|
|
class HTMLTableParser(HTMLParser):
    """
    Parse HTML table structure for rendering.

    Extracts table rows, cells, and merged cell information (colspan/rowspan)
    from HTML table markup.

    After ``feed()``, ``tables`` holds one dict per completed ``<table>``:
    ``{'rows': [{'cells': [{'text', 'is_header', 'colspan', 'rowspan'}]}]}``.

    Cells and rows whose closing tags are omitted (``<td>a<td>b``) are closed
    implicitly when the next cell/row starts or the row/table ends. Only one
    table is tracked at a time: a nested ``<table>`` discards the enclosing
    table that is still open.
    """

    def __init__(self):
        super().__init__()
        self.tables = []
        self.current_table = None
        self.current_row = None
        self.current_cell = None
        self.in_cell = False

    @staticmethod
    def _parse_span(value) -> int:
        """Return a colspan/rowspan attribute as an int >= 1 (1 if invalid)."""
        try:
            span = int(value)
        except (TypeError, ValueError):
            # Missing value (None), empty string or non-numeric text.
            return 1
        return span if span >= 1 else 1

    def _finish_cell(self) -> None:
        """Append the open cell (if any) to the open row and reset cell state."""
        if self.current_cell is None:
            return
        if self.current_row is not None:
            self.current_row['cells'].append(self.current_cell)
        self.current_cell = None
        self.in_cell = False

    def _finish_row(self) -> None:
        """Close any open cell, then append the open row to the open table."""
        self._finish_cell()
        if self.current_row is None:
            return
        if self.current_table is not None:
            self.current_table['rows'].append(self.current_row)
        self.current_row = None

    def handle_starttag(self, tag: str, attrs: List[Tuple[str, Optional[str]]]):
        if tag == 'table':
            # A new table replaces whatever was being collected.
            self.current_table = {'rows': []}
            self.current_row = None
            self.current_cell = None
            self.in_cell = False
        elif tag == 'tr':
            # Implicitly close a previous row that had no </tr>.
            self._finish_row()
            self.current_row = {'cells': []}
        elif tag in ('td', 'th'):
            # Implicitly close a previous cell that had no </td> / </th>.
            self._finish_cell()
            attrs_dict = dict(attrs)
            self.current_cell = {
                'text': '',
                'is_header': tag == 'th',
                'colspan': self._parse_span(attrs_dict.get('colspan', 1)),
                'rowspan': self._parse_span(attrs_dict.get('rowspan', 1)),
            }
            self.in_cell = True

    def handle_endtag(self, tag: str):
        if tag == 'table':
            self._finish_row()
            if self.current_table is not None:
                self.tables.append(self.current_table)
            self.current_table = None
        elif tag == 'tr':
            self._finish_row()
        elif tag in ('td', 'th'):
            self._finish_cell()

    def handle_data(self, data: str):
        # Text is only collected while a cell is open.
        if self.in_cell and self.current_cell is not None:
            self.current_cell['text'] += data
|
|
|
|
|
|
# ============================================================================
|
|
# Table Renderer
|
|
# ============================================================================
|
|
|
|
class TableRenderer:
    """
    Table drawing engine used during PDF generation.

    Accepts several table representations:
    - HTML table markup (parsed, then rendered)
    - Cell boxes (borders only; layered approach)
    - Direct track cells dictionary
    - Translated cells rendered with a font size fitted to each cell
    """

    def __init__(self, config: Optional[TableRenderConfig] = None):
        """
        Create a renderer.

        Args:
            config: TableRenderConfig to use; a default-constructed one is
                created when this is None (or otherwise falsy)
        """
        if config:
            self.config = config
        else:
            self.config = TableRenderConfig()
|
|
|
|
def render_from_html(
    self,
    pdf_canvas,
    html_content: str,
    table_bbox: Tuple[float, float, float, float],
    page_height: float,
    scale_w: float = 1.0,
    scale_h: float = 1.0
) -> bool:
    """
    Parse HTML and render table to PDF canvas.

    Only the first table found in ``html_content`` is rendered. Any exception
    raised while parsing or drawing is logged (with traceback, through the
    module logger) and reported as a False return value.

    Args:
        pdf_canvas: ReportLab canvas
        html_content: HTML table string
        table_bbox: (x0, y0, x1, y1) bounding box
        page_height: PDF page height for Y coordinate flip
        scale_w: Horizontal scale factor
        scale_h: Vertical scale factor

    Returns:
        True if successful, False otherwise
    """
    try:
        # Parse HTML
        parser = HTMLTableParser()
        parser.feed(html_content)

        if not parser.tables:
            logger.warning("No tables found in HTML content")
            return False

        table_data = parser.tables[0]
        return self._render_parsed_table(
            pdf_canvas, table_data, table_bbox, page_height, scale_w, scale_h
        )

    except Exception as e:
        # logger.exception records the traceback via logging instead of
        # printing it to stderr.
        logger.exception(f"HTML table rendering failed: {e}")
        return False
|
|
|
|
def render_from_cells_dict(
    self,
    pdf_canvas,
    cells_dict: Dict,
    table_bbox: Tuple[float, float, float, float],
    page_height: float,
    cell_boxes: Optional[List] = None
) -> bool:
    """
    Render table from Direct track cell structure.

    Column widths and row heights are derived from ``cell_boxes`` when given.
    If that derivation yields nothing for an axis, or no boxes are supplied,
    the table bounding box is divided evenly on that axis.

    Args:
        pdf_canvas: ReportLab canvas
        cells_dict: Dict with 'rows', 'cols', 'cells' keys
        table_bbox: (x0, y0, x1, y1) bounding box
        page_height: PDF page height
        cell_boxes: Optional precomputed cell boxes

    Returns:
        True if successful, False otherwise
    """
    try:
        # Convert cells dict to row format
        rows = self._build_rows_from_cells_dict(cells_dict)

        if not rows:
            logger.warning("No rows built from cells dict")
            return False

        table_data = {'rows': rows}

        x0, y0, x1, y1 = table_bbox
        table_width = x1 - x0
        table_height = y1 - y0

        # Grid dimensions: prefer the declared counts, but never let a
        # missing or zero value reach the divisions below.
        num_rows = cells_dict.get('rows') or len(rows)
        num_cols = (
            cells_dict.get('cols')
            or max((len(row['cells']) for row in rows), default=0)
            or 1
        )

        col_widths = None
        row_heights = None
        if cell_boxes:
            col_widths, row_heights = self.compute_grid_from_cell_boxes(
                cell_boxes, table_bbox, num_rows, num_cols
            )

        # compute_grid_from_cell_boxes may return None for either axis;
        # fall back to a uniform split instead of failing the render.
        if not col_widths:
            col_widths = [table_width / num_cols] * num_cols
        if not row_heights:
            row_heights = [table_height / num_rows] * num_rows

        return self._render_with_dimensions(
            pdf_canvas, table_data, table_bbox, page_height,
            col_widths, row_heights
        )

    except Exception as e:
        # logger.exception records the traceback via logging instead of
        # printing it to stderr.
        logger.exception(f"Cells dict rendering failed: {e}")
        return False
|
|
|
|
def render_cell_borders(
    self,
    pdf_canvas,
    cell_boxes: List[List[float]],
    table_bbox: Tuple[float, float, float, float],
    page_height: float,
    embedded_images: Optional[List] = None,
    output_dir: Optional[Path] = None
) -> bool:
    """
    Render table cell borders only (layered approach).

    This renders only the cell borders, not the text content.
    Text is typically rendered separately by GapFillingService.

    When ``cell_boxes`` is empty only the outer table border is drawn.
    Entries that are None or have fewer than four coordinates are skipped.
    The canvas graphics state is restored even if drawing fails part-way.

    Args:
        pdf_canvas: ReportLab canvas
        cell_boxes: List of [x0, y0, x1, y1] for each cell
        table_bbox: Table bounding box
        page_height: PDF page height
        embedded_images: Optional list of images within cells
        output_dir: Directory for image files

    Returns:
        True if successful, False otherwise
    """
    try:
        if not cell_boxes:
            # Draw outer border only
            return self._draw_table_border(
                pdf_canvas, table_bbox, page_height
            )

        # Normalize cell boxes to grid
        normalized_boxes = self.normalize_cell_boxes_to_grid(cell_boxes)

        pdf_canvas.saveState()
        try:
            pdf_canvas.setStrokeColor(self.config.border_color)
            pdf_canvas.setLineWidth(self.config.border_width)

            for box in normalized_boxes:
                # Normalization passes malformed entries through unchanged,
                # so they have to be filtered here.
                if box is None or len(box) < 4:
                    continue

                x0, y0, x1, y1 = box[:4]
                # Convert to PDF coordinates (flip Y)
                pdf_y0 = page_height - y1
                pdf_y1 = page_height - y0

                pdf_canvas.rect(x0, pdf_y0, x1 - x0, pdf_y1 - pdf_y0)
        finally:
            # Always pair saveState with restoreState.
            pdf_canvas.restoreState()

        # Draw embedded images if any
        if embedded_images and output_dir:
            for img_info in embedded_images:
                self._draw_embedded_image(
                    pdf_canvas, img_info, page_height, output_dir
                )

        return True

    except Exception as e:
        # logger.exception records the traceback via logging instead of
        # printing it to stderr.
        logger.exception(f"Cell borders rendering failed: {e}")
        return False
|
|
|
|
def render_with_translated_text(
    self,
    pdf_canvas,
    cells: List[Dict],
    cell_boxes: List,
    table_bbox: Tuple[float, float, float, float],
    page_height: float
) -> bool:
    """
    Render table with translated content and dynamic font sizing.

    Draws the outer border, then each cell border, then the
    'translated_content' of ``cells[i]`` centred inside ``cell_boxes[i]``
    (after grid normalization). Cells beyond the number of boxes, cells with
    no translated text, and boxes that are None or have fewer than four
    coordinates are skipped. Stroke and font settings are applied inside a
    saved graphics state that is always restored.

    Args:
        pdf_canvas: ReportLab canvas
        cells: List of cell dicts with 'translated_content'
        cell_boxes: List of cell bounding boxes
        table_bbox: Table bounding box
        page_height: PDF page height

    Returns:
        True if successful, False otherwise (including when no cell boxes
        are supplied; the outer border has already been drawn in that case)
    """
    try:
        # Draw outer border
        self._draw_table_border(pdf_canvas, table_bbox, page_height)

        if not cell_boxes:
            logger.warning("No cell boxes for translated table")
            return False

        normalized_boxes = self.normalize_cell_boxes_to_grid(cell_boxes)
        font_name = self.config.font_name

        pdf_canvas.saveState()
        try:
            pdf_canvas.setStrokeColor(self.config.border_color)
            pdf_canvas.setLineWidth(self.config.border_width)

            # Draw cell borders
            for box in normalized_boxes:
                if box is None or len(box) < 4:
                    continue
                x0, y0, x1, y1 = box[:4]
                pdf_y0 = page_height - y1  # flip Y
                pdf_canvas.rect(x0, pdf_y0, x1 - x0, y1 - y0)

            # Render text in cells with dynamic font sizing; zip stops at
            # the shorter of the two lists.
            for cell, box in zip(cells, normalized_boxes):
                if box is None or len(box) < 4:
                    continue

                translated_text = cell.get('translated_content', '')
                if not translated_text:
                    continue

                x0, y0, x1, y1 = box[:4]
                cell_width = x1 - x0
                cell_height = y1 - y0

                font_size = self._fit_text_to_cell(
                    pdf_canvas, translated_text, cell_width, cell_height
                )
                pdf_canvas.setFont(font_name, font_size)

                # Centre horizontally; vertically place the baseline a third
                # of the font size below the cell's mid-line.
                text_width = pdf_canvas.stringWidth(
                    translated_text, font_name, font_size
                )
                text_x = x0 + (cell_width - text_width) / 2
                text_y = page_height - y0 - cell_height / 2 - font_size / 3

                pdf_canvas.drawString(text_x, text_y, translated_text)
        finally:
            # Restores stroke settings and the font changed above.
            pdf_canvas.restoreState()

        return True

    except Exception as e:
        # logger.exception records the traceback via logging instead of
        # printing it to stderr.
        logger.exception(f"Translated table rendering failed: {e}")
        return False
|
|
|
|
# =========================================================================
|
|
# Grid and Cell Box Helpers
|
|
# =========================================================================
|
|
|
|
def compute_grid_from_cell_boxes(
    self,
    cell_boxes: List,
    table_bbox: Tuple[float, float, float, float],
    num_rows: int,
    num_cols: int
) -> Tuple[Optional[List[float]], Optional[List[float]]]:
    """
    Derive column widths and row heights from cell bounding boxes.

    Box edges are rounded to one decimal, de-duplicated, sorted, and edges
    closer than ``config.merge_boundary_threshold`` are merged; the gaps
    between the surviving edges are the widths / heights. A mismatch against
    the expected counts is only logged at debug level.

    Args:
        cell_boxes: List of [x0, y0, x1, y1] for each cell
        table_bbox: Table bounding box (not used by the computation)
        num_rows: Expected number of rows
        num_cols: Expected number of columns

    Returns:
        Tuple of (col_widths, row_heights); either element is None when it
        would be empty, and (None, None) is returned on failure
    """
    try:
        if not cell_boxes:
            return None, None

        # Keep only entries that carry at least four coordinates
        usable = [b for b in cell_boxes if b is not None and len(b) >= 4]
        if not usable:
            return None, None

        # Collect distinct edge positions on each axis
        x_edges = set()
        y_edges = set()
        for left, top, right, bottom in (b[:4] for b in usable):
            x_edges.update((round(left, 1), round(right, 1)))
            y_edges.update((round(top, 1), round(bottom, 1)))

        # Collapse edges that sit within the merge threshold of each other
        x_lines = self._merge_boundaries(
            sorted(x_edges), self.config.merge_boundary_threshold
        )
        y_lines = self._merge_boundaries(
            sorted(y_edges), self.config.merge_boundary_threshold
        )

        # Gaps between consecutive grid lines
        col_widths = [hi - lo for lo, hi in zip(x_lines, x_lines[1:])]
        row_heights = [hi - lo for lo, hi in zip(y_lines, y_lines[1:])]

        # Validate against expected dimensions (allow for merged cells)
        slack = max(num_cols, num_rows) // 2 + 1
        if abs(len(col_widths) - num_cols) > slack:
            logger.debug(f"Column count mismatch: {len(col_widths)} vs {num_cols}")
        if abs(len(row_heights) - num_rows) > slack:
            logger.debug(f"Row count mismatch: {len(row_heights)} vs {num_rows}")

        return (col_widths or None), (row_heights or None)

    except Exception as e:
        logger.error(f"Grid computation failed: {e}")
        return None, None
|
|
|
|
def normalize_cell_boxes_to_grid(
    self,
    cell_boxes: List,
    threshold: Optional[float] = None
) -> List:
    """
    Snap cell boxes to aligned grid to eliminate coordinate variations.

    All X coordinates (and, separately, all Y coordinates) are sorted and
    clustered; every coordinate is then replaced by the average of its
    cluster. Entries that are None or have fewer than four values are passed
    through unchanged; valid entries are returned as new 4-element lists.

    Args:
        cell_boxes: List of [x0, y0, x1, y1] for each cell
        threshold: Clustering threshold (uses config default if None).
            An explicit 0 is honoured and merges only identical values.

    Returns:
        Normalized cell boxes ([] for empty input; the input list itself if
        it holds no usable box or if normalization fails)
    """
    # Compare against None so that an explicit threshold of 0 is kept.
    if threshold is None:
        threshold = self.config.grid_threshold

    if not cell_boxes:
        return []

    try:
        # Collect all coordinates
        all_x = []
        all_y = []

        for box in cell_boxes:
            if box is None or len(box) < 4:
                continue
            x0, y0, x1, y1 = box[:4]
            all_x.extend([x0, x1])
            all_y.extend([y0, y1])

        if not all_x or not all_y:
            return cell_boxes

        # Cluster each axis independently
        x_clusters = self._cluster_values(sorted(all_x), threshold)
        y_clusters = self._cluster_values(sorted(all_y), threshold)

        # Map every original coordinate to its cluster average
        x_map = {v: avg for avg, values in x_clusters for v in values}
        y_map = {v: avg for avg, values in y_clusters for v in values}

        # Normalize boxes
        normalized = []
        for box in cell_boxes:
            if box is None or len(box) < 4:
                normalized.append(box)
                continue

            x0, y0, x1, y1 = box[:4]
            normalized.append([
                x_map.get(x0, x0),
                y_map.get(y0, y0),
                x_map.get(x1, x1),
                y_map.get(y1, y1)
            ])

        return normalized

    except Exception as e:
        logger.error(f"Cell box normalization failed: {e}")
        return cell_boxes
|
|
|
|
# =========================================================================
|
|
# Private Helper Methods
|
|
# =========================================================================
|
|
|
|
def _render_parsed_table(
    self,
    pdf_canvas,
    table_data: Dict,
    table_bbox: Tuple[float, float, float, float],
    page_height: float,
    scale_w: float = 1.0,
    scale_h: float = 1.0
) -> bool:
    """
    Render a parsed table structure.

    Cells are laid out left to right in each row, skipping grid positions
    already covered by a rowspan/colspan from an earlier cell. The column
    count is the widest extent reached by any cell including its colspan, so
    cells after a spanning cell are not dropped. Columns and rows share the
    (scaled) bounding box evenly.

    Args:
        pdf_canvas: ReportLab canvas
        table_data: Dict with a 'rows' list; each row has a 'cells' list of
            dicts with 'text', 'colspan' and 'rowspan'
        table_bbox: (x0, y0, x1, y1) bounding box
        page_height: PDF page height for Y coordinate flip
        scale_w: Horizontal scale factor applied to the table width
        scale_h: Vertical scale factor applied to the table height

    Returns:
        True if the table was drawn, False if there were no rows or no cells
    """
    # Local import keeps this method self-contained.
    from xml.sax.saxutils import escape

    rows = table_data.get('rows', [])
    if not rows:
        return False

    num_rows = len(rows)

    # Pass 1: assign every cell a grid position and find the real column
    # count. Occupancy is a set so the width need not be known up front.
    taken = set()
    placements = []  # (row, col, colspan, rowspan, text)
    num_cols = 0

    for row_idx, row in enumerate(rows):
        col_idx = 0
        for cell in row.get('cells', []):
            # Skip positions covered by earlier spans
            while (row_idx, col_idx) in taken:
                col_idx += 1

            colspan = max(1, cell.get('colspan') or 1)
            # A rowspan may not extend past the last row
            rowspan = min(max(1, cell.get('rowspan') or 1), num_rows - row_idx)
            text = (cell.get('text') or '').strip()

            for r in range(row_idx, row_idx + rowspan):
                for c in range(col_idx, col_idx + colspan):
                    taken.add((r, c))

            placements.append((row_idx, col_idx, colspan, rowspan, text))
            col_idx += colspan
            num_cols = max(num_cols, col_idx)

    if num_cols == 0:
        # Rows exist but none of them has a cell; nothing to draw.
        return False

    # Calculate dimensions
    x0, y0, x1, y1 = table_bbox
    table_width = (x1 - x0) * scale_w
    table_height = (y1 - y0) * scale_h

    col_widths = [table_width / num_cols] * num_cols
    row_heights = [table_height / num_rows] * num_rows

    # Create paragraph style
    style = ParagraphStyle(
        'TableCell',
        fontName=self.config.font_name,
        fontSize=self.config.font_size,
        alignment=TA_CENTER,
        leading=self.config.font_size * 1.2
    )

    # Pass 2: fill the grid and collect SPAN commands
    para_grid = [[''] * num_cols for _ in range(num_rows)]
    span_commands = []
    for row_idx, col_idx, colspan, rowspan, text in placements:
        if text:
            # Paragraph parses its text as XML-like markup, so literal
            # '&', '<' and '>' have to be escaped.
            para_grid[row_idx][col_idx] = Paragraph(escape(text), style)
        if colspan > 1 or rowspan > 1:
            span_commands.append((
                'SPAN',
                (col_idx, row_idx),
                (col_idx + colspan - 1, row_idx + rowspan - 1),
            ))

    # NOTE(review): the cells' 'is_header' flag and config.header_background
    # are not applied here.
    table_style_commands = [
        ('GRID', (0, 0), (-1, -1), self.config.border_width, self.config.border_color),
        ('VALIGN', (0, 0), (-1, -1), self.config.vertical_align),
        ('ALIGN', (0, 0), (-1, -1), self.config.horizontal_align),
        ('LEFTPADDING', (0, 0), (-1, -1), self.config.left_padding),
        ('RIGHTPADDING', (0, 0), (-1, -1), self.config.right_padding),
        ('TOPPADDING', (0, 0), (-1, -1), self.config.top_padding),
        ('BOTTOMPADDING', (0, 0), (-1, -1), self.config.bottom_padding),
        ('FONTNAME', (0, 0), (-1, -1), self.config.font_name),
        ('FONTSIZE', (0, 0), (-1, -1), self.config.font_size),
    ]
    table_style_commands.extend(span_commands)

    # Create and draw table
    table = Table(para_grid, colWidths=col_widths, rowHeights=row_heights)
    table.setStyle(TableStyle(table_style_commands))

    # Position and draw.
    # NOTE(review): the anchor uses the unscaled bbox while the size is
    # scaled by scale_w / scale_h - confirm this is intended.
    pdf_x = x0
    pdf_y = page_height - y1  # Flip Y

    table.wrapOn(pdf_canvas, table_width, table_height)
    table.drawOn(pdf_canvas, pdf_x, pdf_y)

    return True
|
|
|
|
def _render_with_dimensions(
    self,
    pdf_canvas,
    table_data: Dict,
    table_bbox: Tuple[float, float, float, float],
    page_height: float,
    col_widths: List[float],
    row_heights: List[float]
) -> bool:
    """
    Render table with specified dimensions.

    Each cell is placed at its 'col' index (default 0), moved right past any
    position already covered by an earlier cell or span. The column count is
    the widest extent reached by any cell including its colspan. If
    ``col_widths`` / ``row_heights`` are missing or do not match the grid
    size, that axis falls back to an even split of the bounding box.

    Args:
        pdf_canvas: ReportLab canvas
        table_data: Dict with a 'rows' list; each row has a 'cells' list of
            dicts with 'text', 'col', 'colspan' and 'rowspan'
        table_bbox: (x0, y0, x1, y1) bounding box
        page_height: PDF page height for Y coordinate flip
        col_widths: Width of each column
        row_heights: Height of each row

    Returns:
        True if the table was drawn, False if there were no rows or no cells
    """
    # Local import keeps this method self-contained.
    from xml.sax.saxutils import escape

    rows = table_data.get('rows', [])
    if not rows:
        return False

    num_rows = len(rows)

    # Pass 1: assign grid positions and find the real column count.
    taken = set()
    placements = []  # (row, col, colspan, rowspan, text)
    num_cols = 0

    for row_idx, row in enumerate(rows):
        for cell in row.get('cells', []):
            # Start at the declared column; negative values are clamped.
            col_idx = max(0, cell.get('col') or 0)
            while (row_idx, col_idx) in taken:
                col_idx += 1

            colspan = max(1, cell.get('colspan') or 1)
            # A rowspan may not extend past the last row
            rowspan = min(max(1, cell.get('rowspan') or 1), num_rows - row_idx)
            text = (cell.get('text') or '').strip()

            for r in range(row_idx, row_idx + rowspan):
                for c in range(col_idx, col_idx + colspan):
                    taken.add((r, c))

            placements.append((row_idx, col_idx, colspan, rowspan, text))
            num_cols = max(num_cols, col_idx + colspan)

    if num_cols == 0:
        # Rows exist but none of them has a cell; nothing to draw.
        return False

    x0, y0, x1, y1 = table_bbox

    # Adjust widths/heights if missing or inconsistent with the grid
    if not col_widths or len(col_widths) != num_cols:
        col_widths = [(x1 - x0) / num_cols] * num_cols
    if not row_heights or len(row_heights) != num_rows:
        row_heights = [(y1 - y0) / num_rows] * num_rows

    style = ParagraphStyle(
        'TableCell',
        fontName=self.config.font_name,
        fontSize=self.config.font_size,
        alignment=TA_CENTER
    )

    # Pass 2: fill the grid and collect SPAN commands
    para_grid = [[''] * num_cols for _ in range(num_rows)]
    span_commands = []
    for row_idx, col_idx, colspan, rowspan, text in placements:
        if text:
            # Paragraph parses its text as XML-like markup: escape literal
            # '&', '<', '>' and turn newlines into explicit line breaks.
            markup = escape(text).replace('\n', '<br/>')
            para_grid[row_idx][col_idx] = Paragraph(markup, style)
        if colspan > 1 or rowspan > 1:
            span_commands.append((
                'SPAN',
                (col_idx, row_idx),
                (col_idx + colspan - 1, row_idx + rowspan - 1),
            ))

    # Padding here is fixed rather than taken from the config.
    table_style_commands = [
        ('GRID', (0, 0), (-1, -1), self.config.border_width, self.config.border_color),
        ('VALIGN', (0, 0), (-1, -1), self.config.vertical_align),
        ('LEFTPADDING', (0, 0), (-1, -1), 0),
        ('RIGHTPADDING', (0, 0), (-1, -1), 0),
        ('TOPPADDING', (0, 0), (-1, -1), 0),
        ('BOTTOMPADDING', (0, 0), (-1, -1), 1),
    ]
    table_style_commands.extend(span_commands)

    table = Table(para_grid, colWidths=col_widths, rowHeights=row_heights)
    table.setStyle(TableStyle(table_style_commands))

    pdf_x = x0
    pdf_y = page_height - y1  # Flip Y

    table.wrapOn(pdf_canvas, x1 - x0, y1 - y0)
    table.drawOn(pdf_canvas, pdf_x, pdf_y)

    return True
|
|
|
|
def _build_rows_from_cells_dict(self, cells_dict: Dict) -> List[Dict]:
    """
    Convert Direct track cell structure to row format.

    Cells are grouped by their 'row' index and sorted by 'col'. One output
    row is produced for every index in ``range(rows)``, so rows without
    cells are kept as empty rows. When 'rows' is missing or not positive,
    the row count is inferred from the cells (largest row index plus span).

    Args:
        cells_dict: Dict with 'cells' (list of dicts using 'row', 'col',
            'content', 'row_span', 'col_span', 'is_header') and optionally
            'rows'

    Returns:
        List of ``{'cells': [...]}`` dicts whose cells carry 'text',
        'colspan', 'rowspan', 'col' and 'is_header'; [] if there are no cells
    """
    cells = cells_dict.get('cells', [])
    if not cells:
        return []

    # Group cells by row
    rows_data = {}
    for cell in cells:
        rows_data.setdefault(cell.get('row', 0), []).append(cell)

    num_rows = cells_dict.get('rows') or 0
    if num_rows <= 0:
        # No usable declared count: cover every row any cell reaches.
        num_rows = max(
            cell.get('row', 0) + max(1, cell.get('row_span', 1) or 1)
            for cell in cells
        )

    rows = []
    for row_idx in range(num_rows):
        row_cells = sorted(
            rows_data.get(row_idx, []), key=lambda c: c.get('col', 0)
        )

        formatted_cells = []
        for cell in row_cells:
            content = cell.get('content', '')
            if isinstance(content, list):
                content = '\n'.join(str(c) for c in content)

            # Falsy content is rendered as empty text, except numeric zero,
            # which is real cell data.
            is_number = (
                isinstance(content, (int, float))
                and not isinstance(content, bool)
            )
            text = str(content) if (content or is_number) else ''

            formatted_cells.append({
                'text': text,
                'colspan': cell.get('col_span', 1),
                'rowspan': cell.get('row_span', 1),
                'col': cell.get('col', 0),
                'is_header': cell.get('is_header', False)
            })

        rows.append({'cells': formatted_cells})

    return rows
|
|
|
|
def _draw_table_border(
    self,
    pdf_canvas,
    table_bbox: Tuple[float, float, float, float],
    page_height: float
) -> bool:
    """
    Stroke a rectangle around the whole table.

    Args:
        pdf_canvas: ReportLab canvas
        table_bbox: (x0, y0, x1, y1) bounding box
        page_height: PDF page height, used to flip the Y axis

    Returns:
        True if the border was drawn, False if drawing raised
    """
    try:
        left, top, right, bottom = table_bbox
        # Flip Y: the box's bottom edge becomes the rectangle's lower edge
        lower_edge = page_height - bottom
        upper_edge = page_height - top

        pdf_canvas.saveState()
        pdf_canvas.setStrokeColor(self.config.border_color)
        pdf_canvas.setLineWidth(self.config.border_width)
        pdf_canvas.rect(left, lower_edge, right - left, upper_edge - lower_edge)
        pdf_canvas.restoreState()
    except Exception as e:
        logger.error(f"Failed to draw table border: {e}")
        return False

    return True
|
|
|
|
def _draw_embedded_image(
    self,
    pdf_canvas,
    img_info: Dict,
    page_height: float,
    output_dir: Path
) -> bool:
    """
    Draw one image that sits inside a table cell.

    Args:
        pdf_canvas: ReportLab canvas
        img_info: Dict with 'path' and a 'bbox' dict; the bbox is read for
            'x0', 'y0' (default 0) and 'width', 'height' (default 100)
        page_height: PDF page height, used to flip the Y axis
        output_dir: Directory that relative image paths are resolved against

    Returns:
        True if the image was drawn; False if the path is missing, the file
        does not exist, or drawing raised
    """
    try:
        source = img_info.get('path')
        if not source:
            return False

        # Relative paths are resolved against the output directory
        img_path = source if Path(source).is_absolute() else output_dir / source

        if not Path(img_path).exists():
            logger.warning(f"Embedded image not found: {img_path}")
            return False

        bbox = img_info.get('bbox', {})
        left = bbox.get('x0', 0)
        top = bbox.get('y0', 0)
        width = bbox.get('width', 100)
        height = bbox.get('height', 100)

        # Flip Y: drawImage is given the image's lower edge
        lower_edge = page_height - top - height

        pdf_canvas.drawImage(
            ImageReader(str(img_path)), left, lower_edge, width, height
        )
        return True

    except Exception as e:
        logger.error(f"Failed to draw embedded image: {e}")
        return False
|
|
|
|
def _fit_text_to_cell(
    self,
    pdf_canvas,
    text: str,
    cell_width: float,
    cell_height: float
) -> int:
    """
    Find font size that fits text in cell.

    Sizes are tried from ``config.max_font_size`` down to
    ``config.min_font_size``. A size fits when the rendered text width leaves
    3pt of padding on each side and the font size does not exceed the cell
    height.

    Args:
        pdf_canvas: ReportLab canvas (used for ``stringWidth``)
        text: Text to be drawn on a single line
        cell_width: Available cell width
        cell_height: Available cell height

    Returns:
        The largest fitting font size, or ``config.min_font_size`` when no
        size in the range fits
    """
    usable_width = cell_width - 6  # 3pt padding each side

    for size in range(self.config.max_font_size, self.config.min_font_size - 1, -1):
        # A glyph box taller than the cell cannot fit regardless of width.
        if size > cell_height:
            continue
        text_width = pdf_canvas.stringWidth(text, self.config.font_name, size)
        if text_width <= usable_width:
            return size

    return self.config.min_font_size
|
|
|
|
def _merge_boundaries(self, values: List[float], threshold: float) -> List[float]:
    """
    Drop boundary values that lie too close to the previously kept one.

    Args:
        values: Boundary positions, expected in ascending order
        threshold: A value is kept only if it is more than this far from
            the last kept value

    Returns:
        The kept values; the first input value is always kept
    """
    if not values:
        return []

    kept = []
    for value in values:
        # The first value has nothing to be compared against
        if not kept or abs(value - kept[-1]) > threshold:
            kept.append(value)

    return kept
|
|
|
|
def _cluster_values(self, values: List[float], threshold: float) -> List[Tuple[float, List[float]]]:
    """
    Group consecutive values that lie within ``threshold`` of each other.

    Args:
        values: Values to cluster, expected in ascending order
        threshold: A value joins the current cluster when it is at most this
            far from that cluster's most recent member

    Returns:
        One (average, members) pair per cluster, in input order
    """
    if not values:
        return []

    groups = [[values[0]]]
    for value in values[1:]:
        # Compare against the latest member, not the cluster average
        if abs(value - groups[-1][-1]) <= threshold:
            groups[-1].append(value)
        else:
            groups.append([value])

    return [(sum(members) / len(members), members) for members in groups]
|