fix: improve PDF layout generation for Direct track
Key fixes:
- Skip large vector_graphics charts (>50% page coverage) that cover text
- Fix font fallback to use NotoSansSC for CJK support instead of Helvetica
- Improve translated table rendering with dynamic font sizing
- Add merged cell (row_span/col_span) support for reflow tables
- Skip text elements inside table bboxes to avoid duplication

Archive openspec proposal: fix-pdf-table-rendering

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -354,9 +354,13 @@ class PDFGeneratorService:
|
||||
elif 'courier' in font_lower:
|
||||
return 'Courier'
|
||||
|
||||
# Default fallback
|
||||
logger.debug(f"Font '{font_name}' not found in mapping, using Helvetica")
|
||||
return 'Helvetica'
|
||||
# Default fallback - use NotoSansSC for CJK support if registered
|
||||
if self.font_registered:
|
||||
logger.debug(f"Font '{font_name}' not found in mapping, using {self.font_name} for CJK support")
|
||||
return self.font_name
|
||||
else:
|
||||
logger.debug(f"Font '{font_name}' not found in mapping, using Helvetica")
|
||||
return 'Helvetica'
|
||||
|
||||
def _apply_text_style(self, c: canvas.Canvas, style_info, default_size: float = 12):
|
||||
"""
|
||||
@@ -866,6 +870,23 @@ class PDFGeneratorService:
|
||||
ElementType.IMAGE, ElementType.FIGURE,
|
||||
ElementType.CHART, ElementType.DIAGRAM, ElementType.LOGO, ElementType.STAMP
|
||||
]:
|
||||
# Skip large vector_graphics charts in Direct track
|
||||
# These are visual decorations (borders, lines, frames) that would cover text
|
||||
# PyMuPDF extracts vector graphics as images and also extracts the text layer separately
|
||||
if element.type == ElementType.CHART and element.bbox:
|
||||
content = element.content
|
||||
is_vector_graphics = (
|
||||
isinstance(content, dict) and
|
||||
content.get('source') == 'vector_graphics'
|
||||
)
|
||||
if is_vector_graphics:
|
||||
elem_area = (element.bbox.x1 - element.bbox.x0) * (element.bbox.y1 - element.bbox.y0)
|
||||
coverage_ratio = elem_area / page_area if page_area > 0 else 0
|
||||
if coverage_ratio > 0.5:
|
||||
logger.info(f"Skipping large vector_graphics chart {element.element_id} "
|
||||
f"(covers {coverage_ratio*100:.1f}% of page) - text provides actual content")
|
||||
continue
|
||||
|
||||
image_elements.append(element)
|
||||
# Only add real images to exclusion regions, NOT charts/diagrams
|
||||
# Charts often have large bounding boxes that include text labels
|
||||
@@ -3704,64 +3725,103 @@ class PDFGeneratorService:
|
||||
|
||||
def _create_reflow_table(self, table_data: Dict, styles: Dict) -> Optional[Table]:
|
||||
"""
|
||||
Create a Platypus Table for reflow mode.
|
||||
Create a Platypus Table for reflow mode with merged cell support.
|
||||
|
||||
Args:
|
||||
table_data: Table element dictionary with 'rows' or 'cells'
|
||||
table_data: Table element dictionary with 'content' containing 'cells'
|
||||
styles: Style dictionary
|
||||
|
||||
Returns:
|
||||
Platypus Table object or None
|
||||
"""
|
||||
try:
|
||||
# Get content - cells might be inside 'content' dict
|
||||
# Get content - cells are inside 'content' dict
|
||||
content = table_data.get('content', {})
|
||||
if isinstance(content, dict):
|
||||
rows_data = content.get('rows', []) if isinstance(content.get('rows'), list) else []
|
||||
cells = content.get('cells', [])
|
||||
else:
|
||||
rows_data = table_data.get('rows', [])
|
||||
cells = table_data.get('cells', [])
|
||||
|
||||
if not rows_data and cells:
|
||||
# Group cells by row - support both 'row'/'col' and 'row_index'/'col_index' keys
|
||||
row_map = {}
|
||||
for cell in cells:
|
||||
row_idx = cell.get('row', cell.get('row_index', 0))
|
||||
if row_idx not in row_map:
|
||||
row_map[row_idx] = []
|
||||
row_map[row_idx].append(cell)
|
||||
# Sort and create rows
|
||||
rows_data = []
|
||||
for row_idx in sorted(row_map.keys()):
|
||||
row_cells = sorted(row_map[row_idx], key=lambda c: c.get('col', c.get('col_index', 0)))
|
||||
rows_data.append({'cells': row_cells})
|
||||
|
||||
if not rows_data:
|
||||
if not isinstance(content, dict):
|
||||
return None
|
||||
|
||||
# Build table data
|
||||
cells = content.get('cells', [])
|
||||
if not cells:
|
||||
return None
|
||||
|
||||
# Determine grid dimensions
|
||||
num_rows = content.get('rows', 0)
|
||||
num_cols = content.get('cols', 0)
|
||||
|
||||
if num_rows == 0 or num_cols == 0:
|
||||
# Calculate from cells
|
||||
for cell in cells:
|
||||
row = cell.get('row', cell.get('row_index', 0))
|
||||
col = cell.get('col', cell.get('col_index', 0))
|
||||
row_span = cell.get('row_span', 1)
|
||||
col_span = cell.get('col_span', 1)
|
||||
num_rows = max(num_rows, row + row_span)
|
||||
num_cols = max(num_cols, col + col_span)
|
||||
|
||||
if num_rows == 0 or num_cols == 0:
|
||||
return None
|
||||
|
||||
# Initialize grid with empty strings
|
||||
grid = [['' for _ in range(num_cols)] for _ in range(num_rows)]
|
||||
# Track which cells are covered by spans
|
||||
covered = [[False for _ in range(num_cols)] for _ in range(num_rows)]
|
||||
# Track span commands
|
||||
span_commands = []
|
||||
|
||||
# Fill grid with cell content
|
||||
for cell in cells:
|
||||
row = cell.get('row', cell.get('row_index', 0))
|
||||
col = cell.get('col', cell.get('col_index', 0))
|
||||
row_span = cell.get('row_span', 1)
|
||||
col_span = cell.get('col_span', 1)
|
||||
|
||||
# Get cell text
|
||||
text = cell.get('content', cell.get('text', ''))
|
||||
if not isinstance(text, str):
|
||||
text = str(text) if text else ''
|
||||
|
||||
# Escape HTML special characters
|
||||
text = text.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
|
||||
|
||||
# Place content in the top-left cell of the span
|
||||
if 0 <= row < num_rows and 0 <= col < num_cols:
|
||||
grid[row][col] = text
|
||||
|
||||
# Mark covered cells for spans
|
||||
if row_span > 1 or col_span > 1:
|
||||
# Add SPAN command
|
||||
span_commands.append((
|
||||
'SPAN',
|
||||
(col, row),
|
||||
(col + col_span - 1, row + row_span - 1)
|
||||
))
|
||||
# Mark cells as covered
|
||||
for r in range(row, min(row + row_span, num_rows)):
|
||||
for c in range(col, min(col + col_span, num_cols)):
|
||||
if r != row or c != col:
|
||||
covered[r][c] = True
|
||||
|
||||
# Build table data with Paragraphs
|
||||
data = []
|
||||
for row in rows_data:
|
||||
for row_idx in range(num_rows):
|
||||
row_data = []
|
||||
row_cells = row.get('cells', [])
|
||||
for cell in row_cells:
|
||||
# Support both 'text' and 'content' keys
|
||||
text = cell.get('text', cell.get('content', ''))
|
||||
if not isinstance(text, str):
|
||||
text = str(text) if text else ''
|
||||
# Escape HTML special characters
|
||||
text = text.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
|
||||
row_data.append(Paragraph(text, styles['TableCell']))
|
||||
if row_data:
|
||||
data.append(row_data)
|
||||
for col_idx in range(num_cols):
|
||||
if covered[row_idx][col_idx]:
|
||||
# Empty cell for covered spans
|
||||
row_data.append('')
|
||||
else:
|
||||
text = grid[row_idx][col_idx]
|
||||
row_data.append(Paragraph(text, styles['TableCell']))
|
||||
data.append(row_data)
|
||||
|
||||
if not data:
|
||||
return None
|
||||
|
||||
# Create table
|
||||
table = Table(data)
|
||||
table.setStyle(TableStyle([
|
||||
|
||||
# Build style commands
|
||||
style_commands = [
|
||||
('GRID', (0, 0), (-1, -1), 0.5, colors.black),
|
||||
('VALIGN', (0, 0), (-1, -1), 'TOP'),
|
||||
('LEFTPADDING', (0, 0), (-1, -1), 6),
|
||||
@@ -3769,11 +3829,20 @@ class PDFGeneratorService:
|
||||
('TOPPADDING', (0, 0), (-1, -1), 4),
|
||||
('BOTTOMPADDING', (0, 0), (-1, -1), 4),
|
||||
('BACKGROUND', (0, 0), (-1, 0), colors.lightgrey), # Header row
|
||||
]))
|
||||
]
|
||||
|
||||
# Add span commands
|
||||
style_commands.extend(span_commands)
|
||||
|
||||
table.setStyle(TableStyle(style_commands))
|
||||
|
||||
logger.info(f"[REFLOW TABLE] Created table with {num_rows}x{num_cols} cells, {len(span_commands)} spans")
|
||||
return table
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to create reflow table: {e}")
|
||||
import traceback
|
||||
traceback.print_exc()
|
||||
return None
|
||||
|
||||
def _embed_image_reflow(
|
||||
@@ -4189,8 +4258,31 @@ class PDFGeneratorService:
|
||||
|
||||
pdf_canvas.setPageSize((current_page_width, current_page_height))
|
||||
|
||||
# Process elements
|
||||
# Process elements:
|
||||
# - Tables: draw borders + translated cell text (with dynamic font sizing)
|
||||
# - Text elements: draw at original positions, SKIP if inside table bbox
|
||||
# - Images: draw at original positions
|
||||
elements = page_data.get('elements', [])
|
||||
|
||||
# Collect table bboxes to skip text elements inside tables
|
||||
table_bboxes = []
|
||||
for elem in elements:
|
||||
if elem.get('type') in ('table', 'Table'):
|
||||
elem_bbox = elem.get('bbox', {})
|
||||
if elem_bbox:
|
||||
table_bboxes.append(elem_bbox)
|
||||
|
||||
def is_inside_table(text_bbox):
|
||||
"""Check if text bbox is inside any table bbox."""
|
||||
margin = 5
|
||||
for tb in table_bboxes:
|
||||
if (text_bbox.get('x0', 0) >= tb.get('x0', 0) - margin and
|
||||
text_bbox.get('y0', 0) >= tb.get('y0', 0) - margin and
|
||||
text_bbox.get('x1', 0) <= tb.get('x1', 0) + margin and
|
||||
text_bbox.get('y1', 0) <= tb.get('y1', 0) + margin):
|
||||
return True
|
||||
return False
|
||||
|
||||
for elem in elements:
|
||||
elem_type = elem.get('type', 'text')
|
||||
content = elem.get('content', '')
|
||||
@@ -4211,6 +4303,22 @@ class PDFGeneratorService:
|
||||
|
||||
# Handle different element types
|
||||
if elem_type in ('image', 'figure', 'Image', 'Figure', 'chart', 'Chart'):
|
||||
# Skip large vector_graphics charts - they're visual decorations that cover text
|
||||
if elem_type in ('chart', 'Chart'):
|
||||
elem_content = elem.get('content', {})
|
||||
is_vector_graphics = (
|
||||
isinstance(elem_content, dict) and
|
||||
elem_content.get('source') == 'vector_graphics'
|
||||
)
|
||||
if is_vector_graphics:
|
||||
page_area = current_page_width * current_page_height
|
||||
elem_area = box_width * box_height
|
||||
coverage_ratio = elem_area / page_area if page_area > 0 else 0
|
||||
if coverage_ratio > 0.5:
|
||||
logger.info(f"Skipping large vector_graphics chart "
|
||||
f"(covers {coverage_ratio*100:.1f}% of page)")
|
||||
continue
|
||||
|
||||
# Draw image
|
||||
img = self._embed_image_reflow(elem, image_dir)
|
||||
if img:
|
||||
@@ -4229,6 +4337,11 @@ class PDFGeneratorService:
|
||||
)
|
||||
|
||||
elif isinstance(content, str) and content.strip():
|
||||
# Skip text elements inside table bboxes
|
||||
# (Table cells are rendered by _draw_translated_table with dynamic font sizing)
|
||||
if is_inside_table(bbox):
|
||||
continue
|
||||
|
||||
# Text element - use Paragraph for word wrapping
|
||||
# Escape special characters
|
||||
safe_content = content.replace('&', '&amp;')
|
||||
@@ -4290,106 +4403,140 @@ class PDFGeneratorService:
|
||||
image_dir: Path
|
||||
):
|
||||
"""
|
||||
Draw a table with translated content using Platypus Table.
|
||||
Draw a table with translated content.
|
||||
|
||||
Supports adaptive column widths and text wrapping within cells.
|
||||
Approach:
|
||||
1. Draw cell borders using cell_boxes from metadata
|
||||
2. Render translated text in each cell with dynamic font sizing
|
||||
3. Draw embedded images at their original positions
|
||||
|
||||
Text is rendered with dynamic font sizing to fit within cells.
|
||||
Minimum font size is 6pt for readability.
|
||||
|
||||
Args:
|
||||
pdf_canvas: ReportLab canvas
|
||||
elem: Table element dict
|
||||
elem: Table element dict with metadata containing cell_boxes
|
||||
page_height: Page height for coordinate transformation
|
||||
image_dir: Directory containing images
|
||||
"""
|
||||
from reportlab.platypus import Table, TableStyle, Paragraph
|
||||
from reportlab.lib.styles import ParagraphStyle
|
||||
from reportlab.lib import colors
|
||||
from reportlab.lib.styles import ParagraphStyle
|
||||
from reportlab.platypus import Paragraph
|
||||
|
||||
MIN_FONT_SIZE = 6 # Minimum font size for readability
|
||||
|
||||
try:
|
||||
content = elem.get('content', {})
|
||||
bbox = elem.get('bbox', {})
|
||||
metadata = elem.get('metadata', {})
|
||||
|
||||
if not bbox:
|
||||
return
|
||||
|
||||
x0 = bbox.get('x0', 0)
|
||||
y0 = bbox.get('y0', 0)
|
||||
x1 = bbox.get('x1', 0)
|
||||
y1 = bbox.get('y1', 0)
|
||||
table_width = x1 - x0
|
||||
table_height = y1 - y0
|
||||
|
||||
# Parse table content
|
||||
if isinstance(content, dict):
|
||||
rows = content.get('rows', [])
|
||||
cells = content.get('cells', [])
|
||||
# Get table bounding box
|
||||
if isinstance(bbox, dict):
|
||||
tx0 = bbox.get('x0', 0)
|
||||
ty0 = bbox.get('y0', 0)
|
||||
tx1 = bbox.get('x1', 0)
|
||||
ty1 = bbox.get('y1', 0)
|
||||
else:
|
||||
return
|
||||
tx0, ty0, tx1, ty1 = bbox[:4] if len(bbox) >= 4 else (0, 0, 0, 0)
|
||||
|
||||
if not rows and not cells:
|
||||
return
|
||||
table_width = tx1 - tx0
|
||||
table_height = ty1 - ty0
|
||||
|
||||
# Build table data
|
||||
table_data = []
|
||||
# Step 1: Draw outer table border
|
||||
pdf_canvas.setStrokeColor(colors.black)
|
||||
pdf_canvas.setLineWidth(1.0)
|
||||
pdf_y_bottom = page_height - ty1
|
||||
pdf_canvas.rect(tx0, pdf_y_bottom, table_width, table_height, stroke=1, fill=0)
|
||||
|
||||
if rows:
|
||||
for row in rows:
|
||||
row_cells = row if isinstance(row, list) else row.get('cells', [])
|
||||
row_data = []
|
||||
for cell in row_cells:
|
||||
if isinstance(cell, str):
|
||||
cell_text = cell
|
||||
elif isinstance(cell, dict):
|
||||
cell_text = cell.get('content', cell.get('text', ''))
|
||||
else:
|
||||
cell_text = str(cell) if cell else ''
|
||||
# Step 2: Draw cell borders using cell_boxes
|
||||
cell_boxes = metadata.get('cell_boxes', [])
|
||||
if cell_boxes:
|
||||
# Normalize cell boxes for grid alignment
|
||||
if hasattr(self, '_normalize_cell_boxes_to_grid'):
|
||||
cell_boxes = self._normalize_cell_boxes_to_grid(cell_boxes)
|
||||
|
||||
# Create paragraph for text wrapping
|
||||
safe_text = str(cell_text).replace('&', '&amp;')
|
||||
safe_text = safe_text.replace('<', '&lt;').replace('>', '&gt;')
|
||||
pdf_canvas.setLineWidth(0.5)
|
||||
for box in cell_boxes:
|
||||
if len(box) >= 4:
|
||||
cx0, cy0, cx1, cy1 = box[:4]
|
||||
cell_width = cx1 - cx0
|
||||
cell_height = cy1 - cy0
|
||||
pdf_cell_y = page_height - cy1
|
||||
pdf_canvas.rect(cx0, pdf_cell_y, cell_width, cell_height, stroke=1, fill=0)
|
||||
|
||||
cell_style = ParagraphStyle(
|
||||
f'cell_{id(cell)}',
|
||||
fontName=self.font_name if self.font_registered else 'Helvetica',
|
||||
fontSize=9,
|
||||
leading=11,
|
||||
wordWrap='CJK',
|
||||
)
|
||||
para = Paragraph(safe_text, cell_style)
|
||||
row_data.append(para)
|
||||
# Step 3: Render translated text in each cell
|
||||
cells = content.get('cells', []) if isinstance(content, dict) else []
|
||||
font_name = self.font_name if self.font_registered else 'Helvetica'
|
||||
|
||||
if row_data:
|
||||
table_data.append(row_data)
|
||||
for i, cell in enumerate(cells):
|
||||
cell_text = cell.get('content', cell.get('text', ''))
|
||||
if not cell_text or not cell_text.strip():
|
||||
continue
|
||||
|
||||
if not table_data:
|
||||
return
|
||||
# Get cell bounding box by index
|
||||
if i >= len(cell_boxes):
|
||||
continue
|
||||
|
||||
# Calculate column widths
|
||||
num_cols = max(len(row) for row in table_data) if table_data else 1
|
||||
col_width = table_width / num_cols if num_cols > 0 else table_width
|
||||
cx0, cy0, cx1, cy1 = cell_boxes[i][:4]
|
||||
cell_width = cx1 - cx0
|
||||
cell_height = cy1 - cy0
|
||||
|
||||
# Create table
|
||||
table = Table(table_data, colWidths=[col_width] * num_cols)
|
||||
# Skip tiny cells
|
||||
if cell_width < 10 or cell_height < 10:
|
||||
continue
|
||||
|
||||
# Apply table style
|
||||
table.setStyle(TableStyle([
|
||||
('GRID', (0, 0), (-1, -1), 0.5, colors.black),
|
||||
('VALIGN', (0, 0), (-1, -1), 'TOP'),
|
||||
('LEFTPADDING', (0, 0), (-1, -1), 4),
|
||||
('RIGHTPADDING', (0, 0), (-1, -1), 4),
|
||||
('TOPPADDING', (0, 0), (-1, -1), 2),
|
||||
('BOTTOMPADDING', (0, 0), (-1, -1), 2),
|
||||
]))
|
||||
# Prepare text (escape HTML special chars)
|
||||
safe_text = str(cell_text).replace('&', '&amp;')
|
||||
safe_text = safe_text.replace('<', '&lt;').replace('>', '&gt;')
|
||||
safe_text = safe_text.replace('\n', '<br/>')
|
||||
|
||||
# Wrap and draw table
|
||||
t_width, t_height = table.wrap(table_width, table_height * 2)
|
||||
# Dynamic font sizing: start at 10pt, shrink until text fits
|
||||
padding = 3
|
||||
available_width = cell_width - padding * 2
|
||||
available_height = cell_height - padding * 2
|
||||
|
||||
# Convert to PDF coordinates
|
||||
pdf_y = page_height - y0 - t_height
|
||||
if available_width <= 0 or available_height <= 0:
|
||||
continue
|
||||
|
||||
table.drawOn(pdf_canvas, x0, pdf_y)
|
||||
# Try font sizes from 10pt down to MIN_FONT_SIZE
|
||||
for font_size in range(10, MIN_FONT_SIZE - 1, -1):
|
||||
cell_style = ParagraphStyle(
|
||||
f'cell_{i}_{font_size}',
|
||||
fontName=font_name,
|
||||
fontSize=font_size,
|
||||
leading=font_size * 1.15,
|
||||
wordWrap='CJK',
|
||||
)
|
||||
para = Paragraph(safe_text, cell_style)
|
||||
para_width, para_height = para.wrap(available_width, available_height * 10)
|
||||
|
||||
if para_height <= available_height:
|
||||
break # Text fits at this font size
|
||||
|
||||
# Draw text (top-aligned within the cell, inset by padding)
|
||||
text_x = cx0 + padding
|
||||
# Calculate vertical position (top-aligned within cell)
|
||||
text_y = page_height - cy0 - padding - min(para_height, available_height)
|
||||
|
||||
para.drawOn(pdf_canvas, text_x, text_y)
|
||||
|
||||
logger.info(f"[TRANSLATED TABLE] Drew table with {len(cell_boxes)} borders, {len(cells)} cells")
|
||||
|
||||
# Step 4: Draw embedded images
|
||||
embedded_images = metadata.get('embedded_images', [])
|
||||
if embedded_images and image_dir:
|
||||
for emb_img in embedded_images:
|
||||
self._draw_embedded_image(
|
||||
pdf_canvas, emb_img, page_height, image_dir, 1.0, 1.0
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to draw translated table: {e}")
|
||||
import traceback
|
||||
traceback.print_exc()
|
||||
|
||||
|
||||
# Singleton instance
|
||||
|
||||
Reference in New Issue
Block a user