fix: improve PDF layout generation for Direct track

Key fixes:
- Skip large vector_graphics charts (>50% page coverage) that cover text
- Fix font fallback to use NotoSansSC for CJK support instead of Helvetica
- Improve translated table rendering with dynamic font sizing
- Add merged cell (row_span/col_span) support for reflow tables
- Skip text elements inside table bboxes to avoid duplication

Archive openspec proposal: fix-pdf-table-rendering

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
egg
2025-12-03 14:55:00 +08:00
parent 08adf3d01d
commit 1b5c7f39a8
5 changed files with 405 additions and 111 deletions

View File

@@ -354,9 +354,13 @@ class PDFGeneratorService:
elif 'courier' in font_lower:
return 'Courier'
# Default fallback
logger.debug(f"Font '{font_name}' not found in mapping, using Helvetica")
return 'Helvetica'
# Default fallback - use NotoSansSC for CJK support if registered
if self.font_registered:
logger.debug(f"Font '{font_name}' not found in mapping, using {self.font_name} for CJK support")
return self.font_name
else:
logger.debug(f"Font '{font_name}' not found in mapping, using Helvetica")
return 'Helvetica'
def _apply_text_style(self, c: canvas.Canvas, style_info, default_size: float = 12):
"""
@@ -866,6 +870,23 @@ class PDFGeneratorService:
ElementType.IMAGE, ElementType.FIGURE,
ElementType.CHART, ElementType.DIAGRAM, ElementType.LOGO, ElementType.STAMP
]:
# Skip large vector_graphics charts in Direct track
# These are visual decorations (borders, lines, frames) that would cover text
# PyMuPDF extracts the vector graphics as an image AND the text layer as separate elements
if element.type == ElementType.CHART and element.bbox:
content = element.content
is_vector_graphics = (
isinstance(content, dict) and
content.get('source') == 'vector_graphics'
)
if is_vector_graphics:
elem_area = (element.bbox.x1 - element.bbox.x0) * (element.bbox.y1 - element.bbox.y0)
coverage_ratio = elem_area / page_area if page_area > 0 else 0
if coverage_ratio > 0.5:
logger.info(f"Skipping large vector_graphics chart {element.element_id} "
f"(covers {coverage_ratio*100:.1f}% of page) - text provides actual content")
continue
image_elements.append(element)
# Only add real images to exclusion regions, NOT charts/diagrams
# Charts often have large bounding boxes that include text labels
@@ -3704,64 +3725,103 @@ class PDFGeneratorService:
def _create_reflow_table(self, table_data: Dict, styles: Dict) -> Optional[Table]:
"""
Create a Platypus Table for reflow mode.
Create a Platypus Table for reflow mode with merged cell support.
Args:
table_data: Table element dictionary with 'rows' or 'cells'
table_data: Table element dictionary with 'content' containing 'cells'
styles: Style dictionary
Returns:
Platypus Table object or None
"""
try:
# Get content - cells might be inside 'content' dict
# Get content - cells are inside 'content' dict
content = table_data.get('content', {})
if isinstance(content, dict):
rows_data = content.get('rows', []) if isinstance(content.get('rows'), list) else []
cells = content.get('cells', [])
else:
rows_data = table_data.get('rows', [])
cells = table_data.get('cells', [])
if not rows_data and cells:
# Group cells by row - support both 'row'/'col' and 'row_index'/'col_index' keys
row_map = {}
for cell in cells:
row_idx = cell.get('row', cell.get('row_index', 0))
if row_idx not in row_map:
row_map[row_idx] = []
row_map[row_idx].append(cell)
# Sort and create rows
rows_data = []
for row_idx in sorted(row_map.keys()):
row_cells = sorted(row_map[row_idx], key=lambda c: c.get('col', c.get('col_index', 0)))
rows_data.append({'cells': row_cells})
if not rows_data:
if not isinstance(content, dict):
return None
# Build table data
cells = content.get('cells', [])
if not cells:
return None
# Determine grid dimensions
num_rows = content.get('rows', 0)
num_cols = content.get('cols', 0)
if num_rows == 0 or num_cols == 0:
# Calculate from cells
for cell in cells:
row = cell.get('row', cell.get('row_index', 0))
col = cell.get('col', cell.get('col_index', 0))
row_span = cell.get('row_span', 1)
col_span = cell.get('col_span', 1)
num_rows = max(num_rows, row + row_span)
num_cols = max(num_cols, col + col_span)
if num_rows == 0 or num_cols == 0:
return None
# Initialize grid with empty strings
grid = [['' for _ in range(num_cols)] for _ in range(num_rows)]
# Track which cells are covered by spans
covered = [[False for _ in range(num_cols)] for _ in range(num_rows)]
# Track span commands
span_commands = []
# Fill grid with cell content
for cell in cells:
row = cell.get('row', cell.get('row_index', 0))
col = cell.get('col', cell.get('col_index', 0))
row_span = cell.get('row_span', 1)
col_span = cell.get('col_span', 1)
# Get cell text
text = cell.get('content', cell.get('text', ''))
if not isinstance(text, str):
text = str(text) if text else ''
# Escape HTML special characters
text = text.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
# Place content in the top-left cell of the span
if 0 <= row < num_rows and 0 <= col < num_cols:
grid[row][col] = text
# Mark covered cells for spans
if row_span > 1 or col_span > 1:
# Add SPAN command
span_commands.append((
'SPAN',
(col, row),
(col + col_span - 1, row + row_span - 1)
))
# Mark cells as covered
for r in range(row, min(row + row_span, num_rows)):
for c in range(col, min(col + col_span, num_cols)):
if r != row or c != col:
covered[r][c] = True
# Build table data with Paragraphs
data = []
for row in rows_data:
for row_idx in range(num_rows):
row_data = []
row_cells = row.get('cells', [])
for cell in row_cells:
# Support both 'text' and 'content' keys
text = cell.get('text', cell.get('content', ''))
if not isinstance(text, str):
text = str(text) if text else ''
# Escape HTML special characters
text = text.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
row_data.append(Paragraph(text, styles['TableCell']))
if row_data:
data.append(row_data)
for col_idx in range(num_cols):
if covered[row_idx][col_idx]:
# Empty cell for covered spans
row_data.append('')
else:
text = grid[row_idx][col_idx]
row_data.append(Paragraph(text, styles['TableCell']))
data.append(row_data)
if not data:
return None
# Create table
table = Table(data)
table.setStyle(TableStyle([
# Build style commands
style_commands = [
('GRID', (0, 0), (-1, -1), 0.5, colors.black),
('VALIGN', (0, 0), (-1, -1), 'TOP'),
('LEFTPADDING', (0, 0), (-1, -1), 6),
@@ -3769,11 +3829,20 @@ class PDFGeneratorService:
('TOPPADDING', (0, 0), (-1, -1), 4),
('BOTTOMPADDING', (0, 0), (-1, -1), 4),
('BACKGROUND', (0, 0), (-1, 0), colors.lightgrey), # Header row
]))
]
# Add span commands
style_commands.extend(span_commands)
table.setStyle(TableStyle(style_commands))
logger.info(f"[REFLOW TABLE] Created table with {num_rows}x{num_cols} cells, {len(span_commands)} spans")
return table
except Exception as e:
logger.error(f"Failed to create reflow table: {e}")
import traceback
traceback.print_exc()
return None
def _embed_image_reflow(
@@ -4189,8 +4258,31 @@ class PDFGeneratorService:
pdf_canvas.setPageSize((current_page_width, current_page_height))
# Process elements
# Process elements:
# - Tables: draw borders + translated cell text (with dynamic font sizing)
# - Text elements: draw at original positions, SKIP if inside table bbox
# - Images: draw at original positions
elements = page_data.get('elements', [])
# Collect table bboxes to skip text elements inside tables
table_bboxes = []
for elem in elements:
if elem.get('type') in ('table', 'Table'):
elem_bbox = elem.get('bbox', {})
if elem_bbox:
table_bboxes.append(elem_bbox)
def is_inside_table(text_bbox):
"""Check if text bbox is inside any table bbox."""
margin = 5
for tb in table_bboxes:
if (text_bbox.get('x0', 0) >= tb.get('x0', 0) - margin and
text_bbox.get('y0', 0) >= tb.get('y0', 0) - margin and
text_bbox.get('x1', 0) <= tb.get('x1', 0) + margin and
text_bbox.get('y1', 0) <= tb.get('y1', 0) + margin):
return True
return False
for elem in elements:
elem_type = elem.get('type', 'text')
content = elem.get('content', '')
@@ -4211,6 +4303,22 @@ class PDFGeneratorService:
# Handle different element types
if elem_type in ('image', 'figure', 'Image', 'Figure', 'chart', 'Chart'):
# Skip large vector_graphics charts - they're visual decorations that cover text
if elem_type in ('chart', 'Chart'):
elem_content = elem.get('content', {})
is_vector_graphics = (
isinstance(elem_content, dict) and
elem_content.get('source') == 'vector_graphics'
)
if is_vector_graphics:
page_area = current_page_width * current_page_height
elem_area = box_width * box_height
coverage_ratio = elem_area / page_area if page_area > 0 else 0
if coverage_ratio > 0.5:
logger.info(f"Skipping large vector_graphics chart "
f"(covers {coverage_ratio*100:.1f}% of page)")
continue
# Draw image
img = self._embed_image_reflow(elem, image_dir)
if img:
@@ -4229,6 +4337,11 @@ class PDFGeneratorService:
)
elif isinstance(content, str) and content.strip():
# Skip text elements inside table bboxes
# (Table cells are rendered by _draw_translated_table with dynamic font sizing)
if is_inside_table(bbox):
continue
# Text element - use Paragraph for word wrapping
# Escape special characters
safe_content = content.replace('&', '&amp;')
@@ -4290,106 +4403,140 @@ class PDFGeneratorService:
image_dir: Path
):
"""
Draw a table with translated content using Platypus Table.
Draw a table with translated content.
Supports adaptive column widths and text wrapping within cells.
Approach:
1. Draw cell borders using cell_boxes from metadata
2. Render translated text in each cell with dynamic font sizing
3. Draw embedded images at their original positions
Text is rendered with dynamic font sizing to fit within cells.
Minimum font size is 6pt for readability.
Args:
pdf_canvas: ReportLab canvas
elem: Table element dict
elem: Table element dict with metadata containing cell_boxes
page_height: Page height for coordinate transformation
image_dir: Directory containing images
"""
from reportlab.platypus import Table, TableStyle, Paragraph
from reportlab.lib.styles import ParagraphStyle
from reportlab.lib import colors
from reportlab.lib.styles import ParagraphStyle
from reportlab.platypus import Paragraph
MIN_FONT_SIZE = 6 # Minimum font size for readability
try:
content = elem.get('content', {})
bbox = elem.get('bbox', {})
metadata = elem.get('metadata', {})
if not bbox:
return
x0 = bbox.get('x0', 0)
y0 = bbox.get('y0', 0)
x1 = bbox.get('x1', 0)
y1 = bbox.get('y1', 0)
table_width = x1 - x0
table_height = y1 - y0
# Parse table content
if isinstance(content, dict):
rows = content.get('rows', [])
cells = content.get('cells', [])
# Get table bounding box
if isinstance(bbox, dict):
tx0 = bbox.get('x0', 0)
ty0 = bbox.get('y0', 0)
tx1 = bbox.get('x1', 0)
ty1 = bbox.get('y1', 0)
else:
return
tx0, ty0, tx1, ty1 = bbox[:4] if len(bbox) >= 4 else (0, 0, 0, 0)
if not rows and not cells:
return
table_width = tx1 - tx0
table_height = ty1 - ty0
# Build table data
table_data = []
# Step 1: Draw outer table border
pdf_canvas.setStrokeColor(colors.black)
pdf_canvas.setLineWidth(1.0)
pdf_y_bottom = page_height - ty1
pdf_canvas.rect(tx0, pdf_y_bottom, table_width, table_height, stroke=1, fill=0)
if rows:
for row in rows:
row_cells = row if isinstance(row, list) else row.get('cells', [])
row_data = []
for cell in row_cells:
if isinstance(cell, str):
cell_text = cell
elif isinstance(cell, dict):
cell_text = cell.get('content', cell.get('text', ''))
else:
cell_text = str(cell) if cell else ''
# Step 2: Draw cell borders using cell_boxes
cell_boxes = metadata.get('cell_boxes', [])
if cell_boxes:
# Normalize cell boxes for grid alignment
if hasattr(self, '_normalize_cell_boxes_to_grid'):
cell_boxes = self._normalize_cell_boxes_to_grid(cell_boxes)
# Create paragraph for text wrapping
safe_text = str(cell_text).replace('&', '&amp;')
safe_text = safe_text.replace('<', '&lt;').replace('>', '&gt;')
pdf_canvas.setLineWidth(0.5)
for box in cell_boxes:
if len(box) >= 4:
cx0, cy0, cx1, cy1 = box[:4]
cell_width = cx1 - cx0
cell_height = cy1 - cy0
pdf_cell_y = page_height - cy1
pdf_canvas.rect(cx0, pdf_cell_y, cell_width, cell_height, stroke=1, fill=0)
cell_style = ParagraphStyle(
f'cell_{id(cell)}',
fontName=self.font_name if self.font_registered else 'Helvetica',
fontSize=9,
leading=11,
wordWrap='CJK',
)
para = Paragraph(safe_text, cell_style)
row_data.append(para)
# Step 3: Render translated text in each cell
cells = content.get('cells', []) if isinstance(content, dict) else []
font_name = self.font_name if self.font_registered else 'Helvetica'
if row_data:
table_data.append(row_data)
for i, cell in enumerate(cells):
cell_text = cell.get('content', cell.get('text', ''))
if not cell_text or not cell_text.strip():
continue
if not table_data:
return
# Get cell bounding box by index
if i >= len(cell_boxes):
continue
# Calculate column widths
num_cols = max(len(row) for row in table_data) if table_data else 1
col_width = table_width / num_cols if num_cols > 0 else table_width
cx0, cy0, cx1, cy1 = cell_boxes[i][:4]
cell_width = cx1 - cx0
cell_height = cy1 - cy0
# Create table
table = Table(table_data, colWidths=[col_width] * num_cols)
# Skip tiny cells
if cell_width < 10 or cell_height < 10:
continue
# Apply table style
table.setStyle(TableStyle([
('GRID', (0, 0), (-1, -1), 0.5, colors.black),
('VALIGN', (0, 0), (-1, -1), 'TOP'),
('LEFTPADDING', (0, 0), (-1, -1), 4),
('RIGHTPADDING', (0, 0), (-1, -1), 4),
('TOPPADDING', (0, 0), (-1, -1), 2),
('BOTTOMPADDING', (0, 0), (-1, -1), 2),
]))
# Prepare text (escape HTML special chars)
safe_text = str(cell_text).replace('&', '&amp;')
safe_text = safe_text.replace('<', '&lt;').replace('>', '&gt;')
safe_text = safe_text.replace('\n', '<br/>')
# Wrap and draw table
t_width, t_height = table.wrap(table_width, table_height * 2)
# Dynamic font sizing: start at 10pt, shrink until text fits
padding = 3
available_width = cell_width - padding * 2
available_height = cell_height - padding * 2
# Convert to PDF coordinates
pdf_y = page_height - y0 - t_height
if available_width <= 0 or available_height <= 0:
continue
table.drawOn(pdf_canvas, x0, pdf_y)
# Try font sizes from 10pt down to MIN_FONT_SIZE
for font_size in range(10, MIN_FONT_SIZE - 1, -1):
cell_style = ParagraphStyle(
f'cell_{i}_{font_size}',
fontName=font_name,
fontSize=font_size,
leading=font_size * 1.15,
wordWrap='CJK',
)
para = Paragraph(safe_text, cell_style)
para_width, para_height = para.wrap(available_width, available_height * 10)
if para_height <= available_height:
break # Text fits at this font size
# Draw text (top-aligned within the cell, inset by the padding)
text_x = cx0 + padding
# Calculate vertical position (top-aligned within cell)
text_y = page_height - cy0 - padding - min(para_height, available_height)
para.drawOn(pdf_canvas, text_x, text_y)
logger.info(f"[TRANSLATED TABLE] Drew table with {len(cell_boxes)} borders, {len(cells)} cells")
# Step 4: Draw embedded images
embedded_images = metadata.get('embedded_images', [])
if embedded_images and image_dir:
for emb_img in embedded_images:
self._draw_embedded_image(
pdf_canvas, emb_img, page_height, image_dir, 1.0, 1.0
)
except Exception as e:
logger.error(f"Failed to draw translated table: {e}")
import traceback
traceback.print_exc()
# Singleton instance