fix: improve PDF layout generation for Direct track

Key fixes:
- Skip large vector_graphics charts (>50% page coverage) that cover text
- Fix font fallback to use NotoSansSC for CJK support instead of Helvetica
- Improve translated table rendering with dynamic font sizing
- Add merged cell (row_span/col_span) support for reflow tables
- Skip text elements inside table bboxes to avoid duplication

Archive openspec proposal: fix-pdf-table-rendering

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
egg
2025-12-03 14:55:00 +08:00
parent 08adf3d01d
commit 1b5c7f39a8
5 changed files with 405 additions and 111 deletions

View File

@@ -354,7 +354,11 @@ class PDFGeneratorService:
elif 'courier' in font_lower:
return 'Courier'
# Default fallback
# Default fallback - use NotoSansSC for CJK support if registered
if self.font_registered:
logger.debug(f"Font '{font_name}' not found in mapping, using {self.font_name} for CJK support")
return self.font_name
else:
logger.debug(f"Font '{font_name}' not found in mapping, using Helvetica")
return 'Helvetica'
@@ -866,6 +870,23 @@ class PDFGeneratorService:
ElementType.IMAGE, ElementType.FIGURE,
ElementType.CHART, ElementType.DIAGRAM, ElementType.LOGO, ElementType.STAMP
]:
# Skip large vector_graphics charts in Direct track
# These are visual decorations (borders, lines, frames) that would cover text
# PyMuPDF extracts both vector graphics as images AND text layer separately
if element.type == ElementType.CHART and element.bbox:
content = element.content
is_vector_graphics = (
isinstance(content, dict) and
content.get('source') == 'vector_graphics'
)
if is_vector_graphics:
elem_area = (element.bbox.x1 - element.bbox.x0) * (element.bbox.y1 - element.bbox.y0)
coverage_ratio = elem_area / page_area if page_area > 0 else 0
if coverage_ratio > 0.5:
logger.info(f"Skipping large vector_graphics chart {element.element_id} "
f"(covers {coverage_ratio*100:.1f}% of page) - text provides actual content")
continue
image_elements.append(element)
# Only add real images to exclusion regions, NOT charts/diagrams
# Charts often have large bounding boxes that include text labels
@@ -3704,56 +3725,93 @@ class PDFGeneratorService:
def _create_reflow_table(self, table_data: Dict, styles: Dict) -> Optional[Table]:
"""
Create a Platypus Table for reflow mode.
Create a Platypus Table for reflow mode with merged cell support.
Args:
table_data: Table element dictionary with 'rows' or 'cells'
table_data: Table element dictionary with 'content' containing 'cells'
styles: Style dictionary
Returns:
Platypus Table object or None
"""
try:
# Get content - cells might be inside 'content' dict
# Get content - cells are inside 'content' dict
content = table_data.get('content', {})
if isinstance(content, dict):
rows_data = content.get('rows', []) if isinstance(content.get('rows'), list) else []
cells = content.get('cells', [])
else:
rows_data = table_data.get('rows', [])
cells = table_data.get('cells', [])
if not rows_data and cells:
# Group cells by row - support both 'row'/'col' and 'row_index'/'col_index' keys
row_map = {}
for cell in cells:
row_idx = cell.get('row', cell.get('row_index', 0))
if row_idx not in row_map:
row_map[row_idx] = []
row_map[row_idx].append(cell)
# Sort and create rows
rows_data = []
for row_idx in sorted(row_map.keys()):
row_cells = sorted(row_map[row_idx], key=lambda c: c.get('col', c.get('col_index', 0)))
rows_data.append({'cells': row_cells})
if not rows_data:
if not isinstance(content, dict):
return None
# Build table data
data = []
for row in rows_data:
row_data = []
row_cells = row.get('cells', [])
for cell in row_cells:
# Support both 'text' and 'content' keys
text = cell.get('text', cell.get('content', ''))
cells = content.get('cells', [])
if not cells:
return None
# Determine grid dimensions
num_rows = content.get('rows', 0)
num_cols = content.get('cols', 0)
if num_rows == 0 or num_cols == 0:
# Calculate from cells
for cell in cells:
row = cell.get('row', cell.get('row_index', 0))
col = cell.get('col', cell.get('col_index', 0))
row_span = cell.get('row_span', 1)
col_span = cell.get('col_span', 1)
num_rows = max(num_rows, row + row_span)
num_cols = max(num_cols, col + col_span)
if num_rows == 0 or num_cols == 0:
return None
# Initialize grid with empty strings
grid = [['' for _ in range(num_cols)] for _ in range(num_rows)]
# Track which cells are covered by spans
covered = [[False for _ in range(num_cols)] for _ in range(num_rows)]
# Track span commands
span_commands = []
# Fill grid with cell content
for cell in cells:
row = cell.get('row', cell.get('row_index', 0))
col = cell.get('col', cell.get('col_index', 0))
row_span = cell.get('row_span', 1)
col_span = cell.get('col_span', 1)
# Get cell text
text = cell.get('content', cell.get('text', ''))
if not isinstance(text, str):
text = str(text) if text else ''
# Escape HTML special characters
text = text.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
# Place content in the top-left cell of the span
if 0 <= row < num_rows and 0 <= col < num_cols:
grid[row][col] = text
# Mark covered cells for spans
if row_span > 1 or col_span > 1:
# Add SPAN command
span_commands.append((
'SPAN',
(col, row),
(col + col_span - 1, row + row_span - 1)
))
# Mark cells as covered
for r in range(row, min(row + row_span, num_rows)):
for c in range(col, min(col + col_span, num_cols)):
if r != row or c != col:
covered[r][c] = True
# Build table data with Paragraphs
data = []
for row_idx in range(num_rows):
row_data = []
for col_idx in range(num_cols):
if covered[row_idx][col_idx]:
# Empty cell for covered spans
row_data.append('')
else:
text = grid[row_idx][col_idx]
row_data.append(Paragraph(text, styles['TableCell']))
if row_data:
data.append(row_data)
if not data:
@@ -3761,7 +3819,9 @@ class PDFGeneratorService:
# Create table
table = Table(data)
table.setStyle(TableStyle([
# Build style commands
style_commands = [
('GRID', (0, 0), (-1, -1), 0.5, colors.black),
('VALIGN', (0, 0), (-1, -1), 'TOP'),
('LEFTPADDING', (0, 0), (-1, -1), 6),
@@ -3769,11 +3829,20 @@ class PDFGeneratorService:
('TOPPADDING', (0, 0), (-1, -1), 4),
('BOTTOMPADDING', (0, 0), (-1, -1), 4),
('BACKGROUND', (0, 0), (-1, 0), colors.lightgrey), # Header row
]))
]
# Add span commands
style_commands.extend(span_commands)
table.setStyle(TableStyle(style_commands))
logger.info(f"[REFLOW TABLE] Created table with {num_rows}x{num_cols} cells, {len(span_commands)} spans")
return table
except Exception as e:
logger.error(f"Failed to create reflow table: {e}")
import traceback
traceback.print_exc()
return None
def _embed_image_reflow(
@@ -4189,8 +4258,31 @@ class PDFGeneratorService:
pdf_canvas.setPageSize((current_page_width, current_page_height))
# Process elements
# Process elements:
# - Tables: draw borders + translated cell text (with dynamic font sizing)
# - Text elements: draw at original positions, SKIP if inside table bbox
# - Images: draw at original positions
elements = page_data.get('elements', [])
# Collect table bboxes to skip text elements inside tables
table_bboxes = []
for elem in elements:
if elem.get('type') in ('table', 'Table'):
elem_bbox = elem.get('bbox', {})
if elem_bbox:
table_bboxes.append(elem_bbox)
def is_inside_table(text_bbox):
"""Check if text bbox is inside any table bbox."""
margin = 5
for tb in table_bboxes:
if (text_bbox.get('x0', 0) >= tb.get('x0', 0) - margin and
text_bbox.get('y0', 0) >= tb.get('y0', 0) - margin and
text_bbox.get('x1', 0) <= tb.get('x1', 0) + margin and
text_bbox.get('y1', 0) <= tb.get('y1', 0) + margin):
return True
return False
for elem in elements:
elem_type = elem.get('type', 'text')
content = elem.get('content', '')
@@ -4211,6 +4303,22 @@ class PDFGeneratorService:
# Handle different element types
if elem_type in ('image', 'figure', 'Image', 'Figure', 'chart', 'Chart'):
# Skip large vector_graphics charts - they're visual decorations that cover text
if elem_type in ('chart', 'Chart'):
elem_content = elem.get('content', {})
is_vector_graphics = (
isinstance(elem_content, dict) and
elem_content.get('source') == 'vector_graphics'
)
if is_vector_graphics:
page_area = current_page_width * current_page_height
elem_area = box_width * box_height
coverage_ratio = elem_area / page_area if page_area > 0 else 0
if coverage_ratio > 0.5:
logger.info(f"Skipping large vector_graphics chart "
f"(covers {coverage_ratio*100:.1f}% of page)")
continue
# Draw image
img = self._embed_image_reflow(elem, image_dir)
if img:
@@ -4229,6 +4337,11 @@ class PDFGeneratorService:
)
elif isinstance(content, str) and content.strip():
# Skip text elements inside table bboxes
# (Table cells are rendered by _draw_translated_table with dynamic font sizing)
if is_inside_table(bbox):
continue
# Text element - use Paragraph for word wrapping
# Escape special characters
safe_content = content.replace('&', '&amp;')
@@ -4290,106 +4403,140 @@ class PDFGeneratorService:
image_dir: Path
):
"""
Draw a table with translated content using Platypus Table.
Draw a table with translated content.
Supports adaptive column widths and text wrapping within cells.
Approach:
1. Draw cell borders using cell_boxes from metadata
2. Render translated text in each cell with dynamic font sizing
3. Draw embedded images at their original positions
Text is rendered with dynamic font sizing to fit within cells.
Minimum font size is 6pt for readability.
Args:
pdf_canvas: ReportLab canvas
elem: Table element dict
elem: Table element dict with metadata containing cell_boxes
page_height: Page height for coordinate transformation
image_dir: Directory containing images
"""
from reportlab.platypus import Table, TableStyle, Paragraph
from reportlab.lib.styles import ParagraphStyle
from reportlab.lib import colors
from reportlab.lib.styles import ParagraphStyle
from reportlab.platypus import Paragraph
MIN_FONT_SIZE = 6 # Minimum font size for readability
try:
content = elem.get('content', {})
bbox = elem.get('bbox', {})
metadata = elem.get('metadata', {})
if not bbox:
return
x0 = bbox.get('x0', 0)
y0 = bbox.get('y0', 0)
x1 = bbox.get('x1', 0)
y1 = bbox.get('y1', 0)
table_width = x1 - x0
table_height = y1 - y0
# Parse table content
if isinstance(content, dict):
rows = content.get('rows', [])
cells = content.get('cells', [])
# Get table bounding box
if isinstance(bbox, dict):
tx0 = bbox.get('x0', 0)
ty0 = bbox.get('y0', 0)
tx1 = bbox.get('x1', 0)
ty1 = bbox.get('y1', 0)
else:
return
tx0, ty0, tx1, ty1 = bbox[:4] if len(bbox) >= 4 else (0, 0, 0, 0)
if not rows and not cells:
return
table_width = tx1 - tx0
table_height = ty1 - ty0
# Build table data
table_data = []
# Step 1: Draw outer table border
pdf_canvas.setStrokeColor(colors.black)
pdf_canvas.setLineWidth(1.0)
pdf_y_bottom = page_height - ty1
pdf_canvas.rect(tx0, pdf_y_bottom, table_width, table_height, stroke=1, fill=0)
if rows:
for row in rows:
row_cells = row if isinstance(row, list) else row.get('cells', [])
row_data = []
for cell in row_cells:
if isinstance(cell, str):
cell_text = cell
elif isinstance(cell, dict):
# Step 2: Draw cell borders using cell_boxes
cell_boxes = metadata.get('cell_boxes', [])
if cell_boxes:
# Normalize cell boxes for grid alignment
if hasattr(self, '_normalize_cell_boxes_to_grid'):
cell_boxes = self._normalize_cell_boxes_to_grid(cell_boxes)
pdf_canvas.setLineWidth(0.5)
for box in cell_boxes:
if len(box) >= 4:
cx0, cy0, cx1, cy1 = box[:4]
cell_width = cx1 - cx0
cell_height = cy1 - cy0
pdf_cell_y = page_height - cy1
pdf_canvas.rect(cx0, pdf_cell_y, cell_width, cell_height, stroke=1, fill=0)
# Step 3: Render translated text in each cell
cells = content.get('cells', []) if isinstance(content, dict) else []
font_name = self.font_name if self.font_registered else 'Helvetica'
for i, cell in enumerate(cells):
cell_text = cell.get('content', cell.get('text', ''))
else:
cell_text = str(cell) if cell else ''
if not cell_text or not cell_text.strip():
continue
# Create paragraph for text wrapping
# Get cell bounding box by index
if i >= len(cell_boxes):
continue
cx0, cy0, cx1, cy1 = cell_boxes[i][:4]
cell_width = cx1 - cx0
cell_height = cy1 - cy0
# Skip tiny cells
if cell_width < 10 or cell_height < 10:
continue
# Prepare text (escape HTML special chars)
safe_text = str(cell_text).replace('&', '&amp;')
safe_text = safe_text.replace('<', '&lt;').replace('>', '&gt;')
safe_text = safe_text.replace('\n', '<br/>')
# Dynamic font sizing: start at 10pt, shrink until text fits
padding = 3
available_width = cell_width - padding * 2
available_height = cell_height - padding * 2
if available_width <= 0 or available_height <= 0:
continue
# Try font sizes from 10pt down to MIN_FONT_SIZE
for font_size in range(10, MIN_FONT_SIZE - 1, -1):
cell_style = ParagraphStyle(
f'cell_{id(cell)}',
fontName=self.font_name if self.font_registered else 'Helvetica',
fontSize=9,
leading=11,
f'cell_{i}_{font_size}',
fontName=font_name,
fontSize=font_size,
leading=font_size * 1.15,
wordWrap='CJK',
)
para = Paragraph(safe_text, cell_style)
row_data.append(para)
para_width, para_height = para.wrap(available_width, available_height * 10)
if row_data:
table_data.append(row_data)
if para_height <= available_height:
break # Text fits at this font size
if not table_data:
return
# Draw text (centered vertically in cell)
text_x = cx0 + padding
# Calculate vertical position (top-aligned within cell)
text_y = page_height - cy0 - padding - min(para_height, available_height)
# Calculate column widths
num_cols = max(len(row) for row in table_data) if table_data else 1
col_width = table_width / num_cols if num_cols > 0 else table_width
para.drawOn(pdf_canvas, text_x, text_y)
# Create table
table = Table(table_data, colWidths=[col_width] * num_cols)
logger.info(f"[TRANSLATED TABLE] Drew table with {len(cell_boxes)} borders, {len(cells)} cells")
# Apply table style
table.setStyle(TableStyle([
('GRID', (0, 0), (-1, -1), 0.5, colors.black),
('VALIGN', (0, 0), (-1, -1), 'TOP'),
('LEFTPADDING', (0, 0), (-1, -1), 4),
('RIGHTPADDING', (0, 0), (-1, -1), 4),
('TOPPADDING', (0, 0), (-1, -1), 2),
('BOTTOMPADDING', (0, 0), (-1, -1), 2),
]))
# Wrap and draw table
t_width, t_height = table.wrap(table_width, table_height * 2)
# Convert to PDF coordinates
pdf_y = page_height - y0 - t_height
table.drawOn(pdf_canvas, x0, pdf_y)
# Step 4: Draw embedded images
embedded_images = metadata.get('embedded_images', [])
if embedded_images and image_dir:
for emb_img in embedded_images:
self._draw_embedded_image(
pdf_canvas, emb_img, page_height, image_dir, 1.0, 1.0
)
except Exception as e:
logger.error(f"Failed to draw translated table: {e}")
import traceback
traceback.print_exc()
# Singleton instance

View File

@@ -0,0 +1,68 @@
# Design: Fix PDF Table Rendering
## Context
OCR track produces tables with:
- `cell_boxes`: Accurate pixel coordinates for each cell border
- `cells`: Content with row/col indices and row_span/col_span
- `embedded_images`: Images within table cells
Current implementations fail to use these correctly:
- **Reflow PDF**: Ignores merged cells, misaligns content
- **Translated Layout PDF**: Creates new Table object instead of using cell_boxes
## Goals / Non-Goals
**Goals:**
- Translated Layout PDF tables match untranslated Layout PDF quality
- Reflow PDF tables are readable and correctly structured
- Embedded images appear in both formats
**Non-Goals:**
- Perfect pixel-level replication of original table styling
- Support for complex nested tables
## Decisions
### Decision 1: Translated Layout PDF uses Layered Rendering
**What**: Draw cell borders using `cell_boxes`, then render translated text in each cell separately
**Why**: This matches the working approach in `_draw_table_with_cell_boxes()` for untranslated PDFs
```python
# Step 1: Draw borders using cell_boxes
for cell_box in cell_boxes:
pdf_canvas.rect(x, y, width, height)
# Step 2: Render text for each cell
for cell in cells:
cell_bbox = find_matching_cell_box(cell, cell_boxes)
draw_text_in_bbox(translated_content, cell_bbox)
```
### Decision 2: Reflow PDF uses ReportLab SPAN for merged cells
**What**: Apply `('SPAN', (col1, row1), (col2, row2))` style for merged cells
**Why**: ReportLab's Table natively supports merged cells via TableStyle
```python
# Build span commands from cell data
for cell in cells:
if cell.row_span > 1 or cell.col_span > 1:
spans.append(('SPAN',
(cell.col, cell.row),
(cell.col + cell.col_span - 1, cell.row + cell.row_span - 1)))
```
### Decision 3: Column widths from cell_boxes ratio
**What**: Calculate column widths proportionally from cell_boxes
**Why**: Preserves original table structure in reflow mode
## Risks / Trade-offs
| Risk | Mitigation |
|------|------------|
| Text overflow in translated cells | Shrink font (min 8pt) or truncate with ellipsis |
| cell_boxes not matching cells count | Fall back to equal-width columns |
| Complex merged cell patterns | Handle simple spans, skip complex patterns |
## Open Questions
- Should reflow PDF preserve exact column width ratios or allow ReportLab auto-sizing?
- How to handle cells with both text and images?

View File

@@ -0,0 +1,18 @@
# Change: Fix PDF Table Rendering Issues
## Why
OCR track PDF exports have significant table rendering problems:
1. **Reflow PDF** (both translated and untranslated): Tables are misaligned due to missing row_span/col_span support
2. **Translated Layout PDF**: Table borders disappear and text overlaps because it doesn't use the accurate `cell_boxes` positioning
## What Changes
- **Translated Layout PDF**: Adopt layered rendering approach (borders + text separately) using `cell_boxes` from metadata
- **Reflow PDF Tables**: Fix cell extraction and add basic merged cell support
- Ensure embedded images in tables are rendered correctly in all PDF formats
## Impact
- Affected specs: result-export
- Affected code:
- `backend/app/services/pdf_generator_service.py`
- `_draw_translated_table()` - needs complete rewrite
- `_create_reflow_table()` - needs merged cell support

View File

@@ -0,0 +1,41 @@
## MODIFIED Requirements
### Requirement: Translated Layout PDF Generation
The system SHALL generate layout-preserving PDFs with translated content that maintain accurate table structure.
#### Scenario: Table with accurate borders
- **GIVEN** an OCR result with tables containing `cell_boxes` metadata
- **WHEN** generating translated layout PDF
- **THEN** table cell borders SHALL be drawn at positions matching `cell_boxes`
- **AND** translated text SHALL be rendered within each cell's bounding box
#### Scenario: Text overflow handling
- **GIVEN** translated text longer than original text
- **WHEN** text exceeds cell bounding box
- **THEN** the system SHALL reduce font size (minimum 8pt) to fit content
- **OR** truncate with ellipsis if minimum font size is insufficient
#### Scenario: Embedded images in tables
- **GIVEN** a table with `embedded_images` in metadata
- **WHEN** generating translated layout PDF
- **THEN** images SHALL be rendered at their original positions within the table
### Requirement: Reflow PDF Table Rendering
The system SHALL generate reflow PDFs with properly structured tables including merged cell support.
#### Scenario: Basic table rendering
- **GIVEN** an OCR result with table cells containing `row`, `col`, `content`
- **WHEN** generating reflow PDF
- **THEN** cells SHALL be grouped by row and column indices
- **AND** table SHALL render with visible borders
#### Scenario: Merged cells support
- **GIVEN** table cells with `row_span` or `col_span` greater than 1
- **WHEN** generating reflow PDF
- **THEN** the system SHALL apply appropriate cell spanning
- **AND** merged cells SHALL display content without duplication
#### Scenario: Column width calculation
- **GIVEN** a table with `cell_boxes` metadata
- **WHEN** generating reflow PDF
- **THEN** column widths SHOULD be proportional to original cell widths

View File

@@ -0,0 +1,20 @@
# Tasks: Fix PDF Table Rendering
## 1. Translated Layout PDF - Table Fix (P0)
- [ ] 1.1 Refactor `_draw_translated_table()` to use layered rendering approach
- [ ] 1.2 Use `cell_boxes` from metadata for accurate border positioning
- [ ] 1.3 Render translated text within each cell's bbox using Paragraph with wordWrap
- [ ] 1.4 Handle text overflow (shrink font to minimum 8pt or truncate)
- [ ] 1.5 Draw embedded images at correct positions
## 2. Reflow PDF - Table Fix (P1)
- [ ] 2.1 Fix `_create_reflow_table()` cell extraction from content dict
- [ ] 2.2 Add row_span/col_span handling using ReportLab SPAN style
- [ ] 2.3 Calculate proportional column widths based on cell_boxes
- [ ] 2.4 Embed images in table cells instead of after table
## 3. Testing & Validation
- [ ] 3.1 Test with task 48b9e849-f6e3-462f-83a1-911ded701958 (has merged cells)
- [ ] 3.2 Verify translated layout PDF has visible borders
- [ ] 3.3 Verify reflow PDF tables align correctly
- [ ] 3.4 Verify embedded images appear in both formats