diff --git a/backend/app/services/pdf_generator_service.py b/backend/app/services/pdf_generator_service.py index 402c2a4..82c23da 100644 --- a/backend/app/services/pdf_generator_service.py +++ b/backend/app/services/pdf_generator_service.py @@ -354,9 +354,13 @@ class PDFGeneratorService: elif 'courier' in font_lower: return 'Courier' - # Default fallback - logger.debug(f"Font '{font_name}' not found in mapping, using Helvetica") - return 'Helvetica' + # Default fallback - use NotoSansSC for CJK support if registered + if self.font_registered: + logger.debug(f"Font '{font_name}' not found in mapping, using {self.font_name} for CJK support") + return self.font_name + else: + logger.debug(f"Font '{font_name}' not found in mapping, using Helvetica") + return 'Helvetica' def _apply_text_style(self, c: canvas.Canvas, style_info, default_size: float = 12): """ @@ -866,6 +870,23 @@ class PDFGeneratorService: ElementType.IMAGE, ElementType.FIGURE, ElementType.CHART, ElementType.DIAGRAM, ElementType.LOGO, ElementType.STAMP ]: + # Skip large vector_graphics charts in Direct track + # These are visual decorations (borders, lines, frames) that would cover text + # PyMuPDF extracts both vector graphics as images AND text layer separately + if element.type == ElementType.CHART and element.bbox: + content = element.content + is_vector_graphics = ( + isinstance(content, dict) and + content.get('source') == 'vector_graphics' + ) + if is_vector_graphics: + elem_area = (element.bbox.x1 - element.bbox.x0) * (element.bbox.y1 - element.bbox.y0) + coverage_ratio = elem_area / page_area if page_area > 0 else 0 + if coverage_ratio > 0.5: + logger.info(f"Skipping large vector_graphics chart {element.element_id} " + f"(covers {coverage_ratio*100:.1f}% of page) - text provides actual content") + continue + image_elements.append(element) # Only add real images to exclusion regions, NOT charts/diagrams # Charts often have large bounding boxes that include text labels @@ -3704,64 +3725,103 @@ 
class PDFGeneratorService: def _create_reflow_table(self, table_data: Dict, styles: Dict) -> Optional[Table]: """ - Create a Platypus Table for reflow mode. + Create a Platypus Table for reflow mode with merged cell support. Args: - table_data: Table element dictionary with 'rows' or 'cells' + table_data: Table element dictionary with 'content' containing 'cells' styles: Style dictionary Returns: Platypus Table object or None """ try: - # Get content - cells might be inside 'content' dict + # Get content - cells are inside 'content' dict content = table_data.get('content', {}) - if isinstance(content, dict): - rows_data = content.get('rows', []) if isinstance(content.get('rows'), list) else [] - cells = content.get('cells', []) - else: - rows_data = table_data.get('rows', []) - cells = table_data.get('cells', []) - - if not rows_data and cells: - # Group cells by row - support both 'row'/'col' and 'row_index'/'col_index' keys - row_map = {} - for cell in cells: - row_idx = cell.get('row', cell.get('row_index', 0)) - if row_idx not in row_map: - row_map[row_idx] = [] - row_map[row_idx].append(cell) - # Sort and create rows - rows_data = [] - for row_idx in sorted(row_map.keys()): - row_cells = sorted(row_map[row_idx], key=lambda c: c.get('col', c.get('col_index', 0))) - rows_data.append({'cells': row_cells}) - - if not rows_data: + if not isinstance(content, dict): return None - # Build table data + cells = content.get('cells', []) + if not cells: + return None + + # Determine grid dimensions + num_rows = content.get('rows', 0) + num_cols = content.get('cols', 0) + + if num_rows == 0 or num_cols == 0: + # Calculate from cells + for cell in cells: + row = cell.get('row', cell.get('row_index', 0)) + col = cell.get('col', cell.get('col_index', 0)) + row_span = cell.get('row_span', 1) + col_span = cell.get('col_span', 1) + num_rows = max(num_rows, row + row_span) + num_cols = max(num_cols, col + col_span) + + if num_rows == 0 or num_cols == 0: + return None + + # 
Initialize grid with empty strings + grid = [['' for _ in range(num_cols)] for _ in range(num_rows)] + # Track which cells are covered by spans + covered = [[False for _ in range(num_cols)] for _ in range(num_rows)] + # Track span commands + span_commands = [] + + # Fill grid with cell content + for cell in cells: + row = cell.get('row', cell.get('row_index', 0)) + col = cell.get('col', cell.get('col_index', 0)) + row_span = cell.get('row_span', 1) + col_span = cell.get('col_span', 1) + + # Get cell text + text = cell.get('content', cell.get('text', '')) + if not isinstance(text, str): + text = str(text) if text else '' + + # Escape HTML special characters + text = text.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;') + + # Place content in the top-left cell of the span + if 0 <= row < num_rows and 0 <= col < num_cols: + grid[row][col] = text + + # Mark covered cells for spans + if row_span > 1 or col_span > 1: + # Add SPAN command + span_commands.append(( + 'SPAN', + (col, row), + (col + col_span - 1, row + row_span - 1) + )) + # Mark cells as covered + for r in range(row, min(row + row_span, num_rows)): + for c in range(col, min(col + col_span, num_cols)): + if r != row or c != col: + covered[r][c] = True + + # Build table data with Paragraphs data = [] - for row in rows_data: + for row_idx in range(num_rows): row_data = [] - row_cells = row.get('cells', []) - for cell in row_cells: - # Support both 'text' and 'content' keys - text = cell.get('text', cell.get('content', '')) - if not isinstance(text, str): - text = str(text) if text else '' - # Escape HTML special characters - text = text.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;') - row_data.append(Paragraph(text, styles['TableCell'])) - if row_data: - data.append(row_data) + for col_idx in range(num_cols): + if covered[row_idx][col_idx]: + # Empty cell for covered spans + row_data.append('') + else: + text = grid[row_idx][col_idx] + row_data.append(Paragraph(text, styles['TableCell'])) + 
data.append(row_data) if not data: return None # Create table table = Table(data) - table.setStyle(TableStyle([ + + # Build style commands + style_commands = [ ('GRID', (0, 0), (-1, -1), 0.5, colors.black), ('VALIGN', (0, 0), (-1, -1), 'TOP'), ('LEFTPADDING', (0, 0), (-1, -1), 6), @@ -3769,11 +3829,20 @@ class PDFGeneratorService: ('TOPPADDING', (0, 0), (-1, -1), 4), ('BOTTOMPADDING', (0, 0), (-1, -1), 4), ('BACKGROUND', (0, 0), (-1, 0), colors.lightgrey), # Header row - ])) + ] + + # Add span commands + style_commands.extend(span_commands) + + table.setStyle(TableStyle(style_commands)) + + logger.info(f"[REFLOW TABLE] Created table with {num_rows}x{num_cols} cells, {len(span_commands)} spans") return table except Exception as e: logger.error(f"Failed to create reflow table: {e}") + import traceback + traceback.print_exc() return None def _embed_image_reflow( @@ -4189,8 +4258,31 @@ class PDFGeneratorService: pdf_canvas.setPageSize((current_page_width, current_page_height)) - # Process elements + # Process elements: + # - Tables: draw borders + translated cell text (with dynamic font sizing) + # - Text elements: draw at original positions, SKIP if inside table bbox + # - Images: draw at original positions elements = page_data.get('elements', []) + + # Collect table bboxes to skip text elements inside tables + table_bboxes = [] + for elem in elements: + if elem.get('type') in ('table', 'Table'): + elem_bbox = elem.get('bbox', {}) + if elem_bbox: + table_bboxes.append(elem_bbox) + + def is_inside_table(text_bbox): + """Check if text bbox is inside any table bbox.""" + margin = 5 + for tb in table_bboxes: + if (text_bbox.get('x0', 0) >= tb.get('x0', 0) - margin and + text_bbox.get('y0', 0) >= tb.get('y0', 0) - margin and + text_bbox.get('x1', 0) <= tb.get('x1', 0) + margin and + text_bbox.get('y1', 0) <= tb.get('y1', 0) + margin): + return True + return False + for elem in elements: elem_type = elem.get('type', 'text') content = elem.get('content', '') @@ -4211,6 
+4303,22 @@ class PDFGeneratorService: # Handle different element types if elem_type in ('image', 'figure', 'Image', 'Figure', 'chart', 'Chart'): + # Skip large vector_graphics charts - they're visual decorations that cover text + if elem_type in ('chart', 'Chart'): + elem_content = elem.get('content', {}) + is_vector_graphics = ( + isinstance(elem_content, dict) and + elem_content.get('source') == 'vector_graphics' + ) + if is_vector_graphics: + page_area = current_page_width * current_page_height + elem_area = box_width * box_height + coverage_ratio = elem_area / page_area if page_area > 0 else 0 + if coverage_ratio > 0.5: + logger.info(f"Skipping large vector_graphics chart " + f"(covers {coverage_ratio*100:.1f}% of page)") + continue + # Draw image img = self._embed_image_reflow(elem, image_dir) if img: @@ -4229,6 +4337,11 @@ class PDFGeneratorService: ) elif isinstance(content, str) and content.strip(): + # Skip text elements inside table bboxes + # (Table cells are rendered by _draw_translated_table with dynamic font sizing) + if is_inside_table(bbox): + continue + # Text element - use Paragraph for word wrapping # Escape special characters safe_content = content.replace('&', '&amp;') @@ -4290,106 +4403,140 @@ class PDFGeneratorService: image_dir: Path ): """ - Draw a table with translated content using Platypus Table. + Draw a table with translated content. - Supports adaptive column widths and text wrapping within cells. + Approach: + 1. Draw cell borders using cell_boxes from metadata + 2. Render translated text in each cell with dynamic font sizing + 3. Draw embedded images at their original positions + + Text is rendered with dynamic font sizing to fit within cells. + Minimum font size is 6pt for readability. 
Args: pdf_canvas: ReportLab canvas - elem: Table element dict + elem: Table element dict with metadata containing cell_boxes page_height: Page height for coordinate transformation image_dir: Directory containing images """ - from reportlab.platypus import Table, TableStyle, Paragraph - from reportlab.lib.styles import ParagraphStyle from reportlab.lib import colors + from reportlab.lib.styles import ParagraphStyle + from reportlab.platypus import Paragraph + + MIN_FONT_SIZE = 6 # Minimum font size for readability try: content = elem.get('content', {}) bbox = elem.get('bbox', {}) + metadata = elem.get('metadata', {}) if not bbox: return - x0 = bbox.get('x0', 0) - y0 = bbox.get('y0', 0) - x1 = bbox.get('x1', 0) - y1 = bbox.get('y1', 0) - table_width = x1 - x0 - table_height = y1 - y0 - - # Parse table content - if isinstance(content, dict): - rows = content.get('rows', []) - cells = content.get('cells', []) + # Get table bounding box + if isinstance(bbox, dict): + tx0 = bbox.get('x0', 0) + ty0 = bbox.get('y0', 0) + tx1 = bbox.get('x1', 0) + ty1 = bbox.get('y1', 0) else: - return + tx0, ty0, tx1, ty1 = bbox[:4] if len(bbox) >= 4 else (0, 0, 0, 0) - if not rows and not cells: - return + table_width = tx1 - tx0 + table_height = ty1 - ty0 - # Build table data - table_data = [] + # Step 1: Draw outer table border + pdf_canvas.setStrokeColor(colors.black) + pdf_canvas.setLineWidth(1.0) + pdf_y_bottom = page_height - ty1 + pdf_canvas.rect(tx0, pdf_y_bottom, table_width, table_height, stroke=1, fill=0) - if rows: - for row in rows: - row_cells = row if isinstance(row, list) else row.get('cells', []) - row_data = [] - for cell in row_cells: - if isinstance(cell, str): - cell_text = cell - elif isinstance(cell, dict): - cell_text = cell.get('content', cell.get('text', '')) - else: - cell_text = str(cell) if cell else '' + # Step 2: Draw cell borders using cell_boxes + cell_boxes = metadata.get('cell_boxes', []) + if cell_boxes: + # Normalize cell boxes for grid alignment + if 
hasattr(self, '_normalize_cell_boxes_to_grid'): + cell_boxes = self._normalize_cell_boxes_to_grid(cell_boxes) - # Create paragraph for text wrapping - safe_text = str(cell_text).replace('&', '&amp;') - safe_text = safe_text.replace('<', '&lt;').replace('>', '&gt;') + pdf_canvas.setLineWidth(0.5) + for box in cell_boxes: + if len(box) >= 4: + cx0, cy0, cx1, cy1 = box[:4] + cell_width = cx1 - cx0 + cell_height = cy1 - cy0 + pdf_cell_y = page_height - cy1 + pdf_canvas.rect(cx0, pdf_cell_y, cell_width, cell_height, stroke=1, fill=0) - cell_style = ParagraphStyle( - f'cell_{id(cell)}', - fontName=self.font_name if self.font_registered else 'Helvetica', - fontSize=9, - leading=11, - wordWrap='CJK', - ) - para = Paragraph(safe_text, cell_style) - row_data.append(para) + # Step 3: Render translated text in each cell + cells = content.get('cells', []) if isinstance(content, dict) else [] + font_name = self.font_name if self.font_registered else 'Helvetica' - if row_data: - table_data.append(row_data) + for i, cell in enumerate(cells): + cell_text = cell.get('content', cell.get('text', '')) + if not cell_text or not cell_text.strip(): + continue - if not table_data: - return + # Get cell bounding box by index + if i >= len(cell_boxes): + continue - # Calculate column widths - num_cols = max(len(row) for row in table_data) if table_data else 1 - col_width = table_width / num_cols if num_cols > 0 else table_width + cx0, cy0, cx1, cy1 = cell_boxes[i][:4] + cell_width = cx1 - cx0 + cell_height = cy1 - cy0 - # Create table - table = Table(table_data, colWidths=[col_width] * num_cols) + # Skip tiny cells + if cell_width < 10 or cell_height < 10: + continue - # Apply table style - table.setStyle(TableStyle([ - ('GRID', (0, 0), (-1, -1), 0.5, colors.black), - ('VALIGN', (0, 0), (-1, -1), 'TOP'), - ('LEFTPADDING', (0, 0), (-1, -1), 4), - ('RIGHTPADDING', (0, 0), (-1, -1), 4), - ('TOPPADDING', (0, 0), (-1, -1), 2), - ('BOTTOMPADDING', (0, 0), (-1, -1), 2), - ])) + # Prepare text (escape HTML 
special chars) + safe_text = str(cell_text).replace('&', '&amp;') + safe_text = safe_text.replace('<', '&lt;').replace('>', '&gt;') + safe_text = safe_text.replace('\n', '<br/>') - # Wrap and draw table - t_width, t_height = table.wrap(table_width, table_height * 2) + # Dynamic font sizing: start at 10pt, shrink until text fits + padding = 3 + available_width = cell_width - padding * 2 + available_height = cell_height - padding * 2 - # Convert to PDF coordinates - pdf_y = page_height - y0 - t_height + if available_width <= 0 or available_height <= 0: + continue - table.drawOn(pdf_canvas, x0, pdf_y) + # Try font sizes from 10pt down to MIN_FONT_SIZE + for font_size in range(10, MIN_FONT_SIZE - 1, -1): + cell_style = ParagraphStyle( + f'cell_{i}_{font_size}', + fontName=font_name, + fontSize=font_size, + leading=font_size * 1.15, + wordWrap='CJK', + ) + para = Paragraph(safe_text, cell_style) + para_width, para_height = para.wrap(available_width, available_height * 10) + + if para_height <= available_height: + break # Text fits at this font size + + # Draw text (centered vertically in cell) + text_x = cx0 + padding + # Calculate vertical position (top-aligned within cell) + text_y = page_height - cy0 - padding - min(para_height, available_height) + + para.drawOn(pdf_canvas, text_x, text_y) + + logger.info(f"[TRANSLATED TABLE] Drew table with {len(cell_boxes)} borders, {len(cells)} cells") + + # Step 4: Draw embedded images + embedded_images = metadata.get('embedded_images', []) + if embedded_images and image_dir: + for emb_img in embedded_images: + self._draw_embedded_image( + pdf_canvas, emb_img, page_height, image_dir, 1.0, 1.0 + ) except Exception as e: logger.error(f"Failed to draw translated table: {e}") + import traceback + traceback.print_exc() # Singleton instance diff --git a/openspec/changes/archive/2025-12-03-fix-pdf-table-rendering/design.md b/openspec/changes/archive/2025-12-03-fix-pdf-table-rendering/design.md new file mode 100644 index 0000000..6576214 --- /dev/null +++ b/openspec/changes/archive/2025-12-03-fix-pdf-table-rendering/design.md @@ -0,0 +1,68 @@ +# Design: Fix PDF Table Rendering + +## Context +OCR track produces 
tables with: +- `cell_boxes`: Accurate pixel coordinates for each cell border +- `cells`: Content with row/col indices and row_span/col_span +- `embedded_images`: Images within table cells + +Current implementations fail to use these correctly: +- **Reflow PDF**: Ignores merged cells, misaligns content +- **Translated Layout PDF**: Creates new Table object instead of using cell_boxes + +## Goals / Non-Goals + +**Goals:** +- Translated Layout PDF tables match untranslated Layout PDF quality +- Reflow PDF tables are readable and correctly structured +- Embedded images appear in both formats + +**Non-Goals:** +- Perfect pixel-level replication of original table styling +- Support for complex nested tables + +## Decisions + +### Decision 1: Translated Layout PDF uses Layered Rendering +**What**: Draw cell borders using `cell_boxes`, then render translated text in each cell separately +**Why**: This matches the working approach in `_draw_table_with_cell_boxes()` for untranslated PDFs + +```python +# Step 1: Draw borders using cell_boxes +for cell_box in cell_boxes: + pdf_canvas.rect(x, y, width, height) + +# Step 2: Render text for each cell +for cell in cells: + cell_bbox = find_matching_cell_box(cell, cell_boxes) + draw_text_in_bbox(translated_content, cell_bbox) +``` + +### Decision 2: Reflow PDF uses ReportLab SPAN for merged cells +**What**: Apply `('SPAN', (col1, row1), (col2, row2))` style for merged cells +**Why**: ReportLab's Table natively supports merged cells via TableStyle + +```python +# Build span commands from cell data +for cell in cells: + if cell.row_span > 1 or cell.col_span > 1: + spans.append(('SPAN', + (cell.col, cell.row), + (cell.col + cell.col_span - 1, cell.row + cell.row_span - 1))) +``` + +### Decision 3: Column widths from cell_boxes ratio +**What**: Calculate column widths proportionally from cell_boxes +**Why**: Preserves original table structure in reflow mode + +## Risks / Trade-offs + +| Risk | Mitigation | +|------|------------| +| 
Text overflow in translated cells | Shrink font (min 8pt) or truncate with ellipsis | +| cell_boxes not matching cells count | Fall back to equal-width columns | +| Complex merged cell patterns | Handle simple spans, skip complex patterns | + +## Open Questions +- Should reflow PDF preserve exact column width ratios or allow ReportLab auto-sizing? +- How to handle cells with both text and images? diff --git a/openspec/changes/archive/2025-12-03-fix-pdf-table-rendering/proposal.md b/openspec/changes/archive/2025-12-03-fix-pdf-table-rendering/proposal.md new file mode 100644 index 0000000..2650cf7 --- /dev/null +++ b/openspec/changes/archive/2025-12-03-fix-pdf-table-rendering/proposal.md @@ -0,0 +1,18 @@ +# Change: Fix PDF Table Rendering Issues + +## Why +OCR track PDF exports have significant table rendering problems: +1. **Reflow PDF** (both translated and untranslated): Tables are misaligned due to missing row_span/col_span support +2. **Translated Layout PDF**: Table borders disappear and text overlaps because it doesn't use the accurate `cell_boxes` positioning + +## What Changes +- **Translated Layout PDF**: Adopt layered rendering approach (borders + text separately) using `cell_boxes` from metadata +- **Reflow PDF Tables**: Fix cell extraction and add basic merged cell support +- Ensure embedded images in tables are rendered correctly in all PDF formats + +## Impact +- Affected specs: result-export +- Affected code: + - `backend/app/services/pdf_generator_service.py` + - `_draw_translated_table()` - needs complete rewrite + - `_create_reflow_table()` - needs merged cell support diff --git a/openspec/changes/archive/2025-12-03-fix-pdf-table-rendering/specs/result-export/spec.md b/openspec/changes/archive/2025-12-03-fix-pdf-table-rendering/specs/result-export/spec.md new file mode 100644 index 0000000..245c527 --- /dev/null +++ b/openspec/changes/archive/2025-12-03-fix-pdf-table-rendering/specs/result-export/spec.md @@ -0,0 +1,41 @@ +## MODIFIED Requirements + 
+### Requirement: Translated Layout PDF Generation +The system SHALL generate layout-preserving PDFs with translated content that maintain accurate table structure. + +#### Scenario: Table with accurate borders +- **GIVEN** an OCR result with tables containing `cell_boxes` metadata +- **WHEN** generating translated layout PDF +- **THEN** table cell borders SHALL be drawn at positions matching `cell_boxes` +- **AND** translated text SHALL be rendered within each cell's bounding box + +#### Scenario: Text overflow handling +- **GIVEN** translated text longer than original text +- **WHEN** text exceeds cell bounding box +- **THEN** the system SHALL reduce font size (minimum 8pt) to fit content +- **OR** truncate with ellipsis if minimum font size is insufficient + +#### Scenario: Embedded images in tables +- **GIVEN** a table with `embedded_images` in metadata +- **WHEN** generating translated layout PDF +- **THEN** images SHALL be rendered at their original positions within the table + +### Requirement: Reflow PDF Table Rendering +The system SHALL generate reflow PDFs with properly structured tables including merged cell support. 
+ +#### Scenario: Basic table rendering +- **GIVEN** an OCR result with table cells containing `row`, `col`, `content` +- **WHEN** generating reflow PDF +- **THEN** cells SHALL be grouped by row and column indices +- **AND** table SHALL render with visible borders + +#### Scenario: Merged cells support +- **GIVEN** table cells with `row_span` or `col_span` greater than 1 +- **WHEN** generating reflow PDF +- **THEN** the system SHALL apply appropriate cell spanning +- **AND** merged cells SHALL display content without duplication + +#### Scenario: Column width calculation +- **GIVEN** a table with `cell_boxes` metadata +- **WHEN** generating reflow PDF +- **THEN** column widths SHOULD be proportional to original cell widths diff --git a/openspec/changes/archive/2025-12-03-fix-pdf-table-rendering/tasks.md b/openspec/changes/archive/2025-12-03-fix-pdf-table-rendering/tasks.md new file mode 100644 index 0000000..0926685 --- /dev/null +++ b/openspec/changes/archive/2025-12-03-fix-pdf-table-rendering/tasks.md @@ -0,0 +1,20 @@ +# Tasks: Fix PDF Table Rendering + +## 1. Translated Layout PDF - Table Fix (P0) +- [ ] 1.1 Refactor `_draw_translated_table()` to use layered rendering approach +- [ ] 1.2 Use `cell_boxes` from metadata for accurate border positioning +- [ ] 1.3 Render translated text within each cell's bbox using Paragraph with wordWrap +- [ ] 1.4 Handle text overflow (shrink font to minimum 8pt or truncate) +- [ ] 1.5 Draw embedded images at correct positions + +## 2. Reflow PDF - Table Fix (P1) +- [ ] 2.1 Fix `_create_reflow_table()` cell extraction from content dict +- [ ] 2.2 Add row_span/col_span handling using ReportLab SPAN style +- [ ] 2.3 Calculate proportional column widths based on cell_boxes +- [ ] 2.4 Embed images in table cells instead of after table + +## 3. 
Testing & Validation +- [ ] 3.1 Test with task 48b9e849-f6e3-462f-83a1-911ded701958 (has merged cells) +- [ ] 3.2 Verify translated layout PDF has visible borders +- [ ] 3.3 Verify reflow PDF tables align correctly +- [ ] 3.4 Verify embedded images appear in both formats