diff --git a/backend/app/models/unified_document.py b/backend/app/models/unified_document.py
index bd7cd72..c27ffe1 100644
--- a/backend/app/models/unified_document.py
+++ b/backend/app/models/unified_document.py
@@ -186,6 +186,40 @@ class TableCell:
             "style": self.style.to_dict() if self.style else None
         }
 
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> 'TableCell':
+        """
+        Create TableCell from dictionary.
+
+        Args:
+            data: Dictionary with keys: row, col, row_span, col_span, content,
+                bbox. Missing keys fall back to defaults (0, 0, 1, 1, '').
+
+        Returns:
+            TableCell instance. bbox is only populated when data['bbox'] is a
+            non-empty dict; style is always None (any 'style' entry is ignored).
+        """
+        bbox = None
+        if data.get('bbox'):
+            bbox_data = data['bbox']
+            if isinstance(bbox_data, dict):
+                bbox = BoundingBox(
+                    x0=bbox_data.get('x0', 0),
+                    y0=bbox_data.get('y0', 0),
+                    x1=bbox_data.get('x1', 0),
+                    y1=bbox_data.get('y1', 0)
+                )
+
+        return cls(
+            row=data.get('row', 0),
+            col=data.get('col', 0),
+            row_span=data.get('row_span', 1),
+            col_span=data.get('col_span', 1),
+            content=data.get('content', ''),
+            bbox=bbox,
+            style=None  # Style parsing can be added if needed
+        )
+
 
 @dataclass
 class TableData:
@@ -205,6 +239,38 @@ class TableData:
             "caption": self.caption
         }
 
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> 'TableData':
+        """
+        Create TableData from dictionary.
+
+        Handles conversion from JSON format with cells array to proper TableData
+        object with TableCell instances. Entries in cells that are neither dicts
+        nor TableCell instances are skipped; a missing or null cells value is
+        treated as an empty list.
+
+        Args:
+            data: Dictionary with keys: rows, cols, cells, headers, caption
+
+        Returns:
+            TableData instance
+        """
+        cells = []
+        # 'cells' may be present but null in JSON; .get() defaults only cover a missing key
+        for cell_data in data.get('cells') or []:
+            if isinstance(cell_data, dict):
+                cells.append(TableCell.from_dict(cell_data))
+            elif isinstance(cell_data, TableCell):
+                cells.append(cell_data)
+
+        return cls(
+            rows=data.get('rows', 0),
+            cols=data.get('cols', 0),
+            cells=cells,
+            headers=data.get('headers'),
+            caption=data.get('caption')
+        )
+
     def to_html(self) -> str:
         """Convert table to HTML representation"""
         html = [""]
diff --git a/backend/app/services/pdf_generator_service.py b/backend/app/services/pdf_generator_service.py
index 0d3371f..8b5291b 100644
--- a/backend/app/services/pdf_generator_service.py
+++ b/backend/app/services/pdf_generator_service.py
@@ -1945,11 +1945,23 @@ class PDFGeneratorService:
                 if child:
                     children.append(child)
 
+        # Process content based on element type
+        content = elem_dict.get('content', '')
+
+        # For TABLE elements, convert dict content to TableData object
+        if elem_type == ElementType.TABLE and isinstance(content, dict) and 'cells' in content:
+            try:
+                content = TableData.from_dict(content)
+                logger.debug(f"Converted table dict to TableData: {content.rows}x{content.cols}, {len(content.cells)} cells")
+            except Exception as e:
+                logger.warning(f"Failed to convert table dict to TableData: {e}")
+                # Keep original dict as fallback
+
         # Create element
         element = DocumentElement(
             element_id=elem_dict.get('element_id', ''),
             type=elem_type,
-            content=elem_dict.get('content', ''),
+            content=content,
             bbox=bbox,
             confidence=elem_dict.get('confidence'),
             style=style,
diff --git a/openspec/changes/fix-ocr-track-table-rendering/proposal.md b/openspec/changes/fix-ocr-track-table-rendering/proposal.md
new file mode 100644
index 0000000..2cd6137
--- /dev/null
+++ b/openspec/changes/fix-ocr-track-table-rendering/proposal.md
@@ -0,0 +1,108 @@
+# Fix OCR Track Table Rendering
+
+## Summary
+
+OCR track PDF generation produces
tables with incorrect format and layout. Tables appear without proper structure - cell content is misaligned and the visual format differs significantly from the original document. Image placement is correct, but table rendering is broken. + +## Problem Statement + +When generating PDF from OCR track results (via `scan.pdf` processed by PP-StructureV3), the output tables have: +1. **Wrong cell alignment** - content not positioned in proper cells +2. **Missing table structure** - rows/columns don't match original document layout +3. **Incorrect content distribution** - all content seems to flow linearly instead of maintaining grid structure + +Reference: `backend/storage/results/af7c9ee8-60a0-4291-9f22-ef98d27eed52/` +- Original: `af7c9ee8-60a0-4291-9f22-ef98d27eed52_scan_page_1.png` +- Generated: `scan_layout.pdf` +- Result JSON: `scan_result.json` - Tables have correct `{rows, cols, cells}` structure + +## Root Cause Analysis + +### Issue 1: Table Content Not Converted to TableData Object + +In `_json_to_document_element` (pdf_generator_service.py:1952): +```python +element = DocumentElement( + ... + content=elem_dict.get('content', ''), # Raw dict, not TableData + ... +) +``` + +Table elements have `content` as a dict `{rows: 5, cols: 4, cells: [...]}` but it's not converted to a `TableData` object. + +### Issue 2: OCR Track HTML Conversion Fails + +In `convert_unified_document_to_ocr_data` (pdf_generator_service.py:464-467): +```python +elif isinstance(element.content, dict): + html_content = element.content.get('html', str(element.content)) +``` + +Since there's no 'html' key in the cells-based dict, it falls back to `str(element.content)` = `"{'rows': 5, 'cols': 4, ...}"` - invalid HTML. 
+ +### Issue 3: Different Table Rendering Paths + +- **Direct track** uses `_draw_table_element_direct` which properly handles dict with cells via `_build_rows_from_cells_dict` +- **OCR track** uses `draw_table_region` which expects HTML strings and fails with dict content + +## Proposed Solution + +### Option A: Convert dict to TableData during JSON loading (Recommended) + +In `_json_to_document_element`, when element type is TABLE and content is a dict with cells, convert it to a `TableData` object: + +```python +# For TABLE elements, convert dict to TableData +if elem_type == ElementType.TABLE and isinstance(content, dict) and 'cells' in content: + content = self._dict_to_table_data(content) +``` + +This ensures `element.content.to_html()` works correctly in `convert_unified_document_to_ocr_data`. + +### Option B: Fix conversion in convert_unified_document_to_ocr_data + +Handle dict with cells properly by converting to HTML: + +```python +elif isinstance(element.content, dict): + if 'cells' in element.content: + # Convert cells-based dict to HTML + html_content = self._cells_dict_to_html(element.content) + elif 'html' in element.content: + html_content = element.content['html'] + else: + html_content = str(element.content) +``` + +## Impact on Hybrid Mode + +Hybrid mode uses Direct track rendering (`_generate_direct_track_pdf`) which already handles dict content properly via `_build_rows_from_cells_dict`. The proposed fixes should not affect hybrid mode negatively. + +However, testing should verify: +1. Hybrid mode continues to work with combined Direct + OCR elements +2. Table rendering quality is consistent across all tracks + +## Success Criteria + +1. OCR track tables render with correct structure matching original document +2. Cell content positioned in proper grid locations +3. Table borders/grid lines visible +4. No regression in Direct track or Hybrid mode table rendering +5. 
All test files (scan.pdf, img1.png, img2.png, img3.png) produce correct output + +## Files to Modify + +1. `backend/app/services/pdf_generator_service.py` + - `_json_to_document_element`: Convert table dict to TableData + - `convert_unified_document_to_ocr_data`: Improve dict handling (if Option B) + +2. `backend/app/models/unified_document.py` (optional) + - Add `TableData.from_dict()` class method for cleaner conversion + +## Testing Plan + +1. Test scan.pdf with OCR track - verify table structure matches original +2. Test img1.png, img2.png, img3.png with OCR track +3. Test PDF files with Direct track - verify no regression +4. Test Hybrid mode with files that trigger OCR fallback diff --git a/openspec/changes/fix-ocr-track-table-rendering/specs/pdf-generation/spec.md b/openspec/changes/fix-ocr-track-table-rendering/specs/pdf-generation/spec.md new file mode 100644 index 0000000..c1d7831 --- /dev/null +++ b/openspec/changes/fix-ocr-track-table-rendering/specs/pdf-generation/spec.md @@ -0,0 +1,52 @@ +# PDF Generation - OCR Track Table Rendering Fix + +## MODIFIED Requirements + +### Requirement: OCR Track Table Content Conversion + +The PDF generator MUST properly convert table content from JSON dict format to renderable structure when processing OCR track results. 
+ +#### Scenario: Table dict with cells array converts to proper HTML + +Given an OCR track JSON with table element containing rows, cols, and cells array +When the PDF generator processes this element +Then the table content MUST be converted to a TableData object +And TableData.to_html() MUST produce valid HTML with proper tr/td structure +And the generated PDF table MUST have cells positioned in correct grid locations + +#### Scenario: Table with rowspan/colspan renders correctly + +Given a table element with cells having rowspan > 1 or colspan > 1 +When the PDF generator renders the table +Then merged cells MUST span the correct number of rows/columns +And content MUST appear in the merged cell position + +### Requirement: Table Visual Fidelity + +The PDF generator MUST render OCR track tables with visual structure matching the original document. + +#### Scenario: Table renders with grid lines + +Given an OCR track table element +When rendered to PDF +Then the table MUST have visible grid lines/borders +And cell boundaries MUST be clearly defined + +#### Scenario: Table text alignment preserved + +Given an OCR track table with cell content +When rendered to PDF +Then text MUST be positioned within the correct cell boundaries +And text MUST NOT overflow into adjacent cells + +### Requirement: Backward Compatibility with Hybrid Mode + +The table rendering fix MUST NOT break hybrid mode processing. 
+ +#### Scenario: Hybrid mode tables render correctly + +Given a document processed with hybrid mode combining Direct and OCR tracks +When the PDF is generated +Then Direct track tables MUST render with existing quality +And OCR track tables MUST render with improved quality +And there MUST be no regression in table positioning or content diff --git a/openspec/changes/fix-ocr-track-table-rendering/tasks.md b/openspec/changes/fix-ocr-track-table-rendering/tasks.md new file mode 100644 index 0000000..15361ba --- /dev/null +++ b/openspec/changes/fix-ocr-track-table-rendering/tasks.md @@ -0,0 +1,55 @@ +# Implementation Tasks + +## Phase 1: Core Fix - Table Content Conversion + +### 1.1 Add TableData.from_dict() class method +- [ ] In `unified_document.py`, add `from_dict()` method to `TableData` class +- [ ] Handle conversion of cells list (list of dicts) to `TableCell` objects +- [ ] Preserve rows, cols, headers, caption fields + +### 1.2 Fix _json_to_document_element for TABLE elements +- [ ] In `pdf_generator_service.py`, modify `_json_to_document_element` +- [ ] When `elem_type == ElementType.TABLE` and content is dict with 'cells', convert to `TableData` +- [ ] Use `TableData.from_dict()` for clean conversion + +### 1.3 Verify TableData.to_html() generates correct HTML +- [ ] Test that `to_html()` produces parseable HTML with proper row/cell structure +- [ ] Verify colspan/rowspan attributes are correctly generated +- [ ] Ensure empty cells are properly handled + +## Phase 2: OCR Track Rendering Consistency + +### 2.1 Review convert_unified_document_to_ocr_data +- [ ] Verify TableData objects are properly converted to HTML +- [ ] Add fallback handling for dict content with 'cells' key +- [ ] Log warning if content cannot be converted to HTML + +### 2.2 Review draw_table_region +- [ ] Verify HTMLTableParser correctly parses generated HTML +- [ ] Check that ReportLab Table is positioned at correct bbox +- [ ] Verify font and style application + +## Phase 3: Testing and Verification +
+### 3.1 Test OCR Track +- [ ] Test scan.pdf - verify tables have correct structure +- [ ] Test img1.png, img2.png, img3.png +- [ ] Compare generated PDF with original documents + +### 3.2 Test Direct Track (Regression) +- [ ] Test PDF files with Direct track +- [ ] Verify table rendering unchanged + +### 3.3 Test Hybrid Mode +- [ ] Test files that trigger hybrid processing +- [ ] Verify mixed Direct + OCR elements render correctly + +## Phase 4: Code Quality + +### 4.1 Add logging +- [ ] Add debug logging for table content type detection +- [ ] Log conversion steps for troubleshooting + +### 4.2 Error handling +- [ ] Handle malformed cell data gracefully +- [ ] Log warnings for unexpected content formats