wip: add TableData.from_dict() for OCR track table parsing (incomplete)

Add TableData.from_dict() and TableCell.from_dict() class methods that convert JSON table dicts into proper TableData objects during UnifiedDocument parsing. Modified _json_to_document_element() to detect TABLE elements whose content is a dict containing a 'cells' key and convert that content to TableData.

Note: This fix ensures table elements have a proper to_html() method available, but the rendered output still needs investigation - tables may still render incorrectly in OCR track PDFs.

Files changed:
- unified_document.py: Add from_dict() class methods
- pdf_generator_service.py: Convert table dicts during JSON parsing
- Add fix-ocr-track-table-rendering proposal

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-26 19:16:51 +08:00
parent 6e050eb540
commit c65df754cf
5 changed files with 281 additions and 1 deletions
--- a/backend/app/models/unified_document.py
+++ b/backend/app/models/unified_document.py
@@ -186,6 +186,30 @@ class TableCell:
            "style": self.style.to_dict() if self.style else None
        }

+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> 'TableCell':
+        """Build a TableCell from its dictionary form.
+
+        Missing keys fall back to defaults: ``row`` and ``col`` to 0, both
+        spans to 1 and ``content`` to an empty string. A ``bbox`` entry is
+        used only when it is a non-empty dict (absent coordinates default
+        to 0); any ``style`` entry in ``data`` is ignored.
+        """
+        raw_bbox = data.get('bbox')
+        # Anything other than a non-empty dict leaves the cell without a bbox.
+        cell_bbox = (
+            BoundingBox(
+                x0=raw_bbox.get('x0', 0),
+                y0=raw_bbox.get('y0', 0),
+                x1=raw_bbox.get('x1', 0),
+                y1=raw_bbox.get('y1', 0),
+            )
+            if raw_bbox and isinstance(raw_bbox, dict)
+            else None
+        )
+
+        return cls(
+            row=data.get('row', 0),
+            col=data.get('col', 0),
+            row_span=data.get('row_span', 1),
+            col_span=data.get('col_span', 1),
+            content=data.get('content', ''),
+            bbox=cell_bbox,
+            style=None,  # Style parsing can be added if needed
+        )
+

@dataclass
 class TableData:
@@ -205,6 +229,35 @@ class TableData:
            "caption": self.caption
        }

+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> 'TableData':
+        """
+        Create TableData from dictionary.
+
+        Handles conversion from JSON format with cells array to proper TableData
+        object with TableCell instances.
+
+        Args:
+            data: Dictionary with keys: rows, cols, cells, headers, caption.
+                Every key is optional. ``cells`` may be missing or None
+                (JSON null); in both cases the table gets no cells.
+
+        Returns:
+            TableData instance. ``rows`` and ``cols`` default to 0 when
+            absent; ``headers`` and ``caption`` default to None.
+        """
+        cells = []
+        # `or []` also covers an explicit null: .get()'s default only applies
+        # when the key is absent, and iterating None would raise TypeError.
+        for cell_data in data.get('cells') or []:
+            if isinstance(cell_data, dict):
+                cells.append(TableCell.from_dict(cell_data))
+            elif isinstance(cell_data, TableCell):
+                # Already-parsed cells are kept as they are.
+                cells.append(cell_data)
+            # Entries of any other type are skipped.
+
+        return cls(
+            rows=data.get('rows', 0),
+            cols=data.get('cols', 0),
+            cells=cells,
+            headers=data.get('headers'),
+            caption=data.get('caption')
+        )
+
    def to_html(self) -> str:
        """Convert table to HTML representation"""
        html = ["<table>"]
--- a/backend/app/services/pdf_generator_service.py
+++ b/backend/app/services/pdf_generator_service.py
@@ -1945,11 +1945,23 @@ class PDFGeneratorService:
                if child:
                    children.append(child)

+            # Process content based on element type
+            content = elem_dict.get('content', '')
+
+            # For TABLE elements, convert dict content to TableData object
+            if elem_type == ElementType.TABLE and isinstance(content, dict) and 'cells' in content:
+                try:
+                    content = TableData.from_dict(content)
+                    logger.debug(f"Converted table dict to TableData: {content.rows}x{content.cols}, {len(content.cells)} cells")
+                except Exception as e:
+                    logger.warning(f"Failed to convert table dict to TableData: {e}")
+                    # Keep original dict as fallback
+
            # Create element
            element = DocumentElement(
                element_id=elem_dict.get('element_id', ''),
                type=elem_type,
-                content=elem_dict.get('content', ''),
+                content=content,
                bbox=bbox,
                confidence=elem_dict.get('confidence'),
                style=style,