wip: add TableData.from_dict() for OCR track table parsing (incomplete)
Add TableData.from_dict() and TableCell.from_dict() methods to convert JSON table dicts to proper TableData objects during UnifiedDocument parsing. Modified _json_to_document_element() to detect TABLE elements with dict content containing a 'cells' key and convert them to TableData.

Note: This fix ensures table elements have a proper to_html() method available, but the rendered output still needs investigation; tables may still render incorrectly in OCR track PDFs.

Files changed:
- unified_document.py: Add from_dict() class methods
- pdf_generator_service.py: Convert table dicts during JSON parsing
- Add fix-ocr-track-table-rendering proposal

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -186,6 +186,30 @@ class TableCell:
|
||||
"style": self.style.to_dict() if self.style else None
|
||||
}
|
||||
|
||||
@classmethod
def from_dict(cls, data: Dict[str, Any]) -> 'TableCell':
    """
    Build a TableCell from its dictionary form.

    Args:
        data: Mapping that may carry the keys row, col, row_span, col_span,
            content and bbox. Missing keys default to 0 for row/col, 1 for
            the spans and '' for content.

    Returns:
        TableCell instance. Its bbox is a BoundingBox only when
        ``data['bbox']`` is a non-empty dict (absent coordinates default
        to 0); otherwise bbox is None. Style is always None.
    """
    raw_bbox = data.get('bbox')

    # Only a non-empty dict is turned into a BoundingBox; any other value
    # (missing, None, list, ...) leaves the cell without a bounding box.
    cell_bbox = None
    if raw_bbox and isinstance(raw_bbox, dict):
        coords = {
            axis: raw_bbox.get(axis, 0)
            for axis in ('x0', 'y0', 'x1', 'y1')
        }
        cell_bbox = BoundingBox(**coords)

    fields = {
        'row': data.get('row', 0),
        'col': data.get('col', 0),
        'row_span': data.get('row_span', 1),
        'col_span': data.get('col_span', 1),
        'content': data.get('content', ''),
        'bbox': cell_bbox,
        'style': None,  # Style parsing can be added if needed
    }
    return cls(**fields)
|
||||
|
||||
|
||||
@dataclass
|
||||
class TableData:
|
||||
@@ -205,6 +229,35 @@ class TableData:
|
||||
"caption": self.caption
|
||||
}
|
||||
|
||||
@classmethod
def from_dict(cls, data: Dict[str, Any]) -> 'TableData':
    """
    Create TableData from dictionary.

    Handles conversion from JSON format with cells array to proper TableData
    object with TableCell instances.

    Args:
        data: Dictionary with keys: rows, cols, cells, headers, caption.
            Missing rows/cols default to 0; missing headers/caption
            default to None. A missing or null ``cells`` value is treated
            as an empty list.

    Returns:
        TableData instance
    """
    cells = []
    # dict.get() only applies its default when the key is absent, so an
    # explicit "cells": null would hand None to the loop and raise
    # TypeError. `or []` covers both the missing and the null case.
    for cell_data in data.get('cells') or []:
        if isinstance(cell_data, dict):
            cells.append(TableCell.from_dict(cell_data))
        elif isinstance(cell_data, TableCell):
            # Already-parsed cells are passed through unchanged.
            cells.append(cell_data)
        # Entries of any other type are skipped.

    return cls(
        rows=data.get('rows', 0),
        cols=data.get('cols', 0),
        cells=cells,
        headers=data.get('headers'),
        caption=data.get('caption')
    )
|
||||
|
||||
def to_html(self) -> str:
|
||||
"""Convert table to HTML representation"""
|
||||
html = ["<table>"]
|
||||
|
||||
Reference in New Issue
Block a user