wip: add TableData.from_dict() for OCR track table parsing (incomplete)
Add TableData.from_dict() and TableCell.from_dict() class methods to convert JSON table dicts into proper TableData objects during UnifiedDocument parsing. Modified _json_to_document_element() to detect TABLE elements whose dict content contains a 'cells' key and convert that content to TableData. Note: This fix ensures table elements have a proper to_html() method available, but the rendered output still needs investigation; tables may still render incorrectly in OCR track PDFs. Files changed: - unified_document.py: add from_dict() class methods; - pdf_generator_service.py: convert table dicts during JSON parsing; - add fix-ocr-track-table-rendering proposal. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -186,6 +186,30 @@ class TableCell:
|
||||
"style": self.style.to_dict() if self.style else None
|
||||
}
|
||||
|
||||
@classmethod
def from_dict(cls, data: Dict[str, Any]) -> 'TableCell':
    """Build a TableCell from its dictionary (JSON) representation.

    Args:
        data: Mapping that may contain the keys ``row``, ``col``,
            ``row_span``, ``col_span``, ``content`` and ``bbox``.

    Returns:
        A new TableCell. Missing keys fall back to row/col ``0``,
        spans ``1`` and empty ``content``.

    Notes:
        * A bounding box is only created when ``data['bbox']`` is a
          non-empty dict; any other value leaves ``bbox`` as ``None``.
          Coordinates absent from that dict default to ``0``.
        * ``style`` is always set to ``None``; style information in
          ``data`` is not restored here.
    """
    raw_bbox = data.get('bbox')
    cell_bbox = None
    if raw_bbox and isinstance(raw_bbox, dict):
        # Pull the four corner coordinates, defaulting each to 0.
        coords = {key: raw_bbox.get(key, 0) for key in ('x0', 'y0', 'x1', 'y1')}
        cell_bbox = BoundingBox(**coords)

    cell_fields = {
        'row': data.get('row', 0),
        'col': data.get('col', 0),
        'row_span': data.get('row_span', 1),
        'col_span': data.get('col_span', 1),
        'content': data.get('content', ''),
        'bbox': cell_bbox,
        'style': None,  # Style parsing can be added if needed
    }
    return cls(**cell_fields)
|
||||
|
||||
|
||||
@dataclass
|
||||
class TableData:
|
||||
@@ -205,6 +229,35 @@ class TableData:
|
||||
"caption": self.caption
|
||||
}
|
||||
|
||||
@classmethod
def from_dict(cls, data: Dict[str, Any]) -> 'TableData':
    """
    Create TableData from dictionary.

    Handles conversion from JSON format with a cells array to a proper
    TableData object holding TableCell instances.

    Args:
        data: Dictionary with keys: rows, cols, cells, headers, caption.
            ``cells`` may be missing or ``None`` (JSON ``null``); both are
            treated as an empty list.

    Returns:
        TableData instance. ``rows``/``cols`` default to 0 and
        ``headers``/``caption`` to None when absent.

    Notes:
        Entries in ``cells`` that are neither a dict nor a TableCell are
        skipped silently.
    """
    # `dict.get('cells', [])` would still return None for an explicit
    # JSON null, and iterating None raises TypeError; normalise first.
    raw_cells = data.get('cells') or []

    cells = []
    for cell_data in raw_cells:
        if isinstance(cell_data, dict):
            cells.append(TableCell.from_dict(cell_data))
        elif isinstance(cell_data, TableCell):
            # Already parsed; keep the instance as-is.
            cells.append(cell_data)

    return cls(
        rows=data.get('rows', 0),
        cols=data.get('cols', 0),
        cells=cells,
        headers=data.get('headers'),
        caption=data.get('caption')
    )
|
||||
|
||||
def to_html(self) -> str:
|
||||
"""Convert table to HTML representation"""
|
||||
html = ["<table>"]
|
||||
|
||||
@@ -1945,11 +1945,23 @@ class PDFGeneratorService:
|
||||
if child:
|
||||
children.append(child)
|
||||
|
||||
# Process content based on element type
|
||||
content = elem_dict.get('content', '')
|
||||
|
||||
# For TABLE elements, convert dict content to TableData object
|
||||
if elem_type == ElementType.TABLE and isinstance(content, dict) and 'cells' in content:
|
||||
try:
|
||||
content = TableData.from_dict(content)
|
||||
logger.debug(f"Converted table dict to TableData: {content.rows}x{content.cols}, {len(content.cells)} cells")
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to convert table dict to TableData: {e}")
|
||||
# Keep original dict as fallback
|
||||
|
||||
# Create element
|
||||
element = DocumentElement(
|
||||
element_id=elem_dict.get('element_id', ''),
|
||||
type=elem_type,
|
||||
content=elem_dict.get('content', ''),
|
||||
content=content,
|
||||
bbox=bbox,
|
||||
confidence=elem_dict.get('confidence'),
|
||||
style=style,
|
||||
|
||||
Reference in New Issue
Block a user