wip: add TableData.from_dict() for OCR track table parsing (incomplete)

Add TableData.from_dict() and TableCell.from_dict() methods to convert
JSON table dicts to proper TableData objects during UnifiedDocument parsing.

Modified _json_to_document_element() to detect TABLE elements with dict
content containing 'cells' key and convert to TableData.

Note: This fix ensures table elements have a proper to_html() method available,
but the rendered output still needs investigation - tables may still render
incorrectly in OCR track PDFs.

Files changed:
- unified_document.py: Add from_dict() class methods
- pdf_generator_service.py: Convert table dicts during JSON parsing
- Add fix-ocr-track-table-rendering proposal

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
egg
2025-11-26 19:16:51 +08:00
parent 6e050eb540
commit c65df754cf
5 changed files with 281 additions and 1 deletions

View File

@@ -186,6 +186,30 @@ class TableCell:
"style": self.style.to_dict() if self.style else None
}
@classmethod
def from_dict(cls, data: Dict[str, Any]) -> 'TableCell':
    """Build a TableCell from its dictionary form.

    Args:
        data: Mapping that may carry the keys ``row``, ``col``,
            ``row_span``, ``col_span``, ``content`` and ``bbox``.
            Missing keys fall back to 0, 0, 1, 1 and ``''`` respectively.

    Returns:
        A new TableCell. ``bbox`` is populated only when ``data['bbox']``
        is a non-empty dict (absent corners default to 0); any other
        value leaves it as ``None``. ``style`` is always ``None`` because
        style data is not parsed here.
    """
    raw_bbox = data.get('bbox')
    cell_bbox = None
    # Only the dict representation is understood; anything else is ignored.
    if raw_bbox and isinstance(raw_bbox, dict):
        corners = {
            key: raw_bbox.get(key, 0)
            for key in ('x0', 'y0', 'x1', 'y1')
        }
        cell_bbox = BoundingBox(**corners)

    cell_fields = {
        'row': data.get('row', 0),
        'col': data.get('col', 0),
        'row_span': data.get('row_span', 1),
        'col_span': data.get('col_span', 1),
        'content': data.get('content', ''),
    }
    # Style parsing can be added if needed; for now it is left unset.
    return cls(bbox=cell_bbox, style=None, **cell_fields)
@dataclass
class TableData:
@@ -205,6 +229,35 @@ class TableData:
"caption": self.caption
}
@classmethod
def from_dict(cls, data: Dict[str, Any]) -> 'TableData':
"""
Create TableData from dictionary.
Handles conversion from JSON format with cells array to proper TableData
object with TableCell instances.
Args:
data: Dictionary with keys: rows, cols, cells, headers, caption
Returns:
TableData instance
"""
cells = []
for cell_data in data.get('cells', []):
if isinstance(cell_data, dict):
cells.append(TableCell.from_dict(cell_data))
elif isinstance(cell_data, TableCell):
cells.append(cell_data)
return cls(
rows=data.get('rows', 0),
cols=data.get('cols', 0),
cells=cells,
headers=data.get('headers'),
caption=data.get('caption')
)
def to_html(self) -> str:
"""Convert table to HTML representation"""
html = ["<table>"]

View File

@@ -1945,11 +1945,23 @@ class PDFGeneratorService:
if child:
children.append(child)
# Process content based on element type
content = elem_dict.get('content', '')
# For TABLE elements, convert dict content to TableData object
if elem_type == ElementType.TABLE and isinstance(content, dict) and 'cells' in content:
try:
content = TableData.from_dict(content)
logger.debug(f"Converted table dict to TableData: {content.rows}x{content.cols}, {len(content.cells)} cells")
except Exception as e:
logger.warning(f"Failed to convert table dict to TableData: {e}")
# Keep original dict as fallback
# Create element
element = DocumentElement(
element_id=elem_dict.get('element_id', ''),
type=elem_type,
content=elem_dict.get('content', ''),
content=content,
bbox=bbox,
confidence=elem_dict.get('confidence'),
style=style,