feat: implement hybrid image extraction and memory management
Backend:
- Add hybrid image extraction for the Direct track (inline image blocks)
- Add render_inline_image_regions() fallback for when OCR doesn't find images
- Add check_document_for_missing_images() for detecting missing images
- Add memory management system (MemoryGuard, ModelManager, ServicePool)
- Update pdf_generator_service to handle the HYBRID processing track
- Add ElementType.LOGO for logo extraction

Frontend:
- Fix PDF viewer re-rendering issues with memoization
- Add TaskNotFound component and useTaskValidation hook
- Disable StrictMode due to react-pdf incompatibility
- Fix task detail and results page loading states

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -212,26 +212,44 @@ class TableData:
|
||||
if self.caption:
|
||||
html.append(f"<caption>{self.caption}</caption>")
|
||||
|
||||
# Group cells by row
|
||||
rows_data = {}
|
||||
# Group cells by row and column for quick lookup
|
||||
cell_map = {}
|
||||
for cell in self.cells:
|
||||
if cell.row not in rows_data:
|
||||
rows_data[cell.row] = []
|
||||
rows_data[cell.row].append(cell)
|
||||
cell_map[(cell.row, cell.col)] = cell
|
||||
|
||||
# Generate HTML
|
||||
# Track which cells are covered by row/col spans
|
||||
covered = set()
|
||||
for cell in self.cells:
|
||||
if cell.row_span > 1 or cell.col_span > 1:
|
||||
for r in range(cell.row, cell.row + cell.row_span):
|
||||
for c in range(cell.col, cell.col + cell.col_span):
|
||||
if (r, c) != (cell.row, cell.col):
|
||||
covered.add((r, c))
|
||||
|
||||
# Generate HTML with proper column filling
|
||||
for row_idx in range(self.rows):
|
||||
html.append("<tr>")
|
||||
if row_idx in rows_data:
|
||||
for cell in sorted(rows_data[row_idx], key=lambda c: c.col):
|
||||
for col_idx in range(self.cols):
|
||||
# Skip cells covered by row/col spans
|
||||
if (row_idx, col_idx) in covered:
|
||||
continue
|
||||
|
||||
cell = cell_map.get((row_idx, col_idx))
|
||||
tag = "th" if row_idx == 0 and self.headers else "td"
|
||||
|
||||
if cell:
|
||||
span_attrs = []
|
||||
if cell.row_span > 1:
|
||||
span_attrs.append(f'rowspan="{cell.row_span}"')
|
||||
if cell.col_span > 1:
|
||||
span_attrs.append(f'colspan="{cell.col_span}"')
|
||||
span_str = " ".join(span_attrs)
|
||||
tag = "th" if row_idx == 0 and self.headers else "td"
|
||||
html.append(f'<{tag} {span_str}>{cell.content}</{tag}>')
|
||||
content = cell.content if cell.content else ""
|
||||
html.append(f'<{tag} {span_str}>{content}</{tag}>')
|
||||
else:
|
||||
# Fill in empty cell for missing positions
|
||||
html.append(f'<{tag}></{tag}>')
|
||||
|
||||
html.append("</tr>")
|
||||
|
||||
html.append("</table>")
|
||||
|
||||
Reference in New Issue
Block a user