feat: implement hybrid image extraction and memory management

Backend:
- Add hybrid image extraction for Direct track (inline image blocks)
- Add render_inline_image_regions() as a fallback for when OCR doesn't find images
- Add check_document_for_missing_images() for detecting missing images
- Add memory management system (MemoryGuard, ModelManager, ServicePool)
- Update pdf_generator_service to handle HYBRID processing track
- Add ElementType.LOGO for logo extraction

Frontend:
- Fix PDF viewer re-rendering issues with memoization
- Add TaskNotFound component and useTaskValidation hook
- Disable React StrictMode because it is incompatible with react-pdf
- Fix task detail and results page loading states

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
egg
2025-11-26 10:56:22 +08:00
parent ba8ddf2b68
commit 1afdb822c3
26 changed files with 8273 additions and 366 deletions

View File

@@ -212,26 +212,44 @@ class TableData:
if self.caption:
html.append(f"<caption>{self.caption}</caption>")
# Group cells by row
rows_data = {}
# Group cells by row and column for quick lookup
cell_map = {}
for cell in self.cells:
if cell.row not in rows_data:
rows_data[cell.row] = []
rows_data[cell.row].append(cell)
cell_map[(cell.row, cell.col)] = cell
# Generate HTML
# Track which cells are covered by row/col spans
covered = set()
for cell in self.cells:
if cell.row_span > 1 or cell.col_span > 1:
for r in range(cell.row, cell.row + cell.row_span):
for c in range(cell.col, cell.col + cell.col_span):
if (r, c) != (cell.row, cell.col):
covered.add((r, c))
# Generate HTML with proper column filling
for row_idx in range(self.rows):
html.append("<tr>")
if row_idx in rows_data:
for cell in sorted(rows_data[row_idx], key=lambda c: c.col):
for col_idx in range(self.cols):
# Skip cells covered by row/col spans
if (row_idx, col_idx) in covered:
continue
cell = cell_map.get((row_idx, col_idx))
tag = "th" if row_idx == 0 and self.headers else "td"
if cell:
span_attrs = []
if cell.row_span > 1:
span_attrs.append(f'rowspan="{cell.row_span}"')
if cell.col_span > 1:
span_attrs.append(f'colspan="{cell.col_span}"')
span_str = " ".join(span_attrs)
tag = "th" if row_idx == 0 and self.headers else "td"
html.append(f'<{tag} {span_str}>{cell.content}</{tag}>')
content = cell.content if cell.content else ""
html.append(f'<{tag} {span_str}>{content}</{tag}>')
else:
# Fill in empty cell for missing positions
html.append(f'<{tag}></{tag}>')
html.append("</tr>")
html.append("</table>")