feat: implement hybrid image extraction and memory management
Backend:
- Add hybrid image extraction for Direct track (inline image blocks)
- Add render_inline_image_regions() fallback when OCR doesn't find images
- Add check_document_for_missing_images() for detecting missing images
- Add memory management system (MemoryGuard, ModelManager, ServicePool)
- Update pdf_generator_service to handle HYBRID processing track
- Add ElementType.LOGO for logo extraction

Frontend:
- Fix PDF viewer re-rendering issues with memoization
- Add TaskNotFound component and useTaskValidation hook
- Disable StrictMode due to react-pdf incompatibility
- Fix task detail and results page loading states

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -247,9 +247,11 @@ class DirectExtractionEngine:
|
||||
element_counter += len(image_elements)
|
||||
|
||||
# Extract vector graphics (charts, diagrams) from drawing commands
|
||||
# Pass table_bboxes to filter out table border drawings before clustering
|
||||
if self.enable_image_extraction:
|
||||
vector_elements = self._extract_vector_graphics(
|
||||
page, page_num, document_id, element_counter, output_dir
|
||||
page, page_num, document_id, element_counter, output_dir,
|
||||
table_bboxes=table_bboxes
|
||||
)
|
||||
elements.extend(vector_elements)
|
||||
element_counter += len(vector_elements)
|
||||
@@ -705,40 +707,52 @@ class DirectExtractionEngine:
|
||||
y1=bbox_data[3]
|
||||
)
|
||||
|
||||
# Extract column widths from table cells
|
||||
# Extract column widths from table cells by analyzing X boundaries
|
||||
column_widths = []
|
||||
if hasattr(table, 'cells') and table.cells:
|
||||
# Group cells by column
|
||||
cols_x = {}
|
||||
# Collect all unique X boundaries (both left and right edges)
|
||||
x_boundaries = set()
|
||||
for cell in table.cells:
|
||||
col_idx = None
|
||||
# Determine column index by x0 position
|
||||
for idx, x0 in enumerate(sorted(set(c[0] for c in table.cells))):
|
||||
if abs(cell[0] - x0) < 1.0: # Within 1pt tolerance
|
||||
col_idx = idx
|
||||
break
|
||||
x_boundaries.add(round(cell[0], 1)) # x0 (left edge)
|
||||
x_boundaries.add(round(cell[2], 1)) # x1 (right edge)
|
||||
|
||||
if col_idx is not None:
|
||||
if col_idx not in cols_x:
|
||||
cols_x[col_idx] = {'x0': cell[0], 'x1': cell[2]}
|
||||
else:
|
||||
cols_x[col_idx]['x1'] = max(cols_x[col_idx]['x1'], cell[2])
|
||||
# Sort boundaries to get column edges
|
||||
sorted_x = sorted(x_boundaries)
|
||||
|
||||
# Calculate width for each column
|
||||
for col_idx in sorted(cols_x.keys()):
|
||||
width = cols_x[col_idx]['x1'] - cols_x[col_idx]['x0']
|
||||
column_widths.append(width)
|
||||
# Calculate column widths from adjacent boundaries
|
||||
if len(sorted_x) >= 2:
|
||||
column_widths = [sorted_x[i+1] - sorted_x[i] for i in range(len(sorted_x)-1)]
|
||||
logger.debug(f"Calculated column widths from {len(sorted_x)} boundaries: {column_widths}")
|
||||
|
||||
# Extract row heights from table cells by analyzing Y boundaries
|
||||
row_heights = []
|
||||
if hasattr(table, 'cells') and table.cells:
|
||||
# Collect all unique Y boundaries (both top and bottom edges)
|
||||
y_boundaries = set()
|
||||
for cell in table.cells:
|
||||
y_boundaries.add(round(cell[1], 1)) # y0 (top edge)
|
||||
y_boundaries.add(round(cell[3], 1)) # y1 (bottom edge)
|
||||
|
||||
# Sort boundaries to get row edges
|
||||
sorted_y = sorted(y_boundaries)
|
||||
|
||||
# Calculate row heights from adjacent boundaries
|
||||
if len(sorted_y) >= 2:
|
||||
row_heights = [sorted_y[i+1] - sorted_y[i] for i in range(len(sorted_y)-1)]
|
||||
logger.debug(f"Calculated row heights from {len(sorted_y)} boundaries: {row_heights}")
|
||||
|
||||
# Create table cells
|
||||
# Note: Include ALL cells (even empty ones) to preserve table structure
|
||||
# This is critical for correct HTML generation and PDF rendering
|
||||
cells = []
|
||||
for row_idx, row in enumerate(data):
|
||||
for col_idx, cell_text in enumerate(row):
|
||||
if cell_text:
|
||||
cells.append(TableCell(
|
||||
row=row_idx,
|
||||
col=col_idx,
|
||||
content=str(cell_text) if cell_text else ""
|
||||
))
|
||||
# Always add cell, even if empty, to maintain table structure
|
||||
cells.append(TableCell(
|
||||
row=row_idx,
|
||||
col=col_idx,
|
||||
content=str(cell_text) if cell_text else ""
|
||||
))
|
||||
|
||||
# Create table data
|
||||
table_data = TableData(
|
||||
@@ -748,8 +762,13 @@ class DirectExtractionEngine:
|
||||
headers=data[0] if data else None # Assume first row is header
|
||||
)
|
||||
|
||||
# Store column widths in metadata
|
||||
metadata = {"column_widths": column_widths} if column_widths else None
|
||||
# Store column widths and row heights in metadata
|
||||
metadata = {}
|
||||
if column_widths:
|
||||
metadata["column_widths"] = column_widths
|
||||
if row_heights:
|
||||
metadata["row_heights"] = row_heights
|
||||
metadata = metadata if metadata else None
|
||||
|
||||
return DocumentElement(
|
||||
element_id=f"table_{page_num}_{counter}",
|
||||
@@ -978,7 +997,9 @@ class DirectExtractionEngine:
|
||||
image_filename = f"{document_id}_p{page_num}_img{img_idx}.png"
|
||||
image_path = output_dir / image_filename
|
||||
pix.save(str(image_path))
|
||||
image_data["saved_path"] = str(image_path)
|
||||
# Store relative filename only (consistent with OCR track)
|
||||
# PDF generator will join with result_dir to get full path
|
||||
image_data["saved_path"] = image_filename
|
||||
logger.debug(f"Saved image to {image_path}")
|
||||
|
||||
element = DocumentElement(
|
||||
@@ -1001,12 +1022,272 @@ class DirectExtractionEngine:
|
||||
|
||||
return elements
|
||||
|
||||
def has_missing_images(self, page: fitz.Page) -> bool:
    """
    Report whether a page probably contains graphics that were not extracted.

    A page qualifies when ``page.get_images()`` returns nothing but the
    page's text dictionary holds at least 10 blocks of type 1 (inline image
    blocks). Any failure while inspecting the page is logged as a warning
    and treated as "nothing missing".

    Args:
        page: PyMuPDF page object

    Returns:
        True if the page has no standard images and at least 10 inline
        image blocks; False otherwise (including on error)
    """
    try:
        # Pages with regular embedded images never trigger the fallback.
        if page.get_images():
            return False

        # Count the inline image blocks (type == 1) in the text dictionary.
        page_blocks = page.get_text("dict", sort=True).get("blocks", [])
        image_block_count = len([blk for blk in page_blocks if blk.get("type") == 1])

        # Too few inline blocks to suggest a composed logo or graphic.
        if image_block_count < 10:
            return False

        logger.info(f"Detected {image_block_count} inline image blocks - may need OCR for image extraction")
        return True

    except Exception as e:
        logger.warning(f"Error checking for missing images: {e}")
        return False
|
||||
|
||||
def check_document_for_missing_images(self, pdf_path: Path) -> List[int]:
    """
    Check a PDF document for pages that likely have missing images.

    Opens the PDF and runs ``has_missing_images()`` on every page. The
    document is always closed, even if inspecting a page raises. Errors are
    logged and do not propagate; pages identified before the error are
    still returned.

    Args:
        pdf_path: Path to the PDF file

    Returns:
        List of page numbers (1-indexed) that have missing images
    """
    pages_with_missing_images: List[int] = []

    try:
        doc = fitz.open(str(pdf_path))
        try:
            for page_index in range(len(doc)):
                if self.has_missing_images(doc[page_index]):
                    pages_with_missing_images.append(page_index + 1)  # 1-indexed
        finally:
            # Release the file handle on both the success and the error path.
            doc.close()

        if pages_with_missing_images:
            logger.info(f"Document has missing images on pages: {pages_with_missing_images}")

    except Exception as e:
        logger.error(f"Error checking document for missing images: {e}")

    return pages_with_missing_images
|
||||
|
||||
def render_inline_image_regions(
    self,
    pdf_path: Path,
    unified_doc: 'UnifiedDocument',
    pages: List[int],
    output_dir: Optional[Path] = None
) -> int:
    """
    Render inline image regions and add them to the unified document.

    For each requested page, the inline image blocks (type=1 in the text
    dict) are collected and clustered with ``_cluster_nearby_rects``. Each
    surviving cluster is rendered to a pixmap at 2x resolution and appended
    to the matching page of ``unified_doc`` as an ``ElementType.LOGO``
    element.

    A page is skipped when it is out of range, has fewer than 5 inline image
    blocks, or has no page with the same ``page_number`` in ``unified_doc``.
    A region is skipped when it is narrower or shorter than 30pt, or when
    its top edge lies below 40% of the page height.

    The PDF is always closed before returning. Errors are logged and never
    propagate; elements added before an error stay in ``unified_doc``.

    Args:
        pdf_path: Path to the PDF file
        unified_doc: UnifiedDocument to add images to
        pages: List of page numbers (1-indexed) to process
        output_dir: Directory to save rendered images; when omitted the
            elements are still created but carry no "saved_path"

    Returns:
        Number of images added
    """
    images_added = 0

    try:
        doc = fitz.open(str(pdf_path))
        try:
            for page_num in pages:
                if page_num < 1 or page_num > len(doc):
                    continue

                page = doc[page_num - 1]  # 0-indexed
                page_rect = page.rect

                # Get inline image blocks
                text_dict = page.get_text("dict", sort=True)
                blocks = text_dict.get("blocks", [])

                image_blocks = []
                for block in blocks:
                    if block.get("type") == 1:  # Image block
                        bbox = block.get("bbox")
                        if bbox:
                            image_blocks.append(fitz.Rect(bbox))

                if len(image_blocks) < 5:  # Reduced from 10
                    logger.debug(f"Page {page_num}: Only {len(image_blocks)} inline image blocks, skipping")
                    continue

                logger.info(f"Page {page_num}: Found {len(image_blocks)} inline image blocks")

                # Cluster nearby image blocks
                regions = self._cluster_nearby_rects(image_blocks, tolerance=5.0)
                logger.info(f"Page {page_num}: Clustered into {len(regions)} regions")

                # Find the corresponding page in unified_doc (first match wins)
                target_page = None
                for p in unified_doc.pages:
                    if p.page_number == page_num:
                        target_page = p
                        break

                if target_page is None:
                    continue

                # Regions whose top edge is below this line are not rendered
                page_40_pct = page_rect.height * 0.4

                for region_idx, region_rect in enumerate(regions):
                    logger.info(f"Page {page_num} region {region_idx}: {region_rect} (w={region_rect.width:.1f}, h={region_rect.height:.1f})")

                    # Skip very small regions
                    if region_rect.width < 30 or region_rect.height < 30:
                        logger.info(f" -> Skipped: too small (min 30x30)")
                        continue

                    # Skip regions that are primarily in the table area (below top 40%)
                    # But allow regions that START in the top portion
                    if region_rect.y0 > page_40_pct:
                        logger.info(f" -> Skipped: y0={region_rect.y0:.1f} > 40% of page ({page_40_pct:.1f})")
                        continue

                    logger.info(f"Rendering inline image region {region_idx} on page {page_num}: {region_rect}")

                    try:
                        # Add small padding, then clamp to the page (intersect mutates in place)
                        clip_rect = region_rect + (-2, -2, 2, 2)
                        clip_rect.intersect(page_rect)

                        # Render at 2x resolution
                        mat = fitz.Matrix(2, 2)
                        pix = page.get_pixmap(clip=clip_rect, matrix=mat, alpha=False)

                        # Create bounding box
                        bbox = BoundingBox(
                            x0=clip_rect.x0,
                            y0=clip_rect.y0,
                            x1=clip_rect.x1,
                            y1=clip_rect.y1
                        )

                        image_data = {
                            "width": pix.width,
                            "height": pix.height,
                            "colorspace": "rgb",
                            "type": "inline_region"
                        }

                        # Save image if output directory provided
                        if output_dir:
                            output_dir.mkdir(parents=True, exist_ok=True)
                            doc_id = unified_doc.document_id or "unknown"
                            image_filename = f"{doc_id}_p{page_num}_logo{region_idx}.png"
                            image_path = output_dir / image_filename
                            pix.save(str(image_path))
                            # Only the file name is stored, not the full path
                            image_data["saved_path"] = image_filename
                            logger.info(f"Saved inline image region to {image_path}")

                        element = DocumentElement(
                            element_id=f"logo_{page_num}_{region_idx}",
                            type=ElementType.LOGO,
                            content=image_data,
                            bbox=bbox,
                            confidence=0.9,
                            metadata={
                                "region_type": "inline_image_blocks",
                                # Count of all inline image blocks on the page, not just this region
                                "block_count": len(image_blocks)
                            }
                        )
                        target_page.elements.append(element)
                        images_added += 1

                        pix = None  # Free memory

                    except Exception as e:
                        # A failed region must not abort the remaining regions/pages
                        logger.error(f"Error rendering inline image region {region_idx}: {e}")
        finally:
            # Always release the PDF, including when a page-level step raised
            doc.close()

        if images_added > 0:
            current_images = unified_doc.metadata.total_images or 0
            unified_doc.metadata.total_images = current_images + images_added
            logger.info(f"Added {images_added} inline image regions to document")

    except Exception as e:
        logger.error(f"Error rendering inline image regions: {e}")

    return images_added
|
||||
|
||||
def _cluster_nearby_rects(self, rects: List[fitz.Rect], tolerance: float = 5.0) -> List[fitz.Rect]:
    """
    Cluster nearby rectangles into regions.

    Two rectangles belong to the same region when one of them, grown by
    ``tolerance`` on every side, intersects the other. Each returned
    rectangle is the union of the members of one region.

    Args:
        rects: Rectangles to cluster
        tolerance: Distance by which a region is grown before the
            intersection test

    Returns:
        List of merged region rectangles (empty when ``rects`` is empty)
    """
    if not rects:
        return []

    # Offsets that grow a rect by `tolerance` on all four sides.
    grow = (-tolerance, -tolerance, tolerance, tolerance)

    # Pass 1: walk the rects top-to-bottom, left-to-right and fold each one
    # into the first region it comes close to.
    regions = []
    for candidate in sorted(rects, key=lambda r: (r.y0, r.x0)):
        for idx, existing in enumerate(regions):
            if (existing + grow).intersects(candidate):
                regions[idx] = existing | candidate
                break
        else:
            regions.append(candidate)

    # Pass 2: regions may have grown into each other; keep merging until a
    # full sweep absorbs nothing.
    while True:
        absorbed = set()
        consolidated = []

        for i, seed in enumerate(regions):
            if i in absorbed:
                continue
            combined = seed
            for j in range(i + 1, len(regions)):
                if j in absorbed:
                    continue
                if (combined + grow).intersects(regions[j]):
                    combined = combined | regions[j]
                    absorbed.add(j)
            consolidated.append(combined)

        regions = consolidated
        if not absorbed:
            break

    return regions
|
||||
|
||||
def _extract_vector_graphics(self,
|
||||
page: fitz.Page,
|
||||
page_num: int,
|
||||
document_id: str,
|
||||
counter: int,
|
||||
output_dir: Optional[Path]) -> List[DocumentElement]:
|
||||
output_dir: Optional[Path],
|
||||
table_bboxes: Optional[List[BoundingBox]] = None) -> List[DocumentElement]:
|
||||
"""
|
||||
Extract vector graphics (charts, diagrams) from page.
|
||||
|
||||
@@ -1020,6 +1301,7 @@ class DirectExtractionEngine:
|
||||
document_id: Unique document identifier
|
||||
counter: Starting counter for element IDs
|
||||
output_dir: Directory to save rendered graphics
|
||||
table_bboxes: List of table bounding boxes to exclude table border drawings
|
||||
|
||||
Returns:
|
||||
List of DocumentElement objects representing vector graphics
|
||||
@@ -1034,16 +1316,25 @@ class DirectExtractionEngine:
|
||||
|
||||
logger.debug(f"Page {page_num} contains {len(drawings)} vector drawing commands")
|
||||
|
||||
# Filter out drawings that are likely table borders
|
||||
# Table borders are typically thin rectangular lines within table regions
|
||||
non_table_drawings = self._filter_table_border_drawings(drawings, table_bboxes)
|
||||
logger.debug(f"After filtering table borders: {len(non_table_drawings)} drawings remain")
|
||||
|
||||
if not non_table_drawings:
|
||||
logger.debug("All drawings appear to be table borders, no vector graphics to extract")
|
||||
return elements
|
||||
|
||||
# Cluster drawings into groups (charts, diagrams, etc.)
|
||||
try:
|
||||
# PyMuPDF's cluster_drawings() groups nearby drawings automatically
|
||||
drawing_clusters = page.cluster_drawings()
|
||||
# Use custom clustering that only considers non-table drawings
|
||||
drawing_clusters = self._cluster_non_table_drawings(page, non_table_drawings)
|
||||
logger.debug(f"Clustered into {len(drawing_clusters)} groups")
|
||||
except (AttributeError, TypeError) as e:
|
||||
# cluster_drawings not available or has different signature
|
||||
# Fallback: try to identify charts by analyzing drawing density
|
||||
logger.warning(f"cluster_drawings() failed ({e}), using fallback method")
|
||||
drawing_clusters = self._cluster_drawings_fallback(page, drawings)
|
||||
logger.warning(f"Custom clustering failed ({e}), using fallback method")
|
||||
drawing_clusters = self._cluster_drawings_fallback(page, non_table_drawings)
|
||||
|
||||
for cluster_idx, bbox in enumerate(drawing_clusters):
|
||||
# Ignore small regions (likely noise or separator lines)
|
||||
@@ -1148,6 +1439,124 @@ class DirectExtractionEngine:
|
||||
|
||||
return filtered_clusters
|
||||
|
||||
def _filter_table_border_drawings(self, drawings: list, table_bboxes: Optional[List[BoundingBox]]) -> list:
    """
    Filter out drawings that are likely table borders.

    A drawing is treated as a table border when both hold:
    - it is thin (width or height < 5pt), and
    - more than 80% of it lies inside a table bounding box grown by 5pt
      on every side.

    The overlap is computed from coordinates rather than rectangle area, so
    zero-width or zero-height rects (plain stroked lines) are measured by
    length instead of being treated as "no overlap".

    Drawings without a 'rect' entry are dropped from the result.

    Args:
        drawings: List of PyMuPDF drawing objects
        table_bboxes: List of table bounding boxes

    Returns:
        List of drawings that are NOT table borders (likely logos, charts, etc.).
        When ``table_bboxes`` is empty or None, ``drawings`` is returned unchanged.
    """
    if not table_bboxes:
        return drawings

    def overlap_ratio(inner, outer) -> float:
        """Return the fraction of ``inner`` that lies within ``outer``."""
        overlap_w = min(inner.x1, outer.x1) - max(inner.x0, outer.x0)
        overlap_h = min(inner.y1, outer.y1) - max(inner.y0, outer.y0)
        if overlap_w < 0 or overlap_h < 0:
            return 0.0  # Disjoint

        width = inner.x1 - inner.x0
        height = inner.y1 - inner.y0
        if width > 0 and height > 0:
            return (overlap_w * overlap_h) / (width * height)

        # Degenerate rect (a line): compare lengths along its extent
        length = max(width, height)
        if length > 0:
            return max(overlap_w, overlap_h) / length

        # A single point that passed the disjoint test lies inside `outer`
        return 1.0

    # Expand each table rect slightly to include border lines on edges.
    # Built once instead of once per drawing.
    expanded_tables = [
        fitz.Rect(tb.x0, tb.y0, tb.x1, tb.y1) + (-5, -5, 5, 5)
        for tb in table_bboxes
    ]

    non_table_drawings = []
    table_border_count = 0

    for drawing in drawings:
        rect = drawing.get('rect')
        if not rect:
            continue

        draw_rect = fitz.Rect(rect)

        # Check if this drawing is a thin line (potential table border)
        is_thin_line = draw_rect.width < 5 or draw_rect.height < 5

        # Only thin lines can be borders, so skip the overlap test otherwise
        is_table_border = is_thin_line and any(
            overlap_ratio(draw_rect, table_rect) > 0.8
            for table_rect in expanded_tables
        )

        # Keep drawing if it's NOT (thin line AND overlapping table)
        # This keeps: logos (complex shapes), charts outside tables, etc.
        if is_table_border:
            table_border_count += 1
        else:
            non_table_drawings.append(drawing)

    if table_border_count > 0:
        logger.debug(f"Filtered out {table_border_count} table border drawings")

    return non_table_drawings
|
||||
|
||||
def _cluster_non_table_drawings(self, page: fitz.Page, drawings: list) -> list:
    """
    Cluster non-table drawings into groups.

    The drawings' bounding boxes are merged with ``_cluster_nearby_rects``
    using a 10pt tolerance. That helper repeats merging until no two
    clusters are within tolerance of each other, so the result does not
    depend on the order of ``drawings`` and never contains two clusters
    that should have been one. Clusters smaller than 30x30 are discarded
    as noise.

    Args:
        page: PyMuPDF page object (not used by this method)
        drawings: Pre-filtered list of drawings (excluding table borders)

    Returns:
        List of fitz.Rect representing clustered drawing regions
    """
    if not drawings:
        return []

    # Collect all drawing bounding boxes, ignoring drawings without a rect
    raw_rects = (drawing.get('rect') for drawing in drawings)
    bboxes = [fitz.Rect(rect) for rect in raw_rects if rect]

    if not bboxes:
        return []

    # Smaller tolerance than the fallback clustering (was 20), to avoid
    # grouping distant graphics together
    tolerance = 10
    clusters = self._cluster_nearby_rects(bboxes, tolerance=tolerance)

    # Filter out very small clusters (noise)
    # Keep minimum 30x30 for logos (smaller than default 50x50)
    filtered_clusters = [c for c in clusters if c.width >= 30 and c.height >= 30]

    logger.debug(f"Non-table clustering: {len(bboxes)} drawings -> {len(clusters)} clusters -> {len(filtered_clusters)} filtered")

    return filtered_clusters
|
||||
|
||||
def _deduplicate_table_chart_overlap(self, elements: List[DocumentElement]) -> List[DocumentElement]:
|
||||
"""
|
||||
Intelligently resolve TABLE-CHART overlaps based on table structure completeness.
|
||||
|
||||
Reference in New Issue
Block a user