feat: implement hybrid image extraction and memory management

Backend: - Add hybrid image extraction for Direct track (inline image blocks) - Add render_inline_image_regions() fallback when OCR doesn't find images - Add check_document_for_missing_images() for detecting missing images - Add memory management system (MemoryGuard, ModelManager, ServicePool) - Update pdf_generator_service to handle HYBRID processing track - Add ElementType.LOGO for logo extraction Frontend: - Fix PDF viewer re-rendering issues with memoization - Add TaskNotFound component and useTaskValidation hook - Disable StrictMode due to react-pdf incompatibility - Fix task detail and results page loading states 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-26 10:56:22 +08:00
parent ba8ddf2b68
commit 1afdb822c3
26 changed files with 8273 additions and 366 deletions
--- a/backend/app/services/direct_extraction_engine.py
+++ b/backend/app/services/direct_extraction_engine.py
@@ -247,9 +247,11 @@ class DirectExtractionEngine:
            element_counter += len(image_elements)

        # Extract vector graphics (charts, diagrams) from drawing commands
+        # Pass table_bboxes to filter out table border drawings before clustering
        if self.enable_image_extraction:
            vector_elements = self._extract_vector_graphics(
-                page, page_num, document_id, element_counter, output_dir
+                page, page_num, document_id, element_counter, output_dir,
+                table_bboxes=table_bboxes
            )
            elements.extend(vector_elements)
            element_counter += len(vector_elements)
@@ -705,40 +707,52 @@ class DirectExtractionEngine:
                y1=bbox_data[3]
            )

-            # Extract column widths from table cells
+            # Extract column widths from table cells by analyzing X boundaries
            column_widths = []
            if hasattr(table, 'cells') and table.cells:
-                # Group cells by column
-                cols_x = {}
+                # Collect all unique X boundaries (both left and right edges)
+                x_boundaries = set()
                for cell in table.cells:
-                    col_idx = None
-                    # Determine column index by x0 position
-                    for idx, x0 in enumerate(sorted(set(c[0] for c in table.cells))):
-                        if abs(cell[0] - x0) < 1.0:  # Within 1pt tolerance
-                            col_idx = idx
-                            break
+                    x_boundaries.add(round(cell[0], 1))  # x0 (left edge)
+                    x_boundaries.add(round(cell[2], 1))  # x1 (right edge)

-                    if col_idx is not None:
-                        if col_idx not in cols_x:
-                            cols_x[col_idx] = {'x0': cell[0], 'x1': cell[2]}
-                        else:
-                            cols_x[col_idx]['x1'] = max(cols_x[col_idx]['x1'], cell[2])
+                # Sort boundaries to get column edges
+                sorted_x = sorted(x_boundaries)

-                # Calculate width for each column
-                for col_idx in sorted(cols_x.keys()):
-                    width = cols_x[col_idx]['x1'] - cols_x[col_idx]['x0']
-                    column_widths.append(width)
+                # Calculate column widths from adjacent boundaries
+                if len(sorted_x) >= 2:
+                    column_widths = [sorted_x[i+1] - sorted_x[i] for i in range(len(sorted_x)-1)]
+                    logger.debug(f"Calculated column widths from {len(sorted_x)} boundaries: {column_widths}")
+
+            # Extract row heights from table cells by analyzing Y boundaries
+            row_heights = []
+            if hasattr(table, 'cells') and table.cells:
+                # Collect all unique Y boundaries (both top and bottom edges)
+                y_boundaries = set()
+                for cell in table.cells:
+                    y_boundaries.add(round(cell[1], 1))  # y0 (top edge)
+                    y_boundaries.add(round(cell[3], 1))  # y1 (bottom edge)
+
+                # Sort boundaries to get row edges
+                sorted_y = sorted(y_boundaries)
+
+                # Calculate row heights from adjacent boundaries
+                if len(sorted_y) >= 2:
+                    row_heights = [sorted_y[i+1] - sorted_y[i] for i in range(len(sorted_y)-1)]
+                    logger.debug(f"Calculated row heights from {len(sorted_y)} boundaries: {row_heights}")

            # Create table cells
+            # Note: Include ALL cells (even empty ones) to preserve table structure
+            # This is critical for correct HTML generation and PDF rendering
            cells = []
            for row_idx, row in enumerate(data):
                for col_idx, cell_text in enumerate(row):
-                    if cell_text:
-                        cells.append(TableCell(
-                            row=row_idx,
-                            col=col_idx,
-                            content=str(cell_text) if cell_text else ""
-                        ))
+                    # Always add cell, even if empty, to maintain table structure
+                    cells.append(TableCell(
+                        row=row_idx,
+                        col=col_idx,
+                        content=str(cell_text) if cell_text else ""
+                    ))

            # Create table data
            table_data = TableData(
@@ -748,8 +762,13 @@ class DirectExtractionEngine:
                headers=data[0] if data else None  # Assume first row is header
            )

-            # Store column widths in metadata
-            metadata = {"column_widths": column_widths} if column_widths else None
+            # Store column widths and row heights in metadata
+            metadata = {}
+            if column_widths:
+                metadata["column_widths"] = column_widths
+            if row_heights:
+                metadata["row_heights"] = row_heights
+            metadata = metadata if metadata else None

            return DocumentElement(
                element_id=f"table_{page_num}_{counter}",
@@ -978,7 +997,9 @@ class DirectExtractionEngine:
                    image_filename = f"{document_id}_p{page_num}_img{img_idx}.png"
                    image_path = output_dir / image_filename
                    pix.save(str(image_path))
-                    image_data["saved_path"] = str(image_path)
+                    # Store relative filename only (consistent with OCR track)
+                    # PDF generator will join with result_dir to get full path
+                    image_data["saved_path"] = image_filename
                    logger.debug(f"Saved image to {image_path}")

                element = DocumentElement(
@@ -1001,12 +1022,272 @@ class DirectExtractionEngine:

        return elements

+    def has_missing_images(self, page: fitz.Page) -> bool:
+        """
+        Report whether a page probably contains images that were not extracted.
+
+        Graphics such as logos can be stored as many small inline image
+        blocks. Those show up as type=1 blocks in the page's text dict but
+        are not listed by page.get_images().
+
+        Args:
+            page: PyMuPDF page object
+
+        Returns:
+            True when get_images() reports nothing and the page holds at
+            least 10 inline image blocks; False otherwise, including when
+            the check itself fails
+        """
+        try:
+            # Regular embedded images exist, so no fallback is required
+            if page.get_images():
+                return False
+
+            # Inline image blocks are reported with type=1 in the text dict
+            page_blocks = page.get_text("dict", sort=True).get("blocks", [])
+            inline_count = len([blk for blk in page_blocks if blk.get("type") == 1])
+
+            # NOTE(review): render_inline_image_regions() processes pages with
+            # as few as 5 inline blocks, while this check requires 10 -
+            # confirm that the difference is intentional
+            if inline_count < 10:
+                return False
+
+            logger.info(f"Detected {inline_count} inline image blocks - may need OCR for image extraction")
+            return True
+
+        except Exception as e:
+            logger.warning(f"Error checking for missing images: {e}")
+            return False
+
+    def check_document_for_missing_images(self, pdf_path: Path) -> List[int]:
+        """
+        Check a PDF document for pages that likely have missing images.
+
+        This opens the PDF and runs has_missing_images() on each page to find
+        inline image blocks that weren't extracted by get_images().
+
+        Args:
+            pdf_path: Path to the PDF file
+
+        Returns:
+            List of page numbers (1-indexed) that have missing images. If the
+            document cannot be opened or read, the error is logged and the
+            pages collected so far (possibly none) are returned.
+        """
+        pages_with_missing_images: List[int] = []
+
+        try:
+            doc = fitz.open(str(pdf_path))
+            try:
+                # Page numbers reported to callers are 1-indexed
+                for page_num, page in enumerate(doc, start=1):
+                    if self.has_missing_images(page):
+                        pages_with_missing_images.append(page_num)
+            finally:
+                # Release the document even if loading a page raises
+                doc.close()
+
+            if pages_with_missing_images:
+                logger.info(f"Document has missing images on pages: {pages_with_missing_images}")
+
+        except Exception as e:
+            logger.error(f"Error checking document for missing images: {e}")
+
+        return pages_with_missing_images
+
+    def render_inline_image_regions(
+        self,
+        pdf_path: Path,
+        unified_doc: 'UnifiedDocument',
+        pages: List[int],
+        output_dir: Optional[Path] = None
+    ) -> int:
+        """
+        Render inline image regions and add them to the unified document.
+
+        This is a fallback when OCR doesn't detect images. It clusters inline
+        image blocks (type=1) and renders them as images.
+
+        Args:
+            pdf_path: Path to the PDF file
+            unified_doc: UnifiedDocument to add images to
+            pages: List of page numbers (1-indexed) to process
+            output_dir: Directory to save rendered images
+
+        Returns:
+            Number of images added. Errors are logged rather than raised, so
+            the count covers whatever was added before a failure.
+        """
+        images_added = 0
+
+        try:
+            doc = fitz.open(str(pdf_path))
+            try:
+                for page_num in pages:
+                    # Ignore page numbers that fall outside the document
+                    if page_num < 1 or page_num > len(doc):
+                        continue
+
+                    images_added += self._render_inline_regions_on_page(
+                        doc[page_num - 1],  # 0-indexed
+                        page_num,
+                        unified_doc,
+                        output_dir
+                    )
+            finally:
+                # Always release the PDF handle, even when a page fails
+                doc.close()
+
+            if images_added > 0:
+                current_images = unified_doc.metadata.total_images or 0
+                unified_doc.metadata.total_images = current_images + images_added
+                logger.info(f"Added {images_added} inline image regions to document")
+
+        except Exception as e:
+            logger.error(f"Error rendering inline image regions: {e}")
+
+        return images_added
+
+    def _render_inline_regions_on_page(
+        self,
+        page: fitz.Page,
+        page_num: int,
+        unified_doc: 'UnifiedDocument',
+        output_dir: Optional[Path]
+    ) -> int:
+        """Render the inline image clusters of one page into unified_doc and return how many were added."""
+        page_rect = page.rect
+
+        # Get inline image blocks (type=1 in the text dict)
+        blocks = page.get_text("dict", sort=True).get("blocks", [])
+        image_blocks = [
+            fitz.Rect(block["bbox"])
+            for block in blocks
+            if block.get("type") == 1 and block.get("bbox")
+        ]
+
+        if len(image_blocks) < 5:  # Reduced from 10
+            logger.debug(f"Page {page_num}: Only {len(image_blocks)} inline image blocks, skipping")
+            return 0
+
+        logger.info(f"Page {page_num}: Found {len(image_blocks)} inline image blocks")
+
+        # Cluster nearby image blocks
+        regions = self._cluster_nearby_rects(image_blocks, tolerance=5.0)
+        logger.info(f"Page {page_num}: Clustered into {len(regions)} regions")
+
+        # Find the corresponding page in unified_doc
+        target_page = next((p for p in unified_doc.pages if p.page_number == page_num), None)
+        if target_page is None:
+            return 0
+
+        # Regions that start below the top 40% of the page are treated as
+        # table area and skipped; the limit is the same for every region
+        page_40_pct = page_rect.height * 0.4
+        added = 0
+
+        for region_idx, region_rect in enumerate(regions):
+            logger.info(f"Page {page_num} region {region_idx}: {region_rect} (w={region_rect.width:.1f}, h={region_rect.height:.1f})")
+
+            # Skip very small regions
+            if region_rect.width < 30 or region_rect.height < 30:
+                logger.info("  -> Skipped: too small (min 30x30)")
+                continue
+
+            # Skip regions that are primarily in the table area (below top 40%)
+            # But allow regions that START in the top portion
+            if region_rect.y0 > page_40_pct:
+                logger.info(f"  -> Skipped: y0={region_rect.y0:.1f} > 40% of page ({page_40_pct:.1f})")
+                continue
+
+            logger.info(f"Rendering inline image region {region_idx} on page {page_num}: {region_rect}")
+
+            try:
+                # Add small padding, but never reach outside the page
+                clip_rect = region_rect + (-2, -2, 2, 2)
+                clip_rect.intersect(page_rect)
+
+                # Render at 2x resolution
+                mat = fitz.Matrix(2, 2)
+                pix = page.get_pixmap(clip=clip_rect, matrix=mat, alpha=False)
+
+                # Create bounding box
+                bbox = BoundingBox(
+                    x0=clip_rect.x0,
+                    y0=clip_rect.y0,
+                    x1=clip_rect.x1,
+                    y1=clip_rect.y1
+                )
+
+                image_data = {
+                    "width": pix.width,
+                    "height": pix.height,
+                    "colorspace": "rgb",
+                    "type": "inline_region"
+                }
+
+                # Save image if output directory provided
+                if output_dir:
+                    output_dir.mkdir(parents=True, exist_ok=True)
+                    doc_id = unified_doc.document_id or "unknown"
+                    image_filename = f"{doc_id}_p{page_num}_logo{region_idx}.png"
+                    image_path = output_dir / image_filename
+                    pix.save(str(image_path))
+                    # Only the file name is stored, not the full path
+                    image_data["saved_path"] = image_filename
+                    logger.info(f"Saved inline image region to {image_path}")
+
+                element = DocumentElement(
+                    element_id=f"logo_{page_num}_{region_idx}",
+                    type=ElementType.LOGO,
+                    content=image_data,
+                    bbox=bbox,
+                    confidence=0.9,
+                    metadata={
+                        "region_type": "inline_image_blocks",
+                        # Page-wide number of inline blocks, not just this region
+                        "block_count": len(image_blocks)
+                    }
+                )
+                target_page.elements.append(element)
+                added += 1
+
+                pix = None  # Free memory
+
+            except Exception as e:
+                # One failed region must not prevent the others from rendering
+                logger.error(f"Error rendering inline image region {region_idx}: {e}")
+
+        return added
+
+    def _cluster_nearby_rects(self, rects: List[fitz.Rect], tolerance: float = 5.0) -> List[fitz.Rect]:
+        """
+        Merge rectangles lying within `tolerance` of each other into regions.
+
+        Args:
+            rects: Rectangles to cluster
+            tolerance: A region is grown by this much on every side before
+                it is tested against another rectangle
+
+        Returns:
+            List of union rectangles, one per cluster (empty for empty input)
+        """
+        if not rects:
+            return []
+
+        margin = (-tolerance, -tolerance, tolerance, tolerance)
+
+        # First pass: visit the rects ordered by (y0, x0) and fold each one
+        # into the first region it touches, or start a new region with it
+        regions = []
+        for candidate in sorted(rects, key=lambda r: (r.y0, r.x0)):
+            for idx, region in enumerate(regions):
+                if (region + margin).intersects(candidate):
+                    regions[idx] = region | candidate
+                    break
+            else:
+                regions.append(candidate)
+
+        # Second pass: regions that grew may now touch each other, so keep
+        # sweeping until a full sweep absorbs nothing
+        while True:
+            absorbed = set()
+            survivors = []
+
+            for i, seed in enumerate(regions):
+                if i in absorbed:
+                    continue
+                grown = seed
+                for j in range(i + 1, len(regions)):
+                    if j in absorbed:
+                        continue
+                    if (grown + margin).intersects(regions[j]):
+                        grown = grown | regions[j]
+                        absorbed.add(j)
+                survivors.append(grown)
+
+            regions = survivors
+            if not absorbed:
+                return regions
+
    def _extract_vector_graphics(self,
                                 page: fitz.Page,
                                 page_num: int,
                                 document_id: str,
                                 counter: int,
-                                 output_dir: Optional[Path]) -> List[DocumentElement]:
+                                 output_dir: Optional[Path],
+                                 table_bboxes: Optional[List[BoundingBox]] = None) -> List[DocumentElement]:
        """
        Extract vector graphics (charts, diagrams) from page.

@@ -1020,6 +1301,7 @@ class DirectExtractionEngine:
            document_id: Unique document identifier
            counter: Starting counter for element IDs
            output_dir: Directory to save rendered graphics
+            table_bboxes: List of table bounding boxes to exclude table border drawings

        Returns:
            List of DocumentElement objects representing vector graphics
@@ -1034,16 +1316,25 @@ class DirectExtractionEngine:

            logger.debug(f"Page {page_num} contains {len(drawings)} vector drawing commands")

+            # Filter out drawings that are likely table borders
+            # Table borders are typically thin rectangular lines within table regions
+            non_table_drawings = self._filter_table_border_drawings(drawings, table_bboxes)
+            logger.debug(f"After filtering table borders: {len(non_table_drawings)} drawings remain")
+
+            if not non_table_drawings:
+                logger.debug("All drawings appear to be table borders, no vector graphics to extract")
+                return elements
+
            # Cluster drawings into groups (charts, diagrams, etc.)
            try:
-                # PyMuPDF's cluster_drawings() groups nearby drawings automatically
-                drawing_clusters = page.cluster_drawings()
+                # Use custom clustering that only considers non-table drawings
+                drawing_clusters = self._cluster_non_table_drawings(page, non_table_drawings)
                logger.debug(f"Clustered into {len(drawing_clusters)} groups")
            except (AttributeError, TypeError) as e:
                # cluster_drawings not available or has different signature
                # Fallback: try to identify charts by analyzing drawing density
-                logger.warning(f"cluster_drawings() failed ({e}), using fallback method")
-                drawing_clusters = self._cluster_drawings_fallback(page, drawings)
+                logger.warning(f"Custom clustering failed ({e}), using fallback method")
+                drawing_clusters = self._cluster_drawings_fallback(page, non_table_drawings)

            for cluster_idx, bbox in enumerate(drawing_clusters):
                # Ignore small regions (likely noise or separator lines)
@@ -1148,6 +1439,124 @@ class DirectExtractionEngine:

        return filtered_clusters

+    def _filter_table_border_drawings(self, drawings: list, table_bboxes: Optional[List[BoundingBox]]) -> list:
+        """
+        Filter out drawings that are likely table borders.
+
+        Table borders are typically:
+        - Thin rectangular lines (height or width < 5pt)
+        - Located within or on the edge of table bounding boxes
+
+        A drawing is removed only when it is thin AND lies fully, or by more
+        than 80% of its area, inside a table box grown by 5pt on every side.
+        Drawings without a 'rect' cannot be tested and are kept.
+
+        Args:
+            drawings: List of PyMuPDF drawing objects
+            table_bboxes: List of table bounding boxes
+
+        Returns:
+            List of drawings that are NOT table borders (likely logos, charts, etc.)
+        """
+        if not table_bboxes:
+            return drawings
+
+        # Expand table rects slightly to include border lines on edges.
+        # Built once, since they do not depend on the drawing being tested.
+        table_regions = [
+            fitz.Rect(table_bbox.x0, table_bbox.y0, table_bbox.x1, table_bbox.y1) + (-5, -5, 5, 5)
+            for table_bbox in table_bboxes
+        ]
+
+        non_table_drawings = []
+        table_border_count = 0
+
+        for drawing in drawings:
+            rect = drawing.get('rect')
+            if not rect:
+                # Nothing to measure, so it cannot be identified as a border.
+                # Keeping it matches the no-table path, which returns everything.
+                non_table_drawings.append(drawing)
+                continue
+
+            draw_rect = fitz.Rect(rect)
+
+            # Only thin lines are border candidates; logos and charts are kept
+            is_thin_line = draw_rect.width < 5 or draw_rect.height < 5
+            if not is_thin_line:
+                non_table_drawings.append(drawing)
+                continue
+
+            if any(self._rect_mostly_inside(draw_rect, region) for region in table_regions):
+                table_border_count += 1
+            else:
+                non_table_drawings.append(drawing)
+
+        if table_border_count > 0:
+            logger.debug(f"Filtered out {table_border_count} table border drawings")
+
+        return non_table_drawings
+
+    @staticmethod
+    def _rect_mostly_inside(draw_rect: fitz.Rect, region: fitz.Rect, min_ratio: float = 0.8) -> bool:
+        """Return True if draw_rect lies fully, or by more than min_ratio of its area, inside region."""
+        # Plain coordinate comparison so that zero-width / zero-height lines,
+        # which have no area and therefore no overlap ratio, are recognised too
+        if (region.x0 <= draw_rect.x0 and draw_rect.x1 <= region.x1
+                and region.y0 <= draw_rect.y0 and draw_rect.y1 <= region.y1):
+            return True
+
+        area = draw_rect.get_area()
+        if area <= 0 or not region.intersects(draw_rect):
+            return False
+
+        intersection = draw_rect & region
+        if intersection.is_empty:
+            return False
+
+        return intersection.get_area() / area > min_ratio
+
+    def _cluster_non_table_drawings(self, page: fitz.Page, drawings: list) -> list:
+        """
+        Cluster non-table drawings into groups.
+
+        This method clusters drawings that have been pre-filtered to exclude table borders.
+        It delegates to _cluster_nearby_rects() with a 10pt tolerance, so clusters
+        that come to touch each other only after growing are merged as well,
+        instead of being returned as separate overlapping regions.
+
+        Args:
+            page: PyMuPDF page object (not used by this method)
+            drawings: Pre-filtered list of drawings (excluding table borders)
+
+        Returns:
+            List of fitz.Rect representing clustered drawing regions that are
+            at least 30x30
+        """
+        if not drawings:
+            return []
+
+        # Collect all drawing bounding boxes; drawings without a rect are ignored
+        rects = (drawing.get('rect') for drawing in drawings)
+        bboxes = [fitz.Rect(rect) for rect in rects if rect]
+
+        if not bboxes:
+            return []
+
+        # More conservative clustering with smaller tolerance
+        # This prevents grouping distant graphics together
+        clusters = self._cluster_nearby_rects(bboxes, tolerance=10)
+
+        # Filter out very small clusters (noise)
+        # Keep minimum 30x30 for logos (smaller than default 50x50)
+        filtered_clusters = [c for c in clusters if c.width >= 30 and c.height >= 30]
+
+        logger.debug(f"Non-table clustering: {len(bboxes)} drawings -> {len(clusters)} clusters -> {len(filtered_clusters)} filtered")
+
+        return filtered_clusters
+
    def _deduplicate_table_chart_overlap(self, elements: List[DocumentElement]) -> List[DocumentElement]:
        """
        Intelligently resolve TABLE-CHART overlaps based on table structure completeness.