feat: add black/white covering image detection
Implements detection of embedded images used for redaction/covering:
- Analyzes embedded images for mostly black (avg RGB <= 30) or white (>= 245)
- Uses PIL to efficiently sample image colors
- Gets image position on page via get_image_rects()
- Integrates with existing preprocessing pipeline
- Adds covering_images to page metadata and quality report

Detection results:
- demo_docs/edit3.pdf: 10 black covering images detected (7 on P1, 3 on P2)

Quality report now includes:
- total_covering_images count
- Per-page covering_images details with bbox, color_type, size

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
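The heart of the change is the average-color test. Below is a minimal standalone sketch of that classification step, assuming the same PIL downsampling and the 30/245 thresholds described above; the helper name `classify_covering_image` is illustrative, not part of the committed API:

import io
from typing import Optional

from PIL import Image


def classify_covering_image(img_bytes: bytes) -> Optional[str]:
    """Sketch: return 'image_black', 'image_white', or None.

    Mirrors the commit's thresholds (avg RGB <= 30 for black,
    >= 245 for white); the function name itself is hypothetical.
    """
    img = Image.open(io.BytesIO(img_bytes))
    if img.mode != 'RGB':
        img = img.convert('RGB')

    # Downsample to at most 50x50 so only a bounded sample is averaged
    img_small = img.resize((min(50, img.width), min(50, img.height)))
    pixels = list(img_small.getdata())
    if not pixels:
        return None

    avg_r = sum(p[0] for p in pixels) / len(pixels)
    avg_g = sum(p[1] for p in pixels) / len(pixels)
    avg_b = sum(p[2] for p in pixels) / len(pixels)

    if avg_r <= 30 and avg_g <= 30 and avg_b <= 30:
        return 'image_black'
    if avg_r >= 245 and avg_g >= 245 and avg_b >= 245:
        return 'image_white'
    return None

In the committed code this logic runs inline inside `_detect_covering_images`, once per image xref, on the bytes returned by `doc.extract_image(xref)`.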
@@ -120,7 +120,8 @@ class DirectExtractionEngine:
                 doc[page_num],
                 page_num + 1,
                 document_id,
-                output_dir
+                output_dir,
+                doc  # Pass doc for covering image detection
             )
             pages.append(page)
 
@@ -211,7 +212,8 @@ class DirectExtractionEngine:
                       page: fitz.Page,
                       page_num: int,
                       document_id: str,
-                      output_dir: Optional[Path]) -> Page:
+                      output_dir: Optional[Path],
+                      doc: fitz.Document = None) -> Page:
         """Extract content from a single page with preprocessing pipeline."""
         elements = []
         element_counter = 0
@@ -219,8 +221,8 @@ class DirectExtractionEngine:
         # =====================================================================
         # PREPROCESSING PIPELINE
         # =====================================================================
-        # Step 1: Run preprocessing (sanitization, white-out detection)
-        preprocess_result = self._preprocess_page(page, page_num)
+        # Step 1: Run preprocessing (sanitization, white-out detection, covering images)
+        preprocess_result = self._preprocess_page(page, page_num, doc)
         covered_bboxes = preprocess_result.get('covered_word_bboxes', [])
 
         # Get page-level metadata (for final Page metadata)
@@ -337,13 +339,16 @@ class DirectExtractionEngine:
         elements = self._filter_page_numbers(elements, dimensions.height)
 
         # Step 3.2-3.3: Garble detection and OCR fallback recommendation
+        covering_images = preprocess_result.get('covering_images', [])
         page_metadata = {
             "has_drawings": len(drawings) > 0,
             "drawing_count": len(drawings),
             "link_count": len(links),
             "preprocessing": {
                 "sanitized": preprocess_result.get('sanitized', False),
-                "whiteout_regions_found": len(covered_bboxes)
+                "whiteout_regions_found": len(covered_bboxes) - len(covering_images),  # Vector rects only
+                "covering_images_found": len(covering_images),
+                "covering_images": covering_images  # Full details for debugging
             }
         }
 
@@ -1856,27 +1861,31 @@ class DirectExtractionEngine:
     # PDF Preprocessing Pipeline Methods
     # =========================================================================
 
-    def _preprocess_page(self, page: fitz.Page, page_num: int) -> Dict[str, Any]:
+    def _preprocess_page(self, page: fitz.Page, page_num: int, doc: fitz.Document = None) -> Dict[str, Any]:
         """
         Run preprocessing pipeline on a page before extraction.
 
         Pipeline steps:
         1. Content sanitization (clean_contents)
         2. Hidden layer detection (OCG)
-        3. White-out detection
+        3. White-out/black-out detection (vector rectangles)
+        4. Covering image detection (embedded black/white images)
 
         Args:
             page: PyMuPDF page object
            page_num: Page number (1-indexed)
+            doc: PyMuPDF document object (needed for image analysis)
 
         Returns:
             Dict with preprocessing results:
-            - covered_word_bboxes: List of bboxes for text covered by white rectangles
+            - covered_word_bboxes: List of bboxes for text covered by rectangles/images
+            - covering_images: List of covering image info
             - hidden_layers: List of hidden OCG layer names
             - sanitized: Whether content was sanitized
         """
         result = {
             'covered_word_bboxes': [],
+            'covering_images': [],
             'hidden_layers': [],
             'sanitized': False
         }
@@ -1890,7 +1899,7 @@ class DirectExtractionEngine:
         except Exception as e:
             logger.warning(f"Page {page_num}: Content sanitization failed: {e}")
 
-        # Step 1.3: White-out detection
+        # Step 1.3: White-out/black-out detection (vector rectangles)
         if self.enable_whiteout_detection:
             covered = self._detect_whiteout_covered_text(page, page_num)
             result['covered_word_bboxes'] = [fitz.Rect(w['bbox']) for w in covered]
@@ -1903,6 +1912,19 @@ class DirectExtractionEngine:
             logger.info(f"Page {page_num}: Detected {len(covered)} covered text regions "
                         f"(white: {white_covered}, black/redaction: {black_covered}, other: {other_covered})")
 
+        # Step 1.4: Covering image detection (embedded black/white images)
+        if self.enable_whiteout_detection and doc is not None:
+            covering_images = self._detect_covering_images(page, doc, page_num)
+            result['covering_images'] = covering_images
+            # Add covering image bboxes to the covered_word_bboxes list
+            for img in covering_images:
+                result['covered_word_bboxes'].append(fitz.Rect(img['bbox']))
+            if covering_images:
+                black_imgs = sum(1 for c in covering_images if c['color_type'] == 'image_black')
+                white_imgs = sum(1 for c in covering_images if c['color_type'] == 'image_white')
+                logger.info(f"Page {page_num}: Detected {len(covering_images)} covering images "
+                            f"(black: {black_imgs}, white: {white_imgs})")
+
         return result
 
     def _detect_whiteout_covered_text(self, page: fitz.Page, page_num: int) -> List[Dict]:
@@ -1989,6 +2011,95 @@ class DirectExtractionEngine:
 
         return covered_words
 
+    def _detect_covering_images(self, page: fitz.Page, doc: fitz.Document, page_num: int) -> List[Dict]:
+        """
+        Detect embedded images that are mostly black/white (likely covering/redaction).
+
+        Args:
+            page: PyMuPDF page object
+            doc: PyMuPDF document object (needed for image extraction)
+            page_num: Page number for logging
+
+        Returns:
+            List of dicts with covering image info: {'bbox', 'color_type', 'avg_color'}
+        """
+        covering_images = []
+
+        try:
+            # Get all images on the page with their positions
+            image_list = page.get_images(full=True)
+
+            if not image_list:
+                return covering_images
+
+            for img_info in image_list:
+                xref = img_info[0]
+                width = img_info[2]
+                height = img_info[3]
+
+                # Skip very small images (icons, bullets)
+                if width < 20 or height < 10:
+                    continue
+
+                try:
+                    # Extract image data
+                    base_image = doc.extract_image(xref)
+                    img_bytes = base_image.get('image')
+                    if not img_bytes:
+                        continue
+
+                    # Analyze image color using PIL
+                    from PIL import Image
+                    import io
+
+                    img = Image.open(io.BytesIO(img_bytes))
+                    if img.mode != 'RGB':
+                        img = img.convert('RGB')
+
+                    # Sample pixels for efficiency (don't analyze every pixel)
+                    img_small = img.resize((min(50, img.width), min(50, img.height)))
+                    pixels = list(img_small.getdata())
+
+                    if not pixels:
+                        continue
+
+                    avg_r = sum(p[0] for p in pixels) / len(pixels)
+                    avg_g = sum(p[1] for p in pixels) / len(pixels)
+                    avg_b = sum(p[2] for p in pixels) / len(pixels)
+
+                    # Determine if image is mostly black or white
+                    color_type = None
+                    if avg_r <= 30 and avg_g <= 30 and avg_b <= 30:
+                        color_type = 'image_black'
+                    elif avg_r >= 245 and avg_g >= 245 and avg_b >= 245:
+                        color_type = 'image_white'
+
+                    if color_type:
+                        # Get image position on page
+                        # We need to find the image rectangle on the page
+                        for img_rect in page.get_image_rects(xref):
+                            covering_images.append({
+                                'bbox': tuple(img_rect),
+                                'color_type': color_type,
+                                'avg_color': (avg_r, avg_g, avg_b),
+                                'size': (width, height)
+                            })
+
+                except Exception as e:
+                    logger.debug(f"Page {page_num}: Failed to analyze image xref={xref}: {e}")
+                    continue
+
+            if covering_images:
+                black_count = sum(1 for c in covering_images if c['color_type'] == 'image_black')
+                white_count = sum(1 for c in covering_images if c['color_type'] == 'image_white')
+                logger.debug(f"Page {page_num}: Found {len(covering_images)} covering images "
+                             f"(black: {black_count}, white: {white_count})")
+
+        except Exception as e:
+            logger.warning(f"Page {page_num}: Failed to detect covering images: {e}")
+
+        return covering_images
+
     def _get_hidden_ocg_layers(self, doc: fitz.Document) -> List[str]:
         """
         Get list of hidden Optional Content Group (OCG) layer names.
@@ -2410,7 +2521,8 @@ class DirectExtractionEngine:
             'needs_ocr_fallback': False,
             'preprocessing_stats': {
                 'pages_sanitized': 0,
-                'total_whiteout_regions': 0
+                'total_whiteout_regions': 0,
+                'total_covering_images': 0
             }
         }
 
@@ -2437,6 +2549,7 @@ class DirectExtractionEngine:
             if preprocessing.get('sanitized', False):
                 report['preprocessing_stats']['pages_sanitized'] += 1
             report['preprocessing_stats']['total_whiteout_regions'] += preprocessing.get('whiteout_regions_found', 0)
+            report['preprocessing_stats']['total_covering_images'] += preprocessing.get('covering_images_found', 0)
 
         # Calculate average garble rate
         if pages_with_garble > 0: