From 9437387ef1eb40319bf32b2dd92dc41d9d8e20df Mon Sep 17 00:00:00 2001
From: egg <lin4637lin4637@gmail.com>
Date: Thu, 4 Dec 2025 07:48:38 +0800
Subject: [PATCH] fix: add text coverage check and page boundary validation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Vector rectangles:
- Add page boundary check (skip rectangles entirely outside the page)
- Clip partially out-of-bounds rectangles to the page boundaries

Covering images:
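- Add page boundary check (skip images entirely outside the page)
- Clip image rectangles to the page boundaries (reported bbox is clipped)
- Add text coverage verification (intersection area / word area, per word)
- Only report images that cover at least one word by >= 50% of its area
- Add covered_text_count to detection results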

This reduces false positives from black logos or decorative images
that don't actually cover any text content.

Test results (edit3.pdf):
- Before: 10 covering images detected
- After: 6 covering images detected (4 filtered - no text coverage)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
---
 .../app/services/direct_extraction_engine.py  | 65 +++++++++++++++----
 1 file changed, 54 insertions(+), 11 deletions(-)

diff --git a/backend/app/services/direct_extraction_engine.py b/backend/app/services/direct_extraction_engine.py
index 4f29b06..9ea6141 100644
--- a/backend/app/services/direct_extraction_engine.py
+++ b/backend/app/services/direct_extraction_engine.py
@@ -1941,6 +1941,7 @@ class DirectExtractionEngine:
             List of dicts with covered text info: {'text', 'bbox', 'coverage', 'color_type'}
         """
         covered_words = []
+        page_rect = page.rect  # Page boundaries
 
         # Get all drawings and find solid-filled rectangles
         drawings = page.get_drawings()
@@ -1960,6 +1961,13 @@ class DirectExtractionEngine:
                 if fitz_rect.width < 5 or fitz_rect.height < 5:
                     continue
 
+                # Skip rectangles completely outside page boundaries
+                if not fitz_rect.intersects(page_rect):
+                    continue
+
+                # Clip rectangle to page boundaries
+                fitz_rect = fitz_rect & page_rect
+
                 # Detect white rectangles (white-out / correction tape)
                 # Must be pure white (>= 0.98) to avoid false positives from light backgrounds
                 if r >= 0.98 and g >= 0.98 and b >= 0.98:
@@ -1975,7 +1983,7 @@ class DirectExtractionEngine:
         # Log detected covering rectangles by type
         white_count = sum(1 for _, t in covering_rects if t == 'white')
         black_count = sum(1 for _, t in covering_rects if t == 'black')
-        logger.debug(f"Page {page_num}: Found {len(covering_rects)} covering rectangles "
+        logger.debug(f"Page {page_num}: Found {len(covering_rects)} potential covering rectangles "
                     f"(white: {white_count}, black/redaction: {black_count})")
 
         # Get all text words with bounding boxes
@@ -2013,7 +2021,12 @@ class DirectExtractionEngine:
 
     def _detect_covering_images(self, page: fitz.Page, doc: fitz.Document, page_num: int) -> List[Dict]:
         """
-        Detect embedded images that are mostly black/white (likely covering/redaction).
+        Detect embedded images that are mostly black/white AND actually cover text.
+
+        Only reports images that:
+        1. Are mostly solid black or white
+        2. Are within page boundaries
+        3. Actually overlap with text content (IoU check)
 
         Args:
             page: PyMuPDF page object
@@ -2021,9 +2034,10 @@ class DirectExtractionEngine:
             page_num: Page number for logging
 
         Returns:
-            List of dicts with covering image info: {'bbox', 'color_type', 'avg_color'}
+            List of dicts with covering image info: {'bbox', 'color_type', 'avg_color', 'covered_text_count'}
         """
         covering_images = []
+        page_rect = page.rect  # Page boundaries
 
         try:
             # Get all images on the page with their positions
@@ -2032,6 +2046,9 @@ class DirectExtractionEngine:
             if not image_list:
                 return covering_images
 
+            # Get all text words for coverage check
+            words = page.get_text("words")  # (x0, y0, x1, y1, word, block_no, line_no, word_no)
+
             for img_info in image_list:
                 xref = img_info[0]
                 width = img_info[2]
@@ -2076,14 +2093,39 @@ class DirectExtractionEngine:
 
                     if color_type:
                         # Get image position on page
-                        # We need to find the image rectangle on the page
                         for img_rect in page.get_image_rects(xref):
-                            covering_images.append({
-                                'bbox': tuple(img_rect),
-                                'color_type': color_type,
-                                'avg_color': (avg_r, avg_g, avg_b),
-                                'size': (width, height)
-                            })
+                            # Skip images completely outside page boundaries
+                            if not img_rect.intersects(page_rect):
+                                continue
+
+                            # Clip image rect to page boundaries
+                            clipped_rect = img_rect & page_rect
+
+                            # Check if image actually covers any text (IoU check)
+                            covered_text_count = 0
+                            for word_info in words:
+                                word_rect = fitz.Rect(word_info[:4])
+                                word_area = word_rect.width * word_rect.height
+                                if word_area <= 0:
+                                    continue
+
+                                intersection = word_rect & clipped_rect
+                                if not intersection.is_empty:
+                                    intersection_area = intersection.width * intersection.height
+                                    coverage_ratio = intersection_area / word_area
+                                    # Count as covered if >= 50% of word is under the image
+                                    if coverage_ratio >= 0.5:
+                                        covered_text_count += 1
+
+                            # Only report if image actually covers text
+                            if covered_text_count > 0:
+                                covering_images.append({
+                                    'bbox': tuple(clipped_rect),
+                                    'color_type': color_type,
+                                    'avg_color': (avg_r, avg_g, avg_b),
+                                    'size': (width, height),
+                                    'covered_text_count': covered_text_count
+                                })
 
                 except Exception as e:
                     logger.debug(f"Page {page_num}: Failed to analyze image xref={xref}: {e}")
@@ -2092,8 +2134,9 @@ class DirectExtractionEngine:
             if covering_images:
                 black_count = sum(1 for c in covering_images if c['color_type'] == 'image_black')
                 white_count = sum(1 for c in covering_images if c['color_type'] == 'image_white')
+                total_covered = sum(c.get('covered_text_count', 0) for c in covering_images)
                 logger.debug(f"Page {page_num}: Found {len(covering_images)} covering images "
-                            f"(black: {black_count}, white: {white_count})")
+                            f"(black: {black_count}, white: {white_count}, covering {total_covered} text regions)")
 
         except Exception as e:
             logger.warning(f"Page {page_num}: Failed to detect covering images: {e}")