From 9437387ef1eb40319bf32b2dd92dc41d9d8e20df Mon Sep 17 00:00:00 2001 From: egg Date: Thu, 4 Dec 2025 07:48:38 +0800 Subject: [PATCH] fix: add IoU text coverage check and page boundary validation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Vector rectangles: - Add page boundary check (skip out-of-bounds rectangles) - Clip rectangles to page boundaries Covering images: - Add page boundary check (skip out-of-bounds images) - Add IoU-based text coverage verification - Only report images that actually cover text (>= 50% word coverage) - Add covered_text_count to detection results This reduces false positives from black logos or decorative images that don't actually cover any text content. Test results (edit3.pdf): - Before: 10 covering images detected - After: 6 covering images detected (4 filtered - no text coverage) 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- .../app/services/direct_extraction_engine.py | 65 +++++++++++++++---- 1 file changed, 54 insertions(+), 11 deletions(-) diff --git a/backend/app/services/direct_extraction_engine.py b/backend/app/services/direct_extraction_engine.py index 4f29b06..9ea6141 100644 --- a/backend/app/services/direct_extraction_engine.py +++ b/backend/app/services/direct_extraction_engine.py @@ -1941,6 +1941,7 @@ class DirectExtractionEngine: List of dicts with covered text info: {'text', 'bbox', 'coverage', 'color_type'} """ covered_words = [] + page_rect = page.rect # Page boundaries # Get all drawings and find solid-filled rectangles drawings = page.get_drawings() @@ -1960,6 +1961,13 @@ class DirectExtractionEngine: if fitz_rect.width < 5 or fitz_rect.height < 5: continue + # Skip rectangles completely outside page boundaries + if not fitz_rect.intersects(page_rect): + continue + + # Clip rectangle to page boundaries + fitz_rect = fitz_rect & page_rect + # Detect white rectangles (white-out / correction tape) # Must be pure white (>= 0.98) to avoid false positives from light backgrounds if r >= 0.98 and g >= 0.98 and b >= 0.98: @@ -1975,7 +1983,7 @@ class DirectExtractionEngine: # Log detected covering rectangles by type white_count = sum(1 for _, t in covering_rects if t == 'white') black_count = sum(1 for _, t in covering_rects if t == 'black') - logger.debug(f"Page {page_num}: Found {len(covering_rects)} covering rectangles " + logger.debug(f"Page {page_num}: Found {len(covering_rects)} potential covering rectangles " f"(white: {white_count}, black/redaction: {black_count})") # Get all text words with bounding boxes @@ -2013,7 +2021,12 @@ class DirectExtractionEngine: def _detect_covering_images(self, page: fitz.Page, doc: fitz.Document, page_num: int) -> List[Dict]: """ - Detect embedded images that are mostly black/white (likely covering/redaction). + Detect embedded images that are mostly black/white AND actually cover text. + + Only reports images that: + 1. Are mostly solid black or white + 2. Are within page boundaries + 3. Actually overlap with text content (IoU check) Args: page: PyMuPDF page object @@ -2021,9 +2034,10 @@ class DirectExtractionEngine: page_num: Page number for logging Returns: - List of dicts with covering image info: {'bbox', 'color_type', 'avg_color'} + List of dicts with covering image info: {'bbox', 'color_type', 'avg_color', 'covered_text_count'} """ covering_images = [] + page_rect = page.rect # Page boundaries try: # Get all images on the page with their positions @@ -2032,6 +2046,9 @@ class DirectExtractionEngine: if not image_list: return covering_images + # Get all text words for coverage check + words = page.get_text("words") # (x0, y0, x1, y1, word, block_no, line_no, word_no) + for img_info in image_list: xref = img_info[0] width = img_info[2] @@ -2076,14 +2093,39 @@ class DirectExtractionEngine: if color_type: # Get image position on page - # We need to find the image rectangle on the page for img_rect in page.get_image_rects(xref): - covering_images.append({ - 'bbox': tuple(img_rect), - 'color_type': color_type, - 'avg_color': (avg_r, avg_g, avg_b), - 'size': (width, height) - }) + # Skip images completely outside page boundaries + if not img_rect.intersects(page_rect): + continue + + # Clip image rect to page boundaries + clipped_rect = img_rect & page_rect + + # Check if image actually covers any text (IoU check) + covered_text_count = 0 + for word_info in words: + word_rect = fitz.Rect(word_info[:4]) + word_area = word_rect.width * word_rect.height + if word_area <= 0: + continue + + intersection = word_rect & clipped_rect + if not intersection.is_empty: + intersection_area = intersection.width * intersection.height + coverage_ratio = intersection_area / word_area + # Count as covered if >= 50% of word is under the image + if coverage_ratio >= 0.5: + covered_text_count += 1 + + # Only report if image actually covers text + if covered_text_count > 0: + covering_images.append({ + 'bbox': tuple(clipped_rect), + 'color_type': color_type, + 'avg_color': (avg_r, avg_g, avg_b), + 'size': (width, height), + 'covered_text_count': covered_text_count + }) except Exception as e: logger.debug(f"Page {page_num}: Failed to analyze image xref={xref}: {e}") @@ -2092,8 +2134,9 @@ class DirectExtractionEngine: if covering_images: black_count = sum(1 for c in covering_images if c['color_type'] == 'image_black') white_count = sum(1 for c in covering_images if c['color_type'] == 'image_white') + total_covered = sum(c.get('covered_text_count', 0) for c in covering_images) logger.debug(f"Page {page_num}: Found {len(covering_images)} covering images " - f"(black: {black_count}, white: {white_count})") + f"(black: {black_count}, white: {white_count}, covering {total_covered} text regions)") except Exception as e: logger.warning(f"Page {page_num}: Failed to detect covering images: {e}")