fix: add text coverage check (word-overlap ratio, labelled "IoU" in the code) and page boundary validation
Vector rectangles:
- Add page boundary check (skip out-of-bounds rectangles)
- Clip rectangles to page boundaries

Covering images:
- Add page boundary check (skip out-of-bounds images)
- Add overlap-based text coverage verification (labelled "IoU" in the code; the ratio actually computed is intersection area / word area, not intersection over union)
- Only report images that actually cover text (at least one word with >= 50% of its area under the image)
- Add covered_text_count to detection results

This reduces false positives from black logos or decorative images that don't actually cover any text content.

Test results (edit3.pdf):
- Before: 10 covering images detected
- After: 6 covering images detected (4 filtered - no text coverage)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -1941,6 +1941,7 @@ class DirectExtractionEngine:
|
|||||||
List of dicts with covered text info: {'text', 'bbox', 'coverage', 'color_type'}
|
List of dicts with covered text info: {'text', 'bbox', 'coverage', 'color_type'}
|
||||||
"""
|
"""
|
||||||
covered_words = []
|
covered_words = []
|
||||||
|
page_rect = page.rect # Page boundaries
|
||||||
|
|
||||||
# Get all drawings and find solid-filled rectangles
|
# Get all drawings and find solid-filled rectangles
|
||||||
drawings = page.get_drawings()
|
drawings = page.get_drawings()
|
||||||
@@ -1960,6 +1961,13 @@ class DirectExtractionEngine:
|
|||||||
if fitz_rect.width < 5 or fitz_rect.height < 5:
|
if fitz_rect.width < 5 or fitz_rect.height < 5:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
# Skip rectangles completely outside page boundaries
|
||||||
|
if not fitz_rect.intersects(page_rect):
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Clip rectangle to page boundaries
|
||||||
|
fitz_rect = fitz_rect & page_rect
|
||||||
|
|
||||||
# Detect white rectangles (white-out / correction tape)
|
# Detect white rectangles (white-out / correction tape)
|
||||||
# Must be pure white (>= 0.98) to avoid false positives from light backgrounds
|
# Must be pure white (>= 0.98) to avoid false positives from light backgrounds
|
||||||
if r >= 0.98 and g >= 0.98 and b >= 0.98:
|
if r >= 0.98 and g >= 0.98 and b >= 0.98:
|
||||||
@@ -1975,7 +1983,7 @@ class DirectExtractionEngine:
|
|||||||
# Log detected covering rectangles by type
|
# Log detected covering rectangles by type
|
||||||
white_count = sum(1 for _, t in covering_rects if t == 'white')
|
white_count = sum(1 for _, t in covering_rects if t == 'white')
|
||||||
black_count = sum(1 for _, t in covering_rects if t == 'black')
|
black_count = sum(1 for _, t in covering_rects if t == 'black')
|
||||||
logger.debug(f"Page {page_num}: Found {len(covering_rects)} covering rectangles "
|
logger.debug(f"Page {page_num}: Found {len(covering_rects)} potential covering rectangles "
|
||||||
f"(white: {white_count}, black/redaction: {black_count})")
|
f"(white: {white_count}, black/redaction: {black_count})")
|
||||||
|
|
||||||
# Get all text words with bounding boxes
|
# Get all text words with bounding boxes
|
||||||
@@ -2013,7 +2021,12 @@ class DirectExtractionEngine:
|
|||||||
|
|
||||||
def _detect_covering_images(self, page: fitz.Page, doc: fitz.Document, page_num: int) -> List[Dict]:
|
def _detect_covering_images(self, page: fitz.Page, doc: fitz.Document, page_num: int) -> List[Dict]:
|
||||||
"""
|
"""
|
||||||
Detect embedded images that are mostly black/white (likely covering/redaction).
|
Detect embedded images that are mostly black/white AND actually cover text.
|
||||||
|
|
||||||
|
Only reports images that:
|
||||||
|
1. Are mostly solid black or white
|
||||||
|
2. Are within page boundaries
|
||||||
|
3. Actually overlap with text content (IoU check)
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
page: PyMuPDF page object
|
page: PyMuPDF page object
|
||||||
@@ -2021,9 +2034,10 @@ class DirectExtractionEngine:
|
|||||||
page_num: Page number for logging
|
page_num: Page number for logging
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
List of dicts with covering image info: {'bbox', 'color_type', 'avg_color'}
|
List of dicts with covering image info: {'bbox', 'color_type', 'avg_color', 'covered_text_count'}
|
||||||
"""
|
"""
|
||||||
covering_images = []
|
covering_images = []
|
||||||
|
page_rect = page.rect # Page boundaries
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Get all images on the page with their positions
|
# Get all images on the page with their positions
|
||||||
@@ -2032,6 +2046,9 @@ class DirectExtractionEngine:
|
|||||||
if not image_list:
|
if not image_list:
|
||||||
return covering_images
|
return covering_images
|
||||||
|
|
||||||
|
# Get all text words for coverage check
|
||||||
|
words = page.get_text("words") # (x0, y0, x1, y1, word, block_no, line_no, word_no)
|
||||||
|
|
||||||
for img_info in image_list:
|
for img_info in image_list:
|
||||||
xref = img_info[0]
|
xref = img_info[0]
|
||||||
width = img_info[2]
|
width = img_info[2]
|
||||||
@@ -2076,14 +2093,39 @@ class DirectExtractionEngine:
|
|||||||
|
|
||||||
if color_type:
|
if color_type:
|
||||||
# Get image position on page
|
# Get image position on page
|
||||||
# We need to find the image rectangle on the page
|
|
||||||
for img_rect in page.get_image_rects(xref):
|
for img_rect in page.get_image_rects(xref):
|
||||||
covering_images.append({
|
# Skip images completely outside page boundaries
|
||||||
'bbox': tuple(img_rect),
|
if not img_rect.intersects(page_rect):
|
||||||
'color_type': color_type,
|
continue
|
||||||
'avg_color': (avg_r, avg_g, avg_b),
|
|
||||||
'size': (width, height)
|
# Clip image rect to page boundaries
|
||||||
})
|
clipped_rect = img_rect & page_rect
|
||||||
|
|
||||||
|
# Check if image actually covers any text (IoU check)
|
||||||
|
covered_text_count = 0
|
||||||
|
for word_info in words:
|
||||||
|
word_rect = fitz.Rect(word_info[:4])
|
||||||
|
word_area = word_rect.width * word_rect.height
|
||||||
|
if word_area <= 0:
|
||||||
|
continue
|
||||||
|
|
||||||
|
intersection = word_rect & clipped_rect
|
||||||
|
if not intersection.is_empty:
|
||||||
|
intersection_area = intersection.width * intersection.height
|
||||||
|
coverage_ratio = intersection_area / word_area
|
||||||
|
# Count as covered if >= 50% of word is under the image
|
||||||
|
if coverage_ratio >= 0.5:
|
||||||
|
covered_text_count += 1
|
||||||
|
|
||||||
|
# Only report if image actually covers text
|
||||||
|
if covered_text_count > 0:
|
||||||
|
covering_images.append({
|
||||||
|
'bbox': tuple(clipped_rect),
|
||||||
|
'color_type': color_type,
|
||||||
|
'avg_color': (avg_r, avg_g, avg_b),
|
||||||
|
'size': (width, height),
|
||||||
|
'covered_text_count': covered_text_count
|
||||||
|
})
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.debug(f"Page {page_num}: Failed to analyze image xref={xref}: {e}")
|
logger.debug(f"Page {page_num}: Failed to analyze image xref={xref}: {e}")
|
||||||
@@ -2092,8 +2134,9 @@ class DirectExtractionEngine:
|
|||||||
if covering_images:
|
if covering_images:
|
||||||
black_count = sum(1 for c in covering_images if c['color_type'] == 'image_black')
|
black_count = sum(1 for c in covering_images if c['color_type'] == 'image_black')
|
||||||
white_count = sum(1 for c in covering_images if c['color_type'] == 'image_white')
|
white_count = sum(1 for c in covering_images if c['color_type'] == 'image_white')
|
||||||
|
total_covered = sum(c.get('covered_text_count', 0) for c in covering_images)
|
||||||
logger.debug(f"Page {page_num}: Found {len(covering_images)} covering images "
|
logger.debug(f"Page {page_num}: Found {len(covering_images)} covering images "
|
||||||
f"(black: {black_count}, white: {white_count})")
|
f"(black: {black_count}, white: {white_count}, covering {total_covered} text regions)")
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning(f"Page {page_num}: Failed to detect covering images: {e}")
|
logger.warning(f"Page {page_num}: Failed to detect covering images: {e}")
|
||||||
|
|||||||
Reference in New Issue
Block a user