feat: extend covering detection to include black/redaction rectangles
Expands whiteout detection to handle:
- White rectangles (RGB >= 0.95): correction tape / white-out
- Black rectangles (RGB <= 0.05): redaction / censoring
- Other solid fills (very dark or very light): potential covering

Adds color_type to covered text results for better logging. Logs now show a breakdown by cover type (white, black, other).

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -1894,14 +1894,20 @@ class DirectExtractionEngine:
|
||||
if self.enable_whiteout_detection:
|
||||
covered = self._detect_whiteout_covered_text(page, page_num)
|
||||
result['covered_word_bboxes'] = [fitz.Rect(w['bbox']) for w in covered]
|
||||
result['covered_words_detail'] = covered # Include color_type info
|
||||
if covered:
|
||||
logger.info(f"Page {page_num}: Detected {len(covered)} text regions covered by white-out")
|
||||
# Count by color type
|
||||
white_covered = sum(1 for c in covered if c.get('color_type') == 'white')
|
||||
black_covered = sum(1 for c in covered if c.get('color_type') == 'black')
|
||||
other_covered = len(covered) - white_covered - black_covered
|
||||
logger.info(f"Page {page_num}: Detected {len(covered)} covered text regions "
|
||||
f"(white: {white_covered}, black/redaction: {black_covered}, other: {other_covered})")
|
||||
|
||||
return result
|
||||
|
||||
def _detect_whiteout_covered_text(self, page: fitz.Page, page_num: int) -> List[Dict]:
|
||||
"""
|
||||
Detect text covered by white rectangles ("white-out" / "correction tape" effect).
|
||||
Detect text covered by solid color rectangles (white-out, black redaction, or any solid fill).
|
||||
|
||||
Uses IoU (Intersection over Union) to determine if text is covered.
|
||||
|
||||
@@ -1910,30 +1916,52 @@ class DirectExtractionEngine:
|
||||
page_num: Page number for logging
|
||||
|
||||
Returns:
|
||||
List of dicts with covered text info: {'text', 'bbox', 'coverage'}
|
||||
List of dicts with covered text info: {'text', 'bbox', 'coverage', 'color_type'}
|
||||
"""
|
||||
covered_words = []
|
||||
|
||||
# Get all drawings and find white-filled rectangles
|
||||
# Get all drawings and find solid-filled rectangles
|
||||
drawings = page.get_drawings()
|
||||
white_rects = []
|
||||
covering_rects = [] # List of (rect, color_type)
|
||||
|
||||
for d in drawings:
|
||||
fill_color = d.get('fill')
|
||||
# Check for white fill (RGB all 1.0 or close to it)
|
||||
if fill_color:
|
||||
if isinstance(fill_color, (tuple, list)) and len(fill_color) >= 3:
|
||||
if fill_color and isinstance(fill_color, (tuple, list)) and len(fill_color) >= 3:
|
||||
r, g, b = fill_color[:3]
|
||||
# Allow slight tolerance for "almost white"
|
||||
if r >= 0.95 and g >= 0.95 and b >= 0.95:
|
||||
rect = d.get('rect')
|
||||
if rect:
|
||||
white_rects.append(fitz.Rect(rect))
|
||||
if not rect:
|
||||
continue
|
||||
|
||||
if not white_rects:
|
||||
fitz_rect = fitz.Rect(rect)
|
||||
|
||||
# Skip very small rectangles (likely not covering blocks)
|
||||
if fitz_rect.width < 5 or fitz_rect.height < 5:
|
||||
continue
|
||||
|
||||
# Detect white rectangles (white-out / correction tape)
|
||||
if r >= 0.95 and g >= 0.95 and b >= 0.95:
|
||||
covering_rects.append((fitz_rect, 'white'))
|
||||
# Detect black rectangles (redaction)
|
||||
elif r <= 0.05 and g <= 0.05 and b <= 0.05:
|
||||
covering_rects.append((fitz_rect, 'black'))
|
||||
# Detect other solid colors (uniform fill that might be covering)
|
||||
# Only consider rectangles that are reasonably large (no color-variance check is performed here)
|
||||
elif fitz_rect.width >= 20 and fitz_rect.height >= 10:
|
||||
# Classify by average brightness only (saturation and gradients are not inspected)
|
||||
# For now, detect very dark or very light solid fills
|
||||
avg_brightness = (r + g + b) / 3
|
||||
if avg_brightness <= 0.1 or avg_brightness >= 0.9:
|
||||
covering_rects.append((fitz_rect, 'solid'))
|
||||
|
||||
if not covering_rects:
|
||||
return covered_words
|
||||
|
||||
logger.debug(f"Page {page_num}: Found {len(white_rects)} white rectangles")
|
||||
# Log detected covering rectangles by type
|
||||
white_count = sum(1 for _, t in covering_rects if t == 'white')
|
||||
black_count = sum(1 for _, t in covering_rects if t == 'black')
|
||||
solid_count = sum(1 for _, t in covering_rects if t == 'solid')
|
||||
logger.debug(f"Page {page_num}: Found {len(covering_rects)} covering rectangles "
|
||||
f"(white: {white_count}, black: {black_count}, other: {solid_count})")
|
||||
|
||||
# Get all text words with bounding boxes
|
||||
# words format: (x0, y0, x1, y1, word, block_no, line_no, word_no)
|
||||
@@ -1947,9 +1975,9 @@ class DirectExtractionEngine:
|
||||
if word_area <= 0:
|
||||
continue
|
||||
|
||||
for white_rect in white_rects:
|
||||
for cover_rect, color_type in covering_rects:
|
||||
# Calculate intersection
|
||||
intersection = word_rect & white_rect
|
||||
intersection = word_rect & cover_rect
|
||||
if intersection.is_empty:
|
||||
continue
|
||||
|
||||
@@ -1961,7 +1989,8 @@ class DirectExtractionEngine:
|
||||
covered_words.append({
|
||||
'text': word_text,
|
||||
'bbox': tuple(word_rect),
|
||||
'coverage': coverage_ratio
|
||||
'coverage': coverage_ratio,
|
||||
'color_type': color_type
|
||||
})
|
||||
break # Word is covered, no need to check other rects
|
||||
|
||||
|
||||
Reference in New Issue
Block a user