fix: tighten covering detection thresholds to avoid false positives

- Increase white threshold from 0.95 to 0.98 (near-pure white only)
- Decrease black threshold from 0.05 to 0.02 (near-pure black only)
- Remove "other solid" detection (caused false positives on gray backgrounds)

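This prevents light gray table cell backgrounds (RGB channels ~0.93) from being
incorrectly detected as covering/redaction rectangles. Such fills were already
below the old white threshold (0.95); they were matched by the removed
"other solid" check, which flagged any rectangle of at least 20 x 10 whose
average brightness was >= 0.9.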

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
egg
2025-12-04 07:36:07 +08:00
parent bc66f72352
commit 3903bcf77d

View File

@@ -1939,19 +1939,13 @@ class DirectExtractionEngine:
continue continue
# Detect white rectangles (white-out / correction tape) # Detect white rectangles (white-out / correction tape)
if r >= 0.95 and g >= 0.95 and b >= 0.95: # Must be pure white (>= 0.98) to avoid false positives from light backgrounds
if r >= 0.98 and g >= 0.98 and b >= 0.98:
covering_rects.append((fitz_rect, 'white')) covering_rects.append((fitz_rect, 'white'))
# Detect black rectangles (redaction) # Detect black rectangles (redaction / censoring)
elif r <= 0.05 and g <= 0.05 and b <= 0.05: # Must be pure black (<= 0.02) to avoid false positives from dark elements
elif r <= 0.02 and g <= 0.02 and b <= 0.02:
covering_rects.append((fitz_rect, 'black')) covering_rects.append((fitz_rect, 'black'))
# Detect other solid colors (uniform fill that might be covering)
# Only consider if it's a solid color (low variance) and reasonably large
elif fitz_rect.width >= 20 and fitz_rect.height >= 10:
# Check if it's a saturated solid color (not gradient-like)
# For now, detect very dark or very light solid fills
avg_brightness = (r + g + b) / 3
if avg_brightness <= 0.1 or avg_brightness >= 0.9:
covering_rects.append((fitz_rect, 'solid'))
if not covering_rects: if not covering_rects:
return covered_words return covered_words
@@ -1959,9 +1953,8 @@ class DirectExtractionEngine:
# Log detected covering rectangles by type # Log detected covering rectangles by type
white_count = sum(1 for _, t in covering_rects if t == 'white') white_count = sum(1 for _, t in covering_rects if t == 'white')
black_count = sum(1 for _, t in covering_rects if t == 'black') black_count = sum(1 for _, t in covering_rects if t == 'black')
solid_count = sum(1 for _, t in covering_rects if t == 'solid')
logger.debug(f"Page {page_num}: Found {len(covering_rects)} covering rectangles " logger.debug(f"Page {page_num}: Found {len(covering_rects)} covering rectangles "
f"(white: {white_count}, black: {black_count}, other: {solid_count})") f"(white: {white_count}, black/redaction: {black_count})")
# Get all text words with bounding boxes # Get all text words with bounding boxes
# words format: (x0, y0, x1, y1, word, block_no, line_no, word_no) # words format: (x0, y0, x1, y1, word, block_no, line_no, word_no)