fix: tighten covering detection thresholds to avoid false positives
- Increase white threshold from 0.95 to 0.98 (pure white only)
- Decrease black threshold from 0.05 to 0.02 (pure black only)
- Remove "other solid" detection (caused false positives on gray backgrounds)

This prevents light gray table cell backgrounds (RGB ~0.93) from being incorrectly detected as covering/redaction rectangles.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -1939,19 +1939,13 @@ class DirectExtractionEngine:
|
|||||||
continue
|
continue
|
||||||
|
|
||||||
# Detect white rectangles (white-out / correction tape)
|
# Detect white rectangles (white-out / correction tape)
|
||||||
if r >= 0.95 and g >= 0.95 and b >= 0.95:
|
# Must be pure white (>= 0.98) to avoid false positives from light backgrounds
|
||||||
|
if r >= 0.98 and g >= 0.98 and b >= 0.98:
|
||||||
covering_rects.append((fitz_rect, 'white'))
|
covering_rects.append((fitz_rect, 'white'))
|
||||||
# Detect black rectangles (redaction)
|
# Detect black rectangles (redaction / censoring)
|
||||||
elif r <= 0.05 and g <= 0.05 and b <= 0.05:
|
# Must be pure black (<= 0.02) to avoid false positives from dark elements
|
||||||
|
elif r <= 0.02 and g <= 0.02 and b <= 0.02:
|
||||||
covering_rects.append((fitz_rect, 'black'))
|
covering_rects.append((fitz_rect, 'black'))
|
||||||
# Detect other solid colors (uniform fill that might be covering)
|
|
||||||
# Only consider if it's a solid color (low variance) and reasonably large
|
|
||||||
elif fitz_rect.width >= 20 and fitz_rect.height >= 10:
|
|
||||||
# Check if it's a saturated solid color (not gradient-like)
|
|
||||||
# For now, detect very dark or very light solid fills
|
|
||||||
avg_brightness = (r + g + b) / 3
|
|
||||||
if avg_brightness <= 0.1 or avg_brightness >= 0.9:
|
|
||||||
covering_rects.append((fitz_rect, 'solid'))
|
|
||||||
|
|
||||||
if not covering_rects:
|
if not covering_rects:
|
||||||
return covered_words
|
return covered_words
|
||||||
@@ -1959,9 +1953,8 @@ class DirectExtractionEngine:
|
|||||||
# Log detected covering rectangles by type
|
# Log detected covering rectangles by type
|
||||||
white_count = sum(1 for _, t in covering_rects if t == 'white')
|
white_count = sum(1 for _, t in covering_rects if t == 'white')
|
||||||
black_count = sum(1 for _, t in covering_rects if t == 'black')
|
black_count = sum(1 for _, t in covering_rects if t == 'black')
|
||||||
solid_count = sum(1 for _, t in covering_rects if t == 'solid')
|
|
||||||
logger.debug(f"Page {page_num}: Found {len(covering_rects)} covering rectangles "
|
logger.debug(f"Page {page_num}: Found {len(covering_rects)} covering rectangles "
|
||||||
f"(white: {white_count}, black: {black_count}, other: {solid_count})")
|
f"(white: {white_count}, black/redaction: {black_count})")
|
||||||
|
|
||||||
# Get all text words with bounding boxes
|
# Get all text words with bounding boxes
|
||||||
# words format: (x0, y0, x1, y1, word, block_no, line_no, word_no)
|
# words format: (x0, y0, x1, y1, word, block_no, line_no, word_no)
|
||||||
|
|||||||
Reference in New Issue
Block a user