From 3903bcf77d9416238127f255eb2fb6b21238377d Mon Sep 17 00:00:00 2001 From: egg Date: Thu, 4 Dec 2025 07:36:07 +0800 Subject: [PATCH] fix: tighten covering detection thresholds to avoid false positives MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Increase white threshold from 0.95 to 0.98 (pure white only) - Decrease black threshold from 0.05 to 0.02 (pure black only) - Remove "other solid" detection (caused false positives on gray backgrounds) This prevents light gray table cell backgrounds (RGB ~0.93) from being incorrectly detected as covering/redaction rectangles. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- .../app/services/direct_extraction_engine.py | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/backend/app/services/direct_extraction_engine.py b/backend/app/services/direct_extraction_engine.py index 0c01971..cc1d8f9 100644 --- a/backend/app/services/direct_extraction_engine.py +++ b/backend/app/services/direct_extraction_engine.py @@ -1939,19 +1939,13 @@ class DirectExtractionEngine: continue # Detect white rectangles (white-out / correction tape) - if r >= 0.95 and g >= 0.95 and b >= 0.95: + # Must be pure white (>= 0.98) to avoid false positives from light backgrounds + if r >= 0.98 and g >= 0.98 and b >= 0.98: covering_rects.append((fitz_rect, 'white')) - # Detect black rectangles (redaction) - elif r <= 0.05 and g <= 0.05 and b <= 0.05: + # Detect black rectangles (redaction / censoring) + # Must be pure black (<= 0.02) to avoid false positives from dark elements + elif r <= 0.02 and g <= 0.02 and b <= 0.02: covering_rects.append((fitz_rect, 'black')) - # Detect other solid colors (uniform fill that might be covering) - # Only consider if it's a solid color (low variance) and reasonably large - elif fitz_rect.width >= 20 and fitz_rect.height >= 10: - # Check if it's a saturated solid color (not gradient-like) - # For now, detect very dark or very light solid fills - avg_brightness = (r + g + b) / 3 - if avg_brightness <= 0.1 or avg_brightness >= 0.9: - covering_rects.append((fitz_rect, 'solid')) if not covering_rects: return covered_words @@ -1959,9 +1953,8 @@ class DirectExtractionEngine: # Log detected covering rectangles by type white_count = sum(1 for _, t in covering_rects if t == 'white') black_count = sum(1 for _, t in covering_rects if t == 'black') - solid_count = sum(1 for _, t in covering_rects if t == 'solid') logger.debug(f"Page {page_num}: Found {len(covering_rects)} covering rectangles " - f"(white: {white_count}, black: {black_count}, other: {solid_count})") + f"(white: {white_count}, black/redaction: {black_count})") # Get all text words with bounding boxes # words format: (x0, y0, x1, y1, word, block_no, line_no, word_no)