From bc66f723521eae4c192a19695c490adcf9c8597a Mon Sep 17 00:00:00 2001 From: egg Date: Thu, 4 Dec 2025 07:34:35 +0800 Subject: [PATCH] feat: extend covering detection to include black/redaction rectangles MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Expands whiteout detection to handle: - White rectangles (RGB >= 0.95) - correction tape / white-out - Black rectangles (RGB <= 0.05) - redaction / censoring - Other solid fills (very dark or very light) - potential covering Adds color_type to covered text results for better logging. Logs now show breakdown by cover type (white, black, other). 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- .../app/services/direct_extraction_engine.py | 67 +++++++++++++------ 1 file changed, 48 insertions(+), 19 deletions(-) diff --git a/backend/app/services/direct_extraction_engine.py b/backend/app/services/direct_extraction_engine.py index 8512c32..0c01971 100644 --- a/backend/app/services/direct_extraction_engine.py +++ b/backend/app/services/direct_extraction_engine.py @@ -1894,14 +1894,20 @@ class DirectExtractionEngine: if self.enable_whiteout_detection: covered = self._detect_whiteout_covered_text(page, page_num) result['covered_word_bboxes'] = [fitz.Rect(w['bbox']) for w in covered] + result['covered_words_detail'] = covered # Include color_type info if covered: - logger.info(f"Page {page_num}: Detected {len(covered)} text regions covered by white-out") + # Count by color type + white_covered = sum(1 for c in covered if c.get('color_type') == 'white') + black_covered = sum(1 for c in covered if c.get('color_type') == 'black') + other_covered = len(covered) - white_covered - black_covered + logger.info(f"Page {page_num}: Detected {len(covered)} covered text regions " + f"(white: {white_covered}, black/redaction: {black_covered}, other: {other_covered})") return result def _detect_whiteout_covered_text(self, page: fitz.Page, page_num: int) -> List[Dict]: """ - Detect text covered by white rectangles ("white-out" / "correction tape" effect). + Detect text covered by solid color rectangles (white-out, black redaction, or any solid fill). Uses IoU (Intersection over Union) to determine if text is covered. @@ -1910,30 +1916,52 @@ class DirectExtractionEngine: page_num: Page number for logging Returns: - List of dicts with covered text info: {'text', 'bbox', 'coverage'} + List of dicts with covered text info: {'text', 'bbox', 'coverage', 'color_type'} """ covered_words = [] - # Get all drawings and find white-filled rectangles + # Get all drawings and find solid-filled rectangles drawings = page.get_drawings() - white_rects = [] + covering_rects = [] # List of (rect, color_type) for d in drawings: fill_color = d.get('fill') - # Check for white fill (RGB all 1.0 or close to it) - if fill_color: - if isinstance(fill_color, (tuple, list)) and len(fill_color) >= 3: - r, g, b = fill_color[:3] - # Allow slight tolerance for "almost white" - if r >= 0.95 and g >= 0.95 and b >= 0.95: - rect = d.get('rect') - if rect: - white_rects.append(fitz.Rect(rect)) + if fill_color and isinstance(fill_color, (tuple, list)) and len(fill_color) >= 3: + r, g, b = fill_color[:3] + rect = d.get('rect') + if not rect: + continue - if not white_rects: + fitz_rect = fitz.Rect(rect) + + # Skip very small rectangles (likely not covering blocks) + if fitz_rect.width < 5 or fitz_rect.height < 5: + continue + + # Detect white rectangles (white-out / correction tape) + if r >= 0.95 and g >= 0.95 and b >= 0.95: + covering_rects.append((fitz_rect, 'white')) + # Detect black rectangles (redaction) + elif r <= 0.05 and g <= 0.05 and b <= 0.05: + covering_rects.append((fitz_rect, 'black')) + # Detect other solid colors (uniform fill that might be covering) + # Only consider if it's a solid color (low variance) and reasonably large + elif fitz_rect.width >= 20 and fitz_rect.height >= 10: + # Check if it's a saturated solid color (not gradient-like) + # For now, detect very dark or very light solid fills + avg_brightness = (r + g + b) / 3 + if avg_brightness <= 0.1 or avg_brightness >= 0.9: + covering_rects.append((fitz_rect, 'solid')) + + if not covering_rects: return covered_words - logger.debug(f"Page {page_num}: Found {len(white_rects)} white rectangles") + # Log detected covering rectangles by type + white_count = sum(1 for _, t in covering_rects if t == 'white') + black_count = sum(1 for _, t in covering_rects if t == 'black') + solid_count = sum(1 for _, t in covering_rects if t == 'solid') + logger.debug(f"Page {page_num}: Found {len(covering_rects)} covering rectangles " + f"(white: {white_count}, black: {black_count}, other: {solid_count})") # Get all text words with bounding boxes # words format: (x0, y0, x1, y1, word, block_no, line_no, word_no) @@ -1947,9 +1975,9 @@ class DirectExtractionEngine: if word_area <= 0: continue - for white_rect in white_rects: + for cover_rect, color_type in covering_rects: # Calculate intersection - intersection = word_rect & white_rect + intersection = word_rect & cover_rect if intersection.is_empty: continue @@ -1961,7 +1989,8 @@ class DirectExtractionEngine: covered_words.append({ 'text': word_text, 'bbox': tuple(word_rect), - 'coverage': coverage_ratio + 'coverage': coverage_ratio, + 'color_type': color_type }) break # Word is covered, no need to check other rects