feat: add black/white covering image detection
Implements detection of embedded images used for redaction/covering:
- Analyzes embedded images for mostly black (avg RGB <= 30) or white (>= 245)
- Uses PIL to efficiently sample image colors
- Gets image position on page via get_image_rects()
- Integrates with existing preprocessing pipeline
- Adds covering_images to page metadata and quality report

Detection results:
- demo_docs/edit3.pdf: 10 black covering images detected (7 on P1, 3 on P2)

Quality report now includes:
- total_covering_images count
- Per-page covering_images details with bbox, color_type, size

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
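The heart of the change is the average-color test. Below is a minimal standalone sketch of that classification step, assuming the same PIL downsampling and the 30/245 thresholds described above; the helper name `classify_covering_image` is illustrative, not part of the committed API:

import io
from typing import Optional

from PIL import Image


def classify_covering_image(img_bytes: bytes) -> Optional[str]:
    """Sketch: return 'image_black', 'image_white', or None.

    Mirrors the commit's thresholds (avg RGB <= 30 for black,
    >= 245 for white); the function name itself is hypothetical.
    """
    img = Image.open(io.BytesIO(img_bytes))
    if img.mode != 'RGB':
        img = img.convert('RGB')

    # Downsample to at most 50x50 so only a bounded sample is averaged
    img_small = img.resize((min(50, img.width), min(50, img.height)))
    pixels = list(img_small.getdata())
    if not pixels:
        return None

    avg_r = sum(p[0] for p in pixels) / len(pixels)
    avg_g = sum(p[1] for p in pixels) / len(pixels)
    avg_b = sum(p[2] for p in pixels) / len(pixels)

    if avg_r <= 30 and avg_g <= 30 and avg_b <= 30:
        return 'image_black'
    if avg_r >= 245 and avg_g >= 245 and avg_b >= 245:
        return 'image_white'
    return None

In the committed code this logic runs inline inside `_detect_covering_images`, once per image xref, on the bytes returned by `doc.extract_image(xref)`.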
@@ -120,7 +120,8 @@ class DirectExtractionEngine:
                 doc[page_num],
                 page_num + 1,
                 document_id,
-                output_dir
+                output_dir,
+                doc  # Pass doc for covering image detection
             )
             pages.append(page)
 
@@ -211,7 +212,8 @@ class DirectExtractionEngine:
                       page: fitz.Page,
                       page_num: int,
                       document_id: str,
-                      output_dir: Optional[Path]) -> Page:
+                      output_dir: Optional[Path],
+                      doc: fitz.Document = None) -> Page:
         """Extract content from a single page with preprocessing pipeline."""
         elements = []
         element_counter = 0
@@ -219,8 +221,8 @@ class DirectExtractionEngine:
         # =====================================================================
         # PREPROCESSING PIPELINE
         # =====================================================================
-        # Step 1: Run preprocessing (sanitization, white-out detection)
-        preprocess_result = self._preprocess_page(page, page_num)
+        # Step 1: Run preprocessing (sanitization, white-out detection, covering images)
+        preprocess_result = self._preprocess_page(page, page_num, doc)
         covered_bboxes = preprocess_result.get('covered_word_bboxes', [])
 
         # Get page-level metadata (for final Page metadata)
@@ -337,13 +339,16 @@ class DirectExtractionEngine:
         elements = self._filter_page_numbers(elements, dimensions.height)
 
         # Step 3.2-3.3: Garble detection and OCR fallback recommendation
+        covering_images = preprocess_result.get('covering_images', [])
         page_metadata = {
             "has_drawings": len(drawings) > 0,
             "drawing_count": len(drawings),
             "link_count": len(links),
             "preprocessing": {
                 "sanitized": preprocess_result.get('sanitized', False),
-                "whiteout_regions_found": len(covered_bboxes)
+                "whiteout_regions_found": len(covered_bboxes) - len(covering_images),  # Vector rects only
+                "covering_images_found": len(covering_images),
+                "covering_images": covering_images  # Full details for debugging
             }
         }
 
@@ -1856,27 +1861,31 @@ class DirectExtractionEngine:
     # PDF Preprocessing Pipeline Methods
     # =========================================================================
 
-    def _preprocess_page(self, page: fitz.Page, page_num: int) -> Dict[str, Any]:
+    def _preprocess_page(self, page: fitz.Page, page_num: int, doc: fitz.Document = None) -> Dict[str, Any]:
         """
         Run preprocessing pipeline on a page before extraction.
 
         Pipeline steps:
         1. Content sanitization (clean_contents)
         2. Hidden layer detection (OCG)
-        3. White-out detection
+        3. White-out/black-out detection (vector rectangles)
+        4. Covering image detection (embedded black/white images)
 
         Args:
             page: PyMuPDF page object
            page_num: Page number (1-indexed)
+            doc: PyMuPDF document object (needed for image analysis)
 
         Returns:
             Dict with preprocessing results:
-            - covered_word_bboxes: List of bboxes for text covered by white rectangles
+            - covered_word_bboxes: List of bboxes for text covered by rectangles/images
+            - covering_images: List of covering image info
             - hidden_layers: List of hidden OCG layer names
             - sanitized: Whether content was sanitized
         """
         result = {
             'covered_word_bboxes': [],
+            'covering_images': [],
             'hidden_layers': [],
             'sanitized': False
         }
@@ -1890,7 +1899,7 @@ class DirectExtractionEngine:
         except Exception as e:
             logger.warning(f"Page {page_num}: Content sanitization failed: {e}")
 
-        # Step 1.3: White-out detection
+        # Step 1.3: White-out/black-out detection (vector rectangles)
         if self.enable_whiteout_detection:
             covered = self._detect_whiteout_covered_text(page, page_num)
             result['covered_word_bboxes'] = [fitz.Rect(w['bbox']) for w in covered]
@@ -1903,6 +1912,19 @@ class DirectExtractionEngine:
             logger.info(f"Page {page_num}: Detected {len(covered)} covered text regions "
                         f"(white: {white_covered}, black/redaction: {black_covered}, other: {other_covered})")
 
+        # Step 1.4: Covering image detection (embedded black/white images)
+        if self.enable_whiteout_detection and doc is not None:
+            covering_images = self._detect_covering_images(page, doc, page_num)
+            result['covering_images'] = covering_images
+            # Add covering image bboxes to the covered_word_bboxes list
+            for img in covering_images:
+                result['covered_word_bboxes'].append(fitz.Rect(img['bbox']))
+            if covering_images:
+                black_imgs = sum(1 for c in covering_images if c['color_type'] == 'image_black')
+                white_imgs = sum(1 for c in covering_images if c['color_type'] == 'image_white')
+                logger.info(f"Page {page_num}: Detected {len(covering_images)} covering images "
+                            f"(black: {black_imgs}, white: {white_imgs})")
+
         return result
 
     def _detect_whiteout_covered_text(self, page: fitz.Page, page_num: int) -> List[Dict]:
@@ -1989,6 +2011,95 @@ class DirectExtractionEngine:
 
         return covered_words
 
+    def _detect_covering_images(self, page: fitz.Page, doc: fitz.Document, page_num: int) -> List[Dict]:
+        """
+        Detect embedded images that are mostly black/white (likely covering/redaction).
+
+        Args:
+            page: PyMuPDF page object
+            doc: PyMuPDF document object (needed for image extraction)
+            page_num: Page number for logging
+
+        Returns:
+            List of dicts with covering image info: {'bbox', 'color_type', 'avg_color'}
+        """
+        covering_images = []
+
+        try:
+            # Get all images on the page with their positions
+            image_list = page.get_images(full=True)
+
+            if not image_list:
+                return covering_images
+
+            for img_info in image_list:
+                xref = img_info[0]
+                width = img_info[2]
+                height = img_info[3]
+
+                # Skip very small images (icons, bullets)
+                if width < 20 or height < 10:
+                    continue
+
+                try:
+                    # Extract image data
+                    base_image = doc.extract_image(xref)
+                    img_bytes = base_image.get('image')
+                    if not img_bytes:
+                        continue
+
+                    # Analyze image color using PIL
+                    from PIL import Image
+                    import io
+
+                    img = Image.open(io.BytesIO(img_bytes))
+                    if img.mode != 'RGB':
+                        img = img.convert('RGB')
+
+                    # Sample pixels for efficiency (don't analyze every pixel)
+                    img_small = img.resize((min(50, img.width), min(50, img.height)))
+                    pixels = list(img_small.getdata())
+
+                    if not pixels:
+                        continue
+
+                    avg_r = sum(p[0] for p in pixels) / len(pixels)
+                    avg_g = sum(p[1] for p in pixels) / len(pixels)
+                    avg_b = sum(p[2] for p in pixels) / len(pixels)
+
+                    # Determine if image is mostly black or white
+                    color_type = None
+                    if avg_r <= 30 and avg_g <= 30 and avg_b <= 30:
+                        color_type = 'image_black'
+                    elif avg_r >= 245 and avg_g >= 245 and avg_b >= 245:
+                        color_type = 'image_white'
+
+                    if color_type:
+                        # Get image position on page
+                        # We need to find the image rectangle on the page
+                        for img_rect in page.get_image_rects(xref):
+                            covering_images.append({
+                                'bbox': tuple(img_rect),
+                                'color_type': color_type,
+                                'avg_color': (avg_r, avg_g, avg_b),
+                                'size': (width, height)
+                            })
+
+                except Exception as e:
+                    logger.debug(f"Page {page_num}: Failed to analyze image xref={xref}: {e}")
+                    continue
+
+            if covering_images:
+                black_count = sum(1 for c in covering_images if c['color_type'] == 'image_black')
+                white_count = sum(1 for c in covering_images if c['color_type'] == 'image_white')
+                logger.debug(f"Page {page_num}: Found {len(covering_images)} covering images "
+                             f"(black: {black_count}, white: {white_count})")
+
+        except Exception as e:
+            logger.warning(f"Page {page_num}: Failed to detect covering images: {e}")
+
+        return covering_images
+
     def _get_hidden_ocg_layers(self, doc: fitz.Document) -> List[str]:
         """
         Get list of hidden Optional Content Group (OCG) layer names.
@@ -2410,7 +2521,8 @@ class DirectExtractionEngine:
             'needs_ocr_fallback': False,
             'preprocessing_stats': {
                 'pages_sanitized': 0,
-                'total_whiteout_regions': 0
+                'total_whiteout_regions': 0,
+                'total_covering_images': 0
             }
         }
 
@@ -2437,6 +2549,7 @@ class DirectExtractionEngine:
             if preprocessing.get('sanitized', False):
                 report['preprocessing_stats']['pages_sanitized'] += 1
             report['preprocessing_stats']['total_whiteout_regions'] += preprocessing.get('whiteout_regions_found', 0)
+            report['preprocessing_stats']['total_covering_images'] += preprocessing.get('covering_images_found', 0)
 
         # Calculate average garble rate
         if pages_with_garble > 0: