diff --git a/backend/app/services/direct_extraction_engine.py b/backend/app/services/direct_extraction_engine.py
index cc1d8f9..4f29b06 100644
--- a/backend/app/services/direct_extraction_engine.py
+++ b/backend/app/services/direct_extraction_engine.py
@@ -120,7 +120,8 @@ class DirectExtractionEngine:
                 doc[page_num],
                 page_num + 1,
                 document_id,
-                output_dir
+                output_dir,
+                doc  # Pass doc for covering image detection
             )
             pages.append(page)

@@ -211,7 +212,8 @@ class DirectExtractionEngine:
                       page: fitz.Page,
                       page_num: int,
                       document_id: str,
-                      output_dir: Optional[Path]) -> Page:
+                      output_dir: Optional[Path],
+                      doc: fitz.Document = None) -> Page:
         """Extract content from a single page with preprocessing pipeline."""
         elements = []
         element_counter = 0
@@ -219,8 +221,8 @@ class DirectExtractionEngine:
         # =====================================================================
         # PREPROCESSING PIPELINE
         # =====================================================================
-        # Step 1: Run preprocessing (sanitization, white-out detection)
-        preprocess_result = self._preprocess_page(page, page_num)
+        # Step 1: Run preprocessing (sanitization, white-out detection, covering images)
+        preprocess_result = self._preprocess_page(page, page_num, doc)
         covered_bboxes = preprocess_result.get('covered_word_bboxes', [])

         # Get page-level metadata (for final Page metadata)
@@ -337,13 +339,16 @@ class DirectExtractionEngine:
         elements = self._filter_page_numbers(elements, dimensions.height)

         # Step 3.2-3.3: Garble detection and OCR fallback recommendation
+        covering_images = preprocess_result.get('covering_images', [])
         page_metadata = {
             "has_drawings": len(drawings) > 0,
             "drawing_count": len(drawings),
             "link_count": len(links),
             "preprocessing": {
                 "sanitized": preprocess_result.get('sanitized', False),
-                "whiteout_regions_found": len(covered_bboxes)
+                "whiteout_regions_found": len(covered_bboxes) - len(covering_images),  # Vector rects only
+                "covering_images_found": len(covering_images),
+                "covering_images": covering_images  # Full details for debugging
             }
         }

@@ -1856,27 +1861,31 @@ class DirectExtractionEngine:
     # =========================================================================
     # PDF Preprocessing Pipeline Methods
     # =========================================================================

-    def _preprocess_page(self, page: fitz.Page, page_num: int) -> Dict[str, Any]:
+    def _preprocess_page(self, page: fitz.Page, page_num: int, doc: fitz.Document = None) -> Dict[str, Any]:
         """
         Run preprocessing pipeline on a page before extraction.

         Pipeline steps:
         1. Content sanitization (clean_contents)
         2. Hidden layer detection (OCG)
-        3. White-out detection
+        3. White-out/black-out detection (vector rectangles)
+        4. Covering image detection (embedded black/white images)

         Args:
             page: PyMuPDF page object
             page_num: Page number (1-indexed)
+            doc: PyMuPDF document object (needed for image analysis)

         Returns:
             Dict with preprocessing results:
-            - covered_word_bboxes: List of bboxes for text covered by white rectangles
+            - covered_word_bboxes: List of bboxes for text covered by rectangles/images
+            - covering_images: List of covering image info
             - hidden_layers: List of hidden OCG layer names
             - sanitized: Whether content was sanitized
         """
         result = {
             'covered_word_bboxes': [],
+            'covering_images': [],
             'hidden_layers': [],
             'sanitized': False
         }
@@ -1890,7 +1899,7 @@ class DirectExtractionEngine:
             except Exception as e:
                 logger.warning(f"Page {page_num}: Content sanitization failed: {e}")

-        # Step 1.3: White-out detection
+        # Step 1.3: White-out/black-out detection (vector rectangles)
         if self.enable_whiteout_detection:
             covered = self._detect_whiteout_covered_text(page, page_num)
             result['covered_word_bboxes'] = [fitz.Rect(w['bbox']) for w in covered]
@@ -1903,6 +1912,19 @@ class DirectExtractionEngine:
                 logger.info(f"Page {page_num}: Detected {len(covered)} covered text regions "
                             f"(white: {white_covered}, black/redaction: {black_covered}, other: {other_covered})")

+        # Step 1.4: Covering image detection (embedded black/white images)
+        if self.enable_whiteout_detection and doc is not None:
+            covering_images = self._detect_covering_images(page, doc, page_num)
+            result['covering_images'] = covering_images
+            # Add covering image bboxes to the covered_word_bboxes list
+            for img in covering_images:
+                result['covered_word_bboxes'].append(fitz.Rect(img['bbox']))
+            if covering_images:
+                black_imgs = sum(1 for c in covering_images if c['color_type'] == 'image_black')
+                white_imgs = sum(1 for c in covering_images if c['color_type'] == 'image_white')
+                logger.info(f"Page {page_num}: Detected {len(covering_images)} covering images "
+                            f"(black: {black_imgs}, white: {white_imgs})")
+
         return result

     def _detect_whiteout_covered_text(self, page: fitz.Page, page_num: int) -> List[Dict]:
@@ -1989,6 +2011,95 @@ class DirectExtractionEngine:
         return covered_words

+    def _detect_covering_images(self, page: fitz.Page, doc: fitz.Document, page_num: int) -> List[Dict]:
+        """
+        Detect embedded images that are mostly black/white (likely covering/redaction).
+
+        Args:
+            page: PyMuPDF page object
+            doc: PyMuPDF document object (needed for image extraction)
+            page_num: Page number for logging
+
+        Returns:
+            List of dicts with covering image info: {'bbox', 'color_type', 'avg_color'}
+        """
+        covering_images = []
+
+        try:
+            # Get all images on the page with their positions
+            image_list = page.get_images(full=True)
+
+            if not image_list:
+                return covering_images
+
+            for img_info in image_list:
+                xref = img_info[0]
+                width = img_info[2]
+                height = img_info[3]
+
+                # Skip very small images (icons, bullets)
+                if width < 20 or height < 10:
+                    continue
+
+                try:
+                    # Extract image data
+                    base_image = doc.extract_image(xref)
+                    img_bytes = base_image.get('image')
+                    if not img_bytes:
+                        continue
+
+                    # Analyze image color using PIL
+                    from PIL import Image
+                    import io
+
+                    img = Image.open(io.BytesIO(img_bytes))
+                    if img.mode != 'RGB':
+                        img = img.convert('RGB')
+
+                    # Sample pixels for efficiency (don't analyze every pixel)
+                    img_small = img.resize((min(50, img.width), min(50, img.height)))
+                    pixels = list(img_small.getdata())
+
+                    if not pixels:
+                        continue
+
+                    avg_r = sum(p[0] for p in pixels) / len(pixels)
+                    avg_g = sum(p[1] for p in pixels) / len(pixels)
+                    avg_b = sum(p[2] for p in pixels) / len(pixels)
+
+                    # Determine if image is mostly black or white
+                    color_type = None
+                    if avg_r <= 30 and avg_g <= 30 and avg_b <= 30:
+                        color_type = 'image_black'
+                    elif avg_r >= 245 and avg_g >= 245 and avg_b >= 245:
+                        color_type = 'image_white'
+
+                    if color_type:
+                        # Get image position on page
+                        # We need to find the image rectangle on the page
+                        for img_rect in page.get_image_rects(xref):
+                            covering_images.append({
+                                'bbox': tuple(img_rect),
+                                'color_type': color_type,
+                                'avg_color': (avg_r, avg_g, avg_b),
+                                'size': (width, height)
+                            })
+
+                except Exception as e:
+                    logger.debug(f"Page {page_num}: Failed to analyze image xref={xref}: {e}")
+                    continue
+
+            if covering_images:
+                black_count = sum(1 for c in covering_images if c['color_type'] == 'image_black')
+                white_count = sum(1 for c in covering_images if c['color_type'] == 'image_white')
+                logger.debug(f"Page {page_num}: Found {len(covering_images)} covering images "
+                             f"(black: {black_count}, white: {white_count})")
+
+        except Exception as e:
+            logger.warning(f"Page {page_num}: Failed to detect covering images: {e}")
+
+        return covering_images
+
     def _get_hidden_ocg_layers(self, doc: fitz.Document) -> List[str]:
         """
         Get list of hidden Optional Content Group (OCG) layer names.
@@ -2410,7 +2521,8 @@ class DirectExtractionEngine:
             'needs_ocr_fallback': False,
             'preprocessing_stats': {
                 'pages_sanitized': 0,
-                'total_whiteout_regions': 0
+                'total_whiteout_regions': 0,
+                'total_covering_images': 0
             }
         }

@@ -2437,6 +2549,7 @@ class DirectExtractionEngine:
             if preprocessing.get('sanitized', False):
                 report['preprocessing_stats']['pages_sanitized'] += 1
             report['preprocessing_stats']['total_whiteout_regions'] += preprocessing.get('whiteout_regions_found', 0)
+            report['preprocessing_stats']['total_covering_images'] += preprocessing.get('covering_images_found', 0)

         # Calculate average garble rate
         if pages_with_garble > 0:
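
For reviewers who want to exercise the covering-image heuristic outside the engine, here is a minimal standalone sketch. The "sample.pdf" path and the find_covering_images() helper are illustrative names, not part of this patch; the thresholds (<= 30 ~ black, >= 245 ~ white) and the small-image filter mirror _detect_covering_images, while PIL.ImageStat stands in for the patch's manual pixel averaging.

# Standalone sketch of the covering-image heuristic in this diff.
# Assumptions: "sample.pdf" and find_covering_images() are hypothetical;
# thresholds and the size filter follow _detect_covering_images above.
import io

import fitz  # PyMuPDF
from PIL import Image, ImageStat


def find_covering_images(pdf_path: str) -> None:
    doc = fitz.open(pdf_path)
    for page_index, page in enumerate(doc, start=1):
        for img_info in page.get_images(full=True):
            xref, width, height = img_info[0], img_info[2], img_info[3]
            if width < 20 or height < 10:  # skip icons/bullets, as in the patch
                continue
            img_bytes = doc.extract_image(xref).get("image")
            if not img_bytes:
                continue
            img = Image.open(io.BytesIO(img_bytes)).convert("RGB")
            # Downsample before averaging, like the patch's 50x50 sampling
            img = img.resize((min(50, img.width), min(50, img.height)))
            avg = ImageStat.Stat(img).mean  # per-band (R, G, B) averages
            if all(c <= 30 for c in avg):
                color_type = "image_black"
            elif all(c >= 245 for c in avg):
                color_type = "image_white"
            else:
                continue
            # One image object can be placed at several positions on the page
            for rect in page.get_image_rects(xref):
                print(f"page {page_index}: {color_type} at {tuple(rect)}")
    doc.close()


if __name__ == "__main__":
    find_covering_images("sample.pdf")  # hypothetical input

Running this against a scanned-and-redacted PDF should print one line per placement of each mostly-black or mostly-white image, matching the bboxes the engine would merge into covered_word_bboxes.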