feat: add table detection options and scan artifact removal

- Add TableDetectionSelector component for wired/wireless/region detection
- Add CV-based table line detector module (disabled due to poor performance)
- Add scan artifact removal preprocessing step (removes faint horizontal lines)
- Add PreprocessingConfig schema with remove_scan_artifacts option
- Update frontend PreprocessingSettings with scan artifact toggle
- Integrate table detection config into ProcessingPage
- Archive extract-table-cell-boxes proposal

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-30 13:21:50 +08:00
parent f5a2c8a750
commit 95ae1f1bdb
17 changed files with 1906 additions and 344 deletions
--- a/backend/app/services/layout_preprocessing_service.py
+++ b/backend/app/services/layout_preprocessing_service.py
@@ -184,6 +184,99 @@ class LayoutPreprocessingService:

        return normalized

+    def remove_scan_artifacts(
+        self,
+        image: np.ndarray,
+        line_thickness: int = 5,
+        min_line_length_ratio: float = 0.3,
+        faint_threshold: int = 30
+    ) -> np.ndarray:
+        """
+        Remove faint horizontal scan line artifacts from a scanned document image.
+
+        Scanner light bar artifacts appear as FAINT horizontal lines across the image.
+        Key distinction from table borders:
+        - Scan artifacts are LIGHT/FAINT (close to background color)
+        - Table borders are DARK/BOLD (high contrast)
+
+        Method:
+        1. Detect horizontal edges using a Sobel filter (vertical gradient)
+        2. Keep only FAINT edges: 5 < |gradient| < faint_threshold
+        3. Morphological opening keeps only long continuous horizontal runs
+        4. Dilate the runs into a mask and inpaint the masked pixels
+
+        The input is returned unchanged when fewer than 100 mask pixels are
+        found or when the mask covers more than 15% of the image.
+
+        Args:
+            image: Input image (BGR, or 2-D grayscale)
+            line_thickness: Height in pixels of the dilation applied to detected lines
+            min_line_length_ratio: Minimum line length as ratio of image width (0.0-1.0)
+            faint_threshold: Exclusive upper bound on edge strength for "faint" lines (0-255)
+
+        Returns:
+            Image with scan artifacts inpainted, or the input image itself if
+            nothing was removed
+        """
+        h, w = image.shape[:2]
+        # Clamp to 1 so a tiny image or a zero ratio cannot produce an invalid
+        # zero-width structuring element.
+        min_line_length = max(1, int(w * min_line_length_ratio))
+
+        # Detection works on a single-channel image
+        if len(image.shape) == 3:
+            gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
+        else:
+            gray = image.copy()
+
+        # Step 1: Detect horizontal edges using Sobel (vertical gradient)
+        # The 64-bit response can exceed 255, so saturate before narrowing to uint8;
+        # a plain cast of out-of-range values typically wraps strong edges into the faint range.
+        sobel_y = cv2.Sobel(gray, cv2.CV_64F, 0, 1, ksize=3)
+        sobel_abs = np.clip(np.abs(sobel_y), 0, 255).astype(np.uint8)
+
+        # Step 2: Keep FAINT horizontal edges only (low gradient magnitude)
+        # Strong edges (table borders) have high sobel values and are excluded;
+        # responses of 5 or less are ignored as well.
+        faint_edges = (sobel_abs > 5) & (sobel_abs < faint_threshold)
+        faint_edges = faint_edges.astype(np.uint8) * 255
+
+        # Step 3: Opening with a 1-pixel-high, min_line_length-wide kernel removes
+        # short segments, keeping only long horizontal lines
+        horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (min_line_length, 1))
+        horizontal_lines = cv2.morphologyEx(faint_edges, cv2.MORPH_OPEN, horizontal_kernel, iterations=1)
+
+        # Dilate slightly to cover the full artifact width
+        dilate_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, max(1, line_thickness)))
+        line_mask = cv2.dilate(horizontal_lines, dilate_kernel, iterations=1)
+
+        # Too few mask pixels: treat as "nothing detected"
+        artifact_pixels = int(np.count_nonzero(line_mask))
+        if artifact_pixels < 100:
+            logger.debug("No faint scan artifacts detected")
+            return image
+
+        # Fraction of the image covered by the mask
+        coverage_ratio = artifact_pixels / (h * w)
+
+        # Reject first: more than 15% coverage is definitely too much to be artifacts
+        if coverage_ratio > 0.15:
+            logger.debug(f"Artifact detection rejected: coverage too high ({coverage_ratio:.2%})")
+            return image
+
+        # More than 5% is suspicious but still processed
+        if coverage_ratio > 0.05:
+            logger.debug(f"Faint artifact detection: coverage={coverage_ratio:.2%} (processing anyway)")
+
+        # Use inpainting to remove artifacts
+        result = cv2.inpaint(image, line_mask, inpaintRadius=3, flags=cv2.INPAINT_TELEA)
+
+        logger.info(
+            f"Scan artifacts removed: {artifact_pixels} pixels ({coverage_ratio:.2%}), faint_threshold={faint_threshold}"
+        )
+
+        return result
+
    def scale_for_layout_detection(
        self,
        image: np.ndarray,
@@ -346,9 +439,13 @@ class LayoutPreprocessingService:
        # Only enable for extremely low contrast (< 15) which indicates a scan quality issue
        binarize = False  # Disabled by default

+        # Scan artifact removal is always enabled in auto mode for scanned documents
+        remove_scan_artifacts = True
+
        logger.debug(
            f"Auto config: contrast={contrast} strength={contrast_strength:.2f}, "
-            f"sharpen={sharpen} strength={sharpen_strength:.2f}, binarize={binarize}"
+            f"sharpen={sharpen} strength={sharpen_strength:.2f}, binarize={binarize}, "
+            f"remove_scan_artifacts={remove_scan_artifacts}"
        )

        return PreprocessingConfig(
@@ -356,7 +453,8 @@ class LayoutPreprocessingService:
            contrast_strength=round(contrast_strength, 2),
            sharpen=sharpen,
            sharpen_strength=round(sharpen_strength, 2),
-            binarize=binarize
+            binarize=binarize,
+            remove_scan_artifacts=remove_scan_artifacts
        )

    def apply_contrast_enhancement(
@@ -550,7 +648,8 @@ class LayoutPreprocessingService:
                config_used=PreprocessingConfig(
                    contrast=PreprocessingContrastEnum.NONE,
                    sharpen=False,
-                    binarize=False
+                    binarize=False,
+                    remove_scan_artifacts=False
                ),
                quality_metrics=metrics,
                was_processed=scaling_info.was_scaled,  # True if scaling was applied
@@ -568,6 +667,13 @@ class LayoutPreprocessingService:
        processed = scaled_image.copy()
        was_processed = scaling_info.was_scaled  # Start with True if already scaled

+        # Step 0: Remove scan artifacts BEFORE any enhancement
+        # This prevents scanner light bar lines from being enhanced and misdetected as table borders
+        if getattr(config, 'remove_scan_artifacts', True):  # Default True for backwards compatibility
+            processed = self.remove_scan_artifacts(processed)
+            was_processed = True
+            logger.debug("Applied scan artifact removal")
+
        # Step 1: Contrast enhancement
        if config.contrast != PreprocessingContrastEnum.NONE:
            processed = self.apply_contrast_enhancement(