feat: add table detection options and scan artifact removal
- Add TableDetectionSelector component for wired/wireless/region detection
- Add CV-based table line detector module (disabled due to poor performance)
- Add scan artifact removal preprocessing step (removes faint horizontal lines)
- Add PreprocessingConfig schema with remove_scan_artifacts option
- Update frontend PreprocessingSettings with scan artifact toggle
- Integrate table detection config into ProcessingPage
- Archive extract-table-cell-boxes proposal

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -184,6 +184,99 @@ class LayoutPreprocessingService:
|
||||
|
||||
return normalized
|
||||
|
||||
def remove_scan_artifacts(
    self,
    image: np.ndarray,
    line_thickness: int = 5,
    min_line_length_ratio: float = 0.3,
    faint_threshold: int = 30
) -> np.ndarray:
    """
    Remove horizontal scan line artifacts from scanned documents.

    Scanner light bar artifacts appear as FAINT horizontal lines across the image.
    Key distinction from table borders:
    - Scan artifacts are LIGHT/FAINT (close to background color)
    - Table borders are DARK/BOLD (high contrast)

    Method:
    1. Detect horizontal edges using Sobel filter
    2. Filter to keep only FAINT edges (low contrast)
    3. Find continuous horizontal segments
    4. Remove only faint horizontal lines while preserving bold table borders

    The input is returned unchanged when fewer than 100 mask pixels are
    found, or when the mask covers more than 15% of the image (the
    detection is then considered unreliable).

    Args:
        image: Input image (BGR, or already single-channel grayscale)
        line_thickness: Maximum thickness of lines to remove (pixels)
        min_line_length_ratio: Minimum line length as ratio of image width (0.0-1.0)
        faint_threshold: Maximum edge strength for "faint" lines (0-255)

    Returns:
        Image with scan artifacts removed (same channel layout as the input)
    """
    h, w = image.shape[:2]

    # Clamp kernel dimensions to at least 1 px: a very narrow image or a
    # tiny ratio would otherwise produce a zero-sized structuring element,
    # which is not a valid kernel for the morphology calls below.
    min_line_length = max(1, int(w * min_line_length_ratio))
    line_thickness = max(1, int(line_thickness))

    # Convert to grayscale for detection
    if len(image.shape) == 3:
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    else:
        gray = image.copy()

    # Step 1: Detect horizontal edges using Sobel (vertical gradient).
    # Scan artifacts will have weak gradients, table borders strong ones.
    sobel_y = cv2.Sobel(gray, cv2.CV_64F, 0, 1, ksize=3)

    # Keep the magnitude in float64. A 3x3 Sobel response on 8-bit input
    # can be far larger than 255; narrowing it to uint8 does not saturate,
    # so strong (table border) edges could fall back into the "faint"
    # range and be erased.
    sobel_abs = np.abs(sobel_y)

    # Step 2: Find FAINT horizontal edges only (low gradient magnitude).
    # The lower bound of 5 discards near-flat background noise.
    faint_edges = (sobel_abs > 5) & (sobel_abs < faint_threshold)
    faint_edges = faint_edges.astype(np.uint8) * 255

    # Step 3: Use horizontal morphological operations to find continuous lines
    horizontal_kernel = cv2.getStructuringElement(
        cv2.MORPH_RECT,
        (min_line_length, 1)
    )

    # Opening removes short segments, keeping only long horizontal lines
    horizontal_lines = cv2.morphologyEx(
        faint_edges, cv2.MORPH_OPEN, horizontal_kernel, iterations=1
    )

    # Dilate slightly to cover the full artifact width
    dilate_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, line_thickness))
    line_mask = cv2.dilate(horizontal_lines, dilate_kernel, iterations=1)

    # Check if any artifacts were detected
    artifact_pixels = int(np.count_nonzero(line_mask))
    if artifact_pixels < 100:
        logger.debug("No faint scan artifacts detected")
        return image

    # Calculate artifact coverage
    total_pixels = h * w
    coverage_ratio = artifact_pixels / total_pixels

    # Faint artifacts should cover a small portion of the image
    if coverage_ratio > 0.05:  # More than 5% is suspicious
        logger.debug(f"Faint artifact detection: coverage={coverage_ratio:.2%} (processing anyway)")

    # Only process if coverage is not excessive
    if coverage_ratio > 0.15:  # More than 15% is definitely too much
        logger.debug(f"Artifact detection rejected: coverage too high ({coverage_ratio:.2%})")
        return image

    # Use inpainting to remove artifacts
    result = cv2.inpaint(image, line_mask, inpaintRadius=3, flags=cv2.INPAINT_TELEA)

    logger.info(
        f"Scan artifacts removed: {artifact_pixels} pixels ({coverage_ratio:.2%}), faint_threshold={faint_threshold}"
    )

    return result
|
||||
|
||||
def scale_for_layout_detection(
|
||||
self,
|
||||
image: np.ndarray,
|
||||
@@ -346,9 +439,13 @@ class LayoutPreprocessingService:
|
||||
# Only enable for extremely low contrast (< 15) which indicates a scan quality issue
|
||||
binarize = False # Disabled by default
|
||||
|
||||
# Scan artifact removal is always enabled in auto mode for scanned documents
|
||||
remove_scan_artifacts = True
|
||||
|
||||
logger.debug(
|
||||
f"Auto config: contrast={contrast} strength={contrast_strength:.2f}, "
|
||||
f"sharpen={sharpen} strength={sharpen_strength:.2f}, binarize={binarize}"
|
||||
f"sharpen={sharpen} strength={sharpen_strength:.2f}, binarize={binarize}, "
|
||||
f"remove_scan_artifacts={remove_scan_artifacts}"
|
||||
)
|
||||
|
||||
return PreprocessingConfig(
|
||||
@@ -356,7 +453,8 @@ class LayoutPreprocessingService:
|
||||
contrast_strength=round(contrast_strength, 2),
|
||||
sharpen=sharpen,
|
||||
sharpen_strength=round(sharpen_strength, 2),
|
||||
binarize=binarize
|
||||
binarize=binarize,
|
||||
remove_scan_artifacts=remove_scan_artifacts
|
||||
)
|
||||
|
||||
def apply_contrast_enhancement(
|
||||
@@ -550,7 +648,8 @@ class LayoutPreprocessingService:
|
||||
config_used=PreprocessingConfig(
|
||||
contrast=PreprocessingContrastEnum.NONE,
|
||||
sharpen=False,
|
||||
binarize=False
|
||||
binarize=False,
|
||||
remove_scan_artifacts=False
|
||||
),
|
||||
quality_metrics=metrics,
|
||||
was_processed=scaling_info.was_scaled, # True if scaling was applied
|
||||
@@ -568,6 +667,13 @@ class LayoutPreprocessingService:
|
||||
processed = scaled_image.copy()
|
||||
was_processed = scaling_info.was_scaled # Start with True if already scaled
|
||||
|
||||
# Step 0: Remove scan artifacts BEFORE any enhancement
|
||||
# This prevents scanner light bar lines from being enhanced and misdetected as table borders
|
||||
if getattr(config, 'remove_scan_artifacts', True): # Default True for backwards compatibility
|
||||
processed = self.remove_scan_artifacts(processed)
|
||||
was_processed = True
|
||||
logger.debug("Applied scan artifact removal")
|
||||
|
||||
# Step 1: Contrast enhancement
|
||||
if config.contrast != PreprocessingContrastEnum.NONE:
|
||||
processed = self.apply_contrast_enhancement(
|
||||
|
||||
Reference in New Issue
Block a user