feat: add table detection options and scan artifact removal

- Add TableDetectionSelector component for wired/wireless/region detection
- Add CV-based table line detector module (disabled due to poor performance)
- Add scan artifact removal preprocessing step (removes faint horizontal lines)
- Add PreprocessingConfig schema with remove_scan_artifacts option
- Update frontend PreprocessingSettings with scan artifact toggle
- Integrate table detection config into ProcessingPage
- Archive extract-table-cell-boxes proposal

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
egg
2025-11-30 13:21:50 +08:00
parent f5a2c8a750
commit 95ae1f1bdb
17 changed files with 1906 additions and 344 deletions

View File

@@ -184,6 +184,99 @@ class LayoutPreprocessingService:
return normalized
def remove_scan_artifacts(
    self,
    image: np.ndarray,
    line_thickness: int = 5,
    min_line_length_ratio: float = 0.3,
    faint_threshold: int = 30
) -> np.ndarray:
    """
    Remove faint horizontal scan-line artifacts from a scanned document image.

    Scanner light bar artifacts appear as FAINT horizontal lines across the
    image. Key distinction from table borders:
    - Scan artifacts are LIGHT/FAINT (close to background color)
    - Table borders are DARK/BOLD (high contrast)

    Method:
    1. Compute the vertical gradient with a Sobel filter (responds to
       horizontal edges).
    2. Keep only pixels whose gradient magnitude lies strictly between a
       small noise floor (5) and ``faint_threshold``.
    3. Morphologically open with a wide 1-pixel-high kernel so only long,
       continuous horizontal runs survive.
    4. Dilate the surviving runs vertically and inpaint them.

    The image is returned unchanged when fewer than 100 mask pixels are
    found, or when the mask covers more than 15% of the image (treated as
    a false detection).

    Args:
        image: Input image (BGR, or already single-channel grayscale).
        line_thickness: Maximum thickness of lines to remove (pixels).
        min_line_length_ratio: Minimum line length as ratio of image width (0.0-1.0).
        faint_threshold: Maximum edge strength for "faint" lines. Compared
            against the raw Sobel magnitude, which is NOT capped at 255.

    Returns:
        Image with scan artifacts removed, or the input image object itself
        when nothing was removed.
    """
    h, w = image.shape[:2]

    # Kernel dimensions are clamped to at least 1 so that a very narrow image
    # or a zero ratio/thickness cannot request a zero-sized structuring element.
    min_line_length = max(1, int(w * min_line_length_ratio))
    dilate_height = max(1, int(line_thickness))

    # Convert to grayscale for detection
    if len(image.shape) == 3:
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    else:
        gray = image.copy()

    # Step 1: Detect horizontal edges using Sobel (vertical gradient).
    # Scan artifacts produce weak gradients, table borders produce strong ones.
    sobel_y = cv2.Sobel(gray, cv2.CV_64F, 0, 1, ksize=3)

    # Keep the magnitude in floating point. With a 3x3 Sobel kernel on 8-bit
    # input the magnitude can reach 4 * 255 = 1020; narrowing it to uint8 with
    # astype() does not saturate, so strong edges (e.g. 270) could alias into
    # the "faint" band and bold table borders would be erased.
    gradient_magnitude = np.abs(sobel_y)

    # Step 2: Keep FAINT horizontal edges only (above noise floor, below threshold).
    faint_edges = (gradient_magnitude > 5) & (gradient_magnitude < faint_threshold)
    faint_edges = faint_edges.astype(np.uint8) * 255

    # Step 3: Use horizontal morphological opening to find continuous lines.
    horizontal_kernel = cv2.getStructuringElement(
        cv2.MORPH_RECT,
        (min_line_length, 1)
    )
    # Opening removes short segments, keeping only long horizontal lines
    horizontal_lines = cv2.morphologyEx(
        faint_edges, cv2.MORPH_OPEN, horizontal_kernel, iterations=1
    )

    # Dilate slightly to cover the full artifact thickness, not just its edges
    dilate_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, dilate_height))
    line_mask = cv2.dilate(horizontal_lines, dilate_kernel, iterations=1)

    # Check if any artifacts were detected
    artifact_pixels = int(np.count_nonzero(line_mask))
    if artifact_pixels < 100:
        logger.debug("No faint scan artifacts detected")
        return image

    # Calculate artifact coverage
    total_pixels = h * w
    coverage_ratio = artifact_pixels / total_pixels

    # Faint artifacts should cover a small portion of the image
    if coverage_ratio > 0.05:  # More than 5% is suspicious
        logger.debug(f"Faint artifact detection: coverage={coverage_ratio:.2%} (processing anyway)")

    # Only process if coverage is not excessive
    if coverage_ratio > 0.15:  # More than 15% is definitely too much
        logger.debug(f"Artifact detection rejected: coverage too high ({coverage_ratio:.2%})")
        return image

    # Use inpainting to fill the masked rows from surrounding pixels
    result = cv2.inpaint(image, line_mask, inpaintRadius=3, flags=cv2.INPAINT_TELEA)

    logger.info(
        f"Scan artifacts removed: {artifact_pixels} pixels ({coverage_ratio:.2%}), faint_threshold={faint_threshold}"
    )
    return result
def scale_for_layout_detection(
self,
image: np.ndarray,
@@ -346,9 +439,13 @@ class LayoutPreprocessingService:
# Only enable for extremely low contrast (< 15) which indicates a scan quality issue
binarize = False # Disabled by default
# Scan artifact removal is always enabled in auto mode for scanned documents
remove_scan_artifacts = True
logger.debug(
f"Auto config: contrast={contrast} strength={contrast_strength:.2f}, "
f"sharpen={sharpen} strength={sharpen_strength:.2f}, binarize={binarize}"
f"sharpen={sharpen} strength={sharpen_strength:.2f}, binarize={binarize}, "
f"remove_scan_artifacts={remove_scan_artifacts}"
)
return PreprocessingConfig(
@@ -356,7 +453,8 @@ class LayoutPreprocessingService:
contrast_strength=round(contrast_strength, 2),
sharpen=sharpen,
sharpen_strength=round(sharpen_strength, 2),
binarize=binarize
binarize=binarize,
remove_scan_artifacts=remove_scan_artifacts
)
def apply_contrast_enhancement(
@@ -550,7 +648,8 @@ class LayoutPreprocessingService:
config_used=PreprocessingConfig(
contrast=PreprocessingContrastEnum.NONE,
sharpen=False,
binarize=False
binarize=False,
remove_scan_artifacts=False
),
quality_metrics=metrics,
was_processed=scaling_info.was_scaled, # True if scaling was applied
@@ -568,6 +667,13 @@ class LayoutPreprocessingService:
processed = scaled_image.copy()
was_processed = scaling_info.was_scaled # Start with True if already scaled
# Step 0: Remove scan artifacts BEFORE any enhancement
# This prevents scanner light bar lines from being enhanced and misdetected as table borders
if getattr(config, 'remove_scan_artifacts', True): # Default True for backwards compatibility
processed = self.remove_scan_artifacts(processed)
was_processed = True
logger.debug("Applied scan artifact removal")
# Step 1: Contrast enhancement
if config.contrast != PreprocessingContrastEnum.NONE:
processed = self.apply_contrast_enhancement(