feat: add table detection options and scan artifact removal

- Add TableDetectionSelector component for wired/wireless/region detection
- Add CV-based table line detector module (disabled due to poor performance)
- Add scan artifact removal preprocessing step (removes faint horizontal lines)
- Add PreprocessingConfig schema with remove_scan_artifacts option
- Update frontend PreprocessingSettings with scan artifact toggle
- Integrate table detection config into ProcessingPage
- Archive extract-table-cell-boxes proposal

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-30 13:21:50 +08:00
parent f5a2c8a750
commit 95ae1f1bdb
17 changed files with 1906 additions and 344 deletions
--- a/backend/tests/test_layered_rendering.py
+++ b/backend/tests/test_layered_rendering.py
@@ -0,0 +1,135 @@
+"""
+Test script for layered rendering approach.
+Tests that table borders are drawn from cell_boxes
+while text is rendered at raw OCR positions.
+"""
+import sys
+sys.path.insert(0, '/home/egg/project/Tool_OCR/backend')
+
+import json
+from pathlib import Path
+from app.services.pdf_generator_service import PDFGeneratorService
+from app.services.gap_filling_service import GapFillingService
+
+
+def _load_raw_ocr_regions(path):
+    """Return the raw OCR regions stored in the JSON file at *path*.
+
+    The payload is either a bare list of regions or a dict holding the list
+    under 'text_regions'. A missing file or any other shape yields [].
+    """
+    if not path.exists():
+        return []
+    print(f"[INFO] Loading raw OCR regions from {path}")
+    with open(path, 'r', encoding='utf-8') as f:
+        data = json.load(f)
+    if isinstance(data, dict):
+        data = data.get('text_regions', [])
+    regions = data if isinstance(data, list) else []
+    print(f"[INFO] Raw OCR regions: {len(regions)}")
+    return regions
+
+
+def _count_text_elements(doc):
+    """Count the text, paragraph and title elements across all pages of *doc*."""
+    return sum(
+        elem.type.value in ('text', 'paragraph', 'title')
+        for page in doc.pages for elem in page.elements
+    )
+
+
+def test_layered_rendering():
+    """Render a stored OCR result to PDF using the layered approach.
+
+    Loads scan_result.json for a fixed task, gap-fills every page from its raw
+    OCR regions and writes test_layered_rendering.pdf into the result directory.
+
+    Returns:
+        True if the PDF was generated; False if inputs are missing, the
+        document cannot be parsed or PDF generation fails.
+    """
+    # Use existing test task
+    task_id = "84899366-f361-44f1-b989-5aba72419ca5"
+    result_dir = Path(f"/home/egg/project/Tool_OCR/backend/storage/results/{task_id}")
+
+    if not result_dir.exists():
+        print(f"[ERROR] Result directory not found: {result_dir}")
+        return False
+
+    scan_result_path = result_dir / "scan_result.json"
+    if not scan_result_path.exists():
+        print("[ERROR] scan_result.json not found")
+        return False
+
+    print(f"[INFO] Loading scan_result.json from {scan_result_path}")
+    with open(scan_result_path, 'r', encoding='utf-8') as f:
+        scan_result = json.load(f)
+
+    # scan_result IS the unified document (not nested under 'unified_document')
+    pdf_service = PDFGeneratorService()
+    unified_doc = pdf_service._json_to_unified_document(scan_result, result_dir)
+    if not unified_doc:
+        print("[ERROR] Failed to parse UnifiedDocument")
+        return False
+
+    print(f"[INFO] UnifiedDocument: {unified_doc.page_count} pages")
+
+    # Report each table; cell_boxes live in element metadata, not content
+    table_count = 0
+    for page in unified_doc.pages:
+        for elem in page.elements:
+            if elem.type.value != 'table':
+                continue
+            table_count += 1
+            metadata = elem.metadata or {}
+            cell_boxes = metadata.get('cell_boxes', [])
+            embedded_images = metadata.get('embedded_images', [])
+            print(f"[INFO] Table {elem.element_id}: {len(cell_boxes)} cell_boxes, {len(embedded_images)} embedded_images")
+    text_count = _count_text_elements(unified_doc)
+    print(f"[INFO] Tables: {table_count}, Text elements: {text_count}")
+
+    # Gap filling rewrites page.elements in place, so unified_doc is updated.
+    print("[INFO] Applying GapFillingService...")
+    gap_service = GapFillingService()
+    for position, page in enumerate(unified_doc.pages, start=1):
+        # Use each page's OWN raw OCR dump; page-1 regions must not leak into
+        # other pages. Assumes they follow the page_1 file naming - TODO confirm.
+        raw_ocr_regions = _load_raw_ocr_regions(
+            result_dir / f"{task_id}_scan_page_{position}_raw_ocr_regions.json"
+        )
+        filled_elements, stats = gap_service.fill_gaps(
+            raw_ocr_regions=raw_ocr_regions,
+            pp_structure_elements=page.elements,
+            page_number=page.page_number,
+            pp_dimensions=page.dimensions,
+        )
+        page.elements = filled_elements
+        print(f"[INFO] Page {page.page_number}: Added {stats.get('gaps_filled', 0)} gap-filled regions")
+
+    final_text_count = _count_text_elements(unified_doc)
+    print(f"[INFO] After gap filling: {final_text_count} text elements (was {text_count})")
+
+    print("[INFO] Generating PDF with layered rendering...")
+    output_pdf = result_dir / "test_layered_rendering.pdf"
+    try:
+        success = pdf_service.generate_from_unified_document(
+            unified_doc=unified_doc,
+            output_path=output_pdf
+        )
+        if success:
+            print(f"[SUCCESS] PDF generated: {output_pdf}")
+            print(f"[INFO] PDF size: {output_pdf.stat().st_size} bytes")
+            return True
+        print("[ERROR] PDF generation returned False")
+        return False
+    except Exception as e:
+        # Script boundary: report the failure with its traceback, don't crash
+        print(f"[ERROR] PDF generation failed: {e}")
+        import traceback
+        traceback.print_exc()
+        return False
+
+
+if __name__ == "__main__":
+    # Map the boolean outcome onto the process exit status (0 = success).
+    sys.exit(0 if test_layered_rendering() else 1)