test

2025-12-04 18:00:37 +08:00
parent 9437387ef1
commit 8265be1741
22 changed files with 2672 additions and 196 deletions
--- a/backend/tests/test_phase1_fixes.py
+++ b/backend/tests/test_phase1_fixes.py
@@ -0,0 +1,285 @@
+"""
+Phase 1 Bug Fixes Verification Tests
+
+Tests for:
+1.1 Direct Track table cell merging
+1.2 OCR Track image path preservation
+1.3 Cell boxes coordinate validation
+1.4 Tiny decoration image filtering
+1.5 Covering image removal
+"""
+
+import sys
+import os
+from pathlib import Path
+
+# Add backend to path
+sys.path.insert(0, str(Path(__file__).parent.parent))
+
+import fitz
+from app.services.direct_extraction_engine import DirectExtractionEngine
+from app.services.ocr_to_unified_converter import validate_cell_boxes
+from app.models.unified_document import TableCell
+
+
+def test_1_1_table_cell_merging():
+    """Test 1.1.5: Verify edit3.pdf returns correct merged cells"""
+    print("\n" + "="*60)
+    print("TEST 1.1: Direct Track Table Cell Merging")
+    print("="*60)
+
+    pdf_path = Path(__file__).parent.parent.parent / "demo_docs" / "edit3.pdf"
+    if not pdf_path.exists():
+        print(f"SKIP: {pdf_path} not found")
+        return False
+
+    doc = fitz.open(str(pdf_path))
+
+    total_cells = 0
+    merged_cells = 0
+
+    for page_num, page in enumerate(doc):
+        tables = page.find_tables()
+        for table_idx, table in enumerate(tables.tables):
+            data = table.extract()
+            cell_rects = getattr(table, 'cells', None)
+
+            if cell_rects:
+                num_rows = len(data)
+                num_cols = max(len(row) for row in data) if data else 0
+
+                # Count actual cells (non-None)
+                actual_cells = sum(1 for c in cell_rects if c is not None)
+                none_cells = sum(1 for c in cell_rects if c is None)
+
+                print(f"  Page {page_num}, Table {table_idx}:")
+                print(f"    Grid size: {num_rows} x {num_cols} = {num_rows * num_cols} positions")
+                print(f"    Actual cells: {actual_cells}")
+                print(f"    Merged positions (None): {none_cells}")
+
+                total_cells += actual_cells
+                if none_cells > 0:
+                    merged_cells += 1
+
+    doc.close()
+
+    print(f"\n  Total actual cells across all tables: {total_cells}")
+    print(f"  Tables with merging: {merged_cells}")
+
+    # According to PLAN.md, edit3.pdf should have 83 cells (not 204)
+    # The presence of None values indicates merging is detected
+    if total_cells > 0 and total_cells < 204:
+        print("  RESULT: PASS - Cell merging detected correctly")
+        return True
+    elif total_cells == 204:
+        print("  RESULT: FAIL - All cells treated as 1x1 (no merging detected)")
+        return False
+    else:
+        print(f"  RESULT: INCONCLUSIVE - {total_cells} cells found")
+        return None
+
+
+def test_1_3_cell_boxes_validation():
+    """Test 1.3: Verify cell_boxes coordinate validation"""
+    print("\n" + "="*60)
+    print("TEST 1.3: Cell Boxes Coordinate Validation")
+    print("="*60)
+
+    # Test case 1: Valid coordinates
+    valid_boxes = [
+        [10, 10, 100, 50],
+        [100, 10, 200, 50],
+        [10, 50, 200, 100]
+    ]
+    result = validate_cell_boxes(valid_boxes, [0, 0, 300, 200], 300, 200)
+    print(f"  Valid boxes: valid={result['valid']}, invalid_count={result['invalid_count']}")
+    assert result['valid'], "Valid boxes should pass validation"
+
+    # Test case 2: Out of bounds coordinates
+    invalid_boxes = [
+        [-10, 10, 100, 50],    # x0 < 0
+        [10, 10, 400, 50],     # x1 > page_width
+        [10, 10, 100, 300]     # y1 > page_height
+    ]
+    result = validate_cell_boxes(invalid_boxes, [0, 0, 300, 200], 300, 200)
+    print(f"  Invalid boxes: valid={result['valid']}, invalid_count={result['invalid_count']}")
+    assert not result['valid'], "Invalid boxes should fail validation"
+    assert result['invalid_count'] == 3, "Should detect 3 invalid boxes"
+
+    # Test case 3: Clamping
+    assert len(result['clamped_boxes']) == 3, "Should return clamped boxes"
+    clamped = result['clamped_boxes'][0]
+    assert clamped[0] >= 0, "Clamped x0 should be >= 0"
+
+    print("  RESULT: PASS - Coordinate validation works correctly")
+    return True
+
+
+def test_1_4_tiny_image_filtering():
+    """Test 1.4: Verify tiny decoration image filtering"""
+    print("\n" + "="*60)
+    print("TEST 1.4: Tiny Decoration Image Filtering")
+    print("="*60)
+
+    pdf_path = Path(__file__).parent.parent.parent / "demo_docs" / "edit3.pdf"
+    if not pdf_path.exists():
+        print(f"SKIP: {pdf_path} not found")
+        return None
+
+    doc = fitz.open(str(pdf_path))
+
+    tiny_count = 0
+    normal_count = 0
+    min_area = 200  # Same threshold as in DirectExtractionEngine
+
+    for page_num, page in enumerate(doc):
+        images = page.get_images()
+        for img in images:
+            xref = img[0]
+            rects = page.get_image_rects(xref)
+            if rects:
+                rect = rects[0]
+                area = (rect.x1 - rect.x0) * (rect.y1 - rect.y0)
+                if area < min_area:
+                    tiny_count += 1
+                    print(f"  Page {page_num}: Tiny image xref={xref}, area={area:.1f} px²")
+                else:
+                    normal_count += 1
+
+    doc.close()
+
+    print(f"\n  Tiny images (< {min_area} px²): {tiny_count}")
+    print(f"  Normal images: {normal_count}")
+
+    if tiny_count > 0:
+        print("  RESULT: PASS - Tiny images detected, will be filtered")
+        return True
+    else:
+        print("  RESULT: INFO - No tiny images found in test file")
+        return None
+
+
+def test_1_5_covering_image_detection():
+    """Test 1.5: Verify covering image detection"""
+    print("\n" + "="*60)
+    print("TEST 1.5: Covering Image Detection")
+    print("="*60)
+
+    pdf_path = Path(__file__).parent.parent.parent / "demo_docs" / "edit3.pdf"
+    if not pdf_path.exists():
+        print(f"SKIP: {pdf_path} not found")
+        return None
+
+    engine = DirectExtractionEngine(
+        enable_whiteout_detection=True,
+        whiteout_iou_threshold=0.8
+    )
+
+    doc = fitz.open(str(pdf_path))
+
+    total_covering = 0
+    for page_num, page in enumerate(doc):
+        result = engine._preprocess_page(page, page_num, doc)
+        covering_images = result.get('covering_images', [])
+
+        if covering_images:
+            print(f"  Page {page_num}: {len(covering_images)} covering images detected")
+            for img in covering_images[:3]:  # Show first 3
+                print(f"    - xref={img.get('xref')}, type={img.get('color_type')}, "
+                      f"bbox={[round(x, 1) for x in img.get('bbox', [])]}")
+            total_covering += len(covering_images)
+
+    doc.close()
+
+    print(f"\n  Total covering images detected: {total_covering}")
+
+    if total_covering > 0:
+        print("  RESULT: PASS - Covering images detected, will be filtered")
+        return True
+    else:
+        print("  RESULT: INFO - No covering images found in test file")
+        return None
+
+
+def test_direct_extraction_full():
+    """Full integration test for Direct Track extraction"""
+    print("\n" + "="*60)
+    print("INTEGRATION TEST: Direct Track Full Extraction")
+    print("="*60)
+
+    pdf_path = Path(__file__).parent.parent.parent / "demo_docs" / "edit3.pdf"
+    if not pdf_path.exists():
+        print(f"SKIP: {pdf_path} not found")
+        return None
+
+    engine = DirectExtractionEngine(
+        enable_table_detection=True,
+        enable_image_extraction=True,
+        min_image_area=200.0,
+        enable_whiteout_detection=True
+    )
+
+    try:
+        result = engine.extract(pdf_path)  # Pass Path object, not string
+
+        # Count elements
+        table_count = 0
+        image_count = 0
+        merged_table_count = 0
+
+        for page in result.pages:
+            for elem in page.elements:
+                if elem.type.value == 'table':
+                    table_count += 1
+                    if elem.content and hasattr(elem.content, 'cells'):
+                        # Check for merged cells
+                        for cell in elem.content.cells:
+                            if cell.row_span > 1 or cell.col_span > 1:
+                                merged_table_count += 1
+                                break
+                elif elem.type.value == 'image':
+                    image_count += 1
+
+        print(f"  Document ID: {result.document_id}")
+        print(f"  Pages: {len(result.pages)}")
+        print(f"  Tables: {table_count} (with merging: {merged_table_count})")
+        print(f"  Images: {image_count}")
+
+        print("  RESULT: PASS - Extraction completed successfully")
+        return True
+
+    except Exception as e:
+        print(f"  RESULT: FAIL - {e}")
+        import traceback
+        traceback.print_exc()
+        return False
+
+
+if __name__ == "__main__":
+    print("="*60)
+    print("Phase 1 Bug Fixes Verification Tests")
+    print("="*60)
+
+    results = {}
+
+    # Run tests
+    results['1.1_table_merging'] = test_1_1_table_cell_merging()
+    results['1.3_coord_validation'] = test_1_3_cell_boxes_validation()
+    results['1.4_tiny_filtering'] = test_1_4_tiny_image_filtering()
+    results['1.5_covering_detection'] = test_1_5_covering_image_detection()
+    results['integration'] = test_direct_extraction_full()
+
+    # Summary
+    print("\n" + "="*60)
+    print("TEST SUMMARY")
+    print("="*60)
+
+    for test_name, result in results.items():
+        status = "PASS" if result is True else "FAIL" if result is False else "SKIP/INFO"
+        print(f"  {test_name}: {status}")
+
+    passed = sum(1 for r in results.values() if r is True)
+    failed = sum(1 for r in results.values() if r is False)
+    skipped = sum(1 for r in results.values() if r is None)
+
+    print(f"\n  Total: {passed} passed, {failed} failed, {skipped} skipped/info")