test
This commit is contained in:
285
backend/tests/test_phase1_fixes.py
Normal file
285
backend/tests/test_phase1_fixes.py
Normal file
@@ -0,0 +1,285 @@
|
||||
"""
|
||||
Phase 1 Bug Fixes Verification Tests
|
||||
|
||||
Tests for:
|
||||
1.1 Direct Track table cell merging
|
||||
1.2 OCR Track image path preservation (no test for this item is defined in this file)
|
||||
1.3 Cell boxes coordinate validation
|
||||
1.4 Tiny decoration image filtering
|
||||
1.5 Covering image removal
|
||||
"""
|
||||
|
||||
import sys
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
# Add backend to path
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||
|
||||
import fitz
|
||||
from app.services.direct_extraction_engine import DirectExtractionEngine
|
||||
from app.services.ocr_to_unified_converter import validate_cell_boxes
|
||||
from app.models.unified_document import TableCell
|
||||
|
||||
|
||||
def test_1_1_table_cell_merging():
    """Test 1.1.5: Verify edit3.pdf returns correct merged cells.

    Opens ``demo_docs/edit3.pdf`` with PyMuPDF, walks every table returned by
    ``page.find_tables()`` and inspects ``table.cells``. Non-``None`` entries
    are counted as actual cells; ``None`` entries are counted as merged
    positions.

    Returns:
        True  -- the total number of actual cells is strictly between 0 and 204.
        False -- exactly 204 cells were found (every grid position is 1x1).
        None  -- the PDF is missing (skipped) or the count is inconclusive.
    """
    print("\n" + "="*60)
    print("TEST 1.1: Direct Track Table Cell Merging")
    print("="*60)

    pdf_path = Path(__file__).parent.parent.parent / "demo_docs" / "edit3.pdf"
    if not pdf_path.exists():
        print(f"SKIP: {pdf_path} not found")
        # Return None (not False) so the summary in __main__ reports this as
        # SKIP/INFO, matching the other tests in this file.
        return None

    total_cells = 0
    tables_with_merging = 0

    # Context manager guarantees the document is closed even if a page
    # fails to parse part-way through.
    with fitz.open(str(pdf_path)) as doc:
        for page_num, page in enumerate(doc):
            tables = page.find_tables()
            for table_idx, table in enumerate(tables.tables):
                data = table.extract()
                cell_rects = getattr(table, 'cells', None)
                if not cell_rects:
                    continue

                num_rows = len(data)
                num_cols = max((len(row) for row in data), default=0)

                # Count actual cells (non-None); the remainder are None
                # placeholders.
                actual_cells = sum(1 for c in cell_rects if c is not None)
                none_cells = len(cell_rects) - actual_cells

                print(f" Page {page_num}, Table {table_idx}:")
                print(f" Grid size: {num_rows} x {num_cols} = {num_rows * num_cols} positions")
                print(f" Actual cells: {actual_cells}")
                print(f" Merged positions (None): {none_cells}")

                total_cells += actual_cells
                if none_cells > 0:
                    tables_with_merging += 1

    print(f"\n Total actual cells across all tables: {total_cells}")
    print(f" Tables with merging: {tables_with_merging}")

    # According to PLAN.md, edit3.pdf should have 83 cells (not 204)
    # The presence of None values indicates merging is detected
    if 0 < total_cells < 204:
        print(" RESULT: PASS - Cell merging detected correctly")
        return True
    if total_cells == 204:
        print(" RESULT: FAIL - All cells treated as 1x1 (no merging detected)")
        return False

    print(f" RESULT: INCONCLUSIVE - {total_cells} cells found")
    return None
|
||||
|
||||
|
||||
def test_1_3_cell_boxes_validation():
    """Test 1.3: Verify cell_boxes coordinate validation.

    Calls ``validate_cell_boxes`` twice: once with boxes that lie inside a
    300 x 200 page and once with boxes that each have one coordinate outside
    it. Checks the ``valid``, ``invalid_count`` and ``clamped_boxes`` entries
    of the returned mapping.

    Returns:
        True when every assertion holds.

    Raises:
        AssertionError: if any expectation about the validation result fails.
    """
    rule = "=" * 60
    print("\n" + rule)
    print("TEST 1.3: Cell Boxes Coordinate Validation")
    print(rule)

    # Case 1: every box is inside the page.
    # NOTE(review): the second argument is presumably the table/page bbox,
    # followed by page width and height -- confirm against validate_cell_boxes.
    in_bounds = [[10, 10, 100, 50], [100, 10, 200, 50], [10, 50, 200, 100]]
    outcome = validate_cell_boxes(in_bounds, [0, 0, 300, 200], 300, 200)
    print(f" Valid boxes: valid={outcome['valid']}, invalid_count={outcome['invalid_count']}")
    assert outcome['valid'], "Valid boxes should pass validation"

    # Case 2: each box violates exactly one bound.
    out_of_bounds = [
        [-10, 10, 100, 50],   # x0 < 0
        [10, 10, 400, 50],    # x1 > page_width
        [10, 10, 100, 300],   # y1 > page_height
    ]
    outcome = validate_cell_boxes(out_of_bounds, [0, 0, 300, 200], 300, 200)
    print(f" Invalid boxes: valid={outcome['valid']}, invalid_count={outcome['invalid_count']}")
    assert not outcome['valid'], "Invalid boxes should fail validation"
    assert outcome['invalid_count'] == 3, "Should detect 3 invalid boxes"

    # Case 3: the result of case 2 must also carry clamped replacements.
    assert len(outcome['clamped_boxes']) == 3, "Should return clamped boxes"
    first_clamped = outcome['clamped_boxes'][0]
    assert first_clamped[0] >= 0, "Clamped x0 should be >= 0"

    print(" RESULT: PASS - Coordinate validation works correctly")
    return True
|
||||
|
||||
|
||||
def test_1_4_tiny_image_filtering():
    """Test 1.4: Verify tiny decoration image filtering.

    Opens ``demo_docs/edit3.pdf`` with PyMuPDF and, for every image reported
    by ``page.get_images()``, measures the area of its first placement
    rectangle. Images below ``min_area`` are counted as tiny, all others as
    normal. Images with no placement rectangle are not counted.

    Returns:
        True -- at least one tiny image was found.
        None -- the PDF is missing (skipped) or it contains no tiny images.
    """
    print("\n" + "="*60)
    print("TEST 1.4: Tiny Decoration Image Filtering")
    print("="*60)

    pdf_path = Path(__file__).parent.parent.parent / "demo_docs" / "edit3.pdf"
    if not pdf_path.exists():
        print(f"SKIP: {pdf_path} not found")
        return None

    tiny_count = 0
    normal_count = 0
    min_area = 200  # Same threshold as in DirectExtractionEngine

    # Context manager guarantees the document is closed even if reading an
    # image rectangle raises.
    with fitz.open(str(pdf_path)) as doc:
        for page_num, page in enumerate(doc):
            for img in page.get_images():
                xref = img[0]
                rects = page.get_image_rects(xref)
                if not rects:
                    continue

                # Only the first placement of the image is measured.
                rect = rects[0]
                area = (rect.x1 - rect.x0) * (rect.y1 - rect.y0)
                if area < min_area:
                    tiny_count += 1
                    print(f" Page {page_num}: Tiny image xref={xref}, area={area:.1f} px²")
                else:
                    normal_count += 1

    print(f"\n Tiny images (< {min_area} px²): {tiny_count}")
    print(f" Normal images: {normal_count}")

    if tiny_count > 0:
        print(" RESULT: PASS - Tiny images detected, will be filtered")
        return True

    print(" RESULT: INFO - No tiny images found in test file")
    return None
|
||||
|
||||
|
||||
def test_1_5_covering_image_detection():
    """Test 1.5: Verify covering image detection.

    Builds a ``DirectExtractionEngine`` with whiteout detection enabled, runs
    its ``_preprocess_page`` on every page of ``demo_docs/edit3.pdf`` and
    totals the entries under the ``'covering_images'`` key of each result.

    Returns:
        True -- at least one covering image was reported.
        None -- the PDF is missing (skipped) or no covering image was found.
    """
    print("\n" + "="*60)
    print("TEST 1.5: Covering Image Detection")
    print("="*60)

    pdf_path = Path(__file__).parent.parent.parent / "demo_docs" / "edit3.pdf"
    if not pdf_path.exists():
        print(f"SKIP: {pdf_path} not found")
        return None

    engine = DirectExtractionEngine(
        enable_whiteout_detection=True,
        whiteout_iou_threshold=0.8
    )

    total_covering = 0

    # Context manager guarantees the document is closed even if the engine's
    # preprocessing raises on some page.
    with fitz.open(str(pdf_path)) as doc:
        for page_num, page in enumerate(doc):
            result = engine._preprocess_page(page, page_num, doc)
            covering_images = result.get('covering_images', [])
            if not covering_images:
                continue

            print(f" Page {page_num}: {len(covering_images)} covering images detected")
            for img in covering_images[:3]:  # Show first 3
                print(f" - xref={img.get('xref')}, type={img.get('color_type')}, "
                      f"bbox={[round(x, 1) for x in img.get('bbox', [])]}")
            total_covering += len(covering_images)

    print(f"\n Total covering images detected: {total_covering}")

    if total_covering > 0:
        print(" RESULT: PASS - Covering images detected, will be filtered")
        return True

    print(" RESULT: INFO - No covering images found in test file")
    return None
|
||||
|
||||
|
||||
def test_direct_extraction_full():
    """Full integration test for Direct Track extraction.

    Runs ``DirectExtractionEngine.extract`` on ``demo_docs/edit3.pdf`` and
    prints how many table and image elements the result contains, plus how
    many tables have at least one cell with ``row_span`` or ``col_span``
    greater than 1.

    Returns:
        True  -- extraction and counting finished without an exception.
        False -- any exception was raised (its traceback is printed).
        None  -- the PDF is missing (skipped).
    """
    rule = "=" * 60
    print("\n" + rule)
    print("INTEGRATION TEST: Direct Track Full Extraction")
    print(rule)

    sample_pdf = Path(__file__).parent.parent.parent / "demo_docs" / "edit3.pdf"
    if not sample_pdf.exists():
        print(f"SKIP: {sample_pdf} not found")
        return None

    extractor = DirectExtractionEngine(
        enable_table_detection=True,
        enable_image_extraction=True,
        min_image_area=200.0,
        enable_whiteout_detection=True,
    )

    try:
        # The engine is handed the Path object itself, not a string.
        document = extractor.extract(sample_pdf)

        n_tables = 0
        n_images = 0
        n_tables_with_spans = 0

        for page in document.pages:
            for element in page.elements:
                kind = element.type.value
                if kind == 'table':
                    n_tables += 1
                    payload = element.content
                    # A table counts once as soon as any cell spans more
                    # than one row or column.
                    if payload and hasattr(payload, 'cells') and any(
                        cell.row_span > 1 or cell.col_span > 1
                        for cell in payload.cells
                    ):
                        n_tables_with_spans += 1
                elif kind == 'image':
                    n_images += 1

        print(f" Document ID: {document.document_id}")
        print(f" Pages: {len(document.pages)}")
        print(f" Tables: {n_tables} (with merging: {n_tables_with_spans})")
        print(f" Images: {n_images}")

        print(" RESULT: PASS - Extraction completed successfully")
        return True

    except Exception as exc:
        # Test boundary: report the failure instead of propagating it so the
        # summary in __main__ can still be printed.
        print(f" RESULT: FAIL - {exc}")
        import traceback
        traceback.print_exc()
        return False
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: run each verification function in order, then print
    # a per-test status line and overall totals.
    rule = "=" * 60
    print(rule)
    print("Phase 1 Bug Fixes Verification Tests")
    print(rule)

    # (label, callable) pairs, executed in the order listed.
    runners = (
        ('1.1_table_merging', test_1_1_table_cell_merging),
        ('1.3_coord_validation', test_1_3_cell_boxes_validation),
        ('1.4_tiny_filtering', test_1_4_tiny_image_filtering),
        ('1.5_covering_detection', test_1_5_covering_image_detection),
        ('integration', test_direct_extraction_full),
    )

    results = {}
    for label, run in runners:
        results[label] = run()

    # Summary
    print("\n" + rule)
    print("TEST SUMMARY")
    print(rule)

    # True -> PASS, False -> FAIL, anything else -> SKIP/INFO.
    for test_name, result in results.items():
        if result is True:
            status = "PASS"
        elif result is False:
            status = "FAIL"
        else:
            status = "SKIP/INFO"
        print(f" {test_name}: {status}")

    outcomes = list(results.values())
    passed = len([r for r in outcomes if r is True])
    failed = len([r for r in outcomes if r is False])
    skipped = len([r for r in outcomes if r is None])

    print(f"\n Total: {passed} passed, {failed} failed, {skipped} skipped/info")
|
||||
Reference in New Issue
Block a user