"""
Phase 1 Bug Fixes Verification Tests

Tests for:
1.1 Direct Track table cell merging
1.2 OCR Track image path preservation
1.3 Cell boxes coordinate validation
1.4 Tiny decoration image filtering
1.5 Covering image removal

Every test function returns a tri-state result that the ``__main__`` driver
summarises:

* ``True``  - the check passed
* ``False`` - the check failed
* ``None``  - the check was skipped or was informational only
"""

import sys
import os
import traceback
from pathlib import Path
from typing import Optional

# Add backend to path
sys.path.insert(0, str(Path(__file__).parent.parent))

import fitz
from app.services.direct_extraction_engine import DirectExtractionEngine
from app.services.ocr_to_unified_converter import validate_cell_boxes
from app.models.unified_document import TableCell

# Sample document shared by every PDF-backed test in this file.
DEMO_PDF = Path(__file__).parent.parent.parent / "demo_docs" / "edit3.pdf"

# According to PLAN.md, edit3.pdf should have 83 cells; 204 is the count seen
# when every grid position is treated as its own 1x1 cell.
UNMERGED_CELL_COUNT = 204

# Same threshold as in DirectExtractionEngine.
TINY_IMAGE_MIN_AREA = 200


def _print_banner(title: str) -> None:
    """Print the section header that precedes each test's output."""
    print("\n" + "="*60)
    print(title)
    print("="*60)


def _demo_pdf_or_skip() -> Optional[Path]:
    """Return the demo PDF path, or print a SKIP notice and return None."""
    if DEMO_PDF.exists():
        return DEMO_PDF
    print(f"SKIP: {DEMO_PDF} not found")
    return None


def test_1_1_table_cell_merging() -> Optional[bool]:
    """Test 1.1.5: Verify edit3.pdf returns correct merged cells.

    Walks every table PyMuPDF finds in the demo PDF and counts the entries of
    ``table.cells``; ``None`` entries are reported as merged positions.

    Returns:
        True if the total number of non-None cells is strictly between 0 and
        ``UNMERGED_CELL_COUNT``, False if it equals ``UNMERGED_CELL_COUNT``,
        and None if the demo PDF is missing or the count is inconclusive.
    """
    _print_banner("TEST 1.1: Direct Track Table Cell Merging")

    pdf_path = _demo_pdf_or_skip()
    if pdf_path is None:
        # A missing fixture is a skip, not a failure (matches the other tests).
        return None

    total_cells = 0
    tables_with_merging = 0

    # The context manager closes the document even if table detection raises.
    with fitz.open(str(pdf_path)) as doc:
        for page_num, page in enumerate(doc):
            tables = page.find_tables()
            for table_idx, table in enumerate(tables.tables):
                data = table.extract()
                cell_rects = getattr(table, 'cells', None)
                if not cell_rects:
                    continue

                num_rows = len(data)
                num_cols = max(len(row) for row in data) if data else 0

                # Count actual cells (non-None)
                none_cells = sum(1 for c in cell_rects if c is None)
                actual_cells = len(cell_rects) - none_cells

                print(f" Page {page_num}, Table {table_idx}:")
                print(f" Grid size: {num_rows} x {num_cols} = {num_rows * num_cols} positions")
                print(f" Actual cells: {actual_cells}")
                print(f" Merged positions (None): {none_cells}")

                total_cells += actual_cells
                if none_cells > 0:
                    tables_with_merging += 1

    print(f"\n Total actual cells across all tables: {total_cells}")
    print(f" Tables with merging: {tables_with_merging}")

    # Fewer cells than the unmerged grid total indicates merging is detected.
    if 0 < total_cells < UNMERGED_CELL_COUNT:
        print(" RESULT: PASS - Cell merging detected correctly")
        return True
    if total_cells == UNMERGED_CELL_COUNT:
        print(" RESULT: FAIL - All cells treated as 1x1 (no merging detected)")
        return False
    print(f" RESULT: INCONCLUSIVE - {total_cells} cells found")
    return None


def test_1_3_cell_boxes_validation() -> bool:
    """Test 1.3: Verify cell_boxes coordinate validation.

    Calls ``validate_cell_boxes`` with an in-bounds set and an out-of-bounds
    set of boxes and checks the ``valid``, ``invalid_count`` and
    ``clamped_boxes`` entries of the returned mapping.

    Returns:
        True when every assertion holds.

    Raises:
        AssertionError: if the validator's result does not match expectations.
    """
    _print_banner("TEST 1.3: Cell Boxes Coordinate Validation")

    table_bbox = [0, 0, 300, 200]
    page_width, page_height = 300, 200

    # Test case 1: Valid coordinates
    valid_boxes = [
        [10, 10, 100, 50],
        [100, 10, 200, 50],
        [10, 50, 200, 100],
    ]
    result = validate_cell_boxes(valid_boxes, table_bbox, page_width, page_height)
    print(f" Valid boxes: valid={result['valid']}, invalid_count={result['invalid_count']}")
    assert result['valid'], "Valid boxes should pass validation"

    # Test case 2: Out of bounds coordinates
    invalid_boxes = [
        [-10, 10, 100, 50],   # x0 < 0
        [10, 10, 400, 50],    # x1 > page_width
        [10, 10, 100, 300],   # y1 > page_height
    ]
    result = validate_cell_boxes(invalid_boxes, table_bbox, page_width, page_height)
    print(f" Invalid boxes: valid={result['valid']}, invalid_count={result['invalid_count']}")
    assert not result['valid'], "Invalid boxes should fail validation"
    assert result['invalid_count'] == 3, "Should detect 3 invalid boxes"

    # Test case 3: Clamping
    assert len(result['clamped_boxes']) == 3, "Should return clamped boxes"
    clamped = result['clamped_boxes'][0]
    assert clamped[0] >= 0, "Clamped x0 should be >= 0"

    print(" RESULT: PASS - Coordinate validation works correctly")
    return True


def test_1_4_tiny_image_filtering() -> Optional[bool]:
    """Test 1.4: Verify tiny decoration image filtering.

    Measures the first placement rectangle of every image on every page of
    the demo PDF and classifies it against ``TINY_IMAGE_MIN_AREA``. Images
    for which PyMuPDF reports no rectangle are not counted in either bucket.

    Returns:
        True if at least one tiny image was found, otherwise None (also None
        when the demo PDF is missing).
    """
    _print_banner("TEST 1.4: Tiny Decoration Image Filtering")

    pdf_path = _demo_pdf_or_skip()
    if pdf_path is None:
        return None

    tiny_count = 0
    normal_count = 0
    min_area = TINY_IMAGE_MIN_AREA

    # The context manager closes the document even if image inspection raises.
    with fitz.open(str(pdf_path)) as doc:
        for page_num, page in enumerate(doc):
            for img in page.get_images():
                xref = img[0]
                rects = page.get_image_rects(xref)
                if not rects:
                    continue

                rect = rects[0]
                area = (rect.x1 - rect.x0) * (rect.y1 - rect.y0)
                if area < min_area:
                    tiny_count += 1
                    print(f" Page {page_num}: Tiny image xref={xref}, area={area:.1f} px²")
                else:
                    normal_count += 1

    print(f"\n Tiny images (< {min_area} px²): {tiny_count}")
    print(f" Normal images: {normal_count}")

    if tiny_count > 0:
        print(" RESULT: PASS - Tiny images detected, will be filtered")
        return True
    print(" RESULT: INFO - No tiny images found in test file")
    return None


def test_1_5_covering_image_detection() -> Optional[bool]:
    """Test 1.5: Verify covering image detection.

    Runs ``DirectExtractionEngine._preprocess_page`` on every page of the
    demo PDF and totals the entries under the ``covering_images`` key of each
    result, printing details for at most three images per page.

    Returns:
        True if any covering image was reported, otherwise None (also None
        when the demo PDF is missing).
    """
    _print_banner("TEST 1.5: Covering Image Detection")

    pdf_path = _demo_pdf_or_skip()
    if pdf_path is None:
        return None

    engine = DirectExtractionEngine(
        enable_whiteout_detection=True,
        whiteout_iou_threshold=0.8
    )

    total_covering = 0

    # The context manager closes the document even if preprocessing raises.
    with fitz.open(str(pdf_path)) as doc:
        for page_num, page in enumerate(doc):
            result = engine._preprocess_page(page, page_num, doc)
            covering_images = result.get('covering_images', [])
            if not covering_images:
                continue

            print(f" Page {page_num}: {len(covering_images)} covering images detected")
            for img in covering_images[:3]:  # Show first 3
                print(f" - xref={img.get('xref')}, type={img.get('color_type')}, "
                      f"bbox={[round(x, 1) for x in img.get('bbox', [])]}")
            total_covering += len(covering_images)

    print(f"\n Total covering images detected: {total_covering}")

    if total_covering > 0:
        print(" RESULT: PASS - Covering images detected, will be filtered")
        return True
    print(" RESULT: INFO - No covering images found in test file")
    return None


def test_direct_extraction_full() -> Optional[bool]:
    """Full integration test for Direct Track extraction.

    Runs ``DirectExtractionEngine.extract`` on the demo PDF and reports how
    many table and image elements the result contains, plus how many tables
    have at least one cell with ``row_span`` or ``col_span`` greater than 1.

    Returns:
        True if extraction completes, False if it raises, and None when the
        demo PDF is missing.
    """
    _print_banner("INTEGRATION TEST: Direct Track Full Extraction")

    pdf_path = _demo_pdf_or_skip()
    if pdf_path is None:
        return None

    engine = DirectExtractionEngine(
        enable_table_detection=True,
        enable_image_extraction=True,
        min_image_area=200.0,
        enable_whiteout_detection=True
    )

    # Broad catch is deliberate: this is the top-level boundary of a smoke
    # test, and any failure is reported with a traceback rather than raised.
    try:
        result = engine.extract(pdf_path)  # Pass Path object, not string

        # Count elements
        table_count = 0
        image_count = 0
        merged_table_count = 0

        for page in result.pages:
            for elem in page.elements:
                kind = elem.type.value
                if kind == 'table':
                    table_count += 1
                    if elem.content and hasattr(elem.content, 'cells'):
                        # A table counts once, however many merged cells it has.
                        if any(cell.row_span > 1 or cell.col_span > 1
                               for cell in elem.content.cells):
                            merged_table_count += 1
                elif kind == 'image':
                    image_count += 1

        print(f" Document ID: {result.document_id}")
        print(f" Pages: {len(result.pages)}")
        print(f" Tables: {table_count} (with merging: {merged_table_count})")
        print(f" Images: {image_count}")
        print(" RESULT: PASS - Extraction completed successfully")
        return True

    except Exception as e:
        print(f" RESULT: FAIL - {e}")
        traceback.print_exc()
        return False


def main() -> None:
    """Run every verification test in order and print a summary table."""
    print("="*60)
    print("Phase 1 Bug Fixes Verification Tests")
    print("="*60)

    results = {}

    # Run tests
    results['1.1_table_merging'] = test_1_1_table_cell_merging()
    results['1.3_coord_validation'] = test_1_3_cell_boxes_validation()
    results['1.4_tiny_filtering'] = test_1_4_tiny_image_filtering()
    results['1.5_covering_detection'] = test_1_5_covering_image_detection()
    results['integration'] = test_direct_extraction_full()

    # Summary
    _print_banner("TEST SUMMARY")

    for test_name, result in results.items():
        status = "PASS" if result is True else "FAIL" if result is False else "SKIP/INFO"
        print(f" {test_name}: {status}")

    outcomes = list(results.values())
    passed = sum(1 for r in outcomes if r is True)
    failed = sum(1 for r in outcomes if r is False)
    skipped = sum(1 for r in outcomes if r is None)

    print(f"\n Total: {passed} passed, {failed} failed, {skipped} skipped/info")


if __name__ == "__main__":
    main()