"""
Performance benchmarks for PP-StructureV3 parameter customization
Measures memory usage, processing time, and engine initialization overhead
"""

import gc
import time
from pathlib import Path
from unittest.mock import Mock, patch

import psutil
import pytest

from app.services.ocr_service import OCRService

# Divisor used to report resident set size (bytes) in megabytes.
_BYTES_PER_MB = 1024 * 1024


@pytest.fixture
def ocr_service():
    """Create a fresh OCR service instance for each test."""
    return OCRService()


@pytest.fixture
def sample_image():
    """Find a sample document for testing.

    Searches ``settings.demo_docs_dir`` for the first file whose name ends
    with one of ``.pdf``, ``.png``, ``.jpg``, ``.jpeg`` (tried in that
    order, so a PDF wins over an image when both exist).

    Returns:
        The matching ``Path``, or ``None`` when the directory does not
        exist or holds no matching file.
    """
    # Settings are imported here, at fixture run time, rather than at
    # module import time.
    from app.core.config import settings

    demo_dir = Path(settings.demo_docs_dir)
    if not demo_dir.exists():
        return None

    for ext in ('.pdf', '.png', '.jpg', '.jpeg'):
        # Only the first match is needed, so don't materialize the glob.
        match = next(demo_dir.glob(f'*{ext}'), None)
        if match is not None:
            return match
    return None


class MemoryTracker:
    """Helper class to track resident memory (RSS) of the current process.

    All values are reported in megabytes. Call ``start()`` to record the
    baseline; ``sample()`` / ``get_delta()`` read the current usage and
    update the running peak.
    """

    def __init__(self):
        self.process = psutil.Process()
        self.start_memory = 0  # baseline RSS in MB, set by start()
        self.peak_memory = 0  # highest RSS in MB seen by start()/sample()

    def _rss_mb(self) -> float:
        """Return the process's current resident set size in MB."""
        return self.process.memory_info().rss / _BYTES_PER_MB

    def start(self) -> None:
        """Start tracking: collect garbage, then record the baseline."""
        gc.collect()  # Force garbage collection before taking the baseline
        self.start_memory = self._rss_mb()
        self.peak_memory = self.start_memory

    def sample(self) -> float:
        """Sample current memory (MB) and update the recorded peak."""
        current = self._rss_mb()
        self.peak_memory = max(self.peak_memory, current)
        return current

    def get_delta(self) -> float:
        """Return current memory minus the baseline (MB); also samples."""
        return self.sample() - self.start_memory

    def get_peak_delta(self) -> float:
        """Return the recorded peak minus the baseline (MB)."""
        return self.peak_memory - self.start_memory


@pytest.mark.performance
class TestEngineInitializationPerformance:
    """Test performance of engine initialization with custom parameters"""

    def test_default_engine_initialization_time(self, ocr_service):
        """Measure time to initialize default (cached) engine"""
        print("\n=== Default Engine Initialization ===")

        with patch('app.services.ocr_service.PPStructureV3') as mock_ppstructure:
            mock_engine = Mock()
            mock_ppstructure.return_value = mock_engine

            # First initialization (creates engine).
            # perf_counter() is monotonic and high-resolution, unlike
            # time.time(), so short intervals are measured reliably.
            start = time.perf_counter()
            engine1 = ocr_service._ensure_structure_engine(custom_params=None)
            first_init_time = time.perf_counter() - start
            print(f"First initialization: {first_init_time * 1000:.2f}ms")

            # Second initialization (uses cache)
            start = time.perf_counter()
            engine2 = ocr_service._ensure_structure_engine(custom_params=None)
            cached_time = time.perf_counter() - start
            print(f"Cached access: {cached_time * 1000:.2f}ms")

            # Guard the ratio: a cached call can measure as zero elapsed
            # time, which must not crash the benchmark report.
            speedup = (
                first_init_time / cached_time if cached_time > 0 else float('inf')
            )
            print(f"Speedup: {speedup:.1f}x")

            # Verify caching works
            assert engine1 is engine2
            assert mock_ppstructure.call_count == 1

            # Cached access should be much faster
            assert cached_time < first_init_time / 10

    def test_custom_engine_initialization_time(self, ocr_service):
        """Measure time to initialize engine with custom parameters"""
        print("\n=== Custom Engine Initialization ===")

        custom_params = {
            'layout_detection_threshold': 0.15,
            'text_det_thresh': 0.2
        }

        with patch('app.services.ocr_service.PPStructureV3') as mock_ppstructure:
            mock_ppstructure.return_value = Mock()

            # Multiple initializations (no caching)
            times = []
            for i in range(3):
                start = time.perf_counter()
                ocr_service._ensure_structure_engine(custom_params=custom_params)
                init_time = time.perf_counter() - start
                times.append(init_time)
                print(f"Run {i+1}: {init_time * 1000:.2f}ms")

            avg_time = sum(times) / len(times)
            print(f"Average: {avg_time * 1000:.2f}ms")

            # Each call should create new engine (no caching)
            assert mock_ppstructure.call_count == 3

    def test_parameter_extraction_overhead(self):
        """Measure overhead of parameter extraction and validation"""
        print("\n=== Parameter Extraction Overhead ===")

        from app.schemas.task import PPStructureV3Params

        # Test parameter validation performance
        iterations = 1000

        # Valid parameters
        start = time.perf_counter()
        for _ in range(iterations):
            params = PPStructureV3Params(
                layout_detection_threshold=0.15,
                text_det_thresh=0.2
            )
            _ = params.model_dump(exclude_none=True)
        valid_time = time.perf_counter() - start

        print(f"Valid params ({iterations} iterations): {valid_time * 1000:.2f}ms")
        print(f"Per-operation: {valid_time / iterations * 1000:.4f}ms")

        # Validation should be fast
        assert valid_time / iterations < 0.001  # < 1ms per operation


@pytest.mark.performance
class TestMemoryUsage:
    """Test memory usage of custom parameters"""

    def test_default_engine_memory_usage(self, ocr_service):
        """Measure memory usage of default engine"""
        print("\n=== Default Engine Memory Usage ===")

        tracker = MemoryTracker()
        tracker.start()

        with patch('app.services.ocr_service.PPStructureV3') as mock_ppstructure:
            # Create mock engine with some memory footprint
            mock_engine = Mock()
            mock_engine.memory_size = 100  # Simulated memory
            mock_ppstructure.return_value = mock_engine

            print(f"Baseline memory: {tracker.start_memory:.2f} MB")

            # Initialize engine
            ocr_service._ensure_structure_engine(custom_params=None)
            memory_delta = tracker.get_delta()
            print(f"After initialization: {memory_delta:.2f} MB")

            # Access cached engine multiple times
            for _ in range(10):
                ocr_service._ensure_structure_engine(custom_params=None)

            memory_after_reuse = tracker.get_delta()
            print(f"After 10 reuses: {memory_after_reuse:.2f} MB")

            # Memory should not increase significantly with reuse
            assert abs(memory_after_reuse - memory_delta) < 10  # < 10MB increase

    def test_custom_engine_memory_cleanup(self, ocr_service):
        """Report memory before and after releasing custom engines.

        This test is report-only: it prints the deltas but asserts no
        threshold, because how much RSS is returned after ``gc.collect()``
        depends on the Python allocator and the operating system.
        """
        print("\n=== Custom Engine Memory Cleanup ===")

        tracker = MemoryTracker()
        tracker.start()

        custom_params = {'layout_detection_threshold': 0.15}

        with patch('app.services.ocr_service.PPStructureV3') as mock_ppstructure:
            mock_ppstructure.return_value = Mock()

            print(f"Baseline memory: {tracker.start_memory:.2f} MB")

            # Create multiple engines with custom params
            engines = []
            for i in range(5):
                engine = ocr_service._ensure_structure_engine(custom_params=custom_params)
                engines.append(engine)

                if i == 0:
                    first_engine_memory = tracker.get_delta()
                    print(f"After 1st engine: {first_engine_memory:.2f} MB")

            memory_after_all = tracker.get_delta()
            print(f"After 5 engines: {memory_after_all:.2f} MB")

            # Clear references
            engines.clear()
            gc.collect()

            memory_after_cleanup = tracker.get_delta()
            print(f"After cleanup: {memory_after_cleanup:.2f} MB")

            # Reported for inspection only; see the docstring for why no
            # recovery threshold is asserted here.
            peak_delta = tracker.get_peak_delta()
            print(f"Peak delta: {peak_delta:.2f} MB")

    def test_no_memory_leak_in_parameter_passing(self, ocr_service):
        """Test that parameter passing doesn't cause memory leaks"""
        print("\n=== Memory Leak Test ===")

        tracker = MemoryTracker()
        tracker.start()

        custom_params = {
            'layout_detection_threshold': 0.15,
            'text_det_thresh': 0.2,
            'layout_merge_bboxes_mode': 'small'
        }

        with patch('app.services.ocr_service.PPStructureV3') as mock_ppstructure:
            mock_ppstructure.return_value = Mock()

            print(f"Baseline: {tracker.start_memory:.2f} MB")

            # Simulate many requests with custom params
            iterations = 100
            for i in range(iterations):
                # Create engine (each request gets its own params dict)
                engine = ocr_service._ensure_structure_engine(custom_params=custom_params.copy())

                # Sample memory every 10 iterations
                if i % 10 == 0:
                    memory_delta = tracker.get_delta()
                    print(f"Iteration {i}: {memory_delta:.2f} MB")

                # Clear reference
                del engine

                # Force GC periodically
                if i % 50 == 0:
                    gc.collect()

            final_memory = tracker.get_delta()
            print(f"Final: {final_memory:.2f} MB")
            print(f"Peak: {tracker.get_peak_delta():.2f} MB")

            # Memory growth should be bounded
            # Allow up to 50MB growth for 100 iterations
            assert tracker.get_peak_delta() < 50


@pytest.mark.performance
class TestProcessingPerformance:
    """Test end-to-end processing performance with custom parameters"""

    def test_processing_time_comparison(self, ocr_service, sample_image):
        """Compare processing time: default vs custom parameters"""
        if sample_image is None:
            pytest.skip("No sample image available")

        print("\n=== Processing Time Comparison ===")
        print(f"Image: {sample_image.name}")

        # The three patches are entered in the same order as before; they
        # are just written as one flat ``with`` instead of three nested ones.
        with patch.object(ocr_service, 'get_ocr_engine') as mock_get_ocr, \
                patch.object(ocr_service, 'structure_engine', None), \
                patch('app.services.ocr_service.PPStructureV3') as mock_ppstructure:
            # Setup mocks
            mock_ocr_engine = Mock()
            mock_ocr_engine.ocr.return_value = [[[[0, 0], [100, 0], [100, 50], [0, 50]], ('test', 0.9)]]
            mock_get_ocr.return_value = mock_ocr_engine

            mock_structure_engine = Mock()
            mock_structure_engine.return_value = []
            mock_ppstructure.return_value = mock_structure_engine

            # Test with default parameters
            start = time.perf_counter()
            result_default = ocr_service.process_image(
                image_path=sample_image,
                detect_layout=True,
                pp_structure_params=None
            )
            time_default = time.perf_counter() - start
            print(f"Default params: {time_default * 1000:.2f}ms")

            # Test with custom parameters
            custom_params = {
                'layout_detection_threshold': 0.15,
                'text_det_thresh': 0.2
            }
            start = time.perf_counter()
            result_custom = ocr_service.process_image(
                image_path=sample_image,
                detect_layout=True,
                pp_structure_params=custom_params
            )
            time_custom = time.perf_counter() - start
            print(f"Custom params: {time_custom * 1000:.2f}ms")

            print(f"Difference: {abs(time_custom - time_default) * 1000:.2f}ms")

            # Both should succeed
            assert result_default['status'] == 'success'
            assert result_custom['status'] == 'success'


@pytest.mark.performance
@pytest.mark.benchmark
class TestConcurrentPerformance:
    """Test performance under concurrent load"""

    def test_concurrent_custom_params_no_cache_pollution(self, ocr_service):
        """Verify custom params don't pollute cache in concurrent scenario"""
        print("\n=== Concurrent Cache Test ===")

        with patch('app.services.ocr_service.PPStructureV3') as mock_ppstructure:
            # Tag the two mocks so the assertions below can tell them apart.
            default_engine = Mock()
            default_engine.type = 'default'

            custom_engine = Mock()
            custom_engine.type = 'custom'

            # First call creates default engine
            mock_ppstructure.return_value = default_engine
            engine1 = ocr_service._ensure_structure_engine(custom_params=None)
            assert engine1.type == 'default'
            print("✓ Created default (cached) engine")

            # Second call with custom params creates new engine
            mock_ppstructure.return_value = custom_engine
            custom_params = {'layout_detection_threshold': 0.15}
            engine2 = ocr_service._ensure_structure_engine(custom_params=custom_params)
            assert engine2.type == 'custom'
            print("✓ Created custom (uncached) engine")

            # Third call without custom params should return cached default
            engine3 = ocr_service._ensure_structure_engine(custom_params=None)
            assert engine3.type == 'default'
            assert engine3 is engine1
            print("✓ Retrieved default engine from cache (not polluted)")

            # Verify default engine was only created once
            assert mock_ppstructure.call_count == 2  # default + custom


def run_benchmarks():
    """Run all performance benchmarks and generate report.

    Returns:
        The exit code returned by ``pytest.main`` so callers (and the
        script entry point below) can detect benchmark failures.
    """
    print("=" * 60)
    print("PP-StructureV3 Parameters - Performance Benchmark Report")
    print("=" * 60)

    return pytest.main([
        __file__,
        '-v',
        '-s',
        '-m', 'performance',
        '--tb=short'
    ])


if __name__ == '__main__':
    # Propagate pytest's exit code so a failing run exits non-zero.
    raise SystemExit(run_benchmarks())