# OCR/backend/tests/performance/test_ppstructure_params_performance.py

"""
Performance benchmarks for PP-StructureV3 parameter customization
Measures memory usage, processing time, and engine initialization overhead
"""

import pytest
import psutil
import gc
import time
from pathlib import Path
from unittest.mock import Mock, patch
from app.services.ocr_service import OCRService


@pytest.fixture
def ocr_service():
    """Provide an ``OCRService`` built with its default constructor arguments."""
    service = OCRService()
    return service


@pytest.fixture
def sample_image():
    """Locate a sample document (PDF or image) to use as processing input.

    The directory searched comes from the ``OCR_DEMO_DOCS_DIR`` environment
    variable when it is set to a non-empty value; otherwise the original
    hard-coded demo directory is used.  Extensions are tried in priority
    order (``.pdf``, ``.png``, ``.jpg``, ``.jpeg``) and, within one extension,
    the alphabetically first file is chosen so every run uses the same input.

    Returns:
        Path | None: the selected file, or ``None`` when the directory is
        missing or contains no matching file.
    """
    import os  # local import keeps this fixture self-contained

    default_dir = '/home/egg/project/Tool_OCR/demo_docs'
    # ``or`` (rather than a .get() default) so an empty variable also falls back
    demo_dir = Path(os.environ.get('OCR_DEMO_DOCS_DIR') or default_dir)
    if not demo_dir.is_dir():
        return None

    for ext in ('.pdf', '.png', '.jpg', '.jpeg'):
        # glob() yields entries in arbitrary order; sort for a reproducible pick
        candidates = sorted(demo_dir.glob(f'*{ext}'))
        if candidates:
            return candidates[0]
    return None


class MemoryTracker:
    """Records resident-set-size (RSS) readings, in MB, from a psutil process handle.

    ``start()`` fixes a baseline, ``sample()`` takes a reading and keeps the
    high-water mark current, and the two ``get_*delta`` methods report growth
    relative to the baseline.
    """

    def __init__(self):
        self.process = psutil.Process()
        self.start_memory = 0
        self.peak_memory = 0

    def _rss_mb(self):
        """Return the process RSS converted from bytes to megabytes."""
        return self.process.memory_info().rss / 1024 / 1024

    def start(self):
        """Run a garbage collection, then record the baseline and reset the peak to it."""
        gc.collect()
        baseline = self._rss_mb()
        self.start_memory = baseline
        self.peak_memory = baseline

    def sample(self):
        """Take a reading, raise the peak if it was exceeded, and return the reading."""
        reading = self._rss_mb()
        if reading > self.peak_memory:
            self.peak_memory = reading
        return reading

    def get_delta(self):
        """Return current usage minus the baseline (this takes a fresh sample)."""
        return self.sample() - self.start_memory

    def get_peak_delta(self):
        """Return the highest sampled usage minus the baseline."""
        return self.peak_memory - self.start_memory


@pytest.mark.performance
class TestEngineInitializationPerformance:
    """Timing checks for structure-engine creation with and without custom parameters.

    ``PPStructureV3`` is patched in every engine test, so the figures reflect
    the service's own bookkeeping rather than real model loading.  Durations
    are measured with ``time.perf_counter()``, which is intended for timing
    short intervals.
    """

    def test_default_engine_initialization_time(self, ocr_service):
        """Measure time to initialize default (cached) engine"""
        print("\n=== Default Engine Initialization ===")

        with patch('app.services.ocr_service.PPStructureV3') as mock_ppstructure:
            mock_engine = Mock()
            mock_ppstructure.return_value = mock_engine

            # First initialization (creates engine)
            start = time.perf_counter()
            engine1 = ocr_service._ensure_structure_engine(custom_params=None)
            first_init_time = time.perf_counter() - start

            print(f"First initialization: {first_init_time * 1000:.2f}ms")

            # Second initialization (uses cache)
            start = time.perf_counter()
            engine2 = ocr_service._ensure_structure_engine(custom_params=None)
            cached_time = time.perf_counter() - start

            print(f"Cached access: {cached_time * 1000:.2f}ms")
            # Guard the ratio: a cached lookup can fall below timer resolution
            if cached_time > 0:
                print(f"Speedup: {first_init_time / cached_time:.1f}x")
            else:
                print("Speedup: n/a (cached access below timer resolution)")

            # Verify caching works
            assert engine1 is engine2
            assert mock_ppstructure.call_count == 1

            # Cached access should be much faster
            assert cached_time < first_init_time / 10

    def test_custom_engine_initialization_time(self, ocr_service):
        """Measure time to initialize engine with custom parameters"""
        print("\n=== Custom Engine Initialization ===")

        custom_params = {
            'layout_detection_threshold': 0.15,
            'text_det_thresh': 0.2
        }

        with patch('app.services.ocr_service.PPStructureV3') as mock_ppstructure:
            mock_ppstructure.return_value = Mock()

            # Multiple initializations (no caching)
            times = []
            for i in range(3):
                start = time.perf_counter()
                ocr_service._ensure_structure_engine(custom_params=custom_params)
                init_time = time.perf_counter() - start
                times.append(init_time)
                print(f"Run {i+1}: {init_time * 1000:.2f}ms")

            avg_time = sum(times) / len(times)
            print(f"Average: {avg_time * 1000:.2f}ms")

            # Each call should create new engine (no caching)
            assert mock_ppstructure.call_count == 3

    def test_parameter_extraction_overhead(self):
        """Measure overhead of parameter extraction and validation"""
        print("\n=== Parameter Extraction Overhead ===")

        from app.schemas.task import PPStructureV3Params

        # Test parameter validation performance
        iterations = 1000

        # Valid parameters: build the model and dump it, as a request would
        start = time.perf_counter()
        for _ in range(iterations):
            params = PPStructureV3Params(
                layout_detection_threshold=0.15,
                text_det_thresh=0.2
            )
            params.model_dump(exclude_none=True)
        valid_time = time.perf_counter() - start

        print(f"Valid params ({iterations} iterations): {valid_time * 1000:.2f}ms")
        print(f"Per-operation: {valid_time / iterations * 1000:.4f}ms")

        # Validation should be fast
        assert valid_time / iterations < 0.001  # < 1ms per operation


@pytest.mark.performance
class TestMemoryUsage:
    """Resident-memory checks around structure-engine creation.

    ``PPStructureV3`` is patched with a ``Mock`` in every test, and memory is
    read through ``MemoryTracker`` (process RSS in MB).
    """

    def test_default_engine_memory_usage(self, ocr_service):
        """Measure memory usage of default engine"""
        print("\n=== Default Engine Memory Usage ===")

        tracker = MemoryTracker()
        tracker.start()

        with patch('app.services.ocr_service.PPStructureV3') as mock_ppstructure:
            # Create mock engine with some memory footprint
            mock_engine = Mock()
            mock_engine.memory_size = 100  # Simulated memory
            mock_ppstructure.return_value = mock_engine

            print(f"Baseline memory: {tracker.start_memory:.2f} MB")

            # Initialize engine
            ocr_service._ensure_structure_engine(custom_params=None)

            memory_delta = tracker.get_delta()
            print(f"After initialization: {memory_delta:.2f} MB")

            # Access cached engine multiple times
            for _ in range(10):
                ocr_service._ensure_structure_engine(custom_params=None)

            memory_after_reuse = tracker.get_delta()
            print(f"After 10 reuses: {memory_after_reuse:.2f} MB")

            # Memory should not increase significantly with reuse
            assert abs(memory_after_reuse - memory_delta) < 10  # < 10MB increase

    def test_custom_engine_memory_cleanup(self, ocr_service):
        """Verify custom engines are built per request and report memory around cleanup"""
        print("\n=== Custom Engine Memory Cleanup ===")

        tracker = MemoryTracker()
        tracker.start()

        custom_params = {'layout_detection_threshold': 0.15}
        engine_count = 5

        with patch('app.services.ocr_service.PPStructureV3') as mock_ppstructure:
            mock_ppstructure.return_value = Mock()

            print(f"Baseline memory: {tracker.start_memory:.2f} MB")

            # Create multiple engines with custom params
            engines = []
            for i in range(engine_count):
                engines.append(
                    ocr_service._ensure_structure_engine(custom_params=custom_params)
                )
                if i == 0:
                    print(f"After 1st engine: {tracker.get_delta():.2f} MB")

            memory_after_all = tracker.get_delta()
            print(f"After 5 engines: {memory_after_all:.2f} MB")

            # Custom parameters are expected to bypass the cache, so every
            # request must have constructed (and returned) its own engine.
            assert mock_ppstructure.call_count == engine_count
            assert len(engines) == engine_count
            assert all(engine is not None for engine in engines)

            # Clear references
            engines.clear()
            gc.collect()

            memory_after_cleanup = tracker.get_delta()
            print(f"After cleanup: {memory_after_cleanup:.2f} MB")

            # The memory figures are informational only: no threshold is
            # asserted because actual cleanup depends on Python GC.
            peak_delta = tracker.get_peak_delta()
            print(f"Peak delta: {peak_delta:.2f} MB")

    def test_no_memory_leak_in_parameter_passing(self, ocr_service):
        """Test that parameter passing doesn't cause memory leaks"""
        print("\n=== Memory Leak Test ===")

        tracker = MemoryTracker()
        tracker.start()

        custom_params = {
            'layout_detection_threshold': 0.15,
            'text_det_thresh': 0.2,
            'layout_merge_bboxes_mode': 'small'
        }

        with patch('app.services.ocr_service.PPStructureV3') as mock_ppstructure:
            mock_ppstructure.return_value = Mock()

            print(f"Baseline: {tracker.start_memory:.2f} MB")

            # Simulate many requests with custom params
            iterations = 100
            for i in range(iterations):
                # Create engine (a fresh dict copy per request, as a caller would pass)
                engine = ocr_service._ensure_structure_engine(custom_params=custom_params.copy())

                # Sample memory every 10 iterations
                if i % 10 == 0:
                    memory_delta = tracker.get_delta()
                    print(f"Iteration {i}: {memory_delta:.2f} MB")

                # Clear reference
                del engine

                # Force GC periodically
                if i % 50 == 0:
                    gc.collect()

            final_memory = tracker.get_delta()
            print(f"Final: {final_memory:.2f} MB")
            print(f"Peak: {tracker.get_peak_delta():.2f} MB")

            # Memory growth should be bounded
            # Allow up to 50MB growth for 100 iterations
            assert tracker.get_peak_delta() < 50


@pytest.mark.performance
class TestProcessingPerformance:
    """Test end-to-end processing performance with custom parameters"""

    def test_processing_time_comparison(self, ocr_service, sample_image):
        """Compare processing time: default vs custom parameters

        The OCR engine getter, the service's ``structure_engine`` attribute
        and ``PPStructureV3`` are all patched.  Skips when the
        ``sample_image`` fixture found no file.  Durations use
        ``time.perf_counter()``, which is intended for timing short intervals.
        """
        if sample_image is None:
            pytest.skip("No sample image available")

        print("\n=== Processing Time Comparison ===")
        print(f"Image: {sample_image.name}")

        with patch.object(ocr_service, 'get_ocr_engine') as mock_get_ocr, \
                patch.object(ocr_service, 'structure_engine', None), \
                patch('app.services.ocr_service.PPStructureV3') as mock_ppstructure:
            # Setup mocks
            mock_ocr_engine = Mock()
            mock_ocr_engine.ocr.return_value = [[[[0, 0], [100, 0], [100, 50], [0, 50]], ('test', 0.9)]]
            mock_get_ocr.return_value = mock_ocr_engine

            mock_structure_engine = Mock()
            mock_structure_engine.return_value = []
            mock_ppstructure.return_value = mock_structure_engine

            # Test with default parameters
            start = time.perf_counter()
            result_default = ocr_service.process_image(
                image_path=sample_image,
                detect_layout=True,
                pp_structure_params=None
            )
            time_default = time.perf_counter() - start

            print(f"Default params: {time_default * 1000:.2f}ms")

            # Test with custom parameters
            custom_params = {
                'layout_detection_threshold': 0.15,
                'text_det_thresh': 0.2
            }

            start = time.perf_counter()
            result_custom = ocr_service.process_image(
                image_path=sample_image,
                detect_layout=True,
                pp_structure_params=custom_params
            )
            time_custom = time.perf_counter() - start

            print(f"Custom params: {time_custom * 1000:.2f}ms")
            print(f"Difference: {abs(time_custom - time_default) * 1000:.2f}ms")

            # Both should succeed
            assert result_default['status'] == 'success'
            assert result_custom['status'] == 'success'


@pytest.mark.performance
@pytest.mark.benchmark
class TestConcurrentPerformance:
    """Cache-isolation checks when default and custom-parameter requests are mixed."""

    def test_concurrent_custom_params_no_cache_pollution(self, ocr_service):
        """Interleave default and custom requests; the cached default must survive.

        The three requests are issued one after another against a patched
        ``PPStructureV3`` whose return value is switched between two
        distinguishable stubs.
        """
        print("\n=== Concurrent Cache Test ===")

        # Stubs tagged via a ``type`` attribute so each result can be identified
        cached_stub = Mock(type='default')
        one_off_stub = Mock(type='custom')

        with patch('app.services.ocr_service.PPStructureV3') as factory:
            # Request 1: no custom params -> engine built from the first stub
            factory.return_value = cached_stub
            first = ocr_service._ensure_structure_engine(custom_params=None)
            assert first.type == 'default'
            print("✓ Created default (cached) engine")

            # Request 2: custom params -> a separate engine from the second stub
            overrides = {'layout_detection_threshold': 0.15}
            factory.return_value = one_off_stub
            second = ocr_service._ensure_structure_engine(custom_params=overrides)
            assert second.type == 'custom'
            print("✓ Created custom (uncached) engine")

            # Request 3: no custom params again -> the very object from request 1
            third = ocr_service._ensure_structure_engine(custom_params=None)
            assert third.type == 'default'
            assert third is first
            print("✓ Retrieved default engine from cache (not polluted)")

            # Exactly two constructions overall: one default plus one custom
            assert factory.call_count == 2  # default + custom


def run_benchmarks():
    """Run all performance benchmarks in this file and print a report banner.

    Invokes ``pytest.main`` on this file, selecting tests marked
    ``performance``, with verbose output, output capturing disabled (``-s``)
    and short tracebacks.

    Returns:
        The value returned by ``pytest.main`` (its exit code), so callers can
        tell whether the benchmark run succeeded.
    """
    banner = "=" * 60
    print(banner)
    print("PP-StructureV3 Parameters - Performance Benchmark Report")
    print(banner)

    return pytest.main([
        __file__,
        '-v',
        '-s',
        '-m', 'performance',
        '--tb=short'
    ])


if __name__ == '__main__':
    # Propagate pytest's exit code so a failed run is visible to the shell / CI
    raise SystemExit(run_benchmarks())