"""
Unit tests for PP-StructureV3 parameter customization
"""

import pytest
import sys
from pathlib import Path
from unittest.mock import Mock, patch, MagicMock

# Mock all external dependencies before importing OCRService.
# NOTE: these assignments replace the entries in sys.modules for the whole
# interpreter process, so they must run before the `app` imports below.
sys.modules['paddleocr'] = MagicMock()
sys.modules['PIL'] = MagicMock()
sys.modules['pdf2image'] = MagicMock()

# Mock paddle with version attribute
paddle_mock = MagicMock()
paddle_mock.__version__ = '2.5.0'
paddle_mock.device.get_device.return_value = 'cpu'
paddle_mock.device.get_available_device.return_value = 'cpu'
sys.modules['paddle'] = paddle_mock

# Mock torch
torch_mock = MagicMock()
torch_mock.cuda.is_available.return_value = False
sys.modules['torch'] = torch_mock

# These imports intentionally come after the sys.modules mocking above.
from app.services.ocr_service import OCRService  # noqa: E402
from app.core.config import settings  # noqa: E402


class TestPPStructureParamsValidation:
    """Test parameter validation and defaults"""

    def test_default_parameters_used_when_none_provided(self):
        """Verify that default settings are used when no custom params provided"""
        ocr_service = OCRService()

        # Force structure_engine to None for the duration of the test so the
        # call below has to construct an engine via the patched PPStructureV3.
        with patch.object(ocr_service, 'structure_engine', None):
            with patch('app.services.ocr_service.PPStructureV3') as mock_ppstructure:
                mock_ppstructure.return_value = Mock()

                # Call without custom params
                ocr_service._ensure_structure_engine(custom_params=None)

                # Verify default settings were used
                mock_ppstructure.assert_called_once()
                call_kwargs = mock_ppstructure.call_args[1]

                assert call_kwargs['layout_threshold'] == settings.layout_detection_threshold
                assert call_kwargs['layout_nms'] == settings.layout_nms_threshold
                assert call_kwargs['text_det_thresh'] == settings.text_det_thresh

    def test_custom_parameters_override_defaults(self):
        """Verify that custom parameters override default settings"""
        ocr_service = OCRService()

        custom_params = {
            'layout_detection_threshold': 0.1,
            'layout_nms_threshold': 0.15,
            'text_det_thresh': 0.25,
            'layout_merge_bboxes_mode': 'large',
        }

        with patch('app.services.ocr_service.PPStructureV3') as mock_ppstructure:
            mock_ppstructure.return_value = Mock()

            # Call with custom params
            ocr_service._ensure_structure_engine(custom_params=custom_params)

            # Verify custom params were used (note the keyword names passed to
            # PPStructureV3 differ from the keys of custom_params)
            call_kwargs = mock_ppstructure.call_args[1]

            assert call_kwargs['layout_threshold'] == 0.1
            assert call_kwargs['layout_nms'] == 0.15
            assert call_kwargs['text_det_thresh'] == 0.25
            assert call_kwargs['layout_merge_bboxes_mode'] == 'large'

    def test_partial_custom_parameters(self):
        """Verify that partial custom params work (custom + defaults mix)"""
        ocr_service = OCRService()

        custom_params = {
            'layout_detection_threshold': 0.15,
            # Other params should use defaults
        }

        with patch('app.services.ocr_service.PPStructureV3') as mock_ppstructure:
            mock_ppstructure.return_value = Mock()

            ocr_service._ensure_structure_engine(custom_params=custom_params)

            call_kwargs = mock_ppstructure.call_args[1]

            # Custom param used
            assert call_kwargs['layout_threshold'] == 0.15

            # Default params used
            assert call_kwargs['layout_nms'] == settings.layout_nms_threshold
            assert call_kwargs['text_det_thresh'] == settings.text_det_thresh

    def test_custom_params_do_not_cache_engine(self):
        """Verify that custom params create a new engine (no caching)"""
        ocr_service = OCRService()

        custom_params = {'layout_detection_threshold': 0.1}

        with patch('app.services.ocr_service.PPStructureV3') as mock_ppstructure:
            mock_engine1 = Mock()
            mock_engine2 = Mock()
            # Each constructor call hands back the next engine in this list.
            mock_ppstructure.side_effect = [mock_engine1, mock_engine2]

            # First call with custom params
            engine1 = ocr_service._ensure_structure_engine(custom_params=custom_params)

            # Second call with same custom params should create NEW engine
            engine2 = ocr_service._ensure_structure_engine(custom_params=custom_params)

            # Verify two different engines were created
            assert mock_ppstructure.call_count == 2
            assert engine1 is mock_engine1
            assert engine2 is mock_engine2

    def test_default_params_use_cached_engine(self):
        """Verify that default params use cached engine"""
        ocr_service = OCRService()

        with patch('app.services.ocr_service.PPStructureV3') as mock_ppstructure:
            mock_ppstructure.return_value = Mock()

            # First call without custom params
            engine1 = ocr_service._ensure_structure_engine(custom_params=None)

            # Second call without custom params should use cached engine
            engine2 = ocr_service._ensure_structure_engine(custom_params=None)

            # Verify only one engine was created (caching works)
            assert mock_ppstructure.call_count == 1
            assert engine1 is engine2

    def test_invalid_custom_params_fallback_to_default(self):
        """Verify that invalid custom params fall back to default cached engine"""
        ocr_service = OCRService()

        # Create a cached default engine first
        with patch('app.services.ocr_service.PPStructureV3') as mock_ppstructure:
            default_engine = Mock()
            mock_ppstructure.return_value = default_engine

            # Initialize default engine
            ocr_service._ensure_structure_engine(custom_params=None)

            # Now test with invalid custom params that will raise error:
            # from here on, every PPStructureV3(...) call raises ValueError.
            mock_ppstructure.side_effect = ValueError("Invalid parameter")

            # Should fall back to cached default engine
            engine = ocr_service._ensure_structure_engine(
                custom_params={'invalid': 'params'}
            )

            # Should return the default cached engine
            assert engine is default_engine


class TestPPStructureParamsFlow:
    """Test parameter flow through processing pipeline"""

    def test_params_flow_through_process_image(self):
        """Verify params flow from process_image to analyze_layout"""
        ocr_service = OCRService()

        custom_params = {'layout_detection_threshold': 0.12}

        with patch.object(ocr_service, 'get_ocr_engine') as mock_get_ocr:
            with patch.object(ocr_service, 'analyze_layout') as mock_analyze:
                mock_analyze.return_value = (None, [])

                # Mock OCR result
                mock_engine = Mock()
                mock_engine.ocr.return_value = [
                    [[[0, 0], [100, 0], [100, 50], [0, 50]], ('test', 0.9)]
                ]
                mock_get_ocr.return_value = mock_engine

                # Process with custom params
                ocr_service.process_image(
                    image_path=Path('/tmp/test.jpg'),
                    detect_layout=True,
                    pp_structure_params=custom_params
                )

                # Verify params were passed to analyze_layout
                mock_analyze.assert_called_once()
                call_kwargs = mock_analyze.call_args[1]
                assert call_kwargs['pp_structure_params'] == custom_params

    def test_params_flow_through_process_with_dual_track(self):
        """Verify params flow through dual-track processing"""
        ocr_service = OCRService()
        ocr_service.dual_track_enabled = True

        custom_params = {'text_det_thresh': 0.15}

        with patch.object(ocr_service, 'process_file_traditional') as mock_traditional:
            with patch('app.services.ocr_service.DocumentTypeDetector') as mock_detector:
                # Mock detector to return OCR track
                mock_recommendation = Mock()
                mock_recommendation.track = 'ocr'
                mock_recommendation.confidence = 0.9
                mock_recommendation.reason = 'Test'
                mock_recommendation.metadata = {}

                mock_detector_instance = Mock()
                mock_detector_instance.detect.return_value = mock_recommendation
                mock_detector.return_value = mock_detector_instance

                mock_traditional.return_value = {'status': 'success'}

                # Process with custom params
                ocr_service.process_with_dual_track(
                    file_path=Path('/tmp/test.pdf'),
                    force_track='ocr',
                    pp_structure_params=custom_params
                )

                # Verify params were passed to traditional processing
                mock_traditional.assert_called_once()
                call_kwargs = mock_traditional.call_args[1]
                assert call_kwargs['pp_structure_params'] == custom_params

    def test_params_not_passed_to_direct_track(self):
        """Verify params are NOT used for direct extraction track"""
        ocr_service = OCRService()
        ocr_service.dual_track_enabled = True

        custom_params = {'layout_detection_threshold': 0.1}

        with patch('app.services.ocr_service.DocumentTypeDetector') as mock_detector:
            with patch('app.services.ocr_service.DirectExtractionEngine') as mock_direct:
                # Mock detector to return DIRECT track
                mock_recommendation = Mock()
                mock_recommendation.track = 'direct'
                mock_recommendation.confidence = 0.95
                mock_recommendation.reason = 'Editable PDF'
                mock_recommendation.metadata = {}

                mock_detector_instance = Mock()
                mock_detector_instance.detect.return_value = mock_recommendation
                mock_detector.return_value = mock_detector_instance

                # Mock direct extraction engine
                mock_direct_instance = Mock()
                mock_direct_instance.extract.return_value = Mock(
                    document_id='test-id',
                    metadata=Mock(processing_track='direct')
                )
                mock_direct.return_value = mock_direct_instance

                # Process with custom params on DIRECT track
                ocr_service.process_with_dual_track(
                    file_path=Path('/tmp/test.pdf'),
                    pp_structure_params=custom_params
                )

                # Verify direct extraction was used (not OCR)
                mock_direct_instance.extract.assert_called_once()

                # PP-StructureV3 params should NOT be passed to direct extraction
                call_kwargs = mock_direct_instance.extract.call_args[1]
                assert 'pp_structure_params' not in call_kwargs


class TestPPStructureParamsLogging:
    """Test parameter logging"""

    def test_custom_params_are_logged(self):
        """Verify custom parameters are logged for debugging"""
        ocr_service = OCRService()

        custom_params = {
            'layout_detection_threshold': 0.1,
            'text_det_thresh': 0.15,
        }

        with patch('app.services.ocr_service.PPStructureV3') as mock_ppstructure:
            with patch('app.services.ocr_service.logger') as mock_logger:
                mock_ppstructure.return_value = Mock()

                # Call with custom params
                ocr_service._ensure_structure_engine(custom_params=custom_params)

                # Verify logging
                assert mock_logger.info.call_count >= 2

                # Check that custom params were logged: at least one info()
                # call must mention "custom" (case-insensitive) in its repr.
                logged = [str(entry) for entry in mock_logger.info.call_args_list]
                assert any('custom' in entry.lower() for entry in logged)


if __name__ == '__main__':
    pytest.main([__file__, '-v'])