"""
Unit tests for Layout Model Selection feature in OCR Service.
This replaces the deprecated PP-StructureV3 parameter tests.
"""

import pytest
import sys
from pathlib import Path
from unittest.mock import Mock, patch, MagicMock

# Mock all external dependencies before importing OCRService.
# NOTE: these entries are written straight into sys.modules at import time and
# this module never removes them, so they remain in place for anything imported
# afterwards in the same interpreter.
sys.modules['paddleocr'] = MagicMock()
sys.modules['PIL'] = MagicMock()
sys.modules['pdf2image'] = MagicMock()

# Mock paddle with version attribute
paddle_mock = MagicMock()
paddle_mock.__version__ = '2.5.0'
paddle_mock.device.get_device.return_value = 'cpu'
paddle_mock.device.get_available_device.return_value = 'cpu'
sys.modules['paddle'] = paddle_mock

# Mock torch
torch_mock = MagicMock()
torch_mock.cuda.is_available.return_value = False
sys.modules['torch'] = torch_mock

# These imports must come after the sys.modules stubs above.
from app.services.ocr_service import OCRService, LAYOUT_MODEL_MAPPING, _USE_PUBLAYNET_DEFAULT  # noqa: E402
from app.core.config import settings  # noqa: E402


def _requested_engine_kwargs(ocr_service, layout_model):
    """Create a structure engine and return the kwargs PPStructureV3 received.

    ``PPStructureV3`` is patched inside ``app.services.ocr_service`` for the
    duration of the call, so no real engine is built.

    Args:
        ocr_service: The ``OCRService`` instance under test.
        layout_model: Value forwarded as ``layout_model`` to
            ``_ensure_structure_engine`` (may be ``None``).

    Returns:
        The keyword-argument dict of the single ``PPStructureV3(...)`` call.

    Raises:
        AssertionError: If ``PPStructureV3`` was not called exactly once.
    """
    with patch('app.services.ocr_service.PPStructureV3') as mock_ppstructure:
        mock_ppstructure.return_value = Mock()
        ocr_service._ensure_structure_engine(layout_model=layout_model)

    # The mock keeps its call record after the patch context exits.
    mock_ppstructure.assert_called_once()
    return mock_ppstructure.call_args[1]


class TestLayoutModelMapping:
    """Test layout model name mapping"""

    def test_layout_model_mapping_exists(self):
        """Verify LAYOUT_MODEL_MAPPING contains the three expected keys"""
        assert 'chinese' in LAYOUT_MODEL_MAPPING
        assert 'default' in LAYOUT_MODEL_MAPPING
        assert 'cdla' in LAYOUT_MODEL_MAPPING

    def test_chinese_model_maps_to_pp_doclayout(self):
        """Verify 'chinese' maps to PP-DocLayout-S"""
        assert LAYOUT_MODEL_MAPPING['chinese'] == 'PP-DocLayout-S'

    def test_default_model_maps_to_publaynet_sentinel(self):
        """Verify 'default' maps to sentinel value for PubLayNet default"""
        # The 'default' model uses a sentinel value that signals
        # "use PubLayNet default (no custom model)"
        assert LAYOUT_MODEL_MAPPING['default'] == _USE_PUBLAYNET_DEFAULT

    def test_cdla_model_maps_to_picodet(self):
        """Verify 'cdla' maps to picodet_lcnet_x1_0_fgd_layout_cdla"""
        assert LAYOUT_MODEL_MAPPING['cdla'] == 'picodet_lcnet_x1_0_fgd_layout_cdla'


class TestLayoutModelEngine:
    """Test engine creation with different layout models"""

    def test_chinese_model_creates_engine_with_pp_doclayout(self):
        """Verify 'chinese' layout model uses PP-DocLayout-S"""
        ocr_service = OCRService()

        # Force structure_engine to None while the engine is requested.
        with patch.object(ocr_service, 'structure_engine', None):
            call_kwargs = _requested_engine_kwargs(ocr_service, 'chinese')

        assert call_kwargs.get('layout_detection_model_name') == 'PP-DocLayout-S'

    def test_default_model_creates_engine_without_model_name(self):
        """Verify 'default' layout model does not specify model name (uses default)"""
        ocr_service = OCRService()

        with patch.object(ocr_service, 'structure_engine', None):
            call_kwargs = _requested_engine_kwargs(ocr_service, 'default')

        # For 'default', layout_detection_model_name should be None or not set
        assert call_kwargs.get('layout_detection_model_name') is None

    def test_cdla_model_creates_engine_with_picodet(self):
        """Verify 'cdla' layout model uses picodet_lcnet_x1_0_fgd_layout_cdla"""
        ocr_service = OCRService()

        with patch.object(ocr_service, 'structure_engine', None):
            call_kwargs = _requested_engine_kwargs(ocr_service, 'cdla')

        assert call_kwargs.get('layout_detection_model_name') == 'picodet_lcnet_x1_0_fgd_layout_cdla'

    def test_none_layout_model_uses_chinese_default(self):
        """Verify None layout_model defaults to 'chinese' model"""
        ocr_service = OCRService()

        with patch.object(ocr_service, 'structure_engine', None):
            # Pass None for layout_model
            call_kwargs = _requested_engine_kwargs(ocr_service, None)

        # Should use 'chinese' model as default
        assert call_kwargs.get('layout_detection_model_name') == 'PP-DocLayout-S'


class TestLayoutModelCaching:
    """Test engine caching behavior with layout models"""

    def test_same_layout_model_uses_cached_engine(self):
        """Verify same layout model reuses cached engine"""
        ocr_service = OCRService()

        with patch('app.services.ocr_service.PPStructureV3') as mock_ppstructure:
            mock_ppstructure.return_value = Mock()

            # First call with 'chinese'
            engine1 = ocr_service._ensure_structure_engine(layout_model='chinese')

            # Second call with same model should use cache
            engine2 = ocr_service._ensure_structure_engine(layout_model='chinese')

            # Verify only one engine was created
            assert mock_ppstructure.call_count == 1
            assert engine1 is engine2

    def test_different_layout_model_creates_new_engine(self):
        """Verify different layout model creates new engine"""
        ocr_service = OCRService()

        with patch('app.services.ocr_service.PPStructureV3') as mock_ppstructure:
            # side_effect hands out a distinct engine per constructor call.
            mock_ppstructure.side_effect = [Mock(), Mock()]

            # First call with 'chinese'
            engine1 = ocr_service._ensure_structure_engine(layout_model='chinese')

            # Second call with 'cdla' should create new engine
            engine2 = ocr_service._ensure_structure_engine(layout_model='cdla')

            # Verify two engines were created
            assert mock_ppstructure.call_count == 2
            assert engine1 is not engine2


class TestLayoutModelFlow:
    """Test layout model parameter flow through processing pipeline"""

    def test_layout_model_passed_to_engine_creation(self):
        """Verify layout_model is passed through to _ensure_structure_engine"""
        # Call with a specific layout_model and check the model that was requested
        call_kwargs = _requested_engine_kwargs(OCRService(), 'cdla')

        assert call_kwargs.get('layout_detection_model_name') == 'picodet_lcnet_x1_0_fgd_layout_cdla'

    def test_layout_model_default_behavior(self):
        """Verify default layout model behavior when None is passed"""
        # Call without layout_model (None)
        call_kwargs = _requested_engine_kwargs(OCRService(), None)

        # Should use config default
        assert call_kwargs.get('layout_detection_model_name') == settings.layout_detection_model_name

    def test_layout_model_unknown_value_falls_back(self):
        """Verify unknown layout model falls back to config default"""
        # Call with unknown layout_model
        call_kwargs = _requested_engine_kwargs(OCRService(), 'unknown_model')

        # Should use config default
        assert call_kwargs.get('layout_detection_model_name') == settings.layout_detection_model_name


class TestLayoutModelLogging:
    """Test layout model logging"""

    def test_layout_model_is_logged(self):
        """Verify layout model selection is logged"""
        ocr_service = OCRService()

        with patch('app.services.ocr_service.PPStructureV3') as mock_ppstructure, \
                patch('app.services.ocr_service.logger') as mock_logger:
            mock_ppstructure.return_value = Mock()

            # Call with specific layout_model
            ocr_service._ensure_structure_engine(layout_model='cdla')

            # Verify logging occurred
            assert mock_logger.info.call_count >= 1

            # Passes if any info-level call mentions 'cdla' or 'layout'
            # (case-insensitive).
            log_calls = [str(call).lower() for call in mock_logger.info.call_args_list]
            assert any('cdla' in entry or 'layout' in entry for entry in log_calls)


if __name__ == '__main__':
    pytest.main([__file__, '-v'])