""" Tool_OCR - Document Preprocessor Unit Tests Tests for app/services/preprocessor.py """ import pytest from pathlib import Path from PIL import Image from app.services.preprocessor import DocumentPreprocessor @pytest.mark.unit class TestDocumentPreprocessor: """Test suite for DocumentPreprocessor""" def test_init(self, preprocessor): """Test preprocessor initialization""" assert preprocessor is not None assert preprocessor.max_file_size > 0 assert len(preprocessor.allowed_extensions) > 0 assert 'png' in preprocessor.allowed_extensions assert 'jpg' in preprocessor.allowed_extensions assert 'pdf' in preprocessor.allowed_extensions def test_supported_formats(self, preprocessor): """Test that all expected formats are supported""" expected_image_formats = ['png', 'jpg', 'jpeg', 'bmp', 'tiff', 'tif'] expected_pdf_format = ['pdf'] for fmt in expected_image_formats: assert fmt in preprocessor.SUPPORTED_IMAGE_FORMATS for fmt in expected_pdf_format: assert fmt in preprocessor.SUPPORTED_PDF_FORMAT all_formats = expected_image_formats + expected_pdf_format assert set(preprocessor.ALL_SUPPORTED_FORMATS) == set(all_formats) @pytest.mark.unit class TestFileValidation: """Test file validation methods""" def test_validate_valid_png(self, preprocessor, sample_image_path): """Test validation of a valid PNG file""" is_valid, file_format, error = preprocessor.validate_file(sample_image_path) assert is_valid is True assert file_format == 'png' assert error is None def test_validate_valid_jpg(self, preprocessor, sample_jpg_path): """Test validation of a valid JPG file""" is_valid, file_format, error = preprocessor.validate_file(sample_jpg_path) assert is_valid is True assert file_format == 'jpg' assert error is None def test_validate_valid_pdf(self, preprocessor, sample_pdf_path): """Test validation of a valid PDF file""" is_valid, file_format, error = preprocessor.validate_file(sample_pdf_path) assert is_valid is True assert file_format == 'pdf' assert error is None def test_validate_nonexistent_file(self, preprocessor, temp_dir): """Test validation of a non-existent file""" fake_path = temp_dir / "nonexistent.png" is_valid, file_format, error = preprocessor.validate_file(fake_path) assert is_valid is False assert file_format is None assert "not found" in error.lower() def test_validate_large_file(self, preprocessor, large_file_path): """Test validation of a file exceeding size limit""" is_valid, file_format, error = preprocessor.validate_file(large_file_path) assert is_valid is False assert file_format is None assert "too large" in error.lower() def test_validate_unsupported_format(self, preprocessor, unsupported_file_path): """Test validation of unsupported file format""" is_valid, file_format, error = preprocessor.validate_file(unsupported_file_path) assert is_valid is False assert "not allowed" in error.lower() or "unsupported" in error.lower() def test_validate_corrupted_image(self, preprocessor, corrupted_image_path): """Test validation of a corrupted image file""" is_valid, file_format, error = preprocessor.validate_file(corrupted_image_path) assert is_valid is False assert error is not None # Corrupted files may be detected as unsupported type or corrupted assert ("corrupted" in error.lower() or "unsupported" in error.lower() or "not allowed" in error.lower()) @pytest.mark.unit class TestMimeTypeMapping: """Test MIME type to format mapping""" def test_mime_to_format_png(self, preprocessor): """Test PNG MIME type mapping""" assert preprocessor._mime_to_format('image/png') == 'png' def test_mime_to_format_jpeg(self, preprocessor): """Test JPEG MIME type mapping""" assert preprocessor._mime_to_format('image/jpeg') == 'jpg' assert preprocessor._mime_to_format('image/jpg') == 'jpg' def test_mime_to_format_pdf(self, preprocessor): """Test PDF MIME type mapping""" assert preprocessor._mime_to_format('application/pdf') == 'pdf' def test_mime_to_format_tiff(self, preprocessor): """Test TIFF MIME type mapping""" assert preprocessor._mime_to_format('image/tiff') == 'tiff' assert preprocessor._mime_to_format('image/x-tiff') == 'tiff' def test_mime_to_format_bmp(self, preprocessor): """Test BMP MIME type mapping""" assert preprocessor._mime_to_format('image/bmp') == 'bmp' def test_mime_to_format_unknown(self, preprocessor): """Test unknown MIME type returns None""" assert preprocessor._mime_to_format('unknown/type') is None assert preprocessor._mime_to_format('text/plain') is None @pytest.mark.unit class TestIntegrityValidation: """Test file integrity validation""" def test_validate_integrity_valid_png(self, preprocessor, sample_image_path): """Test integrity check for valid PNG""" is_valid, error = preprocessor._validate_integrity(sample_image_path, 'png') assert is_valid is True assert error is None def test_validate_integrity_valid_jpg(self, preprocessor, sample_jpg_path): """Test integrity check for valid JPG""" is_valid, error = preprocessor._validate_integrity(sample_jpg_path, 'jpg') assert is_valid is True assert error is None def test_validate_integrity_valid_pdf(self, preprocessor, sample_pdf_path): """Test integrity check for valid PDF""" is_valid, error = preprocessor._validate_integrity(sample_pdf_path, 'pdf') assert is_valid is True assert error is None def test_validate_integrity_corrupted_image(self, preprocessor, corrupted_image_path): """Test integrity check for corrupted image""" is_valid, error = preprocessor._validate_integrity(corrupted_image_path, 'png') assert is_valid is False assert error is not None def test_validate_integrity_invalid_pdf_header(self, preprocessor, temp_dir): """Test integrity check for PDF with invalid header""" invalid_pdf = temp_dir / "invalid.pdf" with open(invalid_pdf, 'wb') as f: f.write(b'Not a PDF file') is_valid, error = preprocessor._validate_integrity(invalid_pdf, 'pdf') assert is_valid is False assert "invalid" in error.lower() or "header" in error.lower() def test_validate_integrity_unknown_format(self, preprocessor, temp_dir): """Test integrity check for unknown format""" test_file = temp_dir / "test.xyz" test_file.write_text("test") is_valid, error = preprocessor._validate_integrity(test_file, 'xyz') assert is_valid is False assert error is not None @pytest.mark.unit class TestImagePreprocessing: """Test image preprocessing functionality""" def test_preprocess_image_without_enhancement(self, preprocessor, sample_image_path): """Test preprocessing without enhancement (returns original)""" success, output_path, error = preprocessor.preprocess_image( sample_image_path, enhance=False ) assert success is True assert output_path == sample_image_path assert error is None def test_preprocess_image_with_enhancement(self, preprocessor, sample_image_with_text, temp_dir): """Test preprocessing with enhancement""" output_path = temp_dir / "processed.png" success, result_path, error = preprocessor.preprocess_image( sample_image_with_text, enhance=True, output_path=output_path ) assert success is True assert result_path == output_path assert result_path.exists() assert error is None # Verify the output is a valid image with Image.open(result_path) as img: assert img.size[0] > 0 assert img.size[1] > 0 def test_preprocess_image_auto_output_path(self, preprocessor, sample_image_with_text): """Test preprocessing with automatic output path""" success, result_path, error = preprocessor.preprocess_image( sample_image_with_text, enhance=True ) assert success is True assert result_path is not None assert result_path.exists() assert "processed_" in result_path.name assert error is None def test_preprocess_nonexistent_image(self, preprocessor, temp_dir): """Test preprocessing with non-existent image""" fake_path = temp_dir / "nonexistent.png" success, result_path, error = preprocessor.preprocess_image( fake_path, enhance=True ) assert success is False assert result_path is None assert error is not None def test_preprocess_corrupted_image(self, preprocessor, corrupted_image_path): """Test preprocessing with corrupted image""" success, result_path, error = preprocessor.preprocess_image( corrupted_image_path, enhance=True ) assert success is False assert result_path is None assert error is not None @pytest.mark.unit class TestFileInfo: """Test file information retrieval""" def test_get_file_info_png(self, preprocessor, sample_image_path): """Test getting file info for PNG""" info = preprocessor.get_file_info(sample_image_path) assert info['name'] == sample_image_path.name assert info['path'] == str(sample_image_path) assert info['size'] > 0 assert info['size_mb'] > 0 assert info['mime_type'] == 'image/png' assert info['format'] == 'png' assert 'created_at' in info assert 'modified_at' in info def test_get_file_info_jpg(self, preprocessor, sample_jpg_path): """Test getting file info for JPG""" info = preprocessor.get_file_info(sample_jpg_path) assert info['name'] == sample_jpg_path.name assert info['mime_type'] == 'image/jpeg' assert info['format'] == 'jpg' def test_get_file_info_pdf(self, preprocessor, sample_pdf_path): """Test getting file info for PDF""" info = preprocessor.get_file_info(sample_pdf_path) assert info['name'] == sample_pdf_path.name assert info['mime_type'] == 'application/pdf' assert info['format'] == 'pdf' def test_get_file_info_size_calculation(self, preprocessor, sample_image_path): """Test that file size is correctly calculated""" info = preprocessor.get_file_info(sample_image_path) actual_size = sample_image_path.stat().st_size assert info['size'] == actual_size assert abs(info['size_mb'] - (actual_size / (1024 * 1024))) < 0.001 @pytest.mark.unit class TestEdgeCases: """Test edge cases and error handling""" def test_validate_empty_file(self, preprocessor, temp_dir): """Test validation of empty file""" empty_file = temp_dir / "empty.png" empty_file.touch() is_valid, file_format, error = preprocessor.validate_file(empty_file) # Should fail because empty file has no valid MIME type or is corrupted assert is_valid is False def test_validate_file_with_wrong_extension(self, preprocessor, temp_dir): """Test validation of file with misleading extension""" # Create a PNG file but name it .txt misleading_file = temp_dir / "image.txt" img = Image.new('RGB', (10, 10), color='white') img.save(misleading_file, 'PNG') # Validation uses MIME detection, not extension # So a PNG file named .txt should pass if PNG is in allowed_extensions is_valid, file_format, error = preprocessor.validate_file(misleading_file) # Should succeed because MIME detection finds it's a PNG # (preprocessor uses magic number detection, not file extension) assert is_valid is True assert file_format == 'png' def test_preprocess_very_small_image(self, preprocessor, temp_dir): """Test preprocessing of very small image""" small_image = temp_dir / "small.png" img = Image.new('RGB', (5, 5), color='white') img.save(small_image, 'PNG') success, result_path, error = preprocessor.preprocess_image( small_image, enhance=True ) # Should succeed even with very small image assert success is True assert result_path is not None assert result_path.exists()