# OCR/backend/tests/test_preprocessor.py

"""
Tool_OCR - Document Preprocessor Unit Tests
Tests for app/services/preprocessor.py
"""

import pytest
from pathlib import Path
from PIL import Image

from app.services.preprocessor import DocumentPreprocessor


@pytest.mark.unit
class TestDocumentPreprocessor:
    """Sanity checks on the preprocessor's configuration and format tables."""

    def test_init(self, preprocessor):
        """The fixture instance exposes a size limit and the core extensions."""
        assert preprocessor is not None
        assert preprocessor.max_file_size > 0

        extensions = preprocessor.allowed_extensions
        assert len(extensions) > 0
        for required in ('png', 'jpg', 'pdf'):
            assert required in extensions

    def test_supported_formats(self, preprocessor):
        """The class-level format tables contain exactly the expected entries."""
        image_formats = ['png', 'jpg', 'jpeg', 'bmp', 'tiff', 'tif']
        pdf_formats = ['pdf']

        # Collect misses instead of stopping at the first one so a failure
        # report names every absent format.
        missing_images = [
            fmt for fmt in image_formats
            if fmt not in preprocessor.SUPPORTED_IMAGE_FORMATS
        ]
        assert not missing_images

        missing_pdf = [
            fmt for fmt in pdf_formats
            if fmt not in preprocessor.SUPPORTED_PDF_FORMAT
        ]
        assert not missing_pdf

        # The combined table must match the union exactly (no extras either).
        expected_union = {*image_formats, *pdf_formats}
        assert set(preprocessor.ALL_SUPPORTED_FORMATS) == expected_union


@pytest.mark.unit
class TestFileValidation:
    """Exercise ``validate_file`` on valid, missing, oversized and bad inputs.

    Every test unpacks the ``(is_valid, file_format, error)`` triple returned
    by ``preprocessor.validate_file``.
    """

    def test_validate_valid_png(self, preprocessor, sample_image_path):
        """The sample image validates as ``'png'`` with no error message."""
        is_valid, file_format, error = preprocessor.validate_file(sample_image_path)

        assert is_valid is True
        assert file_format == 'png'
        assert error is None

    def test_validate_valid_jpg(self, preprocessor, sample_jpg_path):
        """The sample JPG validates as ``'jpg'`` with no error message."""
        is_valid, file_format, error = preprocessor.validate_file(sample_jpg_path)

        assert is_valid is True
        assert file_format == 'jpg'
        assert error is None

    def test_validate_valid_pdf(self, preprocessor, sample_pdf_path):
        """The sample PDF validates as ``'pdf'`` with no error message."""
        is_valid, file_format, error = preprocessor.validate_file(sample_pdf_path)

        assert is_valid is True
        assert file_format == 'pdf'
        assert error is None

    def test_validate_nonexistent_file(self, preprocessor, temp_dir):
        """A path that was never created is rejected with a "not found" error."""
        fake_path = temp_dir / "nonexistent.png"
        is_valid, file_format, error = preprocessor.validate_file(fake_path)

        assert is_valid is False
        assert file_format is None
        # Guard first: a missing message should fail as an assertion, not as
        # an AttributeError raised by calling ``.lower()`` on ``None``.
        assert error is not None
        assert "not found" in error.lower()

    def test_validate_large_file(self, preprocessor, large_file_path):
        """The oversized fixture file is rejected with a "too large" error."""
        is_valid, file_format, error = preprocessor.validate_file(large_file_path)

        assert is_valid is False
        assert file_format is None
        # Same guard as above so the failure mode stays readable.
        assert error is not None
        assert "too large" in error.lower()

    def test_validate_unsupported_format(self, preprocessor, unsupported_file_path):
        """An unsupported file type is rejected with a matching error message."""
        is_valid, file_format, error = preprocessor.validate_file(unsupported_file_path)

        assert is_valid is False
        assert error is not None
        message = error.lower()
        assert "not allowed" in message or "unsupported" in message

    def test_validate_corrupted_image(self, preprocessor, corrupted_image_path):
        """A corrupted image is rejected; several error wordings are accepted."""
        is_valid, file_format, error = preprocessor.validate_file(corrupted_image_path)

        assert is_valid is False
        assert error is not None
        # Corrupted files may be detected as unsupported type or corrupted
        message = error.lower()
        assert ("corrupted" in message or
                "unsupported" in message or
                "not allowed" in message)


@pytest.mark.unit
class TestMimeTypeMapping:
    """Check the translation performed by ``_mime_to_format``."""

    def test_mime_to_format_png(self, preprocessor):
        """``image/png`` maps to ``'png'``."""
        detected = preprocessor._mime_to_format('image/png')
        assert detected == 'png'

    def test_mime_to_format_jpeg(self, preprocessor):
        """Both JPEG MIME spellings map to ``'jpg'``."""
        for mime in ('image/jpeg', 'image/jpg'):
            assert preprocessor._mime_to_format(mime) == 'jpg'

    def test_mime_to_format_pdf(self, preprocessor):
        """``application/pdf`` maps to ``'pdf'``."""
        detected = preprocessor._mime_to_format('application/pdf')
        assert detected == 'pdf'

    def test_mime_to_format_tiff(self, preprocessor):
        """Both TIFF MIME spellings map to ``'tiff'``."""
        for mime in ('image/tiff', 'image/x-tiff'):
            assert preprocessor._mime_to_format(mime) == 'tiff'

    def test_mime_to_format_bmp(self, preprocessor):
        """``image/bmp`` maps to ``'bmp'``."""
        detected = preprocessor._mime_to_format('image/bmp')
        assert detected == 'bmp'

    def test_mime_to_format_unknown(self, preprocessor):
        """MIME types outside the mapping yield ``None``."""
        for mime in ('unknown/type', 'text/plain'):
            assert preprocessor._mime_to_format(mime) is None


@pytest.mark.unit
class TestIntegrityValidation:
    """Exercise ``_validate_integrity``, which returns ``(is_valid, error)``."""

    def test_validate_integrity_valid_png(self, preprocessor, sample_image_path):
        """The sample image passes the integrity check as ``'png'``."""
        is_valid, error = preprocessor._validate_integrity(sample_image_path, 'png')

        assert is_valid is True
        assert error is None

    def test_validate_integrity_valid_jpg(self, preprocessor, sample_jpg_path):
        """The sample JPG passes the integrity check as ``'jpg'``."""
        is_valid, error = preprocessor._validate_integrity(sample_jpg_path, 'jpg')

        assert is_valid is True
        assert error is None

    def test_validate_integrity_valid_pdf(self, preprocessor, sample_pdf_path):
        """The sample PDF passes the integrity check as ``'pdf'``."""
        is_valid, error = preprocessor._validate_integrity(sample_pdf_path, 'pdf')

        assert is_valid is True
        assert error is None

    def test_validate_integrity_corrupted_image(self, preprocessor, corrupted_image_path):
        """A corrupted image checked as ``'png'`` fails with an error message."""
        is_valid, error = preprocessor._validate_integrity(corrupted_image_path, 'png')

        assert is_valid is False
        assert error is not None

    def test_validate_integrity_invalid_pdf_header(self, preprocessor, temp_dir):
        """A ``.pdf`` file whose bytes are not a PDF fails the header check."""
        invalid_pdf = temp_dir / "invalid.pdf"
        invalid_pdf.write_bytes(b'Not a PDF file')

        is_valid, error = preprocessor._validate_integrity(invalid_pdf, 'pdf')

        assert is_valid is False
        # Guard first: a missing message should fail as an assertion, not as
        # an AttributeError raised by calling ``.lower()`` on ``None``.
        assert error is not None
        message = error.lower()
        assert "invalid" in message or "header" in message

    def test_validate_integrity_unknown_format(self, preprocessor, temp_dir):
        """A format name the checker does not know is reported as invalid."""
        test_file = temp_dir / "test.xyz"
        test_file.write_text("test")

        is_valid, error = preprocessor._validate_integrity(test_file, 'xyz')

        assert is_valid is False
        assert error is not None


@pytest.mark.unit
class TestImagePreprocessing:
    """Exercise ``preprocess_image``, which returns ``(success, path, error)``."""

    def test_preprocess_image_without_enhancement(self, preprocessor, sample_image_path):
        """With ``enhance=False`` the input path is handed back unchanged."""
        outcome = preprocessor.preprocess_image(sample_image_path, enhance=False)
        ok, returned_path, message = outcome

        assert ok is True
        assert returned_path == sample_image_path
        assert message is None

    def test_preprocess_image_with_enhancement(self, preprocessor, sample_image_with_text, temp_dir):
        """With an explicit ``output_path`` the enhanced image is written there."""
        target = temp_dir / "processed.png"

        ok, returned_path, message = preprocessor.preprocess_image(
            sample_image_with_text, enhance=True, output_path=target
        )

        assert ok is True
        assert returned_path == target
        assert returned_path.exists()
        assert message is None

        # The written file must open as an image with non-zero dimensions.
        with Image.open(returned_path) as processed:
            width, height = processed.size
            assert width > 0
            assert height > 0

    def test_preprocess_image_auto_output_path(self, preprocessor, sample_image_with_text):
        """Without ``output_path`` a ``processed_``-named file is produced."""
        ok, returned_path, message = preprocessor.preprocess_image(
            sample_image_with_text, enhance=True
        )

        assert ok is True
        assert returned_path is not None
        assert returned_path.exists()
        assert "processed_" in returned_path.name
        assert message is None

    def test_preprocess_nonexistent_image(self, preprocessor, temp_dir):
        """A missing input file yields failure, no path and an error message."""
        missing = temp_dir / "nonexistent.png"

        ok, returned_path, message = preprocessor.preprocess_image(
            missing, enhance=True
        )

        assert ok is False
        assert returned_path is None
        assert message is not None

    def test_preprocess_corrupted_image(self, preprocessor, corrupted_image_path):
        """A corrupted input yields failure, no path and an error message."""
        ok, returned_path, message = preprocessor.preprocess_image(
            corrupted_image_path, enhance=True
        )

        assert ok is False
        assert returned_path is None
        assert message is not None


@pytest.mark.unit
class TestFileInfo:
    """Check the dictionary returned by ``get_file_info``."""

    def test_get_file_info_png(self, preprocessor, sample_image_path):
        """All expected keys are present and correct for the sample image."""
        details = preprocessor.get_file_info(sample_image_path)

        assert details['name'] == sample_image_path.name
        assert details['path'] == str(sample_image_path)
        assert details['size'] > 0
        assert details['size_mb'] > 0
        assert details['mime_type'] == 'image/png'
        assert details['format'] == 'png'
        for timestamp_key in ('created_at', 'modified_at'):
            assert timestamp_key in details

    def test_get_file_info_jpg(self, preprocessor, sample_jpg_path):
        """Name, MIME type and format are reported for the sample JPG."""
        details = preprocessor.get_file_info(sample_jpg_path)

        assert details['name'] == sample_jpg_path.name
        assert details['mime_type'] == 'image/jpeg'
        assert details['format'] == 'jpg'

    def test_get_file_info_pdf(self, preprocessor, sample_pdf_path):
        """Name, MIME type and format are reported for the sample PDF."""
        details = preprocessor.get_file_info(sample_pdf_path)

        assert details['name'] == sample_pdf_path.name
        assert details['mime_type'] == 'application/pdf'
        assert details['format'] == 'pdf'

    def test_get_file_info_size_calculation(self, preprocessor, sample_image_path):
        """``size`` matches the on-disk byte count and ``size_mb`` agrees with it."""
        details = preprocessor.get_file_info(sample_image_path)

        size_on_disk = sample_image_path.stat().st_size
        expected_mb = size_on_disk / (1024 * 1024)

        assert details['size'] == size_on_disk
        # size_mb only has to agree to within 0.001 MB.
        assert abs(details['size_mb'] - expected_mb) < 0.001


@pytest.mark.unit
class TestEdgeCases:
    """Boundary inputs: empty files, misleading extensions, tiny images."""

    def test_validate_empty_file(self, preprocessor, temp_dir):
        """A zero-byte file with a ``.png`` name must not validate."""
        blank = temp_dir / "empty.png"
        blank.touch()

        outcome = preprocessor.validate_file(blank)
        ok, detected_format, message = outcome

        # An empty file has no valid MIME type (or counts as corrupted),
        # so validation is expected to reject it.
        assert ok is False

    def test_validate_file_with_wrong_extension(self, preprocessor, temp_dir):
        """PNG content stored under a ``.txt`` name still validates as PNG."""
        # Write real PNG bytes to a file whose name claims to be text.
        disguised = temp_dir / "image.txt"
        Image.new('RGB', (10, 10), color='white').save(disguised, 'PNG')

        # The test expects validation to go by content (MIME / magic-number
        # detection) rather than by file extension, so the PNG should be
        # accepted as long as PNG is among the allowed extensions.
        ok, detected_format, message = preprocessor.validate_file(disguised)

        assert ok is True
        assert detected_format == 'png'

    def test_preprocess_very_small_image(self, preprocessor, temp_dir):
        """A 5x5 image goes through enhancement without failing."""
        tiny = temp_dir / "small.png"
        Image.new('RGB', (5, 5), color='white').save(tiny, 'PNG')

        ok, returned_path, message = preprocessor.preprocess_image(
            tiny, enhance=True
        )

        # Even a very small image should be processed successfully.
        assert ok is True
        assert returned_path is not None
        assert returned_path.exists()