# File listing metadata: 351 lines, 13 KiB, Python
"""
|
|
Tool_OCR - Document Preprocessor Unit Tests
|
|
Tests for app/services/preprocessor.py
|
|
"""
|
|
|
|
import pytest
|
|
from pathlib import Path
|
|
from PIL import Image
|
|
|
|
from app.services.preprocessor import DocumentPreprocessor
|
|
|
|
|
|
@pytest.mark.unit
class TestDocumentPreprocessor:
    """Checks on the preprocessor's basic attributes and format tables."""

    def test_init(self, preprocessor):
        """The fixture-provided preprocessor has a size limit and extensions."""
        assert preprocessor is not None
        assert preprocessor.max_file_size > 0
        assert len(preprocessor.allowed_extensions) > 0

        # These three extensions must always be accepted.
        for required in ('png', 'jpg', 'pdf'):
            assert required in preprocessor.allowed_extensions

    def test_supported_formats(self, preprocessor):
        """Every expected image/PDF format is listed, and nothing else."""
        image_formats = ['png', 'jpg', 'jpeg', 'bmp', 'tiff', 'tif']
        pdf_formats = ['pdf']

        # Collect anything absent so a failure names the missing formats.
        missing_images = [
            name for name in image_formats
            if name not in preprocessor.SUPPORTED_IMAGE_FORMATS
        ]
        assert not missing_images

        missing_pdf = [
            name for name in pdf_formats
            if name not in preprocessor.SUPPORTED_PDF_FORMAT
        ]
        assert not missing_pdf

        # The combined table must match the expected set exactly.
        expected = set(image_formats + pdf_formats)
        assert set(preprocessor.ALL_SUPPORTED_FORMATS) == expected
|
|
|
|
|
|
@pytest.mark.unit
class TestFileValidation:
    """Tests for ``validate_file``.

    Every test unpacks the call result as ``(is_valid, file_format, error)``.
    Negative tests assert that ``error`` is not ``None`` before inspecting its
    text, so a missing message fails as a plain assertion instead of an
    ``AttributeError`` raised by ``None.lower()``.
    """

    def test_validate_valid_png(self, preprocessor, sample_image_path):
        """Test validation of a valid PNG file"""
        is_valid, file_format, error = preprocessor.validate_file(sample_image_path)

        assert is_valid is True
        assert file_format == 'png'
        assert error is None

    def test_validate_valid_jpg(self, preprocessor, sample_jpg_path):
        """Test validation of a valid JPG file"""
        is_valid, file_format, error = preprocessor.validate_file(sample_jpg_path)

        assert is_valid is True
        assert file_format == 'jpg'
        assert error is None

    def test_validate_valid_pdf(self, preprocessor, sample_pdf_path):
        """Test validation of a valid PDF file"""
        is_valid, file_format, error = preprocessor.validate_file(sample_pdf_path)

        assert is_valid is True
        assert file_format == 'pdf'
        assert error is None

    def test_validate_nonexistent_file(self, preprocessor, temp_dir):
        """Test validation of a non-existent file"""
        fake_path = temp_dir / "nonexistent.png"
        is_valid, file_format, error = preprocessor.validate_file(fake_path)

        assert is_valid is False
        assert file_format is None
        assert error is not None
        assert "not found" in error.lower()

    def test_validate_large_file(self, preprocessor, large_file_path):
        """Test validation of a file exceeding size limit"""
        is_valid, file_format, error = preprocessor.validate_file(large_file_path)

        assert is_valid is False
        assert file_format is None
        assert error is not None
        assert "too large" in error.lower()

    def test_validate_unsupported_format(self, preprocessor, unsupported_file_path):
        """Test validation of unsupported file format"""
        is_valid, file_format, error = preprocessor.validate_file(unsupported_file_path)

        assert is_valid is False
        assert error is not None
        message = error.lower()
        assert "not allowed" in message or "unsupported" in message

    def test_validate_corrupted_image(self, preprocessor, corrupted_image_path):
        """Test validation of a corrupted image file"""
        is_valid, file_format, error = preprocessor.validate_file(corrupted_image_path)

        assert is_valid is False
        assert error is not None
        # Corrupted files may be detected as unsupported type or corrupted
        message = error.lower()
        assert ("corrupted" in message or
                "unsupported" in message or
                "not allowed" in message)
|
|
|
|
|
|
@pytest.mark.unit
class TestMimeTypeMapping:
    """Exercise ``_mime_to_format`` with known and unknown MIME strings."""

    def test_mime_to_format_png(self, preprocessor):
        """'image/png' resolves to 'png'."""
        resolved = preprocessor._mime_to_format('image/png')
        assert resolved == 'png'

    def test_mime_to_format_jpeg(self, preprocessor):
        """Both JPEG MIME spellings resolve to 'jpg'."""
        for mime in ('image/jpeg', 'image/jpg'):
            assert preprocessor._mime_to_format(mime) == 'jpg'

    def test_mime_to_format_pdf(self, preprocessor):
        """'application/pdf' resolves to 'pdf'."""
        resolved = preprocessor._mime_to_format('application/pdf')
        assert resolved == 'pdf'

    def test_mime_to_format_tiff(self, preprocessor):
        """Both TIFF MIME spellings resolve to 'tiff'."""
        for mime in ('image/tiff', 'image/x-tiff'):
            assert preprocessor._mime_to_format(mime) == 'tiff'

    def test_mime_to_format_bmp(self, preprocessor):
        """'image/bmp' resolves to 'bmp'."""
        resolved = preprocessor._mime_to_format('image/bmp')
        assert resolved == 'bmp'

    def test_mime_to_format_unknown(self, preprocessor):
        """Unrecognised MIME types yield None."""
        for mime in ('unknown/type', 'text/plain'):
            assert preprocessor._mime_to_format(mime) is None
|
|
|
|
|
|
@pytest.mark.unit
class TestIntegrityValidation:
    """Tests for ``_validate_integrity``.

    Every test unpacks the call result as ``(is_valid, error)``.
    """

    def test_validate_integrity_valid_png(self, preprocessor, sample_image_path):
        """Test integrity check for valid PNG"""
        is_valid, error = preprocessor._validate_integrity(sample_image_path, 'png')

        assert is_valid is True
        assert error is None

    def test_validate_integrity_valid_jpg(self, preprocessor, sample_jpg_path):
        """Test integrity check for valid JPG"""
        is_valid, error = preprocessor._validate_integrity(sample_jpg_path, 'jpg')

        assert is_valid is True
        assert error is None

    def test_validate_integrity_valid_pdf(self, preprocessor, sample_pdf_path):
        """Test integrity check for valid PDF"""
        is_valid, error = preprocessor._validate_integrity(sample_pdf_path, 'pdf')

        assert is_valid is True
        assert error is None

    def test_validate_integrity_corrupted_image(self, preprocessor, corrupted_image_path):
        """Test integrity check for corrupted image"""
        is_valid, error = preprocessor._validate_integrity(corrupted_image_path, 'png')

        assert is_valid is False
        assert error is not None

    def test_validate_integrity_invalid_pdf_header(self, preprocessor, temp_dir):
        """Test integrity check for PDF with invalid header"""
        invalid_pdf = temp_dir / "invalid.pdf"
        # Same pathlib-style helper family as the ``write_text`` call in
        # ``test_validate_integrity_unknown_format``; no manual file handle.
        invalid_pdf.write_bytes(b'Not a PDF file')

        is_valid, error = preprocessor._validate_integrity(invalid_pdf, 'pdf')

        assert is_valid is False
        # Guard first so a missing message fails as an assertion rather than
        # an AttributeError on ``None.lower()``.
        assert error is not None
        message = error.lower()
        assert "invalid" in message or "header" in message

    def test_validate_integrity_unknown_format(self, preprocessor, temp_dir):
        """Test integrity check for unknown format"""
        test_file = temp_dir / "test.xyz"
        test_file.write_text("test")

        is_valid, error = preprocessor._validate_integrity(test_file, 'xyz')

        assert is_valid is False
        assert error is not None
|
|
|
|
|
|
@pytest.mark.unit
class TestImagePreprocessing:
    """Tests for ``preprocess_image``.

    Every test unpacks the call result as ``(success, path, error)``.
    """

    def test_preprocess_image_without_enhancement(self, preprocessor, sample_image_path):
        """With enhance=False the returned path is the input path."""
        outcome = preprocessor.preprocess_image(sample_image_path, enhance=False)
        ok, produced, problem = outcome

        assert ok is True
        assert produced == sample_image_path
        assert problem is None

    def test_preprocess_image_with_enhancement(self, preprocessor, sample_image_with_text, temp_dir):
        """With enhance=True and an explicit output path, that file is written."""
        target = temp_dir / "processed.png"

        ok, produced, problem = preprocessor.preprocess_image(
            sample_image_with_text, enhance=True, output_path=target
        )

        assert ok is True
        assert produced == target
        assert produced.exists()
        assert problem is None

        # The written file must open as an image with non-zero dimensions.
        with Image.open(produced) as picture:
            width, height = picture.size
            assert width > 0
            assert height > 0

    def test_preprocess_image_auto_output_path(self, preprocessor, sample_image_with_text):
        """Without an output path, the result name contains 'processed_'."""
        outcome = preprocessor.preprocess_image(sample_image_with_text, enhance=True)
        ok, produced, problem = outcome

        assert ok is True
        assert produced is not None
        assert produced.exists()
        assert "processed_" in produced.name
        assert problem is None

    def test_preprocess_nonexistent_image(self, preprocessor, temp_dir):
        """A missing input file yields failure, no path, and an error."""
        missing = temp_dir / "nonexistent.png"

        ok, produced, problem = preprocessor.preprocess_image(missing, enhance=True)

        assert ok is False
        assert produced is None
        assert problem is not None

    def test_preprocess_corrupted_image(self, preprocessor, corrupted_image_path):
        """A corrupted input file yields failure, no path, and an error."""
        outcome = preprocessor.preprocess_image(corrupted_image_path, enhance=True)
        ok, produced, problem = outcome

        assert ok is False
        assert produced is None
        assert problem is not None
|
|
|
|
|
|
@pytest.mark.unit
class TestFileInfo:
    """Tests for the dictionary returned by ``get_file_info``."""

    def test_get_file_info_png(self, preprocessor, sample_image_path):
        """PNG info carries name, path, sizes, MIME type, format and timestamps."""
        details = preprocessor.get_file_info(sample_image_path)

        assert details['name'] == sample_image_path.name
        assert details['path'] == str(sample_image_path)
        assert details['size'] > 0
        assert details['size_mb'] > 0
        assert details['mime_type'] == 'image/png'
        assert details['format'] == 'png'
        for stamp_key in ('created_at', 'modified_at'):
            assert stamp_key in details

    def test_get_file_info_jpg(self, preprocessor, sample_jpg_path):
        """JPG info reports the JPEG MIME type and the 'jpg' format."""
        details = preprocessor.get_file_info(sample_jpg_path)

        assert details['name'] == sample_jpg_path.name
        assert details['mime_type'] == 'image/jpeg'
        assert details['format'] == 'jpg'

    def test_get_file_info_pdf(self, preprocessor, sample_pdf_path):
        """PDF info reports the PDF MIME type and the 'pdf' format."""
        details = preprocessor.get_file_info(sample_pdf_path)

        assert details['name'] == sample_pdf_path.name
        assert details['mime_type'] == 'application/pdf'
        assert details['format'] == 'pdf'

    def test_get_file_info_size_calculation(self, preprocessor, sample_image_path):
        """Reported byte size matches stat(); size_mb is within 0.001 of bytes/MiB."""
        details = preprocessor.get_file_info(sample_image_path)

        byte_count = sample_image_path.stat().st_size
        expected_mb = byte_count / (1024 * 1024)

        assert details['size'] == byte_count
        assert abs(details['size_mb'] - expected_mb) < 0.001
|
|
|
|
|
|
@pytest.mark.unit
class TestEdgeCases:
    """Boundary inputs: empty files, misleading extensions, tiny images."""

    def test_validate_empty_file(self, preprocessor, temp_dir):
        """A zero-byte file named .png must not validate."""
        blank = temp_dir / "empty.png"
        blank.touch()

        outcome = preprocessor.validate_file(blank)
        ok, detected, problem = outcome

        # Should fail because empty file has no valid MIME type or is corrupted
        assert ok is False

    def test_validate_file_with_wrong_extension(self, preprocessor, temp_dir):
        """PNG content saved under a .txt name is still expected to validate as PNG."""
        # Create a PNG file but name it .txt
        disguised = temp_dir / "image.txt"
        Image.new('RGB', (10, 10), color='white').save(disguised, 'PNG')

        # Validation uses MIME detection, not extension
        # So a PNG file named .txt should pass if PNG is in allowed_extensions
        ok, detected, problem = preprocessor.validate_file(disguised)

        # Should succeed because MIME detection finds it's a PNG
        # (preprocessor uses magic number detection, not file extension)
        assert ok is True
        assert detected == 'png'

    def test_preprocess_very_small_image(self, preprocessor, temp_dir):
        """A 5x5 image is expected to preprocess successfully."""
        tiny = temp_dir / "small.png"
        Image.new('RGB', (5, 5), color='white').save(tiny, 'PNG')

        ok, produced, problem = preprocessor.preprocess_image(tiny, enhance=True)

        # Should succeed even with very small image
        assert ok is True
        assert produced is not None
        assert produced.exists()
|