This commit is contained in:
beabigegg
2025-11-12 22:53:17 +08:00
commit da700721fa
130 changed files with 23393 additions and 0 deletions

View File

@@ -0,0 +1,350 @@
"""
Tool_OCR - Document Preprocessor Unit Tests
Tests for app/services/preprocessor.py
"""
import pytest
from pathlib import Path
from PIL import Image
from app.services.preprocessor import DocumentPreprocessor
@pytest.mark.unit
class TestDocumentPreprocessor:
    """Construction and format-constant checks for DocumentPreprocessor."""

    def test_init(self, preprocessor):
        """The preprocessor fixture exposes a size limit and allowed extensions."""
        assert preprocessor is not None
        assert preprocessor.max_file_size > 0
        assert len(preprocessor.allowed_extensions) > 0

        # The core formats must all be accepted.
        for extension in ('png', 'jpg', 'pdf'):
            assert extension in preprocessor.allowed_extensions

    def test_supported_formats(self, preprocessor):
        """Each expected format is declared in the matching class constant."""
        image_formats = ['png', 'jpg', 'jpeg', 'bmp', 'tiff', 'tif']
        pdf_formats = ['pdf']

        missing_image_formats = [
            fmt for fmt in image_formats
            if fmt not in preprocessor.SUPPORTED_IMAGE_FORMATS
        ]
        assert not missing_image_formats

        missing_pdf_formats = [
            fmt for fmt in pdf_formats
            if fmt not in preprocessor.SUPPORTED_PDF_FORMAT
        ]
        assert not missing_pdf_formats

        # The combined constant must be exactly the union: nothing missing,
        # nothing extra.
        expected_union = set(image_formats + pdf_formats)
        assert set(preprocessor.ALL_SUPPORTED_FORMATS) == expected_union
@pytest.mark.unit
class TestFileValidation:
    """Tests for ``validate_file``.

    Each test unpacks the result as ``(is_valid, file_format, error)``.
    Whenever an error message is inspected, it is first asserted to be
    non-None so that a missing message fails with a clear assertion rather
    than an ``AttributeError`` from ``None.lower()``.
    """

    def test_validate_valid_png(self, preprocessor, sample_image_path):
        """A valid PNG is accepted and reported as 'png'."""
        is_valid, file_format, error = preprocessor.validate_file(sample_image_path)
        assert is_valid is True
        assert file_format == 'png'
        assert error is None

    def test_validate_valid_jpg(self, preprocessor, sample_jpg_path):
        """A valid JPG is accepted and reported as 'jpg'."""
        is_valid, file_format, error = preprocessor.validate_file(sample_jpg_path)
        assert is_valid is True
        assert file_format == 'jpg'
        assert error is None

    def test_validate_valid_pdf(self, preprocessor, sample_pdf_path):
        """A valid PDF is accepted and reported as 'pdf'."""
        is_valid, file_format, error = preprocessor.validate_file(sample_pdf_path)
        assert is_valid is True
        assert file_format == 'pdf'
        assert error is None

    def test_validate_nonexistent_file(self, preprocessor, temp_dir):
        """A path that does not exist is rejected with a 'not found' error."""
        fake_path = temp_dir / "nonexistent.png"
        is_valid, file_format, error = preprocessor.validate_file(fake_path)
        assert is_valid is False
        assert file_format is None
        assert error is not None
        assert "not found" in error.lower()

    def test_validate_large_file(self, preprocessor, large_file_path):
        """A file exceeding the size limit is rejected with a 'too large' error."""
        is_valid, file_format, error = preprocessor.validate_file(large_file_path)
        assert is_valid is False
        assert file_format is None
        assert error is not None
        assert "too large" in error.lower()

    def test_validate_unsupported_format(self, preprocessor, unsupported_file_path):
        """An unsupported file type is rejected with an explanatory error."""
        is_valid, file_format, error = preprocessor.validate_file(unsupported_file_path)
        assert is_valid is False
        assert error is not None
        message = error.lower()
        assert "not allowed" in message or "unsupported" in message

    def test_validate_corrupted_image(self, preprocessor, corrupted_image_path):
        """A corrupted image is rejected with some explanatory error."""
        is_valid, file_format, error = preprocessor.validate_file(corrupted_image_path)
        assert is_valid is False
        assert error is not None
        # Corrupted files may be detected as unsupported type or corrupted
        message = error.lower()
        assert ("corrupted" in message or
                "unsupported" in message or
                "not allowed" in message)
@pytest.mark.unit
class TestMimeTypeMapping:
    """Checks for the ``_mime_to_format`` MIME-type lookup."""

    def test_mime_to_format_png(self, preprocessor):
        """'image/png' maps to 'png'."""
        resolved = preprocessor._mime_to_format('image/png')
        assert resolved == 'png'

    def test_mime_to_format_jpeg(self, preprocessor):
        """Both JPEG MIME spellings map to 'jpg'."""
        for mime_type in ('image/jpeg', 'image/jpg'):
            assert preprocessor._mime_to_format(mime_type) == 'jpg'

    def test_mime_to_format_pdf(self, preprocessor):
        """'application/pdf' maps to 'pdf'."""
        resolved = preprocessor._mime_to_format('application/pdf')
        assert resolved == 'pdf'

    def test_mime_to_format_tiff(self, preprocessor):
        """Both TIFF MIME spellings map to 'tiff'."""
        for mime_type in ('image/tiff', 'image/x-tiff'):
            assert preprocessor._mime_to_format(mime_type) == 'tiff'

    def test_mime_to_format_bmp(self, preprocessor):
        """'image/bmp' maps to 'bmp'."""
        resolved = preprocessor._mime_to_format('image/bmp')
        assert resolved == 'bmp'

    def test_mime_to_format_unknown(self, preprocessor):
        """MIME types outside the mapping resolve to None."""
        for mime_type in ('unknown/type', 'text/plain'):
            assert preprocessor._mime_to_format(mime_type) is None
@pytest.mark.unit
class TestIntegrityValidation:
    """Tests for ``_validate_integrity``.

    Each test unpacks the result as ``(is_valid, error)``. Failure cases
    assert that an error message is present before inspecting its text.
    """

    def test_validate_integrity_valid_png(self, preprocessor, sample_image_path):
        """A valid PNG passes the integrity check."""
        is_valid, error = preprocessor._validate_integrity(sample_image_path, 'png')
        assert is_valid is True
        assert error is None

    def test_validate_integrity_valid_jpg(self, preprocessor, sample_jpg_path):
        """A valid JPG passes the integrity check."""
        is_valid, error = preprocessor._validate_integrity(sample_jpg_path, 'jpg')
        assert is_valid is True
        assert error is None

    def test_validate_integrity_valid_pdf(self, preprocessor, sample_pdf_path):
        """A valid PDF passes the integrity check."""
        is_valid, error = preprocessor._validate_integrity(sample_pdf_path, 'pdf')
        assert is_valid is True
        assert error is None

    def test_validate_integrity_corrupted_image(self, preprocessor, corrupted_image_path):
        """A corrupted image fails the integrity check with an error."""
        is_valid, error = preprocessor._validate_integrity(corrupted_image_path, 'png')
        assert is_valid is False
        assert error is not None

    def test_validate_integrity_invalid_pdf_header(self, preprocessor, temp_dir):
        """A '.pdf' file whose content is not a PDF is rejected."""
        invalid_pdf = temp_dir / "invalid.pdf"
        invalid_pdf.write_bytes(b'Not a PDF file')

        is_valid, error = preprocessor._validate_integrity(invalid_pdf, 'pdf')
        assert is_valid is False
        # Guard first so a missing message is a clear assertion failure
        # instead of an AttributeError on None.
        assert error is not None
        message = error.lower()
        assert "invalid" in message or "header" in message

    def test_validate_integrity_unknown_format(self, preprocessor, temp_dir):
        """A format the checker does not know is rejected with an error."""
        test_file = temp_dir / "test.xyz"
        test_file.write_text("test")

        is_valid, error = preprocessor._validate_integrity(test_file, 'xyz')
        assert is_valid is False
        assert error is not None
@pytest.mark.unit
class TestImagePreprocessing:
    """Checks for ``preprocess_image``.

    Each test unpacks the result as ``(success, path, error)``.
    """

    def test_preprocess_image_without_enhancement(self, preprocessor, sample_image_path):
        """With enhance=False the original path is handed back."""
        outcome = preprocessor.preprocess_image(sample_image_path, enhance=False)
        ok, returned_path, problem = outcome

        assert ok is True
        assert returned_path == sample_image_path
        assert problem is None

    def test_preprocess_image_with_enhancement(self, preprocessor, sample_image_with_text, temp_dir):
        """With an explicit output_path the enhanced image is written there."""
        target = temp_dir / "processed.png"

        ok, produced, problem = preprocessor.preprocess_image(
            sample_image_with_text,
            enhance=True,
            output_path=target,
        )

        assert ok is True
        assert produced == target
        assert produced.exists()
        assert problem is None

        # The written file must open as an image with non-zero dimensions.
        with Image.open(produced) as enhanced:
            assert enhanced.size[0] > 0
            assert enhanced.size[1] > 0

    def test_preprocess_image_auto_output_path(self, preprocessor, sample_image_with_text):
        """Without output_path a 'processed_'-named file is produced."""
        ok, produced, problem = preprocessor.preprocess_image(
            sample_image_with_text,
            enhance=True,
        )

        assert ok is True
        assert produced is not None
        assert produced.exists()
        assert "processed_" in produced.name
        assert problem is None

    def test_preprocess_nonexistent_image(self, preprocessor, temp_dir):
        """A missing input file yields a failure with an error and no path."""
        missing_input = temp_dir / "nonexistent.png"

        ok, produced, problem = preprocessor.preprocess_image(
            missing_input,
            enhance=True,
        )

        assert ok is False
        assert produced is None
        assert problem is not None

    def test_preprocess_corrupted_image(self, preprocessor, corrupted_image_path):
        """A corrupted input file yields a failure with an error and no path."""
        ok, produced, problem = preprocessor.preprocess_image(
            corrupted_image_path,
            enhance=True,
        )

        assert ok is False
        assert produced is None
        assert problem is not None
@pytest.mark.unit
class TestFileInfo:
    """Checks for the dictionary returned by ``get_file_info``."""

    def test_get_file_info_png(self, preprocessor, sample_image_path):
        """A PNG reports its name, path, size, MIME type, format and timestamps."""
        details = preprocessor.get_file_info(sample_image_path)

        assert details['name'] == sample_image_path.name
        assert details['path'] == str(sample_image_path)
        assert details['size'] > 0
        assert details['size_mb'] > 0
        assert details['mime_type'] == 'image/png'
        assert details['format'] == 'png'
        for timestamp_key in ('created_at', 'modified_at'):
            assert timestamp_key in details

    def test_get_file_info_jpg(self, preprocessor, sample_jpg_path):
        """A JPG reports MIME type 'image/jpeg' and format 'jpg'."""
        details = preprocessor.get_file_info(sample_jpg_path)

        assert details['name'] == sample_jpg_path.name
        assert details['mime_type'] == 'image/jpeg'
        assert details['format'] == 'jpg'

    def test_get_file_info_pdf(self, preprocessor, sample_pdf_path):
        """A PDF reports MIME type 'application/pdf' and format 'pdf'."""
        details = preprocessor.get_file_info(sample_pdf_path)

        assert details['name'] == sample_pdf_path.name
        assert details['mime_type'] == 'application/pdf'
        assert details['format'] == 'pdf'

    def test_get_file_info_size_calculation(self, preprocessor, sample_image_path):
        """'size' equals the on-disk byte count; 'size_mb' is that in MiB."""
        details = preprocessor.get_file_info(sample_image_path)
        bytes_on_disk = sample_image_path.stat().st_size

        assert details['size'] == bytes_on_disk

        # Tolerance check on the megabyte figure.
        expected_mb = bytes_on_disk / (1024 * 1024)
        assert abs(details['size_mb'] - expected_mb) < 0.001
@pytest.mark.unit
class TestEdgeCases:
    """Boundary-condition and error-handling checks."""

    def test_validate_empty_file(self, preprocessor, temp_dir):
        """A zero-byte file must not validate."""
        blank = temp_dir / "empty.png"
        blank.touch()

        # Only the validity flag matters here; the comment in the original
        # test expects rejection via a missing MIME type or a corruption check.
        is_valid, _file_format, _error = preprocessor.validate_file(blank)
        assert is_valid is False

    def test_validate_file_with_wrong_extension(self, preprocessor, temp_dir):
        """PNG content under a '.txt' name is still recognised as PNG."""
        disguised = temp_dir / "image.txt"
        Image.new('RGB', (10, 10), color='white').save(disguised, 'PNG')

        # The test expects content-based (magic number / MIME) detection to
        # decide the format, not the file extension.
        is_valid, file_format, _error = preprocessor.validate_file(disguised)
        assert is_valid is True
        assert file_format == 'png'

    def test_preprocess_very_small_image(self, preprocessor, temp_dir):
        """Enhancement succeeds on a 5x5 pixel image."""
        tiny = temp_dir / "small.png"
        Image.new('RGB', (5, 5), color='white').save(tiny, 'PNG')

        ok, produced, _error = preprocessor.preprocess_image(
            tiny,
            enhance=True,
        )

        assert ok is True
        assert produced is not None
        assert produced.exists()