""" MIME Type Validation Service using Magic Bytes Detection. This module provides file content type validation by examining the actual file content (magic bytes) rather than trusting the file extension or Content-Type header. """ import logging from typing import Optional, Tuple, Dict, Set, BinaryIO from io import BytesIO logger = logging.getLogger(__name__) class MimeValidationError(Exception): """Raised when MIME type validation fails.""" pass class FileMismatchError(MimeValidationError): """Raised when file extension doesn't match actual content type.""" pass class UnsupportedMimeError(MimeValidationError): """Raised when file has an unsupported MIME type.""" pass # Magic bytes signatures for common file types # Format: { bytes_pattern: (mime_type, extensions) } MAGIC_SIGNATURES: Dict[bytes, Tuple[str, Set[str]]] = { # Images b'\xFF\xD8\xFF': ('image/jpeg', {'jpg', 'jpeg', 'jpe'}), b'\x89PNG\r\n\x1a\n': ('image/png', {'png'}), b'GIF87a': ('image/gif', {'gif'}), b'GIF89a': ('image/gif', {'gif'}), b'RIFF': ('image/webp', {'webp'}), # WebP starts with RIFF, then WEBP b'BM': ('image/bmp', {'bmp'}), # PDF b'%PDF': ('application/pdf', {'pdf'}), # Microsoft Office (Modern formats - ZIP-based) b'PK\x03\x04': ('application/zip', {'zip', 'docx', 'xlsx', 'pptx', 'odt', 'ods', 'odp', 'jar'}), # Microsoft Office (Legacy formats - Compound Document) b'\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1': ('application/msword', {'doc', 'xls', 'ppt', 'msi'}), # Archives b'\x1f\x8b': ('application/gzip', {'gz', 'tgz'}), b'\x42\x5a\x68': ('application/x-bzip2', {'bz2'}), b'\x37\x7A\xBC\xAF\x27\x1C': ('application/x-7z-compressed', {'7z'}), b'Rar!\x1a\x07': ('application/x-rar-compressed', {'rar'}), # Text/Data formats - these are harder to detect, usually fallback to extension b' Optional[str]: """ Detect MIME type from file content using magic bytes. Args: file_content: The raw file bytes (at least first 16 bytes needed) Returns: Detected MIME type or None if unknown """ if len(file_content) < 2: return None # Check each magic signature for magic_bytes, (mime_type, _) in MAGIC_SIGNATURES.items(): if file_content.startswith(magic_bytes): # Special case for WebP: check for WEBP after RIFF if magic_bytes == b'RIFF' and len(file_content) >= 12: if file_content[8:12] == b'WEBP': return 'image/webp' else: continue # Not WebP, might be something else return mime_type return None def validate_file_content( self, file_content: bytes, declared_extension: str, declared_mime_type: Optional[str] = None, trusted_source: bool = False ) -> Tuple[bool, str, Optional[str]]: """ Validate file content against declared extension and MIME type. Args: file_content: The raw file bytes declared_extension: The file extension (without dot) declared_mime_type: The Content-Type header value (optional) trusted_source: If True and bypass_for_trusted is enabled, skip validation Returns: Tuple of (is_valid, detected_mime_type, error_message) """ # Bypass for trusted sources if configured if trusted_source and self.bypass_for_trusted: logger.debug("MIME validation bypassed for trusted source") return True, declared_mime_type or 'application/octet-stream', None # Detect actual MIME type detected_mime = self.detect_mime_type(file_content) ext_lower = declared_extension.lower() # Check if detected MIME is blocked (dangerous executable) if detected_mime in BLOCKED_MIME_TYPES: logger.warning( "Blocked dangerous file type detected: %s (claimed extension: %s)", detected_mime, ext_lower ) return False, detected_mime, "File type not allowed for security reasons" # If we couldn't detect the MIME type, fall back to extension-based check if detected_mime is None: # For text/data files, detection is unreliable # Trust the extension if it's in our allowed list if ext_lower in EXTENSION_TO_MIME: expected_mimes = EXTENSION_TO_MIME[ext_lower] # Check if any expected MIME is in allowed set if expected_mimes & self.allowed_mime_types: logger.debug( "MIME detection inconclusive for extension %s, allowing based on extension", ext_lower ) # Return the first expected MIME type return True, next(iter(expected_mimes)), None # Unknown extension or MIME type logger.warning( "Could not detect MIME type for file with extension: %s", ext_lower ) return True, 'application/octet-stream', None # Check if detected MIME is in allowed set if detected_mime not in self.allowed_mime_types: logger.warning( "Unsupported MIME type detected: %s (extension: %s)", detected_mime, ext_lower ) return False, detected_mime, f"Unsupported file type: {detected_mime}" # Verify extension matches detected MIME type if ext_lower in EXTENSION_TO_MIME: expected_mimes = EXTENSION_TO_MIME[ext_lower] # Special handling for ZIP-based formats (docx, xlsx, pptx) if detected_mime == 'application/zip' and ext_lower in {'docx', 'xlsx', 'pptx', 'odt', 'ods', 'odp'}: # These are valid - ZIP container with specific extension return True, detected_mime, None # Check if detected MIME matches any expected MIME for this extension if detected_mime not in expected_mimes: # Mismatch detected! logger.warning( "File type mismatch: extension '%s' but detected '%s'", ext_lower, detected_mime ) return False, detected_mime, f"File type mismatch: extension indicates {ext_lower} but content is {detected_mime}" return True, detected_mime, None async def validate_upload_file( self, file_content: bytes, filename: str, content_type: Optional[str] = None, trusted_source: bool = False ) -> Tuple[bool, str, Optional[str]]: """ Validate an uploaded file. Args: file_content: The raw file bytes filename: The uploaded filename content_type: The Content-Type header value trusted_source: If True and bypass is enabled, skip validation Returns: Tuple of (is_valid, detected_mime_type, error_message) """ # Extract extension extension = filename.rsplit('.', 1)[-1] if '.' in filename else '' return self.validate_file_content( file_content=file_content, declared_extension=extension, declared_mime_type=content_type, trusted_source=trusted_source ) # Singleton instance with default configuration mime_validation_service = MimeValidationService()