# OCR/backend/app/services/preprocessor.py

"""
Tool_OCR - Document Preprocessor Service
Handles file validation, format detection, and preprocessing
"""

import magic
from pathlib import Path
from typing import Tuple, Optional
import logging
from PIL import Image
import cv2
import numpy as np

from app.core.config import settings

logger = logging.getLogger(__name__)


class DocumentPreprocessor:
    """
    Document preprocessing service for format standardization.

    Validates incoming documents (existence, size, MIME type, allow-list
    membership and basic integrity) and prepares images for OCR processing.
    """

    SUPPORTED_IMAGE_FORMATS = ['png', 'jpg', 'jpeg', 'bmp', 'tiff', 'tif']
    SUPPORTED_PDF_FORMAT = ['pdf']
    # NOTE(review): the Office formats accepted by _validate_integrity
    # (doc/docx/ppt/pptx) are not part of this list - confirm this is intended.
    ALL_SUPPORTED_FORMATS = SUPPORTED_IMAGE_FORMATS + SUPPORTED_PDF_FORMAT

    # _mime_to_format() only ever yields the canonical spelling ('jpg',
    # 'tiff'); the allow-list may use the alternative spelling instead.
    _FORMAT_ALIASES = {
        'jpg': ('jpg', 'jpeg'),
        'tiff': ('tiff', 'tif'),
    }

    # ZIP member that must be present in each OOXML format, together with the
    # error message reported when it is missing.
    _OOXML_REQUIRED_MEMBER = {
        'docx': ('word/document.xml', "Invalid DOCX structure"),
        'pptx': ('ppt/presentation.xml', "Invalid PPTX structure"),
    }
    _LEGACY_OFFICE_FORMATS = ('doc', 'ppt')

    def __init__(self):
        """Read the allow-list and the upload size limit (bytes) from settings."""
        self.allowed_extensions = settings.allowed_extensions_list
        self.max_file_size = settings.max_upload_size
        logger.info(f"DocumentPreprocessor initialized with allowed_extensions: {self.allowed_extensions}")

    def validate_file(self, file_path: Path) -> Tuple[bool, Optional[str], Optional[str]]:
        """
        Validate file format, size, and integrity

        The checks run in order: existence, size limit, MIME detection,
        allow-list membership, integrity. The first failing check decides
        the result. Unexpected exceptions are logged and reported as a
        validation error rather than propagated.

        Args:
            file_path: Path to the file to validate

        Returns:
            Tuple of (is_valid, file_format, error_message). file_format is
            only set once the format was detected and allowed;
            error_message is None on success.
        """
        try:
            if not file_path.exists():
                return False, None, f"File not found: {file_path}"

            file_size = file_path.stat().st_size
            if file_size > self.max_file_size:
                max_mb = self.max_file_size / (1024 * 1024)
                actual_mb = file_size / (1024 * 1024)
                return False, None, f"File too large: {actual_mb:.2f}MB (max {max_mb:.2f}MB)"

            # Detect the type from the file content (magic numbers) instead
            # of trusting the file name extension.
            mime_type = magic.Magic(mime=True).from_file(str(file_path))

            file_format = self._mime_to_format(mime_type)
            if not file_format:
                return False, None, f"Unsupported file type: {mime_type}"

            if not self._is_format_allowed(file_format):
                return False, None, f"File format '{file_format}' not allowed"

            is_valid, error = self._validate_integrity(file_path, file_format)
            if not is_valid:
                return False, file_format, f"File corrupted: {error}"

            logger.info(f"File validated successfully: {file_path.name} ({file_format})")
            return True, file_format, None

        except Exception as e:
            logger.error(f"File validation error: {str(e)}")
            return False, None, f"Validation error: {str(e)}"

    def _is_format_allowed(self, file_format: str) -> bool:
        """
        Check a canonical format name against the configured allow-list

        A format counts as allowed when the allow-list contains it or one of
        its equivalent spellings (e.g. 'jpeg' for 'jpg', 'tif' for 'tiff').

        Args:
            file_format: Canonical format name as returned by _mime_to_format

        Returns:
            True if the format (or an alias of it) is in allowed_extensions
        """
        spellings = self._FORMAT_ALIASES.get(file_format, (file_format,))
        return any(name in self.allowed_extensions for name in spellings)

    def _mime_to_format(self, mime_type: str) -> Optional[str]:
        """Convert MIME type to file format, or None for unknown MIME types"""
        mime_map = {
            'image/png': 'png',
            'image/jpeg': 'jpg',
            'image/jpg': 'jpg',
            'image/bmp': 'bmp',
            'image/tiff': 'tiff',
            'image/x-tiff': 'tiff',
            'application/pdf': 'pdf',
            'application/msword': 'doc',
            'application/vnd.openxmlformats-officedocument.wordprocessingml.document': 'docx',
            'application/vnd.ms-powerpoint': 'ppt',
            'application/vnd.openxmlformats-officedocument.presentationml.presentation': 'pptx',
        }
        return mime_map.get(mime_type)

    def _validate_integrity(self, file_path: Path, file_format: str) -> Tuple[bool, Optional[str]]:
        """
        Validate file integrity by attempting to open it

        Images are verified and fully decoded, PDFs must start with the
        '%PDF-' signature, DOCX/PPTX must be ZIP archives containing their
        main document part, and legacy DOC/PPT files must be non-empty.
        Any exception raised while checking is reported as a failure.

        Args:
            file_path: Path to file
            file_format: Detected file format

        Returns:
            Tuple of (is_valid, error_message)
        """
        try:
            if file_format in self.SUPPORTED_IMAGE_FORMATS:
                with Image.open(file_path) as img:
                    img.verify()
                # verify() leaves the image object unusable, so reopen it.
                # load() decodes the pixel data, which is what actually
                # surfaces truncated or damaged image content (reading
                # img.size only touches the already-parsed header).
                with Image.open(file_path) as img:
                    img.load()
                return True, None

            if file_format == 'pdf':
                # Basic PDF validation - the file must start with the PDF signature
                with open(file_path, 'rb') as f:
                    header = f.read(5)
                if header != b'%PDF-':
                    return False, "Invalid PDF header"
                return True, None

            if file_format in self._OOXML_REQUIRED_MEMBER:
                # Modern Office formats (docx, pptx) are ZIP-based
                import zipfile

                required_member, structure_error = self._OOXML_REQUIRED_MEMBER[file_format]
                try:
                    with zipfile.ZipFile(file_path, 'r') as zf:
                        if required_member not in zf.namelist():
                            return False, structure_error
                except zipfile.BadZipFile:
                    return False, "Invalid Office file (corrupt ZIP)"
                return True, None

            if file_format in self._LEGACY_OFFICE_FORMATS:
                # Old binary formats (doc, ppt) are not parsed here; only
                # require that the file exists and has content. A missing
                # file makes stat() raise, which is reported below.
                if file_path.stat().st_size == 0:
                    return False, "Empty file"
                return True, None

            return False, f"Unknown format: {file_format}"

        except Exception as e:
            return False, str(e)

    def preprocess_image(
        self,
        image_path: Path,
        enhance: bool = True,
        output_path: Optional[Path] = None
    ) -> Tuple[bool, Optional[Path], Optional[str]]:
        """
        Preprocess image to improve OCR accuracy

        With enhancement enabled the image is converted to grayscale,
        binarized with adaptive Gaussian thresholding, denoised, and written
        to output_path. Unexpected exceptions are logged and reported as a
        preprocessing error rather than propagated.

        Args:
            image_path: Path to input image
            enhance: Whether to apply enhancement; when False the original
                path is returned after confirming the image is readable
            output_path: Optional output path (defaults to
                "processed_<input name>" inside settings.processed_dir)

        Returns:
            Tuple of (success, processed_image_path, error_message)
        """
        try:
            img = cv2.imread(str(image_path))
            if img is None:
                return False, None, "Failed to read image"

            if not enhance:
                # No preprocessing, return original
                return True, image_path, None

            gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

            # Adaptive thresholding copes with uneven lighting across the
            # page: 11 is the neighbourhood block size, 2 the constant
            # subtracted from the weighted mean.
            processed = cv2.adaptiveThreshold(
                gray,
                255,
                cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                cv2.THRESH_BINARY,
                11,
                2
            )

            # Denoise: filter strength 10, template window 7, search window 21
            processed = cv2.fastNlMeansDenoising(processed, None, 10, 7, 21)

            if output_path is None:
                output_path = Path(settings.processed_dir) / f"processed_{image_path.name}"
            else:
                output_path = Path(output_path)

            # cv2.imwrite does not create missing directories and reports
            # failure through its return value rather than by raising, so
            # prepare the directory and check the result explicitly.
            output_path.parent.mkdir(parents=True, exist_ok=True)
            if not cv2.imwrite(str(output_path), processed):
                logger.error(f"Image preprocessing error: failed to write {output_path}")
                return False, None, f"Failed to write processed image: {output_path}"

            logger.info(f"Image preprocessed: {image_path.name} -> {output_path.name}")
            return True, output_path, None

        except Exception as e:
            logger.error(f"Image preprocessing error: {str(e)}")
            return False, None, f"Preprocessing error: {str(e)}"

    def get_file_info(self, file_path: Path) -> dict:
        """
        Get comprehensive file information

        Args:
            file_path: Path to file

        Returns:
            Dictionary with the keys name, path, size (bytes), size_mb,
            mime_type, format (None for unmapped MIME types), created_at and
            modified_at (both as timestamps taken from the file's stat result)

        Raises:
            OSError: If the file cannot be stat'ed (e.g. it does not exist)
        """
        stat = file_path.stat()
        mime_type = magic.Magic(mime=True).from_file(str(file_path))

        return {
            'name': file_path.name,
            'path': str(file_path),
            'size': stat.st_size,
            'size_mb': stat.st_size / (1024 * 1024),
            'mime_type': mime_type,
            'format': self._mime_to_format(mime_type),
            # NOTE(review): st_ctime is the last metadata change on Unix and
            # the creation time only on Windows - confirm this is acceptable.
            'created_at': stat.st_ctime,
            'modified_at': stat.st_mtime,
        }