first
This commit is contained in:
230
backend/app/services/preprocessor.py
Normal file
230
backend/app/services/preprocessor.py
Normal file
@@ -0,0 +1,230 @@
|
||||
"""
|
||||
Tool_OCR - Document Preprocessor Service
|
||||
Handles file validation, format detection, and preprocessing
|
||||
"""
|
||||
|
||||
import magic
|
||||
from pathlib import Path
|
||||
from typing import Tuple, Optional
|
||||
import logging
|
||||
from PIL import Image
|
||||
import cv2
|
||||
import numpy as np
|
||||
|
||||
from app.core.config import settings
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class DocumentPreprocessor:
    """
    Document preprocessing service for format standardization.

    Validates documents (size limit, content-sniffed type, allow-list and a
    basic integrity check) and optionally enhances images before they are
    handed to OCR processing.
    """

    SUPPORTED_IMAGE_FORMATS = ['png', 'jpg', 'jpeg', 'bmp', 'tiff', 'tif']
    SUPPORTED_PDF_FORMAT = ['pdf']
    ALL_SUPPORTED_FORMATS = SUPPORTED_IMAGE_FORMATS + SUPPORTED_PDF_FORMAT

    # MIME type -> canonical format name. Built once instead of per call.
    # 'image/x-ms-bmp' / 'image/x-bmp' are what some libmagic builds report
    # for BMP files instead of 'image/bmp'.
    _MIME_TO_FORMAT = {
        'image/png': 'png',
        'image/jpeg': 'jpg',
        'image/jpg': 'jpg',
        'image/bmp': 'bmp',
        'image/x-ms-bmp': 'bmp',
        'image/x-bmp': 'bmp',
        'image/tiff': 'tiff',
        'image/x-tiff': 'tiff',
        'application/pdf': 'pdf',
        'application/msword': 'doc',
        'application/vnd.openxmlformats-officedocument.wordprocessingml.document': 'docx',
        'application/vnd.ms-powerpoint': 'ppt',
        'application/vnd.openxmlformats-officedocument.presentationml.presentation': 'pptx',
    }

    # Canonical format name -> every extension spelling that denotes it.
    # _mime_to_format only ever yields the canonical name, so the allow-list
    # check must also accept the alternative spellings.
    _FORMAT_ALIASES = {
        'jpg': ('jpg', 'jpeg'),
        'tiff': ('tiff', 'tif'),
    }

    def __init__(self):
        """Read the allow-list and the upload size limit from ``settings``."""
        self.allowed_extensions = settings.allowed_extensions_list
        self.max_file_size = settings.max_upload_size
        logger.info(
            "DocumentPreprocessor initialized with allowed_extensions: %s",
            self.allowed_extensions,
        )

    def validate_file(self, file_path: Path) -> Tuple[bool, Optional[str], Optional[str]]:
        """
        Validate file format, size, and integrity.

        Checks run in this order: existence, size limit, MIME detection,
        allow-list membership, integrity. The first failing check decides
        the returned error message.

        Args:
            file_path: Path to the file to validate

        Returns:
            Tuple of (is_valid, file_format, error_message). ``file_format``
            is set on success and on integrity failures, otherwise ``None``.
            Any unexpected exception is logged and reported as a failure
            instead of being raised.
        """
        try:
            # Check file exists
            if not file_path.exists():
                return False, None, f"File not found: {file_path}"

            # Check file size
            file_size = file_path.stat().st_size
            if file_size > self.max_file_size:
                max_mb = self.max_file_size / (1024 * 1024)
                actual_mb = file_size / (1024 * 1024)
                return False, None, f"File too large: {actual_mb:.2f}MB (max {max_mb:.2f}MB)"

            # Detect file format from content (magic numbers), not from the name
            mime_type = self._detect_mime_type(file_path)

            # Map MIME type to format
            file_format = self._mime_to_format(mime_type)
            if not file_format:
                return False, None, f"Unsupported file type: {mime_type}"

            # Check if format is in allowed extensions (alias-aware)
            if not self._is_format_allowed(file_format):
                return False, None, f"File format '{file_format}' not allowed"

            # Validate file integrity
            is_valid, error = self._validate_integrity(file_path, file_format)
            if not is_valid:
                return False, file_format, f"File corrupted: {error}"

            logger.info("File validated successfully: %s (%s)", file_path.name, file_format)
            return True, file_format, None

        except Exception as e:
            # Boundary handler: callers get a result tuple, never an exception.
            # logger.exception keeps the traceback for diagnosis.
            logger.exception("File validation error: %s", e)
            return False, None, f"Validation error: {str(e)}"

    @staticmethod
    def _detect_mime_type(file_path: Path) -> str:
        """Return the MIME type that libmagic detects for ``file_path``."""
        return magic.Magic(mime=True).from_file(str(file_path))

    def _is_format_allowed(self, file_format: str) -> bool:
        """
        Tell whether ``file_format`` is permitted by ``self.allowed_extensions``.

        An exact match is accepted first. Otherwise the allow-list is compared
        case-insensitively, ignoring a leading dot, against every alias of the
        format, so that e.g. a list containing only 'jpeg' still admits files
        detected as 'jpg'.
        """
        if file_format in self.allowed_extensions:
            return True
        allowed = {str(ext).lower().lstrip('.') for ext in self.allowed_extensions}
        candidates = self._FORMAT_ALIASES.get(file_format, (file_format,))
        return any(name in allowed for name in candidates)

    def _mime_to_format(self, mime_type: str) -> Optional[str]:
        """
        Convert MIME type to file format.

        The lookup ignores case, surrounding whitespace and any parameters
        after ';'. Returns ``None`` for an empty or unknown MIME type.
        """
        if not mime_type:
            return None
        essence = mime_type.split(';', 1)[0].strip().lower()
        return self._MIME_TO_FORMAT.get(essence)

    def _validate_integrity(self, file_path: Path, file_format: str) -> Tuple[bool, Optional[str]]:
        """
        Validate file integrity by attempting to open it.

        Args:
            file_path: Path to file
            file_format: Detected file format

        Returns:
            Tuple of (is_valid, error_message). Exceptions raised while
            opening or reading the file are converted into a failure result
            whose message is ``str(exception)``.
        """
        try:
            if file_format in self.SUPPORTED_IMAGE_FORMATS:
                # Structural check of the image file
                with Image.open(file_path) as img:
                    img.verify()
                # verify() leaves the image unusable, so reopen it and decode
                # the pixel data. Reading img.size alone only touches the
                # header and would not notice truncated image data.
                with Image.open(file_path) as img:
                    img.load()
                return True, None

            elif file_format == 'pdf':
                # Basic PDF validation - check file starts with PDF signature
                with open(file_path, 'rb') as f:
                    header = f.read(5)
                if header != b'%PDF-':
                    return False, "Invalid PDF header"
                return True, None

            elif file_format in ['doc', 'docx', 'ppt', 'pptx']:
                # Modern Office formats (docx, pptx) are ZIP-based
                if file_format in ['docx', 'pptx']:
                    import zipfile

                    required_member = {
                        'docx': 'word/document.xml',
                        'pptx': 'ppt/presentation.xml',
                    }[file_format]
                    try:
                        with zipfile.ZipFile(file_path, 'r') as zf:
                            # Check if it has the required Office structure
                            if required_member not in zf.namelist():
                                return False, f"Invalid {file_format.upper()} structure"
                    except zipfile.BadZipFile:
                        return False, "Invalid Office file (corrupt ZIP)"
                    return True, None

                # Old formats (doc, ppt) - only require that the file has content
                if file_path.stat().st_size == 0:
                    return False, "File is empty"
                return True, None

            else:
                return False, f"Unknown format: {file_format}"

        except Exception as e:
            return False, str(e)

    def preprocess_image(
        self,
        image_path: Path,
        enhance: bool = True,
        output_path: Optional[Path] = None
    ) -> Tuple[bool, Optional[Path], Optional[str]]:
        """
        Preprocess image to improve OCR accuracy.

        With ``enhance`` enabled the image is converted to grayscale,
        binarized with Gaussian adaptive thresholding, denoised and written
        to ``output_path``.

        Args:
            image_path: Path to input image
            enhance: Whether to apply enhancement; when False the image is
                only checked for readability and its own path is returned
            output_path: Optional output path (defaults to
                ``settings.processed_dir / "processed_<input name>"``)

        Returns:
            Tuple of (success, processed_image_path, error_message). Any
            exception is logged and reported as a failure instead of raised.
        """
        try:
            # Read image (also done when enhance is False, so an unreadable
            # input is reported either way)
            img = cv2.imread(str(image_path))
            if img is None:
                return False, None, "Failed to read image"

            if not enhance:
                # No preprocessing, return original
                return True, image_path, None

            # Convert to grayscale
            gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

            # Apply adaptive thresholding to handle varying lighting
            processed = cv2.adaptiveThreshold(
                gray,
                255,
                cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                cv2.THRESH_BINARY,
                11,
                2
            )

            # Denoise (kept after thresholding so the output is unchanged)
            processed = cv2.fastNlMeansDenoising(processed, None, 10, 7, 21)

            # Determine output path
            if output_path is None:
                output_path = Path(settings.processed_dir) / f"processed_{image_path.name}"
            else:
                output_path = Path(output_path)

            # The target directory must exist before the image can be written
            output_path.parent.mkdir(parents=True, exist_ok=True)

            # Save processed image; cv2.imwrite reports failure through its
            # return value, so it has to be checked explicitly
            if not cv2.imwrite(str(output_path), processed):
                return False, None, f"Failed to write processed image: {output_path}"

            logger.info("Image preprocessed: %s -> %s", image_path.name, output_path.name)
            return True, output_path, None

        except Exception as e:
            logger.exception("Image preprocessing error: %s", e)
            return False, None, f"Preprocessing error: {str(e)}"

    def get_file_info(self, file_path: Path) -> dict:
        """
        Get comprehensive file information.

        Args:
            file_path: Path to file

        Returns:
            Dictionary with the keys ``name``, ``path``, ``size`` (bytes),
            ``size_mb``, ``mime_type``, ``format``, ``created_at`` and
            ``modified_at`` (both timestamps in seconds since the epoch).

        Raises:
            OSError: If the file cannot be stat'ed (e.g. it does not exist).
        """
        file_stat = file_path.stat()
        mime_type = self._detect_mime_type(file_path)

        # st_ctime is the inode-change time on Unix, not the creation time.
        # Prefer the real birth time where the platform exposes it and fall
        # back to st_ctime otherwise.
        created_at = getattr(file_stat, 'st_birthtime', file_stat.st_ctime)

        return {
            'name': file_path.name,
            'path': str(file_path),
            'size': file_stat.st_size,
            'size_mb': file_stat.st_size / (1024 * 1024),
            'mime_type': mime_type,
            'format': self._mime_to_format(mime_type),
            'created_at': created_at,
            'modified_at': file_stat.st_mtime,
        }
|
||||
Reference in New Issue
Block a user