first

2025-11-12 22:53:17 +08:00
commit da700721fa
130 changed files with 23393 additions and 0 deletions
--- a/backend/app/services/preprocessor.py
+++ b/backend/app/services/preprocessor.py
@@ -0,0 +1,230 @@
+"""
+Tool_OCR - Document Preprocessor Service
+Handles file validation, format detection, and preprocessing
+"""
+
+import magic
+from pathlib import Path
+from typing import Tuple, Optional
+import logging
+from PIL import Image
+import cv2
+import numpy as np
+
+from app.core.config import settings
+
+logger = logging.getLogger(__name__)
+
+
+class DocumentPreprocessor:
+    """
+    Document preprocessing service for format standardization.
+
+    Validates files before OCR (size limit, content-sniffed MIME type,
+    allow-list membership, basic integrity) and can write a binarized,
+    denoised copy of an image to disk.
+    """
+
+    SUPPORTED_IMAGE_FORMATS = ['png', 'jpg', 'jpeg', 'bmp', 'tiff', 'tif']
+    SUPPORTED_PDF_FORMAT = ['pdf']
+    ALL_SUPPORTED_FORMATS = SUPPORTED_IMAGE_FORMATS + SUPPORTED_PDF_FORMAT
+
+    # MIME type reported by the `magic` module -> canonical format name.
+    # Built once at class level instead of on every lookup.
+    _MIME_TO_FORMAT = {
+        'image/png': 'png',
+        'image/jpeg': 'jpg',
+        'image/jpg': 'jpg',
+        'image/bmp': 'bmp',
+        'image/tiff': 'tiff',
+        'image/x-tiff': 'tiff',
+        'application/pdf': 'pdf',
+        'application/msword': 'doc',
+        'application/vnd.openxmlformats-officedocument.wordprocessingml.document': 'docx',
+        'application/vnd.ms-powerpoint': 'ppt',
+        'application/vnd.openxmlformats-officedocument.presentationml.presentation': 'pptx',
+    }
+
+    # Canonical format -> every extension spelling that denotes the same format.
+    # Needed because the MIME table only ever yields 'jpg' / 'tiff', while the
+    # configured allow-list may spell them 'jpeg' / 'tif'.
+    _FORMAT_ALIASES = {
+        'jpg': ('jpg', 'jpeg'),
+        'tiff': ('tiff', 'tif'),
+    }
+
+    # ZIP-based Office formats -> archive member that must be present.
+    _OOXML_REQUIRED_MEMBER = {
+        'docx': 'word/document.xml',
+        'pptx': 'ppt/presentation.xml',
+    }
+
+    # Pre-OOXML Office formats; only a non-empty check is performed for these.
+    _LEGACY_OFFICE_FORMATS = ('doc', 'ppt')
+
+    def __init__(self):
+        """Read the extension allow-list and upload size limit from settings."""
+        self.allowed_extensions = settings.allowed_extensions_list
+        self.max_file_size = settings.max_upload_size
+        logger.info(
+            "DocumentPreprocessor initialized with allowed_extensions: %s",
+            self.allowed_extensions,
+        )
+
+    def validate_file(self, file_path: Path) -> Tuple[bool, Optional[str], Optional[str]]:
+        """
+        Validate file format, size, and integrity
+
+        Checks run in order and stop at the first failure: existence, size
+        limit, MIME detection, allow-list membership, integrity.
+
+        Args:
+            file_path: Path to the file to validate
+
+        Returns:
+            Tuple of (is_valid, file_format, error_message). file_format is
+            None unless the format was recognized and allowed; error_message
+            is None on success. Exceptions are caught and reported through
+            error_message rather than raised.
+        """
+        try:
+            if not file_path.exists():
+                return False, None, f"File not found: {file_path}"
+
+            file_size = file_path.stat().st_size
+            if file_size > self.max_file_size:
+                max_mb = self.max_file_size / (1024 * 1024)
+                actual_mb = file_size / (1024 * 1024)
+                return False, None, f"File too large: {actual_mb:.2f}MB (max {max_mb:.2f}MB)"
+
+            # The format is derived from file content, not from the file name
+            mime_type = self._detect_mime_type(file_path)
+            file_format = self._mime_to_format(mime_type)
+            if not file_format:
+                return False, None, f"Unsupported file type: {mime_type}"
+
+            if not self._is_format_allowed(file_format):
+                return False, None, f"File format '{file_format}' not allowed"
+
+            is_valid, error = self._validate_integrity(file_path, file_format)
+            if not is_valid:
+                return False, file_format, f"File corrupted: {error}"
+
+            logger.info("File validated successfully: %s (%s)", file_path.name, file_format)
+            return True, file_format, None
+
+        except Exception as e:
+            # Boundary handler: callers receive a tuple, never an exception
+            logger.error("File validation error: %s", e)
+            return False, None, f"Validation error: {e}"
+
+    def _detect_mime_type(self, file_path: Path) -> str:
+        """Return the MIME type the `magic` module reports for file_path."""
+        return magic.Magic(mime=True).from_file(str(file_path))
+
+    def _is_format_allowed(self, file_format: str) -> bool:
+        """
+        Check a canonical format name against the configured allow-list
+
+        The comparison ignores case and a leading dot in the configured
+        entries, and treats 'jpg'/'jpeg' and 'tiff'/'tif' as interchangeable.
+        """
+        allowed = {str(ext).lower().lstrip('.') for ext in self.allowed_extensions}
+        aliases = self._FORMAT_ALIASES.get(file_format, (file_format,))
+        return any(alias in allowed for alias in aliases)
+
+    def _mime_to_format(self, mime_type: str) -> Optional[str]:
+        """Convert MIME type to file format (None when the type is not mapped)"""
+        return self._MIME_TO_FORMAT.get(mime_type)
+
+    def _validate_integrity(self, file_path: Path, file_format: str) -> Tuple[bool, Optional[str]]:
+        """
+        Validate file integrity by attempting to open it
+
+        Args:
+            file_path: Path to file
+            file_format: Detected file format
+
+        Returns:
+            Tuple of (is_valid, error_message). Any exception raised while
+            opening or decoding is returned as the error message.
+        """
+        try:
+            if file_format in self.SUPPORTED_IMAGE_FORMATS:
+                with Image.open(file_path) as img:
+                    img.verify()
+                # After verify() the image object must not be reused, so reopen.
+                # load() decodes the pixel data; reading .size alone only
+                # touches the header and would miss truncated files.
+                with Image.open(file_path) as img:
+                    img.load()
+                return True, None
+
+            if file_format == 'pdf':
+                # Basic PDF validation - check file starts with PDF signature
+                with open(file_path, 'rb') as f:
+                    if f.read(5) != b'%PDF-':
+                        return False, "Invalid PDF header"
+                return True, None
+
+            required_member = self._OOXML_REQUIRED_MEMBER.get(file_format)
+            if required_member is not None:
+                # Modern Office formats (docx, pptx) are ZIP-based
+                import zipfile
+                try:
+                    with zipfile.ZipFile(file_path, 'r') as zf:
+                        if required_member not in zf.namelist():
+                            return False, f"Invalid {file_format.upper()} structure"
+                except zipfile.BadZipFile:
+                    return False, "Invalid Office file (corrupt ZIP)"
+                return True, None
+
+            if file_format in self._LEGACY_OFFICE_FORMATS:
+                # Old formats (doc, ppt) - only check the file has content
+                if Path(file_path).stat().st_size == 0:
+                    return False, "Empty file"
+                return True, None
+
+            return False, f"Unknown format: {file_format}"
+
+        except Exception as e:
+            return False, str(e)
+
+    def preprocess_image(
+        self,
+        image_path: Path,
+        enhance: bool = True,
+        output_path: Optional[Path] = None
+    ) -> Tuple[bool, Optional[Path], Optional[str]]:
+        """
+        Preprocess image to improve OCR accuracy
+
+        With enhance=True the image is converted to grayscale, binarized with
+        a Gaussian adaptive threshold, denoised, and written to output_path.
+        With enhance=False the image is only checked for readability and the
+        original path is returned.
+
+        Args:
+            image_path: Path to input image
+            enhance: Whether to apply enhancement
+            output_path: Optional output path (defaults to
+                settings.processed_dir / "processed_<input file name>")
+
+        Returns:
+            Tuple of (success, processed_image_path, error_message)
+        """
+        try:
+            img = cv2.imread(str(image_path))
+            if img is None:
+                # cv2.imread signals failure by returning None, not by raising
+                return False, None, "Failed to read image"
+
+            if not enhance:
+                # No preprocessing, return original
+                return True, image_path, None
+
+            gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
+
+            # Adaptive thresholding copes with uneven lighting:
+            # 11x11 neighbourhood, constant 2 subtracted from the weighted mean
+            processed = cv2.adaptiveThreshold(
+                gray,
+                255,
+                cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
+                cv2.THRESH_BINARY,
+                11,
+                2
+            )
+
+            # Non-local-means denoise: h=10, template window 7, search window 21
+            processed = cv2.fastNlMeansDenoising(processed, None, 10, 7, 21)
+
+            if output_path is None:
+                output_path = Path(settings.processed_dir) / f"processed_{image_path.name}"
+            else:
+                output_path = Path(output_path)
+
+            # cv2.imwrite does not create missing directories
+            output_path.parent.mkdir(parents=True, exist_ok=True)
+
+            # imwrite reports failure through its boolean result; ignoring it
+            # would hand callers a path to a file that was never written
+            if not cv2.imwrite(str(output_path), processed):
+                return False, None, f"Failed to write processed image: {output_path}"
+
+            logger.info("Image preprocessed: %s -> %s", image_path.name, output_path.name)
+            return True, output_path, None
+
+        except Exception as e:
+            logger.error("Image preprocessing error: %s", e)
+            return False, None, f"Preprocessing error: {e}"
+
+    def get_file_info(self, file_path: Path) -> dict:
+        """
+        Get comprehensive file information
+
+        Args:
+            file_path: Path to file
+
+        Returns:
+            Dictionary with keys name, path, size (bytes), size_mb, mime_type,
+            format (None when the MIME type is not mapped), created_at and
+            modified_at (timestamps taken from the file's stat result).
+
+        Raises:
+            OSError: if the file cannot be stat'ed (e.g. it does not exist).
+        """
+        stat = file_path.stat()
+        mime_type = self._detect_mime_type(file_path)
+
+        # st_ctime is the metadata-change time on Unix, not the creation time.
+        # Use the real birth time where the platform exposes it.
+        created_at = getattr(stat, 'st_birthtime', None)
+        if created_at is None:
+            created_at = stat.st_ctime
+
+        return {
+            'name': file_path.name,
+            'path': str(file_path),
+            'size': stat.st_size,
+            'size_mb': stat.st_size / (1024 * 1024),
+            'mime_type': mime_type,
+            'format': self._mime_to_format(mime_type),
+            'created_at': created_at,
+            'modified_at': stat.st_mtime,
+        }