#!/usr/bin/env python3
"""
Tool_OCR - Service Layer Integration Test
Tests core services before API implementation
"""

import logging
import shutil
import sys
from datetime import datetime
from pathlib import Path

from sqlalchemy import inspect, text

# Add backend to path so the ``app`` package resolves when run as a script
sys.path.insert(0, str(Path(__file__).parent))

from app.core.config import settings
from app.core.database import engine, SessionLocal, Base
from app.models.user import User
from app.models.ocr import OCRBatch, OCRFile, OCRResult, FileStatus, BatchStatus
from app.services.preprocessor import DocumentPreprocessor
from app.services.ocr_service import OCRService
from app.services.pdf_generator import PDFGenerator
from app.services.file_manager import FileManager

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger(__name__)

_RULE = "=" * 80


def _log_banner(title: str) -> None:
    """Log *title* framed by two horizontal rules."""
    logger.info(_RULE)
    logger.info(title)
    logger.info(_RULE)


class ServiceTester:
    """Service layer integration tester.

    Each ``test_*`` method logs its progress, records its outcome in
    ``self.test_results`` on success, and returns ``True``/``False``
    instead of raising, so one failing service does not stop the run.
    """

    # Tables that must exist for the database test to pass.
    REQUIRED_TABLES = (
        'paddle_ocr_users',
        'paddle_ocr_batches',
        'paddle_ocr_files',
        'paddle_ocr_results',
        'paddle_ocr_export_rules',
        'paddle_ocr_translation_configs',
    )

    # Sub-directories expected inside a freshly created batch directory.
    BATCH_SUBDIRS = (
        "inputs",
        "outputs/markdown",
        "outputs/json",
        "outputs/images",
        "exports",
    )

    def __init__(self):
        """Open a database session and instantiate every service under test."""
        self.db = SessionLocal()
        self.preprocessor = DocumentPreprocessor()
        self.ocr_service = OCRService()
        self.pdf_generator = PDFGenerator()
        self.file_manager = FileManager()
        # One flag per test; flipped to True only when that test succeeds.
        self.test_results = {
            "database": False,
            "preprocessor": False,
            "ocr_engine": False,
            "pdf_generator": False,
            "file_manager": False,
        }

    def cleanup(self) -> None:
        """Close the database session."""
        self.db.close()

    def test_database_connection(self) -> bool:
        """Test 1: Database connection and models.

        Runs ``SELECT 1``, verifies every required table exists, then
        inserts a throwaway user and rolls the transaction back.

        Returns:
            True when every step succeeds, False otherwise.
        """
        try:
            _log_banner("TEST 1: Database Connection")

            # Test connection
            self.db.execute(text("SELECT 1"))
            logger.info("✓ Database connection successful")

            # Check if tables exist
            existing_tables = set(inspect(engine).get_table_names())
            required_tables = list(self.REQUIRED_TABLES)
            missing_tables = [t for t in required_tables if t not in existing_tables]
            if missing_tables:
                logger.error(f"✗ Missing tables: {missing_tables}")
                return False

            logger.info(f"✓ All required tables exist: {', '.join(required_tables)}")

            # One timestamp for both fields keeps username and email paired.
            stamp = datetime.now().timestamp()
            test_user = User(
                username=f"test_user_{stamp}",
                email=f"test_{stamp}@example.com",
                password_hash="test_hash_123",
                is_active=True,
                is_admin=False
            )
            try:
                self.db.add(test_user)
                self.db.flush()
                logger.info(f"✓ Test user created with ID: {test_user.id}")
            finally:
                # Roll back on failure too, so the test user is never
                # persisted and the session is left in a clean state.
                self.db.rollback()

            logger.info("✓ Database test completed successfully\n")
            self.test_results["database"] = True
            return True

        except Exception as e:
            logger.error(f"✗ Database test failed: {e}\n")
            return False

    def test_preprocessor(self) -> bool:
        """Test 2: Document preprocessor.

        Only logs the expected formats and the configured upload limit;
        it does not call into ``self.preprocessor``.

        Returns:
            True when the settings could be read, False otherwise.
        """
        try:
            _log_banner("TEST 2: Document Preprocessor")

            # Check supported formats
            formats = ['.png', '.jpg', '.jpeg', '.pdf']
            logger.info(f"✓ Supported formats: {formats}")

            # Check max file size (setting is divided by 1024 * 1024 to get MB)
            max_size_mb = settings.max_upload_size / (1024 * 1024)
            logger.info(f"✓ Max upload size: {max_size_mb} MB")

            logger.info("✓ Preprocessor initialized successfully\n")
            self.test_results["preprocessor"] = True
            return True

        except Exception as e:
            logger.error(f"✗ Preprocessor test failed: {e}\n")
            return False

    def test_ocr_engine(self) -> bool:
        """Test 3: OCR engine initialization.

        Requests the OCR engine for ``lang='ch'`` and the structure engine
        from the OCR service; succeeding without an exception is the test.

        Returns:
            True when both engines initialize, False otherwise.
        """
        try:
            _log_banner("TEST 3: OCR Engine (PaddleOCR)")

            # Test OCR engine lazy loading; the returned engine is not needed.
            logger.info("Initializing PaddleOCR engine (this may take a moment)...")
            self.ocr_service.get_ocr_engine(lang='ch')
            logger.info("✓ PaddleOCR engine initialized for Chinese")

            # Test structure engine
            logger.info("Initializing PP-Structure engine...")
            self.ocr_service.get_structure_engine()
            logger.info("✓ PP-Structure engine initialized")

            # Check confidence threshold
            logger.info(f"✓ Confidence threshold: {self.ocr_service.confidence_threshold}")

            logger.info("✓ OCR engine test completed successfully\n")
            self.test_results["ocr_engine"] = True
            return True

        except Exception as e:
            logger.error(f"✗ OCR engine test failed: {e}")
            logger.error("  Make sure PaddleOCR models are downloaded:")
            logger.error("  - PaddleOCR will auto-download on first use (~900MB)")
            logger.error("  - Requires stable internet connection")
            logger.error("")
            return False

    def test_pdf_generator(self) -> bool:
        """Test 4: PDF generator.

        A missing Pandoc only produces a warning and does not fail the test.

        Returns:
            True when the generator could be queried, False otherwise.
        """
        try:
            _log_banner("TEST 4: PDF Generator")

            # Check Pandoc availability
            if self.pdf_generator.check_pandoc_available():
                logger.info("✓ Pandoc is installed and available")
            else:
                logger.warning("⚠ Pandoc not found - will use WeasyPrint fallback")

            # Check available templates
            templates = self.pdf_generator.get_available_templates()
            logger.info(f"✓ Available CSS templates: {', '.join(templates.keys())}")

            logger.info("✓ PDF generator test completed successfully\n")
            self.test_results["pdf_generator"] = True
            return True

        except Exception as e:
            logger.error(f"✗ PDF generator test failed: {e}\n")
            return False

    def test_file_manager(self) -> bool:
        """Test 5: File manager.

        Ensures the upload directory exists, creates a batch directory for
        a throwaway batch id, checks the expected sub-directories and then
        removes the test batch directory again.

        Returns:
            True when every expected sub-directory exists, False otherwise.
        """
        try:
            _log_banner("TEST 5: File Manager")

            # Check upload directory
            upload_dir = Path(settings.upload_dir)
            if upload_dir.exists():
                logger.info(f"✓ Upload directory exists: {upload_dir}")
            else:
                upload_dir.mkdir(parents=True, exist_ok=True)
                logger.info(f"✓ Created upload directory: {upload_dir}")

            # Test batch directory creation
            # NOTE(review): assumes batch id 99999 is not used by real data.
            test_batch_id = 99999  # Use high number to avoid conflicts
            batch_dir = self.file_manager.create_batch_directory(test_batch_id)
            try:
                logger.info(f"✓ Created test batch directory: {batch_dir}")

                # Check subdirectories; report every missing one, not just the first.
                missing = []
                for subdir in self.BATCH_SUBDIRS:
                    if (batch_dir / subdir).exists():
                        logger.info(f"  ✓ {subdir}")
                    else:
                        logger.error(f"  ✗ Missing: {subdir}")
                        missing.append(subdir)
            finally:
                # Remove only the test batch itself. Deleting
                # ``batch_dir.parent`` would also remove whatever else
                # lives beside it (e.g. other batches).
                shutil.rmtree(batch_dir, ignore_errors=True)
                logger.info("✓ Cleaned up test batch directory")

            if missing:
                return False

            logger.info("✓ File manager test completed successfully\n")
            self.test_results["file_manager"] = True
            return True

        except Exception as e:
            logger.error(f"✗ File manager test failed: {e}\n")
            return False

    def run_all_tests(self) -> int:
        """Run all service tests and log a summary.

        The database session is closed when the run finishes, whether or
        not the tests passed.

        Returns:
            0 when every test passed, 1 otherwise (usable as an exit code).
        """
        logger.info("\n" + "=" * 80)
        logger.info("Tool_OCR Service Layer Integration Test")
        logger.info("=" * 80 + "\n")

        try:
            # Run tests in order
            self.test_database_connection()
            self.test_preprocessor()
            self.test_ocr_engine()
            self.test_pdf_generator()
            self.test_file_manager()

            # Print summary
            _log_banner("TEST SUMMARY")

            total_tests = len(self.test_results)
            passed_tests = sum(1 for result in self.test_results.values() if result)

            for test_name, result in self.test_results.items():
                status = "✓ PASS" if result else "✗ FAIL"
                logger.info(f"{status:8} - {test_name}")

            logger.info("-" * 80)
            logger.info(f"Total: {passed_tests}/{total_tests} tests passed")

            if passed_tests == total_tests:
                logger.info("\n🎉 All service layer tests passed! Ready to implement API endpoints.")
                return 0

            logger.error(f"\n❌ {total_tests - passed_tests} test(s) failed. Please fix issues before proceeding.")
            return 1

        finally:
            self.cleanup()


def main() -> None:
    """Main test entry point: run every test and exit with its status code."""
    tester = ServiceTester()
    exit_code = tester.run_all_tests()
    sys.exit(exit_code)


if __name__ == "__main__":
    main()