#!/usr/bin/env python3
"""
Tool_OCR - Service Layer Integration Test
Tests core services before API implementation
"""
|
|
|
|
import sys
|
|
import logging
|
|
from pathlib import Path
|
|
from datetime import datetime
|
|
|
|
# Add backend to path
|
|
sys.path.insert(0, str(Path(__file__).parent))
|
|
|
|
from app.core.config import settings
|
|
from app.core.database import engine, SessionLocal, Base
|
|
from app.models.user import User
|
|
from app.models.ocr import OCRBatch, OCRFile, OCRResult, FileStatus, BatchStatus
|
|
from app.services.preprocessor import DocumentPreprocessor
|
|
from app.services.ocr_service import OCRService
|
|
from app.services.pdf_generator import PDFGenerator
|
|
from app.services.file_manager import FileManager
|
|
|
|
|
|
# Root logging setup: INFO and above, each record prefixed with its
# timestamp, logger name and level.
logging.basicConfig(
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)

# Module-level logger used by every test in this script.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class ServiceTester:
    """Service layer integration tester.

    Runs one smoke test per backend service (database, preprocessor, OCR
    engine, PDF generator, file manager) and records each outcome in
    ``test_results`` under the matching key.
    """

    def __init__(self):
        """Open a database session and instantiate every service under test."""
        self.db = SessionLocal()
        self.preprocessor = DocumentPreprocessor()
        self.ocr_service = OCRService()
        self.pdf_generator = PDFGenerator()
        self.file_manager = FileManager()
        # One flag per test; flipped to True only when that test succeeds.
        self.test_results = {
            "database": False,
            "preprocessor": False,
            "ocr_engine": False,
            "pdf_generator": False,
            "file_manager": False,
        }

    def cleanup(self):
        """Close the database session opened in ``__init__``."""
        self.db.close()

    @staticmethod
    def _remove_test_batch_dir(batch_dir: Path) -> None:
        """Delete the given batch directory tree and nothing else.

        Only ``batch_dir`` itself is removed; its parent (which may hold
        other batches) is left untouched. Errors are ignored so that
        cleanup never masks the actual test outcome.
        """
        import shutil

        shutil.rmtree(batch_dir, ignore_errors=True)

    def test_database_connection(self) -> bool:
        """Test 1: Database connection and models.

        Executes ``SELECT 1``, checks that every required table exists and
        flushes a throwaway ``User`` row that is rolled back afterwards.

        Returns:
            True when every step succeeded, False otherwise. Exceptions are
            logged, never propagated.
        """
        try:
            logger.info("=" * 80)
            logger.info("TEST 1: Database Connection")
            logger.info("=" * 80)

            # Test connection
            from sqlalchemy import text
            self.db.execute(text("SELECT 1"))
            logger.info("✓ Database connection successful")

            # Check if tables exist
            from sqlalchemy import inspect
            inspector = inspect(engine)
            tables = inspector.get_table_names()

            required_tables = [
                'paddle_ocr_users',
                'paddle_ocr_batches',
                'paddle_ocr_files',
                'paddle_ocr_results',
                'paddle_ocr_export_rules',
                'paddle_ocr_translation_configs',
            ]

            missing_tables = [t for t in required_tables if t not in tables]
            if missing_tables:
                logger.error(f"✗ Missing tables: {missing_tables}")
                return False

            logger.info(f"✓ All required tables exist: {', '.join(required_tables)}")

            # Test creating a test user (will rollback). The timestamp is
            # taken once so username and email share the same suffix.
            unique_suffix = datetime.now().timestamp()
            test_user = User(
                username=f"test_user_{unique_suffix}",
                email=f"test_{unique_suffix}@example.com",
                password_hash="test_hash_123",
                is_active=True,
                is_admin=False,
            )
            self.db.add(test_user)
            self.db.flush()
            logger.info(f"✓ Test user created with ID: {test_user.id}")

            self.db.rollback()  # Don't actually save test user
            logger.info("✓ Database test completed successfully\n")

            self.test_results["database"] = True
            return True

        except Exception as e:
            logger.error(f"✗ Database test failed: {e}\n")
            # A failed flush leaves the session in an error state and the
            # test user pending; roll back so nothing lingers in the session.
            try:
                self.db.rollback()
            except Exception as rollback_error:
                logger.warning(
                    f"⚠ Rollback after failed database test also failed: {rollback_error}"
                )
            return False

    def test_preprocessor(self) -> bool:
        """Test 2: Document preprocessor.

        Logs a hard-coded list of supported formats and the maximum upload
        size derived from ``settings.max_upload_size``.
        NOTE(review): this does not call anything on ``self.preprocessor``;
        it only proves the settings value is readable.

        Returns:
            True on success, False if any step raised.
        """
        try:
            logger.info("=" * 80)
            logger.info("TEST 2: Document Preprocessor")
            logger.info("=" * 80)

            # Check supported formats
            formats = ['.png', '.jpg', '.jpeg', '.pdf']
            logger.info(f"✓ Supported formats: {formats}")

            # Check max file size (settings value divided down to MiB)
            max_size_mb = settings.max_upload_size / (1024 * 1024)
            logger.info(f"✓ Max upload size: {max_size_mb} MB")

            logger.info("✓ Preprocessor initialized successfully\n")

            self.test_results["preprocessor"] = True
            return True

        except Exception as e:
            logger.error(f"✗ Preprocessor test failed: {e}\n")
            return False

    def test_ocr_engine(self) -> bool:
        """Test 3: OCR engine initialization.

        Requests the OCR engine for ``lang='ch'`` and the structure engine
        from ``self.ocr_service``; the test passes if neither call raises.

        Returns:
            True on success, False if any step raised.
        """
        try:
            logger.info("=" * 80)
            logger.info("TEST 3: OCR Engine (PaddleOCR)")
            logger.info("=" * 80)

            # Test OCR engine lazy loading. Only successful construction
            # matters, so the returned engine is not kept.
            logger.info("Initializing PaddleOCR engine (this may take a moment)...")
            self.ocr_service.get_ocr_engine(lang='ch')
            logger.info("✓ PaddleOCR engine initialized for Chinese")

            # Test structure engine
            logger.info("Initializing PP-Structure engine...")
            self.ocr_service.get_structure_engine()
            logger.info("✓ PP-Structure engine initialized")

            # Check confidence threshold
            logger.info(f"✓ Confidence threshold: {self.ocr_service.confidence_threshold}")

            logger.info("✓ OCR engine test completed successfully\n")

            self.test_results["ocr_engine"] = True
            return True

        except Exception as e:
            logger.error(f"✗ OCR engine test failed: {e}")
            logger.error(" Make sure PaddleOCR models are downloaded:")
            logger.error(" - PaddleOCR will auto-download on first use (~900MB)")
            logger.error(" - Requires stable internet connection")
            logger.error("")
            return False

    def test_pdf_generator(self) -> bool:
        """Test 4: PDF generator.

        Logs whether Pandoc is available (its absence is only a warning)
        and lists the templates reported by the generator.

        Returns:
            True on success, False if any step raised.
        """
        try:
            logger.info("=" * 80)
            logger.info("TEST 4: PDF Generator")
            logger.info("=" * 80)

            # Check Pandoc availability
            pandoc_available = self.pdf_generator.check_pandoc_available()
            if pandoc_available:
                logger.info("✓ Pandoc is installed and available")
            else:
                logger.warning("⚠ Pandoc not found - will use WeasyPrint fallback")

            # Check available templates
            templates = self.pdf_generator.get_available_templates()
            logger.info(f"✓ Available CSS templates: {', '.join(templates.keys())}")

            logger.info("✓ PDF generator test completed successfully\n")

            self.test_results["pdf_generator"] = True
            return True

        except Exception as e:
            logger.error(f"✗ PDF generator test failed: {e}\n")
            return False

    def test_file_manager(self) -> bool:
        """Test 5: File manager.

        Ensures the upload directory exists, creates a batch directory for
        a throwaway batch id and verifies the expected subdirectories. The
        test batch directory is removed afterwards, whether or not the
        check passed.

        Returns:
            True when all subdirectories exist, False when one is missing
            or any step raised.
        """
        try:
            logger.info("=" * 80)
            logger.info("TEST 5: File Manager")
            logger.info("=" * 80)

            # Check upload directory
            upload_dir = Path(settings.upload_dir)
            if upload_dir.exists():
                logger.info(f"✓ Upload directory exists: {upload_dir}")
            else:
                upload_dir.mkdir(parents=True, exist_ok=True)
                logger.info(f"✓ Created upload directory: {upload_dir}")

            # Test batch directory creation
            # NOTE(review): assumes no real batch uses id 99999 - confirm.
            test_batch_id = 99999  # Use high number to avoid conflicts
            batch_dir = self.file_manager.create_batch_directory(test_batch_id)
            try:
                logger.info(f"✓ Created test batch directory: {batch_dir}")

                # Check subdirectories
                subdirs = ["inputs", "outputs/markdown", "outputs/json", "outputs/images", "exports"]
                for subdir in subdirs:
                    subdir_path = batch_dir / subdir
                    if subdir_path.exists():
                        logger.info(f" ✓ {subdir}")
                    else:
                        logger.error(f" ✗ Missing: {subdir}")
                        return False
            finally:
                # Cleanup test directory on every exit path. Only the batch
                # directory itself is removed, never its parent, so sibling
                # batch directories are left intact.
                self._remove_test_batch_dir(batch_dir)
                logger.info("✓ Cleaned up test batch directory")

            logger.info("✓ File manager test completed successfully\n")

            self.test_results["file_manager"] = True
            return True

        except Exception as e:
            logger.error(f"✗ File manager test failed: {e}\n")
            return False

    def run_all_tests(self) -> int:
        """Run all service tests and log a summary.

        Each test runs regardless of earlier failures. ``cleanup`` is always
        called at the end.

        Returns:
            0 when every test passed, 1 otherwise (usable as an exit code).
        """
        logger.info("\n" + "=" * 80)
        logger.info("Tool_OCR Service Layer Integration Test")
        logger.info("=" * 80 + "\n")

        try:
            # Run tests in order
            self.test_database_connection()
            self.test_preprocessor()
            self.test_ocr_engine()
            self.test_pdf_generator()
            self.test_file_manager()

            # Print summary
            logger.info("=" * 80)
            logger.info("TEST SUMMARY")
            logger.info("=" * 80)

            total_tests = len(self.test_results)
            passed_tests = sum(1 for result in self.test_results.values() if result)

            for test_name, result in self.test_results.items():
                status = "✓ PASS" if result else "✗ FAIL"
                logger.info(f"{status:8} - {test_name}")

            logger.info("-" * 80)
            logger.info(f"Total: {passed_tests}/{total_tests} tests passed")

            if passed_tests == total_tests:
                logger.info("\n🎉 All service layer tests passed! Ready to implement API endpoints.")
                return 0
            else:
                logger.error(f"\n❌ {total_tests - passed_tests} test(s) failed. Please fix issues before proceeding.")
                return 1

        finally:
            self.cleanup()
|
|
|
|
|
|
def main():
    """Script entry point: run the whole suite and exit with its status.

    The process exit code is whatever ``ServiceTester.run_all_tests``
    returns (0 when all tests passed, 1 otherwise).
    """
    sys.exit(ServiceTester().run_all_tests())
|
|
|
|
|
|
# Run the suite only when this file is executed as a script, so importing
# the module never triggers the tests or the sys.exit() call in main().
if __name__ == "__main__":
    main()