-- Tool_OCR Database Schema with External API Authentication -- Version: 2.0.0 -- Date: 2025-11-14 -- Description: Complete database redesign with user task isolation and history -- ============================================ -- Drop existing tables (if needed) -- ============================================ -- Uncomment these lines to drop existing tables -- DROP TABLE IF EXISTS tool_ocr_sessions; -- DROP TABLE IF EXISTS tool_ocr_task_files; -- DROP TABLE IF EXISTS tool_ocr_tasks; -- DROP TABLE IF EXISTS tool_ocr_users; -- ============================================ -- 1. Users Table -- ============================================ CREATE TABLE IF NOT EXISTS tool_ocr_users ( id INT PRIMARY KEY AUTO_INCREMENT, email VARCHAR(255) UNIQUE NOT NULL COMMENT 'Primary identifier from Azure AD', display_name VARCHAR(255) COMMENT 'Display name from API response', created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, last_login TIMESTAMP NULL, is_active BOOLEAN DEFAULT TRUE, INDEX idx_email (email), INDEX idx_active (is_active) ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci COMMENT='User accounts authenticated via external API'; -- ============================================ -- 2. OCR Tasks Table -- ============================================ CREATE TABLE IF NOT EXISTS tool_ocr_tasks ( id INT PRIMARY KEY AUTO_INCREMENT, user_id INT NOT NULL COMMENT 'Foreign key to users table', task_id VARCHAR(255) UNIQUE NOT NULL COMMENT 'Unique task identifier (UUID)', filename VARCHAR(255), file_type VARCHAR(50), status ENUM('pending', 'processing', 'completed', 'failed') DEFAULT 'pending', result_json_path VARCHAR(500) COMMENT 'Path to JSON result file', result_markdown_path VARCHAR(500) COMMENT 'Path to Markdown result file', result_pdf_path VARCHAR(500) COMMENT 'Path to searchable PDF file', error_message TEXT COMMENT 'Error details if task failed', processing_time_ms INT COMMENT 'Processing time in milliseconds', created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP, completed_at TIMESTAMP NULL, file_deleted BOOLEAN DEFAULT FALSE COMMENT 'Track if files were auto-deleted', FOREIGN KEY (user_id) REFERENCES tool_ocr_users(id) ON DELETE CASCADE, INDEX idx_user_status (user_id, status), INDEX idx_created (created_at), INDEX idx_task_id (task_id), INDEX idx_filename (filename) ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci COMMENT='OCR processing tasks with user association'; -- ============================================ -- 3. Task Files Table -- ============================================ CREATE TABLE IF NOT EXISTS tool_ocr_task_files ( id INT PRIMARY KEY AUTO_INCREMENT, task_id INT NOT NULL COMMENT 'Foreign key to tasks table', original_name VARCHAR(255), stored_path VARCHAR(500) COMMENT 'Actual file path on server', file_size BIGINT COMMENT 'File size in bytes', mime_type VARCHAR(100), file_hash VARCHAR(64) COMMENT 'SHA256 hash for deduplication', created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, FOREIGN KEY (task_id) REFERENCES tool_ocr_tasks(id) ON DELETE CASCADE, INDEX idx_task (task_id), INDEX idx_hash (file_hash) ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci COMMENT='Files associated with OCR tasks'; -- ============================================ -- 4. Sessions Table (Token Storage) -- ============================================ CREATE TABLE IF NOT EXISTS tool_ocr_sessions ( id INT PRIMARY KEY AUTO_INCREMENT, user_id INT NOT NULL COMMENT 'Foreign key to users table', session_id VARCHAR(255) UNIQUE NOT NULL COMMENT 'Unique session identifier', access_token TEXT COMMENT 'Azure AD access token (encrypted)', id_token TEXT COMMENT 'Azure AD ID token (encrypted)', refresh_token TEXT COMMENT 'Refresh token if available', expires_at TIMESTAMP NOT NULL COMMENT 'Token expiration time', created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, last_accessed TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP, is_active BOOLEAN DEFAULT TRUE, ip_address VARCHAR(45) COMMENT 'Client IP address', user_agent TEXT COMMENT 'Client user agent', FOREIGN KEY (user_id) REFERENCES tool_ocr_users(id) ON DELETE CASCADE, INDEX idx_user (user_id), INDEX idx_session (session_id), INDEX idx_expires (expires_at), INDEX idx_active (is_active) ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci COMMENT='User session and token management'; -- ============================================ -- 5. Audit Log Table (Optional) -- ============================================ CREATE TABLE IF NOT EXISTS tool_ocr_audit_logs ( id BIGINT PRIMARY KEY AUTO_INCREMENT, user_id INT COMMENT 'User who performed the action', action VARCHAR(100) NOT NULL COMMENT 'Action performed', entity_type VARCHAR(50) COMMENT 'Type of entity affected', entity_id INT COMMENT 'ID of entity affected', details JSON COMMENT 'Additional details in JSON format', ip_address VARCHAR(45), user_agent TEXT, created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, INDEX idx_user (user_id), INDEX idx_action (action), INDEX idx_created (created_at) ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci COMMENT='Audit trail for all system actions'; -- ============================================ -- Views for Common Queries -- ============================================ -- User task statistics view CREATE OR REPLACE VIEW tool_ocr_user_stats AS SELECT u.id as user_id, u.email, u.display_name, COUNT(DISTINCT t.id) as total_tasks, SUM(CASE WHEN t.status = 'completed' THEN 1 ELSE 0 END) as completed_tasks, SUM(CASE WHEN t.status = 'failed' THEN 1 ELSE 0 END) as failed_tasks, SUM(CASE WHEN t.status = 'processing' THEN 1 ELSE 0 END) as processing_tasks, SUM(CASE WHEN t.status = 'pending' THEN 1 ELSE 0 END) as pending_tasks, AVG(t.processing_time_ms) as avg_processing_time_ms, MAX(t.created_at) as last_task_created FROM tool_ocr_users u LEFT JOIN tool_ocr_tasks t ON u.id = t.user_id GROUP BY u.id, u.email, u.display_name; -- Recent tasks view CREATE OR REPLACE VIEW tool_ocr_recent_tasks AS SELECT t.*, u.email as user_email, u.display_name as user_name FROM tool_ocr_tasks t INNER JOIN tool_ocr_users u ON t.user_id = u.id ORDER BY t.created_at DESC LIMIT 100; -- ============================================ -- Stored Procedures (Optional) -- ============================================ DELIMITER $$ -- Procedure to clean up expired sessions CREATE PROCEDURE IF NOT EXISTS cleanup_expired_sessions() BEGIN DELETE FROM tool_ocr_sessions WHERE expires_at < NOW() OR is_active = FALSE; END$$ -- Procedure to clean up old tasks CREATE PROCEDURE IF NOT EXISTS cleanup_old_tasks(IN days_to_keep INT) BEGIN UPDATE tool_ocr_tasks SET file_deleted = TRUE WHERE created_at < DATE_SUB(NOW(), INTERVAL days_to_keep DAY) AND status IN ('completed', 'failed'); END$$ DELIMITER ; -- ============================================ -- Initial Data (Optional) -- ============================================ -- Add any initial data here if needed -- ============================================ -- Grants (Adjust as needed) -- ============================================ -- GRANT ALL PRIVILEGES ON tool_ocr_* TO 'tool_ocr_user'@'localhost'; -- FLUSH PRIVILEGES;