diff --git a/openspec/changes/migrate-to-external-api-authentication/database_schema.sql b/openspec/changes/migrate-to-external-api-authentication/database_schema.sql new file mode 100644 index 0000000..13a58e6 --- /dev/null +++ b/openspec/changes/migrate-to-external-api-authentication/database_schema.sql @@ -0,0 +1,183 @@ +-- Tool_OCR Database Schema with External API Authentication +-- Version: 2.0.0 +-- Date: 2025-11-14 +-- Description: Complete database redesign with user task isolation and history + +-- ============================================ +-- Drop existing tables (if needed) +-- ============================================ +-- Uncomment these lines to drop existing tables +-- DROP TABLE IF EXISTS tool_ocr_sessions; +-- DROP TABLE IF EXISTS tool_ocr_task_files; +-- DROP TABLE IF EXISTS tool_ocr_tasks; +-- DROP TABLE IF EXISTS tool_ocr_users; + +-- ============================================ +-- 1. Users Table +-- ============================================ +CREATE TABLE IF NOT EXISTS tool_ocr_users ( + id INT PRIMARY KEY AUTO_INCREMENT, + email VARCHAR(255) UNIQUE NOT NULL COMMENT 'Primary identifier from Azure AD', + display_name VARCHAR(255) COMMENT 'Display name from API response', + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + last_login TIMESTAMP NULL, + is_active BOOLEAN DEFAULT TRUE, + INDEX idx_email (email), + INDEX idx_active (is_active) +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci +COMMENT='User accounts authenticated via external API'; + +-- ============================================ +-- 2. OCR Tasks Table +-- ============================================ +CREATE TABLE IF NOT EXISTS tool_ocr_tasks ( + id INT PRIMARY KEY AUTO_INCREMENT, + user_id INT NOT NULL COMMENT 'Foreign key to users table', + task_id VARCHAR(255) UNIQUE NOT NULL COMMENT 'Unique task identifier (UUID)', + filename VARCHAR(255), + file_type VARCHAR(50), + status ENUM('pending', 'processing', 'completed', 'failed') DEFAULT 'pending', + result_json_path VARCHAR(500) COMMENT 'Path to JSON result file', + result_markdown_path VARCHAR(500) COMMENT 'Path to Markdown result file', + result_pdf_path VARCHAR(500) COMMENT 'Path to searchable PDF file', + error_message TEXT COMMENT 'Error details if task failed', + processing_time_ms INT COMMENT 'Processing time in milliseconds', + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP, + completed_at TIMESTAMP NULL, + file_deleted BOOLEAN DEFAULT FALSE COMMENT 'Track if files were auto-deleted', + FOREIGN KEY (user_id) REFERENCES tool_ocr_users(id) ON DELETE CASCADE, + INDEX idx_user_status (user_id, status), + INDEX idx_created (created_at), + INDEX idx_task_id (task_id), + INDEX idx_filename (filename) +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci +COMMENT='OCR processing tasks with user association'; + +-- ============================================ +-- 3. Task Files Table +-- ============================================ +CREATE TABLE IF NOT EXISTS tool_ocr_task_files ( + id INT PRIMARY KEY AUTO_INCREMENT, + task_id INT NOT NULL COMMENT 'Foreign key to tasks table', + original_name VARCHAR(255), + stored_path VARCHAR(500) COMMENT 'Actual file path on server', + file_size BIGINT COMMENT 'File size in bytes', + mime_type VARCHAR(100), + file_hash VARCHAR(64) COMMENT 'SHA256 hash for deduplication', + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + FOREIGN KEY (task_id) REFERENCES tool_ocr_tasks(id) ON DELETE CASCADE, + INDEX idx_task (task_id), + INDEX idx_hash (file_hash) +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci +COMMENT='Files associated with OCR tasks'; + +-- ============================================ +-- 4. Sessions Table (Token Storage) +-- ============================================ +CREATE TABLE IF NOT EXISTS tool_ocr_sessions ( + id INT PRIMARY KEY AUTO_INCREMENT, + user_id INT NOT NULL COMMENT 'Foreign key to users table', + session_id VARCHAR(255) UNIQUE NOT NULL COMMENT 'Unique session identifier', + access_token TEXT COMMENT 'Azure AD access token (encrypted)', + id_token TEXT COMMENT 'Azure AD ID token (encrypted)', + refresh_token TEXT COMMENT 'Refresh token if available', + expires_at TIMESTAMP NOT NULL COMMENT 'Token expiration time', + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + last_accessed TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP, + is_active BOOLEAN DEFAULT TRUE, + ip_address VARCHAR(45) COMMENT 'Client IP address', + user_agent TEXT COMMENT 'Client user agent', + FOREIGN KEY (user_id) REFERENCES tool_ocr_users(id) ON DELETE CASCADE, + INDEX idx_user (user_id), + INDEX idx_session (session_id), + INDEX idx_expires (expires_at), + INDEX idx_active (is_active) +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci +COMMENT='User session and token management'; + +-- ============================================ +-- 5. Audit Log Table (Optional) +-- ============================================ +CREATE TABLE IF NOT EXISTS tool_ocr_audit_logs ( + id BIGINT PRIMARY KEY AUTO_INCREMENT, + user_id INT COMMENT 'User who performed the action', + action VARCHAR(100) NOT NULL COMMENT 'Action performed', + entity_type VARCHAR(50) COMMENT 'Type of entity affected', + entity_id INT COMMENT 'ID of entity affected', + details JSON COMMENT 'Additional details in JSON format', + ip_address VARCHAR(45), + user_agent TEXT, + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + INDEX idx_user (user_id), + INDEX idx_action (action), + INDEX idx_created (created_at) +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci +COMMENT='Audit trail for all system actions'; + +-- ============================================ +-- Views for Common Queries +-- ============================================ + +-- User task statistics view +CREATE OR REPLACE VIEW tool_ocr_user_stats AS +SELECT + u.id as user_id, + u.email, + u.display_name, + COUNT(DISTINCT t.id) as total_tasks, + SUM(CASE WHEN t.status = 'completed' THEN 1 ELSE 0 END) as completed_tasks, + SUM(CASE WHEN t.status = 'failed' THEN 1 ELSE 0 END) as failed_tasks, + SUM(CASE WHEN t.status = 'processing' THEN 1 ELSE 0 END) as processing_tasks, + SUM(CASE WHEN t.status = 'pending' THEN 1 ELSE 0 END) as pending_tasks, + AVG(t.processing_time_ms) as avg_processing_time_ms, + MAX(t.created_at) as last_task_created +FROM tool_ocr_users u +LEFT JOIN tool_ocr_tasks t ON u.id = t.user_id +GROUP BY u.id, u.email, u.display_name; + +-- Recent tasks view +CREATE OR REPLACE VIEW tool_ocr_recent_tasks AS +SELECT + t.*, + u.email as user_email, + u.display_name as user_name +FROM tool_ocr_tasks t +INNER JOIN tool_ocr_users u ON t.user_id = u.id +ORDER BY t.created_at DESC +LIMIT 100; + +-- ============================================ +-- Stored Procedures (Optional) +-- ============================================ + +DELIMITER $$ + +-- Procedure to clean up expired sessions +CREATE PROCEDURE IF NOT EXISTS cleanup_expired_sessions() +BEGIN + DELETE FROM tool_ocr_sessions + WHERE expires_at < NOW() OR is_active = FALSE; +END$$ + +-- Procedure to clean up old tasks +CREATE PROCEDURE IF NOT EXISTS cleanup_old_tasks(IN days_to_keep INT) +BEGIN + UPDATE tool_ocr_tasks + SET file_deleted = TRUE + WHERE created_at < DATE_SUB(NOW(), INTERVAL days_to_keep DAY) + AND status IN ('completed', 'failed'); +END$$ + +DELIMITER ; + +-- ============================================ +-- Initial Data (Optional) +-- ============================================ +-- Add any initial data here if needed + +-- ============================================ +-- Grants (Adjust as needed) +-- ============================================ +-- GRANT ALL PRIVILEGES ON tool_ocr_* TO 'tool_ocr_user'@'localhost'; +-- FLUSH PRIVILEGES; \ No newline at end of file diff --git a/openspec/changes/migrate-to-external-api-authentication/proposal.md b/openspec/changes/migrate-to-external-api-authentication/proposal.md index 0737d79..21bcccb 100644 --- a/openspec/changes/migrate-to-external-api-authentication/proposal.md +++ b/openspec/changes/migrate-to-external-api-authentication/proposal.md @@ -75,9 +75,11 @@ By migrating to the external API authentication service at https://pj-auth-api.v **Complete Redesign (No backward compatibility needed)**: -1. **users table (redesigned)**: +**Table Prefix**: `tool_ocr_` (for clear separation from other systems in the same database) + +1. **tool_ocr_users table (redesigned)**: ```sql - CREATE TABLE users ( + CREATE TABLE tool_ocr_users ( id INT PRIMARY KEY AUTO_INCREMENT, email VARCHAR(255) UNIQUE NOT NULL, -- Primary identifier from Azure AD display_name VARCHAR(255), -- Display name from API response @@ -88,9 +90,9 @@ By migrating to the external API authentication service at https://pj-auth-api.v ``` Note: No Azure AD ID storage needed - email is sufficient as unique identifier -2. **ocr_tasks table (new - for task history)**: +2. **tool_ocr_tasks table (new - for task history)**: ```sql - CREATE TABLE ocr_tasks ( + CREATE TABLE tool_ocr_tasks ( id INT PRIMARY KEY AUTO_INCREMENT, user_id INT NOT NULL, -- Foreign key to users table task_id VARCHAR(255) UNIQUE, -- Unique task identifier @@ -104,22 +106,37 @@ By migrating to the external API authentication service at https://pj-auth-api.v updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP, completed_at TIMESTAMP NULL, file_deleted BOOLEAN DEFAULT FALSE, -- Track if files were auto-deleted - FOREIGN KEY (user_id) REFERENCES users(id), + FOREIGN KEY (user_id) REFERENCES tool_ocr_users(id), INDEX idx_user_status (user_id, status), INDEX idx_created (created_at) ); ``` -3. **task_files table (for multiple files per task)**: +3. **tool_ocr_task_files table (for multiple files per task)**: ```sql - CREATE TABLE task_files ( + CREATE TABLE tool_ocr_task_files ( id INT PRIMARY KEY AUTO_INCREMENT, task_id INT NOT NULL, original_name VARCHAR(255), stored_path VARCHAR(500), file_size BIGINT, mime_type VARCHAR(100), - FOREIGN KEY (task_id) REFERENCES ocr_tasks(id) ON DELETE CASCADE + FOREIGN KEY (task_id) REFERENCES tool_ocr_tasks(id) ON DELETE CASCADE + ); + ``` + +4. **tool_ocr_sessions table (for token management)**: + ```sql + CREATE TABLE tool_ocr_sessions ( + id INT PRIMARY KEY AUTO_INCREMENT, + user_id INT NOT NULL, + access_token TEXT, + id_token TEXT, + expires_at TIMESTAMP, + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + FOREIGN KEY (user_id) REFERENCES tool_ocr_users(id) ON DELETE CASCADE, + INDEX idx_user (user_id), + INDEX idx_expires (expires_at) ); ``` @@ -227,6 +244,7 @@ By migrating to the external API authentication service at https://pj-auth-api.v - `TASK_RETENTION_DAYS` = 30 (auto-delete old tasks) - `MAX_TASKS_PER_USER` = 1000 (limit per user) - `ENABLE_TASK_HISTORY` = true (enable history feature) + - `DATABASE_TABLE_PREFIX` = "tool_ocr_" (table naming prefix) ### Security Considerations - HTTPS required for all authentication requests diff --git a/openspec/changes/migrate-to-external-api-authentication/tasks.md b/openspec/changes/migrate-to-external-api-authentication/tasks.md index c2d18de..40528b6 100644 --- a/openspec/changes/migrate-to-external-api-authentication/tasks.md +++ b/openspec/changes/migrate-to-external-api-authentication/tasks.md @@ -5,21 +5,23 @@ - Export current schema and data - Document any important data to preserve - [ ] 1.2 Drop old tables - - Remove existing users table - - Remove any related tables + - Remove existing tables with old naming convention - Clear database for fresh start -- [ ] 1.3 Create new database schema - - Create new `users` table (email as primary identifier) - - Create `ocr_tasks` table with user association - - Create `task_files` table for file tracking +- [ ] 1.3 Create new database schema with `tool_ocr_` prefix + - Create new `tool_ocr_users` table (email as primary identifier) + - Create `tool_ocr_tasks` table with user association + - Create `tool_ocr_task_files` table for file tracking + - Create `tool_ocr_sessions` table for token storage - Add proper indexes for performance - [ ] 1.4 Create SQLAlchemy models - - User model (simplified) - - Task model with user relationship - - TaskFile model with cascade delete + - User model (mapped to `tool_ocr_users`) + - Task model (mapped to `tool_ocr_tasks`) + - TaskFile model (mapped to `tool_ocr_task_files`) + - Session model (mapped to `tool_ocr_sessions`) + - Configure table prefix in base model - [ ] 1.5 Generate Alembic migration - Create initial migration for new schema - - Test migration script + - Test migration script with proper table prefixes ## 2. Configuration Management - [ ] 2.1 Update environment configuration @@ -30,9 +32,11 @@ - Add `TASK_RETENTION_DAYS` for auto-cleanup - Add `MAX_TASKS_PER_USER` for limits - Add `ENABLE_TASK_HISTORY` feature flag + - Add `DATABASE_TABLE_PREFIX` = "tool_ocr_" - [ ] 2.2 Update Settings class - Add external auth settings to `backend/app/core/config.py` - Add task management settings + - Add database table prefix configuration - Add validation for new configuration values - Remove old authentication settings