chore: project cleanup and prepare for dual-track processing refactor

- Removed all test files and directories
- Deleted outdated documentation (will be rewritten)
- Cleaned up temporary files, logs, and uploads
- Archived 5 completed OpenSpec proposals
- Created new dual-track-document-processing proposal with complete OpenSpec structure
  - Dual-track architecture: OCR track (PaddleOCR) + Direct track (PyMuPDF)
  - UnifiedDocument model for consistent output
  - Support for structure-preserving translation
- Updated .gitignore to prevent future test/temp files

This is a major cleanup preparing for the complete refactoring of the document processing pipeline.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
egg
2025-11-18 20:02:31 +08:00
parent 0edc56b03f
commit cd3cbea49d
64 changed files with 3573 additions and 8190 deletions

View File

@@ -0,0 +1,183 @@
-- Tool_OCR Database Schema with External API Authentication
-- Version: 2.0.0
-- Date: 2025-11-14
-- Description: Complete database redesign with user task isolation and history
-- ============================================
-- Drop existing tables (if needed)
-- ============================================
-- Uncomment these lines to drop existing tables
-- DROP TABLE IF EXISTS tool_ocr_sessions;
-- DROP TABLE IF EXISTS tool_ocr_task_files;
-- DROP TABLE IF EXISTS tool_ocr_tasks;
-- DROP TABLE IF EXISTS tool_ocr_users;
-- ============================================
-- 1. Users Table
-- ============================================
CREATE TABLE IF NOT EXISTS tool_ocr_users (
id INT PRIMARY KEY AUTO_INCREMENT,
email VARCHAR(255) UNIQUE NOT NULL COMMENT 'Primary identifier from Azure AD',
display_name VARCHAR(255) COMMENT 'Display name from API response',
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
last_login TIMESTAMP NULL,
is_active BOOLEAN DEFAULT TRUE,
INDEX idx_email (email),
INDEX idx_active (is_active)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci
COMMENT='User accounts authenticated via external API';
-- ============================================
-- 2. OCR Tasks Table
-- ============================================
CREATE TABLE IF NOT EXISTS tool_ocr_tasks (
id INT PRIMARY KEY AUTO_INCREMENT,
user_id INT NOT NULL COMMENT 'Foreign key to users table',
task_id VARCHAR(255) UNIQUE NOT NULL COMMENT 'Unique task identifier (UUID)',
filename VARCHAR(255),
file_type VARCHAR(50),
status ENUM('pending', 'processing', 'completed', 'failed') DEFAULT 'pending',
result_json_path VARCHAR(500) COMMENT 'Path to JSON result file',
result_markdown_path VARCHAR(500) COMMENT 'Path to Markdown result file',
result_pdf_path VARCHAR(500) COMMENT 'Path to searchable PDF file',
error_message TEXT COMMENT 'Error details if task failed',
processing_time_ms INT COMMENT 'Processing time in milliseconds',
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
completed_at TIMESTAMP NULL,
file_deleted BOOLEAN DEFAULT FALSE COMMENT 'Track if files were auto-deleted',
FOREIGN KEY (user_id) REFERENCES tool_ocr_users(id) ON DELETE CASCADE,
INDEX idx_user_status (user_id, status),
INDEX idx_created (created_at),
INDEX idx_task_id (task_id),
INDEX idx_filename (filename)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci
COMMENT='OCR processing tasks with user association';
-- ============================================
-- 3. Task Files Table
-- ============================================
CREATE TABLE IF NOT EXISTS tool_ocr_task_files (
id INT PRIMARY KEY AUTO_INCREMENT,
task_id INT NOT NULL COMMENT 'Foreign key to tasks table',
original_name VARCHAR(255),
stored_path VARCHAR(500) COMMENT 'Actual file path on server',
file_size BIGINT COMMENT 'File size in bytes',
mime_type VARCHAR(100),
file_hash VARCHAR(64) COMMENT 'SHA256 hash for deduplication',
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
FOREIGN KEY (task_id) REFERENCES tool_ocr_tasks(id) ON DELETE CASCADE,
INDEX idx_task (task_id),
INDEX idx_hash (file_hash)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci
COMMENT='Files associated with OCR tasks';
-- ============================================
-- 4. Sessions Table (Token Storage)
-- ============================================
CREATE TABLE IF NOT EXISTS tool_ocr_sessions (
id INT PRIMARY KEY AUTO_INCREMENT,
user_id INT NOT NULL COMMENT 'Foreign key to users table',
session_id VARCHAR(255) UNIQUE NOT NULL COMMENT 'Unique session identifier',
access_token TEXT COMMENT 'Azure AD access token (encrypted)',
id_token TEXT COMMENT 'Azure AD ID token (encrypted)',
refresh_token TEXT COMMENT 'Refresh token if available',
expires_at TIMESTAMP NOT NULL COMMENT 'Token expiration time',
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
last_accessed TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
is_active BOOLEAN DEFAULT TRUE,
ip_address VARCHAR(45) COMMENT 'Client IP address',
user_agent TEXT COMMENT 'Client user agent',
FOREIGN KEY (user_id) REFERENCES tool_ocr_users(id) ON DELETE CASCADE,
INDEX idx_user (user_id),
INDEX idx_session (session_id),
INDEX idx_expires (expires_at),
INDEX idx_active (is_active)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci
COMMENT='User session and token management';
-- ============================================
-- 5. Audit Log Table (Optional)
-- ============================================
CREATE TABLE IF NOT EXISTS tool_ocr_audit_logs (
id BIGINT PRIMARY KEY AUTO_INCREMENT,
user_id INT COMMENT 'User who performed the action',
action VARCHAR(100) NOT NULL COMMENT 'Action performed',
entity_type VARCHAR(50) COMMENT 'Type of entity affected',
entity_id INT COMMENT 'ID of entity affected',
details JSON COMMENT 'Additional details in JSON format',
ip_address VARCHAR(45),
user_agent TEXT,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
INDEX idx_user (user_id),
INDEX idx_action (action),
INDEX idx_created (created_at)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci
COMMENT='Audit trail for all system actions';
-- ============================================
-- Views for Common Queries
-- ============================================
-- User task statistics view
CREATE OR REPLACE VIEW tool_ocr_user_stats AS
SELECT
u.id as user_id,
u.email,
u.display_name,
COUNT(DISTINCT t.id) as total_tasks,
SUM(CASE WHEN t.status = 'completed' THEN 1 ELSE 0 END) as completed_tasks,
SUM(CASE WHEN t.status = 'failed' THEN 1 ELSE 0 END) as failed_tasks,
SUM(CASE WHEN t.status = 'processing' THEN 1 ELSE 0 END) as processing_tasks,
SUM(CASE WHEN t.status = 'pending' THEN 1 ELSE 0 END) as pending_tasks,
AVG(t.processing_time_ms) as avg_processing_time_ms,
MAX(t.created_at) as last_task_created
FROM tool_ocr_users u
LEFT JOIN tool_ocr_tasks t ON u.id = t.user_id
GROUP BY u.id, u.email, u.display_name;
-- Recent tasks view
CREATE OR REPLACE VIEW tool_ocr_recent_tasks AS
SELECT
t.*,
u.email as user_email,
u.display_name as user_name
FROM tool_ocr_tasks t
INNER JOIN tool_ocr_users u ON t.user_id = u.id
ORDER BY t.created_at DESC
LIMIT 100;
-- ============================================
-- Stored Procedures (Optional)
-- ============================================
DELIMITER $$
-- Procedure to clean up expired sessions
CREATE PROCEDURE IF NOT EXISTS cleanup_expired_sessions()
BEGIN
DELETE FROM tool_ocr_sessions
WHERE expires_at < NOW() OR is_active = FALSE;
END$$
-- Procedure to clean up old tasks
CREATE PROCEDURE IF NOT EXISTS cleanup_old_tasks(IN days_to_keep INT)
BEGIN
UPDATE tool_ocr_tasks
SET file_deleted = TRUE
WHERE created_at < DATE_SUB(NOW(), INTERVAL days_to_keep DAY)
AND status IN ('completed', 'failed');
END$$
DELIMITER ;
-- ============================================
-- Initial Data (Optional)
-- ============================================
-- Add any initial data here if needed
-- ============================================
-- Grants (Adjust as needed)
-- ============================================
-- GRANT ALL PRIVILEGES ON tool_ocr_* TO 'tool_ocr_user'@'localhost';
-- FLUSH PRIVILEGES;