feat: implement kanban real-time sync and fix workload cache

## Kanban Real-time Sync (NEW-002)
- Backend:
  - WebSocket endpoint: /ws/projects/{project_id} (see the client sketch at the end of this section)
  - Project room management in ConnectionManager
  - Redis Pub/Sub: project:{project_id}:tasks channel
  - Task CRUD event publishing (5 event types)
  - Redis connection retry with exponential backoff
  - Race condition fix in broadcast_to_project

- Frontend:
  - ProjectSyncContext for WebSocket management
  - Reconnection with exponential backoff (max 5 attempts)
  - Multi-tab event deduplication via event_id (see the sketch after this list)
  - Live/Offline connection indicator
  - Optimistic updates with rollback
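
A minimal sketch of the event_id deduplication idea (illustrative only: the real implementation lives in the frontend ProjectSyncContext, and the class name and cache size here are invented):

```python
from collections import OrderedDict


class EventDeduper:
    """Remember recently seen event ids so duplicate deliveries are dropped."""

    def __init__(self, max_remembered: int = 512):
        self._seen: OrderedDict = OrderedDict()  # insertion-ordered set of ids
        self._max = max_remembered

    def is_duplicate(self, event_id: str) -> bool:
        if event_id in self._seen:
            return True
        self._seen[event_id] = None
        if len(self._seen) > self._max:
            self._seen.popitem(last=False)  # evict the oldest remembered id
        return False
```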

- Spec:
  - collaboration spec: +1 requirement (Project Real-time Sync)
  - 7 new scenarios for real-time sync
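
For reference, a minimal client sketch for the new endpoint (assumes the third-party `websockets` package; host, project id, and token are placeholders):

```python
import asyncio
import json

import websockets  # third-party package, assumed installed


async def watch_project(project_id: str, token: str):
    # The endpoint authenticates via a JWT passed as a query parameter.
    url = f"ws://localhost:8000/ws/projects/{project_id}?token={token}"
    async with websockets.connect(url) as ws:
        async for raw in ws:
            msg = json.loads(raw)
            if msg.get("type") == "ping":
                # Answer the server keepalive so the connection stays open.
                await ws.send(json.dumps({"type": "pong"}))
            elif msg.get("type") == "connected":
                print("joined project", msg["data"]["project_id"])
            else:
                print("event:", msg.get("type"), "from", msg.get("triggered_by"))


asyncio.run(watch_project("<project-id>", "<jwt-token>"))
```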

## Workload Cache Fix (NEW-001)
- Added cache invalidation to all task endpoints:
  - create_task, update_task, update_task_status
  - delete_task, restore_task, assign_task
- Extended `invalidate_user_workload_cache` to also clear heatmap caches (see the illustration below)
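
A worked illustration of the two key patterns involved (hypothetical key names that follow the patterns in the workload cache diff at the bottom of this commit):

```python
# Hypothetical keys cleared when invalidating user "42":
#   workload:user:42:2026-01-05   matched by workload:user:{user_id}:*
#   workload:heatmap:2026-W02     matched by workload:heatmap:* (cleared for
#                                 every user, since heatmaps aggregate users)
from app.services.workload_cache import invalidate_user_workload_cache

invalidate_user_workload_cache("42")  # called from each task mutation endpoint
```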

## OpenSpec Archive
- 2026-01-05-add-kanban-realtime-sync

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
beabigegg
2026-01-05 20:28:42 +08:00
parent 9b220523ff
commit 69b81d9241
13 changed files with 1470 additions and 31 deletions


@@ -1,3 +1,4 @@
import logging
import uuid
from datetime import datetime, timezone
from typing import List, Optional
@@ -5,6 +6,7 @@ from fastapi import APIRouter, Depends, HTTPException, status, Query, Request
from sqlalchemy.orm import Session
from app.core.database import get_db
from app.core.redis_pubsub import publish_task_event
from app.models import User, Project, Task, TaskStatus, AuditAction, Blocker
from app.schemas.task import (
    TaskCreate, TaskUpdate, TaskResponse, TaskWithDetails, TaskListResponse,
@@ -16,6 +18,9 @@ from app.middleware.auth import (
from app.middleware.audit import get_audit_metadata
from app.services.audit_service import AuditService
from app.services.trigger_service import TriggerService
from app.services.workload_cache import invalidate_user_workload_cache
logger = logging.getLogger(__name__)
router = APIRouter(tags=["tasks"])
@@ -231,6 +236,40 @@ async def create_task(
    db.commit()
    db.refresh(task)

    # Invalidate workload cache if task has an assignee
    if task.assignee_id:
        invalidate_user_workload_cache(task.assignee_id)

    # Publish real-time event
    try:
        await publish_task_event(
            project_id=str(task.project_id),
            event_type="task_created",
            task_data={
                "task_id": str(task.id),
                "project_id": str(task.project_id),
                "title": task.title,
                "description": task.description,
                "status_id": str(task.status_id) if task.status_id else None,
                "status_name": task.status.name if task.status else None,
                "status_color": task.status.color if task.status else None,
                "assignee_id": str(task.assignee_id) if task.assignee_id else None,
                "assignee_name": task.assignee.name if task.assignee else None,
                "priority": task.priority,
                "due_date": str(task.due_date) if task.due_date else None,
                "time_estimate": task.original_estimate,
                "original_estimate": task.original_estimate,
                "parent_task_id": str(task.parent_task_id) if task.parent_task_id else None,
                "position": task.position,
                "created_by": str(task.created_by),
                "creator_name": task.creator.name if task.creator else None,
                "created_at": str(task.created_at),
            },
            triggered_by=str(current_user.id)
        )
    except Exception as e:
        logger.warning(f"Failed to publish task_created event: {e}")

    return task
@@ -341,6 +380,40 @@ async def update_task(
    db.commit()
    db.refresh(task)

    # Invalidate workload cache if original_estimate changed and task has an assignee
    if "original_estimate" in update_data and task.assignee_id:
        invalidate_user_workload_cache(task.assignee_id)

    # Publish real-time event
    try:
        await publish_task_event(
            project_id=str(task.project_id),
            event_type="task_updated",
            task_data={
                "task_id": str(task.id),
                "project_id": str(task.project_id),
                "title": task.title,
                "description": task.description,
                "status_id": str(task.status_id) if task.status_id else None,
                "status_name": task.status.name if task.status else None,
                "status_color": task.status.color if task.status else None,
                "assignee_id": str(task.assignee_id) if task.assignee_id else None,
                "assignee_name": task.assignee.name if task.assignee else None,
                "priority": task.priority,
                "due_date": str(task.due_date) if task.due_date else None,
                "time_estimate": task.original_estimate,
                "original_estimate": task.original_estimate,
                "time_spent": task.time_spent,
                "parent_task_id": str(task.parent_task_id) if task.parent_task_id else None,
                "position": task.position,
                "updated_at": str(task.updated_at),
                "updated_fields": list(update_data.keys()),
            },
            triggered_by=str(current_user.id)
        )
    except Exception as e:
        logger.warning(f"Failed to publish task_updated event: {e}")

    return task
@@ -408,6 +481,26 @@ async def delete_task(
    db.commit()
    db.refresh(task)

    # Invalidate workload cache for assignee
    if task.assignee_id:
        invalidate_user_workload_cache(task.assignee_id)

    # Publish real-time event
    try:
        await publish_task_event(
            project_id=str(task.project_id),
            event_type="task_deleted",
            task_data={
                "task_id": str(task.id),
                "project_id": str(task.project_id),
                "title": task.title,
                "parent_task_id": str(task.parent_task_id) if task.parent_task_id else None,
            },
            triggered_by=str(current_user.id)
        )
    except Exception as e:
        logger.warning(f"Failed to publish task_deleted event: {e}")

    return task
@@ -461,6 +554,10 @@ async def restore_task(
    db.commit()
    db.refresh(task)

    # Invalidate workload cache for assignee
    if task.assignee_id:
        invalidate_user_workload_cache(task.assignee_id)

    return task
@@ -500,8 +597,9 @@ async def update_task_status(
            detail="Status not found in this project",
        )

    # Capture old status for triggers
    # Capture old status for triggers and event publishing
    old_status_id = task.status_id
    old_status_name = task.status.name if task.status else None
    task.status_id = status_data.status_id
@@ -530,6 +628,32 @@ async def update_task_status(
    db.commit()
    db.refresh(task)

    # Invalidate workload cache when status changes (affects completed/incomplete task calculations)
    if old_status_id != status_data.status_id and task.assignee_id:
        invalidate_user_workload_cache(task.assignee_id)

    # Publish real-time event
    try:
        await publish_task_event(
            project_id=str(task.project_id),
            event_type="task_status_changed",
            task_data={
                "task_id": str(task.id),
                "project_id": str(task.project_id),
                "title": task.title,
                "old_status_id": str(old_status_id) if old_status_id else None,
                "old_status_name": old_status_name,
                "new_status_id": str(task.status_id) if task.status_id else None,
                "new_status_name": task.status.name if task.status else None,
                "new_status_color": task.status.color if task.status else None,
                "assignee_id": str(task.assignee_id) if task.assignee_id else None,
                "blocker_flag": task.blocker_flag,
            },
            triggered_by=str(current_user.id)
        )
    except Exception as e:
        logger.warning(f"Failed to publish task_status_changed event: {e}")

    return task
@@ -568,6 +692,7 @@ async def assign_task(
        )

    old_assignee_id = task.assignee_id
    old_assignee_name = task.assignee.name if task.assignee else None
    task.assignee_id = assign_data.assignee_id

    # Audit log
@@ -594,6 +719,34 @@ async def assign_task(
    db.commit()
    db.refresh(task)

    # Invalidate workload cache for both old and new assignees
    if old_assignee_id != assign_data.assignee_id:
        if old_assignee_id:
            invalidate_user_workload_cache(old_assignee_id)
        if assign_data.assignee_id:
            invalidate_user_workload_cache(assign_data.assignee_id)

    # Publish real-time event
    try:
        await publish_task_event(
            project_id=str(task.project_id),
            event_type="task_assigned",
            task_data={
                "task_id": str(task.id),
                "project_id": str(task.project_id),
                "title": task.title,
                "old_assignee_id": str(old_assignee_id) if old_assignee_id else None,
                "old_assignee_name": old_assignee_name,
                "new_assignee_id": str(task.assignee_id) if task.assignee_id else None,
                "new_assignee_name": task.assignee.name if task.assignee else None,
                "status_id": str(task.status_id) if task.status_id else None,
                "status_name": task.status.name if task.status else None,
            },
            triggered_by=str(current_user.id)
        )
    except Exception as e:
        logger.warning(f"Failed to publish task_assigned event: {e}")

    return task
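
For illustration, the envelope a subscriber ultimately receives for one of these publishes (placeholder values; `publish_task_event`, shown in the Redis pub/sub diff below, wraps `task_data` and adds `event_id` and `timestamp`):

```python
# Illustrative envelope with placeholder values; the real "data" dict carries
# the full field set built above.
example_event = {
    "type": "task_status_changed",
    "event_id": "7f9c2c1e-0b2d-4c6a-9a1e-3d5f8b2a4c6d",  # uuid4, for multi-tab dedup
    "data": {
        "task_id": "c0ffee00-1111-2222-3333-444455556666",
        "old_status_name": "In Progress",
        "new_status_name": "Done",
    },
    "triggered_by": "user-uuid",
    "timestamp": "2026-01-05T12:28:42.123456",
}
```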


@@ -7,9 +7,10 @@ from sqlalchemy.orm import Session
from app.core.database import SessionLocal
from app.core.security import decode_access_token
from app.core.redis import get_redis_sync
from app.models import User, Notification
from app.models import User, Notification, Project
from app.services.websocket_manager import manager
from app.core.redis_pubsub import NotificationSubscriber
from app.core.redis_pubsub import NotificationSubscriber, ProjectTaskSubscriber
from app.middleware.auth import check_project_access
logger = logging.getLogger(__name__)
router = APIRouter(tags=["websocket"])
@@ -226,3 +227,182 @@ async def websocket_notifications(
                pass
        await subscriber.stop()
        await manager.disconnect(websocket, user_id)
async def verify_project_access(user_id: str, project_id: str) -> tuple[bool, Project | None]:
    """
    Check if user has access to the project.

    Args:
        user_id: The user's ID
        project_id: The project's ID

    Returns:
        Tuple of (has_access: bool, project: Project | None)
    """
    db = SessionLocal()
    try:
        # Get the user
        user = db.query(User).filter(User.id == user_id).first()
        if user is None or not user.is_active:
            return False, None

        # Get the project
        project = db.query(Project).filter(Project.id == project_id).first()
        if project is None:
            return False, None

        # Check access using existing middleware function
        has_access = check_project_access(user, project)
        return has_access, project
    finally:
        db.close()
@router.websocket("/ws/projects/{project_id}")
async def websocket_project_sync(
    websocket: WebSocket,
    project_id: str,
    token: str = Query(..., description="JWT token for authentication"),
):
    """
    WebSocket endpoint for project task real-time sync.

    Connect with: ws://host/ws/projects/{project_id}?token=<jwt_token>

    Messages sent by server:
    - {"type": "connected", "data": {"project_id": "...", "user_id": "..."}}
    - {"type": "task_created", "data": {...}, "triggered_by": "..."}
    - {"type": "task_updated", "data": {...}, "triggered_by": "..."}
    - {"type": "task_status_changed", "data": {...}, "triggered_by": "..."}
    - {"type": "task_deleted", "data": {...}, "triggered_by": "..."}
    - {"type": "task_assigned", "data": {...}, "triggered_by": "..."}
    - {"type": "ping"} / {"type": "pong"}

    Messages accepted from client:
    - {"type": "ping"} - Client keepalive ping
    """
    # Authenticate user
    user_id, user = await get_user_from_token(token)
    if user_id is None:
        await websocket.close(code=4001, reason="Invalid or expired token")
        return

    # Verify the project exists and the user has access (existence is checked
    # first so a missing project closes with 4004 rather than the generic 4003)
    has_access, project = await verify_project_access(user_id, project_id)
    if project is None:
        await websocket.close(code=4004, reason="Project not found")
        return
    if not has_access:
        await websocket.close(code=4003, reason="Access denied to this project")
        return
    # Accept connection and join project room
    await websocket.accept()
    await manager.join_project(websocket, user_id, project_id)

    # Create Redis subscriber for project task events
    subscriber = ProjectTaskSubscriber(project_id)

    async def handle_redis_message(event_data: dict):
        """Forward Redis pub/sub task events to WebSocket."""
        try:
            # Forward the event directly (it already contains type, data, triggered_by)
            await websocket.send_json(event_data)
        except Exception as e:
            logger.error(f"Error forwarding task event to WebSocket: {e}")

    redis_task = None
    try:
        # Send initial connection success message
        await websocket.send_json({
            "type": "connected",
            "data": {
                "project_id": project_id,
                "user_id": user_id,
                "project_title": project.title if project else None,
            },
        })
        logger.info(f"User {user_id} connected to project {project_id} WebSocket")

        # Start Redis pub/sub subscription in background
        await subscriber.start()
        redis_task = asyncio.create_task(subscriber.listen(handle_redis_message))

        # Heartbeat tracking (reuse same configuration as notifications)
        waiting_for_pong = False
        ping_sent_at = 0.0
        last_activity = time.time()

        while True:
            # Calculate appropriate timeout based on state
            if waiting_for_pong:
                # When waiting for pong, use remaining pong timeout
                remaining = PONG_TIMEOUT - (time.time() - ping_sent_at)
                if remaining <= 0:
                    logger.warning(f"Pong timeout for user {user_id} in project {project_id}, disconnecting")
                    break
                timeout = remaining
            else:
                # When not waiting, use remaining ping interval
                remaining = PING_INTERVAL - (time.time() - last_activity)
                if remaining <= 0:
                    # Time to send ping immediately
                    try:
                        await websocket.send_json({"type": "ping"})
                        waiting_for_pong = True
                        ping_sent_at = time.time()
                        last_activity = ping_sent_at
                        timeout = PONG_TIMEOUT
                    except Exception:
                        break
                else:
                    timeout = remaining

            try:
                # Wait for messages from client
                data = await asyncio.wait_for(
                    websocket.receive_json(),
                    timeout=timeout
                )
                last_activity = time.time()
                msg_type = data.get("type")

                # Handle ping message from client
                if msg_type == "ping":
                    await websocket.send_json({"type": "pong"})
                # Handle pong message from client (response to our ping)
                elif msg_type == "pong":
                    waiting_for_pong = False
                    logger.debug(f"Pong received from user {user_id} in project {project_id}")
            except asyncio.TimeoutError:
                if waiting_for_pong:
                    # Strict timeout check
                    if time.time() - ping_sent_at >= PONG_TIMEOUT:
                        logger.warning(f"Pong timeout for user {user_id} in project {project_id}, disconnecting")
                        break
                # If not waiting_for_pong, loop will handle sending ping at top
    except WebSocketDisconnect:
        logger.info(f"User {user_id} disconnected from project {project_id} WebSocket")
    except Exception as e:
        logger.error(f"WebSocket error for project {project_id}: {e}")
    finally:
        # Clean up Redis subscription
        if redis_task:
            redis_task.cancel()
            try:
                await redis_task
            except asyncio.CancelledError:
                pass
        await subscriber.stop()
        await manager.leave_project(websocket, user_id, project_id)
        logger.info(f"User {user_id} left project {project_id} room")


@@ -1,7 +1,10 @@
"""Redis Pub/Sub service for cross-process notification broadcasting."""
import asyncio
import json
import logging
import uuid
from datetime import datetime
from typing import Optional, Callable, Any
import redis.asyncio as aioredis
@@ -9,6 +12,10 @@ from app.core.config import settings
logger = logging.getLogger(__name__)
# Redis retry configuration
MAX_REDIS_RETRIES = 3
REDIS_RETRY_DELAY = 0.5 # seconds (base delay for exponential backoff)
# Global async Redis client for pub/sub
_pubsub_redis: Optional[aioredis.Redis] = None
@@ -18,6 +25,11 @@ def get_channel_name(user_id: str) -> str:
return f"notifications:{user_id}"
def get_project_channel_name(project_id: str) -> str:
"""Get the Redis channel name for project task events."""
return f"project:{project_id}:tasks"
async def get_pubsub_redis() -> aioredis.Redis:
"""Get or create the async Redis client for pub/sub."""
global _pubsub_redis
@@ -120,3 +132,187 @@ class NotificationSubscriber:
    @property
    def is_running(self) -> bool:
        return self._running


async def _reset_pubsub_redis() -> None:
    """Reset the Redis connection on failure."""
    global _pubsub_redis
    if _pubsub_redis is not None:
        try:
            await _pubsub_redis.close()
        except Exception:
            pass
        _pubsub_redis = None
async def publish_task_event(
    project_id: str,
    event_type: str,
    task_data: dict,
    triggered_by: str
) -> bool:
    """
    Publish a task event to a project's channel with retry logic.

    Args:
        project_id: The project ID
        event_type: Event type (task_created, task_updated, task_status_changed, task_deleted, task_assigned)
        task_data: The task data to include in the event
        triggered_by: User ID who triggered this event

    Returns:
        True if published successfully, False otherwise
    """
    channel = get_project_channel_name(project_id)
    message = json.dumps({
        "type": event_type,
        "event_id": str(uuid.uuid4()),  # Unique event ID for multi-tab deduplication
        "data": task_data,
        "triggered_by": triggered_by,
        "timestamp": datetime.utcnow().isoformat(),
    }, default=str)

    for attempt in range(MAX_REDIS_RETRIES):
        try:
            redis_client = await get_pubsub_redis()
            # Test connection with ping before publishing
            await redis_client.ping()
            await redis_client.publish(channel, message)
            logger.debug(f"Published task event '{event_type}' to channel {channel}")
            return True
        except Exception as e:
            logger.warning(f"Redis publish attempt {attempt + 1}/{MAX_REDIS_RETRIES} failed: {e}")
            if attempt < MAX_REDIS_RETRIES - 1:
                # Exponential backoff (0.5s, 1s, 2s, ...)
                await asyncio.sleep(REDIS_RETRY_DELAY * (2 ** attempt))
                # Reset connection on failure
                await _reset_pubsub_redis()
            else:
                logger.error(f"Failed to publish task event '{event_type}' after {MAX_REDIS_RETRIES} attempts")
                return False
    return False
class ProjectTaskSubscriber:
    """
    Subscriber for project task events via Redis Pub/Sub.
    Used by WebSocket connections to receive real-time task updates.
    Includes automatic reconnection handling.
    """

    def __init__(self, project_id: str):
        self.project_id = project_id
        self.channel = get_project_channel_name(project_id)
        self.pubsub: Optional[aioredis.client.PubSub] = None
        self._running = False
        self._reconnect_attempts = 0

    async def start(self) -> None:
        """Start subscribing to the project's task channel with retry logic."""
        for attempt in range(MAX_REDIS_RETRIES):
            try:
                redis_client = await get_pubsub_redis()
                # Test connection health
                await redis_client.ping()
                self.pubsub = redis_client.pubsub()
                await self.pubsub.subscribe(self.channel)
                self._running = True
                self._reconnect_attempts = 0
                logger.debug(f"Subscribed to project task channel {self.channel}")
                return
            except Exception as e:
                logger.warning(f"Redis subscribe attempt {attempt + 1}/{MAX_REDIS_RETRIES} failed: {e}")
                if attempt < MAX_REDIS_RETRIES - 1:
                    # Exponential backoff before retrying
                    await asyncio.sleep(REDIS_RETRY_DELAY * (2 ** attempt))
                    await _reset_pubsub_redis()
                else:
                    logger.error(f"Failed to subscribe to channel {self.channel} after {MAX_REDIS_RETRIES} attempts")
                    raise
    async def _reconnect(self) -> bool:
        """Attempt to reconnect to Redis and resubscribe."""
        self._reconnect_attempts += 1
        if self._reconnect_attempts > MAX_REDIS_RETRIES:
            logger.error(f"Max reconnection attempts reached for channel {self.channel}")
            return False

        logger.info(f"Attempting to reconnect to Redis (attempt {self._reconnect_attempts}/{MAX_REDIS_RETRIES})")

        # Clean up old pubsub
        if self.pubsub:
            try:
                await self.pubsub.close()
            except Exception:
                pass
            self.pubsub = None

        # Reset global connection
        await _reset_pubsub_redis()

        # Wait with exponential backoff
        await asyncio.sleep(REDIS_RETRY_DELAY * (2 ** (self._reconnect_attempts - 1)))

        try:
            redis_client = await get_pubsub_redis()
            await redis_client.ping()
            self.pubsub = redis_client.pubsub()
            await self.pubsub.subscribe(self.channel)
            self._reconnect_attempts = 0
            logger.info(f"Successfully reconnected to channel {self.channel}")
            return True
        except Exception as e:
            logger.warning(f"Reconnection attempt failed: {e}")
            return False
    async def stop(self) -> None:
        """Stop subscribing and clean up."""
        self._running = False
        if self.pubsub:
            try:
                await self.pubsub.unsubscribe(self.channel)
                await self.pubsub.close()
            except Exception as e:
                logger.warning(f"Error during pubsub cleanup: {e}")
            self.pubsub = None
        logger.debug(f"Unsubscribed from project task channel {self.channel}")

    async def listen(self, callback: Callable[[dict], Any]) -> None:
        """
        Listen for task events and call the callback for each event.
        Includes automatic reconnection on connection failures.

        Args:
            callback: Async function to call with each task event dict.
                The dict contains: type, data, triggered_by
        """
        if not self.pubsub:
            raise RuntimeError("Subscriber not started. Call start() first.")

        while self._running:
            try:
                async for message in self.pubsub.listen():
                    if not self._running:
                        break
                    if message["type"] == "message":
                        try:
                            data = json.loads(message["data"])
                            await callback(data)
                        except json.JSONDecodeError:
                            logger.warning(f"Invalid JSON in task event: {message['data']}")
                        except Exception as e:
                            logger.error(f"Error processing task event: {e}")
            except Exception as e:
                if not self._running:
                    break
                logger.warning(f"Redis connection error in task listener: {e}")
                # Attempt to reconnect
                if await self._reconnect():
                    continue  # Resume listening after successful reconnection
                else:
                    logger.error(f"Failed to recover connection for channel {self.channel}")
                    break

    @property
    def is_running(self) -> bool:
        return self._running
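
A round-trip usage sketch for these primitives (assumes a reachable Redis instance per `app.core.config.settings`; ids are placeholders):

```python
import asyncio

from app.core.redis_pubsub import ProjectTaskSubscriber, publish_task_event


async def demo(project_id: str):
    subscriber = ProjectTaskSubscriber(project_id)
    await subscriber.start()

    async def on_event(event: dict):
        print("received", event["type"], event["event_id"])

    listener = asyncio.create_task(subscriber.listen(on_event))

    await publish_task_event(
        project_id=project_id,
        event_type="task_updated",
        task_data={"task_id": "demo-task"},
        triggered_by="demo-user",
    )

    await asyncio.sleep(0.5)  # give the listener a moment to deliver
    await subscriber.stop()
    listener.cancel()


asyncio.run(demo("demo-project"))
```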


@@ -1,17 +1,23 @@
import json
import asyncio
from typing import Dict, Set, Optional
import logging
from typing import Dict, Set, Optional, Tuple
from fastapi import WebSocket
from app.core.redis import get_redis_sync
logger = logging.getLogger(__name__)
class ConnectionManager:
    """Manager for WebSocket connections."""

    def __init__(self):
        # user_id -> set of WebSocket connections
        # user_id -> set of WebSocket connections (for notifications)
        self.active_connections: Dict[str, Set[WebSocket]] = {}
        # project_id -> set of (user_id, WebSocket) tuples (for project sync)
        self.project_connections: Dict[str, Set[Tuple[str, WebSocket]]] = {}
        self._lock = asyncio.Lock()
        self._project_lock = asyncio.Lock()

    async def connect(self, websocket: WebSocket, user_id: str):
        """Accept and track a new WebSocket connection."""
@@ -52,6 +58,93 @@ class ConnectionManager:
        """Check if a user has any active connections."""
        return user_id in self.active_connections and len(self.active_connections[user_id]) > 0
    # Project room management methods

    async def join_project(self, websocket: WebSocket, user_id: str, project_id: str):
        """
        Add user to a project room for real-time task sync.

        Args:
            websocket: The WebSocket connection
            user_id: The user's ID
            project_id: The project to join
        """
        async with self._project_lock:
            if project_id not in self.project_connections:
                self.project_connections[project_id] = set()
            self.project_connections[project_id].add((user_id, websocket))
            logger.debug(f"User {user_id} joined project room {project_id}")

    async def leave_project(self, websocket: WebSocket, user_id: str, project_id: str):
        """
        Remove user from a project room.

        Args:
            websocket: The WebSocket connection
            user_id: The user's ID
            project_id: The project to leave
        """
        async with self._project_lock:
            if project_id in self.project_connections:
                self.project_connections[project_id].discard((user_id, websocket))
                if not self.project_connections[project_id]:
                    del self.project_connections[project_id]
            logger.debug(f"User {user_id} left project room {project_id}")

    async def broadcast_to_project(
        self,
        project_id: str,
        message: dict,
        exclude_user_id: Optional[str] = None
    ):
        """
        Broadcast message to all users in a project room.

        Args:
            project_id: The project room to broadcast to
            message: The message to send
            exclude_user_id: Optional user ID to exclude from broadcast (e.g., the sender)
        """
        # Create snapshot while holding lock to prevent race condition
        async with self._project_lock:
            if project_id not in self.project_connections:
                return
            connections_snapshot = list(self.project_connections[project_id])

        disconnected = set()
        for user_id, websocket in connections_snapshot:
            # Skip excluded user (sender)
            if exclude_user_id and user_id == exclude_user_id:
                continue
            try:
                await websocket.send_json(message)
            except Exception as e:
                logger.warning(f"Failed to send message to user {user_id} in project {project_id}: {e}")
                disconnected.add((user_id, websocket))

        # Clean up disconnected connections
        if disconnected:
            async with self._project_lock:
                for conn in disconnected:
                    if project_id in self.project_connections:
                        self.project_connections[project_id].discard(conn)
                        if not self.project_connections[project_id]:
                            del self.project_connections[project_id]

    def get_project_user_count(self, project_id: str) -> int:
        """Get the number of unique users in a project room."""
        if project_id not in self.project_connections:
            return 0
        unique_users = set(user_id for user_id, _ in self.project_connections[project_id])
        return len(unique_users)

    def is_user_in_project(self, user_id: str, project_id: str) -> bool:
        """Check if a user has any active connections to a project room."""
        if project_id not in self.project_connections:
            return False
        return any(uid == user_id for uid, _ in self.project_connections[project_id])


# Global connection manager instance
manager = ConnectionManager()
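
A hypothetical call site for `broadcast_to_project` (ids are placeholders; note the WebSocket endpoint in this commit forwards Redis events per-socket instead, so this only illustrates the manager API):

```python
import asyncio

from app.services.websocket_manager import manager


async def notify_viewers():
    # Push a message to everyone viewing the project, excluding the sender.
    await manager.broadcast_to_project(
        project_id="project-uuid",
        message={"type": "task_updated", "data": {"task_id": "task-uuid"}},
        exclude_user_id="acting-user-uuid",
    )


asyncio.run(notify_viewers())
```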


@@ -155,9 +155,19 @@ def invalidate_user_workload_cache(user_id: str) -> None:
    """
    Invalidate all cached workload data for a user.

    This clears:
    1. User-specific workload cache (workload:user:{user_id}:*)
    2. All heatmap caches (workload:heatmap:*) since heatmap aggregates may include this user

    Note: This uses pattern matching which may be slow for large datasets.
    For Phase 1, we rely on TTL expiration instead of active invalidation.
    """
    pattern = f"workload:*:{user_id}:*"
    for key in redis_client.scan_iter(match=pattern):
    # Clear user-specific workload cache
    user_pattern = f"workload:user:{user_id}:*"
    for key in redis_client.scan_iter(match=user_pattern):
        redis_client.delete(key)

    # Clear all heatmap caches since they aggregate user data
    heatmap_pattern = "workload:heatmap:*"
    for key in redis_client.scan_iter(match=heatmap_pattern):
        redis_client.delete(key)