""" Sidecar API Router Provides HTTP endpoints for browser-based clients to access the Whisper transcription sidecar functionality. """ import os import tempfile import base64 from typing import Optional from fastapi import APIRouter, HTTPException, UploadFile, File, WebSocket, WebSocketDisconnect from fastapi.responses import JSONResponse from pydantic import BaseModel from ..sidecar_manager import get_sidecar_manager router = APIRouter(prefix="/sidecar", tags=["Sidecar"]) class TranscribeRequest(BaseModel): """Request for transcribing audio from base64 data.""" audio_data: str # Base64 encoded audio (webm/opus) class AudioChunkRequest(BaseModel): """Request for sending an audio chunk in streaming mode.""" data: str # Base64 encoded PCM audio @router.get("/status") async def get_sidecar_status(): """ Get the current status of the sidecar transcription engine. Returns: Status object with ready state, whisper model info, etc. """ manager = get_sidecar_manager() return manager.get_status() @router.post("/start") async def start_sidecar(): """ Start the sidecar transcription engine. This is typically called automatically on backend startup, but can be used to restart the sidecar if needed. """ manager = get_sidecar_manager() if not manager.is_available(): raise HTTPException( status_code=503, detail="Sidecar not available. Check if sidecar/transcriber.py and sidecar/venv exist." ) success = await manager.start() if not success: raise HTTPException( status_code=503, detail="Failed to start sidecar. Check backend logs for details." ) return {"status": "started", "ready": manager.ready} @router.post("/stop") async def stop_sidecar(): """Stop the sidecar transcription engine.""" manager = get_sidecar_manager() manager.stop() return {"status": "stopped"} @router.post("/transcribe") async def transcribe_audio(request: TranscribeRequest): """ Transcribe base64-encoded audio data. The audio should be in webm/opus format (as recorded by MediaRecorder). """ manager = get_sidecar_manager() if not manager.ready: raise HTTPException( status_code=503, detail="Sidecar not ready. Please wait for model to load." ) try: # Decode base64 audio audio_data = base64.b64decode(request.audio_data) # Save to temp file with tempfile.NamedTemporaryFile(suffix=".webm", delete=False) as f: f.write(audio_data) temp_path = f.name try: # Transcribe result = await manager.transcribe_file(temp_path) if result.get("error"): raise HTTPException(status_code=500, detail=result["error"]) return { "result": result.get("result", ""), "file": result.get("file", "") } finally: # Clean up temp file os.unlink(temp_path) except base64.binascii.Error: raise HTTPException(status_code=400, detail="Invalid base64 audio data") except Exception as e: raise HTTPException(status_code=500, detail=str(e)) @router.post("/transcribe-file") async def transcribe_audio_file(file: UploadFile = File(...)): """ Transcribe an uploaded audio file. Accepts common audio formats: mp3, wav, m4a, webm, ogg, flac, aac """ manager = get_sidecar_manager() if not manager.ready: raise HTTPException( status_code=503, detail="Sidecar not ready. Please wait for model to load." ) # Validate file extension allowed_extensions = {".mp3", ".wav", ".m4a", ".webm", ".ogg", ".flac", ".aac"} ext = os.path.splitext(file.filename or "")[1].lower() if ext not in allowed_extensions: raise HTTPException( status_code=400, detail=f"Unsupported audio format. Allowed: {', '.join(allowed_extensions)}" ) try: # Save uploaded file with tempfile.NamedTemporaryFile(suffix=ext, delete=False) as f: content = await file.read() f.write(content) temp_path = f.name try: result = await manager.transcribe_file(temp_path) if result.get("error"): raise HTTPException(status_code=500, detail=result["error"]) return { "result": result.get("result", ""), "filename": file.filename } finally: os.unlink(temp_path) except Exception as e: raise HTTPException(status_code=500, detail=str(e)) @router.post("/stream/start") async def start_streaming(): """ Start a streaming transcription session. Returns a session ID that should be used for subsequent audio chunks. """ manager = get_sidecar_manager() if not manager.ready: raise HTTPException( status_code=503, detail="Sidecar not ready. Please wait for model to load." ) result = await manager.start_stream() if result.get("error"): raise HTTPException(status_code=500, detail=result["error"]) return result @router.post("/stream/chunk") async def send_audio_chunk(request: AudioChunkRequest): """ Send an audio chunk for streaming transcription. The audio should be base64-encoded PCM data (16-bit, 16kHz, mono). Returns a transcription segment if speech end was detected, or null if more audio is needed. """ manager = get_sidecar_manager() if not manager.ready: raise HTTPException( status_code=503, detail="Sidecar not ready" ) result = await manager.send_audio_chunk(request.data) # Result may be None if no segment ready yet if result is None: return {"segment": None} if result.get("error"): raise HTTPException(status_code=500, detail=result["error"]) return {"segment": result} @router.post("/stream/stop") async def stop_streaming(): """ Stop the streaming transcription session. Returns any final transcription segments and session statistics. """ manager = get_sidecar_manager() result = await manager.stop_stream() if result.get("error"): raise HTTPException(status_code=500, detail=result["error"]) return result @router.post("/segment-audio") async def segment_audio_file(file: UploadFile = File(...), max_chunk_seconds: int = 300): """ Segment an audio file using VAD for natural speech boundaries. This is used for processing large audio files before cloud transcription. Args: file: The audio file to segment max_chunk_seconds: Maximum duration per chunk (default 300s / 5 minutes) Returns: List of segment metadata with file paths """ manager = get_sidecar_manager() if not manager.ready: raise HTTPException( status_code=503, detail="Sidecar not ready. Please wait for model to load." ) try: # Save uploaded file ext = os.path.splitext(file.filename or "")[1].lower() or ".wav" with tempfile.NamedTemporaryFile(suffix=ext, delete=False) as f: content = await file.read() f.write(content) temp_path = f.name try: result = await manager.segment_audio(temp_path, max_chunk_seconds) if result.get("error"): raise HTTPException(status_code=500, detail=result["error"]) return result finally: # Keep temp file for now - segments reference it # Will be cleaned up by the transcription process pass except Exception as e: raise HTTPException(status_code=500, detail=str(e)) @router.websocket("/ws") async def websocket_endpoint(websocket: WebSocket): """ WebSocket endpoint for real-time streaming transcription. Protocol: 1. Client connects 2. Client sends: {"action": "start_stream"} 3. Server responds: {"status": "streaming", "session_id": "..."} 4. Client sends: {"action": "audio_chunk", "data": ""} 5. Server responds: {"segment": {...}} when speech detected, or {"segment": null} 6. Client sends: {"action": "stop_stream"} 7. Server responds: {"status": "stream_stopped", ...} """ await websocket.accept() manager = get_sidecar_manager() if not manager.ready: await websocket.send_json({"error": "Sidecar not ready"}) await websocket.close() return try: while True: data = await websocket.receive_json() action = data.get("action") if action == "start_stream": result = await manager.start_stream() await websocket.send_json(result) elif action == "audio_chunk": audio_data = data.get("data") if audio_data: result = await manager.send_audio_chunk(audio_data) await websocket.send_json({"segment": result}) else: await websocket.send_json({"error": "No audio data"}) elif action == "stop_stream": result = await manager.stop_stream() await websocket.send_json(result) break elif action == "ping": await websocket.send_json({"status": "pong"}) else: await websocket.send_json({"error": f"Unknown action: {action}"}) except WebSocketDisconnect: # Clean up streaming session if active if manager._is_streaming(): await manager.stop_stream() except Exception as e: await websocket.send_json({"error": str(e)}) await websocket.close()