- Add sidecar management to backend (sidecar_manager.py)
- Add sidecar API router for browser mode (/api/sidecar/*)
- Add browser-api.js polyfill for running in Chrome/Edge
- Add "Open in Browser" button when audio access fails
- Update build scripts with new sidecar modules
- Add start-browser.sh for development browser mode

Browser mode allows users to open the app in their system browser when Electron's audio access is blocked by security software. The backend manages the sidecar process in browser mode (BROWSER_MODE=true).

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
347 lines
9.8 KiB
Python
"""
|
|
Sidecar API Router
|
|
|
|
Provides HTTP endpoints for browser-based clients to access
|
|
the Whisper transcription sidecar functionality.
|
|
"""
|
|
|
|
import os
|
|
import tempfile
|
|
import base64
|
|
from typing import Optional
|
|
from fastapi import APIRouter, HTTPException, UploadFile, File, WebSocket, WebSocketDisconnect
|
|
from fastapi.responses import JSONResponse
|
|
from pydantic import BaseModel
|
|
|
|
from ..sidecar_manager import get_sidecar_manager
|
|
|
|
router = APIRouter(prefix="/sidecar", tags=["Sidecar"])
|
|
|
|
|
|
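# Editor's illustrative sketch (not part of the original file): the commit message
# exposes these endpoints as /api/sidecar/*, which is consistent with including this
# router under an "/api" prefix when the FastAPI app is assembled. The factory below
# is an assumption for illustration, not the project's actual app wiring.
def _example_create_app():
    """Build a minimal FastAPI app that serves this router under /api."""
    from fastapi import FastAPI

    app = FastAPI()
    app.include_router(router, prefix="/api")  # routes become /api/sidecar/...
    return app

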
class TranscribeRequest(BaseModel):
    """Request for transcribing audio from base64 data."""
    audio_data: str  # Base64 encoded audio (webm/opus)


class AudioChunkRequest(BaseModel):
    """Request for sending an audio chunk in streaming mode."""
    data: str  # Base64 encoded PCM audio


@router.get("/status")
async def get_sidecar_status():
    """
    Get the current status of the sidecar transcription engine.

    Returns:
        Status object with ready state, whisper model info, etc.
    """
    manager = get_sidecar_manager()
    return manager.get_status()


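# Editor's illustrative sketch (not part of the original file): a browser-mode client
# can poll /status until the Whisper model has loaded. The base URL and the "ready"
# field are assumptions -- the real payload shape comes from sidecar_manager.get_status().
def _example_wait_until_ready(base_url: str = "http://localhost:8000/api/sidecar",
                              timeout: float = 120.0) -> dict:
    """Poll the status endpoint until the sidecar reports it is ready."""
    import time

    import requests  # assumed available in the client environment

    deadline = time.time() + timeout
    while time.time() < deadline:
        status = requests.get(f"{base_url}/status", timeout=5).json()
        if status.get("ready"):
            return status
        time.sleep(1)
    raise TimeoutError("Sidecar did not become ready in time")

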
@router.post("/start")
|
|
async def start_sidecar():
|
|
"""
|
|
Start the sidecar transcription engine.
|
|
|
|
This is typically called automatically on backend startup,
|
|
but can be used to restart the sidecar if needed.
|
|
"""
|
|
manager = get_sidecar_manager()
|
|
|
|
if not manager.is_available():
|
|
raise HTTPException(
|
|
status_code=503,
|
|
detail="Sidecar not available. Check if sidecar/transcriber.py and sidecar/venv exist."
|
|
)
|
|
|
|
success = await manager.start()
|
|
if not success:
|
|
raise HTTPException(
|
|
status_code=503,
|
|
detail="Failed to start sidecar. Check backend logs for details."
|
|
)
|
|
|
|
return {"status": "started", "ready": manager.ready}
|
|
|
|
|
|
@router.post("/stop")
|
|
async def stop_sidecar():
|
|
"""Stop the sidecar transcription engine."""
|
|
manager = get_sidecar_manager()
|
|
manager.stop()
|
|
return {"status": "stopped"}
|
|
|
|
|
|
@router.post("/transcribe")
|
|
async def transcribe_audio(request: TranscribeRequest):
|
|
"""
|
|
Transcribe base64-encoded audio data.
|
|
|
|
The audio should be in webm/opus format (as recorded by MediaRecorder).
|
|
"""
|
|
manager = get_sidecar_manager()
|
|
|
|
if not manager.ready:
|
|
raise HTTPException(
|
|
status_code=503,
|
|
detail="Sidecar not ready. Please wait for model to load."
|
|
)
|
|
|
|
try:
|
|
# Decode base64 audio
|
|
audio_data = base64.b64decode(request.audio_data)
|
|
|
|
# Save to temp file
|
|
with tempfile.NamedTemporaryFile(suffix=".webm", delete=False) as f:
|
|
f.write(audio_data)
|
|
temp_path = f.name
|
|
|
|
try:
|
|
# Transcribe
|
|
result = await manager.transcribe_file(temp_path)
|
|
|
|
if result.get("error"):
|
|
raise HTTPException(status_code=500, detail=result["error"])
|
|
|
|
return {
|
|
"result": result.get("result", ""),
|
|
"file": result.get("file", "")
|
|
}
|
|
|
|
finally:
|
|
# Clean up temp file
|
|
os.unlink(temp_path)
|
|
|
|
except base64.binascii.Error:
|
|
raise HTTPException(status_code=400, detail="Invalid base64 audio data")
|
|
except Exception as e:
|
|
raise HTTPException(status_code=500, detail=str(e))
|
|
|
|
|
|
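# Editor's illustrative sketch (not part of the original file): calling /transcribe with
# a MediaRecorder webm recording that has been saved to disk. The base URL is an
# assumption; the request and response fields mirror the endpoint above.
def _example_transcribe_webm(path: str,
                             base_url: str = "http://localhost:8000/api/sidecar") -> str:
    """Base64-encode a webm recording and return the transcribed text."""
    import requests  # assumed available in the client environment

    with open(path, "rb") as f:
        audio_b64 = base64.b64encode(f.read()).decode("ascii")

    resp = requests.post(f"{base_url}/transcribe", json={"audio_data": audio_b64}, timeout=300)
    resp.raise_for_status()
    return resp.json()["result"]

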
@router.post("/transcribe-file")
|
|
async def transcribe_audio_file(file: UploadFile = File(...)):
|
|
"""
|
|
Transcribe an uploaded audio file.
|
|
|
|
Accepts common audio formats: mp3, wav, m4a, webm, ogg, flac, aac
|
|
"""
|
|
manager = get_sidecar_manager()
|
|
|
|
if not manager.ready:
|
|
raise HTTPException(
|
|
status_code=503,
|
|
detail="Sidecar not ready. Please wait for model to load."
|
|
)
|
|
|
|
# Validate file extension
|
|
allowed_extensions = {".mp3", ".wav", ".m4a", ".webm", ".ogg", ".flac", ".aac"}
|
|
ext = os.path.splitext(file.filename or "")[1].lower()
|
|
if ext not in allowed_extensions:
|
|
raise HTTPException(
|
|
status_code=400,
|
|
detail=f"Unsupported audio format. Allowed: {', '.join(allowed_extensions)}"
|
|
)
|
|
|
|
try:
|
|
# Save uploaded file
|
|
with tempfile.NamedTemporaryFile(suffix=ext, delete=False) as f:
|
|
content = await file.read()
|
|
f.write(content)
|
|
temp_path = f.name
|
|
|
|
try:
|
|
result = await manager.transcribe_file(temp_path)
|
|
|
|
if result.get("error"):
|
|
raise HTTPException(status_code=500, detail=result["error"])
|
|
|
|
return {
|
|
"result": result.get("result", ""),
|
|
"filename": file.filename
|
|
}
|
|
|
|
finally:
|
|
os.unlink(temp_path)
|
|
|
|
except Exception as e:
|
|
raise HTTPException(status_code=500, detail=str(e))
|
|
|
|
|
|
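# Editor's illustrative sketch (not part of the original file): uploading an audio file
# as multipart/form-data instead of base64. The base URL is an assumption.
def _example_transcribe_upload(path: str,
                               base_url: str = "http://localhost:8000/api/sidecar") -> dict:
    """Upload an audio file to /transcribe-file and return the JSON response."""
    import requests  # assumed available in the client environment

    with open(path, "rb") as f:
        resp = requests.post(f"{base_url}/transcribe-file", files={"file": f}, timeout=600)
    resp.raise_for_status()
    return resp.json()

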
@router.post("/stream/start")
|
|
async def start_streaming():
|
|
"""
|
|
Start a streaming transcription session.
|
|
|
|
Returns a session ID that should be used for subsequent audio chunks.
|
|
"""
|
|
manager = get_sidecar_manager()
|
|
|
|
if not manager.ready:
|
|
raise HTTPException(
|
|
status_code=503,
|
|
detail="Sidecar not ready. Please wait for model to load."
|
|
)
|
|
|
|
result = await manager.start_stream()
|
|
|
|
if result.get("error"):
|
|
raise HTTPException(status_code=500, detail=result["error"])
|
|
|
|
return result
|
|
|
|
|
|
@router.post("/stream/chunk")
|
|
async def send_audio_chunk(request: AudioChunkRequest):
|
|
"""
|
|
Send an audio chunk for streaming transcription.
|
|
|
|
The audio should be base64-encoded PCM data (16-bit, 16kHz, mono).
|
|
|
|
Returns a transcription segment if speech end was detected,
|
|
or null if more audio is needed.
|
|
"""
|
|
manager = get_sidecar_manager()
|
|
|
|
if not manager.ready:
|
|
raise HTTPException(
|
|
status_code=503,
|
|
detail="Sidecar not ready"
|
|
)
|
|
|
|
result = await manager.send_audio_chunk(request.data)
|
|
|
|
# Result may be None if no segment ready yet
|
|
if result is None:
|
|
return {"segment": None}
|
|
|
|
if result.get("error"):
|
|
raise HTTPException(status_code=500, detail=result["error"])
|
|
|
|
return {"segment": result}
|
|
|
|
|
|
@router.post("/stream/stop")
|
|
async def stop_streaming():
|
|
"""
|
|
Stop the streaming transcription session.
|
|
|
|
Returns any final transcription segments and session statistics.
|
|
"""
|
|
manager = get_sidecar_manager()
|
|
|
|
result = await manager.stop_stream()
|
|
|
|
if result.get("error"):
|
|
raise HTTPException(status_code=500, detail=result["error"])
|
|
|
|
return result
|
|
|
|
|
|
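# Editor's illustrative sketch (not part of the original file): the HTTP streaming flow
# above -- start a session, post base64 PCM chunks (16-bit, 16 kHz, mono), then stop.
# The chunk iterable and the base URL are assumptions.
def _example_stream_pcm(chunks, base_url: str = "http://localhost:8000/api/sidecar"):
    """Send raw PCM chunks through the /stream/* endpoints and yield segments."""
    import requests  # assumed available in the client environment

    requests.post(f"{base_url}/stream/start", timeout=30).raise_for_status()
    try:
        for pcm_bytes in chunks:  # each item: raw 16-bit / 16 kHz / mono PCM bytes
            payload = {"data": base64.b64encode(pcm_bytes).decode("ascii")}
            resp = requests.post(f"{base_url}/stream/chunk", json=payload, timeout=30)
            resp.raise_for_status()
            segment = resp.json().get("segment")
            if segment is not None:
                yield segment
    finally:
        requests.post(f"{base_url}/stream/stop", timeout=30)

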
@router.post("/segment-audio")
|
|
async def segment_audio_file(file: UploadFile = File(...), max_chunk_seconds: int = 300):
|
|
"""
|
|
Segment an audio file using VAD for natural speech boundaries.
|
|
|
|
This is used for processing large audio files before cloud transcription.
|
|
|
|
Args:
|
|
file: The audio file to segment
|
|
max_chunk_seconds: Maximum duration per chunk (default 300s / 5 minutes)
|
|
|
|
Returns:
|
|
List of segment metadata with file paths
|
|
"""
|
|
manager = get_sidecar_manager()
|
|
|
|
if not manager.ready:
|
|
raise HTTPException(
|
|
status_code=503,
|
|
detail="Sidecar not ready. Please wait for model to load."
|
|
)
|
|
|
|
try:
|
|
# Save uploaded file
|
|
ext = os.path.splitext(file.filename or "")[1].lower() or ".wav"
|
|
with tempfile.NamedTemporaryFile(suffix=ext, delete=False) as f:
|
|
content = await file.read()
|
|
f.write(content)
|
|
temp_path = f.name
|
|
|
|
try:
|
|
result = await manager.segment_audio(temp_path, max_chunk_seconds)
|
|
|
|
if result.get("error"):
|
|
raise HTTPException(status_code=500, detail=result["error"])
|
|
|
|
return result
|
|
|
|
finally:
|
|
# Keep temp file for now - segments reference it
|
|
# Will be cleaned up by the transcription process
|
|
pass
|
|
|
|
except Exception as e:
|
|
raise HTTPException(status_code=500, detail=str(e))
|
|
|
|
|
|
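# Editor's illustrative sketch (not part of the original file): /segment-audio takes the
# upload as multipart form data and max_chunk_seconds as a query parameter. The base URL
# is an assumption.
def _example_segment_audio(path: str, max_chunk_seconds: int = 300,
                           base_url: str = "http://localhost:8000/api/sidecar") -> dict:
    """Ask the sidecar to split a long recording at natural speech boundaries."""
    import requests  # assumed available in the client environment

    with open(path, "rb") as f:
        resp = requests.post(
            f"{base_url}/segment-audio",
            params={"max_chunk_seconds": max_chunk_seconds},
            files={"file": f},
            timeout=600,
        )
    resp.raise_for_status()
    return resp.json()

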
@router.websocket("/ws")
|
|
async def websocket_endpoint(websocket: WebSocket):
|
|
"""
|
|
WebSocket endpoint for real-time streaming transcription.
|
|
|
|
Protocol:
|
|
1. Client connects
|
|
2. Client sends: {"action": "start_stream"}
|
|
3. Server responds: {"status": "streaming", "session_id": "..."}
|
|
4. Client sends: {"action": "audio_chunk", "data": "<base64_pcm>"}
|
|
5. Server responds: {"segment": {...}} when speech detected, or {"segment": null}
|
|
6. Client sends: {"action": "stop_stream"}
|
|
7. Server responds: {"status": "stream_stopped", ...}
|
|
"""
|
|
await websocket.accept()
|
|
|
|
manager = get_sidecar_manager()
|
|
|
|
if not manager.ready:
|
|
await websocket.send_json({"error": "Sidecar not ready"})
|
|
await websocket.close()
|
|
return
|
|
|
|
try:
|
|
while True:
|
|
data = await websocket.receive_json()
|
|
action = data.get("action")
|
|
|
|
if action == "start_stream":
|
|
result = await manager.start_stream()
|
|
await websocket.send_json(result)
|
|
|
|
elif action == "audio_chunk":
|
|
audio_data = data.get("data")
|
|
if audio_data:
|
|
result = await manager.send_audio_chunk(audio_data)
|
|
await websocket.send_json({"segment": result})
|
|
else:
|
|
await websocket.send_json({"error": "No audio data"})
|
|
|
|
elif action == "stop_stream":
|
|
result = await manager.stop_stream()
|
|
await websocket.send_json(result)
|
|
break
|
|
|
|
elif action == "ping":
|
|
await websocket.send_json({"status": "pong"})
|
|
|
|
else:
|
|
await websocket.send_json({"error": f"Unknown action: {action}"})
|
|
|
|
except WebSocketDisconnect:
|
|
# Clean up streaming session if active
|
|
if manager._is_streaming():
|
|
await manager.stop_stream()
|
|
except Exception as e:
|
|
await websocket.send_json({"error": str(e)})
|
|
await websocket.close()
|
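# Editor's illustrative sketch (not part of the original file): a client for the WebSocket
# protocol documented above, using the third-party "websockets" package (an assumption
# about the client environment). The URL assumes the router is mounted under /api.
async def _example_ws_stream(chunks, url: str = "ws://localhost:8000/api/sidecar/ws"):
    """Stream PCM chunks over the WebSocket endpoint and collect segments."""
    import json

    import websockets  # assumed available in the client environment

    segments = []
    async with websockets.connect(url) as ws:
        await ws.send(json.dumps({"action": "start_stream"}))
        print(json.loads(await ws.recv()))  # e.g. {"status": "streaming", "session_id": ...}

        for pcm_bytes in chunks:  # raw 16-bit / 16 kHz / mono PCM bytes
            await ws.send(json.dumps({
                "action": "audio_chunk",
                "data": base64.b64encode(pcm_bytes).decode("ascii"),
            }))
            reply = json.loads(await ws.recv())
            if reply.get("segment"):
                segments.append(reply["segment"])

        await ws.send(json.dumps({"action": "stop_stream"}))
        print(json.loads(await ws.recv()))  # e.g. {"status": "stream_stopped", ...}
    return segments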