From 7d3fc72bd2a8acdebfae0bb098506735aee31afd Mon Sep 17 00:00:00 2001 From: egg Date: Mon, 22 Dec 2025 16:41:25 +0800 Subject: [PATCH] feat: Add browser mode fallback for Kaspersky audio blocking MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add sidecar management to backend (sidecar_manager.py) - Add sidecar API router for browser mode (/api/sidecar/*) - Add browser-api.js polyfill for running in Chrome/Edge - Add "Open in Browser" button when audio access fails - Update build scripts with new sidecar modules - Add start-browser.sh for development browser mode Browser mode allows users to open the app in their system browser when Electron's audio access is blocked by security software. The backend manages the sidecar process in browser mode (BROWSER_MODE=true). 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- backend/app/main.py | 64 ++++- backend/app/routers/sidecar.py | 346 +++++++++++++++++++++++++++ backend/app/sidecar_manager.py | 307 ++++++++++++++++++++++++ backend/build.py | 2 + client/src/main.js | 32 ++- client/src/pages/login.html | 2 + client/src/pages/meeting-detail.html | 68 ++++++ client/src/pages/meetings.html | 2 + client/src/preload.js | 4 + client/src/services/browser-api.js | 288 ++++++++++++++++++++++ scripts/build-client.bat | 2 + start-browser.sh | 260 ++++++++++++++++++++ 12 files changed, 1374 insertions(+), 3 deletions(-) create mode 100644 backend/app/routers/sidecar.py create mode 100644 backend/app/sidecar_manager.py create mode 100644 client/src/services/browser-api.js create mode 100755 start-browser.sh diff --git a/backend/app/main.py b/backend/app/main.py index 75293b7..0c6c781 100644 --- a/backend/app/main.py +++ b/backend/app/main.py @@ -1,9 +1,19 @@ +import os +from pathlib import Path from fastapi import FastAPI from fastapi.middleware.cors import CORSMiddleware +from fastapi.staticfiles import StaticFiles +from fastapi.responses 
import FileResponse from contextlib import asynccontextmanager from .database import init_db_pool, init_tables -from .routers import auth, meetings, ai, export +from .routers import auth, meetings, ai, export, sidecar +from .sidecar_manager import get_sidecar_manager + +# Determine client directory path +BACKEND_DIR = Path(__file__).parent.parent +PROJECT_DIR = BACKEND_DIR.parent +CLIENT_DIR = PROJECT_DIR / "client" / "src" @asynccontextmanager @@ -11,8 +21,25 @@ async def lifespan(app: FastAPI): # Startup init_db_pool() init_tables() + + # Only start sidecar in browser mode (not when Electron manages it) + # Set BROWSER_MODE=true in start-browser.sh to enable + browser_mode = os.environ.get("BROWSER_MODE", "").lower() == "true" + sidecar_mgr = get_sidecar_manager() + + if browser_mode and sidecar_mgr.is_available(): + print("[Backend] Browser mode: Starting sidecar...") + await sidecar_mgr.start() + elif browser_mode: + print("[Backend] Browser mode: Sidecar not available (transcription disabled)") + else: + print("[Backend] Electron mode: Sidecar managed by Electron") + yield - # Shutdown (cleanup if needed) + + # Shutdown - only stop if we started it + if browser_mode: + sidecar_mgr.stop() app = FastAPI( @@ -36,9 +63,42 @@ app.include_router(auth.router, prefix="/api", tags=["Authentication"]) app.include_router(meetings.router, prefix="/api", tags=["Meetings"]) app.include_router(ai.router, prefix="/api", tags=["AI"]) app.include_router(export.router, prefix="/api", tags=["Export"]) +app.include_router(sidecar.router, prefix="/api", tags=["Sidecar"]) @app.get("/api/health") async def health_check(): """Health check endpoint.""" return {"status": "healthy", "service": "meeting-assistant"} + + +# ======================================== +# Browser Mode: Serve static files +# ======================================== + +# Check if client directory exists for browser mode +if CLIENT_DIR.exists(): + # Serve static assets (CSS, JS, etc.) 
"""
Sidecar API Router

Provides HTTP endpoints for browser-based clients to access
the Whisper transcription sidecar functionality.

NOTE(review): none of these endpoints perform authentication in this module;
confirm that access control is applied elsewhere before exposing them beyond
localhost.
"""

import binascii
import os
import tempfile
import base64
from typing import Optional
from fastapi import APIRouter, HTTPException, UploadFile, File, WebSocket, WebSocketDisconnect
from fastapi.responses import JSONResponse
from pydantic import BaseModel

from ..sidecar_manager import get_sidecar_manager

router = APIRouter(prefix="/sidecar", tags=["Sidecar"])

# Extensions accepted by /transcribe-file. Kept sorted so the error message
# listing them is deterministic.
_ALLOWED_AUDIO_EXTENSIONS = (".aac", ".flac", ".m4a", ".mp3", ".ogg", ".wav", ".webm")

_NOT_READY_DETAIL = "Sidecar not ready. Please wait for model to load."


class TranscribeRequest(BaseModel):
    """Request for transcribing audio from base64 data."""
    audio_data: str  # Base64 encoded audio (webm/opus)


class AudioChunkRequest(BaseModel):
    """Request for sending an audio chunk in streaming mode."""
    data: str  # Base64 encoded PCM audio


def _require_ready(detail: str = _NOT_READY_DETAIL):
    """Return the sidecar manager, or raise HTTP 503 if it is not ready."""
    manager = get_sidecar_manager()
    if not manager.ready:
        raise HTTPException(status_code=503, detail=detail)
    return manager


def _write_temp_file(data: bytes, suffix: str) -> str:
    """Persist *data* to a new temp file with *suffix* and return its path."""
    with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as f:
        f.write(data)
        return f.name


def _remove_quietly(path: str) -> None:
    """Delete *path*, ignoring the case where it is already gone."""
    try:
        os.unlink(path)
    except OSError:
        # Cleanup is best effort; a failure here must not mask the
        # transcription result or the original error.
        pass


def _raise_on_error(result: dict) -> dict:
    """Raise HTTP 500 with the sidecar's message if *result* reports an error."""
    if result.get("error"):
        raise HTTPException(status_code=500, detail=result["error"])
    return result


@router.get("/status")
async def get_sidecar_status():
    """
    Get the current status of the sidecar transcription engine.

    Returns:
        Status object with ready state, whisper model info, etc.
    """
    return get_sidecar_manager().get_status()


@router.post("/start")
async def start_sidecar():
    """
    Start the sidecar transcription engine.

    This is typically called automatically on backend startup,
    but can be used to restart the sidecar if needed.

    Raises:
        HTTPException: 503 if the sidecar files are missing or it fails to start.
    """
    manager = get_sidecar_manager()

    if not manager.is_available():
        raise HTTPException(
            status_code=503,
            detail="Sidecar not available. Check if sidecar/transcriber.py and sidecar/venv exist."
        )

    if not await manager.start():
        raise HTTPException(
            status_code=503,
            detail="Failed to start sidecar. Check backend logs for details."
        )

    return {"status": "started", "ready": manager.ready}


@router.post("/stop")
async def stop_sidecar():
    """Stop the sidecar transcription engine."""
    get_sidecar_manager().stop()
    return {"status": "stopped"}


@router.post("/transcribe")
async def transcribe_audio(request: TranscribeRequest):
    """
    Transcribe base64-encoded audio data.

    The audio should be in webm/opus format (as recorded by MediaRecorder).

    Raises:
        HTTPException: 503 if the sidecar is not ready, 400 for invalid
            base64, 500 if saving or transcribing the audio fails.
    """
    manager = _require_ready()

    try:
        audio_bytes = base64.b64decode(request.audio_data)
    except (binascii.Error, ValueError):
        raise HTTPException(status_code=400, detail="Invalid base64 audio data") from None

    try:
        temp_path = _write_temp_file(audio_bytes, ".webm")
        try:
            result = await manager.transcribe_file(temp_path)
        finally:
            _remove_quietly(temp_path)
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e

    # Checked outside the broad handler above so the sidecar's own error
    # message is reported verbatim instead of being re-wrapped.
    _raise_on_error(result)

    return {
        "result": result.get("result", ""),
        "file": result.get("file", "")
    }


@router.post("/transcribe-file")
async def transcribe_audio_file(file: UploadFile = File(...)):
    """
    Transcribe an uploaded audio file.

    Accepts common audio formats: mp3, wav, m4a, webm, ogg, flac, aac

    Raises:
        HTTPException: 503 if the sidecar is not ready, 400 for an
            unsupported extension, 500 if saving or transcribing fails.
    """
    manager = _require_ready()

    ext = os.path.splitext(file.filename or "")[1].lower()
    if ext not in _ALLOWED_AUDIO_EXTENSIONS:
        raise HTTPException(
            status_code=400,
            detail=f"Unsupported audio format. Allowed: {', '.join(_ALLOWED_AUDIO_EXTENSIONS)}"
        )

    try:
        temp_path = _write_temp_file(await file.read(), ext)
        try:
            result = await manager.transcribe_file(temp_path)
        finally:
            _remove_quietly(temp_path)
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e

    _raise_on_error(result)

    return {
        "result": result.get("result", ""),
        "filename": file.filename
    }


@router.post("/stream/start")
async def start_streaming():
    """
    Start a streaming transcription session.

    Returns the sidecar's response to the ``start_stream`` command.
    """
    manager = _require_ready()
    return _raise_on_error(await manager.start_stream())


@router.post("/stream/chunk")
async def send_audio_chunk(request: AudioChunkRequest):
    """
    Send an audio chunk for streaming transcription.

    The audio should be base64-encoded PCM data (16-bit, 16kHz, mono).

    Returns ``{"segment": <sidecar response>}``, or ``{"segment": None}``
    when the manager returns no response for this chunk.
    """
    manager = _require_ready("Sidecar not ready")

    result = await manager.send_audio_chunk(request.data)

    # Result may be None if no segment is ready yet
    if result is None:
        return {"segment": None}

    return {"segment": _raise_on_error(result)}


@router.post("/stream/stop")
async def stop_streaming():
    """
    Stop the streaming transcription session.

    Returns the sidecar's response to the ``stop_stream`` command.
    """
    manager = get_sidecar_manager()
    return _raise_on_error(await manager.stop_stream())


@router.post("/segment-audio")
async def segment_audio_file(file: UploadFile = File(...), max_chunk_seconds: int = 300):
    """
    Segment an audio file using VAD for natural speech boundaries.

    This is used for processing large audio files before cloud transcription.

    Args:
        file: The audio file to segment
        max_chunk_seconds: Maximum duration per chunk (default 300s / 5 minutes)

    Returns:
        The sidecar's response to the ``segment_audio`` command.

    Raises:
        HTTPException: 503 if the sidecar is not ready, 500 if saving or
            segmenting the file fails.
    """
    manager = _require_ready()

    ext = os.path.splitext(file.filename or "")[1].lower() or ".wav"
    try:
        temp_path = _write_temp_file(await file.read(), ext)
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e

    try:
        result = await manager.segment_audio(temp_path, max_chunk_seconds)
    except Exception as e:
        _remove_quietly(temp_path)
        raise HTTPException(status_code=500, detail=str(e)) from e

    if result.get("error"):
        # Nothing can reference the upload after a failed segmentation,
        # so do not leave it behind.
        _remove_quietly(temp_path)
        raise HTTPException(status_code=500, detail=result["error"])

    # On success the temp file is intentionally kept: the returned segments
    # may reference it.
    # NOTE(review): nothing in this module deletes it later - confirm the
    # downstream transcription step cleans it up.
    return result


async def _send_json_quietly(websocket: WebSocket, payload: dict) -> None:
    """Send *payload* on *websocket*, ignoring failures on a dead connection."""
    try:
        await websocket.send_json(payload)
    except Exception:
        pass


async def _close_quietly(websocket: WebSocket) -> None:
    """Close *websocket*, ignoring the case where it is already closed."""
    try:
        await websocket.close()
    except Exception:
        pass


@router.websocket("/ws")
async def websocket_endpoint(websocket: WebSocket):
    """
    WebSocket endpoint for real-time streaming transcription.

    Protocol:
    1. Client connects
    2. Client sends: {"action": "start_stream"}
    3. Server responds with the sidecar's start_stream response
    4. Client sends: {"action": "audio_chunk", "data": "<base64 audio>"}
    5. Server responds: {"segment": <sidecar response>}
    6. Client sends: {"action": "stop_stream"}
    7. Server responds with the sidecar's stop_stream response and the
       handler returns

    A {"action": "ping"} message is answered with {"status": "pong"}.
    """
    await websocket.accept()

    manager = get_sidecar_manager()

    if not manager.ready:
        await websocket.send_json({"error": "Sidecar not ready"})
        await websocket.close()
        return

    # The manager is a process-wide singleton, so only tear down a stream
    # that this connection itself started.
    owns_stream = False

    try:
        while True:
            data = await websocket.receive_json()
            action = data.get("action") if isinstance(data, dict) else None

            if action == "start_stream":
                result = await manager.start_stream()
                owns_stream = result.get("status") == "streaming"
                await websocket.send_json(result)

            elif action == "audio_chunk":
                audio_data = data.get("data")
                if audio_data:
                    result = await manager.send_audio_chunk(audio_data)
                    await websocket.send_json({"segment": result})
                else:
                    await websocket.send_json({"error": "No audio data"})

            elif action == "stop_stream":
                result = await manager.stop_stream()
                owns_stream = False
                await websocket.send_json(result)
                break

            elif action == "ping":
                await websocket.send_json({"status": "pong"})

            else:
                await websocket.send_json({"error": f"Unknown action: {action}"})

    except WebSocketDisconnect:
        # Client went away; stream cleanup happens in ``finally``.
        pass
    except Exception as e:
        # The socket may already be unusable, so reporting is best effort.
        await _send_json_quietly(websocket, {"error": str(e)})
    finally:
        if owns_stream:
            await manager.stop_stream()
        await _close_quietly(websocket)
"""
Sidecar Process Manager

Manages the Python sidecar process for speech-to-text transcription.
Provides an interface for the backend to communicate with the sidecar
via subprocess stdin/stdout.
"""

import asyncio
import json
import os
import subprocess
import sys
import tempfile
import base64
from pathlib import Path
from typing import Optional, Dict, Any, Callable
from threading import Thread, Lock
import queue


class SidecarManager:
    """
    Manages the Whisper transcription sidecar process.

    The sidecar is a Python process running transcriber.py that handles
    speech-to-text conversion using faster-whisper. Commands are written to
    its stdin as one JSON object per line; JSON lines read from its stdout
    are queued as responses.
    """

    # Status values reported on stdout while the model is being prepared.
    _PROGRESS_STATUSES = frozenset({
        "downloading_model", "model_downloaded", "model_cached",
        "loading_model", "model_loaded", "model_error",
    })

    def __init__(self):
        self.process: Optional[subprocess.Popen] = None
        self.ready = False
        self.whisper_info: Optional[Dict] = None
        # Guards short, synchronous writes to the child's stdin. It must
        # never be held across an ``await``.
        self._lock = Lock()
        # Serialises whole command/response round trips between coroutines.
        # Created lazily so it is constructed while an event loop is running.
        self._command_lock: Optional[asyncio.Lock] = None
        self._response_queue = queue.Queue()
        self._reader_thread: Optional[Thread] = None
        self._progress_callbacks: list[Callable] = []
        self._last_status: Dict[str, Any] = {}

        # Paths
        self.project_dir = Path(__file__).parent.parent.parent
        self.sidecar_dir = self.project_dir / "sidecar"
        self.transcriber_path = self.sidecar_dir / "transcriber.py"
        venv_dir = self.sidecar_dir / "venv"
        if os.name == "nt":
            # Windows virtual environments keep the interpreter in Scripts\.
            self.venv_python = venv_dir / "Scripts" / "python.exe"
        else:
            self.venv_python = venv_dir / "bin" / "python"

    def is_available(self) -> bool:
        """Check if sidecar is available (files exist)."""
        return self.transcriber_path.exists() and self.venv_python.exists()

    def get_status(self) -> Dict[str, Any]:
        """Get current sidecar status.

        Keys of the last recorded progress message override the defaults.
        """
        return {
            "ready": self.ready,
            "streaming": self._is_streaming(),
            "whisper": self.whisper_info,
            "available": self.is_available(),
            "browserMode": False,
            **self._last_status
        }

    def _is_streaming(self) -> bool:
        """Check if currently in streaming mode."""
        return self._last_status.get("streaming", False)

    def _drain_responses(self) -> None:
        """Discard every response currently waiting in the queue."""
        while True:
            try:
                self._response_queue.get_nowait()
            except queue.Empty:
                return

    async def start(self) -> bool:
        """Start the sidecar process.

        Returns:
            True if the process is already running or reported ``ready``;
            False if it is unavailable, failed to spawn, exited, or did not
            become ready within two minutes.
        """
        if self.process and self.process.poll() is None:
            return True  # Already running

        if not self.is_available():
            print(f"[Sidecar] Not available: transcriber={self.transcriber_path.exists()}, venv={self.venv_python.exists()}")
            return False

        # Any previous process is gone: its readiness and queued output
        # must not leak into the new one.
        self.ready = False
        self._drain_responses()

        try:
            # Get Whisper configuration from environment
            env = os.environ.copy()
            env["WHISPER_MODEL"] = os.getenv("WHISPER_MODEL", "medium")
            env["WHISPER_DEVICE"] = os.getenv("WHISPER_DEVICE", "cpu")
            env["WHISPER_COMPUTE"] = os.getenv("WHISPER_COMPUTE", "int8")

            print(f"[Sidecar] Starting with model={env['WHISPER_MODEL']}, device={env['WHISPER_DEVICE']}, compute={env['WHISPER_COMPUTE']}")

            self.process = subprocess.Popen(
                [str(self.venv_python), str(self.transcriber_path), "--server"],
                stdin=subprocess.PIPE,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                env=env,
                cwd=str(self.sidecar_dir),
                bufsize=1,  # Line buffered
                text=True
            )

            # Start reader threads
            self._reader_thread = Thread(target=self._read_stdout, daemon=True)
            self._reader_thread.start()

            stderr_thread = Thread(target=self._read_stderr, daemon=True)
            stderr_thread.start()

            # Wait for ready signal without blocking the event loop
            loop = asyncio.get_running_loop()
            try:
                response = await asyncio.wait_for(
                    loop.run_in_executor(None, self._wait_for_ready),
                    timeout=120.0  # 2 minutes for model download
                )
            except asyncio.TimeoutError:
                print("[Sidecar] Timeout waiting for ready")
                self.stop()
                return False

            if response and response.get("status") == "ready":
                self.ready = True
                print("[Sidecar] Ready")
                return True

        except Exception as e:
            print(f"[Sidecar] Start error: {e}")
            return False

        return False

    def _wait_for_ready(self, poll_interval: float = 1.0) -> Optional[Dict]:
        """Wait for the ready signal from sidecar.

        Args:
            poll_interval: Seconds to wait for a message before re-checking
                that the process is still alive.

        Returns:
            The ``ready`` message, or None if the process exited or was
            stopped before becoming ready.
        """
        while True:
            try:
                response = self._response_queue.get(timeout=poll_interval)
            except queue.Empty:
                proc = self.process
                # ``stop()`` clears ``self.process``; treat that like a dead
                # process so this (executor) thread does not spin forever.
                if proc is None or proc.poll() is not None:
                    return None
                continue

            status = response.get("status", "")

            # Track progress events
            if status in self._PROGRESS_STATUSES:
                self._last_status = response
                self._notify_progress(response)

                if status == "model_loaded":
                    # Extract whisper info
                    self.whisper_info = {
                        "model": os.getenv("WHISPER_MODEL", "medium"),
                        "device": os.getenv("WHISPER_DEVICE", "cpu"),
                        "compute": os.getenv("WHISPER_COMPUTE", "int8"),
                        "configSource": "environment"
                    }
                elif status == "model_error":
                    self.whisper_info = {"error": response.get("error", "Unknown error")}

            if status == "ready":
                return response

    def _read_stdout(self):
        """Read stdout from sidecar process and queue each JSON object line."""
        proc = self.process
        if not proc or not proc.stdout:
            return

        for line in proc.stdout:
            line = line.strip()
            if not line:
                continue
            try:
                data = json.loads(line)
            except json.JSONDecodeError:
                data = None
            # Consumers call ``.get`` on responses, so only objects qualify.
            if isinstance(data, dict):
                self._response_queue.put(data)
            else:
                print(f"[Sidecar] Invalid JSON: {line[:100]}")

    def _read_stderr(self):
        """Read stderr from sidecar process.

        JSON object lines carrying ``status`` or ``warning`` are forwarded
        to the progress callbacks; non-JSON lines are printed.
        """
        proc = self.process
        if not proc or not proc.stderr:
            return

        for line in proc.stderr:
            line = line.strip()
            if not line:
                continue
            # Try to parse as JSON (some status messages go to stderr)
            try:
                data = json.loads(line)
            except json.JSONDecodeError:
                print(f"[Sidecar stderr] {line}")
                continue
            if not isinstance(data, dict):
                # A bare number/string is valid JSON but not a status
                # message; log it instead of crashing the reader thread.
                print(f"[Sidecar stderr] {line}")
            elif "status" in data or "warning" in data:
                self._notify_progress(data)

    def _notify_progress(self, data: Dict):
        """Notify all progress callbacks."""
        for callback in self._progress_callbacks:
            try:
                callback(data)
            except Exception as e:
                print(f"[Sidecar] Progress callback error: {e}")

    def add_progress_callback(self, callback: Callable):
        """Add a callback for progress updates."""
        self._progress_callbacks.append(callback)

    def remove_progress_callback(self, callback: Callable):
        """Remove a progress callback."""
        if callback in self._progress_callbacks:
            self._progress_callbacks.remove(callback)

    async def send_command(self, command: Dict) -> Optional[Dict]:
        """Send a command to the sidecar and wait for response.

        Round trips are serialised with an ``asyncio.Lock`` so concurrent
        callers queue up cooperatively instead of blocking the event loop.

        Returns:
            The next response read from the sidecar, or an ``{"error": ...}``
            dict if it is not running, times out, or the write fails.
        """
        if not self.process or self.process.poll() is not None:
            return {"error": "Sidecar not running"}

        if self._command_lock is None:
            self._command_lock = asyncio.Lock()

        async with self._command_lock:
            # Re-check: the process may have been stopped while we waited.
            proc = self.process
            if proc is None or proc.poll() is not None:
                return {"error": "Sidecar not running"}

            try:
                # Clear queue before sending
                self._drain_responses()

                # Send command
                cmd_json = json.dumps(command) + "\n"
                with self._lock:
                    proc.stdin.write(cmd_json)
                    proc.stdin.flush()

                # Wait for response
                loop = asyncio.get_running_loop()
                try:
                    return await asyncio.wait_for(
                        loop.run_in_executor(
                            None, lambda: self._response_queue.get(timeout=60.0)
                        ),
                        timeout=65.0
                    )
                except (asyncio.TimeoutError, queue.Empty):
                    return {"error": "Command timeout"}

            except Exception as e:
                return {"error": f"Command error: {e}"}

    async def transcribe_file(self, audio_path: str) -> Dict:
        """Transcribe an audio file."""
        return await self.send_command({
            "action": "transcribe",
            "file": audio_path
        }) or {"error": "No response"}

    async def start_stream(self) -> Dict:
        """Start a streaming transcription session."""
        result = await self.send_command({"action": "start_stream"})
        if result and result.get("status") == "streaming":
            self._last_status["streaming"] = True
        return result or {"error": "No response"}

    async def send_audio_chunk(self, base64_audio: str) -> Optional[Dict]:
        """Send an audio chunk for streaming transcription."""
        return await self.send_command({
            "action": "audio_chunk",
            "data": base64_audio
        })

    async def stop_stream(self) -> Dict:
        """Stop the streaming session."""
        result = await self.send_command({"action": "stop_stream"})
        self._last_status["streaming"] = False
        return result or {"error": "No response"}

    async def segment_audio(self, file_path: str, max_chunk_seconds: int = 300) -> Dict:
        """Segment an audio file using VAD."""
        return await self.send_command({
            "action": "segment_audio",
            "file_path": file_path,
            "max_chunk_seconds": max_chunk_seconds
        }) or {"error": "No response"}

    def stop(self):
        """Stop the sidecar process.

        Asks the sidecar to quit, then terminates (and finally kills) it if
        it is still running. Safe to call when no process was started.
        """
        self.ready = False
        self._last_status = {}

        proc = self.process
        if proc is not None:
            try:
                # Try graceful shutdown
                if proc.poll() is None and proc.stdin:
                    with self._lock:
                        proc.stdin.write('{"action": "quit"}\n')
                        proc.stdin.flush()
                    proc.wait(timeout=5.0)
            except (OSError, ValueError, subprocess.TimeoutExpired):
                # Broken/closed pipe or no timely exit: fall through to the
                # forced shutdown below.
                pass
            finally:
                if proc.poll() is None:
                    proc.terminate()
                    try:
                        proc.wait(timeout=2.0)
                    except subprocess.TimeoutExpired:
                        proc.kill()
                self.process = None

        print("[Sidecar] Stopped")


# Global instance
_sidecar_manager: Optional[SidecarManager] = None


def get_sidecar_manager() -> SidecarManager:
    """Get or create the global sidecar manager instance."""
    global _sidecar_manager
    if _sidecar_manager is None:
        _sidecar_manager = SidecarManager()
    return _sidecar_manager
shell.openExternal(browserUrl); + return { success: true, url: browserUrl }; + } catch (error) { + return { error: error.message }; + } +}); diff --git a/client/src/pages/login.html b/client/src/pages/login.html index fbafc0b..64c00d6 100644 --- a/client/src/pages/login.html +++ b/client/src/pages/login.html @@ -26,6 +26,8 @@