feat: Meeting Assistant MVP - Complete implementation

Enterprise Meeting Knowledge Management System with:

Backend (FastAPI):
- Authentication proxy with JWT (pj-auth-api integration)
- MySQL database with 4 tables (users, meetings, conclusions, actions)
- Meeting CRUD with system code generation (C-YYYYMMDD-XX for conclusions, A-YYYYMMDD-XX for action items, e.g. C-20251210-01)
- Dify LLM integration for AI summarization
- Excel export with openpyxl
- 20 unit tests (all passing)

Client (Electron):
- Login page with company auth
- Meeting list with create/delete
- Meeting detail with real-time transcription
- Editable transcript textarea (single block, easy editing)
- AI summarization with conclusions/action items
- 5-second segment recording (efficient for long meetings)

Sidecar (Python):
- faster-whisper medium model with int8 quantization
- ONNX Runtime VAD (lightweight, ~20MB vs PyTorch ~2GB)
- Chinese punctuation processing
- OpenCC for Traditional Chinese conversion
- Anti-hallucination parameters
- Auto-cleanup of temp audio files

OpenSpec:
- add-meeting-assistant-mvp (47 tasks, archived)
- add-realtime-transcription (29 tasks, archived)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
commit 8b6184ecc5 · egg · 2025-12-10 20:17:44 +08:00
65 changed files with 10510 additions and 0 deletions

sidecar/transcriber.py (new file, 510 lines)

@@ -0,0 +1,510 @@
#!/usr/bin/env python3
"""
Meeting Assistant Transcription Sidecar
Provides speech-to-text transcription using faster-whisper
with automatic Traditional Chinese conversion via OpenCC.
Modes:
1. File mode: transcriber.py <audio_file>
2. Server mode: transcriber.py (default, listens on stdin for JSON commands)
3. Streaming mode: Continuous audio processing with VAD segmentation
Uses ONNX Runtime for VAD (lightweight, ~20MB vs PyTorch ~2GB)
"""
import sys
import os
import json
import tempfile
import base64
import uuid
import re
import wave
import urllib.request
from pathlib import Path
from typing import Optional, List
# Allow duplicate OpenMP runtimes to coexist; ctranslate2 (faster-whisper) and
# onnxruntime may each load their own copy, which otherwise aborts the process.
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
try:
from faster_whisper import WhisperModel
import opencc
import numpy as np
except ImportError as e:
print(json.dumps({"error": f"Missing dependency: {e}"}), file=sys.stderr)
sys.exit(1)
# Try to import ONNX Runtime for VAD
try:
import onnxruntime as ort
ONNX_AVAILABLE = True
except ImportError:
ONNX_AVAILABLE = False
class ChinesePunctuator:
"""Rule-based Chinese punctuation processor."""
QUESTION_PATTERNS = [
r'嗎$', r'呢$', r'什麼$', r'怎麼$', r'為什麼$', r'哪裡$', r'哪個$',
r'誰$', r'幾$', r'多少$', r'是否$', r'能否$', r'可否$', r'有沒有$',
r'是不是$', r'會不會$', r'能不能$', r'可不可以$', r'好不好$', r'對不對$'
]
def __init__(self):
self.question_regex = re.compile('|'.join(self.QUESTION_PATTERNS))
def add_punctuation(self, text: str, word_timestamps: Optional[List] = None) -> str:
"""Add punctuation to transcribed text."""
if not text:
return text
text = text.strip()
# Already has ending punctuation
        if text and text[-1] in '。？！，；：':
return text
# Check for question patterns
if self.question_regex.search(text):
            return text + '？'
# Default to period for statements
        return text + '。'
def process_segments(self, segments: List[dict]) -> str:
"""Process multiple segments with timestamps to add punctuation."""
result_parts = []
for i, seg in enumerate(segments):
text = seg.get('text', '').strip()
if not text:
continue
# Check for long pause before next segment (comma opportunity)
if i < len(segments) - 1:
next_seg = segments[i + 1]
gap = next_seg.get('start', 0) - seg.get('end', 0)
                if gap > 0.5 and text[-1] not in '。？！，；：':
                    # Long pause: add a comma if this is not a sentence end
                    if not self.question_regex.search(text):
                        text = text + '，'
result_parts.append(text)
# Join and add final punctuation
result = ''.join(result_parts)
return self.add_punctuation(result)
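# Example (hypothetical input): add_punctuation("這樣可以嗎") matches the 嗎$ pattern
# and returns "這樣可以嗎？"; a declarative sentence falls through to the default "。".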
class SileroVAD:
"""Silero VAD using ONNX Runtime (lightweight alternative to PyTorch)."""
MODEL_URL = "https://github.com/snakers4/silero-vad/raw/master/src/silero_vad/data/silero_vad.onnx"
def __init__(self, model_path: Optional[str] = None, threshold: float = 0.5):
self.threshold = threshold
self.session = None
self._h = np.zeros((2, 1, 64), dtype=np.float32)
self._c = np.zeros((2, 1, 64), dtype=np.float32)
self.sample_rate = 16000
if not ONNX_AVAILABLE:
print(json.dumps({"warning": "onnxruntime not available, VAD disabled"}), file=sys.stderr)
return
# Determine model path
if model_path is None:
cache_dir = Path.home() / ".cache" / "silero-vad"
cache_dir.mkdir(parents=True, exist_ok=True)
model_path = cache_dir / "silero_vad.onnx"
# Download if not exists
if not Path(model_path).exists():
print(json.dumps({"status": "downloading_vad_model"}), file=sys.stderr)
try:
urllib.request.urlretrieve(self.MODEL_URL, model_path)
print(json.dumps({"status": "vad_model_downloaded"}), file=sys.stderr)
except Exception as e:
print(json.dumps({"warning": f"VAD model download failed: {e}"}), file=sys.stderr)
return
# Load ONNX model
try:
self.session = ort.InferenceSession(
str(model_path),
providers=['CPUExecutionProvider']
)
print(json.dumps({"status": "vad_loaded"}), file=sys.stderr)
except Exception as e:
print(json.dumps({"warning": f"VAD load failed: {e}"}), file=sys.stderr)
def reset_states(self):
"""Reset hidden states."""
self._h = np.zeros((2, 1, 64), dtype=np.float32)
self._c = np.zeros((2, 1, 64), dtype=np.float32)
def __call__(self, audio: np.ndarray) -> float:
"""Run VAD on audio chunk, return speech probability."""
if self.session is None:
return 0.5 # Neutral if VAD not available
# Ensure correct shape (batch, samples)
if audio.ndim == 1:
audio = audio[np.newaxis, :]
# Run inference
ort_inputs = {
'input': audio.astype(np.float32),
'sr': np.array([self.sample_rate], dtype=np.int64),
'h': self._h,
'c': self._c
}
output, self._h, self._c = self.session.run(None, ort_inputs)
return float(output[0][0])
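# Usage sketch: the model expects short fixed-size windows of 16 kHz audio, e.g.
#   vad = SileroVAD()
#   prob = vad(np.zeros(512, dtype=np.float32))  # speech probability in [0, 1]
# VADProcessor below feeds it 512-sample (32 ms) windows.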
class VADProcessor:
"""Voice Activity Detection processor."""
def __init__(self, sample_rate: int = 16000, threshold: float = 0.5, vad_model: Optional[SileroVAD] = None):
self.sample_rate = sample_rate
self.threshold = threshold
# Reuse pre-loaded VAD model if provided
self.vad = vad_model if vad_model else (SileroVAD(threshold=threshold) if ONNX_AVAILABLE else None)
self.reset()
def reset(self):
"""Reset VAD state."""
self.audio_buffer = np.array([], dtype=np.float32)
self.speech_buffer = np.array([], dtype=np.float32)
self.speech_started = False
self.silence_samples = 0
self.speech_samples = 0
if self.vad:
self.vad.reset_states()
def process_chunk(self, audio_chunk: np.ndarray) -> Optional[np.ndarray]:
"""
Process audio chunk and return speech segment if speech end detected.
Returns:
Speech audio if end detected, None otherwise
"""
self.audio_buffer = np.concatenate([self.audio_buffer, audio_chunk])
# Fallback: time-based segmentation if no VAD
if self.vad is None or self.vad.session is None:
# Every 5 seconds, return the buffer
if len(self.audio_buffer) >= self.sample_rate * 5:
result = self.audio_buffer.copy()
self.audio_buffer = np.array([], dtype=np.float32)
return result
return None
# Process in 512-sample windows (32ms at 16kHz)
window_size = 512
silence_threshold_samples = int(0.5 * self.sample_rate) # 500ms
max_speech_samples = int(15 * self.sample_rate) # 15s max
while len(self.audio_buffer) >= window_size:
window = self.audio_buffer[:window_size]
self.audio_buffer = self.audio_buffer[window_size:]
# Run VAD
speech_prob = self.vad(window)
if speech_prob >= self.threshold:
if not self.speech_started:
self.speech_started = True
self.speech_buffer = np.array([], dtype=np.float32)
self.speech_buffer = np.concatenate([self.speech_buffer, window])
self.silence_samples = 0
self.speech_samples += window_size
else:
if self.speech_started:
self.speech_buffer = np.concatenate([self.speech_buffer, window])
self.silence_samples += window_size
# Force segment if speech too long
if self.speech_samples >= max_speech_samples:
result = self.speech_buffer.copy()
self.speech_started = False
self.speech_buffer = np.array([], dtype=np.float32)
self.silence_samples = 0
self.speech_samples = 0
return result
            # Detect end of speech (500ms of trailing silence)
            if self.speech_started and self.silence_samples >= silence_threshold_samples:
                if self.speech_samples > self.sample_rate * 0.3:  # At least 300ms of speech
                    result = self.speech_buffer.copy()
                    self.speech_started = False
                    self.speech_buffer = np.array([], dtype=np.float32)
                    self.silence_samples = 0
                    self.speech_samples = 0
                    return result
                # Too little actual speech: discard the fragment and reset
                self.speech_started = False
                self.speech_buffer = np.array([], dtype=np.float32)
                self.silence_samples = 0
                self.speech_samples = 0
return None
def flush(self) -> Optional[np.ndarray]:
"""Flush remaining audio."""
# Combine any remaining audio
remaining = np.concatenate([self.speech_buffer, self.audio_buffer])
if len(remaining) > self.sample_rate * 0.5: # At least 500ms
self.reset()
return remaining
self.reset()
return None
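# Usage sketch: stream raw chunks and act on completed utterances; chunk here is
# any float32 mono audio at 16 kHz.
#   vad = VADProcessor()
#   segment = vad.process_chunk(chunk)   # np.ndarray once an utterance ends, else None
#   tail = vad.flush()                   # remaining audio (>= 500 ms) at end of stream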
class StreamingSession:
"""Manages a streaming transcription session."""
def __init__(self, transcriber: 'Transcriber', vad_model: Optional[SileroVAD] = None):
self.session_id = str(uuid.uuid4())
self.transcriber = transcriber
self.vad = VADProcessor(vad_model=vad_model)
self.segment_id = 0
self.active = True
def process_chunk(self, audio_data: str) -> Optional[dict]:
"""Process base64-encoded audio chunk."""
try:
# Decode base64 to raw PCM (16-bit, 16kHz, mono)
pcm_data = base64.b64decode(audio_data)
audio = np.frombuffer(pcm_data, dtype=np.int16).astype(np.float32) / 32768.0
# Run VAD
speech_segment = self.vad.process_chunk(audio)
if speech_segment is not None and len(speech_segment) > 0:
return self._transcribe_segment(speech_segment)
return None
except Exception as e:
return {"error": f"Chunk processing error: {e}"}
def _transcribe_segment(self, audio: np.ndarray) -> dict:
"""Transcribe a speech segment."""
self.segment_id += 1
        # Save to a temp WAV for Whisper; close the handle first so wave.open
        # can reopen the path on Windows without a sharing violation.
        temp_file = tempfile.NamedTemporaryFile(suffix='.wav', delete=False)
        temp_file.close()
        try:
            with wave.open(temp_file.name, 'wb') as wf:
wf.setnchannels(1)
wf.setsampwidth(2)
wf.setframerate(16000)
wf.writeframes((audio * 32768).astype(np.int16).tobytes())
# Transcribe
text = self.transcriber.transcribe_file(temp_file.name, add_punctuation=True)
return {
"segment_id": self.segment_id,
"text": text,
"is_final": True,
"duration": len(audio) / 16000
}
finally:
os.unlink(temp_file.name)
def stop(self) -> dict:
"""Stop the session and flush remaining audio."""
self.active = False
results = []
# Flush VAD buffer
remaining = self.vad.flush()
if remaining is not None and len(remaining) > 0:
result = self._transcribe_segment(remaining)
if result and not result.get('error'):
results.append(result)
return {
"status": "stream_stopped",
"session_id": self.session_id,
"total_segments": self.segment_id,
"final_segments": results
}
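# Lifecycle sketch: one StreamingSession per "start_stream"; each "audio_chunk"
# call returns None until VAD closes an utterance, then a dict like
# {"segment_id": n, "text": "...", "is_final": True, "duration": seconds};
# stop() flushes whatever audio is still buffered.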
class Transcriber:
"""Main transcription engine."""
def __init__(self, model_size: str = "medium", device: str = "cpu", compute_type: str = "int8"):
self.model = None
self.converter = None
self.punctuator = ChinesePunctuator()
self.streaming_session: Optional[StreamingSession] = None
self.vad_model: Optional[SileroVAD] = None
try:
print(json.dumps({"status": "loading_model", "model": model_size}), file=sys.stderr)
self.model = WhisperModel(model_size, device=device, compute_type=compute_type)
self.converter = opencc.OpenCC("s2twp")
print(json.dumps({"status": "model_loaded"}), file=sys.stderr)
# Pre-load VAD model at startup (not when streaming starts)
if ONNX_AVAILABLE:
self.vad_model = SileroVAD()
except Exception as e:
print(json.dumps({"error": f"Failed to load model: {e}"}), file=sys.stderr)
raise
def transcribe_file(self, audio_path: str, add_punctuation: bool = False) -> str:
"""Transcribe an audio file to text."""
if not self.model:
return ""
if not os.path.exists(audio_path):
print(json.dumps({"error": f"File not found: {audio_path}"}), file=sys.stderr)
return ""
try:
segments, info = self.model.transcribe(
audio_path,
language="zh", # Use "nan" for Taiwanese/Hokkien, "zh" for Mandarin
beam_size=5,
vad_filter=True,
word_timestamps=add_punctuation,
# Anti-hallucination settings
condition_on_previous_text=False, # Prevents hallucination propagation
no_speech_threshold=0.6, # Higher = stricter silence detection
compression_ratio_threshold=2.4, # Filter repetitive/hallucinated text
log_prob_threshold=-1.0, # Filter low-confidence output
temperature=0.0, # Deterministic output (no sampling)
)
if add_punctuation:
# Collect segments with timestamps for punctuation
seg_list = []
for segment in segments:
seg_list.append({
'text': segment.text,
'start': segment.start,
'end': segment.end
})
text = self.punctuator.process_segments(seg_list)
else:
text = ""
for segment in segments:
text += segment.text
# Convert to Traditional Chinese
if text and self.converter:
text = self.converter.convert(text)
return text.strip()
except Exception as e:
print(json.dumps({"error": f"Transcription error: {e}"}), file=sys.stderr)
return ""
def handle_command(self, cmd: dict) -> Optional[dict]:
"""Handle a JSON command."""
action = cmd.get("action")
if action == "transcribe":
# File-based transcription (legacy)
audio_path = cmd.get("file")
if audio_path:
text = self.transcribe_file(audio_path, add_punctuation=True)
return {"result": text, "file": audio_path}
return {"error": "No file specified"}
elif action == "start_stream":
# Start streaming session
if self.streaming_session and self.streaming_session.active:
return {"error": "Stream already active"}
# Pass pre-loaded VAD model to avoid download delay
self.streaming_session = StreamingSession(self, vad_model=self.vad_model)
return {
"status": "streaming",
"session_id": self.streaming_session.session_id
}
elif action == "audio_chunk":
# Process audio chunk
if not self.streaming_session or not self.streaming_session.active:
return {"error": "No active stream"}
data = cmd.get("data")
if not data:
return {"error": "No audio data"}
result = self.streaming_session.process_chunk(data)
return result # May be None if no segment ready
elif action == "stop_stream":
# Stop streaming session
if not self.streaming_session:
return {"error": "No active stream"}
result = self.streaming_session.stop()
self.streaming_session = None
return result
elif action == "ping":
return {"status": "pong"}
elif action == "quit":
return {"status": "exiting"}
else:
return {"error": f"Unknown action: {action}"}
def run_server(self):
"""Run in server mode, reading JSON commands from stdin."""
print(json.dumps({"status": "ready"}))
sys.stdout.flush()
for line in sys.stdin:
line = line.strip()
if not line:
continue
try:
cmd = json.loads(line)
result = self.handle_command(cmd)
if result:
print(json.dumps(result))
sys.stdout.flush()
if cmd.get("action") == "quit":
break
except json.JSONDecodeError as e:
print(json.dumps({"error": f"Invalid JSON: {e}"}))
sys.stdout.flush()
def main():
model_size = os.environ.get("WHISPER_MODEL", "small")
device = os.environ.get("WHISPER_DEVICE", "cpu")
compute_type = os.environ.get("WHISPER_COMPUTE", "int8")
try:
transcriber = Transcriber(model_size, device, compute_type)
if len(sys.argv) > 1:
if sys.argv[1] == "--server":
transcriber.run_server()
else:
# File mode
text = transcriber.transcribe_file(sys.argv[1], add_punctuation=True)
print(text)
else:
# Default to server mode
transcriber.run_server()
except Exception as e:
print(json.dumps({"error": str(e)}), file=sys.stderr)
sys.exit(1)
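# Example invocations (env vars read above; defaults: small/cpu/int8):
#   WHISPER_MODEL=medium python transcriber.py meeting.wav   # one-shot file mode
#   python transcriber.py --server                           # JSON server on stdin/stdout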
if __name__ == "__main__":
main()