feat: Meeting Assistant MVP - Complete implementation

Enterprise Meeting Knowledge Management System with:

Backend (FastAPI):
- Authentication proxy with JWT (pj-auth-api integration)
- MySQL database with 4 tables (users, meetings, conclusions, actions)
- Meeting CRUD with system code generation (C-YYYYMMDD-XX for conclusions, A-YYYYMMDD-XX for action items, e.g. C-20251210-01)
- Dify LLM integration for AI summarization
- Excel export with openpyxl
- 20 unit tests (all passing)

Client (Electron):
- Login page with company auth
- Meeting list with create/delete
- Meeting detail with real-time transcription
- Editable transcript textarea (single block, easy editing)
- AI summarization with conclusions/action items
- 5-second segment recording (efficient for long meetings)

Sidecar (Python):
- faster-whisper medium model with int8 quantization
- ONNX Runtime VAD (lightweight, ~20MB vs PyTorch ~2GB)
- Chinese punctuation processing
- OpenCC for Traditional Chinese conversion
- Anti-hallucination parameters
- Auto-cleanup of temp audio files

OpenSpec:
- add-meeting-assistant-mvp (47 tasks, archived)
- add-realtime-transcription (29 tasks, archived)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
commit 8b6184ecc5 · egg · 2025-12-10 20:17:44 +08:00
65 changed files with 10510 additions and 0 deletions

sidecar/transcriber.py (new file, 510 lines)

@@ -0,0 +1,510 @@
#!/usr/bin/env python3
"""
Meeting Assistant Transcription Sidecar
Provides speech-to-text transcription using faster-whisper
with automatic Traditional Chinese conversion via OpenCC.
Modes:
1. File mode: transcriber.py <audio_file>
2. Server mode: transcriber.py (default, listens on stdin for JSON commands)
3. Streaming mode: Continuous audio processing with VAD segmentation
Uses ONNX Runtime for VAD (lightweight, ~20MB vs PyTorch ~2GB)
"""
import sys
import os
import json
import tempfile
import base64
import uuid
import re
import wave
import urllib.request
from pathlib import Path
from typing import Optional, List
# Allow duplicate OpenMP runtimes to coexist; ctranslate2 (faster-whisper) and
# onnxruntime may each load their own copy, which otherwise aborts the process.
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
try:
from faster_whisper import WhisperModel
import opencc
import numpy as np
except ImportError as e:
print(json.dumps({"error": f"Missing dependency: {e}"}), file=sys.stderr)
sys.exit(1)
# Try to import ONNX Runtime for VAD
try:
import onnxruntime as ort
ONNX_AVAILABLE = True
except ImportError:
ONNX_AVAILABLE = False
class ChinesePunctuator:
"""Rule-based Chinese punctuation processor."""
QUESTION_PATTERNS = [
r'嗎$', r'呢$', r'什麼$', r'怎麼$', r'為什麼$', r'哪裡$', r'哪個$',
r'誰$', r'幾$', r'多少$', r'是否$', r'能否$', r'可否$', r'有沒有$',
r'是不是$', r'會不會$', r'能不能$', r'可不可以$', r'好不好$', r'對不對$'
]
def __init__(self):
self.question_regex = re.compile('|'.join(self.QUESTION_PATTERNS))
def add_punctuation(self, text: str, word_timestamps: Optional[List] = None) -> str:
"""Add punctuation to transcribed text."""
if not text:
return text
text = text.strip()
# Already has ending punctuation
        if text and text[-1] in '。？！，；：':
return text
# Check for question patterns
if self.question_regex.search(text):
            return text + '？'
# Default to period for statements
        return text + '。'
def process_segments(self, segments: List[dict]) -> str:
"""Process multiple segments with timestamps to add punctuation."""
result_parts = []
for i, seg in enumerate(segments):
text = seg.get('text', '').strip()
if not text:
continue
# Check for long pause before next segment (comma opportunity)
if i < len(segments) - 1:
next_seg = segments[i + 1]
gap = next_seg.get('start', 0) - seg.get('end', 0)
                if gap > 0.5 and text[-1] not in '。？！，；：':
                    # Long pause: add a comma if this is not a sentence end
                    if not self.question_regex.search(text):
                        text = text + '，'
result_parts.append(text)
# Join and add final punctuation
result = ''.join(result_parts)
return self.add_punctuation(result)
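# Example (hypothetical input): add_punctuation("這樣可以嗎") matches the 嗎$ pattern
# and returns "這樣可以嗎？"; a declarative sentence falls through to the default "。".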
class SileroVAD:
"""Silero VAD using ONNX Runtime (lightweight alternative to PyTorch)."""
MODEL_URL = "https://github.com/snakers4/silero-vad/raw/master/src/silero_vad/data/silero_vad.onnx"
def __init__(self, model_path: Optional[str] = None, threshold: float = 0.5):
self.threshold = threshold
self.session = None
self._h = np.zeros((2, 1, 64), dtype=np.float32)
self._c = np.zeros((2, 1, 64), dtype=np.float32)
self.sample_rate = 16000
if not ONNX_AVAILABLE:
print(json.dumps({"warning": "onnxruntime not available, VAD disabled"}), file=sys.stderr)
return
# Determine model path
if model_path is None:
cache_dir = Path.home() / ".cache" / "silero-vad"
cache_dir.mkdir(parents=True, exist_ok=True)
model_path = cache_dir / "silero_vad.onnx"
# Download if not exists
if not Path(model_path).exists():
print(json.dumps({"status": "downloading_vad_model"}), file=sys.stderr)
try:
urllib.request.urlretrieve(self.MODEL_URL, model_path)
print(json.dumps({"status": "vad_model_downloaded"}), file=sys.stderr)
except Exception as e:
print(json.dumps({"warning": f"VAD model download failed: {e}"}), file=sys.stderr)
return
# Load ONNX model
try:
self.session = ort.InferenceSession(
str(model_path),
providers=['CPUExecutionProvider']
)
print(json.dumps({"status": "vad_loaded"}), file=sys.stderr)
except Exception as e:
print(json.dumps({"warning": f"VAD load failed: {e}"}), file=sys.stderr)
def reset_states(self):
"""Reset hidden states."""
self._h = np.zeros((2, 1, 64), dtype=np.float32)
self._c = np.zeros((2, 1, 64), dtype=np.float32)
def __call__(self, audio: np.ndarray) -> float:
"""Run VAD on audio chunk, return speech probability."""
if self.session is None:
return 0.5 # Neutral if VAD not available
# Ensure correct shape (batch, samples)
if audio.ndim == 1:
audio = audio[np.newaxis, :]
# Run inference
ort_inputs = {
'input': audio.astype(np.float32),
'sr': np.array([self.sample_rate], dtype=np.int64),
'h': self._h,
'c': self._c
}
output, self._h, self._c = self.session.run(None, ort_inputs)
return float(output[0][0])
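# Usage sketch: the model expects short fixed-size windows of 16 kHz audio, e.g.
#   vad = SileroVAD()
#   prob = vad(np.zeros(512, dtype=np.float32))  # speech probability in [0, 1]
# VADProcessor below feeds it 512-sample (32 ms) windows.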
class VADProcessor:
"""Voice Activity Detection processor."""
def __init__(self, sample_rate: int = 16000, threshold: float = 0.5, vad_model: Optional[SileroVAD] = None):
self.sample_rate = sample_rate
self.threshold = threshold
# Reuse pre-loaded VAD model if provided
self.vad = vad_model if vad_model else (SileroVAD(threshold=threshold) if ONNX_AVAILABLE else None)
self.reset()
def reset(self):
"""Reset VAD state."""
self.audio_buffer = np.array([], dtype=np.float32)
self.speech_buffer = np.array([], dtype=np.float32)
self.speech_started = False
self.silence_samples = 0
self.speech_samples = 0
if self.vad:
self.vad.reset_states()
def process_chunk(self, audio_chunk: np.ndarray) -> Optional[np.ndarray]:
"""
Process audio chunk and return speech segment if speech end detected.
Returns:
Speech audio if end detected, None otherwise
"""
self.audio_buffer = np.concatenate([self.audio_buffer, audio_chunk])
# Fallback: time-based segmentation if no VAD
if self.vad is None or self.vad.session is None:
# Every 5 seconds, return the buffer
if len(self.audio_buffer) >= self.sample_rate * 5:
result = self.audio_buffer.copy()
self.audio_buffer = np.array([], dtype=np.float32)
return result
return None
# Process in 512-sample windows (32ms at 16kHz)
window_size = 512
silence_threshold_samples = int(0.5 * self.sample_rate) # 500ms
max_speech_samples = int(15 * self.sample_rate) # 15s max
while len(self.audio_buffer) >= window_size:
window = self.audio_buffer[:window_size]
self.audio_buffer = self.audio_buffer[window_size:]
# Run VAD
speech_prob = self.vad(window)
if speech_prob >= self.threshold:
if not self.speech_started:
self.speech_started = True
self.speech_buffer = np.array([], dtype=np.float32)
self.speech_buffer = np.concatenate([self.speech_buffer, window])
self.silence_samples = 0
self.speech_samples += window_size
else:
if self.speech_started:
self.speech_buffer = np.concatenate([self.speech_buffer, window])
self.silence_samples += window_size
# Force segment if speech too long
if self.speech_samples >= max_speech_samples:
result = self.speech_buffer.copy()
self.speech_started = False
self.speech_buffer = np.array([], dtype=np.float32)
self.silence_samples = 0
self.speech_samples = 0
return result
            # Detect end of speech (500ms of trailing silence)
            if self.speech_started and self.silence_samples >= silence_threshold_samples:
                if self.speech_samples > self.sample_rate * 0.3:  # At least 300ms of speech
                    result = self.speech_buffer.copy()
                    self.speech_started = False
                    self.speech_buffer = np.array([], dtype=np.float32)
                    self.silence_samples = 0
                    self.speech_samples = 0
                    return result
                # Too little actual speech: discard the fragment and reset
                self.speech_started = False
                self.speech_buffer = np.array([], dtype=np.float32)
                self.silence_samples = 0
                self.speech_samples = 0
return None
def flush(self) -> Optional[np.ndarray]:
"""Flush remaining audio."""
# Combine any remaining audio
remaining = np.concatenate([self.speech_buffer, self.audio_buffer])
if len(remaining) > self.sample_rate * 0.5: # At least 500ms
self.reset()
return remaining
self.reset()
return None
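# Usage sketch: stream raw chunks and act on completed utterances; chunk here is
# any float32 mono audio at 16 kHz.
#   vad = VADProcessor()
#   segment = vad.process_chunk(chunk)   # np.ndarray once an utterance ends, else None
#   tail = vad.flush()                   # remaining audio (>= 500 ms) at end of stream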
class StreamingSession:
"""Manages a streaming transcription session."""
def __init__(self, transcriber: 'Transcriber', vad_model: Optional[SileroVAD] = None):
self.session_id = str(uuid.uuid4())
self.transcriber = transcriber
self.vad = VADProcessor(vad_model=vad_model)
self.segment_id = 0
self.active = True
def process_chunk(self, audio_data: str) -> Optional[dict]:
"""Process base64-encoded audio chunk."""
try:
# Decode base64 to raw PCM (16-bit, 16kHz, mono)
pcm_data = base64.b64decode(audio_data)
audio = np.frombuffer(pcm_data, dtype=np.int16).astype(np.float32) / 32768.0
# Run VAD
speech_segment = self.vad.process_chunk(audio)
if speech_segment is not None and len(speech_segment) > 0:
return self._transcribe_segment(speech_segment)
return None
except Exception as e:
return {"error": f"Chunk processing error: {e}"}
def _transcribe_segment(self, audio: np.ndarray) -> dict:
"""Transcribe a speech segment."""
self.segment_id += 1
        # Save to a temp WAV for Whisper; close the handle first so wave.open
        # can reopen the path on Windows without a sharing violation.
        temp_file = tempfile.NamedTemporaryFile(suffix='.wav', delete=False)
        temp_file.close()
        try:
            with wave.open(temp_file.name, 'wb') as wf:
wf.setnchannels(1)
wf.setsampwidth(2)
wf.setframerate(16000)
wf.writeframes((audio * 32768).astype(np.int16).tobytes())
# Transcribe
text = self.transcriber.transcribe_file(temp_file.name, add_punctuation=True)
return {
"segment_id": self.segment_id,
"text": text,
"is_final": True,
"duration": len(audio) / 16000
}
finally:
os.unlink(temp_file.name)
def stop(self) -> dict:
"""Stop the session and flush remaining audio."""
self.active = False
results = []
# Flush VAD buffer
remaining = self.vad.flush()
if remaining is not None and len(remaining) > 0:
result = self._transcribe_segment(remaining)
if result and not result.get('error'):
results.append(result)
return {
"status": "stream_stopped",
"session_id": self.session_id,
"total_segments": self.segment_id,
"final_segments": results
}
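# Lifecycle sketch: one StreamingSession per "start_stream"; each "audio_chunk"
# call returns None until VAD closes an utterance, then a dict like
# {"segment_id": n, "text": "...", "is_final": True, "duration": seconds};
# stop() flushes whatever audio is still buffered.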
class Transcriber:
"""Main transcription engine."""
def __init__(self, model_size: str = "medium", device: str = "cpu", compute_type: str = "int8"):
self.model = None
self.converter = None
self.punctuator = ChinesePunctuator()
self.streaming_session: Optional[StreamingSession] = None
self.vad_model: Optional[SileroVAD] = None
try:
print(json.dumps({"status": "loading_model", "model": model_size}), file=sys.stderr)
self.model = WhisperModel(model_size, device=device, compute_type=compute_type)
self.converter = opencc.OpenCC("s2twp")
print(json.dumps({"status": "model_loaded"}), file=sys.stderr)
# Pre-load VAD model at startup (not when streaming starts)
if ONNX_AVAILABLE:
self.vad_model = SileroVAD()
except Exception as e:
print(json.dumps({"error": f"Failed to load model: {e}"}), file=sys.stderr)
raise
def transcribe_file(self, audio_path: str, add_punctuation: bool = False) -> str:
"""Transcribe an audio file to text."""
if not self.model:
return ""
if not os.path.exists(audio_path):
print(json.dumps({"error": f"File not found: {audio_path}"}), file=sys.stderr)
return ""
try:
segments, info = self.model.transcribe(
audio_path,
language="zh", # Use "nan" for Taiwanese/Hokkien, "zh" for Mandarin
beam_size=5,
vad_filter=True,
word_timestamps=add_punctuation,
# Anti-hallucination settings
condition_on_previous_text=False, # Prevents hallucination propagation
no_speech_threshold=0.6, # Higher = stricter silence detection
compression_ratio_threshold=2.4, # Filter repetitive/hallucinated text
log_prob_threshold=-1.0, # Filter low-confidence output
temperature=0.0, # Deterministic output (no sampling)
)
if add_punctuation:
# Collect segments with timestamps for punctuation
seg_list = []
for segment in segments:
seg_list.append({
'text': segment.text,
'start': segment.start,
'end': segment.end
})
text = self.punctuator.process_segments(seg_list)
else:
text = ""
for segment in segments:
text += segment.text
# Convert to Traditional Chinese
if text and self.converter:
text = self.converter.convert(text)
return text.strip()
except Exception as e:
print(json.dumps({"error": f"Transcription error: {e}"}), file=sys.stderr)
return ""
def handle_command(self, cmd: dict) -> Optional[dict]:
"""Handle a JSON command."""
action = cmd.get("action")
if action == "transcribe":
# File-based transcription (legacy)
audio_path = cmd.get("file")
if audio_path:
text = self.transcribe_file(audio_path, add_punctuation=True)
return {"result": text, "file": audio_path}
return {"error": "No file specified"}
elif action == "start_stream":
# Start streaming session
if self.streaming_session and self.streaming_session.active:
return {"error": "Stream already active"}
# Pass pre-loaded VAD model to avoid download delay
self.streaming_session = StreamingSession(self, vad_model=self.vad_model)
return {
"status": "streaming",
"session_id": self.streaming_session.session_id
}
elif action == "audio_chunk":
# Process audio chunk
if not self.streaming_session or not self.streaming_session.active:
return {"error": "No active stream"}
data = cmd.get("data")
if not data:
return {"error": "No audio data"}
result = self.streaming_session.process_chunk(data)
return result # May be None if no segment ready
elif action == "stop_stream":
# Stop streaming session
if not self.streaming_session:
return {"error": "No active stream"}
result = self.streaming_session.stop()
self.streaming_session = None
return result
elif action == "ping":
return {"status": "pong"}
elif action == "quit":
return {"status": "exiting"}
else:
return {"error": f"Unknown action: {action}"}
def run_server(self):
"""Run in server mode, reading JSON commands from stdin."""
print(json.dumps({"status": "ready"}))
sys.stdout.flush()
for line in sys.stdin:
line = line.strip()
if not line:
continue
try:
cmd = json.loads(line)
result = self.handle_command(cmd)
if result:
print(json.dumps(result))
sys.stdout.flush()
if cmd.get("action") == "quit":
break
except json.JSONDecodeError as e:
print(json.dumps({"error": f"Invalid JSON: {e}"}))
sys.stdout.flush()
def main():
model_size = os.environ.get("WHISPER_MODEL", "small")
device = os.environ.get("WHISPER_DEVICE", "cpu")
compute_type = os.environ.get("WHISPER_COMPUTE", "int8")
try:
transcriber = Transcriber(model_size, device, compute_type)
if len(sys.argv) > 1:
if sys.argv[1] == "--server":
transcriber.run_server()
else:
# File mode
text = transcriber.transcribe_file(sys.argv[1], add_punctuation=True)
print(text)
else:
# Default to server mode
transcriber.run_server()
except Exception as e:
print(json.dumps({"error": str(e)}), file=sys.stderr)
sys.exit(1)
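# Example invocations (env vars read above; defaults: small/cpu/int8):
#   WHISPER_MODEL=medium python transcriber.py meeting.wav   # one-shot file mode
#   python transcriber.py --server                           # JSON server on stdin/stdout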
if __name__ == "__main__":
main()