Files
Meeting_Assistant/sidecar/transcriber.py
egg 8b6184ecc5 feat: Meeting Assistant MVP - Complete implementation
Enterprise Meeting Knowledge Management System with:

Backend (FastAPI):
- Authentication proxy with JWT (pj-auth-api integration)
- MySQL database with 4 tables (users, meetings, conclusions, actions)
- Meeting CRUD with system code generation (C-YYYYMMDD-XX, A-YYYYMMDD-XX)
- Dify LLM integration for AI summarization
- Excel export with openpyxl
- 20 unit tests (all passing)

Client (Electron):
- Login page with company auth
- Meeting list with create/delete
- Meeting detail with real-time transcription
- Editable transcript textarea (single block, easy editing)
- AI summarization with conclusions/action items
- 5-second segment recording (efficient for long meetings)

Sidecar (Python):
- faster-whisper medium model with int8 quantization
- ONNX Runtime VAD (lightweight, ~20MB vs PyTorch ~2GB)
- Chinese punctuation processing
- OpenCC for Traditional Chinese conversion
- Anti-hallucination parameters
- Auto-cleanup of temp audio files

OpenSpec:
- add-meeting-assistant-mvp (47 tasks, archived)
- add-realtime-transcription (29 tasks, archived)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-10 20:17:44 +08:00

511 lines
18 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
Meeting Assistant Transcription Sidecar
Provides speech-to-text transcription using faster-whisper
with automatic Traditional Chinese conversion via OpenCC.
Modes:
1. File mode: transcriber.py <audio_file>
2. Server mode: transcriber.py (default, listens on stdin for JSON commands)
3. Streaming mode: Continuous audio processing with VAD segmentation
Uses ONNX Runtime for VAD (lightweight, ~20MB vs PyTorch ~2GB)
"""
import sys
import os
import json
import tempfile
import base64
import uuid
import re
import urllib.request
from pathlib import Path
from typing import Optional, List
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
try:
from faster_whisper import WhisperModel
import opencc
import numpy as np
except ImportError as e:
print(json.dumps({"error": f"Missing dependency: {e}"}), file=sys.stderr)
sys.exit(1)
# Try to import ONNX Runtime for VAD
try:
import onnxruntime as ort
ONNX_AVAILABLE = True
except ImportError:
ONNX_AVAILABLE = False
class ChinesePunctuator:
    """Rule-based Chinese punctuation processor.

    Appends a full-width question mark or period to transcribed text that
    does not already end with punctuation, and inserts full-width commas at
    long pauses between consecutive segments.
    """

    QUESTION_PATTERNS = [
        r'嗎$', r'呢$', r'什麼$', r'怎麼$', r'為什麼$', r'哪裡$', r'哪個$',
        r'誰$', r'幾$', r'多少$', r'是否$', r'能否$', r'可否$', r'有沒有$',
        r'是不是$', r'會不會$', r'能不能$', r'可不可以$', r'好不好$', r'對不對$'
    ]

    # Marks appended by this class (full-width, as used in Chinese text).
    QUESTION_MARK = '？'
    PERIOD = '。'
    COMMA = '，'

    # Characters that count as "already punctuated". Both the full-width and
    # the ASCII forms are accepted because the recognizer may emit either.
    ENDING_PUNCTUATION = frozenset('。？！，；：?!,;:')

    # Gap (seconds) between two segments that is treated as a pause.
    PAUSE_GAP_SECONDS = 0.5

    def __init__(self):
        self.question_regex = re.compile('|'.join(self.QUESTION_PATTERNS))

    def add_punctuation(self, text: str, word_timestamps: Optional[List] = None) -> str:
        """Add sentence-final punctuation to transcribed text.

        Args:
            text: Raw transcribed text.
            word_timestamps: Accepted for interface compatibility; not used.

        Returns:
            The stripped text, followed by a question mark when it ends with
            one of ``QUESTION_PATTERNS``, otherwise by a period. Text that is
            empty (or only whitespace) or already ends with punctuation is
            returned without an added mark.
        """
        if not text:
            return text
        text = text.strip()
        # Whitespace-only input must not turn into a lone punctuation mark.
        if not text:
            return text
        # Already has ending punctuation
        if text[-1] in self.ENDING_PUNCTUATION:
            return text
        # Check for question patterns
        if self.question_regex.search(text):
            return text + self.QUESTION_MARK
        # Default to period for statements
        return text + self.PERIOD

    def process_segments(self, segments: List[dict]) -> str:
        """Join timestamped segments into one punctuated string.

        Args:
            segments: Dicts with ``text``, ``start`` and ``end`` keys
                (missing timestamps are treated as 0).

        Returns:
            The concatenated segment texts. A comma is appended to a segment
            when more than ``PAUSE_GAP_SECONDS`` separate it from the next
            one, it is not already punctuated and it does not look like a
            question; the joined result then receives final punctuation via
            ``add_punctuation``.
        """
        result_parts = []
        last_index = len(segments) - 1
        for i, seg in enumerate(segments):
            text = (seg.get('text') or '').strip()
            if not text:
                continue
            # Check for long pause before next segment (comma opportunity)
            if i < last_index:
                gap = segments[i + 1].get('start', 0) - seg.get('end', 0)
                if (
                    gap > self.PAUSE_GAP_SECONDS
                    and text[-1] not in self.ENDING_PUNCTUATION
                    and not self.question_regex.search(text)
                ):
                    # Long pause that is not the end of a question.
                    text += self.COMMA
            result_parts.append(text)
        # Join and add final punctuation
        return self.add_punctuation(''.join(result_parts))
class SileroVAD:
    """Silero VAD using ONNX Runtime (lightweight alternative to PyTorch).

    The instance is callable: pass one audio window and get back a speech
    probability. When onnxruntime is missing, or the model cannot be
    downloaded or loaded, ``session`` stays ``None`` and calls return a
    neutral 0.5.

    Two ONNX interfaces are supported and detected from the loaded model's
    input names: the original ``input``/``sr``/``h``/``c`` layout and the
    state-based ``input``/``state``/``sr`` layout.
    """

    MODEL_URL = "https://github.com/snakers4/silero-vad/raw/master/src/silero_vad/data/silero_vad.onnx"

    # Parameters of the state-based interface.
    # NOTE(review): values mirror the upstream Silero OnnxWrapper for 16 kHz
    # audio - confirm against the model revision that is actually cached.
    STATE_SHAPE = (2, 1, 128)
    CONTEXT_SAMPLES = 64

    def __init__(self, model_path: Optional[str] = None, threshold: float = 0.5):
        """Locate (downloading if necessary) and load the VAD model.

        Args:
            model_path: Path of the ONNX model. Defaults to
                ``~/.cache/silero-vad/silero_vad.onnx``; the file is
                downloaded from ``MODEL_URL`` when it does not exist.
            threshold: Stored on the instance; not applied by ``__call__``.
        """
        self.threshold = threshold
        self.session = None
        self.sample_rate = 16000
        self._uses_state = False
        self.reset_states()

        if not ONNX_AVAILABLE:
            print(json.dumps({"warning": "onnxruntime not available, VAD disabled"}), file=sys.stderr)
            return

        # Determine model path
        if model_path is None:
            cache_dir = Path.home() / ".cache" / "silero-vad"
            cache_dir.mkdir(parents=True, exist_ok=True)
            model_path = cache_dir / "silero_vad.onnx"
        model_path = Path(model_path)

        # Download if not exists
        if not model_path.exists():
            print(json.dumps({"status": "downloading_vad_model"}), file=sys.stderr)
            if not self._download_model(model_path):
                return

        # Load ONNX model
        try:
            self.session = ort.InferenceSession(
                str(model_path),
                providers=['CPUExecutionProvider']
            )
            self._detect_interface()
            print(json.dumps({"status": "vad_loaded"}), file=sys.stderr)
        except Exception as e:
            # Leave the VAD disabled rather than half-initialised.
            self.session = None
            print(json.dumps({"warning": f"VAD load failed: {e}"}), file=sys.stderr)

    def _download_model(self, destination: Path) -> bool:
        """Download the model to *destination*; return True on success.

        The data is written to a sibling ``.part`` file and renamed only
        after the transfer completed, so an interrupted download never
        leaves a truncated file at the cache path.
        """
        partial = destination.with_name(destination.name + ".part")
        try:
            urllib.request.urlretrieve(self.MODEL_URL, str(partial))
            os.replace(partial, destination)
        except Exception as e:
            print(json.dumps({"warning": f"VAD model download failed: {e}"}), file=sys.stderr)
            try:
                partial.unlink()
            except OSError:
                # Best-effort cleanup; the partial file may not exist.
                pass
            return False
        print(json.dumps({"status": "vad_model_downloaded"}), file=sys.stderr)
        return True

    def _detect_interface(self) -> None:
        """Record whether the loaded model takes a single ``state`` input."""
        input_names = {node.name for node in self.session.get_inputs()}
        self._uses_state = 'state' in input_names

    def reset_states(self):
        """Reset the recurrent states (and context) of both interfaces."""
        self._h = np.zeros((2, 1, 64), dtype=np.float32)
        self._c = np.zeros((2, 1, 64), dtype=np.float32)
        self._state = np.zeros(self.STATE_SHAPE, dtype=np.float32)
        self._context = np.zeros((1, self.CONTEXT_SAMPLES), dtype=np.float32)

    def __call__(self, audio: np.ndarray) -> float:
        """Run VAD on one audio chunk and return the speech probability.

        Args:
            audio: One window of samples; a 1-D array is given a leading
                batch axis.

        Returns:
            The model output as a float, or 0.5 when no model is loaded.
        """
        if self.session is None:
            return 0.5  # Neutral if VAD not available

        # Ensure correct shape (batch, samples)
        audio = np.asarray(audio, dtype=np.float32)
        if audio.ndim == 1:
            audio = audio[np.newaxis, :]

        if self._uses_state:
            return self._run_stateful(audio)

        # Original interface: separate h / c recurrent states.
        ort_inputs = {
            'input': audio,
            'sr': np.array([self.sample_rate], dtype=np.int64),
            'h': self._h,
            'c': self._c
        }
        output, self._h, self._c = self.session.run(None, ort_inputs)
        return float(output[0][0])

    def _run_stateful(self, audio: np.ndarray) -> float:
        """Run one window through a model that uses the ``state`` input.

        The tail of the previous window is prepended as left context and the
        returned state is kept for the next call.
        """
        framed = np.concatenate([self._context, audio], axis=1)
        output, self._state = self.session.run(None, {
            'input': framed,
            'state': self._state,
            'sr': np.array(self.sample_rate, dtype=np.int64),
        })
        self._context = framed[:, -self.CONTEXT_SAMPLES:]
        return float(np.asarray(output).reshape(-1)[0])
class VADProcessor:
    """Voice Activity Detection processor.

    Buffers incoming audio, runs it through the VAD in fixed windows and
    hands back one speech segment at a time. Without a usable VAD model it
    falls back to cutting the stream into fixed-length pieces.
    """

    WINDOW_SAMPLES = 512            # 32ms at 16kHz
    SILENCE_SECONDS = 0.5           # trailing silence that ends a segment
    MAX_SPEECH_SECONDS = 15         # voiced audio after which a cut is forced
    MIN_SPEECH_SECONDS = 0.3        # voiced audio needed to keep a segment
    FALLBACK_SEGMENT_SECONDS = 5    # segment length when no VAD is available
    MIN_FLUSH_SECONDS = 0.5         # leftover audio worth returning on flush

    def __init__(self, sample_rate: int = 16000, threshold: float = 0.5,
                 vad_model: Optional["SileroVAD"] = None):
        """
        Args:
            sample_rate: Sample rate of the incoming audio in Hz.
            threshold: Speech probability at or above which a window counts
                as speech.
            vad_model: Pre-loaded VAD to reuse. When omitted, a new
                ``SileroVAD`` is created if onnxruntime is available.
        """
        self.sample_rate = sample_rate
        self.threshold = threshold
        # Reuse pre-loaded VAD model if provided
        if vad_model is not None:
            self.vad = vad_model
        elif ONNX_AVAILABLE:
            self.vad = SileroVAD(threshold=threshold)
        else:
            self.vad = None
        self.reset()

    def reset(self):
        """Reset VAD state and drop all buffered audio."""
        self.audio_buffer = np.array([], dtype=np.float32)
        self._clear_segment()
        if self.vad:
            self.vad.reset_states()

    def _clear_segment(self) -> None:
        """Forget the segment currently being collected."""
        self.speech_buffer = np.array([], dtype=np.float32)
        self.speech_started = False
        self.silence_samples = 0
        self.speech_samples = 0

    def _emit_segment(self) -> np.ndarray:
        """Return the collected segment and start over with a clean state."""
        segment = self.speech_buffer
        self._clear_segment()
        return segment

    def process_chunk(self, audio_chunk: np.ndarray) -> Optional[np.ndarray]:
        """
        Process audio chunk and return speech segment if speech end detected.

        At most one segment is returned per call; unprocessed audio stays
        buffered and is examined on the next call.

        Returns:
            Speech audio if end detected, None otherwise
        """
        self.audio_buffer = np.concatenate([self.audio_buffer, audio_chunk])

        # Fallback: time-based segmentation if no VAD
        if self.vad is None or self.vad.session is None:
            if len(self.audio_buffer) >= self.sample_rate * self.FALLBACK_SEGMENT_SECONDS:
                result = self.audio_buffer.copy()
                self.audio_buffer = np.array([], dtype=np.float32)
                return result
            return None

        window_size = self.WINDOW_SAMPLES
        silence_threshold_samples = int(self.SILENCE_SECONDS * self.sample_rate)
        max_speech_samples = int(self.MAX_SPEECH_SECONDS * self.sample_rate)
        min_speech_samples = self.sample_rate * self.MIN_SPEECH_SECONDS

        while len(self.audio_buffer) >= window_size:
            window = self.audio_buffer[:window_size]
            self.audio_buffer = self.audio_buffer[window_size:]

            # Run VAD
            speech_prob = self.vad(window)
            if speech_prob >= self.threshold:
                if not self.speech_started:
                    self.speech_started = True
                    self.speech_buffer = np.array([], dtype=np.float32)
                self.speech_buffer = np.concatenate([self.speech_buffer, window])
                self.silence_samples = 0
                self.speech_samples += window_size
            elif self.speech_started:
                # Keep the pause inside the segment so words are not clipped.
                self.speech_buffer = np.concatenate([self.speech_buffer, window])
                self.silence_samples += window_size

            # Force segment if speech too long
            if self.speech_samples >= max_speech_samples:
                return self._emit_segment()

            # Detect end of speech (trailing silence reached)
            if self.speech_started and self.silence_samples >= silence_threshold_samples:
                # Measure voiced samples only: the buffer also holds the
                # trailing silence, so its length would always pass.
                if self.speech_samples > min_speech_samples:
                    return self._emit_segment()
                # Too little voiced audio: treat as noise and discard it.
                self._clear_segment()
        return None

    def flush(self) -> Optional[np.ndarray]:
        """Return all remaining buffered audio and reset the processor.

        Returns:
            The collected segment followed by unprocessed audio when together
            they are longer than ``MIN_FLUSH_SECONDS``, otherwise None.
        """
        # Combine any remaining audio
        remaining = np.concatenate([self.speech_buffer, self.audio_buffer])
        self.reset()
        if len(remaining) > self.sample_rate * self.MIN_FLUSH_SECONDS:
            return remaining
        return None
class StreamingSession:
    """Manages a streaming transcription session.

    Audio chunks are fed through a ``VADProcessor``; every speech segment it
    yields is written to a temporary WAV file and transcribed.
    """

    def __init__(self, transcriber: 'Transcriber', vad_model: Optional['SileroVAD'] = None):
        """
        Args:
            transcriber: Object whose ``transcribe_file`` is called for each
                detected segment.
            vad_model: Pre-loaded VAD model passed on to ``VADProcessor``.
        """
        self.session_id = str(uuid.uuid4())
        self.transcriber = transcriber
        self.vad = VADProcessor(vad_model=vad_model)
        self.segment_id = 0
        self.active = True

    def process_chunk(self, audio_data: str) -> Optional[dict]:
        """Process base64-encoded audio chunk.

        Args:
            audio_data: Base64 text of raw 16-bit PCM samples.

        Returns:
            A segment dict when the VAD completed a speech segment, None when
            no segment is ready, or ``{"error": ...}`` if decoding or
            transcription raised.
        """
        try:
            # Decode base64 to raw PCM (16-bit, 16kHz, mono)
            pcm_data = base64.b64decode(audio_data)
            audio = np.frombuffer(pcm_data, dtype=np.int16).astype(np.float32) / 32768.0
            # Run VAD
            speech_segment = self.vad.process_chunk(audio)
            if speech_segment is not None and len(speech_segment) > 0:
                return self._transcribe_segment(speech_segment)
            return None
        except Exception as e:
            return {"error": f"Chunk processing error: {e}"}

    def _transcribe_segment(self, audio: np.ndarray) -> dict:
        """Transcribe a speech segment.

        Args:
            audio: Float samples in the range [-1, 1].

        Returns:
            Dict with ``segment_id``, ``text``, ``is_final`` and ``duration``
            (seconds, assuming 16 kHz).
        """
        import wave

        self.segment_id += 1
        # Save to temp file for Whisper. The descriptor is closed right away
        # so the file can be reopened by name and deleted afterwards on every
        # platform, and no handle is leaked per segment.
        fd, temp_path = tempfile.mkstemp(suffix='.wav')
        os.close(fd)
        try:
            # Clip before the cast: +1.0 scales to 32768, which does not fit
            # in int16 and would wrap around to a negative sample.
            pcm = np.clip(audio * 32768.0, -32768, 32767).astype(np.int16)
            with wave.open(temp_path, 'wb') as wf:
                wf.setnchannels(1)
                wf.setsampwidth(2)
                wf.setframerate(16000)
                wf.writeframes(pcm.tobytes())
            # Transcribe
            text = self.transcriber.transcribe_file(temp_path, add_punctuation=True)
            return {
                "segment_id": self.segment_id,
                "text": text,
                "is_final": True,
                "duration": len(audio) / 16000
            }
        finally:
            try:
                os.unlink(temp_path)
            except OSError as e:
                # A failed cleanup must not discard a finished transcription.
                print(json.dumps({"warning": f"Temp file cleanup failed: {e}"}), file=sys.stderr)

    def stop(self) -> dict:
        """Stop the session and flush remaining audio.

        Returns:
            Dict with ``status``, ``session_id``, ``total_segments`` and
            ``final_segments`` (the segment produced from audio that was
            still buffered, if any).
        """
        self.active = False
        results = []
        # Flush VAD buffer
        remaining = self.vad.flush()
        if remaining is not None and len(remaining) > 0:
            try:
                result = self._transcribe_segment(remaining)
            except Exception as e:
                # Still report the session as stopped if the last piece fails.
                print(json.dumps({"error": f"Final segment error: {e}"}), file=sys.stderr)
                result = None
            if result and not result.get('error'):
                results.append(result)
        return {
            "status": "stream_stopped",
            "session_id": self.session_id,
            "total_segments": self.segment_id,
            "final_segments": results
        }
class Transcriber:
    """Main transcription engine.

    Wraps a faster-whisper model, an OpenCC converter and the punctuator,
    and exposes them through JSON commands read from stdin.
    """

    def __init__(self, model_size: str = "medium", device: str = "cpu", compute_type: str = "int8"):
        """Load the Whisper model, the OpenCC converter and the VAD model.

        Args:
            model_size: Model name passed to ``WhisperModel``.
            device: Device passed to ``WhisperModel``.
            compute_type: Compute type passed to ``WhisperModel``.

        Raises:
            Exception: Whatever loading raised, after it was reported on
                stderr.
        """
        self.model = None
        self.converter = None
        self.punctuator = ChinesePunctuator()
        self.streaming_session: Optional[StreamingSession] = None
        self.vad_model: Optional[SileroVAD] = None
        try:
            print(json.dumps({"status": "loading_model", "model": model_size}), file=sys.stderr)
            self.model = WhisperModel(model_size, device=device, compute_type=compute_type)
            self.converter = opencc.OpenCC("s2twp")
            print(json.dumps({"status": "model_loaded"}), file=sys.stderr)
            # Pre-load VAD model at startup (not when streaming starts)
            if ONNX_AVAILABLE:
                self.vad_model = SileroVAD()
        except Exception as e:
            print(json.dumps({"error": f"Failed to load model: {e}"}), file=sys.stderr)
            raise

    def transcribe_file(self, audio_path: str, add_punctuation: bool = False) -> str:
        """Transcribe an audio file to text.

        Args:
            audio_path: Path of the audio file.
            add_punctuation: When True, segments are passed with their
                timestamps to the punctuator instead of being concatenated.

        Returns:
            The transcription converted with the OpenCC converter, or an
            empty string when no model is loaded, the file is missing or
            transcription failed (failures are reported on stderr).
        """
        if not self.model:
            return ""
        if not os.path.exists(audio_path):
            print(json.dumps({"error": f"File not found: {audio_path}"}), file=sys.stderr)
            return ""
        try:
            segments, info = self.model.transcribe(
                audio_path,
                language="zh",  # Use "nan" for Taiwanese/Hokkien, "zh" for Mandarin
                beam_size=5,
                vad_filter=True,
                word_timestamps=add_punctuation,
                # Anti-hallucination settings
                condition_on_previous_text=False,  # Prevents hallucination propagation
                no_speech_threshold=0.6,  # Higher = stricter silence detection
                compression_ratio_threshold=2.4,  # Filter repetitive/hallucinated text
                log_prob_threshold=-1.0,  # Filter low-confidence output
                temperature=0.0,  # Deterministic output (no sampling)
            )
            if add_punctuation:
                # Collect segments with timestamps for punctuation
                seg_list = [
                    {'text': segment.text, 'start': segment.start, 'end': segment.end}
                    for segment in segments
                ]
                text = self.punctuator.process_segments(seg_list)
            else:
                text = "".join(segment.text for segment in segments)
            # Convert to Traditional Chinese
            if text and self.converter:
                text = self.converter.convert(text)
            return text.strip()
        except Exception as e:
            print(json.dumps({"error": f"Transcription error: {e}"}), file=sys.stderr)
            return ""

    def handle_command(self, cmd: dict) -> Optional[dict]:
        """Handle a JSON command.

        Args:
            cmd: Decoded command; its ``action`` key selects the operation
                (``transcribe``, ``start_stream``, ``audio_chunk``,
                ``stop_stream``, ``ping``, ``quit``).

        Returns:
            The response dict, or None when an audio chunk produced no
            segment. Anything that is not a dict, and any unknown action,
            yields an ``{"error": ...}`` response.
        """
        if not isinstance(cmd, dict):
            # Valid JSON that is not an object (a list, number, string, ...).
            return {"error": "Command must be a JSON object"}

        action = cmd.get("action")
        if action == "transcribe":
            # File-based transcription (legacy)
            audio_path = cmd.get("file")
            if audio_path:
                text = self.transcribe_file(audio_path, add_punctuation=True)
                return {"result": text, "file": audio_path}
            return {"error": "No file specified"}

        if action == "start_stream":
            # Start streaming session
            if self.streaming_session and self.streaming_session.active:
                return {"error": "Stream already active"}
            # Pass pre-loaded VAD model to avoid download delay
            self.streaming_session = StreamingSession(self, vad_model=self.vad_model)
            return {
                "status": "streaming",
                "session_id": self.streaming_session.session_id
            }

        if action == "audio_chunk":
            # Process audio chunk
            if not self.streaming_session or not self.streaming_session.active:
                return {"error": "No active stream"}
            data = cmd.get("data")
            if not data:
                return {"error": "No audio data"}
            return self.streaming_session.process_chunk(data)  # May be None if no segment ready

        if action == "stop_stream":
            # Stop streaming session
            if not self.streaming_session:
                return {"error": "No active stream"}
            try:
                return self.streaming_session.stop()
            finally:
                # Detach the session even if stopping failed, so a new
                # stream can be started afterwards.
                self.streaming_session = None

        if action == "ping":
            return {"status": "pong"}
        if action == "quit":
            return {"status": "exiting"}
        return {"error": f"Unknown action: {action}"}

    @staticmethod
    def _emit(message: dict) -> None:
        """Write one JSON response line to stdout and flush it."""
        print(json.dumps(message))
        sys.stdout.flush()

    def run_server(self):
        """Run in server mode, reading JSON commands from stdin.

        One command is read per line and one JSON line is written per
        response. The loop ends on end of input or after a ``quit`` command;
        a malformed or failing command produces an error response instead of
        terminating the server.
        """
        self._emit({"status": "ready"})
        for line in sys.stdin:
            line = line.strip()
            if not line:
                continue
            try:
                cmd = json.loads(line)
            except json.JSONDecodeError as e:
                self._emit({"error": f"Invalid JSON: {e}"})
                continue
            try:
                result = self.handle_command(cmd)
            except Exception as e:
                # Top-level boundary: report and keep serving.
                self._emit({"error": f"Command failed: {e}"})
                continue
            if result:
                self._emit(result)
            if isinstance(cmd, dict) and cmd.get("action") == "quit":
                break
def main():
    """Command-line entry point.

    Reads the model settings from the ``WHISPER_MODEL``, ``WHISPER_DEVICE``
    and ``WHISPER_COMPUTE`` environment variables, then either serves JSON
    commands (no argument, or ``--server``) or transcribes the file named by
    the first argument and prints the text. Any exception is reported as a
    JSON error on stderr and the process exits with status 1.
    """
    env = os.environ
    # NOTE(review): the fallback here is "small" while Transcriber's own
    # default is "medium" - confirm which one is intended.
    model_size = env.get("WHISPER_MODEL", "small")
    device = env.get("WHISPER_DEVICE", "cpu")
    compute_type = env.get("WHISPER_COMPUTE", "int8")

    cli_args = sys.argv[1:]
    try:
        transcriber = Transcriber(model_size, device, compute_type)
        # Server mode is both the explicit "--server" choice and the default.
        if not cli_args or cli_args[0] == "--server":
            transcriber.run_server()
        else:
            # File mode
            print(transcriber.transcribe_file(cli_args[0], add_punctuation=True))
    except Exception as e:
        print(json.dumps({"error": str(e)}), file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    main()