feat: Add Dify audio transcription with VAD chunking and SSE progress
- Add audio file upload transcription via Dify STT API
- Implement VAD-based audio segmentation in sidecar (3-min chunks)
- Add SSE endpoint for real-time transcription progress updates
- Fix chunk size enforcement for reliable uploads
- Add retry logic with exponential backoff for API calls
- Support Python 3.13+ with audioop-lts package
- Update frontend with Chinese progress messages and chunk display
- Improve start.sh health check with retry loop

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
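The retry helper itself is not part of the excerpted diff below, so as a minimal sketch of what "retry logic with exponential backoff" typically means here: the function name `with_backoff` and the parameters `max_retries` and `base_delay` are illustrative assumptions, not names from this commit.

import random
import time

def with_backoff(fn, max_retries: int = 5, base_delay: float = 1.0):
    """Retry fn() with exponential backoff plus jitter (illustrative sketch)."""
    for attempt in range(max_retries):
        try:
            return fn()
        except Exception:
            if attempt == max_retries - 1:
                raise
            # Delay doubles each attempt (1s, 2s, 4s, ...) with small jitter
            # so concurrent clients do not retry in lockstep.
            time.sleep(base_delay * (2 ** attempt) + random.uniform(0, 0.5))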
@@ -3,3 +3,5 @@ faster-whisper>=1.0.0
 opencc-python-reimplemented>=0.1.7
 numpy>=1.26.0
 onnxruntime>=1.16.0
+pydub>=0.25.0
+audioop-lts>=0.2.1  # Required for Python 3.13+ (audioop removed from stdlib)
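Since audioop-lts exists only to replace the stdlib module removed in Python 3.13, one option (an assumption, not part of this commit) is to gate it with a PEP 508 environment marker so older interpreters skip it:

audioop-lts>=0.2.1; python_version >= "3.13"  # only installed on 3.13+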
@@ -20,9 +20,10 @@ import tempfile
 import base64
 import uuid
 import re
+import wave
 import urllib.request
 from pathlib import Path
-from typing import Optional, List
+from typing import Optional, List, Tuple
 
 os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
 
@@ -105,8 +106,7 @@ class SileroVAD:
     def __init__(self, model_path: Optional[str] = None, threshold: float = 0.5):
         self.threshold = threshold
         self.session = None
-        self._h = np.zeros((2, 1, 64), dtype=np.float32)
-        self._c = np.zeros((2, 1, 64), dtype=np.float32)
+        self._state = np.zeros((2, 1, 128), dtype=np.float32)
         self.sample_rate = 16000
 
         if not ONNX_AVAILABLE:
@@ -141,8 +141,7 @@ class SileroVAD:
 
     def reset_states(self):
         """Reset hidden states."""
-        self._h = np.zeros((2, 1, 64), dtype=np.float32)
-        self._c = np.zeros((2, 1, 64), dtype=np.float32)
+        self._state = np.zeros((2, 1, 128), dtype=np.float32)
 
     def __call__(self, audio: np.ndarray) -> float:
         """Run VAD on audio chunk, return speech probability."""
@@ -153,15 +152,14 @@ class SileroVAD:
         if audio.ndim == 1:
             audio = audio[np.newaxis, :]
 
-        # Run inference
+        # Run inference with updated model format
         ort_inputs = {
             'input': audio.astype(np.float32),
-            'sr': np.array([self.sample_rate], dtype=np.int64),
-            'h': self._h,
-            'c': self._c
+            'state': self._state,
+            'sr': np.array(self.sample_rate, dtype=np.int64)
         }
 
-        output, self._h, self._c = self.session.run(None, ort_inputs)
+        output, self._state = self.session.run(None, ort_inputs)
        return float(output[0][0])
 
 
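The change above tracks the newer Silero VAD ONNX export, which takes a single 128-wide state tensor instead of separate LSTM h/c tensors and expects 512-sample windows at 16 kHz. A minimal usage sketch of the class as defined in this file (the model path is a placeholder assumption):

vad = SileroVAD(model_path="models/silero_vad.onnx")  # path is an assumption
vad.reset_states()                                    # fresh state per stream
audio = np.zeros(16000, dtype=np.float32)             # 1 s of audio at 16 kHz
for i in range(0, len(audio) - 512, 512):
    prob = vad(audio[i:i + 512])   # per-window speech probability in [0, 1]
    if prob >= vad.threshold:
        print(f"speech at {i / 16000:.2f}s (p={prob:.2f})")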
@@ -406,6 +404,193 @@ class Transcriber:
             print(json.dumps({"error": f"Transcription error: {e}"}), file=sys.stderr)
             return ""
 
+    def segment_audio_file(
+        self,
+        audio_path: str,
+        max_chunk_seconds: int = 300,
+        min_silence_ms: int = 500,
+        output_dir: Optional[str] = None
+    ) -> dict:
+        """
+        Segment an audio file using VAD for natural speech boundaries.
+
+        Args:
+            audio_path: Path to the audio file
+            max_chunk_seconds: Maximum duration per chunk (default 5 minutes)
+            min_silence_ms: Minimum silence duration to consider as a boundary (default 500ms)
+            output_dir: Directory to save chunks (default: temp directory)
+
+        Returns:
+            dict with segments list and metadata
+        """
+        try:
+            # Import audio processing libraries
+            try:
+                from pydub import AudioSegment
+            except ImportError:
+                return {"error": "pydub not installed. Run: pip install pydub"}
+
+            if not os.path.exists(audio_path):
+                return {"error": f"File not found: {audio_path}"}
+
+            # Create output directory
+            if output_dir is None:
+                output_dir = tempfile.mkdtemp(prefix="audio_segments_")
+            else:
+                os.makedirs(output_dir, exist_ok=True)
+
+            # Load audio file and convert to mono 16kHz
+            print(json.dumps({"status": "loading_audio", "file": audio_path}), file=sys.stderr)
+            audio = AudioSegment.from_file(audio_path)
+            audio = audio.set_channels(1).set_frame_rate(16000)
+            total_duration_ms = len(audio)
+            total_duration_sec = total_duration_ms / 1000
+
+            print(json.dumps({
+                "status": "audio_loaded",
+                "duration_seconds": total_duration_sec
+            }), file=sys.stderr)
+
+            # Convert to numpy for VAD processing (int16 -> float32 in [-1, 1))
+            samples = np.array(audio.get_array_of_samples(), dtype=np.float32) / 32768.0
+
+            # Run VAD to detect speech regions
+            segments = []
+            current_start = 0
+            max_chunk_samples = max_chunk_seconds * 16000
+            min_silence_samples = int(min_silence_ms * 16)  # 16 samples per ms at 16kHz
+
+            if self.vad_model is None or self.vad_model.session is None:
+                # No VAD available, fall back to fixed-time splitting
+                print(json.dumps({"warning": "VAD not available, using fixed-time splitting"}), file=sys.stderr)
+                chunk_idx = 0
+                for start_sample in range(0, len(samples), max_chunk_samples):
+                    end_sample = min(start_sample + max_chunk_samples, len(samples))
+                    chunk_samples = samples[start_sample:end_sample]
+
+                    # Export chunk
+                    chunk_path = os.path.join(output_dir, f"chunk_{chunk_idx:03d}.wav")
+                    self._export_wav(chunk_samples, chunk_path)
+
+                    segments.append({
+                        "index": chunk_idx,
+                        "path": chunk_path,
+                        "start": start_sample / 16000,
+                        "end": end_sample / 16000,
+                        "duration": (end_sample - start_sample) / 16000
+                    })
+                    chunk_idx += 1
+            else:
+                # Use VAD for intelligent splitting
+                print(json.dumps({"status": "running_vad"}), file=sys.stderr)
+                self.vad_model.reset_states()
+
+                # Find silence regions usable as split points
+                window_size = 512
+                silence_starts = []
+                in_silence = False
+                silence_start = 0
+
+                for i in range(0, len(samples) - window_size, window_size):
+                    window = samples[i:i + window_size]
+                    speech_prob = self.vad_model(window)
+
+                    if speech_prob < 0.3:  # Silence threshold
+                        if not in_silence:
+                            in_silence = True
+                            silence_start = i
+                    else:
+                        if in_silence:
+                            silence_duration = i - silence_start
+                            if silence_duration >= min_silence_samples:
+                                # Mark middle of silence as potential split point
+                                silence_starts.append(silence_start + silence_duration // 2)
+                            in_silence = False
+
+                # Add end of file as final split point
+                silence_starts.append(len(samples))
+
+                # Create segments based on silence boundaries
+                chunk_idx = 0
+                current_start = 0
+
+                for split_point in silence_starts:
+                    # Check whether this boundary forces a split
+                    chunk_duration = split_point - current_start
+
+                    if chunk_duration >= max_chunk_samples or split_point == len(samples):
+                        # Find the best split point before max duration
+                        if chunk_duration > max_chunk_samples:
+                            # Use the last silence point within the max-duration window
+                            best_split = current_start + max_chunk_samples
+                            for sp in silence_starts:
+                                if current_start < sp <= current_start + max_chunk_samples:
+                                    best_split = sp
+                            split_point = best_split
+
+                        # Export chunk
+                        chunk_samples = samples[current_start:split_point]
+                        if len(chunk_samples) > 8000:  # At least 0.5 seconds
+                            chunk_path = os.path.join(output_dir, f"chunk_{chunk_idx:03d}.wav")
+                            self._export_wav(chunk_samples, chunk_path)
+
+                            segments.append({
+                                "index": chunk_idx,
+                                "path": chunk_path,
+                                "start": current_start / 16000,
+                                "end": split_point / 16000,
+                                "duration": (split_point - current_start) / 16000
+                            })
+                            chunk_idx += 1
+
+                        current_start = split_point
+
+                # Handle any remaining audio - split into max_chunk_samples pieces
+                while current_start < len(samples):
+                    remaining_len = len(samples) - current_start
+                    if remaining_len < 8000:  # Less than 0.5 seconds
+                        break
+
+                    # Determine chunk end (respect max_chunk_samples)
+                    chunk_end = min(current_start + max_chunk_samples, len(samples))
+                    chunk_samples = samples[current_start:chunk_end]
+
+                    chunk_path = os.path.join(output_dir, f"chunk_{chunk_idx:03d}.wav")
+                    self._export_wav(chunk_samples, chunk_path)
+                    segments.append({
+                        "index": chunk_idx,
+                        "path": chunk_path,
+                        "start": current_start / 16000,
+                        "end": chunk_end / 16000,
+                        "duration": len(chunk_samples) / 16000
+                    })
+                    chunk_idx += 1
+                    current_start = chunk_end
+
+            print(json.dumps({
+                "status": "segmentation_complete",
+                "total_segments": len(segments)
+            }), file=sys.stderr)
+
+            return {
+                "status": "success",
+                "segments": segments,
+                "total_segments": len(segments),
+                "total_duration": total_duration_sec,
+                "output_dir": output_dir
+            }
+
+        except Exception as e:
+            return {"error": f"Segmentation error: {str(e)}"}
+
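A quick sketch of calling this method directly, as defined above; the file name and the 180-second chunk size (matching the commit's "3-min chunks") are illustrative, and the Transcriber constructor details are not shown in this diff:

t = Transcriber()  # construction details not shown in this excerpt
result = t.segment_audio_file("meeting.m4a", max_chunk_seconds=180, min_silence_ms=500)
if "error" in result:
    print(result["error"])
else:
    for seg in result["segments"]:
        print(f"{seg['index']:03d}: {seg['start']:.1f}s to {seg['end']:.1f}s -> {seg['path']}")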
+    def _export_wav(self, samples: np.ndarray, output_path: str):
+        """Export float32 numpy samples to a 16-bit mono 16 kHz WAV file."""
+        with wave.open(output_path, 'wb') as wf:
+            wf.setnchannels(1)
+            wf.setsampwidth(2)
+            wf.setframerate(16000)
+            # Samples were produced as int16 / 32768.0, so scaling back by
+            # 32768 stays within the int16 range.
+            wf.writeframes((samples * 32768).astype(np.int16).tobytes())
 
     def handle_command(self, cmd: dict) -> Optional[dict]:
         """Handle a JSON command."""
         action = cmd.get("action")
@@ -447,6 +632,21 @@ class Transcriber:
             self.streaming_session = None
             return result
 
+        elif action == "segment_audio":
+            # Segment audio file using VAD
+            file_path = cmd.get("file_path")
+            if not file_path:
+                return {"error": "No file_path specified"}
+            max_chunk_seconds = cmd.get("max_chunk_seconds", 300)
+            min_silence_ms = cmd.get("min_silence_ms", 500)
+            output_dir = cmd.get("output_dir")
+            return self.segment_audio_file(
+                file_path,
+                max_chunk_seconds=max_chunk_seconds,
+                min_silence_ms=min_silence_ms,
+                output_dir=output_dir
+            )
+
         elif action == "ping":
             return {"status": "pong"}
 
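Given that handle_command consumes one JSON object per request, the new action can presumably be exercised over the sidecar's stdin/stdout like this; the script name `sidecar.py` and the transport details are assumptions inferred from the code above, not confirmed by this diff:

import json
import subprocess

# Hypothetical invocation of the sidecar process; the script name is assumed.
proc = subprocess.Popen(
    ["python", "sidecar.py"],
    stdin=subprocess.PIPE, stdout=subprocess.PIPE, text=True,
)
cmd = {"action": "segment_audio", "file_path": "/tmp/meeting.m4a",
       "max_chunk_seconds": 180, "min_silence_ms": 500}
proc.stdin.write(json.dumps(cmd) + "\n")
proc.stdin.flush()
response = json.loads(proc.stdout.readline())  # e.g. {"status": "success", "segments": [...]}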