feat: Add Dify audio transcription with VAD chunking and SSE progress

- Add audio file upload transcription via Dify STT API
- Implement VAD-based audio segmentation in sidecar (3-min chunks)
- Add SSE endpoint for real-time transcription progress updates
- Fix chunk size enforcement for reliable uploads
- Add retry logic with exponential backoff for API calls
- Support Python 3.13+ with audioop-lts package
- Update frontend with Chinese progress messages and chunk display
- Improve start.sh health check with retry loop

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
egg committed 2025-12-11 21:00:27 +08:00
parent e790f48967
commit 263eb1c394

10 changed files with 1008 additions and 16 deletions

View File

@@ -9,6 +9,7 @@ DB_NAME=db_A060
AUTH_API_URL=https://pj-auth-api.vercel.app/api/auth/login
DIFY_API_URL=https://dify.theaken.com/v1
DIFY_API_KEY=app-xxxxxxxxxxx
DIFY_STT_API_KEY=app-xxxxxxxxxxx
# Application Settings
ADMIN_EMAIL=ymirliu@panjit.com.tw

View File

@@ -16,6 +16,7 @@ class Settings:
)
DIFY_API_URL: str = os.getenv("DIFY_API_URL", "https://dify.theaken.com/v1")
DIFY_API_KEY: str = os.getenv("DIFY_API_KEY", "")
DIFY_STT_API_KEY: str = os.getenv("DIFY_STT_API_KEY", "")
ADMIN_EMAIL: str = os.getenv("ADMIN_EMAIL", "ymirliu@panjit.com.tw")
JWT_SECRET: str = os.getenv("JWT_SECRET", "meeting-assistant-secret")

View File

@@ -1,11 +1,22 @@
from fastapi import APIRouter, HTTPException, Depends, UploadFile, File
from fastapi.responses import StreamingResponse
import httpx import httpx
import json import json
import os
import tempfile
import subprocess
import shutil
import asyncio
from typing import Optional, AsyncGenerator
from ..config import settings
from ..models import SummarizeRequest, SummarizeResponse, ActionItemCreate, TokenPayload
from .auth import get_current_user
# Supported audio formats
SUPPORTED_AUDIO_FORMATS = {".mp3", ".wav", ".m4a", ".webm", ".ogg", ".flac", ".aac"}
MAX_FILE_SIZE = 500 * 1024 * 1024 # 500MB
router = APIRouter()
@@ -74,6 +85,9 @@ def parse_dify_response(answer: str) -> dict:
Parse Dify response to extract conclusions and action items.
Attempts JSON parsing first, then falls back to text parsing.
"""
print(f"[Dify Summarize] Raw answer length: {len(answer)} chars")
print(f"[Dify Summarize] Raw answer preview: {answer[:500]}...")
# Try to find JSON in the response
try:
# Look for JSON block
@@ -90,13 +104,424 @@ def parse_dify_response(answer: str) -> dict:
raise ValueError("No JSON found")
data = json.loads(json_str)
print(f"[Dify Summarize] Parsed JSON keys: {list(data.keys())}")
print(f"[Dify Summarize] conclusions count: {len(data.get('conclusions', []))}")
print(f"[Dify Summarize] action_items count: {len(data.get('action_items', []))}")
return {
"conclusions": data.get("conclusions", []),
"action_items": data.get("action_items", []),
}
except (ValueError, json.JSONDecodeError) as e:
print(f"[Dify Summarize] JSON parse failed: {e}")
# Fallback: return raw answer as single conclusion
return {
"conclusions": [answer] if answer else [],
"action_items": [],
}
@router.post("/ai/transcribe-audio")
async def transcribe_audio(
file: UploadFile = File(...),
current_user: TokenPayload = Depends(get_current_user)
):
"""
Transcribe an uploaded audio file using Dify STT service.
Large files are automatically chunked using VAD segmentation.
"""
if not settings.DIFY_STT_API_KEY:
raise HTTPException(status_code=503, detail="Dify STT API not configured")
# Validate file extension
file_ext = os.path.splitext(file.filename or "")[1].lower()
if file_ext not in SUPPORTED_AUDIO_FORMATS:
raise HTTPException(
status_code=400,
detail=f"Unsupported audio format. Supported: {', '.join(SUPPORTED_AUDIO_FORMATS)}"
)
# Create temp directory for processing
temp_dir = tempfile.mkdtemp(prefix="transcribe_")
temp_file_path = os.path.join(temp_dir, f"upload{file_ext}")
try:
# Save uploaded file
file_size = 0
with open(temp_file_path, "wb") as f:
while chunk := await file.read(1024 * 1024): # 1MB chunks
file_size += len(chunk)
if file_size > MAX_FILE_SIZE:
raise HTTPException(
status_code=413,
detail=f"File too large. Maximum size: {MAX_FILE_SIZE // (1024*1024)}MB"
)
f.write(chunk)
print(f"[Transcribe] Saved uploaded file: {temp_file_path}, size: {file_size} bytes")
# Call sidecar to segment audio
segments = await segment_audio_with_sidecar(temp_file_path, temp_dir)
if "error" in segments:
raise HTTPException(status_code=500, detail=segments["error"])
segment_list = segments.get("segments", [])
total_segments = len(segment_list)
print(f"[Transcribe] Segmentation complete: {total_segments} chunks created")
for seg in segment_list:
print(f" - Chunk {seg.get('index')}: {seg.get('path')} ({seg.get('duration', 0):.1f}s)")
if total_segments == 0:
raise HTTPException(status_code=400, detail="No audio content detected")
# Transcribe each chunk via Dify STT
transcriptions = []
failed_chunks = []
async with httpx.AsyncClient() as client:
for i, segment in enumerate(segment_list):
chunk_path = segment.get("path")
chunk_index = segment.get("index", i)
print(f"[Transcribe] Processing chunk {chunk_index + 1}/{total_segments}: {chunk_path}")
if not chunk_path:
print(f"[Transcribe] ERROR: Chunk {chunk_index} has no path!")
failed_chunks.append(chunk_index)
continue
if not os.path.exists(chunk_path):
print(f"[Transcribe] ERROR: Chunk file does not exist: {chunk_path}")
failed_chunks.append(chunk_index)
continue
chunk_size = os.path.getsize(chunk_path)
print(f"[Transcribe] Chunk {chunk_index} exists, size: {chunk_size} bytes")
# Call Dify STT API with retry
text = await transcribe_chunk_with_dify(
client, chunk_path, current_user.email
)
if text:
print(f"[Transcribe] Chunk {chunk_index} transcribed: {len(text)} chars")
transcriptions.append(text)
else:
print(f"[Transcribe] Chunk {chunk_index} transcription failed (no text returned)")
failed_chunks.append(chunk_index)
# Concatenate all transcriptions
final_transcript = " ".join(transcriptions)
print(f"[Transcribe] Complete: {len(transcriptions)}/{total_segments} chunks transcribed")
if failed_chunks:
print(f"[Transcribe] Failed chunks: {failed_chunks}")
return {
"transcript": final_transcript,
"chunks_processed": len(transcriptions),
"chunks_total": total_segments,
"chunks_failed": len(failed_chunks),
"total_duration_seconds": segments.get("total_duration", 0),
"language": "zh"
}
finally:
# Clean up temp files
shutil.rmtree(temp_dir, ignore_errors=True)
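
As an aside, a minimal client sketch for this blocking endpoint; the base URL and /api prefix are assumptions for a local deployment, not taken from this commit:

import httpx

def transcribe_file(path: str, token: str) -> dict:
    # Upload an audio file and return the JSON transcription result
    with open(path, "rb") as f:
        response = httpx.post(
            "http://localhost:8000/api/ai/transcribe-audio",  # assumed base URL
            headers={"Authorization": f"Bearer {token}"},
            files={"file": (path, f, "audio/mpeg")},
            timeout=600.0,  # chunked transcription of long files can take minutes
        )
    response.raise_for_status()
    return response.json()  # keys: transcript, chunks_processed, chunks_total, ...
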
@router.post("/ai/transcribe-audio-stream")
async def transcribe_audio_stream(
file: UploadFile = File(...),
current_user: TokenPayload = Depends(get_current_user)
):
"""
Transcribe an uploaded audio file with real-time progress via SSE.
Returns Server-Sent Events for progress updates.
"""
if not settings.DIFY_STT_API_KEY:
raise HTTPException(status_code=503, detail="Dify STT API not configured")
# Validate file extension
file_ext = os.path.splitext(file.filename or "")[1].lower()
if file_ext not in SUPPORTED_AUDIO_FORMATS:
raise HTTPException(
status_code=400,
detail=f"Unsupported audio format. Supported: {', '.join(SUPPORTED_AUDIO_FORMATS)}"
)
# Read file into memory for streaming
file_content = await file.read()
if len(file_content) > MAX_FILE_SIZE:
raise HTTPException(
status_code=413,
detail=f"File too large. Maximum size: {MAX_FILE_SIZE // (1024*1024)}MB"
)
async def generate_progress() -> AsyncGenerator[str, None]:
temp_dir = tempfile.mkdtemp(prefix="transcribe_")
temp_file_path = os.path.join(temp_dir, f"upload{file_ext}")
try:
# Save file
with open(temp_file_path, "wb") as f:
f.write(file_content)
yield f"data: {json.dumps({'event': 'start', 'message': '音訊檔案已接收,開始處理...'})}\n\n"
# Segment audio
yield f"data: {json.dumps({'event': 'segmenting', 'message': '正在分析音訊並分割片段...'})}\n\n"
segments = await segment_audio_with_sidecar(temp_file_path, temp_dir)
if "error" in segments:
yield f"data: {json.dumps({'event': 'error', 'message': segments['error']})}\n\n"
return
segment_list = segments.get("segments", [])
total_segments = len(segment_list)
total_duration = segments.get("total_duration", 0)
if total_segments == 0:
yield f"data: {json.dumps({'event': 'error', 'message': '未檢測到音訊內容'})}\n\n"
return
yield f"data: {json.dumps({'event': 'segments_ready', 'total': total_segments, 'duration': total_duration, 'message': f'分割完成,共 {total_segments} 個片段'})}\n\n"
# Transcribe each chunk
transcriptions = []
async with httpx.AsyncClient() as client:
for i, segment in enumerate(segment_list):
chunk_path = segment.get("path")
chunk_index = segment.get("index", i)
chunk_duration = segment.get("duration", 0)
yield f"data: {json.dumps({'event': 'chunk_start', 'chunk': chunk_index + 1, 'total': total_segments, 'duration': chunk_duration, 'message': f'正在轉錄片段 {chunk_index + 1}/{total_segments}...'})}\n\n"
if not chunk_path or not os.path.exists(chunk_path):
yield f"data: {json.dumps({'event': 'chunk_error', 'chunk': chunk_index + 1, 'message': f'片段 {chunk_index + 1} 檔案不存在'})}\n\n"
continue
text = await transcribe_chunk_with_dify(
client, chunk_path, current_user.email
)
if text:
transcriptions.append(text)
yield f"data: {json.dumps({'event': 'chunk_done', 'chunk': chunk_index + 1, 'total': total_segments, 'text_length': len(text), 'message': f'片段 {chunk_index + 1} 完成'})}\n\n"
else:
yield f"data: {json.dumps({'event': 'chunk_error', 'chunk': chunk_index + 1, 'message': f'片段 {chunk_index + 1} 轉錄失敗'})}\n\n"
# Final result
final_transcript = " ".join(transcriptions)
yield f"data: {json.dumps({'event': 'complete', 'transcript': final_transcript, 'chunks_processed': len(transcriptions), 'chunks_total': total_segments, 'duration': total_duration})}\n\n"
finally:
shutil.rmtree(temp_dir, ignore_errors=True)
return StreamingResponse(
generate_progress(),
media_type="text/event-stream",
headers={
"Cache-Control": "no-cache",
"Connection": "keep-alive",
"X-Accel-Buffering": "no"
}
)
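
For completeness, a hedged sketch of consuming this SSE stream from Python with httpx; the base URL and prefix are assumptions, while the event names match the generator above:

import json
import httpx

async def stream_transcription(path: str, token: str) -> str:
    transcript = ""
    async with httpx.AsyncClient(timeout=None) as client:
        with open(path, "rb") as f:
            async with client.stream(
                "POST",
                "http://localhost:8000/api/ai/transcribe-audio-stream",  # assumed
                headers={"Authorization": f"Bearer {token}"},
                files={"file": (path, f, "audio/wav")},
            ) as response:
                async for line in response.aiter_lines():
                    if not line.startswith("data: "):
                        continue  # skip blank separators between events
                    event = json.loads(line[6:])
                    if event["event"] == "complete":
                        transcript = event["transcript"]
                    elif event["event"] == "error":
                        raise RuntimeError(event["message"])
    return transcript
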
async def segment_audio_with_sidecar(audio_path: str, output_dir: str) -> dict:
"""Call sidecar to segment audio file using VAD."""
# Find sidecar script
sidecar_dir = os.path.join(os.path.dirname(__file__), "..", "..", "..", "sidecar")
sidecar_script = os.path.join(sidecar_dir, "transcriber.py")
venv_python = os.path.join(sidecar_dir, "venv", "bin", "python")
# Use venv python if available, otherwise system python
python_cmd = venv_python if os.path.exists(venv_python) else "python3"
if not os.path.exists(sidecar_script):
return {"error": "Sidecar not found"}
try:
# Prepare command
cmd_input = json.dumps({
"action": "segment_audio",
"file_path": audio_path,
"max_chunk_seconds": 180, # 3 minutes (smaller chunks for reliable upload)
"min_silence_ms": 500,
"output_dir": output_dir
})
# Run sidecar process
process = await asyncio.create_subprocess_exec(
python_cmd, sidecar_script,
stdin=asyncio.subprocess.PIPE,
stdout=asyncio.subprocess.PIPE,
stderr=asyncio.subprocess.PIPE,
cwd=sidecar_dir
)
# Send command and wait for response
stdout, stderr = await asyncio.wait_for(
process.communicate(input=f"{cmd_input}\n{{\"action\": \"quit\"}}\n".encode()),
timeout=600 # 10 minutes timeout for large files
)
# Parse response (skip status messages, find the segment result)
for line in stdout.decode().strip().split('\n'):
if line:
try:
data = json.loads(line)
if data.get("status") == "success" or "segments" in data:
return data
if "error" in data:
return data
except json.JSONDecodeError:
continue
return {"error": "No valid response from sidecar"}
except asyncio.TimeoutError:
return {"error": "Sidecar timeout during segmentation"}
except Exception as e:
return {"error": f"Sidecar error: {str(e)}"}
async def upload_file_to_dify(
client: httpx.AsyncClient,
file_path: str,
user_email: str
) -> Optional[str]:
"""Upload a file to Dify and return the file ID."""
try:
upload_url = f"{settings.DIFY_API_URL}/files/upload"
file_size = os.path.getsize(file_path)
print(f"[Upload] File: {file_path}, size: {file_size / (1024*1024):.1f} MB")
# Adjust timeout based on file size (minimum 60s, ~1MB per 5 seconds)
timeout_seconds = max(60.0, file_size / (1024 * 1024) * 5)
print(f"[Upload] Using timeout: {timeout_seconds:.0f}s")
with open(file_path, "rb") as f:
files = {"file": (os.path.basename(file_path), f, "audio/wav")}
response = await client.post(
upload_url,
headers={
"Authorization": f"Bearer {settings.DIFY_STT_API_KEY}",
},
files=files,
data={"user": user_email},
timeout=timeout_seconds,
)
print(f"[Upload] Response: {response.status_code}")
if response.status_code == 201 or response.status_code == 200:
data = response.json()
file_id = data.get("id")
print(f"[Upload] Success, file_id: {file_id}")
return file_id
print(f"[Upload] Error: {response.status_code} - {response.text[:500]}")
return None
except httpx.ReadError as e:
print(f"[Upload] Network read error (connection reset): {e}")
return None
except httpx.TimeoutException as e:
print(f"[Upload] Timeout: {e}")
return None
except Exception as e:
import traceback
print(f"[Upload] Error: {e}")
print(traceback.format_exc())
return None
async def transcribe_chunk_with_dify(
client: httpx.AsyncClient,
chunk_path: str,
user_email: str,
max_retries: int = 3
) -> Optional[str]:
"""Transcribe a single audio chunk via Dify chat API with file upload."""
for attempt in range(max_retries):
try:
print(f"[Dify] Attempt {attempt + 1}/{max_retries} for chunk: {chunk_path}")
# Step 1: Upload file to Dify (with retry inside this attempt)
file_id = None
for upload_attempt in range(2): # 2 upload attempts per main attempt
file_id = await upload_file_to_dify(client, chunk_path, user_email)
if file_id:
break
print(f"[Dify] Upload attempt {upload_attempt + 1} failed, retrying...")
await asyncio.sleep(1)
if not file_id:
print(f"[Dify] Failed to upload file after retries: {chunk_path}")
if attempt < max_retries - 1:
await asyncio.sleep(2 ** attempt)
continue
return None
print(f"[Dify] File uploaded, file_id: {file_id}")
# Step 2: Send chat message with file to request transcription
response = await client.post(
f"{settings.DIFY_API_URL}/chat-messages",
headers={
"Authorization": f"Bearer {settings.DIFY_STT_API_KEY}",
"Content-Type": "application/json",
},
json={
"inputs": {},
"query": "請將這段音檔轉錄成文字,只回傳轉錄的文字內容,不要加任何額外說明。",
"response_mode": "blocking",
"user": user_email,
"files": [
{
"type": "audio",
"transfer_method": "local_file",
"upload_file_id": file_id
}
]
},
timeout=300.0, # 5 minutes per chunk (increased for longer segments)
)
print(f"[Dify] Chat response: {response.status_code}")
if response.status_code == 200:
data = response.json()
answer = data.get("answer", "")
print(f"[Dify] Transcription success, length: {len(answer)} chars")
return answer
# Retry on server errors or rate limits
if response.status_code >= 500 or response.status_code == 429:
print(f"[Dify] Server error {response.status_code}, will retry...")
if attempt < max_retries - 1:
wait_time = 2 ** attempt
if response.status_code == 429:
wait_time = 10 # Wait longer for rate limits
await asyncio.sleep(wait_time)
continue
# Log error but don't fail entire transcription
print(f"[Dify] Chat error for chunk: {response.status_code} - {response.text[:500]}")
return None
except httpx.TimeoutException:
if attempt < max_retries - 1:
await asyncio.sleep(2 ** attempt)
continue
return None
except Exception as e:
print(f"Chunk transcription error: {e}")
return None
return None

View File

@@ -4,6 +4,7 @@ python-dotenv>=1.0.0
mysql-connector-python>=9.0.0
pydantic>=2.10.0
httpx>=0.27.0
python-multipart>=0.0.9
python-jose[cryptography]>=3.3.0
openpyxl>=3.1.2
pytest>=8.0.0

View File

@@ -96,6 +96,33 @@
color: #666;
font-style: italic;
}
.upload-progress {
display: none;
padding: 10px 15px;
background: #fff3e0;
border-radius: 6px;
margin-bottom: 10px;
}
.upload-progress.active {
display: block;
}
.upload-progress-bar {
height: 6px;
background: #e0e0e0;
border-radius: 3px;
overflow: hidden;
margin-top: 8px;
}
.upload-progress-fill {
height: 100%;
background: #ff9800;
width: 0%;
transition: width 0.3s ease;
}
.upload-progress-text {
font-size: 13px;
color: #e65100;
}
.transcript-textarea {
width: 100%;
min-height: 400px;
@@ -143,8 +170,10 @@
<div class="panel"> <div class="panel">
<div class="panel-header"> <div class="panel-header">
<span>Transcript (逐字稿)</span> <span>Transcript (逐字稿)</span>
<div class="recording-controls" style="padding: 0;"> <div class="recording-controls" style="padding: 0; display: flex; gap: 8px;">
<button class="btn btn-danger" id="record-btn">Start Recording</button> <button class="btn btn-danger" id="record-btn">Start Recording</button>
<button class="btn btn-secondary" id="upload-audio-btn">Upload Audio</button>
<input type="file" id="audio-file-input" accept=".mp3,.wav,.m4a,.webm,.ogg,.flac,.aac" style="display: none;">
</div> </div>
</div> </div>
<div class="panel-body"> <div class="panel-body">
@@ -155,6 +184,14 @@
<span class="segment-count" id="segment-count">Segments: 0</span> <span class="segment-count" id="segment-count">Segments: 0</span>
</div> </div>
<!-- Upload Progress -->
<div id="upload-progress" class="upload-progress">
<span class="upload-progress-text" id="upload-progress-text">Uploading...</span>
<div class="upload-progress-bar">
<div class="upload-progress-fill" id="upload-progress-fill"></div>
</div>
</div>
<!-- Single Transcript Textarea --> <!-- Single Transcript Textarea -->
<div id="transcript-container"> <div id="transcript-container">
<textarea <textarea
@@ -203,7 +240,8 @@
updateMeeting,
deleteMeeting,
exportMeeting,
summarizeTranscript,
transcribeAudio
} from '../services/api.js';
const meetingId = localStorage.getItem('currentMeetingId');
@@ -234,6 +272,11 @@
const deleteBtn = document.getElementById('delete-btn');
const addConclusionBtn = document.getElementById('add-conclusion-btn');
const addActionBtn = document.getElementById('add-action-btn');
const uploadAudioBtn = document.getElementById('upload-audio-btn');
const audioFileInput = document.getElementById('audio-file-input');
const uploadProgressEl = document.getElementById('upload-progress');
const uploadProgressText = document.getElementById('upload-progress-text');
const uploadProgressFill = document.getElementById('upload-progress-fill');
// Load meeting data
async function loadMeeting() {
@@ -460,6 +503,86 @@
processingIndicatorEl.classList.add('hidden');
}
// === Audio File Upload ===
uploadAudioBtn.addEventListener('click', () => {
if (isRecording) {
alert('Please stop recording before uploading audio.');
return;
}
audioFileInput.click();
});
audioFileInput.addEventListener('change', async (e) => {
const file = e.target.files[0];
if (!file) return;
// Validate file size (500MB max)
const maxSize = 500 * 1024 * 1024;
if (file.size > maxSize) {
alert('File too large. Maximum size is 500MB.');
audioFileInput.value = '';
return;
}
// Confirm if transcript has content
const currentTranscript = transcriptTextEl.value.trim();
if (currentTranscript) {
if (!confirm('This will replace the existing transcript. Do you want to continue?')) {
audioFileInput.value = '';
return;
}
}
// Start upload
uploadAudioBtn.disabled = true;
recordBtn.disabled = true;
uploadProgressEl.classList.add('active');
uploadProgressFill.style.width = '0%';
uploadProgressText.textContent = 'Uploading audio file...';
try {
const result = await transcribeAudio(file, (progress) => {
if (progress.phase === 'uploading') {
uploadProgressFill.style.width = `${progress.progress}%`;
uploadProgressText.textContent = `上傳中: ${progress.progress}%`;
} else if (progress.phase === 'processing') {
uploadProgressFill.style.width = `${progress.progress}%`;
uploadProgressText.textContent = progress.message || '處理中...';
} else if (progress.phase === 'transcribing') {
uploadProgressFill.style.width = `${progress.progress}%`;
if (progress.total && progress.current) {
uploadProgressText.textContent = `轉錄中: ${progress.current}/${progress.total} 片段 (${progress.progress}%)`;
} else {
uploadProgressText.textContent = progress.message || '轉錄中...';
}
} else if (progress.phase === 'complete') {
uploadProgressFill.style.width = '100%';
uploadProgressText.textContent = progress.message || '轉錄完成';
}
});
// Success - update transcript
transcriptTextEl.value = result.transcript || '';
const chunksInfo = result.chunks_failed > 0
? `${result.chunks_processed}/${result.chunks_total} 片段成功`
: `${result.chunks_processed} 片段`;
uploadProgressText.textContent = `轉錄完成!(${chunksInfo}, ${Math.round(result.total_duration_seconds)}秒)`;
// Auto-hide progress after 3 seconds
setTimeout(() => {
uploadProgressEl.classList.remove('active');
}, 3000);
} catch (error) {
alert('Error transcribing audio: ' + error.message);
uploadProgressEl.classList.remove('active');
} finally {
uploadAudioBtn.disabled = false;
recordBtn.disabled = false;
audioFileInput.value = '';
}
});
// === Streaming Event Handlers (legacy, kept for future use) ===
window.electronAPI.onTranscriptionSegment((segment) => {
console.log('Received segment:', segment);

View File

@@ -141,6 +141,231 @@ export async function summarizeTranscript(transcript) {
});
}
export async function transcribeAudio(file, onProgress = null) {
const url = `${API_BASE_URL}/ai/transcribe-audio-stream`;
const formData = new FormData();
formData.append("file", file);
const token = getToken();
return new Promise((resolve, reject) => {
// Use fetch for SSE support
fetch(url, {
method: "POST",
// Only set Authorization when a token exists; an undefined header
// value would be serialized as the literal string "undefined"
headers: token ? { Authorization: `Bearer ${token}` } : {},
body: formData,
})
.then((response) => {
if (response.status === 401) {
clearToken();
window.electronAPI?.navigate("login");
throw new Error("Session expired, please login again");
}
if (!response.ok) {
return response.json().then((error) => {
throw new Error(error.detail || `HTTP error ${response.status}`);
});
}
if (onProgress) {
onProgress({ phase: "processing", progress: 0, message: "處理中..." });
}
// Read SSE stream
const reader = response.body.getReader();
const decoder = new TextDecoder();
let buffer = "";
let result = null;
let totalChunks = 0;
let processedChunks = 0;
function processLine(line) {
if (line.startsWith("data: ")) {
try {
const data = JSON.parse(line.slice(6));
switch (data.event) {
case "start":
case "segmenting":
if (onProgress) {
onProgress({
phase: "processing",
progress: 5,
message: data.message,
});
}
break;
case "segments_ready":
totalChunks = data.total;
if (onProgress) {
onProgress({
phase: "transcribing",
progress: 10,
total: totalChunks,
current: 0,
message: data.message,
});
}
break;
case "chunk_start":
if (onProgress) {
const progress = 10 + ((data.chunk - 1) / totalChunks) * 85;
onProgress({
phase: "transcribing",
progress: Math.round(progress),
total: totalChunks,
current: data.chunk,
message: data.message,
});
}
break;
case "chunk_done":
processedChunks++;
if (onProgress) {
const progress = 10 + (data.chunk / totalChunks) * 85;
onProgress({
phase: "transcribing",
progress: Math.round(progress),
total: totalChunks,
current: data.chunk,
message: data.message,
});
}
break;
case "chunk_error":
console.warn(`Chunk ${data.chunk} error: ${data.message}`);
break;
case "error":
throw new Error(data.message);
case "complete":
result = {
transcript: data.transcript,
chunks_processed: data.chunks_processed,
chunks_total: data.chunks_total,
total_duration_seconds: data.duration,
language: "zh",
};
if (onProgress) {
onProgress({
phase: "complete",
progress: 100,
message: "轉錄完成",
});
}
break;
}
} catch (e) {
console.warn("SSE parse error:", e, line);
}
}
}
function read() {
reader
.read()
.then(({ done, value }) => {
if (done) {
// Process any remaining buffer
if (buffer.trim()) {
buffer.split("\n").forEach(processLine);
}
if (result) {
resolve(result);
} else {
reject(new Error("Transcription failed - no result received"));
}
return;
}
buffer += decoder.decode(value, { stream: true });
const lines = buffer.split("\n");
buffer = lines.pop() || ""; // Keep incomplete line in buffer
lines.forEach(processLine);
read();
})
.catch(reject);
}
read();
})
.catch(reject);
});
}
// Legacy non-streaming version (fallback)
export async function transcribeAudioLegacy(file, onProgress = null) {
const url = `${API_BASE_URL}/ai/transcribe-audio`;
const formData = new FormData();
formData.append("file", file);
const token = getToken();
return new Promise((resolve, reject) => {
const xhr = new XMLHttpRequest();
xhr.upload.addEventListener("progress", (event) => {
if (event.lengthComputable && onProgress) {
const percentComplete = Math.round((event.loaded / event.total) * 100);
onProgress({ phase: "uploading", progress: percentComplete });
}
});
xhr.addEventListener("load", () => {
if (xhr.status >= 200 && xhr.status < 300) {
try {
const response = JSON.parse(xhr.responseText);
resolve(response);
} catch (e) {
reject(new Error("Invalid response format"));
}
} else if (xhr.status === 401) {
clearToken();
window.electronAPI?.navigate("login");
reject(new Error("Session expired, please login again"));
} else {
try {
const error = JSON.parse(xhr.responseText);
reject(new Error(error.detail || `HTTP error ${xhr.status}`));
} catch (e) {
reject(new Error(`HTTP error ${xhr.status}`));
}
}
});
xhr.addEventListener("error", () => {
reject(new Error("Network error"));
});
xhr.addEventListener("timeout", () => {
reject(new Error("Request timeout"));
});
xhr.open("POST", url, true);
xhr.timeout = 600000; // 10 minutes for large files
if (token) {
xhr.setRequestHeader("Authorization", `Bearer ${token}`);
}
xhr.send(formData);
// Notify processing phase after upload completes
if (onProgress) {
xhr.upload.addEventListener("loadend", () => {
onProgress({ phase: "processing", progress: 0 });
});
}
});
}
// Export API
export async function exportMeeting(id) {
return request(`/meetings/${id}/export`, {

View File

@@ -3,3 +3,5 @@ faster-whisper>=1.0.0
opencc-python-reimplemented>=0.1.7
numpy>=1.26.0
onnxruntime>=1.16.0
pydub>=0.25.0
audioop-lts>=0.2.1 # Required for Python 3.13+ (audioop removed from stdlib)

View File

@@ -20,9 +20,10 @@ import tempfile
import base64
import uuid
import re
import wave
import urllib.request
from pathlib import Path
from typing import Optional, List, Tuple
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
@@ -105,8 +106,7 @@ class SileroVAD:
def __init__(self, model_path: Optional[str] = None, threshold: float = 0.5):
self.threshold = threshold
self.session = None
self._state = np.zeros((2, 1, 128), dtype=np.float32)
self.sample_rate = 16000
if not ONNX_AVAILABLE:
@@ -141,8 +141,7 @@ class SileroVAD:
def reset_states(self):
"""Reset hidden states."""
self._state = np.zeros((2, 1, 128), dtype=np.float32)
def __call__(self, audio: np.ndarray) -> float:
"""Run VAD on audio chunk, return speech probability."""
@@ -153,15 +152,14 @@ class SileroVAD:
if audio.ndim == 1:
audio = audio[np.newaxis, :]
# Run inference with updated model format
ort_inputs = {
'input': audio.astype(np.float32),
'state': self._state,
'sr': np.array(self.sample_rate, dtype=np.int64)
}
output, self._state = self.session.run(None, ort_inputs)
return float(output[0][0])
@@ -406,6 +404,193 @@ class Transcriber:
print(json.dumps({"error": f"Transcription error: {e}"}), file=sys.stderr) print(json.dumps({"error": f"Transcription error: {e}"}), file=sys.stderr)
return "" return ""
def segment_audio_file(
self,
audio_path: str,
max_chunk_seconds: int = 300,
min_silence_ms: int = 500,
output_dir: Optional[str] = None
) -> dict:
"""
Segment an audio file using VAD for natural speech boundaries.
Args:
audio_path: Path to the audio file
max_chunk_seconds: Maximum duration per chunk (default 5 minutes)
min_silence_ms: Minimum silence duration to consider as boundary (default 500ms)
output_dir: Directory to save chunks (default: temp directory)
Returns:
dict with segments list and metadata
"""
try:
# Import audio processing libraries
try:
from pydub import AudioSegment
except ImportError:
return {"error": "pydub not installed. Run: pip install pydub"}
if not os.path.exists(audio_path):
return {"error": f"File not found: {audio_path}"}
# Create output directory
if output_dir is None:
output_dir = tempfile.mkdtemp(prefix="audio_segments_")
else:
os.makedirs(output_dir, exist_ok=True)
# Load audio file and convert to mono 16kHz
print(json.dumps({"status": "loading_audio", "file": audio_path}), file=sys.stderr)
audio = AudioSegment.from_file(audio_path)
audio = audio.set_channels(1).set_frame_rate(16000)
total_duration_ms = len(audio)
total_duration_sec = total_duration_ms / 1000
print(json.dumps({
"status": "audio_loaded",
"duration_seconds": total_duration_sec
}), file=sys.stderr)
# Convert to numpy for VAD processing
samples = np.array(audio.get_array_of_samples(), dtype=np.float32) / 32768.0
# Run VAD to detect speech regions
segments = []
current_start = 0
max_chunk_samples = max_chunk_seconds * 16000
min_silence_samples = int(min_silence_ms * 16)  # 16 samples per ms at 16kHz
if self.vad_model is None or self.vad_model.session is None:
# No VAD available, use fixed-time splitting
print(json.dumps({"warning": "VAD not available, using fixed-time splitting"}), file=sys.stderr)
chunk_idx = 0
for start_sample in range(0, len(samples), max_chunk_samples):
end_sample = min(start_sample + max_chunk_samples, len(samples))
chunk_samples = samples[start_sample:end_sample]
# Export chunk
chunk_path = os.path.join(output_dir, f"chunk_{chunk_idx:03d}.wav")
self._export_wav(chunk_samples, chunk_path)
segments.append({
"index": chunk_idx,
"path": chunk_path,
"start": start_sample / 16000,
"end": end_sample / 16000,
"duration": (end_sample - start_sample) / 16000
})
chunk_idx += 1
else:
# Use VAD for intelligent splitting
print(json.dumps({"status": "running_vad"}), file=sys.stderr)
self.vad_model.reset_states()
# Find silence regions for splitting
window_size = 512
silence_starts = []
in_silence = False
silence_start = 0
for i in range(0, len(samples) - window_size, window_size):
window = samples[i:i + window_size]
speech_prob = self.vad_model(window)
if speech_prob < 0.3: # Silence threshold
if not in_silence:
in_silence = True
silence_start = i
else:
if in_silence:
silence_duration = i - silence_start
if silence_duration >= min_silence_samples:
# Mark middle of silence as potential split point
silence_starts.append(silence_start + silence_duration // 2)
in_silence = False
# Add end of file as final split point
silence_starts.append(len(samples))
# Create segments based on silence boundaries
chunk_idx = 0
current_start = 0
for split_point in silence_starts:
# Check if we need to split here
chunk_duration = split_point - current_start
if chunk_duration >= max_chunk_samples or split_point == len(samples):
# Find the best split point before max duration
if chunk_duration > max_chunk_samples:
# Find nearest silence point before max
best_split = current_start + max_chunk_samples
for sp in silence_starts:
if current_start < sp <= current_start + max_chunk_samples:
best_split = sp
split_point = best_split
# Export chunk
chunk_samples = samples[current_start:split_point]
if len(chunk_samples) > 8000: # At least 0.5 seconds
chunk_path = os.path.join(output_dir, f"chunk_{chunk_idx:03d}.wav")
self._export_wav(chunk_samples, chunk_path)
segments.append({
"index": chunk_idx,
"path": chunk_path,
"start": current_start / 16000,
"end": split_point / 16000,
"duration": (split_point - current_start) / 16000
})
chunk_idx += 1
current_start = split_point
# Handle any remaining audio - split into max_chunk_samples pieces
while current_start < len(samples):
remaining_len = len(samples) - current_start
if remaining_len < 8000: # Less than 0.5 seconds
break
# Determine chunk end (respect max_chunk_samples)
chunk_end = min(current_start + max_chunk_samples, len(samples))
chunk_samples = samples[current_start:chunk_end]
chunk_path = os.path.join(output_dir, f"chunk_{chunk_idx:03d}.wav")
self._export_wav(chunk_samples, chunk_path)
segments.append({
"index": chunk_idx,
"path": chunk_path,
"start": current_start / 16000,
"end": chunk_end / 16000,
"duration": len(chunk_samples) / 16000
})
chunk_idx += 1
current_start = chunk_end
print(json.dumps({
"status": "segmentation_complete",
"total_segments": len(segments)
}), file=sys.stderr)
return {
"status": "success",
"segments": segments,
"total_segments": len(segments),
"total_duration": total_duration_sec,
"output_dir": output_dir
}
except Exception as e:
return {"error": f"Segmentation error: {str(e)}"}
def _export_wav(self, samples: np.ndarray, output_path: str):
"""Export numpy samples to WAV file."""
with wave.open(output_path, 'wb') as wf:
wf.setnchannels(1)
wf.setsampwidth(2)
wf.setframerate(16000)
# Clip before casting so full-scale samples don't wrap around in int16
wf.writeframes(np.clip(samples * 32768, -32768, 32767).astype(np.int16).tobytes())
def handle_command(self, cmd: dict) -> Optional[dict]:
"""Handle a JSON command."""
action = cmd.get("action")
@@ -447,6 +632,21 @@ class Transcriber:
self.streaming_session = None
return result
elif action == "segment_audio":
# Segment audio file using VAD
file_path = cmd.get("file_path")
if not file_path:
return {"error": "No file_path specified"}
max_chunk_seconds = cmd.get("max_chunk_seconds", 300)
min_silence_ms = cmd.get("min_silence_ms", 500)
output_dir = cmd.get("output_dir")
return self.segment_audio_file(
file_path,
max_chunk_seconds=max_chunk_seconds,
min_silence_ms=min_silence_ms,
output_dir=output_dir
)
elif action == "ping": elif action == "ping":
return {"status": "pong"} return {"status": "pong"}

View File

@@ -173,9 +173,23 @@ start_backend() {
local backend_pid=$!
echo "BACKEND_PID=$backend_pid" >> "$PID_FILE"
# Wait for startup (up to 15 seconds)
local max_wait=15
local waited=0
log_info "等待後端服務啟動..."
while [ $waited -lt $max_wait ]; do
sleep 1
waited=$((waited + 1))
# Check health status
if curl -s http://localhost:$BACKEND_PORT/api/health > /dev/null 2>&1; then
log_success "後端服務已啟動 (PID: $backend_pid, Port: $BACKEND_PORT)"
return 0
fi
done
# Check the port status one last time
if check_port $BACKEND_PORT; then
log_success "後端服務已啟動 (PID: $backend_pid, Port: $BACKEND_PORT)"
else