feat: Add Dify audio transcription with VAD chunking and SSE progress
- Add audio file upload transcription via Dify STT API - Implement VAD-based audio segmentation in sidecar (3-min chunks) - Add SSE endpoint for real-time transcription progress updates - Fix chunk size enforcement for reliable uploads - Add retry logic with exponential backoff for API calls - Support Python 3.13+ with audioop-lts package - Update frontend with Chinese progress messages and chunk display - Improve start.sh health check with retry loop 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -9,6 +9,7 @@ DB_NAME=db_A060
|
|||||||
AUTH_API_URL=https://pj-auth-api.vercel.app/api/auth/login
|
AUTH_API_URL=https://pj-auth-api.vercel.app/api/auth/login
|
||||||
DIFY_API_URL=https://dify.theaken.com/v1
|
DIFY_API_URL=https://dify.theaken.com/v1
|
||||||
DIFY_API_KEY=app-xxxxxxxxxxx
|
DIFY_API_KEY=app-xxxxxxxxxxx
|
||||||
|
DIFY_STT_API_KEY=app-xxxxxxxxxxx
|
||||||
|
|
||||||
# Application Settings
|
# Application Settings
|
||||||
ADMIN_EMAIL=ymirliu@panjit.com.tw
|
ADMIN_EMAIL=ymirliu@panjit.com.tw
|
||||||
|
|||||||
@@ -16,6 +16,7 @@ class Settings:
|
|||||||
)
|
)
|
||||||
DIFY_API_URL: str = os.getenv("DIFY_API_URL", "https://dify.theaken.com/v1")
|
DIFY_API_URL: str = os.getenv("DIFY_API_URL", "https://dify.theaken.com/v1")
|
||||||
DIFY_API_KEY: str = os.getenv("DIFY_API_KEY", "")
|
DIFY_API_KEY: str = os.getenv("DIFY_API_KEY", "")
|
||||||
|
DIFY_STT_API_KEY: str = os.getenv("DIFY_STT_API_KEY", "")
|
||||||
|
|
||||||
ADMIN_EMAIL: str = os.getenv("ADMIN_EMAIL", "ymirliu@panjit.com.tw")
|
ADMIN_EMAIL: str = os.getenv("ADMIN_EMAIL", "ymirliu@panjit.com.tw")
|
||||||
JWT_SECRET: str = os.getenv("JWT_SECRET", "meeting-assistant-secret")
|
JWT_SECRET: str = os.getenv("JWT_SECRET", "meeting-assistant-secret")
|
||||||
|
|||||||
@@ -1,11 +1,22 @@
|
|||||||
from fastapi import APIRouter, HTTPException, Depends
|
from fastapi import APIRouter, HTTPException, Depends, UploadFile, File
|
||||||
|
from fastapi.responses import StreamingResponse
|
||||||
import httpx
|
import httpx
|
||||||
import json
|
import json
|
||||||
|
import os
|
||||||
|
import tempfile
|
||||||
|
import subprocess
|
||||||
|
import shutil
|
||||||
|
import asyncio
|
||||||
|
from typing import Optional, AsyncGenerator
|
||||||
|
|
||||||
from ..config import settings
|
from ..config import settings
|
||||||
from ..models import SummarizeRequest, SummarizeResponse, ActionItemCreate, TokenPayload
|
from ..models import SummarizeRequest, SummarizeResponse, ActionItemCreate, TokenPayload
|
||||||
from .auth import get_current_user
|
from .auth import get_current_user
|
||||||
|
|
||||||
|
# Supported audio formats
|
||||||
|
SUPPORTED_AUDIO_FORMATS = {".mp3", ".wav", ".m4a", ".webm", ".ogg", ".flac", ".aac"}
|
||||||
|
MAX_FILE_SIZE = 500 * 1024 * 1024 # 500MB
|
||||||
|
|
||||||
router = APIRouter()
|
router = APIRouter()
|
||||||
|
|
||||||
|
|
||||||
@@ -74,6 +85,9 @@ def parse_dify_response(answer: str) -> dict:
|
|||||||
Parse Dify response to extract conclusions and action items.
|
Parse Dify response to extract conclusions and action items.
|
||||||
Attempts JSON parsing first, then falls back to text parsing.
|
Attempts JSON parsing first, then falls back to text parsing.
|
||||||
"""
|
"""
|
||||||
|
print(f"[Dify Summarize] Raw answer length: {len(answer)} chars")
|
||||||
|
print(f"[Dify Summarize] Raw answer preview: {answer[:500]}...")
|
||||||
|
|
||||||
# Try to find JSON in the response
|
# Try to find JSON in the response
|
||||||
try:
|
try:
|
||||||
# Look for JSON block
|
# Look for JSON block
|
||||||
@@ -90,13 +104,424 @@ def parse_dify_response(answer: str) -> dict:
|
|||||||
raise ValueError("No JSON found")
|
raise ValueError("No JSON found")
|
||||||
|
|
||||||
data = json.loads(json_str)
|
data = json.loads(json_str)
|
||||||
|
print(f"[Dify Summarize] Parsed JSON keys: {list(data.keys())}")
|
||||||
|
print(f"[Dify Summarize] conclusions count: {len(data.get('conclusions', []))}")
|
||||||
|
print(f"[Dify Summarize] action_items count: {len(data.get('action_items', []))}")
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"conclusions": data.get("conclusions", []),
|
"conclusions": data.get("conclusions", []),
|
||||||
"action_items": data.get("action_items", []),
|
"action_items": data.get("action_items", []),
|
||||||
}
|
}
|
||||||
except (ValueError, json.JSONDecodeError):
|
except (ValueError, json.JSONDecodeError) as e:
|
||||||
|
print(f"[Dify Summarize] JSON parse failed: {e}")
|
||||||
# Fallback: return raw answer as single conclusion
|
# Fallback: return raw answer as single conclusion
|
||||||
return {
|
return {
|
||||||
"conclusions": [answer] if answer else [],
|
"conclusions": [answer] if answer else [],
|
||||||
"action_items": [],
|
"action_items": [],
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("/ai/transcribe-audio")
|
||||||
|
async def transcribe_audio(
|
||||||
|
file: UploadFile = File(...),
|
||||||
|
current_user: TokenPayload = Depends(get_current_user)
|
||||||
|
):
|
||||||
|
"""
|
||||||
|
Transcribe an uploaded audio file using Dify STT service.
|
||||||
|
Large files are automatically chunked using VAD segmentation.
|
||||||
|
"""
|
||||||
|
if not settings.DIFY_STT_API_KEY:
|
||||||
|
raise HTTPException(status_code=503, detail="Dify STT API not configured")
|
||||||
|
|
||||||
|
# Validate file extension
|
||||||
|
file_ext = os.path.splitext(file.filename or "")[1].lower()
|
||||||
|
if file_ext not in SUPPORTED_AUDIO_FORMATS:
|
||||||
|
raise HTTPException(
|
||||||
|
status_code=400,
|
||||||
|
detail=f"Unsupported audio format. Supported: {', '.join(SUPPORTED_AUDIO_FORMATS)}"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Create temp directory for processing
|
||||||
|
temp_dir = tempfile.mkdtemp(prefix="transcribe_")
|
||||||
|
temp_file_path = os.path.join(temp_dir, f"upload{file_ext}")
|
||||||
|
|
||||||
|
try:
|
||||||
|
# Save uploaded file
|
||||||
|
file_size = 0
|
||||||
|
with open(temp_file_path, "wb") as f:
|
||||||
|
while chunk := await file.read(1024 * 1024): # 1MB chunks
|
||||||
|
file_size += len(chunk)
|
||||||
|
if file_size > MAX_FILE_SIZE:
|
||||||
|
raise HTTPException(
|
||||||
|
status_code=413,
|
||||||
|
detail=f"File too large. Maximum size: {MAX_FILE_SIZE // (1024*1024)}MB"
|
||||||
|
)
|
||||||
|
f.write(chunk)
|
||||||
|
|
||||||
|
print(f"[Transcribe] Saved uploaded file: {temp_file_path}, size: {file_size} bytes")
|
||||||
|
|
||||||
|
# Call sidecar to segment audio
|
||||||
|
segments = await segment_audio_with_sidecar(temp_file_path, temp_dir)
|
||||||
|
|
||||||
|
if "error" in segments:
|
||||||
|
raise HTTPException(status_code=500, detail=segments["error"])
|
||||||
|
|
||||||
|
segment_list = segments.get("segments", [])
|
||||||
|
total_segments = len(segment_list)
|
||||||
|
|
||||||
|
print(f"[Transcribe] Segmentation complete: {total_segments} chunks created")
|
||||||
|
for seg in segment_list:
|
||||||
|
print(f" - Chunk {seg.get('index')}: {seg.get('path')} ({seg.get('duration', 0):.1f}s)")
|
||||||
|
|
||||||
|
if total_segments == 0:
|
||||||
|
raise HTTPException(status_code=400, detail="No audio content detected")
|
||||||
|
|
||||||
|
# Transcribe each chunk via Dify STT
|
||||||
|
transcriptions = []
|
||||||
|
failed_chunks = []
|
||||||
|
async with httpx.AsyncClient() as client:
|
||||||
|
for i, segment in enumerate(segment_list):
|
||||||
|
chunk_path = segment.get("path")
|
||||||
|
chunk_index = segment.get("index", i)
|
||||||
|
|
||||||
|
print(f"[Transcribe] Processing chunk {chunk_index + 1}/{total_segments}: {chunk_path}")
|
||||||
|
|
||||||
|
if not chunk_path:
|
||||||
|
print(f"[Transcribe] ERROR: Chunk {chunk_index} has no path!")
|
||||||
|
failed_chunks.append(chunk_index)
|
||||||
|
continue
|
||||||
|
|
||||||
|
if not os.path.exists(chunk_path):
|
||||||
|
print(f"[Transcribe] ERROR: Chunk file does not exist: {chunk_path}")
|
||||||
|
failed_chunks.append(chunk_index)
|
||||||
|
continue
|
||||||
|
|
||||||
|
chunk_size = os.path.getsize(chunk_path)
|
||||||
|
print(f"[Transcribe] Chunk {chunk_index} exists, size: {chunk_size} bytes")
|
||||||
|
|
||||||
|
# Call Dify STT API with retry
|
||||||
|
text = await transcribe_chunk_with_dify(
|
||||||
|
client, chunk_path, current_user.email
|
||||||
|
)
|
||||||
|
if text:
|
||||||
|
print(f"[Transcribe] Chunk {chunk_index} transcribed: {len(text)} chars")
|
||||||
|
transcriptions.append(text)
|
||||||
|
else:
|
||||||
|
print(f"[Transcribe] Chunk {chunk_index} transcription failed (no text returned)")
|
||||||
|
failed_chunks.append(chunk_index)
|
||||||
|
|
||||||
|
# Concatenate all transcriptions
|
||||||
|
final_transcript = " ".join(transcriptions)
|
||||||
|
|
||||||
|
print(f"[Transcribe] Complete: {len(transcriptions)}/{total_segments} chunks transcribed")
|
||||||
|
if failed_chunks:
|
||||||
|
print(f"[Transcribe] Failed chunks: {failed_chunks}")
|
||||||
|
|
||||||
|
return {
|
||||||
|
"transcript": final_transcript,
|
||||||
|
"chunks_processed": len(transcriptions),
|
||||||
|
"chunks_total": total_segments,
|
||||||
|
"chunks_failed": len(failed_chunks),
|
||||||
|
"total_duration_seconds": segments.get("total_duration", 0),
|
||||||
|
"language": "zh"
|
||||||
|
}
|
||||||
|
|
||||||
|
finally:
|
||||||
|
# Clean up temp files
|
||||||
|
shutil.rmtree(temp_dir, ignore_errors=True)
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("/ai/transcribe-audio-stream")
|
||||||
|
async def transcribe_audio_stream(
|
||||||
|
file: UploadFile = File(...),
|
||||||
|
current_user: TokenPayload = Depends(get_current_user)
|
||||||
|
):
|
||||||
|
"""
|
||||||
|
Transcribe an uploaded audio file with real-time progress via SSE.
|
||||||
|
Returns Server-Sent Events for progress updates.
|
||||||
|
"""
|
||||||
|
if not settings.DIFY_STT_API_KEY:
|
||||||
|
raise HTTPException(status_code=503, detail="Dify STT API not configured")
|
||||||
|
|
||||||
|
# Validate file extension
|
||||||
|
file_ext = os.path.splitext(file.filename or "")[1].lower()
|
||||||
|
if file_ext not in SUPPORTED_AUDIO_FORMATS:
|
||||||
|
raise HTTPException(
|
||||||
|
status_code=400,
|
||||||
|
detail=f"Unsupported audio format. Supported: {', '.join(SUPPORTED_AUDIO_FORMATS)}"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Read file into memory for streaming
|
||||||
|
file_content = await file.read()
|
||||||
|
if len(file_content) > MAX_FILE_SIZE:
|
||||||
|
raise HTTPException(
|
||||||
|
status_code=413,
|
||||||
|
detail=f"File too large. Maximum size: {MAX_FILE_SIZE // (1024*1024)}MB"
|
||||||
|
)
|
||||||
|
|
||||||
|
async def generate_progress() -> AsyncGenerator[str, None]:
|
||||||
|
temp_dir = tempfile.mkdtemp(prefix="transcribe_")
|
||||||
|
temp_file_path = os.path.join(temp_dir, f"upload{file_ext}")
|
||||||
|
|
||||||
|
try:
|
||||||
|
# Save file
|
||||||
|
with open(temp_file_path, "wb") as f:
|
||||||
|
f.write(file_content)
|
||||||
|
|
||||||
|
yield f"data: {json.dumps({'event': 'start', 'message': '音訊檔案已接收,開始處理...'})}\n\n"
|
||||||
|
|
||||||
|
# Segment audio
|
||||||
|
yield f"data: {json.dumps({'event': 'segmenting', 'message': '正在分析音訊並分割片段...'})}\n\n"
|
||||||
|
|
||||||
|
segments = await segment_audio_with_sidecar(temp_file_path, temp_dir)
|
||||||
|
|
||||||
|
if "error" in segments:
|
||||||
|
yield f"data: {json.dumps({'event': 'error', 'message': segments['error']})}\n\n"
|
||||||
|
return
|
||||||
|
|
||||||
|
segment_list = segments.get("segments", [])
|
||||||
|
total_segments = len(segment_list)
|
||||||
|
total_duration = segments.get("total_duration", 0)
|
||||||
|
|
||||||
|
if total_segments == 0:
|
||||||
|
yield f"data: {json.dumps({'event': 'error', 'message': '未檢測到音訊內容'})}\n\n"
|
||||||
|
return
|
||||||
|
|
||||||
|
yield f"data: {json.dumps({'event': 'segments_ready', 'total': total_segments, 'duration': total_duration, 'message': f'分割完成,共 {total_segments} 個片段'})}\n\n"
|
||||||
|
|
||||||
|
# Transcribe each chunk
|
||||||
|
transcriptions = []
|
||||||
|
async with httpx.AsyncClient() as client:
|
||||||
|
for i, segment in enumerate(segment_list):
|
||||||
|
chunk_path = segment.get("path")
|
||||||
|
chunk_index = segment.get("index", i)
|
||||||
|
chunk_duration = segment.get("duration", 0)
|
||||||
|
|
||||||
|
yield f"data: {json.dumps({'event': 'chunk_start', 'chunk': chunk_index + 1, 'total': total_segments, 'duration': chunk_duration, 'message': f'正在轉錄片段 {chunk_index + 1}/{total_segments}...'})}\n\n"
|
||||||
|
|
||||||
|
if not chunk_path or not os.path.exists(chunk_path):
|
||||||
|
yield f"data: {json.dumps({'event': 'chunk_error', 'chunk': chunk_index + 1, 'message': f'片段 {chunk_index + 1} 檔案不存在'})}\n\n"
|
||||||
|
continue
|
||||||
|
|
||||||
|
text = await transcribe_chunk_with_dify(
|
||||||
|
client, chunk_path, current_user.email
|
||||||
|
)
|
||||||
|
|
||||||
|
if text:
|
||||||
|
transcriptions.append(text)
|
||||||
|
yield f"data: {json.dumps({'event': 'chunk_done', 'chunk': chunk_index + 1, 'total': total_segments, 'text_length': len(text), 'message': f'片段 {chunk_index + 1} 完成'})}\n\n"
|
||||||
|
else:
|
||||||
|
yield f"data: {json.dumps({'event': 'chunk_error', 'chunk': chunk_index + 1, 'message': f'片段 {chunk_index + 1} 轉錄失敗'})}\n\n"
|
||||||
|
|
||||||
|
# Final result
|
||||||
|
final_transcript = " ".join(transcriptions)
|
||||||
|
yield f"data: {json.dumps({'event': 'complete', 'transcript': final_transcript, 'chunks_processed': len(transcriptions), 'chunks_total': total_segments, 'duration': total_duration})}\n\n"
|
||||||
|
|
||||||
|
finally:
|
||||||
|
shutil.rmtree(temp_dir, ignore_errors=True)
|
||||||
|
|
||||||
|
return StreamingResponse(
|
||||||
|
generate_progress(),
|
||||||
|
media_type="text/event-stream",
|
||||||
|
headers={
|
||||||
|
"Cache-Control": "no-cache",
|
||||||
|
"Connection": "keep-alive",
|
||||||
|
"X-Accel-Buffering": "no"
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
async def segment_audio_with_sidecar(audio_path: str, output_dir: str) -> dict:
|
||||||
|
"""Call sidecar to segment audio file using VAD."""
|
||||||
|
# Find sidecar script
|
||||||
|
sidecar_dir = os.path.join(os.path.dirname(__file__), "..", "..", "..", "sidecar")
|
||||||
|
sidecar_script = os.path.join(sidecar_dir, "transcriber.py")
|
||||||
|
venv_python = os.path.join(sidecar_dir, "venv", "bin", "python")
|
||||||
|
|
||||||
|
# Use venv python if available, otherwise system python
|
||||||
|
python_cmd = venv_python if os.path.exists(venv_python) else "python3"
|
||||||
|
|
||||||
|
if not os.path.exists(sidecar_script):
|
||||||
|
return {"error": "Sidecar not found"}
|
||||||
|
|
||||||
|
try:
|
||||||
|
# Prepare command
|
||||||
|
cmd_input = json.dumps({
|
||||||
|
"action": "segment_audio",
|
||||||
|
"file_path": audio_path,
|
||||||
|
"max_chunk_seconds": 180, # 3 minutes (smaller chunks for reliable upload)
|
||||||
|
"min_silence_ms": 500,
|
||||||
|
"output_dir": output_dir
|
||||||
|
})
|
||||||
|
|
||||||
|
# Run sidecar process
|
||||||
|
process = await asyncio.create_subprocess_exec(
|
||||||
|
python_cmd, sidecar_script,
|
||||||
|
stdin=asyncio.subprocess.PIPE,
|
||||||
|
stdout=asyncio.subprocess.PIPE,
|
||||||
|
stderr=asyncio.subprocess.PIPE,
|
||||||
|
cwd=sidecar_dir
|
||||||
|
)
|
||||||
|
|
||||||
|
# Send command and wait for response
|
||||||
|
stdout, stderr = await asyncio.wait_for(
|
||||||
|
process.communicate(input=f"{cmd_input}\n{{\"action\": \"quit\"}}\n".encode()),
|
||||||
|
timeout=600 # 10 minutes timeout for large files
|
||||||
|
)
|
||||||
|
|
||||||
|
# Parse response (skip status messages, find the segment result)
|
||||||
|
for line in stdout.decode().strip().split('\n'):
|
||||||
|
if line:
|
||||||
|
try:
|
||||||
|
data = json.loads(line)
|
||||||
|
if data.get("status") == "success" or "segments" in data:
|
||||||
|
return data
|
||||||
|
if "error" in data:
|
||||||
|
return data
|
||||||
|
except json.JSONDecodeError:
|
||||||
|
continue
|
||||||
|
|
||||||
|
return {"error": "No valid response from sidecar"}
|
||||||
|
|
||||||
|
except asyncio.TimeoutError:
|
||||||
|
return {"error": "Sidecar timeout during segmentation"}
|
||||||
|
except Exception as e:
|
||||||
|
return {"error": f"Sidecar error: {str(e)}"}
|
||||||
|
|
||||||
|
|
||||||
|
async def upload_file_to_dify(
|
||||||
|
client: httpx.AsyncClient,
|
||||||
|
file_path: str,
|
||||||
|
user_email: str
|
||||||
|
) -> Optional[str]:
|
||||||
|
"""Upload a file to Dify and return the file ID."""
|
||||||
|
try:
|
||||||
|
upload_url = f"{settings.DIFY_API_URL}/files/upload"
|
||||||
|
|
||||||
|
file_size = os.path.getsize(file_path)
|
||||||
|
print(f"[Upload] File: {file_path}, size: {file_size / (1024*1024):.1f} MB")
|
||||||
|
|
||||||
|
# Adjust timeout based on file size (minimum 60s, ~1MB per 5 seconds)
|
||||||
|
timeout_seconds = max(60.0, file_size / (1024 * 1024) * 5)
|
||||||
|
print(f"[Upload] Using timeout: {timeout_seconds:.0f}s")
|
||||||
|
|
||||||
|
with open(file_path, "rb") as f:
|
||||||
|
files = {"file": (os.path.basename(file_path), f, "audio/wav")}
|
||||||
|
response = await client.post(
|
||||||
|
upload_url,
|
||||||
|
headers={
|
||||||
|
"Authorization": f"Bearer {settings.DIFY_STT_API_KEY}",
|
||||||
|
},
|
||||||
|
files=files,
|
||||||
|
data={"user": user_email},
|
||||||
|
timeout=timeout_seconds,
|
||||||
|
)
|
||||||
|
|
||||||
|
print(f"[Upload] Response: {response.status_code}")
|
||||||
|
|
||||||
|
if response.status_code == 201 or response.status_code == 200:
|
||||||
|
data = response.json()
|
||||||
|
file_id = data.get("id")
|
||||||
|
print(f"[Upload] Success, file_id: {file_id}")
|
||||||
|
return file_id
|
||||||
|
|
||||||
|
print(f"[Upload] Error: {response.status_code} - {response.text[:500]}")
|
||||||
|
return None
|
||||||
|
|
||||||
|
except httpx.ReadError as e:
|
||||||
|
print(f"[Upload] Network read error (connection reset): {e}")
|
||||||
|
return None
|
||||||
|
except httpx.TimeoutException as e:
|
||||||
|
print(f"[Upload] Timeout: {e}")
|
||||||
|
return None
|
||||||
|
except Exception as e:
|
||||||
|
import traceback
|
||||||
|
print(f"[Upload] Error: {e}")
|
||||||
|
print(traceback.format_exc())
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
async def transcribe_chunk_with_dify(
|
||||||
|
client: httpx.AsyncClient,
|
||||||
|
chunk_path: str,
|
||||||
|
user_email: str,
|
||||||
|
max_retries: int = 3
|
||||||
|
) -> Optional[str]:
|
||||||
|
"""Transcribe a single audio chunk via Dify chat API with file upload."""
|
||||||
|
for attempt in range(max_retries):
|
||||||
|
try:
|
||||||
|
print(f"[Dify] Attempt {attempt + 1}/{max_retries} for chunk: {chunk_path}")
|
||||||
|
|
||||||
|
# Step 1: Upload file to Dify (with retry inside this attempt)
|
||||||
|
file_id = None
|
||||||
|
for upload_attempt in range(2): # 2 upload attempts per main attempt
|
||||||
|
file_id = await upload_file_to_dify(client, chunk_path, user_email)
|
||||||
|
if file_id:
|
||||||
|
break
|
||||||
|
print(f"[Dify] Upload attempt {upload_attempt + 1} failed, retrying...")
|
||||||
|
await asyncio.sleep(1)
|
||||||
|
|
||||||
|
if not file_id:
|
||||||
|
print(f"[Dify] Failed to upload file after retries: {chunk_path}")
|
||||||
|
if attempt < max_retries - 1:
|
||||||
|
await asyncio.sleep(2 ** attempt)
|
||||||
|
continue
|
||||||
|
return None
|
||||||
|
|
||||||
|
print(f"[Dify] File uploaded, file_id: {file_id}")
|
||||||
|
|
||||||
|
# Step 2: Send chat message with file to request transcription
|
||||||
|
response = await client.post(
|
||||||
|
f"{settings.DIFY_API_URL}/chat-messages",
|
||||||
|
headers={
|
||||||
|
"Authorization": f"Bearer {settings.DIFY_STT_API_KEY}",
|
||||||
|
"Content-Type": "application/json",
|
||||||
|
},
|
||||||
|
json={
|
||||||
|
"inputs": {},
|
||||||
|
"query": "請將這段音檔轉錄成文字,只回傳轉錄的文字內容,不要加任何額外說明。",
|
||||||
|
"response_mode": "blocking",
|
||||||
|
"user": user_email,
|
||||||
|
"files": [
|
||||||
|
{
|
||||||
|
"type": "audio",
|
||||||
|
"transfer_method": "local_file",
|
||||||
|
"upload_file_id": file_id
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
timeout=300.0, # 5 minutes per chunk (increased for longer segments)
|
||||||
|
)
|
||||||
|
|
||||||
|
print(f"[Dify] Chat response: {response.status_code}")
|
||||||
|
|
||||||
|
if response.status_code == 200:
|
||||||
|
data = response.json()
|
||||||
|
answer = data.get("answer", "")
|
||||||
|
print(f"[Dify] Transcription success, length: {len(answer)} chars")
|
||||||
|
return answer
|
||||||
|
|
||||||
|
# Retry on server errors or rate limits
|
||||||
|
if response.status_code >= 500 or response.status_code == 429:
|
||||||
|
print(f"[Dify] Server error {response.status_code}, will retry...")
|
||||||
|
if attempt < max_retries - 1:
|
||||||
|
wait_time = 2 ** attempt
|
||||||
|
if response.status_code == 429:
|
||||||
|
wait_time = 10 # Wait longer for rate limits
|
||||||
|
await asyncio.sleep(wait_time)
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Log error but don't fail entire transcription
|
||||||
|
print(f"[Dify] Chat error for chunk: {response.status_code} - {response.text[:500]}")
|
||||||
|
return None
|
||||||
|
|
||||||
|
except httpx.TimeoutException:
|
||||||
|
if attempt < max_retries - 1:
|
||||||
|
await asyncio.sleep(2 ** attempt)
|
||||||
|
continue
|
||||||
|
return None
|
||||||
|
except Exception as e:
|
||||||
|
print(f"Chunk transcription error: {e}")
|
||||||
|
return None
|
||||||
|
|
||||||
|
return None
|
||||||
|
|||||||
@@ -4,6 +4,7 @@ python-dotenv>=1.0.0
|
|||||||
mysql-connector-python>=9.0.0
|
mysql-connector-python>=9.0.0
|
||||||
pydantic>=2.10.0
|
pydantic>=2.10.0
|
||||||
httpx>=0.27.0
|
httpx>=0.27.0
|
||||||
|
python-multipart>=0.0.9
|
||||||
python-jose[cryptography]>=3.3.0
|
python-jose[cryptography]>=3.3.0
|
||||||
openpyxl>=3.1.2
|
openpyxl>=3.1.2
|
||||||
pytest>=8.0.0
|
pytest>=8.0.0
|
||||||
|
|||||||
Binary file not shown.
@@ -96,6 +96,33 @@
|
|||||||
color: #666;
|
color: #666;
|
||||||
font-style: italic;
|
font-style: italic;
|
||||||
}
|
}
|
||||||
|
.upload-progress {
|
||||||
|
display: none;
|
||||||
|
padding: 10px 15px;
|
||||||
|
background: #fff3e0;
|
||||||
|
border-radius: 6px;
|
||||||
|
margin-bottom: 10px;
|
||||||
|
}
|
||||||
|
.upload-progress.active {
|
||||||
|
display: block;
|
||||||
|
}
|
||||||
|
.upload-progress-bar {
|
||||||
|
height: 6px;
|
||||||
|
background: #e0e0e0;
|
||||||
|
border-radius: 3px;
|
||||||
|
overflow: hidden;
|
||||||
|
margin-top: 8px;
|
||||||
|
}
|
||||||
|
.upload-progress-fill {
|
||||||
|
height: 100%;
|
||||||
|
background: #ff9800;
|
||||||
|
width: 0%;
|
||||||
|
transition: width 0.3s ease;
|
||||||
|
}
|
||||||
|
.upload-progress-text {
|
||||||
|
font-size: 13px;
|
||||||
|
color: #e65100;
|
||||||
|
}
|
||||||
.transcript-textarea {
|
.transcript-textarea {
|
||||||
width: 100%;
|
width: 100%;
|
||||||
min-height: 400px;
|
min-height: 400px;
|
||||||
@@ -143,8 +170,10 @@
|
|||||||
<div class="panel">
|
<div class="panel">
|
||||||
<div class="panel-header">
|
<div class="panel-header">
|
||||||
<span>Transcript (逐字稿)</span>
|
<span>Transcript (逐字稿)</span>
|
||||||
<div class="recording-controls" style="padding: 0;">
|
<div class="recording-controls" style="padding: 0; display: flex; gap: 8px;">
|
||||||
<button class="btn btn-danger" id="record-btn">Start Recording</button>
|
<button class="btn btn-danger" id="record-btn">Start Recording</button>
|
||||||
|
<button class="btn btn-secondary" id="upload-audio-btn">Upload Audio</button>
|
||||||
|
<input type="file" id="audio-file-input" accept=".mp3,.wav,.m4a,.webm,.ogg,.flac,.aac" style="display: none;">
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
<div class="panel-body">
|
<div class="panel-body">
|
||||||
@@ -155,6 +184,14 @@
|
|||||||
<span class="segment-count" id="segment-count">Segments: 0</span>
|
<span class="segment-count" id="segment-count">Segments: 0</span>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
<!-- Upload Progress -->
|
||||||
|
<div id="upload-progress" class="upload-progress">
|
||||||
|
<span class="upload-progress-text" id="upload-progress-text">Uploading...</span>
|
||||||
|
<div class="upload-progress-bar">
|
||||||
|
<div class="upload-progress-fill" id="upload-progress-fill"></div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
<!-- Single Transcript Textarea -->
|
<!-- Single Transcript Textarea -->
|
||||||
<div id="transcript-container">
|
<div id="transcript-container">
|
||||||
<textarea
|
<textarea
|
||||||
@@ -203,7 +240,8 @@
|
|||||||
updateMeeting,
|
updateMeeting,
|
||||||
deleteMeeting,
|
deleteMeeting,
|
||||||
exportMeeting,
|
exportMeeting,
|
||||||
summarizeTranscript
|
summarizeTranscript,
|
||||||
|
transcribeAudio
|
||||||
} from '../services/api.js';
|
} from '../services/api.js';
|
||||||
|
|
||||||
const meetingId = localStorage.getItem('currentMeetingId');
|
const meetingId = localStorage.getItem('currentMeetingId');
|
||||||
@@ -234,6 +272,11 @@
|
|||||||
const deleteBtn = document.getElementById('delete-btn');
|
const deleteBtn = document.getElementById('delete-btn');
|
||||||
const addConclusionBtn = document.getElementById('add-conclusion-btn');
|
const addConclusionBtn = document.getElementById('add-conclusion-btn');
|
||||||
const addActionBtn = document.getElementById('add-action-btn');
|
const addActionBtn = document.getElementById('add-action-btn');
|
||||||
|
const uploadAudioBtn = document.getElementById('upload-audio-btn');
|
||||||
|
const audioFileInput = document.getElementById('audio-file-input');
|
||||||
|
const uploadProgressEl = document.getElementById('upload-progress');
|
||||||
|
const uploadProgressText = document.getElementById('upload-progress-text');
|
||||||
|
const uploadProgressFill = document.getElementById('upload-progress-fill');
|
||||||
|
|
||||||
// Load meeting data
|
// Load meeting data
|
||||||
async function loadMeeting() {
|
async function loadMeeting() {
|
||||||
@@ -460,6 +503,86 @@
|
|||||||
processingIndicatorEl.classList.add('hidden');
|
processingIndicatorEl.classList.add('hidden');
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// === Audio File Upload ===
|
||||||
|
uploadAudioBtn.addEventListener('click', () => {
|
||||||
|
if (isRecording) {
|
||||||
|
alert('Please stop recording before uploading audio.');
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
audioFileInput.click();
|
||||||
|
});
|
||||||
|
|
||||||
|
audioFileInput.addEventListener('change', async (e) => {
|
||||||
|
const file = e.target.files[0];
|
||||||
|
if (!file) return;
|
||||||
|
|
||||||
|
// Validate file size (500MB max)
|
||||||
|
const maxSize = 500 * 1024 * 1024;
|
||||||
|
if (file.size > maxSize) {
|
||||||
|
alert('File too large. Maximum size is 500MB.');
|
||||||
|
audioFileInput.value = '';
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Confirm if transcript has content
|
||||||
|
const currentTranscript = transcriptTextEl.value.trim();
|
||||||
|
if (currentTranscript) {
|
||||||
|
if (!confirm('This will replace the existing transcript. Do you want to continue?')) {
|
||||||
|
audioFileInput.value = '';
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Start upload
|
||||||
|
uploadAudioBtn.disabled = true;
|
||||||
|
recordBtn.disabled = true;
|
||||||
|
uploadProgressEl.classList.add('active');
|
||||||
|
uploadProgressFill.style.width = '0%';
|
||||||
|
uploadProgressText.textContent = 'Uploading audio file...';
|
||||||
|
|
||||||
|
try {
|
||||||
|
const result = await transcribeAudio(file, (progress) => {
|
||||||
|
if (progress.phase === 'uploading') {
|
||||||
|
uploadProgressFill.style.width = `${progress.progress}%`;
|
||||||
|
uploadProgressText.textContent = `上傳中: ${progress.progress}%`;
|
||||||
|
} else if (progress.phase === 'processing') {
|
||||||
|
uploadProgressFill.style.width = `${progress.progress}%`;
|
||||||
|
uploadProgressText.textContent = progress.message || '處理中...';
|
||||||
|
} else if (progress.phase === 'transcribing') {
|
||||||
|
uploadProgressFill.style.width = `${progress.progress}%`;
|
||||||
|
if (progress.total && progress.current) {
|
||||||
|
uploadProgressText.textContent = `轉錄中: ${progress.current}/${progress.total} 片段 (${progress.progress}%)`;
|
||||||
|
} else {
|
||||||
|
uploadProgressText.textContent = progress.message || '轉錄中...';
|
||||||
|
}
|
||||||
|
} else if (progress.phase === 'complete') {
|
||||||
|
uploadProgressFill.style.width = '100%';
|
||||||
|
uploadProgressText.textContent = progress.message || '轉錄完成';
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
// Success - update transcript
|
||||||
|
transcriptTextEl.value = result.transcript || '';
|
||||||
|
const chunksInfo = result.chunks_failed > 0
|
||||||
|
? `${result.chunks_processed}/${result.chunks_total} 片段成功`
|
||||||
|
: `${result.chunks_processed} 片段`;
|
||||||
|
uploadProgressText.textContent = `轉錄完成!(${chunksInfo}, ${Math.round(result.total_duration_seconds)}秒)`;
|
||||||
|
|
||||||
|
// Auto-hide progress after 3 seconds
|
||||||
|
setTimeout(() => {
|
||||||
|
uploadProgressEl.classList.remove('active');
|
||||||
|
}, 3000);
|
||||||
|
|
||||||
|
} catch (error) {
|
||||||
|
alert('Error transcribing audio: ' + error.message);
|
||||||
|
uploadProgressEl.classList.remove('active');
|
||||||
|
} finally {
|
||||||
|
uploadAudioBtn.disabled = false;
|
||||||
|
recordBtn.disabled = false;
|
||||||
|
audioFileInput.value = '';
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
// === Streaming Event Handlers (legacy, kept for future use) ===
|
// === Streaming Event Handlers (legacy, kept for future use) ===
|
||||||
window.electronAPI.onTranscriptionSegment((segment) => {
|
window.electronAPI.onTranscriptionSegment((segment) => {
|
||||||
console.log('Received segment:', segment);
|
console.log('Received segment:', segment);
|
||||||
|
|||||||
@@ -141,6 +141,231 @@ export async function summarizeTranscript(transcript) {
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Upload an audio file and transcribe it via the streaming SSE endpoint.
//
// Args:
//   file: File/Blob to upload.
//   onProgress: optional callback receiving
//     { phase, progress, message, total?, current? } updates.
// Returns a Promise resolving to
//   { transcript, chunks_processed, chunks_total, total_duration_seconds, language }
// or rejecting on auth / network / server error.
export async function transcribeAudio(file, onProgress = null) {
  const url = `${API_BASE_URL}/ai/transcribe-audio-stream`;

  const formData = new FormData();
  formData.append("file", file);

  const token = getToken();
  // BUGFIX: previously the headers object contained `Authorization: undefined`
  // when no token was present, which fetch serializes as the literal string
  // "undefined". Only set the header when a token actually exists.
  const headers = {};
  if (token) {
    headers.Authorization = `Bearer ${token}`;
  }

  return new Promise((resolve, reject) => {
    // fetch (not XHR) so the SSE response body can be consumed as a stream.
    fetch(url, {
      method: "POST",
      headers,
      body: formData,
    })
      .then((response) => {
        if (response.status === 401) {
          clearToken();
          window.electronAPI?.navigate("login");
          throw new Error("Session expired, please login again");
        }

        if (!response.ok) {
          return response.json().then((error) => {
            throw new Error(error.detail || `HTTP error ${response.status}`);
          });
        }

        if (onProgress) {
          onProgress({ phase: "processing", progress: 0, message: "處理中..." });
        }

        // Read the SSE stream incrementally.
        const reader = response.body.getReader();
        const decoder = new TextDecoder();
        let buffer = "";
        let result = null;
        let streamError = null; // fatal server-reported error, if any
        let totalChunks = 0;

        function processLine(line) {
          if (!line.startsWith("data: ")) return;

          let data;
          try {
            data = JSON.parse(line.slice(6));
          } catch (e) {
            // Malformed SSE payloads are non-fatal; keep reading.
            console.warn("SSE parse error:", e, line);
            return;
          }

          switch (data.event) {
            case "start":
            case "segmenting":
              if (onProgress) {
                onProgress({
                  phase: "processing",
                  progress: 5,
                  message: data.message,
                });
              }
              break;

            case "segments_ready":
              totalChunks = data.total;
              if (onProgress) {
                onProgress({
                  phase: "transcribing",
                  progress: 10,
                  total: totalChunks,
                  current: 0,
                  message: data.message,
                });
              }
              break;

            case "chunk_start":
              if (onProgress) {
                // 10%..95% of the bar is allocated to chunk transcription.
                const progress = 10 + ((data.chunk - 1) / totalChunks) * 85;
                onProgress({
                  phase: "transcribing",
                  progress: Math.round(progress),
                  total: totalChunks,
                  current: data.chunk,
                  message: data.message,
                });
              }
              break;

            case "chunk_done":
              if (onProgress) {
                const progress = 10 + (data.chunk / totalChunks) * 85;
                onProgress({
                  phase: "transcribing",
                  progress: Math.round(progress),
                  total: totalChunks,
                  current: data.chunk,
                  message: data.message,
                });
              }
              break;

            case "chunk_error":
              // Per-chunk failures are non-fatal; the backend continues.
              console.warn(`Chunk ${data.chunk} error: ${data.message}`);
              break;

            case "error":
              // BUGFIX: this used to `throw` inside the JSON-parse try/catch,
              // which swallowed the server error as an "SSE parse error" and
              // the promise never rejected with it. Record it and reject when
              // the stream ends.
              streamError = new Error(data.message);
              break;

            case "complete":
              result = {
                transcript: data.transcript,
                chunks_processed: data.chunks_processed,
                chunks_total: data.chunks_total,
                total_duration_seconds: data.duration,
                language: "zh",
              };
              if (onProgress) {
                onProgress({
                  phase: "complete",
                  progress: 100,
                  message: "轉錄完成",
                });
              }
              break;
          }
        }

        function read() {
          reader
            .read()
            .then(({ done, value }) => {
              if (done) {
                // Flush any remaining buffered lines.
                if (buffer.trim()) {
                  buffer.split("\n").forEach(processLine);
                }
                if (streamError) {
                  reject(streamError);
                } else if (result) {
                  resolve(result);
                } else {
                  reject(new Error("Transcription failed - no result received"));
                }
                return;
              }

              buffer += decoder.decode(value, { stream: true });
              const lines = buffer.split("\n");
              buffer = lines.pop() || ""; // keep the incomplete trailing line
              lines.forEach(processLine);
              read();
            })
            .catch(reject);
        }

        read();
      })
      .catch(reject);
  });
}
|
||||||
|
|
||||||
|
// Legacy non-streaming version (fallback)
|
||||||
|
// Legacy non-streaming transcription upload (fallback path).
// Uses XMLHttpRequest so browser-native upload progress events are available;
// resolves with the parsed JSON response from the backend.
export async function transcribeAudioLegacy(file, onProgress = null) {
  const url = `${API_BASE_URL}/ai/transcribe-audio`;
  const payload = new FormData();
  payload.append("file", file);

  const token = getToken();

  return new Promise((resolve, reject) => {
    const xhr = new XMLHttpRequest();

    // Report byte-level upload progress to the caller.
    const reportUpload = (event) => {
      if (event.lengthComputable && onProgress) {
        const percentComplete = Math.round((event.loaded / event.total) * 100);
        onProgress({ phase: "uploading", progress: percentComplete });
      }
    };

    // Resolve/reject based on the final HTTP response.
    const handleLoad = () => {
      if (xhr.status >= 200 && xhr.status < 300) {
        try {
          resolve(JSON.parse(xhr.responseText));
        } catch (e) {
          reject(new Error("Invalid response format"));
        }
        return;
      }

      if (xhr.status === 401) {
        clearToken();
        window.electronAPI?.navigate("login");
        reject(new Error("Session expired, please login again"));
        return;
      }

      try {
        const error = JSON.parse(xhr.responseText);
        reject(new Error(error.detail || `HTTP error ${xhr.status}`));
      } catch (e) {
        reject(new Error(`HTTP error ${xhr.status}`));
      }
    };

    xhr.upload.addEventListener("progress", reportUpload);
    xhr.addEventListener("load", handleLoad);
    xhr.addEventListener("error", () => reject(new Error("Network error")));
    xhr.addEventListener("timeout", () => reject(new Error("Request timeout")));

    xhr.open("POST", url, true);
    xhr.timeout = 600000; // 10 minutes for large files
    if (token) {
      xhr.setRequestHeader("Authorization", `Bearer ${token}`);
    }
    xhr.send(payload);

    // Switch the UI into the processing phase once the upload completes.
    if (onProgress) {
      xhr.upload.addEventListener("loadend", () => {
        onProgress({ phase: "processing", progress: 0 });
      });
    }
  });
}
|
||||||
|
|
||||||
// Export API
|
// Export API
|
||||||
export async function exportMeeting(id) {
|
export async function exportMeeting(id) {
|
||||||
return request(`/meetings/${id}/export`, {
|
return request(`/meetings/${id}/export`, {
|
||||||
|
|||||||
@@ -3,3 +3,5 @@ faster-whisper>=1.0.0
|
|||||||
opencc-python-reimplemented>=0.1.7
|
opencc-python-reimplemented>=0.1.7
|
||||||
numpy>=1.26.0
|
numpy>=1.26.0
|
||||||
onnxruntime>=1.16.0
|
onnxruntime>=1.16.0
|
||||||
|
pydub>=0.25.0
|
||||||
|
audioop-lts>=0.2.1 # Required for Python 3.13+ (audioop removed from stdlib)
|
||||||
|
|||||||
@@ -20,9 +20,10 @@ import tempfile
|
|||||||
import base64
|
import base64
|
||||||
import uuid
|
import uuid
|
||||||
import re
|
import re
|
||||||
|
import wave
|
||||||
import urllib.request
|
import urllib.request
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Optional, List
|
from typing import Optional, List, Tuple
|
||||||
|
|
||||||
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
|
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
|
||||||
|
|
||||||
@@ -105,8 +106,7 @@ class SileroVAD:
|
|||||||
def __init__(self, model_path: Optional[str] = None, threshold: float = 0.5):
|
def __init__(self, model_path: Optional[str] = None, threshold: float = 0.5):
|
||||||
self.threshold = threshold
|
self.threshold = threshold
|
||||||
self.session = None
|
self.session = None
|
||||||
self._h = np.zeros((2, 1, 64), dtype=np.float32)
|
self._state = np.zeros((2, 1, 128), dtype=np.float32)
|
||||||
self._c = np.zeros((2, 1, 64), dtype=np.float32)
|
|
||||||
self.sample_rate = 16000
|
self.sample_rate = 16000
|
||||||
|
|
||||||
if not ONNX_AVAILABLE:
|
if not ONNX_AVAILABLE:
|
||||||
@@ -141,8 +141,7 @@ class SileroVAD:
|
|||||||
|
|
||||||
def reset_states(self):
|
def reset_states(self):
|
||||||
"""Reset hidden states."""
|
"""Reset hidden states."""
|
||||||
self._h = np.zeros((2, 1, 64), dtype=np.float32)
|
self._state = np.zeros((2, 1, 128), dtype=np.float32)
|
||||||
self._c = np.zeros((2, 1, 64), dtype=np.float32)
|
|
||||||
|
|
||||||
def __call__(self, audio: np.ndarray) -> float:
|
def __call__(self, audio: np.ndarray) -> float:
|
||||||
"""Run VAD on audio chunk, return speech probability."""
|
"""Run VAD on audio chunk, return speech probability."""
|
||||||
@@ -153,15 +152,14 @@ class SileroVAD:
|
|||||||
if audio.ndim == 1:
|
if audio.ndim == 1:
|
||||||
audio = audio[np.newaxis, :]
|
audio = audio[np.newaxis, :]
|
||||||
|
|
||||||
# Run inference
|
# Run inference with updated model format
|
||||||
ort_inputs = {
|
ort_inputs = {
|
||||||
'input': audio.astype(np.float32),
|
'input': audio.astype(np.float32),
|
||||||
'sr': np.array([self.sample_rate], dtype=np.int64),
|
'state': self._state,
|
||||||
'h': self._h,
|
'sr': np.array(self.sample_rate, dtype=np.int64)
|
||||||
'c': self._c
|
|
||||||
}
|
}
|
||||||
|
|
||||||
output, self._h, self._c = self.session.run(None, ort_inputs)
|
output, self._state = self.session.run(None, ort_inputs)
|
||||||
return float(output[0][0])
|
return float(output[0][0])
|
||||||
|
|
||||||
|
|
||||||
@@ -406,6 +404,193 @@ class Transcriber:
|
|||||||
print(json.dumps({"error": f"Transcription error: {e}"}), file=sys.stderr)
|
print(json.dumps({"error": f"Transcription error: {e}"}), file=sys.stderr)
|
||||||
return ""
|
return ""
|
||||||
|
|
||||||
|
    def segment_audio_file(
        self,
        audio_path: str,
        max_chunk_seconds: int = 300,
        min_silence_ms: int = 500,
        output_dir: Optional[str] = None
    ) -> dict:
        """
        Segment an audio file using VAD for natural speech boundaries.

        The file is resampled to mono 16 kHz, scanned with the Silero VAD in
        512-sample windows to find silences, and cut at silence midpoints so
        each exported chunk stays under ``max_chunk_seconds``. If no VAD
        session is available, falls back to fixed-duration splitting.

        Args:
            audio_path: Path to the audio file
            max_chunk_seconds: Maximum duration per chunk (default 5 minutes)
            min_silence_ms: Minimum silence duration to consider as boundary (default 500ms)
            output_dir: Directory to save chunks (default: temp directory)

        Returns:
            dict with segments list and metadata, or ``{"error": ...}`` on failure
        """
        try:
            # pydub is an optional dependency; fail with a readable error
            # instead of an ImportError traceback.
            try:
                from pydub import AudioSegment
            except ImportError:
                return {"error": "pydub not installed. Run: pip install pydub"}

            if not os.path.exists(audio_path):
                return {"error": f"File not found: {audio_path}"}

            # Create output directory (temp dir when none supplied).
            if output_dir is None:
                output_dir = tempfile.mkdtemp(prefix="audio_segments_")
            else:
                os.makedirs(output_dir, exist_ok=True)

            # Load audio file and normalize to mono 16 kHz — the rate the VAD
            # and the WAV exporter both assume.
            # Status lines go to stderr as JSON so the parent process can
            # surface progress without polluting stdout.
            print(json.dumps({"status": "loading_audio", "file": audio_path}), file=sys.stderr)
            audio = AudioSegment.from_file(audio_path)
            audio = audio.set_channels(1).set_frame_rate(16000)
            total_duration_ms = len(audio)  # pydub len() is milliseconds
            total_duration_sec = total_duration_ms / 1000

            print(json.dumps({
                "status": "audio_loaded",
                "duration_seconds": total_duration_sec
            }), file=sys.stderr)

            # Convert to float32 in [-1, 1) for VAD processing.
            samples = np.array(audio.get_array_of_samples(), dtype=np.float32) / 32768.0

            # Segmentation bookkeeping (sample counts at 16 kHz).
            segments = []
            current_start = 0
            max_chunk_samples = max_chunk_seconds * 16000
            min_silence_samples = int(min_silence_ms * 16)  # 16 samples per ms at 16kHz

            if self.vad_model is None or self.vad_model.session is None:
                # No VAD available, use fixed-time splitting.
                print(json.dumps({"warning": "VAD not available, using fixed-time splitting"}), file=sys.stderr)
                chunk_idx = 0
                for start_sample in range(0, len(samples), max_chunk_samples):
                    end_sample = min(start_sample + max_chunk_samples, len(samples))
                    chunk_samples = samples[start_sample:end_sample]

                    # Export chunk as 16-bit mono WAV.
                    chunk_path = os.path.join(output_dir, f"chunk_{chunk_idx:03d}.wav")
                    self._export_wav(chunk_samples, chunk_path)

                    segments.append({
                        "index": chunk_idx,
                        "path": chunk_path,
                        "start": start_sample / 16000,
                        "end": end_sample / 16000,
                        "duration": (end_sample - start_sample) / 16000
                    })
                    chunk_idx += 1
            else:
                # Use VAD for intelligent splitting at silence boundaries.
                print(json.dumps({"status": "running_vad"}), file=sys.stderr)
                self.vad_model.reset_states()

                # Scan in fixed windows; record the midpoint of each silence
                # run that lasts at least min_silence_samples as a candidate
                # split point.
                window_size = 512
                silence_starts = []
                in_silence = False
                silence_start = 0

                for i in range(0, len(samples) - window_size, window_size):
                    window = samples[i:i + window_size]
                    speech_prob = self.vad_model(window)

                    if speech_prob < 0.3:  # Silence threshold
                        if not in_silence:
                            in_silence = True
                            silence_start = i
                    else:
                        if in_silence:
                            silence_duration = i - silence_start
                            if silence_duration >= min_silence_samples:
                                # Mark middle of silence as potential split point
                                silence_starts.append(silence_start + silence_duration // 2)
                            in_silence = False

                # Add end of file as final split point so the tail is flushed.
                silence_starts.append(len(samples))

                # Create segments based on silence boundaries.
                chunk_idx = 0
                current_start = 0

                for split_point in silence_starts:
                    # Only cut when the pending span has reached the size cap
                    # or we are at end-of-file.
                    chunk_duration = split_point - current_start

                    if chunk_duration >= max_chunk_samples or split_point == len(samples):
                        # If the span overshoots the cap, backtrack to the
                        # last silence point within the allowed window
                        # (hard-cut at the cap when no silence exists there).
                        if chunk_duration > max_chunk_samples:
                            best_split = current_start + max_chunk_samples
                            for sp in silence_starts:
                                if current_start < sp <= current_start + max_chunk_samples:
                                    best_split = sp
                            split_point = best_split

                        # Export chunk; spans shorter than 0.5 s are dropped
                        # (presumably residual silence — NOTE(review): confirm
                        # losing sub-0.5 s audio here is acceptable).
                        chunk_samples = samples[current_start:split_point]
                        if len(chunk_samples) > 8000:  # At least 0.5 seconds
                            chunk_path = os.path.join(output_dir, f"chunk_{chunk_idx:03d}.wav")
                            self._export_wav(chunk_samples, chunk_path)

                            segments.append({
                                "index": chunk_idx,
                                "path": chunk_path,
                                "start": current_start / 16000,
                                "end": split_point / 16000,
                                "duration": (split_point - current_start) / 16000
                            })
                            chunk_idx += 1

                        current_start = split_point

                # Handle any remaining audio - split into max_chunk_samples pieces.
                # Safety net in case the loop above left unexported samples.
                while current_start < len(samples):
                    remaining_len = len(samples) - current_start
                    if remaining_len < 8000:  # Less than 0.5 seconds
                        break

                    # Determine chunk end (respect max_chunk_samples).
                    chunk_end = min(current_start + max_chunk_samples, len(samples))
                    chunk_samples = samples[current_start:chunk_end]

                    chunk_path = os.path.join(output_dir, f"chunk_{chunk_idx:03d}.wav")
                    self._export_wav(chunk_samples, chunk_path)
                    segments.append({
                        "index": chunk_idx,
                        "path": chunk_path,
                        "start": current_start / 16000,
                        "end": chunk_end / 16000,
                        "duration": len(chunk_samples) / 16000
                    })
                    chunk_idx += 1
                    current_start = chunk_end

            print(json.dumps({
                "status": "segmentation_complete",
                "total_segments": len(segments)
            }), file=sys.stderr)

            return {
                "status": "success",
                "segments": segments,
                "total_segments": len(segments),
                "total_duration": total_duration_sec,
                "output_dir": output_dir
            }

        except Exception as e:
            # Best-effort API: callers expect an {"error": ...} dict rather
            # than an exception crossing the process boundary.
            return {"error": f"Segmentation error: {str(e)}"}
||||||
|
|
||||||
|
def _export_wav(self, samples: np.ndarray, output_path: str):
|
||||||
|
"""Export numpy samples to WAV file."""
|
||||||
|
with wave.open(output_path, 'wb') as wf:
|
||||||
|
wf.setnchannels(1)
|
||||||
|
wf.setsampwidth(2)
|
||||||
|
wf.setframerate(16000)
|
||||||
|
wf.writeframes((samples * 32768).astype(np.int16).tobytes())
|
||||||
|
|
||||||
def handle_command(self, cmd: dict) -> Optional[dict]:
|
def handle_command(self, cmd: dict) -> Optional[dict]:
|
||||||
"""Handle a JSON command."""
|
"""Handle a JSON command."""
|
||||||
action = cmd.get("action")
|
action = cmd.get("action")
|
||||||
@@ -447,6 +632,21 @@ class Transcriber:
|
|||||||
self.streaming_session = None
|
self.streaming_session = None
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
elif action == "segment_audio":
|
||||||
|
# Segment audio file using VAD
|
||||||
|
file_path = cmd.get("file_path")
|
||||||
|
if not file_path:
|
||||||
|
return {"error": "No file_path specified"}
|
||||||
|
max_chunk_seconds = cmd.get("max_chunk_seconds", 300)
|
||||||
|
min_silence_ms = cmd.get("min_silence_ms", 500)
|
||||||
|
output_dir = cmd.get("output_dir")
|
||||||
|
return self.segment_audio_file(
|
||||||
|
file_path,
|
||||||
|
max_chunk_seconds=max_chunk_seconds,
|
||||||
|
min_silence_ms=min_silence_ms,
|
||||||
|
output_dir=output_dir
|
||||||
|
)
|
||||||
|
|
||||||
elif action == "ping":
|
elif action == "ping":
|
||||||
return {"status": "pong"}
|
return {"status": "pong"}
|
||||||
|
|
||||||
|
|||||||
18
start.sh
18
start.sh
@@ -173,9 +173,23 @@ start_backend() {
|
|||||||
local backend_pid=$!
|
local backend_pid=$!
|
||||||
echo "BACKEND_PID=$backend_pid" >> "$PID_FILE"
|
echo "BACKEND_PID=$backend_pid" >> "$PID_FILE"
|
||||||
|
|
||||||
# 等待啟動
|
# 等待啟動(最多等待 15 秒)
|
||||||
sleep 2
|
local max_wait=15
|
||||||
|
local waited=0
|
||||||
|
log_info "等待後端服務啟動..."
|
||||||
|
|
||||||
|
while [ $waited -lt $max_wait ]; do
|
||||||
|
sleep 1
|
||||||
|
waited=$((waited + 1))
|
||||||
|
|
||||||
|
# 檢查健康狀態
|
||||||
|
if curl -s http://localhost:$BACKEND_PORT/api/health > /dev/null 2>&1; then
|
||||||
|
log_success "後端服務已啟動 (PID: $backend_pid, Port: $BACKEND_PORT)"
|
||||||
|
return 0
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
|
||||||
|
# 最後再檢查一次 port 狀態
|
||||||
if check_port $BACKEND_PORT; then
|
if check_port $BACKEND_PORT; then
|
||||||
log_success "後端服務已啟動 (PID: $backend_pid, Port: $BACKEND_PORT)"
|
log_success "後端服務已啟動 (PID: $backend_pid, Port: $BACKEND_PORT)"
|
||||||
else
|
else
|
||||||
|
|||||||
Reference in New Issue
Block a user