feat: Add Dify audio transcription with VAD chunking and SSE progress

- Add audio file upload transcription via Dify STT API
- Implement VAD-based audio segmentation in sidecar (3-min chunks)
- Add SSE endpoint for real-time transcription progress updates
- Fix chunk size enforcement for reliable uploads
- Add retry logic with exponential backoff for API calls
- Support Python 3.13+ with audioop-lts package
- Update frontend with Chinese progress messages and chunk display
- Improve start.sh health check with retry loop

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
egg committed 2025-12-11 21:00:27 +08:00
parent e790f48967
commit 263eb1c394

10 changed files with 1008 additions and 16 deletions

View File

@@ -9,6 +9,7 @@ DB_NAME=db_A060
AUTH_API_URL=https://pj-auth-api.vercel.app/api/auth/login
DIFY_API_URL=https://dify.theaken.com/v1
DIFY_API_KEY=app-xxxxxxxxxxx
DIFY_STT_API_KEY=app-xxxxxxxxxxx
# Application Settings
ADMIN_EMAIL=ymirliu@panjit.com.tw

View File

@@ -16,6 +16,7 @@ class Settings:
)
DIFY_API_URL: str = os.getenv("DIFY_API_URL", "https://dify.theaken.com/v1")
DIFY_API_KEY: str = os.getenv("DIFY_API_KEY", "")
DIFY_STT_API_KEY: str = os.getenv("DIFY_STT_API_KEY", "")
ADMIN_EMAIL: str = os.getenv("ADMIN_EMAIL", "ymirliu@panjit.com.tw")
JWT_SECRET: str = os.getenv("JWT_SECRET", "meeting-assistant-secret")

View File

@@ -1,11 +1,22 @@
from fastapi import APIRouter, HTTPException, Depends, UploadFile, File
from fastapi.responses import StreamingResponse
import httpx import httpx
import json import json
import os
import tempfile
import subprocess
import shutil
import asyncio
from typing import Optional, AsyncGenerator
from ..config import settings
from ..models import SummarizeRequest, SummarizeResponse, ActionItemCreate, TokenPayload
from .auth import get_current_user
# Supported audio formats
SUPPORTED_AUDIO_FORMATS = {".mp3", ".wav", ".m4a", ".webm", ".ogg", ".flac", ".aac"}
MAX_FILE_SIZE = 500 * 1024 * 1024 # 500MB
router = APIRouter()
@@ -74,6 +85,9 @@ def parse_dify_response(answer: str) -> dict:
Parse Dify response to extract conclusions and action items.
Attempts JSON parsing first, then falls back to text parsing.
"""
print(f"[Dify Summarize] Raw answer length: {len(answer)} chars")
print(f"[Dify Summarize] Raw answer preview: {answer[:500]}...")
# Try to find JSON in the response
try:
# Look for JSON block
@@ -90,13 +104,424 @@ def parse_dify_response(answer: str) -> dict:
raise ValueError("No JSON found")
data = json.loads(json_str)
print(f"[Dify Summarize] Parsed JSON keys: {list(data.keys())}")
print(f"[Dify Summarize] conclusions count: {len(data.get('conclusions', []))}")
print(f"[Dify Summarize] action_items count: {len(data.get('action_items', []))}")
return {
"conclusions": data.get("conclusions", []),
"action_items": data.get("action_items", []),
}
except (ValueError, json.JSONDecodeError) as e:
print(f"[Dify Summarize] JSON parse failed: {e}")
# Fallback: return raw answer as single conclusion
return {
"conclusions": [answer] if answer else [],
"action_items": [],
}
@router.post("/ai/transcribe-audio")
async def transcribe_audio(
file: UploadFile = File(...),
current_user: TokenPayload = Depends(get_current_user)
):
"""
Transcribe an uploaded audio file using Dify STT service.
Large files are automatically chunked using VAD segmentation.
"""
if not settings.DIFY_STT_API_KEY:
raise HTTPException(status_code=503, detail="Dify STT API not configured")
# Validate file extension
file_ext = os.path.splitext(file.filename or "")[1].lower()
if file_ext not in SUPPORTED_AUDIO_FORMATS:
raise HTTPException(
status_code=400,
detail=f"Unsupported audio format. Supported: {', '.join(SUPPORTED_AUDIO_FORMATS)}"
)
# Create temp directory for processing
temp_dir = tempfile.mkdtemp(prefix="transcribe_")
temp_file_path = os.path.join(temp_dir, f"upload{file_ext}")
try:
# Save uploaded file
file_size = 0
with open(temp_file_path, "wb") as f:
while chunk := await file.read(1024 * 1024): # 1MB chunks
file_size += len(chunk)
if file_size > MAX_FILE_SIZE:
raise HTTPException(
status_code=413,
detail=f"File too large. Maximum size: {MAX_FILE_SIZE // (1024*1024)}MB"
)
f.write(chunk)
print(f"[Transcribe] Saved uploaded file: {temp_file_path}, size: {file_size} bytes")
# Call sidecar to segment audio
segments = await segment_audio_with_sidecar(temp_file_path, temp_dir)
if "error" in segments:
raise HTTPException(status_code=500, detail=segments["error"])
segment_list = segments.get("segments", [])
total_segments = len(segment_list)
print(f"[Transcribe] Segmentation complete: {total_segments} chunks created")
for seg in segment_list:
print(f" - Chunk {seg.get('index')}: {seg.get('path')} ({seg.get('duration', 0):.1f}s)")
if total_segments == 0:
raise HTTPException(status_code=400, detail="No audio content detected")
# Transcribe each chunk via Dify STT
transcriptions = []
failed_chunks = []
async with httpx.AsyncClient() as client:
for i, segment in enumerate(segment_list):
chunk_path = segment.get("path")
chunk_index = segment.get("index", i)
print(f"[Transcribe] Processing chunk {chunk_index + 1}/{total_segments}: {chunk_path}")
if not chunk_path:
print(f"[Transcribe] ERROR: Chunk {chunk_index} has no path!")
failed_chunks.append(chunk_index)
continue
if not os.path.exists(chunk_path):
print(f"[Transcribe] ERROR: Chunk file does not exist: {chunk_path}")
failed_chunks.append(chunk_index)
continue
chunk_size = os.path.getsize(chunk_path)
print(f"[Transcribe] Chunk {chunk_index} exists, size: {chunk_size} bytes")
# Call Dify STT API with retry
text = await transcribe_chunk_with_dify(
client, chunk_path, current_user.email
)
if text:
print(f"[Transcribe] Chunk {chunk_index} transcribed: {len(text)} chars")
transcriptions.append(text)
else:
print(f"[Transcribe] Chunk {chunk_index} transcription failed (no text returned)")
failed_chunks.append(chunk_index)
# Concatenate all transcriptions
final_transcript = " ".join(transcriptions)
print(f"[Transcribe] Complete: {len(transcriptions)}/{total_segments} chunks transcribed")
if failed_chunks:
print(f"[Transcribe] Failed chunks: {failed_chunks}")
return {
"transcript": final_transcript,
"chunks_processed": len(transcriptions),
"chunks_total": total_segments,
"chunks_failed": len(failed_chunks),
"total_duration_seconds": segments.get("total_duration", 0),
"language": "zh"
}
finally:
# Clean up temp files
shutil.rmtree(temp_dir, ignore_errors=True)
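
As an aside, a minimal client sketch for this blocking endpoint; the base URL and /api prefix are assumptions for a local deployment, not taken from this commit:

import httpx

def transcribe_file(path: str, token: str) -> dict:
    # Upload an audio file and return the JSON transcription result
    with open(path, "rb") as f:
        response = httpx.post(
            "http://localhost:8000/api/ai/transcribe-audio",  # assumed base URL
            headers={"Authorization": f"Bearer {token}"},
            files={"file": (path, f, "audio/mpeg")},
            timeout=600.0,  # chunked transcription of long files can take minutes
        )
    response.raise_for_status()
    return response.json()  # keys: transcript, chunks_processed, chunks_total, ...
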
@router.post("/ai/transcribe-audio-stream")
async def transcribe_audio_stream(
file: UploadFile = File(...),
current_user: TokenPayload = Depends(get_current_user)
):
"""
Transcribe an uploaded audio file with real-time progress via SSE.
Returns Server-Sent Events for progress updates.
"""
if not settings.DIFY_STT_API_KEY:
raise HTTPException(status_code=503, detail="Dify STT API not configured")
# Validate file extension
file_ext = os.path.splitext(file.filename or "")[1].lower()
if file_ext not in SUPPORTED_AUDIO_FORMATS:
raise HTTPException(
status_code=400,
detail=f"Unsupported audio format. Supported: {', '.join(SUPPORTED_AUDIO_FORMATS)}"
)
# Read file into memory for streaming
file_content = await file.read()
if len(file_content) > MAX_FILE_SIZE:
raise HTTPException(
status_code=413,
detail=f"File too large. Maximum size: {MAX_FILE_SIZE // (1024*1024)}MB"
)
async def generate_progress() -> AsyncGenerator[str, None]:
temp_dir = tempfile.mkdtemp(prefix="transcribe_")
temp_file_path = os.path.join(temp_dir, f"upload{file_ext}")
try:
# Save file
with open(temp_file_path, "wb") as f:
f.write(file_content)
yield f"data: {json.dumps({'event': 'start', 'message': '音訊檔案已接收,開始處理...'})}\n\n"
# Segment audio
yield f"data: {json.dumps({'event': 'segmenting', 'message': '正在分析音訊並分割片段...'})}\n\n"
segments = await segment_audio_with_sidecar(temp_file_path, temp_dir)
if "error" in segments:
yield f"data: {json.dumps({'event': 'error', 'message': segments['error']})}\n\n"
return
segment_list = segments.get("segments", [])
total_segments = len(segment_list)
total_duration = segments.get("total_duration", 0)
if total_segments == 0:
yield f"data: {json.dumps({'event': 'error', 'message': '未檢測到音訊內容'})}\n\n"
return
yield f"data: {json.dumps({'event': 'segments_ready', 'total': total_segments, 'duration': total_duration, 'message': f'分割完成,共 {total_segments} 個片段'})}\n\n"
# Transcribe each chunk
transcriptions = []
async with httpx.AsyncClient() as client:
for i, segment in enumerate(segment_list):
chunk_path = segment.get("path")
chunk_index = segment.get("index", i)
chunk_duration = segment.get("duration", 0)
yield f"data: {json.dumps({'event': 'chunk_start', 'chunk': chunk_index + 1, 'total': total_segments, 'duration': chunk_duration, 'message': f'正在轉錄片段 {chunk_index + 1}/{total_segments}...'})}\n\n"
if not chunk_path or not os.path.exists(chunk_path):
yield f"data: {json.dumps({'event': 'chunk_error', 'chunk': chunk_index + 1, 'message': f'片段 {chunk_index + 1} 檔案不存在'})}\n\n"
continue
text = await transcribe_chunk_with_dify(
client, chunk_path, current_user.email
)
if text:
transcriptions.append(text)
yield f"data: {json.dumps({'event': 'chunk_done', 'chunk': chunk_index + 1, 'total': total_segments, 'text_length': len(text), 'message': f'片段 {chunk_index + 1} 完成'})}\n\n"
else:
yield f"data: {json.dumps({'event': 'chunk_error', 'chunk': chunk_index + 1, 'message': f'片段 {chunk_index + 1} 轉錄失敗'})}\n\n"
# Final result
final_transcript = " ".join(transcriptions)
yield f"data: {json.dumps({'event': 'complete', 'transcript': final_transcript, 'chunks_processed': len(transcriptions), 'chunks_total': total_segments, 'duration': total_duration})}\n\n"
finally:
shutil.rmtree(temp_dir, ignore_errors=True)
return StreamingResponse(
generate_progress(),
media_type="text/event-stream",
headers={
"Cache-Control": "no-cache",
"Connection": "keep-alive",
"X-Accel-Buffering": "no"
}
)
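
For completeness, a hedged sketch of consuming this SSE stream from Python with httpx; the base URL and prefix are assumptions, while the event names match the generator above:

import json
import httpx

async def stream_transcription(path: str, token: str) -> str:
    transcript = ""
    async with httpx.AsyncClient(timeout=None) as client:
        with open(path, "rb") as f:
            async with client.stream(
                "POST",
                "http://localhost:8000/api/ai/transcribe-audio-stream",  # assumed
                headers={"Authorization": f"Bearer {token}"},
                files={"file": (path, f, "audio/wav")},
            ) as response:
                async for line in response.aiter_lines():
                    if not line.startswith("data: "):
                        continue  # skip blank separators between events
                    event = json.loads(line[6:])
                    if event["event"] == "complete":
                        transcript = event["transcript"]
                    elif event["event"] == "error":
                        raise RuntimeError(event["message"])
    return transcript
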
async def segment_audio_with_sidecar(audio_path: str, output_dir: str) -> dict:
"""Call sidecar to segment audio file using VAD."""
# Find sidecar script
sidecar_dir = os.path.join(os.path.dirname(__file__), "..", "..", "..", "sidecar")
sidecar_script = os.path.join(sidecar_dir, "transcriber.py")
venv_python = os.path.join(sidecar_dir, "venv", "bin", "python")
# Use venv python if available, otherwise system python
python_cmd = venv_python if os.path.exists(venv_python) else "python3"
if not os.path.exists(sidecar_script):
return {"error": "Sidecar not found"}
try:
# Prepare command
cmd_input = json.dumps({
"action": "segment_audio",
"file_path": audio_path,
"max_chunk_seconds": 180, # 3 minutes (smaller chunks for reliable upload)
"min_silence_ms": 500,
"output_dir": output_dir
})
# Run sidecar process
process = await asyncio.create_subprocess_exec(
python_cmd, sidecar_script,
stdin=asyncio.subprocess.PIPE,
stdout=asyncio.subprocess.PIPE,
stderr=asyncio.subprocess.PIPE,
cwd=sidecar_dir
)
# Send command and wait for response
stdout, stderr = await asyncio.wait_for(
process.communicate(input=f"{cmd_input}\n{{\"action\": \"quit\"}}\n".encode()),
timeout=600 # 10 minutes timeout for large files
)
# Parse response (skip status messages, find the segment result)
for line in stdout.decode().strip().split('\n'):
if line:
try:
data = json.loads(line)
if data.get("status") == "success" or "segments" in data:
return data
if "error" in data:
return data
except json.JSONDecodeError:
continue
return {"error": "No valid response from sidecar"}
except asyncio.TimeoutError:
return {"error": "Sidecar timeout during segmentation"}
except Exception as e:
return {"error": f"Sidecar error: {str(e)}"}
async def upload_file_to_dify(
client: httpx.AsyncClient,
file_path: str,
user_email: str
) -> Optional[str]:
"""Upload a file to Dify and return the file ID."""
try:
upload_url = f"{settings.DIFY_API_URL}/files/upload"
file_size = os.path.getsize(file_path)
print(f"[Upload] File: {file_path}, size: {file_size / (1024*1024):.1f} MB")
# Adjust timeout based on file size (minimum 60s, ~1MB per 5 seconds)
timeout_seconds = max(60.0, file_size / (1024 * 1024) * 5)
print(f"[Upload] Using timeout: {timeout_seconds:.0f}s")
with open(file_path, "rb") as f:
files = {"file": (os.path.basename(file_path), f, "audio/wav")}
response = await client.post(
upload_url,
headers={
"Authorization": f"Bearer {settings.DIFY_STT_API_KEY}",
},
files=files,
data={"user": user_email},
timeout=timeout_seconds,
)
print(f"[Upload] Response: {response.status_code}")
if response.status_code == 201 or response.status_code == 200:
data = response.json()
file_id = data.get("id")
print(f"[Upload] Success, file_id: {file_id}")
return file_id
print(f"[Upload] Error: {response.status_code} - {response.text[:500]}")
return None
except httpx.ReadError as e:
print(f"[Upload] Network read error (connection reset): {e}")
return None
except httpx.TimeoutException as e:
print(f"[Upload] Timeout: {e}")
return None
except Exception as e:
import traceback
print(f"[Upload] Error: {e}")
print(traceback.format_exc())
return None
async def transcribe_chunk_with_dify(
client: httpx.AsyncClient,
chunk_path: str,
user_email: str,
max_retries: int = 3
) -> Optional[str]:
"""Transcribe a single audio chunk via Dify chat API with file upload."""
for attempt in range(max_retries):
try:
print(f"[Dify] Attempt {attempt + 1}/{max_retries} for chunk: {chunk_path}")
# Step 1: Upload file to Dify (with retry inside this attempt)
file_id = None
for upload_attempt in range(2): # 2 upload attempts per main attempt
file_id = await upload_file_to_dify(client, chunk_path, user_email)
if file_id:
break
print(f"[Dify] Upload attempt {upload_attempt + 1} failed, retrying...")
await asyncio.sleep(1)
if not file_id:
print(f"[Dify] Failed to upload file after retries: {chunk_path}")
if attempt < max_retries - 1:
await asyncio.sleep(2 ** attempt)
continue
return None
print(f"[Dify] File uploaded, file_id: {file_id}")
# Step 2: Send chat message with file to request transcription
response = await client.post(
f"{settings.DIFY_API_URL}/chat-messages",
headers={
"Authorization": f"Bearer {settings.DIFY_STT_API_KEY}",
"Content-Type": "application/json",
},
json={
"inputs": {},
"query": "請將這段音檔轉錄成文字,只回傳轉錄的文字內容,不要加任何額外說明。",
"response_mode": "blocking",
"user": user_email,
"files": [
{
"type": "audio",
"transfer_method": "local_file",
"upload_file_id": file_id
}
]
},
timeout=300.0, # 5 minutes per chunk (increased for longer segments)
)
print(f"[Dify] Chat response: {response.status_code}")
if response.status_code == 200:
data = response.json()
answer = data.get("answer", "")
print(f"[Dify] Transcription success, length: {len(answer)} chars")
return answer
# Retry on server errors or rate limits
if response.status_code >= 500 or response.status_code == 429:
print(f"[Dify] Server error {response.status_code}, will retry...")
if attempt < max_retries - 1:
wait_time = 2 ** attempt
if response.status_code == 429:
wait_time = 10 # Wait longer for rate limits
await asyncio.sleep(wait_time)
continue
# Log error but don't fail entire transcription
print(f"[Dify] Chat error for chunk: {response.status_code} - {response.text[:500]}")
return None
except httpx.TimeoutException:
if attempt < max_retries - 1:
await asyncio.sleep(2 ** attempt)
continue
return None
except Exception as e:
print(f"Chunk transcription error: {e}")
return None
return None

View File

@@ -4,6 +4,7 @@ python-dotenv>=1.0.0
mysql-connector-python>=9.0.0
pydantic>=2.10.0
httpx>=0.27.0
python-multipart>=0.0.9
python-jose[cryptography]>=3.3.0
openpyxl>=3.1.2
pytest>=8.0.0

View File

@@ -96,6 +96,33 @@
color: #666;
font-style: italic;
}
.upload-progress {
display: none;
padding: 10px 15px;
background: #fff3e0;
border-radius: 6px;
margin-bottom: 10px;
}
.upload-progress.active {
display: block;
}
.upload-progress-bar {
height: 6px;
background: #e0e0e0;
border-radius: 3px;
overflow: hidden;
margin-top: 8px;
}
.upload-progress-fill {
height: 100%;
background: #ff9800;
width: 0%;
transition: width 0.3s ease;
}
.upload-progress-text {
font-size: 13px;
color: #e65100;
}
.transcript-textarea {
width: 100%;
min-height: 400px;
@@ -143,8 +170,10 @@
<div class="panel"> <div class="panel">
<div class="panel-header"> <div class="panel-header">
<span>Transcript (逐字稿)</span> <span>Transcript (逐字稿)</span>
<div class="recording-controls" style="padding: 0;"> <div class="recording-controls" style="padding: 0; display: flex; gap: 8px;">
<button class="btn btn-danger" id="record-btn">Start Recording</button> <button class="btn btn-danger" id="record-btn">Start Recording</button>
<button class="btn btn-secondary" id="upload-audio-btn">Upload Audio</button>
<input type="file" id="audio-file-input" accept=".mp3,.wav,.m4a,.webm,.ogg,.flac,.aac" style="display: none;">
</div> </div>
</div> </div>
<div class="panel-body"> <div class="panel-body">
@@ -155,6 +184,14 @@
<span class="segment-count" id="segment-count">Segments: 0</span> <span class="segment-count" id="segment-count">Segments: 0</span>
</div> </div>
<!-- Upload Progress -->
<div id="upload-progress" class="upload-progress">
<span class="upload-progress-text" id="upload-progress-text">Uploading...</span>
<div class="upload-progress-bar">
<div class="upload-progress-fill" id="upload-progress-fill"></div>
</div>
</div>
<!-- Single Transcript Textarea --> <!-- Single Transcript Textarea -->
<div id="transcript-container"> <div id="transcript-container">
<textarea <textarea
@@ -203,7 +240,8 @@
updateMeeting,
deleteMeeting,
exportMeeting,
summarizeTranscript,
transcribeAudio
} from '../services/api.js';
const meetingId = localStorage.getItem('currentMeetingId');
@@ -234,6 +272,11 @@
const deleteBtn = document.getElementById('delete-btn');
const addConclusionBtn = document.getElementById('add-conclusion-btn');
const addActionBtn = document.getElementById('add-action-btn');
const uploadAudioBtn = document.getElementById('upload-audio-btn');
const audioFileInput = document.getElementById('audio-file-input');
const uploadProgressEl = document.getElementById('upload-progress');
const uploadProgressText = document.getElementById('upload-progress-text');
const uploadProgressFill = document.getElementById('upload-progress-fill');
// Load meeting data
async function loadMeeting() {
@@ -460,6 +503,86 @@
processingIndicatorEl.classList.add('hidden');
}
// === Audio File Upload ===
uploadAudioBtn.addEventListener('click', () => {
if (isRecording) {
alert('Please stop recording before uploading audio.');
return;
}
audioFileInput.click();
});
audioFileInput.addEventListener('change', async (e) => {
const file = e.target.files[0];
if (!file) return;
// Validate file size (500MB max)
const maxSize = 500 * 1024 * 1024;
if (file.size > maxSize) {
alert('File too large. Maximum size is 500MB.');
audioFileInput.value = '';
return;
}
// Confirm if transcript has content
const currentTranscript = transcriptTextEl.value.trim();
if (currentTranscript) {
if (!confirm('This will replace the existing transcript. Do you want to continue?')) {
audioFileInput.value = '';
return;
}
}
// Start upload
uploadAudioBtn.disabled = true;
recordBtn.disabled = true;
uploadProgressEl.classList.add('active');
uploadProgressFill.style.width = '0%';
uploadProgressText.textContent = 'Uploading audio file...';
try {
const result = await transcribeAudio(file, (progress) => {
if (progress.phase === 'uploading') {
uploadProgressFill.style.width = `${progress.progress}%`;
uploadProgressText.textContent = `上傳中: ${progress.progress}%`;
} else if (progress.phase === 'processing') {
uploadProgressFill.style.width = `${progress.progress}%`;
uploadProgressText.textContent = progress.message || '處理中...';
} else if (progress.phase === 'transcribing') {
uploadProgressFill.style.width = `${progress.progress}%`;
if (progress.total && progress.current) {
uploadProgressText.textContent = `轉錄中: ${progress.current}/${progress.total} 片段 (${progress.progress}%)`;
} else {
uploadProgressText.textContent = progress.message || '轉錄中...';
}
} else if (progress.phase === 'complete') {
uploadProgressFill.style.width = '100%';
uploadProgressText.textContent = progress.message || '轉錄完成';
}
});
// Success - update transcript
transcriptTextEl.value = result.transcript || '';
const chunksInfo = result.chunks_failed > 0
? `${result.chunks_processed}/${result.chunks_total} 片段成功`
: `${result.chunks_processed} 片段`;
uploadProgressText.textContent = `轉錄完成!(${chunksInfo}, ${Math.round(result.total_duration_seconds)}秒)`;
// Auto-hide progress after 3 seconds
setTimeout(() => {
uploadProgressEl.classList.remove('active');
}, 3000);
} catch (error) {
alert('Error transcribing audio: ' + error.message);
uploadProgressEl.classList.remove('active');
} finally {
uploadAudioBtn.disabled = false;
recordBtn.disabled = false;
audioFileInput.value = '';
}
});
// === Streaming Event Handlers (legacy, kept for future use) ===
window.electronAPI.onTranscriptionSegment((segment) => {
console.log('Received segment:', segment);

View File

@@ -141,6 +141,231 @@ export async function summarizeTranscript(transcript) {
});
}
export async function transcribeAudio(file, onProgress = null) {
const url = `${API_BASE_URL}/ai/transcribe-audio-stream`;
const formData = new FormData();
formData.append("file", file);
const token = getToken();
return new Promise((resolve, reject) => {
// Use fetch for SSE support
fetch(url, {
method: "POST",
// Only set Authorization when a token exists; an undefined header
// value would be serialized as the literal string "undefined"
headers: token ? { Authorization: `Bearer ${token}` } : {},
body: formData,
})
.then((response) => {
if (response.status === 401) {
clearToken();
window.electronAPI?.navigate("login");
throw new Error("Session expired, please login again");
}
if (!response.ok) {
return response.json().then((error) => {
throw new Error(error.detail || `HTTP error ${response.status}`);
});
}
if (onProgress) {
onProgress({ phase: "processing", progress: 0, message: "處理中..." });
}
// Read SSE stream
const reader = response.body.getReader();
const decoder = new TextDecoder();
let buffer = "";
let result = null;
let totalChunks = 0;
let processedChunks = 0;
function processLine(line) {
if (line.startsWith("data: ")) {
try {
const data = JSON.parse(line.slice(6));
switch (data.event) {
case "start":
case "segmenting":
if (onProgress) {
onProgress({
phase: "processing",
progress: 5,
message: data.message,
});
}
break;
case "segments_ready":
totalChunks = data.total;
if (onProgress) {
onProgress({
phase: "transcribing",
progress: 10,
total: totalChunks,
current: 0,
message: data.message,
});
}
break;
case "chunk_start":
if (onProgress) {
const progress = 10 + ((data.chunk - 1) / totalChunks) * 85;
onProgress({
phase: "transcribing",
progress: Math.round(progress),
total: totalChunks,
current: data.chunk,
message: data.message,
});
}
break;
case "chunk_done":
processedChunks++;
if (onProgress) {
const progress = 10 + (data.chunk / totalChunks) * 85;
onProgress({
phase: "transcribing",
progress: Math.round(progress),
total: totalChunks,
current: data.chunk,
message: data.message,
});
}
break;
case "chunk_error":
console.warn(`Chunk ${data.chunk} error: ${data.message}`);
break;
case "error":
throw new Error(data.message);
case "complete":
result = {
transcript: data.transcript,
chunks_processed: data.chunks_processed,
chunks_total: data.chunks_total,
total_duration_seconds: data.duration,
language: "zh",
};
if (onProgress) {
onProgress({
phase: "complete",
progress: 100,
message: "轉錄完成",
});
}
break;
}
} catch (e) {
console.warn("SSE parse error:", e, line);
}
}
}
function read() {
reader
.read()
.then(({ done, value }) => {
if (done) {
// Process any remaining buffer
if (buffer.trim()) {
buffer.split("\n").forEach(processLine);
}
if (result) {
resolve(result);
} else {
reject(new Error("Transcription failed - no result received"));
}
return;
}
buffer += decoder.decode(value, { stream: true });
const lines = buffer.split("\n");
buffer = lines.pop() || ""; // Keep incomplete line in buffer
lines.forEach(processLine);
read();
})
.catch(reject);
}
read();
})
.catch(reject);
});
}
// Legacy non-streaming version (fallback)
export async function transcribeAudioLegacy(file, onProgress = null) {
const url = `${API_BASE_URL}/ai/transcribe-audio`;
const formData = new FormData();
formData.append("file", file);
const token = getToken();
return new Promise((resolve, reject) => {
const xhr = new XMLHttpRequest();
xhr.upload.addEventListener("progress", (event) => {
if (event.lengthComputable && onProgress) {
const percentComplete = Math.round((event.loaded / event.total) * 100);
onProgress({ phase: "uploading", progress: percentComplete });
}
});
xhr.addEventListener("load", () => {
if (xhr.status >= 200 && xhr.status < 300) {
try {
const response = JSON.parse(xhr.responseText);
resolve(response);
} catch (e) {
reject(new Error("Invalid response format"));
}
} else if (xhr.status === 401) {
clearToken();
window.electronAPI?.navigate("login");
reject(new Error("Session expired, please login again"));
} else {
try {
const error = JSON.parse(xhr.responseText);
reject(new Error(error.detail || `HTTP error ${xhr.status}`));
} catch (e) {
reject(new Error(`HTTP error ${xhr.status}`));
}
}
});
xhr.addEventListener("error", () => {
reject(new Error("Network error"));
});
xhr.addEventListener("timeout", () => {
reject(new Error("Request timeout"));
});
xhr.open("POST", url, true);
xhr.timeout = 600000; // 10 minutes for large files
if (token) {
xhr.setRequestHeader("Authorization", `Bearer ${token}`);
}
xhr.send(formData);
// Notify processing phase after upload completes
if (onProgress) {
xhr.upload.addEventListener("loadend", () => {
onProgress({ phase: "processing", progress: 0 });
});
}
});
}
// Export API
export async function exportMeeting(id) {
return request(`/meetings/${id}/export`, {

View File

@@ -3,3 +3,5 @@ faster-whisper>=1.0.0
opencc-python-reimplemented>=0.1.7
numpy>=1.26.0
onnxruntime>=1.16.0
pydub>=0.25.0
audioop-lts>=0.2.1 # Required for Python 3.13+ (audioop removed from stdlib)

View File

@@ -20,9 +20,10 @@ import tempfile
import base64
import uuid
import re
import wave
import urllib.request
from pathlib import Path
from typing import Optional, List, Tuple
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
@@ -105,8 +106,7 @@ class SileroVAD:
def __init__(self, model_path: Optional[str] = None, threshold: float = 0.5):
self.threshold = threshold
self.session = None
self._state = np.zeros((2, 1, 128), dtype=np.float32)
self.sample_rate = 16000
if not ONNX_AVAILABLE:
@@ -141,8 +141,7 @@ class SileroVAD:
def reset_states(self):
"""Reset hidden states."""
self._state = np.zeros((2, 1, 128), dtype=np.float32)
def __call__(self, audio: np.ndarray) -> float:
"""Run VAD on audio chunk, return speech probability."""
@@ -153,15 +152,14 @@ class SileroVAD:
if audio.ndim == 1:
audio = audio[np.newaxis, :]
# Run inference with updated model format
ort_inputs = {
'input': audio.astype(np.float32),
'state': self._state,
'sr': np.array(self.sample_rate, dtype=np.int64)
}
output, self._state = self.session.run(None, ort_inputs)
return float(output[0][0])
@@ -406,6 +404,193 @@ class Transcriber:
print(json.dumps({"error": f"Transcription error: {e}"}), file=sys.stderr) print(json.dumps({"error": f"Transcription error: {e}"}), file=sys.stderr)
return "" return ""
def segment_audio_file(
self,
audio_path: str,
max_chunk_seconds: int = 300,
min_silence_ms: int = 500,
output_dir: Optional[str] = None
) -> dict:
"""
Segment an audio file using VAD for natural speech boundaries.
Args:
audio_path: Path to the audio file
max_chunk_seconds: Maximum duration per chunk (default 5 minutes)
min_silence_ms: Minimum silence duration to consider as boundary (default 500ms)
output_dir: Directory to save chunks (default: temp directory)
Returns:
dict with segments list and metadata
"""
try:
# Import audio processing libraries
try:
from pydub import AudioSegment
except ImportError:
return {"error": "pydub not installed. Run: pip install pydub"}
if not os.path.exists(audio_path):
return {"error": f"File not found: {audio_path}"}
# Create output directory
if output_dir is None:
output_dir = tempfile.mkdtemp(prefix="audio_segments_")
else:
os.makedirs(output_dir, exist_ok=True)
# Load audio file and convert to mono 16kHz
print(json.dumps({"status": "loading_audio", "file": audio_path}), file=sys.stderr)
audio = AudioSegment.from_file(audio_path)
audio = audio.set_channels(1).set_frame_rate(16000)
total_duration_ms = len(audio)
total_duration_sec = total_duration_ms / 1000
print(json.dumps({
"status": "audio_loaded",
"duration_seconds": total_duration_sec
}), file=sys.stderr)
# Convert to numpy for VAD processing
samples = np.array(audio.get_array_of_samples(), dtype=np.float32) / 32768.0
# Run VAD to detect speech regions
segments = []
current_start = 0
max_chunk_samples = max_chunk_seconds * 16000
min_silence_samples = int(min_silence_ms * 16)  # 16 samples per ms at 16kHz
if self.vad_model is None or self.vad_model.session is None:
# No VAD available, use fixed-time splitting
print(json.dumps({"warning": "VAD not available, using fixed-time splitting"}), file=sys.stderr)
chunk_idx = 0
for start_sample in range(0, len(samples), max_chunk_samples):
end_sample = min(start_sample + max_chunk_samples, len(samples))
chunk_samples = samples[start_sample:end_sample]
# Export chunk
chunk_path = os.path.join(output_dir, f"chunk_{chunk_idx:03d}.wav")
self._export_wav(chunk_samples, chunk_path)
segments.append({
"index": chunk_idx,
"path": chunk_path,
"start": start_sample / 16000,
"end": end_sample / 16000,
"duration": (end_sample - start_sample) / 16000
})
chunk_idx += 1
else:
# Use VAD for intelligent splitting
print(json.dumps({"status": "running_vad"}), file=sys.stderr)
self.vad_model.reset_states()
# Find silence regions for splitting
window_size = 512
silence_starts = []
in_silence = False
silence_start = 0
for i in range(0, len(samples) - window_size, window_size):
window = samples[i:i + window_size]
speech_prob = self.vad_model(window)
if speech_prob < 0.3: # Silence threshold
if not in_silence:
in_silence = True
silence_start = i
else:
if in_silence:
silence_duration = i - silence_start
if silence_duration >= min_silence_samples:
# Mark middle of silence as potential split point
silence_starts.append(silence_start + silence_duration // 2)
in_silence = False
# Add end of file as final split point
silence_starts.append(len(samples))
# Create segments based on silence boundaries
chunk_idx = 0
current_start = 0
for split_point in silence_starts:
# Check if we need to split here
chunk_duration = split_point - current_start
if chunk_duration >= max_chunk_samples or split_point == len(samples):
# Find the best split point before max duration
if chunk_duration > max_chunk_samples:
# Find nearest silence point before max
best_split = current_start + max_chunk_samples
for sp in silence_starts:
if current_start < sp <= current_start + max_chunk_samples:
best_split = sp
split_point = best_split
# Export chunk
chunk_samples = samples[current_start:split_point]
if len(chunk_samples) > 8000: # At least 0.5 seconds
chunk_path = os.path.join(output_dir, f"chunk_{chunk_idx:03d}.wav")
self._export_wav(chunk_samples, chunk_path)
segments.append({
"index": chunk_idx,
"path": chunk_path,
"start": current_start / 16000,
"end": split_point / 16000,
"duration": (split_point - current_start) / 16000
})
chunk_idx += 1
current_start = split_point
# Handle any remaining audio - split into max_chunk_samples pieces
while current_start < len(samples):
remaining_len = len(samples) - current_start
if remaining_len < 8000: # Less than 0.5 seconds
break
# Determine chunk end (respect max_chunk_samples)
chunk_end = min(current_start + max_chunk_samples, len(samples))
chunk_samples = samples[current_start:chunk_end]
chunk_path = os.path.join(output_dir, f"chunk_{chunk_idx:03d}.wav")
self._export_wav(chunk_samples, chunk_path)
segments.append({
"index": chunk_idx,
"path": chunk_path,
"start": current_start / 16000,
"end": chunk_end / 16000,
"duration": len(chunk_samples) / 16000
})
chunk_idx += 1
current_start = chunk_end
print(json.dumps({
"status": "segmentation_complete",
"total_segments": len(segments)
}), file=sys.stderr)
return {
"status": "success",
"segments": segments,
"total_segments": len(segments),
"total_duration": total_duration_sec,
"output_dir": output_dir
}
except Exception as e:
return {"error": f"Segmentation error: {str(e)}"}
def _export_wav(self, samples: np.ndarray, output_path: str):
"""Export numpy samples to WAV file."""
with wave.open(output_path, 'wb') as wf:
wf.setnchannels(1)
wf.setsampwidth(2)
wf.setframerate(16000)
# Clip before casting so full-scale samples don't wrap around in int16
wf.writeframes(np.clip(samples * 32768, -32768, 32767).astype(np.int16).tobytes())
def handle_command(self, cmd: dict) -> Optional[dict]:
"""Handle a JSON command."""
action = cmd.get("action")
@@ -447,6 +632,21 @@ class Transcriber:
self.streaming_session = None
return result
elif action == "segment_audio":
# Segment audio file using VAD
file_path = cmd.get("file_path")
if not file_path:
return {"error": "No file_path specified"}
max_chunk_seconds = cmd.get("max_chunk_seconds", 300)
min_silence_ms = cmd.get("min_silence_ms", 500)
output_dir = cmd.get("output_dir")
return self.segment_audio_file(
file_path,
max_chunk_seconds=max_chunk_seconds,
min_silence_ms=min_silence_ms,
output_dir=output_dir
)
elif action == "ping": elif action == "ping":
return {"status": "pong"} return {"status": "pong"}

View File

@@ -173,9 +173,23 @@ start_backend() {
local backend_pid=$!
echo "BACKEND_PID=$backend_pid" >> "$PID_FILE"
# Wait for startup (up to 15 seconds)
local max_wait=15
local waited=0
log_info "等待後端服務啟動..."
while [ $waited -lt $max_wait ]; do
sleep 1
waited=$((waited + 1))
# Check health status
if curl -s http://localhost:$BACKEND_PORT/api/health > /dev/null 2>&1; then
log_success "後端服務已啟動 (PID: $backend_pid, Port: $BACKEND_PORT)"
return 0
fi
done
# Check the port status one last time
if check_port $BACKEND_PORT; then
log_success "後端服務已啟動 (PID: $backend_pid, Port: $BACKEND_PORT)"
else