feat: Add Dify audio transcription with VAD chunking and SSE progress
- Add audio file upload transcription via Dify STT API
- Implement VAD-based audio segmentation in sidecar (3-min chunks)
- Add SSE endpoint for real-time transcription progress updates
- Fix chunk size enforcement for reliable uploads
- Add retry logic with exponential backoff for API calls
- Support Python 3.13+ with audioop-lts package
- Update frontend with Chinese progress messages and chunk display
- Improve start.sh health check with retry loop

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
@@ -9,6 +9,7 @@ DB_NAME=db_A060
AUTH_API_URL=https://pj-auth-api.vercel.app/api/auth/login
DIFY_API_URL=https://dify.theaken.com/v1
DIFY_API_KEY=app-xxxxxxxxxxx
+DIFY_STT_API_KEY=app-xxxxxxxxxxx

# Application Settings
ADMIN_EMAIL=ymirliu@panjit.com.tw

@@ -16,6 +16,7 @@ class Settings:
    )
    DIFY_API_URL: str = os.getenv("DIFY_API_URL", "https://dify.theaken.com/v1")
    DIFY_API_KEY: str = os.getenv("DIFY_API_KEY", "")
+   DIFY_STT_API_KEY: str = os.getenv("DIFY_STT_API_KEY", "")

    ADMIN_EMAIL: str = os.getenv("ADMIN_EMAIL", "ymirliu@panjit.com.tw")
    JWT_SECRET: str = os.getenv("JWT_SECRET", "meeting-assistant-secret")

@@ -1,11 +1,22 @@
-from fastapi import APIRouter, HTTPException, Depends
+from fastapi import APIRouter, HTTPException, Depends, UploadFile, File
+from fastapi.responses import StreamingResponse
import httpx
import json
import os
import tempfile
import subprocess
import shutil
import asyncio
from typing import Optional, AsyncGenerator

from ..config import settings
from ..models import SummarizeRequest, SummarizeResponse, ActionItemCreate, TokenPayload
from .auth import get_current_user

# Supported audio formats
SUPPORTED_AUDIO_FORMATS = {".mp3", ".wav", ".m4a", ".webm", ".ogg", ".flac", ".aac"}
MAX_FILE_SIZE = 500 * 1024 * 1024  # 500MB

router = APIRouter()


@@ -74,6 +85,9 @@ def parse_dify_response(answer: str) -> dict:
    Parse Dify response to extract conclusions and action items.
    Attempts JSON parsing first, then falls back to text parsing.
    """
    print(f"[Dify Summarize] Raw answer length: {len(answer)} chars")
    print(f"[Dify Summarize] Raw answer preview: {answer[:500]}...")

    # Try to find JSON in the response
    try:
        # Look for JSON block
@@ -90,13 +104,424 @@ def parse_dify_response(answer: str) -> dict:
            raise ValueError("No JSON found")

        data = json.loads(json_str)
        print(f"[Dify Summarize] Parsed JSON keys: {list(data.keys())}")
        print(f"[Dify Summarize] conclusions count: {len(data.get('conclusions', []))}")
        print(f"[Dify Summarize] action_items count: {len(data.get('action_items', []))}")

        return {
            "conclusions": data.get("conclusions", []),
            "action_items": data.get("action_items", []),
        }
-    except (ValueError, json.JSONDecodeError):
+    except (ValueError, json.JSONDecodeError) as e:
+        print(f"[Dify Summarize] JSON parse failed: {e}")
        # Fallback: return raw answer as single conclusion
        return {
            "conclusions": [answer] if answer else [],
            "action_items": [],
        }
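# Editor's note: for reference, an answer that parses cleanly would look
# roughly like the sketch below (illustrative only; the exact field shapes
# are defined by the Dify app, not by this commit):
#
#   {
#     "conclusions": ["Decision A was approved", "..."],
#     "action_items": [...]
#   }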


@router.post("/ai/transcribe-audio")
async def transcribe_audio(
    file: UploadFile = File(...),
    current_user: TokenPayload = Depends(get_current_user)
):
    """
    Transcribe an uploaded audio file using Dify STT service.
    Large files are automatically chunked using VAD segmentation.
    """
    if not settings.DIFY_STT_API_KEY:
        raise HTTPException(status_code=503, detail="Dify STT API not configured")

    # Validate file extension
    file_ext = os.path.splitext(file.filename or "")[1].lower()
    if file_ext not in SUPPORTED_AUDIO_FORMATS:
        raise HTTPException(
            status_code=400,
            detail=f"Unsupported audio format. Supported: {', '.join(SUPPORTED_AUDIO_FORMATS)}"
        )

    # Create temp directory for processing
    temp_dir = tempfile.mkdtemp(prefix="transcribe_")
    temp_file_path = os.path.join(temp_dir, f"upload{file_ext}")

    try:
        # Save uploaded file
        file_size = 0
        with open(temp_file_path, "wb") as f:
            while chunk := await file.read(1024 * 1024):  # 1MB chunks
                file_size += len(chunk)
                if file_size > MAX_FILE_SIZE:
                    raise HTTPException(
                        status_code=413,
                        detail=f"File too large. Maximum size: {MAX_FILE_SIZE // (1024*1024)}MB"
                    )
                f.write(chunk)

        print(f"[Transcribe] Saved uploaded file: {temp_file_path}, size: {file_size} bytes")

        # Call sidecar to segment audio
        segments = await segment_audio_with_sidecar(temp_file_path, temp_dir)

        if "error" in segments:
            raise HTTPException(status_code=500, detail=segments["error"])

        segment_list = segments.get("segments", [])
        total_segments = len(segment_list)

        print(f"[Transcribe] Segmentation complete: {total_segments} chunks created")
        for seg in segment_list:
            print(f"  - Chunk {seg.get('index')}: {seg.get('path')} ({seg.get('duration', 0):.1f}s)")

        if total_segments == 0:
            raise HTTPException(status_code=400, detail="No audio content detected")

        # Transcribe each chunk via Dify STT
        transcriptions = []
        failed_chunks = []
        async with httpx.AsyncClient() as client:
            for i, segment in enumerate(segment_list):
                chunk_path = segment.get("path")
                chunk_index = segment.get("index", i)

                print(f"[Transcribe] Processing chunk {chunk_index + 1}/{total_segments}: {chunk_path}")

                if not chunk_path:
                    print(f"[Transcribe] ERROR: Chunk {chunk_index} has no path!")
                    failed_chunks.append(chunk_index)
                    continue

                if not os.path.exists(chunk_path):
                    print(f"[Transcribe] ERROR: Chunk file does not exist: {chunk_path}")
                    failed_chunks.append(chunk_index)
                    continue

                chunk_size = os.path.getsize(chunk_path)
                print(f"[Transcribe] Chunk {chunk_index} exists, size: {chunk_size} bytes")

                # Call Dify STT API with retry
                text = await transcribe_chunk_with_dify(
                    client, chunk_path, current_user.email
                )
                if text:
                    print(f"[Transcribe] Chunk {chunk_index} transcribed: {len(text)} chars")
                    transcriptions.append(text)
                else:
                    print(f"[Transcribe] Chunk {chunk_index} transcription failed (no text returned)")
                    failed_chunks.append(chunk_index)

        # Concatenate all transcriptions
        final_transcript = " ".join(transcriptions)

        print(f"[Transcribe] Complete: {len(transcriptions)}/{total_segments} chunks transcribed")
        if failed_chunks:
            print(f"[Transcribe] Failed chunks: {failed_chunks}")

        return {
            "transcript": final_transcript,
            "chunks_processed": len(transcriptions),
            "chunks_total": total_segments,
            "chunks_failed": len(failed_chunks),
            "total_duration_seconds": segments.get("total_duration", 0),
            "language": "zh"
        }

    finally:
        # Clean up temp files
        shutil.rmtree(temp_dir, ignore_errors=True)


@router.post("/ai/transcribe-audio-stream")
async def transcribe_audio_stream(
    file: UploadFile = File(...),
    current_user: TokenPayload = Depends(get_current_user)
):
    """
    Transcribe an uploaded audio file with real-time progress via SSE.
    Returns Server-Sent Events for progress updates.
    """
    if not settings.DIFY_STT_API_KEY:
        raise HTTPException(status_code=503, detail="Dify STT API not configured")

    # Validate file extension
    file_ext = os.path.splitext(file.filename or "")[1].lower()
    if file_ext not in SUPPORTED_AUDIO_FORMATS:
        raise HTTPException(
            status_code=400,
            detail=f"Unsupported audio format. Supported: {', '.join(SUPPORTED_AUDIO_FORMATS)}"
        )

    # Read file into memory for streaming
    file_content = await file.read()
    if len(file_content) > MAX_FILE_SIZE:
        raise HTTPException(
            status_code=413,
            detail=f"File too large. Maximum size: {MAX_FILE_SIZE // (1024*1024)}MB"
        )

    async def generate_progress() -> AsyncGenerator[str, None]:
        temp_dir = tempfile.mkdtemp(prefix="transcribe_")
        temp_file_path = os.path.join(temp_dir, f"upload{file_ext}")

        try:
            # Save file
            with open(temp_file_path, "wb") as f:
                f.write(file_content)

            yield f"data: {json.dumps({'event': 'start', 'message': '音訊檔案已接收,開始處理...'})}\n\n"

            # Segment audio
            yield f"data: {json.dumps({'event': 'segmenting', 'message': '正在分析音訊並分割片段...'})}\n\n"

            segments = await segment_audio_with_sidecar(temp_file_path, temp_dir)

            if "error" in segments:
                yield f"data: {json.dumps({'event': 'error', 'message': segments['error']})}\n\n"
                return

            segment_list = segments.get("segments", [])
            total_segments = len(segment_list)
            total_duration = segments.get("total_duration", 0)

            if total_segments == 0:
                yield f"data: {json.dumps({'event': 'error', 'message': '未檢測到音訊內容'})}\n\n"
                return

            yield f"data: {json.dumps({'event': 'segments_ready', 'total': total_segments, 'duration': total_duration, 'message': f'分割完成,共 {total_segments} 個片段'})}\n\n"

            # Transcribe each chunk
            transcriptions = []
            async with httpx.AsyncClient() as client:
                for i, segment in enumerate(segment_list):
                    chunk_path = segment.get("path")
                    chunk_index = segment.get("index", i)
                    chunk_duration = segment.get("duration", 0)

                    yield f"data: {json.dumps({'event': 'chunk_start', 'chunk': chunk_index + 1, 'total': total_segments, 'duration': chunk_duration, 'message': f'正在轉錄片段 {chunk_index + 1}/{total_segments}...'})}\n\n"

                    if not chunk_path or not os.path.exists(chunk_path):
                        yield f"data: {json.dumps({'event': 'chunk_error', 'chunk': chunk_index + 1, 'message': f'片段 {chunk_index + 1} 檔案不存在'})}\n\n"
                        continue

                    text = await transcribe_chunk_with_dify(
                        client, chunk_path, current_user.email
                    )

                    if text:
                        transcriptions.append(text)
                        yield f"data: {json.dumps({'event': 'chunk_done', 'chunk': chunk_index + 1, 'total': total_segments, 'text_length': len(text), 'message': f'片段 {chunk_index + 1} 完成'})}\n\n"
                    else:
                        yield f"data: {json.dumps({'event': 'chunk_error', 'chunk': chunk_index + 1, 'message': f'片段 {chunk_index + 1} 轉錄失敗'})}\n\n"

            # Final result
            final_transcript = " ".join(transcriptions)
            yield f"data: {json.dumps({'event': 'complete', 'transcript': final_transcript, 'chunks_processed': len(transcriptions), 'chunks_total': total_segments, 'duration': total_duration})}\n\n"

        finally:
            shutil.rmtree(temp_dir, ignore_errors=True)

    return StreamingResponse(
        generate_progress(),
        media_type="text/event-stream",
        headers={
            "Cache-Control": "no-cache",
            "Connection": "keep-alive",
            "X-Accel-Buffering": "no"
        }
    )
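# Editor's sketch: on the wire, the stream above serializes as one "data:"
# line per event, each followed by a blank line (field values illustrative):
#
#   data: {"event": "start", "message": "..."}
#
#   data: {"event": "segments_ready", "total": 4, "duration": 612.3, "message": "..."}
#
#   data: {"event": "chunk_done", "chunk": 1, "total": 4, "text_length": 842, "message": "..."}
#
#   data: {"event": "complete", "transcript": "...", "chunks_processed": 4, "chunks_total": 4, "duration": 612.3}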


async def segment_audio_with_sidecar(audio_path: str, output_dir: str) -> dict:
    """Call sidecar to segment audio file using VAD."""
    # Find sidecar script
    sidecar_dir = os.path.join(os.path.dirname(__file__), "..", "..", "..", "sidecar")
    sidecar_script = os.path.join(sidecar_dir, "transcriber.py")
    venv_python = os.path.join(sidecar_dir, "venv", "bin", "python")

    # Use venv python if available, otherwise system python
    python_cmd = venv_python if os.path.exists(venv_python) else "python3"

    if not os.path.exists(sidecar_script):
        return {"error": "Sidecar not found"}

    try:
        # Prepare command
        cmd_input = json.dumps({
            "action": "segment_audio",
            "file_path": audio_path,
            "max_chunk_seconds": 180,  # 3 minutes (smaller chunks for reliable upload)
            "min_silence_ms": 500,
            "output_dir": output_dir
        })

        # Run sidecar process
        process = await asyncio.create_subprocess_exec(
            python_cmd, sidecar_script,
            stdin=asyncio.subprocess.PIPE,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
            cwd=sidecar_dir
        )

        # Send command and wait for response
        stdout, stderr = await asyncio.wait_for(
            process.communicate(input=f"{cmd_input}\n{{\"action\": \"quit\"}}\n".encode()),
            timeout=600  # 10 minutes timeout for large files
        )

        # Parse response (skip status messages, find the segment result)
        for line in stdout.decode().strip().split('\n'):
            if line:
                try:
                    data = json.loads(line)
                    if data.get("status") == "success" or "segments" in data:
                        return data
                    if "error" in data:
                        return data
                except json.JSONDecodeError:
                    continue

        return {"error": "No valid response from sidecar"}

    except asyncio.TimeoutError:
        return {"error": "Sidecar timeout during segmentation"}
    except Exception as e:
        return {"error": f"Sidecar error: {str(e)}"}


async def upload_file_to_dify(
    client: httpx.AsyncClient,
    file_path: str,
    user_email: str
) -> Optional[str]:
    """Upload a file to Dify and return the file ID."""
    try:
        upload_url = f"{settings.DIFY_API_URL}/files/upload"

        file_size = os.path.getsize(file_path)
        print(f"[Upload] File: {file_path}, size: {file_size / (1024*1024):.1f} MB")

        # Adjust timeout based on file size (minimum 60s, ~1MB per 5 seconds)
        timeout_seconds = max(60.0, file_size / (1024 * 1024) * 5)
        print(f"[Upload] Using timeout: {timeout_seconds:.0f}s")

        with open(file_path, "rb") as f:
            files = {"file": (os.path.basename(file_path), f, "audio/wav")}
            response = await client.post(
                upload_url,
                headers={
                    "Authorization": f"Bearer {settings.DIFY_STT_API_KEY}",
                },
                files=files,
                data={"user": user_email},
                timeout=timeout_seconds,
            )

        print(f"[Upload] Response: {response.status_code}")

        if response.status_code == 201 or response.status_code == 200:
            data = response.json()
            file_id = data.get("id")
            print(f"[Upload] Success, file_id: {file_id}")
            return file_id

        print(f"[Upload] Error: {response.status_code} - {response.text[:500]}")
        return None

    except httpx.ReadError as e:
        print(f"[Upload] Network read error (connection reset): {e}")
        return None
    except httpx.TimeoutException as e:
        print(f"[Upload] Timeout: {e}")
        return None
    except Exception as e:
        import traceback
        print(f"[Upload] Error: {e}")
        print(traceback.format_exc())
        return None


async def transcribe_chunk_with_dify(
    client: httpx.AsyncClient,
    chunk_path: str,
    user_email: str,
    max_retries: int = 3
) -> Optional[str]:
    """Transcribe a single audio chunk via Dify chat API with file upload."""
    for attempt in range(max_retries):
        try:
            print(f"[Dify] Attempt {attempt + 1}/{max_retries} for chunk: {chunk_path}")

            # Step 1: Upload file to Dify (with retry inside this attempt)
            file_id = None
            for upload_attempt in range(2):  # 2 upload attempts per main attempt
                file_id = await upload_file_to_dify(client, chunk_path, user_email)
                if file_id:
                    break
                print(f"[Dify] Upload attempt {upload_attempt + 1} failed, retrying...")
                await asyncio.sleep(1)

            if not file_id:
                print(f"[Dify] Failed to upload file after retries: {chunk_path}")
                if attempt < max_retries - 1:
                    await asyncio.sleep(2 ** attempt)
                    continue
                return None

            print(f"[Dify] File uploaded, file_id: {file_id}")

            # Step 2: Send chat message with file to request transcription
            response = await client.post(
                f"{settings.DIFY_API_URL}/chat-messages",
                headers={
                    "Authorization": f"Bearer {settings.DIFY_STT_API_KEY}",
                    "Content-Type": "application/json",
                },
                json={
                    "inputs": {},
                    "query": "請將這段音檔轉錄成文字,只回傳轉錄的文字內容,不要加任何額外說明。",
                    "response_mode": "blocking",
                    "user": user_email,
                    "files": [
                        {
                            "type": "audio",
                            "transfer_method": "local_file",
                            "upload_file_id": file_id
                        }
                    ]
                },
                timeout=300.0,  # 5 minutes per chunk (increased for longer segments)
            )

            print(f"[Dify] Chat response: {response.status_code}")

            if response.status_code == 200:
                data = response.json()
                answer = data.get("answer", "")
                print(f"[Dify] Transcription success, length: {len(answer)} chars")
                return answer

            # Retry on server errors or rate limits
            if response.status_code >= 500 or response.status_code == 429:
                print(f"[Dify] Server error {response.status_code}, will retry...")
                if attempt < max_retries - 1:
                    wait_time = 2 ** attempt
                    if response.status_code == 429:
                        wait_time = 10  # Wait longer for rate limits
                    await asyncio.sleep(wait_time)
                    continue

            # Log error but don't fail entire transcription
            print(f"[Dify] Chat error for chunk: {response.status_code} - {response.text[:500]}")
            return None

        except httpx.TimeoutException:
            if attempt < max_retries - 1:
                await asyncio.sleep(2 ** attempt)
                continue
            return None
        except Exception as e:
            print(f"Chunk transcription error: {e}")
            return None

    return None
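# Editor's note on the retry schedule implemented above (max_retries=3):
#   attempt 1 fails -> sleep 2**0 = 1s
#   attempt 2 fails -> sleep 2**1 = 2s
#   attempt 3 fails -> give up (return None)
# A 429 response overrides the exponential wait with a flat 10s pause, and
# each attempt additionally allows two uploads with a 1s pause between them.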

@@ -4,6 +4,7 @@ python-dotenv>=1.0.0
mysql-connector-python>=9.0.0
pydantic>=2.10.0
httpx>=0.27.0
python-multipart>=0.0.9
python-jose[cryptography]>=3.3.0
openpyxl>=3.1.2
pytest>=8.0.0

Binary file not shown.
@@ -96,6 +96,33 @@
    color: #666;
    font-style: italic;
}
.upload-progress {
    display: none;
    padding: 10px 15px;
    background: #fff3e0;
    border-radius: 6px;
    margin-bottom: 10px;
}
.upload-progress.active {
    display: block;
}
.upload-progress-bar {
    height: 6px;
    background: #e0e0e0;
    border-radius: 3px;
    overflow: hidden;
    margin-top: 8px;
}
.upload-progress-fill {
    height: 100%;
    background: #ff9800;
    width: 0%;
    transition: width 0.3s ease;
}
.upload-progress-text {
    font-size: 13px;
    color: #e65100;
}
.transcript-textarea {
    width: 100%;
    min-height: 400px;
@@ -143,8 +170,10 @@
<div class="panel">
    <div class="panel-header">
        <span>Transcript (逐字稿)</span>
-       <div class="recording-controls" style="padding: 0;">
+       <div class="recording-controls" style="padding: 0; display: flex; gap: 8px;">
            <button class="btn btn-danger" id="record-btn">Start Recording</button>
+           <button class="btn btn-secondary" id="upload-audio-btn">Upload Audio</button>
+           <input type="file" id="audio-file-input" accept=".mp3,.wav,.m4a,.webm,.ogg,.flac,.aac" style="display: none;">
        </div>
    </div>
    <div class="panel-body">
@@ -155,6 +184,14 @@
        <span class="segment-count" id="segment-count">Segments: 0</span>
    </div>

    <!-- Upload Progress -->
    <div id="upload-progress" class="upload-progress">
        <span class="upload-progress-text" id="upload-progress-text">Uploading...</span>
        <div class="upload-progress-bar">
            <div class="upload-progress-fill" id="upload-progress-fill"></div>
        </div>
    </div>

    <!-- Single Transcript Textarea -->
    <div id="transcript-container">
        <textarea
@@ -203,7 +240,8 @@
    updateMeeting,
    deleteMeeting,
    exportMeeting,
-   summarizeTranscript
+   summarizeTranscript,
+   transcribeAudio
} from '../services/api.js';

const meetingId = localStorage.getItem('currentMeetingId');
@@ -234,6 +272,11 @@
const deleteBtn = document.getElementById('delete-btn');
const addConclusionBtn = document.getElementById('add-conclusion-btn');
const addActionBtn = document.getElementById('add-action-btn');
const uploadAudioBtn = document.getElementById('upload-audio-btn');
const audioFileInput = document.getElementById('audio-file-input');
const uploadProgressEl = document.getElementById('upload-progress');
const uploadProgressText = document.getElementById('upload-progress-text');
const uploadProgressFill = document.getElementById('upload-progress-fill');

// Load meeting data
async function loadMeeting() {
@@ -460,6 +503,86 @@
    processingIndicatorEl.classList.add('hidden');
}

// === Audio File Upload ===
uploadAudioBtn.addEventListener('click', () => {
    if (isRecording) {
        alert('Please stop recording before uploading audio.');
        return;
    }
    audioFileInput.click();
});

audioFileInput.addEventListener('change', async (e) => {
    const file = e.target.files[0];
    if (!file) return;

    // Validate file size (500MB max)
    const maxSize = 500 * 1024 * 1024;
    if (file.size > maxSize) {
        alert('File too large. Maximum size is 500MB.');
        audioFileInput.value = '';
        return;
    }

    // Confirm if transcript has content
    const currentTranscript = transcriptTextEl.value.trim();
    if (currentTranscript) {
        if (!confirm('This will replace the existing transcript. Do you want to continue?')) {
            audioFileInput.value = '';
            return;
        }
    }

    // Start upload
    uploadAudioBtn.disabled = true;
    recordBtn.disabled = true;
    uploadProgressEl.classList.add('active');
    uploadProgressFill.style.width = '0%';
    uploadProgressText.textContent = 'Uploading audio file...';

    try {
        const result = await transcribeAudio(file, (progress) => {
            if (progress.phase === 'uploading') {
                uploadProgressFill.style.width = `${progress.progress}%`;
                uploadProgressText.textContent = `上傳中: ${progress.progress}%`;
            } else if (progress.phase === 'processing') {
                uploadProgressFill.style.width = `${progress.progress}%`;
                uploadProgressText.textContent = progress.message || '處理中...';
            } else if (progress.phase === 'transcribing') {
                uploadProgressFill.style.width = `${progress.progress}%`;
                if (progress.total && progress.current) {
                    uploadProgressText.textContent = `轉錄中: ${progress.current}/${progress.total} 片段 (${progress.progress}%)`;
                } else {
                    uploadProgressText.textContent = progress.message || '轉錄中...';
                }
            } else if (progress.phase === 'complete') {
                uploadProgressFill.style.width = '100%';
                uploadProgressText.textContent = progress.message || '轉錄完成';
            }
        });

        // Success - update transcript
        transcriptTextEl.value = result.transcript || '';
        const chunksInfo = result.chunks_failed > 0
            ? `${result.chunks_processed}/${result.chunks_total} 片段成功`
            : `${result.chunks_processed} 片段`;
        uploadProgressText.textContent = `轉錄完成!(${chunksInfo}, ${Math.round(result.total_duration_seconds)}秒)`;

        // Auto-hide progress after 3 seconds
        setTimeout(() => {
            uploadProgressEl.classList.remove('active');
        }, 3000);

    } catch (error) {
        alert('Error transcribing audio: ' + error.message);
        uploadProgressEl.classList.remove('active');
    } finally {
        uploadAudioBtn.disabled = false;
        recordBtn.disabled = false;
        audioFileInput.value = '';
    }
});

// === Streaming Event Handlers (legacy, kept for future use) ===
window.electronAPI.onTranscriptionSegment((segment) => {
    console.log('Received segment:', segment);

@@ -141,6 +141,231 @@ export async function summarizeTranscript(transcript) {
    });
}

export async function transcribeAudio(file, onProgress = null) {
    const url = `${API_BASE_URL}/ai/transcribe-audio-stream`;
    const formData = new FormData();
    formData.append("file", file);

    const token = getToken();

    return new Promise((resolve, reject) => {
        // Use fetch for SSE support
        fetch(url, {
            method: "POST",
            headers: {
                Authorization: token ? `Bearer ${token}` : undefined,
            },
            body: formData,
        })
            .then((response) => {
                if (response.status === 401) {
                    clearToken();
                    window.electronAPI?.navigate("login");
                    throw new Error("Session expired, please login again");
                }

                if (!response.ok) {
                    return response.json().then((error) => {
                        throw new Error(error.detail || `HTTP error ${response.status}`);
                    });
                }

                if (onProgress) {
                    onProgress({ phase: "processing", progress: 0, message: "處理中..." });
                }

                // Read SSE stream
                const reader = response.body.getReader();
                const decoder = new TextDecoder();
                let buffer = "";
                let result = null;
                let totalChunks = 0;
                let processedChunks = 0;

                function processLine(line) {
                    if (line.startsWith("data: ")) {
                        try {
                            const data = JSON.parse(line.slice(6));

                            switch (data.event) {
                                case "start":
                                case "segmenting":
                                    if (onProgress) {
                                        onProgress({
                                            phase: "processing",
                                            progress: 5,
                                            message: data.message,
                                        });
                                    }
                                    break;

                                case "segments_ready":
                                    totalChunks = data.total;
                                    if (onProgress) {
                                        onProgress({
                                            phase: "transcribing",
                                            progress: 10,
                                            total: totalChunks,
                                            current: 0,
                                            message: data.message,
                                        });
                                    }
                                    break;

                                case "chunk_start":
                                    if (onProgress) {
                                        const progress = 10 + ((data.chunk - 1) / totalChunks) * 85;
                                        onProgress({
                                            phase: "transcribing",
                                            progress: Math.round(progress),
                                            total: totalChunks,
                                            current: data.chunk,
                                            message: data.message,
                                        });
                                    }
                                    break;

                                case "chunk_done":
                                    processedChunks++;
                                    if (onProgress) {
                                        const progress = 10 + (data.chunk / totalChunks) * 85;
                                        onProgress({
                                            phase: "transcribing",
                                            progress: Math.round(progress),
                                            total: totalChunks,
                                            current: data.chunk,
                                            message: data.message,
                                        });
                                    }
                                    break;

                                case "chunk_error":
                                    console.warn(`Chunk ${data.chunk} error: ${data.message}`);
                                    break;

                                case "error":
                                    throw new Error(data.message);

                                case "complete":
                                    result = {
                                        transcript: data.transcript,
                                        chunks_processed: data.chunks_processed,
                                        chunks_total: data.chunks_total,
                                        total_duration_seconds: data.duration,
                                        language: "zh",
                                    };
                                    if (onProgress) {
                                        onProgress({
                                            phase: "complete",
                                            progress: 100,
                                            message: "轉錄完成",
                                        });
                                    }
                                    break;
                            }
                        } catch (e) {
                            console.warn("SSE parse error:", e, line);
                        }
                    }
                }

                function read() {
                    reader
                        .read()
                        .then(({ done, value }) => {
                            if (done) {
                                // Process any remaining buffer
                                if (buffer.trim()) {
                                    buffer.split("\n").forEach(processLine);
                                }
                                if (result) {
                                    resolve(result);
                                } else {
                                    reject(new Error("Transcription failed - no result received"));
                                }
                                return;
                            }

                            buffer += decoder.decode(value, { stream: true });
                            const lines = buffer.split("\n");
                            buffer = lines.pop() || ""; // Keep incomplete line in buffer

                            lines.forEach(processLine);
                            read();
                        })
                        .catch(reject);
                }

                read();
            })
            .catch(reject);
    });
}
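// Editor's sketch: a minimal caller for transcribeAudio, matching the
// progress phases emitted above (illustrative, not part of the commit):
//
//   const result = await transcribeAudio(file, (p) => {
//     console.log(p.phase, p.progress, p.message);
//   });
//   console.log(result.transcript,
//               `${result.chunks_processed}/${result.chunks_total} chunks`);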

// Legacy non-streaming version (fallback)
export async function transcribeAudioLegacy(file, onProgress = null) {
    const url = `${API_BASE_URL}/ai/transcribe-audio`;
    const formData = new FormData();
    formData.append("file", file);

    const token = getToken();

    return new Promise((resolve, reject) => {
        const xhr = new XMLHttpRequest();

        xhr.upload.addEventListener("progress", (event) => {
            if (event.lengthComputable && onProgress) {
                const percentComplete = Math.round((event.loaded / event.total) * 100);
                onProgress({ phase: "uploading", progress: percentComplete });
            }
        });

        xhr.addEventListener("load", () => {
            if (xhr.status >= 200 && xhr.status < 300) {
                try {
                    const response = JSON.parse(xhr.responseText);
                    resolve(response);
                } catch (e) {
                    reject(new Error("Invalid response format"));
                }
            } else if (xhr.status === 401) {
                clearToken();
                window.electronAPI?.navigate("login");
                reject(new Error("Session expired, please login again"));
            } else {
                try {
                    const error = JSON.parse(xhr.responseText);
                    reject(new Error(error.detail || `HTTP error ${xhr.status}`));
                } catch (e) {
                    reject(new Error(`HTTP error ${xhr.status}`));
                }
            }
        });

        xhr.addEventListener("error", () => {
            reject(new Error("Network error"));
        });

        xhr.addEventListener("timeout", () => {
            reject(new Error("Request timeout"));
        });

        xhr.open("POST", url, true);
        xhr.timeout = 600000; // 10 minutes for large files
        if (token) {
            xhr.setRequestHeader("Authorization", `Bearer ${token}`);
        }
        xhr.send(formData);

        // Notify processing phase after upload completes
        if (onProgress) {
            xhr.upload.addEventListener("loadend", () => {
                onProgress({ phase: "processing", progress: 0 });
            });
        }
    });
}

// Export API
export async function exportMeeting(id) {
    return request(`/meetings/${id}/export`, {

@@ -3,3 +3,5 @@ faster-whisper>=1.0.0
opencc-python-reimplemented>=0.1.7
numpy>=1.26.0
onnxruntime>=1.16.0
+pydub>=0.25.0
+audioop-lts>=0.2.1  # Required for Python 3.13+ (audioop removed from stdlib)

@@ -20,9 +20,10 @@ import tempfile
import base64
import uuid
import re
import wave
import urllib.request
from pathlib import Path
-from typing import Optional, List
+from typing import Optional, List, Tuple

os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"

@@ -105,8 +106,7 @@ class SileroVAD:
    def __init__(self, model_path: Optional[str] = None, threshold: float = 0.5):
        self.threshold = threshold
        self.session = None
-       self._h = np.zeros((2, 1, 64), dtype=np.float32)
-       self._c = np.zeros((2, 1, 64), dtype=np.float32)
+       self._state = np.zeros((2, 1, 128), dtype=np.float32)
        self.sample_rate = 16000

        if not ONNX_AVAILABLE:
@@ -141,8 +141,7 @@ class SileroVAD:

    def reset_states(self):
        """Reset hidden states."""
-       self._h = np.zeros((2, 1, 64), dtype=np.float32)
-       self._c = np.zeros((2, 1, 64), dtype=np.float32)
+       self._state = np.zeros((2, 1, 128), dtype=np.float32)

    def __call__(self, audio: np.ndarray) -> float:
        """Run VAD on audio chunk, return speech probability."""
@@ -153,15 +152,14 @@ class SileroVAD:
        if audio.ndim == 1:
            audio = audio[np.newaxis, :]

-       # Run inference
+       # Run inference with updated model format
        ort_inputs = {
            'input': audio.astype(np.float32),
-           'sr': np.array([self.sample_rate], dtype=np.int64),
-           'h': self._h,
-           'c': self._c
+           'state': self._state,
+           'sr': np.array(self.sample_rate, dtype=np.int64)
        }

-       output, self._h, self._c = self.session.run(None, ort_inputs)
+       output, self._state = self.session.run(None, ort_inputs)
        return float(output[0][0])
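    # Editor's note: this change appears to track the newer Silero VAD ONNX
    # export, which replaces the separate LSTM inputs 'h' and 'c' (each
    # 2x1x64) with a single combined 'state' tensor (2x1x128) and returns
    # (output, state) from session.run().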

@@ -406,6 +404,193 @@ class Transcriber:
            print(json.dumps({"error": f"Transcription error: {e}"}), file=sys.stderr)
            return ""

    def segment_audio_file(
        self,
        audio_path: str,
        max_chunk_seconds: int = 300,
        min_silence_ms: int = 500,
        output_dir: Optional[str] = None
    ) -> dict:
        """
        Segment an audio file using VAD for natural speech boundaries.

        Args:
            audio_path: Path to the audio file
            max_chunk_seconds: Maximum duration per chunk (default 5 minutes)
            min_silence_ms: Minimum silence duration to consider as boundary (default 500ms)
            output_dir: Directory to save chunks (default: temp directory)

        Returns:
            dict with segments list and metadata
        """
        try:
            # Import audio processing libraries
            try:
                from pydub import AudioSegment
            except ImportError:
                return {"error": "pydub not installed. Run: pip install pydub"}

            if not os.path.exists(audio_path):
                return {"error": f"File not found: {audio_path}"}

            # Create output directory
            if output_dir is None:
                output_dir = tempfile.mkdtemp(prefix="audio_segments_")
            else:
                os.makedirs(output_dir, exist_ok=True)

            # Load audio file and convert to mono 16kHz
            print(json.dumps({"status": "loading_audio", "file": audio_path}), file=sys.stderr)
            audio = AudioSegment.from_file(audio_path)
            audio = audio.set_channels(1).set_frame_rate(16000)
            total_duration_ms = len(audio)
            total_duration_sec = total_duration_ms / 1000

            print(json.dumps({
                "status": "audio_loaded",
                "duration_seconds": total_duration_sec
            }), file=sys.stderr)

            # Convert to numpy for VAD processing
            samples = np.array(audio.get_array_of_samples(), dtype=np.float32) / 32768.0

            # Run VAD to detect speech regions
            segments = []
            current_start = 0
            max_chunk_samples = max_chunk_seconds * 16000
            min_silence_samples = int(min_silence_ms * 16)  # 16 samples per ms at 16kHz
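            # Editor's note, worked example of the unit conversions above:
            # at 16 kHz there are 16 samples per millisecond, so
            # min_silence_ms=500 gives 500 * 16 = 8,000 samples, and
            # max_chunk_seconds=180 gives 180 * 16000 = 2,880,000 samples
            # (about 5.8 MB of 16-bit mono PCM).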

            if self.vad_model is None or self.vad_model.session is None:
                # No VAD available, use fixed-time splitting
                print(json.dumps({"warning": "VAD not available, using fixed-time splitting"}), file=sys.stderr)
                chunk_idx = 0
                for start_sample in range(0, len(samples), max_chunk_samples):
                    end_sample = min(start_sample + max_chunk_samples, len(samples))
                    chunk_samples = samples[start_sample:end_sample]

                    # Export chunk
                    chunk_path = os.path.join(output_dir, f"chunk_{chunk_idx:03d}.wav")
                    self._export_wav(chunk_samples, chunk_path)

                    segments.append({
                        "index": chunk_idx,
                        "path": chunk_path,
                        "start": start_sample / 16000,
                        "end": end_sample / 16000,
                        "duration": (end_sample - start_sample) / 16000
                    })
                    chunk_idx += 1
            else:
                # Use VAD for intelligent splitting
                print(json.dumps({"status": "running_vad"}), file=sys.stderr)
                self.vad_model.reset_states()

                # Find silence regions for splitting
                window_size = 512
                silence_starts = []
                in_silence = False
                silence_start = 0

                for i in range(0, len(samples) - window_size, window_size):
                    window = samples[i:i + window_size]
                    speech_prob = self.vad_model(window)

                    if speech_prob < 0.3:  # Silence threshold
                        if not in_silence:
                            in_silence = True
                            silence_start = i
                    else:
                        if in_silence:
                            silence_duration = i - silence_start
                            if silence_duration >= min_silence_samples:
                                # Mark middle of silence as potential split point
                                silence_starts.append(silence_start + silence_duration // 2)
                            in_silence = False

                # Add end of file as final split point
                silence_starts.append(len(samples))

                # Create segments based on silence boundaries
                chunk_idx = 0
                current_start = 0

                for split_point in silence_starts:
                    # Check if we need to split here
                    chunk_duration = split_point - current_start

                    if chunk_duration >= max_chunk_samples or split_point == len(samples):
                        # Find the best split point before max duration
                        if chunk_duration > max_chunk_samples:
                            # Find nearest silence point before max
                            best_split = current_start + max_chunk_samples
                            for sp in silence_starts:
                                if current_start < sp <= current_start + max_chunk_samples:
                                    best_split = sp
                            split_point = best_split

                        # Export chunk
                        chunk_samples = samples[current_start:split_point]
                        if len(chunk_samples) > 8000:  # At least 0.5 seconds
                            chunk_path = os.path.join(output_dir, f"chunk_{chunk_idx:03d}.wav")
                            self._export_wav(chunk_samples, chunk_path)

                            segments.append({
                                "index": chunk_idx,
                                "path": chunk_path,
                                "start": current_start / 16000,
                                "end": split_point / 16000,
                                "duration": (split_point - current_start) / 16000
                            })
                            chunk_idx += 1

                        current_start = split_point

                # Handle any remaining audio - split into max_chunk_samples pieces
                while current_start < len(samples):
                    remaining_len = len(samples) - current_start
                    if remaining_len < 8000:  # Less than 0.5 seconds
                        break

                    # Determine chunk end (respect max_chunk_samples)
                    chunk_end = min(current_start + max_chunk_samples, len(samples))
                    chunk_samples = samples[current_start:chunk_end]

                    chunk_path = os.path.join(output_dir, f"chunk_{chunk_idx:03d}.wav")
                    self._export_wav(chunk_samples, chunk_path)
                    segments.append({
                        "index": chunk_idx,
                        "path": chunk_path,
                        "start": current_start / 16000,
                        "end": chunk_end / 16000,
                        "duration": len(chunk_samples) / 16000
                    })
                    chunk_idx += 1
                    current_start = chunk_end

            print(json.dumps({
                "status": "segmentation_complete",
                "total_segments": len(segments)
            }), file=sys.stderr)

            return {
                "status": "success",
                "segments": segments,
                "total_segments": len(segments),
                "total_duration": total_duration_sec,
                "output_dir": output_dir
            }

        except Exception as e:
            return {"error": f"Segmentation error: {str(e)}"}

    def _export_wav(self, samples: np.ndarray, output_path: str):
        """Export numpy samples to WAV file."""
        with wave.open(output_path, 'wb') as wf:
            wf.setnchannels(1)
            wf.setsampwidth(2)
            wf.setframerate(16000)
            wf.writeframes((samples * 32768).astype(np.int16).tobytes())
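            # Editor's note: scaling by 32768 can wrap int16 for samples at
            # exactly +1.0 (32768 does not fit in int16); clipping first,
            # e.g. np.clip(samples, -1.0, 32767 / 32768.0), would be a safe
            # guard.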

    def handle_command(self, cmd: dict) -> Optional[dict]:
        """Handle a JSON command."""
        action = cmd.get("action")
@@ -447,6 +632,21 @@ class Transcriber:
            self.streaming_session = None
            return result

        elif action == "segment_audio":
            # Segment audio file using VAD
            file_path = cmd.get("file_path")
            if not file_path:
                return {"error": "No file_path specified"}
            max_chunk_seconds = cmd.get("max_chunk_seconds", 300)
            min_silence_ms = cmd.get("min_silence_ms", 500)
            output_dir = cmd.get("output_dir")
            return self.segment_audio_file(
                file_path,
                max_chunk_seconds=max_chunk_seconds,
                min_silence_ms=min_silence_ms,
                output_dir=output_dir
            )

        elif action == "ping":
            return {"status": "pong"}

start.sh
@@ -173,9 +173,23 @@ start_backend() {
    local backend_pid=$!
    echo "BACKEND_PID=$backend_pid" >> "$PID_FILE"

-   # 等待啟動
-   sleep 2
+   # 等待啟動(最多等待 15 秒)
+   local max_wait=15
+   local waited=0
+   log_info "等待後端服務啟動..."
+
+   while [ $waited -lt $max_wait ]; do
+       sleep 1
+       waited=$((waited + 1))
+
+       # 檢查健康狀態
+       if curl -s http://localhost:$BACKEND_PORT/api/health > /dev/null 2>&1; then
+           log_success "後端服務已啟動 (PID: $backend_pid, Port: $BACKEND_PORT)"
+           return 0
+       fi
+   done
+
+   # 最後再檢查一次 port 狀態
+   if check_port $BACKEND_PORT; then
+       log_success "後端服務已啟動 (PID: $backend_pid, Port: $BACKEND_PORT)"
+   else