feat: Add Dify audio transcription with VAD chunking and SSE progress

- Add audio file upload transcription via Dify STT API
- Implement VAD-based audio segmentation in sidecar (3-min chunks)
- Add SSE endpoint for real-time transcription progress updates
- Fix chunk size enforcement for reliable uploads
- Add retry logic with exponential backoff for API calls
- Support Python 3.13+ with audioop-lts package
- Update frontend with Chinese progress messages and chunk display
- Improve start.sh health check with retry loop

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-11 21:00:27 +08:00
parent e790f48967
commit 263eb1c394
10 changed files with 1008 additions and 16 deletions
--- a/client/src/pages/meeting-detail.html
+++ b/client/src/pages/meeting-detail.html
@@ -96,6 +96,33 @@
      color: #666;
      font-style: italic;
    }
+    .upload-progress { /* Upload/transcription progress panel; hidden until the .active class is added */
+      display: none;
+      padding: 10px 15px;
+      background: #fff3e0;
+      border-radius: 6px;
+      margin-bottom: 10px;
+    }
+    .upload-progress.active { /* Visible state of the progress panel */
+      display: block;
+    }
+    .upload-progress-bar { /* Grey track that contains the fill element */
+      height: 6px;
+      background: #e0e0e0;
+      border-radius: 3px;
+      overflow: hidden; /* clip the fill to the rounded track */
+      margin-top: 8px;
+    }
+    .upload-progress-fill { /* Orange fill; starts empty and grows as its width is changed */
+      height: 100%;
+      background: #ff9800;
+      width: 0%;
+      transition: width 0.3s ease; /* animate width changes instead of jumping */
+    }
+    .upload-progress-text { /* Status message displayed with the bar */
+      font-size: 13px;
+      color: #e65100;
+    }
    .transcript-textarea {
      width: 100%;
      min-height: 400px;
@@ -143,8 +170,10 @@
      <div class="panel">
        <div class="panel-header">
          <span>Transcript (逐字稿)</span>
-          <div class="recording-controls" style="padding: 0;">
+          <div class="recording-controls" style="padding: 0; display: flex; gap: 8px;">
            <button class="btn btn-danger" id="record-btn">Start Recording</button>
+            <button class="btn btn-secondary" id="upload-audio-btn">Upload Audio</button>
+            <input type="file" id="audio-file-input" accept=".mp3,.wav,.m4a,.webm,.ogg,.flac,.aac" style="display: none;">
          </div>
        </div>
        <div class="panel-body">
@@ -155,6 +184,14 @@
            <span class="segment-count" id="segment-count">Segments: 0</span>
          </div>

+          <!-- Upload Progress: status text plus a bar; hidden by CSS until the "active" class is added -->
+          <div id="upload-progress" class="upload-progress">
+            <span class="upload-progress-text" id="upload-progress-text">Uploading...</span>
+            <div class="upload-progress-bar">
+              <div class="upload-progress-fill" id="upload-progress-fill"></div>
+            </div>
+          </div>
+
          <!-- Single Transcript Textarea -->
          <div id="transcript-container">
            <textarea
@@ -203,7 +240,8 @@
      updateMeeting,
      deleteMeeting,
      exportMeeting,
-      summarizeTranscript
+      summarizeTranscript,
+      transcribeAudio
    } from '../services/api.js';

    const meetingId = localStorage.getItem('currentMeetingId');
@@ -234,6 +272,11 @@
    const deleteBtn = document.getElementById('delete-btn');
    const addConclusionBtn = document.getElementById('add-conclusion-btn');
    const addActionBtn = document.getElementById('add-action-btn');
+    const uploadAudioBtn = document.getElementById('upload-audio-btn'); // "Upload Audio" button
+    const audioFileInput = document.getElementById('audio-file-input'); // hidden file picker for audio files
+    const uploadProgressEl = document.getElementById('upload-progress'); // progress panel container
+    const uploadProgressText = document.getElementById('upload-progress-text'); // progress status message
+    const uploadProgressFill = document.getElementById('upload-progress-fill'); // bar fill whose width shows progress

    // Load meeting data
    async function loadMeeting() {
@@ -460,6 +503,86 @@
      processingIndicatorEl.classList.add('hidden');
    }

+    // === Audio File Upload ===
+    uploadAudioBtn.addEventListener('click', () => {
+      if (isRecording) {
+        alert('Please stop recording before uploading audio.');
+        return;
+      }
+      audioFileInput.click();
+    });
+
+    audioFileInput.addEventListener('change', async (e) => {
+      const file = e.target.files[0];
+      if (!file) return;
+
+      // Validate file size (500MB max)
+      const maxSize = 500 * 1024 * 1024;
+      if (file.size > maxSize) {
+        alert('File too large. Maximum size is 500MB.');
+        audioFileInput.value = '';
+        return;
+      }
+
+      // Confirm if transcript has content
+      const currentTranscript = transcriptTextEl.value.trim();
+      if (currentTranscript) {
+        if (!confirm('This will replace the existing transcript. Do you want to continue?')) {
+          audioFileInput.value = '';
+          return;
+        }
+      }
+
+      // Start upload
+      uploadAudioBtn.disabled = true;
+      recordBtn.disabled = true;
+      uploadProgressEl.classList.add('active');
+      uploadProgressFill.style.width = '0%';
+      uploadProgressText.textContent = 'Uploading audio file...';
+
+      try {
+        const result = await transcribeAudio(file, (progress) => {
+          if (progress.phase === 'uploading') {
+            uploadProgressFill.style.width = `${progress.progress}%`;
+            uploadProgressText.textContent = `上傳中: ${progress.progress}%`;
+          } else if (progress.phase === 'processing') {
+            uploadProgressFill.style.width = `${progress.progress}%`;
+            uploadProgressText.textContent = progress.message || '處理中...';
+          } else if (progress.phase === 'transcribing') {
+            uploadProgressFill.style.width = `${progress.progress}%`;
+            if (progress.total && progress.current) {
+              uploadProgressText.textContent = `轉錄中: ${progress.current}/${progress.total} 片段 (${progress.progress}%)`;
+            } else {
+              uploadProgressText.textContent = progress.message || '轉錄中...';
+            }
+          } else if (progress.phase === 'complete') {
+            uploadProgressFill.style.width = '100%';
+            uploadProgressText.textContent = progress.message || '轉錄完成';
+          }
+        });
+
+        // Success - update transcript
+        transcriptTextEl.value = result.transcript || '';
+        const chunksInfo = result.chunks_failed > 0
+          ? `${result.chunks_processed}/${result.chunks_total} 片段成功`
+          : `${result.chunks_processed} 片段`;
+        uploadProgressText.textContent = `轉錄完成！(${chunksInfo}, ${Math.round(result.total_duration_seconds)}秒)`;
+
+        // Auto-hide progress after 3 seconds
+        setTimeout(() => {
+          uploadProgressEl.classList.remove('active');
+        }, 3000);
+
+      } catch (error) {
+        alert('Error transcribing audio: ' + error.message);
+        uploadProgressEl.classList.remove('active');
+      } finally {
+        uploadAudioBtn.disabled = false;
+        recordBtn.disabled = false;
+        audioFileInput.value = '';
+      }
+    });
+
    // === Streaming Event Handlers (legacy, kept for future use) ===
    window.electronAPI.onTranscriptionSegment((segment) => {
      console.log('Received segment:', segment);
--- a/client/src/services/api.js
+++ b/client/src/services/api.js
@@ -141,6 +141,231 @@ export async function summarizeTranscript(transcript) {
  });
 }

+export async function transcribeAudio(file, onProgress = null) {
+  const url = `${API_BASE_URL}/ai/transcribe-audio-stream`;
+  const formData = new FormData();
+  formData.append("file", file);
+
+  const token = getToken();
+
+  return new Promise((resolve, reject) => {
+    // Use fetch for SSE support
+    fetch(url, {
+      method: "POST",
+      headers: {
+        Authorization: token ? `Bearer ${token}` : undefined,
+      },
+      body: formData,
+    })
+      .then((response) => {
+        if (response.status === 401) {
+          clearToken();
+          window.electronAPI?.navigate("login");
+          throw new Error("Session expired, please login again");
+        }
+
+        if (!response.ok) {
+          return response.json().then((error) => {
+            throw new Error(error.detail || `HTTP error ${response.status}`);
+          });
+        }
+
+        if (onProgress) {
+          onProgress({ phase: "processing", progress: 0, message: "處理中..." });
+        }
+
+        // Read SSE stream
+        const reader = response.body.getReader();
+        const decoder = new TextDecoder();
+        let buffer = "";
+        let result = null;
+        let totalChunks = 0;
+        let processedChunks = 0;
+
+        function processLine(line) {
+          if (line.startsWith("data: ")) {
+            try {
+              const data = JSON.parse(line.slice(6));
+
+              switch (data.event) {
+                case "start":
+                case "segmenting":
+                  if (onProgress) {
+                    onProgress({
+                      phase: "processing",
+                      progress: 5,
+                      message: data.message,
+                    });
+                  }
+                  break;
+
+                case "segments_ready":
+                  totalChunks = data.total;
+                  if (onProgress) {
+                    onProgress({
+                      phase: "transcribing",
+                      progress: 10,
+                      total: totalChunks,
+                      current: 0,
+                      message: data.message,
+                    });
+                  }
+                  break;
+
+                case "chunk_start":
+                  if (onProgress) {
+                    const progress = 10 + ((data.chunk - 1) / totalChunks) * 85;
+                    onProgress({
+                      phase: "transcribing",
+                      progress: Math.round(progress),
+                      total: totalChunks,
+                      current: data.chunk,
+                      message: data.message,
+                    });
+                  }
+                  break;
+
+                case "chunk_done":
+                  processedChunks++;
+                  if (onProgress) {
+                    const progress = 10 + (data.chunk / totalChunks) * 85;
+                    onProgress({
+                      phase: "transcribing",
+                      progress: Math.round(progress),
+                      total: totalChunks,
+                      current: data.chunk,
+                      message: data.message,
+                    });
+                  }
+                  break;
+
+                case "chunk_error":
+                  console.warn(`Chunk ${data.chunk} error: ${data.message}`);
+                  break;
+
+                case "error":
+                  throw new Error(data.message);
+
+                case "complete":
+                  result = {
+                    transcript: data.transcript,
+                    chunks_processed: data.chunks_processed,
+                    chunks_total: data.chunks_total,
+                    total_duration_seconds: data.duration,
+                    language: "zh",
+                  };
+                  if (onProgress) {
+                    onProgress({
+                      phase: "complete",
+                      progress: 100,
+                      message: "轉錄完成",
+                    });
+                  }
+                  break;
+              }
+            } catch (e) {
+              console.warn("SSE parse error:", e, line);
+            }
+          }
+        }
+
+        function read() {
+          reader
+            .read()
+            .then(({ done, value }) => {
+              if (done) {
+                // Process any remaining buffer
+                if (buffer.trim()) {
+                  buffer.split("\n").forEach(processLine);
+                }
+                if (result) {
+                  resolve(result);
+                } else {
+                  reject(new Error("Transcription failed - no result received"));
+                }
+                return;
+              }
+
+              buffer += decoder.decode(value, { stream: true });
+              const lines = buffer.split("\n");
+              buffer = lines.pop() || ""; // Keep incomplete line in buffer
+
+              lines.forEach(processLine);
+              read();
+            })
+            .catch(reject);
+        }
+
+        read();
+      })
+      .catch(reject);
+  });
+}
+
+// Legacy non-streaming version (fallback)
+export async function transcribeAudioLegacy(file, onProgress = null) {
+  const url = `${API_BASE_URL}/ai/transcribe-audio`;
+  const formData = new FormData();
+  formData.append("file", file);
+
+  const token = getToken();
+
+  return new Promise((resolve, reject) => {
+    const xhr = new XMLHttpRequest();
+
+    xhr.upload.addEventListener("progress", (event) => {
+      if (event.lengthComputable && onProgress) {
+        const percentComplete = Math.round((event.loaded / event.total) * 100);
+        onProgress({ phase: "uploading", progress: percentComplete });
+      }
+    });
+
+    xhr.addEventListener("load", () => {
+      if (xhr.status >= 200 && xhr.status < 300) {
+        try {
+          const response = JSON.parse(xhr.responseText);
+          resolve(response);
+        } catch (e) {
+          reject(new Error("Invalid response format"));
+        }
+      } else if (xhr.status === 401) {
+        clearToken();
+        window.electronAPI?.navigate("login");
+        reject(new Error("Session expired, please login again"));
+      } else {
+        try {
+          const error = JSON.parse(xhr.responseText);
+          reject(new Error(error.detail || `HTTP error ${xhr.status}`));
+        } catch (e) {
+          reject(new Error(`HTTP error ${xhr.status}`));
+        }
+      }
+    });
+
+    xhr.addEventListener("error", () => {
+      reject(new Error("Network error"));
+    });
+
+    xhr.addEventListener("timeout", () => {
+      reject(new Error("Request timeout"));
+    });
+
+    xhr.open("POST", url, true);
+    xhr.timeout = 600000; // 10 minutes for large files
+    if (token) {
+      xhr.setRequestHeader("Authorization", `Bearer ${token}`);
+    }
+    xhr.send(formData);
+
+    // Notify processing phase after upload completes
+    if (onProgress) {
+      xhr.upload.addEventListener("loadend", () => {
+        onProgress({ phase: "processing", progress: 0 });
+      });
+    }
+  });
+}
+
 // Export API
 export async function exportMeeting(id) {
  return request(`/meetings/${id}/export`, {