feat: add continuous voice mode with VAD silence detection

- Voice mode: press mic once to enter, press again to exit
- VAD (Voice Activity Detection) auto-stops recording after 1.5s silence
- Continuous loop: speak → transcribe → agent responds → TTS plays → auto-listen
- Voice mode UI: input bar hides, large mic button centered
- Auto-restart listening when TTS playback finishes
- Fallback: restart listening on text response if no TTS arrives
This commit is contained in:
0xbyt4
2026-03-11 20:23:01 +03:00
parent d3e09df01a
commit c477f660da

View File

@@ -947,6 +947,21 @@ body {
animation: pulse 1.5s infinite;
box-shadow: 0 0 16px rgba(255,107,107,0.4);
}
/* Mic button while continuous voice mode is on: filled with the accent color. */
#voice-btn.voice-mode {
background: var(--accent);
border-color: var(--accent);
color: #fff;
box-shadow: 0 0 16px var(--accent-glow);
}
/* Voice-mode layout: center the (enlarged) mic button in the input bar. */
#input-bar.voice-mode-active {
justify-content: center;
}
/* Hide the text input and send button — voice is the only input channel. */
#input-bar.voice-mode-active #input,
#input-bar.voice-mode-active #send-btn { display: none; }
/* Enlarge the mic button (and its icon, below) as the single focal control. */
#input-bar.voice-mode-active #voice-btn {
width: 56px; height: 56px;
}
#input-bar.voice-mode-active #voice-btn svg { width: 26px; height: 26px; }
@keyframes pulse {
0%, 100% { opacity: 1; }
50% { opacity: 0.6; }
@@ -1014,6 +1029,9 @@ let authToken = '';
let isRecording = false;            // true while MediaRecorder is capturing mic audio
let mediaRecorder = null;           // active MediaRecorder instance, or null
let audioChunks = [];               // Blob chunks accumulated during the current recording
let voiceMode = false;              // true while continuous voice mode is engaged
let voiceAwaitingResponse = false;  // a 'voice' message was sent; waiting for the reply
let currentTtsAudio = null;         // Audio element for in-flight TTS playback, or null
let typingTimeout = null;
let autoScroll = true;
@@ -1090,6 +1108,16 @@ function handleServerMessage(data) {
case 'message':
hideTyping();
addBotMessage(data.id, data.content, data.timestamp);
// In voice mode, if no TTS audio is coming, restart listening after text
if (voiceMode && voiceAwaitingResponse) {
// Give a short delay for play_audio to arrive
setTimeout(() => {
if (voiceMode && voiceAwaitingResponse && !currentTtsAudio) {
voiceAwaitingResponse = false;
startRecording();
}
}, 2000);
}
break;
case 'edit':
@@ -1122,7 +1150,16 @@ function handleServerMessage(data) {
case 'play_audio':
// Invisible TTS playback — no UI element, just play audio
{ const a = new Audio(data.url); a.play().catch(() => {}); }
{
const a = new Audio(data.url);
currentTtsAudio = a;
voiceAwaitingResponse = false;
a.onended = () => {
currentTtsAudio = null;
if (voiceMode) startRecording();
};
a.play().catch(() => { currentTtsAudio = null; if (voiceMode) startRecording(); });
}
break;
case 'error':
@@ -1155,28 +1192,63 @@ function autoGrow(el) {
el.style.height = Math.min(el.scrollHeight, 120) + 'px';
}
// --- Voice Mode & Recording with VAD ---
// RMS level at or below this counts as silence; time-domain samples are in [-1, 1],
// so 0.015 is a quiet room-noise floor. Tune if the mic is unusually hot or quiet.
const SILENCE_THRESHOLD = 0.015;
const SILENCE_DURATION = 1500; // ms of continuous silence (after speech) to auto-stop
// Toggle continuous voice mode from the mic button: one press enters the
// listen → transcribe → respond → TTS → listen loop, the next press exits it.
// (The rendered diff interleaved the deleted pre-change lines — the old
// isRecording/stopRecording/startRecording branch — with the added ones,
// leaving an unbalanced block; this is the coherent post-change version.
// Kept async so existing callers that treat the return as a promise still work.)
async function toggleVoice() {
  if (voiceMode) {
    exitVoiceMode();
  } else {
    enterVoiceMode();
  }
}
// Switch the UI into continuous voice mode and immediately begin listening.
// The CSS classes hide the text input and enlarge/center the mic button.
function enterVoiceMode() {
  voiceMode = true;
  const micBtn = document.getElementById('voice-btn');
  const inputBar = document.getElementById('input-bar');
  micBtn.classList.add('voice-mode');
  inputBar.classList.add('voice-mode-active');
  startRecording();
}
// Leave voice mode: clear the mode flags, restore the normal input-bar UI,
// cut off any TTS audio still playing, and stop the recorder if active.
function exitVoiceMode() {
  voiceMode = false;
  voiceAwaitingResponse = false;
  const micBtn = document.getElementById('voice-btn');
  const inputBar = document.getElementById('input-bar');
  micBtn.classList.remove('voice-mode');
  inputBar.classList.remove('voice-mode-active');
  if (currentTtsAudio) {
    currentTtsAudio.pause();
    currentTtsAudio = null;
  }
  stopRecording();
}
// Start capturing microphone audio with voice-activity detection.
// An AnalyserNode monitors the input level; once the user has spoken,
// SILENCE_DURATION ms of continuous silence auto-stops the recorder,
// and the captured webm/opus blob is base64-encoded and sent over the
// websocket as a 'voice' message.
// (This function is split across two diff hunks; the hunk header below
// at the original L142 position marks omitted unchanged context.)
async function startRecording() {
if (isRecording) return;
try {
const stream = await navigator.mediaDevices.getUserMedia({audio: true});
audioChunks = [];
// Set up VAD with AnalyserNode
const audioCtx = new (window.AudioContext || window.webkitAudioContext)();
const source = audioCtx.createMediaStreamSource(stream);
const analyser = audioCtx.createAnalyser();
analyser.fftSize = 512;
source.connect(analyser);
const dataArray = new Float32Array(analyser.fftSize);
let silenceStart = null;   // timestamp when the current silent stretch began
let hasSpoken = false;     // becomes true on the first above-threshold sample
let vadActive = true;      // gates the checkVAD loop; cleared on stop
mediaRecorder = new MediaRecorder(stream, {mimeType: 'audio/webm;codecs=opus'});
mediaRecorder.ondataavailable = (e) => { if (e.data.size > 0) audioChunks.push(e.data); };
mediaRecorder.onstop = async () => {
// Tear down VAD and release the mic before deciding whether to send.
vadActive = false;
audioCtx.close();
stream.getTracks().forEach(t => t.stop());
// NOTE(review): the next line appears to be the stale pre-change (deleted)
// guard from the diff; the line after it is the current form, which also
// discards takes where nothing above the silence threshold was heard.
if (audioChunks.length === 0) return;
if (audioChunks.length === 0 || !hasSpoken) return;
const blob = new Blob(audioChunks, {type: 'audio/webm'});
const reader = new FileReader();
reader.onloadend = () => {
// Strip the "data:...;base64," prefix; send only the base64 payload.
const b64 = reader.result.split(',')[1];
voiceAwaitingResponse = true;
ws.send(JSON.stringify({type: 'voice', audio: b64, format: 'webm'}));
};
reader.readAsDataURL(blob);
@@ -1184,8 +1256,32 @@ async function startRecording() {
mediaRecorder.start();
isRecording = true;
document.getElementById('voice-btn').classList.add('recording');
// VAD loop — detect silence to auto-stop
function checkVAD() {
if (!vadActive || !isRecording) return;
analyser.getFloatTimeDomainData(dataArray);
// RMS of the time-domain samples ≈ instantaneous input level.
let sum = 0;
for (let i = 0; i < dataArray.length; i++) sum += dataArray[i] * dataArray[i];
const rms = Math.sqrt(sum / dataArray.length);
if (rms > SILENCE_THRESHOLD) {
// Speech detected: mark it and reset any running silence timer.
hasSpoken = true;
silenceStart = null;
} else if (hasSpoken) {
// Only count silence after speech, so dead air before the user
// starts talking never triggers an auto-stop.
if (!silenceStart) silenceStart = Date.now();
else if (Date.now() - silenceStart > SILENCE_DURATION) {
stopRecording();
return;
}
}
// NOTE(review): requestAnimationFrame is throttled or paused in background
// tabs, so VAD may stall if the tab loses focus — consider setInterval; TODO confirm.
requestAnimationFrame(checkVAD);
}
requestAnimationFrame(checkVAD);
} catch (err) {
// getUserMedia rejected (permission denied or no device): surface it and
// back out of voice mode so the UI doesn't stay stuck in listening state.
addSystemMessage('Microphone access denied.');
if (voiceMode) exitVoiceMode();
}
}