feat: add continuous voice mode with VAD silence detection

- Voice mode: press mic once to enter, press again to exit
- VAD (Voice Activity Detection) auto-stops recording after 1.5s silence
- Continuous loop: speak → transcribe → agent responds → TTS plays → auto-listen
- Voice mode UI: input bar hides, large mic button centered
- Auto-restart listening when TTS playback finishes
- Fallback: restart listening on text response if no TTS arrives
This commit is contained in:
0xbyt4
2026-03-11 20:23:01 +03:00
parent d3e09df01a
commit c477f660da

View File

@@ -947,6 +947,21 @@ body {
animation: pulse 1.5s infinite;
box-shadow: 0 0 16px rgba(255,107,107,0.4);
}
/* Mic button while continuous voice mode is on: filled with the accent color. */
#voice-btn.voice-mode {
background: var(--accent);
border-color: var(--accent);
color: #fff;
box-shadow: 0 0 16px var(--accent-glow);
}
/* Voice-mode layout: center the (enlarged) mic button in the input bar. */
#input-bar.voice-mode-active {
justify-content: center;
}
/* Hide the text input and send button — voice is the only input channel. */
#input-bar.voice-mode-active #input,
#input-bar.voice-mode-active #send-btn { display: none; }
/* Enlarge the mic button (and its icon, below) as the single focal control. */
#input-bar.voice-mode-active #voice-btn {
width: 56px; height: 56px;
}
#input-bar.voice-mode-active #voice-btn svg { width: 26px; height: 26px; }
@keyframes pulse {
0%, 100% { opacity: 1; }
50% { opacity: 0.6; }
@@ -1014,6 +1029,9 @@ let authToken = '';
let isRecording = false;            // true while MediaRecorder is capturing mic audio
let mediaRecorder = null;           // active MediaRecorder instance, or null
let audioChunks = [];               // Blob chunks accumulated during the current recording
let voiceMode = false;              // true while continuous voice mode is engaged
let voiceAwaitingResponse = false;  // a 'voice' message was sent; waiting for the reply
let currentTtsAudio = null;         // Audio element for in-flight TTS playback, or null
let typingTimeout = null;
let autoScroll = true;
@@ -1090,6 +1108,16 @@ function handleServerMessage(data) {
case 'message':
hideTyping();
addBotMessage(data.id, data.content, data.timestamp);
// In voice mode, if no TTS audio is coming, restart listening after text
if (voiceMode && voiceAwaitingResponse) {
// Give a short delay for play_audio to arrive
setTimeout(() => {
if (voiceMode && voiceAwaitingResponse && !currentTtsAudio) {
voiceAwaitingResponse = false;
startRecording();
}
}, 2000);
}
break;
case 'edit':
@@ -1122,7 +1150,16 @@ function handleServerMessage(data) {
case 'play_audio':
// Invisible TTS playback — no UI element, just play audio
{ const a = new Audio(data.url); a.play().catch(() => {}); }
{
const a = new Audio(data.url);
currentTtsAudio = a;
voiceAwaitingResponse = false;
a.onended = () => {
currentTtsAudio = null;
if (voiceMode) startRecording();
};
a.play().catch(() => { currentTtsAudio = null; if (voiceMode) startRecording(); });
}
break;
case 'error':
@@ -1155,28 +1192,63 @@ function autoGrow(el) {
el.style.height = Math.min(el.scrollHeight, 120) + 'px';
}
// --- Voice Mode & Recording with VAD ---
// RMS level at or below this counts as silence; time-domain samples are in [-1, 1],
// so 0.015 is a quiet room-noise floor. Tune if the mic is unusually hot or quiet.
const SILENCE_THRESHOLD = 0.015;
const SILENCE_DURATION = 1500; // ms of continuous silence (after speech) to auto-stop
// Toggle continuous voice mode from the mic button: one press enters the
// listen → transcribe → respond → TTS → listen loop, the next press exits it.
// (The rendered diff interleaved the deleted pre-change lines — the old
// isRecording/stopRecording/startRecording branch — with the added ones,
// leaving an unbalanced block; this is the coherent post-change version.
// Kept async so existing callers that treat the return as a promise still work.)
async function toggleVoice() {
  if (voiceMode) {
    exitVoiceMode();
  } else {
    enterVoiceMode();
  }
}
// Switch the UI into continuous voice mode and immediately begin listening.
// The CSS classes hide the text input and enlarge/center the mic button.
function enterVoiceMode() {
  voiceMode = true;
  const micBtn = document.getElementById('voice-btn');
  const inputBar = document.getElementById('input-bar');
  micBtn.classList.add('voice-mode');
  inputBar.classList.add('voice-mode-active');
  startRecording();
}
// Leave voice mode: clear the mode flags, restore the normal input-bar UI,
// cut off any TTS audio still playing, and stop the recorder if active.
function exitVoiceMode() {
  voiceMode = false;
  voiceAwaitingResponse = false;
  const micBtn = document.getElementById('voice-btn');
  const inputBar = document.getElementById('input-bar');
  micBtn.classList.remove('voice-mode');
  inputBar.classList.remove('voice-mode-active');
  if (currentTtsAudio) {
    currentTtsAudio.pause();
    currentTtsAudio = null;
  }
  stopRecording();
}
// Start capturing microphone audio with voice-activity detection.
// An AnalyserNode monitors the input level; once the user has spoken,
// SILENCE_DURATION ms of continuous silence auto-stops the recorder,
// and the captured webm/opus blob is base64-encoded and sent over the
// websocket as a 'voice' message.
// (This function is split across two diff hunks; the hunk header below
// at the original L142 position marks omitted unchanged context.)
async function startRecording() {
if (isRecording) return;
try {
const stream = await navigator.mediaDevices.getUserMedia({audio: true});
audioChunks = [];
// Set up VAD with AnalyserNode
const audioCtx = new (window.AudioContext || window.webkitAudioContext)();
const source = audioCtx.createMediaStreamSource(stream);
const analyser = audioCtx.createAnalyser();
analyser.fftSize = 512;
source.connect(analyser);
const dataArray = new Float32Array(analyser.fftSize);
let silenceStart = null;   // timestamp when the current silent stretch began
let hasSpoken = false;     // becomes true on the first above-threshold sample
let vadActive = true;      // gates the checkVAD loop; cleared on stop
mediaRecorder = new MediaRecorder(stream, {mimeType: 'audio/webm;codecs=opus'});
mediaRecorder.ondataavailable = (e) => { if (e.data.size > 0) audioChunks.push(e.data); };
mediaRecorder.onstop = async () => {
// Tear down VAD and release the mic before deciding whether to send.
vadActive = false;
audioCtx.close();
stream.getTracks().forEach(t => t.stop());
// NOTE(review): the next line appears to be the stale pre-change (deleted)
// guard from the diff; the line after it is the current form, which also
// discards takes where nothing above the silence threshold was heard.
if (audioChunks.length === 0) return;
if (audioChunks.length === 0 || !hasSpoken) return;
const blob = new Blob(audioChunks, {type: 'audio/webm'});
const reader = new FileReader();
reader.onloadend = () => {
// Strip the "data:...;base64," prefix; send only the base64 payload.
const b64 = reader.result.split(',')[1];
voiceAwaitingResponse = true;
ws.send(JSON.stringify({type: 'voice', audio: b64, format: 'webm'}));
};
reader.readAsDataURL(blob);
@@ -1184,8 +1256,32 @@ async function startRecording() {
mediaRecorder.start();
isRecording = true;
document.getElementById('voice-btn').classList.add('recording');
// VAD loop — detect silence to auto-stop
function checkVAD() {
if (!vadActive || !isRecording) return;
analyser.getFloatTimeDomainData(dataArray);
// RMS of the time-domain samples ≈ instantaneous input level.
let sum = 0;
for (let i = 0; i < dataArray.length; i++) sum += dataArray[i] * dataArray[i];
const rms = Math.sqrt(sum / dataArray.length);
if (rms > SILENCE_THRESHOLD) {
// Speech detected: mark it and reset any running silence timer.
hasSpoken = true;
silenceStart = null;
} else if (hasSpoken) {
// Only count silence after speech, so dead air before the user
// starts talking never triggers an auto-stop.
if (!silenceStart) silenceStart = Date.now();
else if (Date.now() - silenceStart > SILENCE_DURATION) {
stopRecording();
return;
}
}
// NOTE(review): requestAnimationFrame is throttled or paused in background
// tabs, so VAD may stall if the tab loses focus — consider setInterval; TODO confirm.
requestAnimationFrame(checkVAD);
}
requestAnimationFrame(checkVAD);
} catch (err) {
// getUserMedia rejected (permission denied or no device): surface it and
// back out of voice mode so the UI doesn't stay stuck in listening state.
addSystemMessage('Microphone access denied.');
if (voiceMode) exitVoiceMode();
}
}