From 7aed4d9c3478a299762bd7455a042716a2766d98 Mon Sep 17 00:00:00 2001 From: tcpsyn Date: Thu, 5 Feb 2026 16:32:27 -0700 Subject: [PATCH] Fix live caller audio latency and choppiness MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Reduce capture chunk from 4096 to 640 samples (256ms → 40ms) - Replace BufferSource scheduling with AudioWorklet playback ring buffer - Add 80ms jitter buffer with linear interpolation upsampling - Reduce host mic and live caller stream blocksizes from 4096/2048 to 1024 - Replace librosa.resample with numpy interpolation in send_audio_to_caller Co-Authored-By: Claude Opus 4.6 --- backend/services/audio.py | 4 +- backend/services/caller_service.py | 7 +- frontend/call-in.html | 2 +- frontend/index.html | 2 +- frontend/js/call-in.js | 101 +++++++++++++++++++++++------ 5 files changed, 89 insertions(+), 27 deletions(-) diff --git a/backend/services/audio.py b/backend/services/audio.py index 4d76cf5..05dbb0e 100644 --- a/backend/services/audio.py +++ b/backend/services/audio.py @@ -366,7 +366,7 @@ class AudioService: channels=num_channels, dtype=np.float32, callback=callback, - blocksize=2048, + blocksize=1024, ) self._live_caller_stream.start() print(f"[Audio] Live caller stream started on ch {self.live_caller_channel} @ {device_sr}Hz") @@ -438,7 +438,7 @@ class AudioService: channels=max_channels, samplerate=device_sr, dtype=np.float32, - blocksize=4096, + blocksize=1024, callback=callback, ) self._host_stream.start() diff --git a/backend/services/caller_service.py b/backend/services/caller_service.py index 9ac870c..75d282e 100644 --- a/backend/services/caller_service.py +++ b/backend/services/caller_service.py @@ -119,9 +119,12 @@ class CallerService: try: if sample_rate != 16000: import numpy as np - import librosa audio = np.frombuffer(pcm_data, dtype=np.int16).astype(np.float32) / 32768.0 - audio = librosa.resample(audio, orig_sr=sample_rate, target_sr=16000) + ratio = 16000 / sample_rate + out_len = int(len(audio) * ratio) + indices = (np.arange(out_len) / ratio).astype(int) + indices = np.clip(indices, 0, len(audio) - 1) + audio = audio[indices] pcm_data = (audio * 32767).astype(np.int16).tobytes() await ws.send_bytes(pcm_data) except Exception as e: diff --git a/frontend/call-in.html b/frontend/call-in.html index b662c22..aa72937 100644 --- a/frontend/call-in.html +++ b/frontend/call-in.html @@ -150,6 +150,6 @@ - + diff --git a/frontend/index.html b/frontend/index.html index b15f4b9..e704219 100644 --- a/frontend/index.html +++ b/frontend/index.html @@ -207,6 +207,6 @@ - + diff --git a/frontend/js/call-in.js b/frontend/js/call-in.js index dbf4990..a7c4b76 100644 --- a/frontend/js/call-in.js +++ b/frontend/js/call-in.js @@ -8,7 +8,7 @@ let ws = null; let audioCtx = null; let micStream = null; let workletNode = null; -let nextPlayTime = 0; +let playbackNode = null; let callerId = null; const callBtn = document.getElementById('call-btn'); @@ -39,19 +39,19 @@ async function startCall() { // Set up AudioContext audioCtx = new AudioContext({ sampleRate: 48000 }); - // Register worklet processor inline via blob + // Register worklet processors inline via blob const processorCode = ` +// --- Capture processor: downsample to 16kHz, emit small chunks --- class CallerProcessor extends AudioWorkletProcessor { constructor() { super(); this.buffer = []; - this.targetSamples = 4096; // ~256ms at 16kHz + this.targetSamples = 640; // 40ms at 16kHz — low latency } process(inputs) { const input = inputs[0][0]; if (!input) return true; - // Downsample from sampleRate to 16000 const ratio = sampleRate / 16000; for (let i = 0; i < input.length; i += ratio) { const idx = Math.floor(i); @@ -60,7 +60,7 @@ class CallerProcessor extends AudioWorkletProcessor { } } - if (this.buffer.length >= this.targetSamples) { + while (this.buffer.length >= this.targetSamples) { const chunk = this.buffer.splice(0, this.targetSamples); const int16 = new Int16Array(chunk.length); for (let i = 0; i < chunk.length; i++) { @@ -73,6 +73,70 @@ class CallerProcessor extends AudioWorkletProcessor { } } registerProcessor('caller-processor', CallerProcessor); + +// --- Playback processor: ring buffer with 16kHz->sampleRate upsampling --- +class PlaybackProcessor extends AudioWorkletProcessor { + constructor() { + super(); + this.ringSize = 16000 * 3; // 3s ring buffer at 16kHz + this.ring = new Float32Array(this.ringSize); + this.writePos = 0; + this.readPos = 0; + this.available = 0; + this.started = false; + this.jitterMs = 80; // buffer 80ms before starting playback + this.jitterSamples = Math.floor(16000 * this.jitterMs / 1000); + + this.port.onmessage = (e) => { + const data = e.data; + for (let i = 0; i < data.length; i++) { + this.ring[this.writePos] = data[i]; + this.writePos = (this.writePos + 1) % this.ringSize; + } + this.available += data.length; + if (this.available > this.ringSize) { + // Overflow — skip ahead + this.available = this.ringSize; + this.readPos = (this.writePos - this.ringSize + this.ringSize) % this.ringSize; + } + }; + } + process(inputs, outputs) { + const output = outputs[0][0]; + if (!output) return true; + + // Wait for jitter buffer to fill before starting + if (!this.started) { + if (this.available < this.jitterSamples) { + output.fill(0); + return true; + } + this.started = true; + } + + const ratio = 16000 / sampleRate; + const srcNeeded = Math.ceil(output.length * ratio); + + if (this.available >= srcNeeded) { + for (let i = 0; i < output.length; i++) { + const srcPos = i * ratio; + const idx = Math.floor(srcPos); + const frac = srcPos - idx; + const p0 = (this.readPos + idx) % this.ringSize; + const p1 = (p0 + 1) % this.ringSize; + output[i] = this.ring[p0] * (1 - frac) + this.ring[p1] * frac; + } + this.readPos = (this.readPos + srcNeeded) % this.ringSize; + this.available -= srcNeeded; + } else { + // Underrun — silence, reset jitter buffer + output.fill(0); + this.started = false; + } + return true; + } +} +registerProcessor('playback-processor', PlaybackProcessor); `; const blob = new Blob([processorCode], { type: 'application/javascript' }); const blobUrl = URL.createObjectURL(blob); @@ -120,6 +184,10 @@ registerProcessor('caller-processor', CallerProcessor); source.connect(workletNode); // Don't connect worklet to destination — we don't want to hear our own mic + // Set up playback worklet for received audio + playbackNode = new AudioWorkletNode(audioCtx, 'playback-processor'); + playbackNode.connect(audioCtx.destination); + // Show mic meter const analyser = audioCtx.createAnalyser(); analyser.fftSize = 256; @@ -144,7 +212,6 @@ function handleControlMessage(msg) { setStatus(`Waiting in queue (position ${msg.position})...`, false); } else if (msg.status === 'on_air') { setStatus('ON AIR', true); - nextPlayTime = audioCtx.currentTime; } else if (msg.status === 'disconnected') { setStatus('Disconnected', false); cleanup(); @@ -152,27 +219,15 @@ function handleControlMessage(msg) { } function handleAudioData(buffer) { - if (!audioCtx) return; + if (!playbackNode) return; + // Convert Int16 PCM to Float32 and send to playback worklet const int16 = new Int16Array(buffer); const float32 = new Float32Array(int16.length); for (let i = 0; i < int16.length; i++) { float32[i] = int16[i] / 32768; } - - const audioBuf = audioCtx.createBuffer(1, float32.length, 16000); - audioBuf.copyToChannel(float32, 0); - - const source = audioCtx.createBufferSource(); - source.buffer = audioBuf; - source.connect(audioCtx.destination); - - const now = audioCtx.currentTime; - if (nextPlayTime < now) { - nextPlayTime = now; - } - source.start(nextPlayTime); - nextPlayTime += audioBuf.duration; + playbackNode.port.postMessage(float32, [float32.buffer]); } function hangUp() { @@ -188,6 +243,10 @@ function cleanup() { workletNode.disconnect(); workletNode = null; } + if (playbackNode) { + playbackNode.disconnect(); + playbackNode = null; + } if (micStream) { micStream.getTracks().forEach(t => t.stop()); micStream = null;