From 7aed4d9c3478a299762bd7455a042716a2766d98 Mon Sep 17 00:00:00 2001
From: tcpsyn <tcpsyn@gmail.com>
Date: Thu, 5 Feb 2026 16:32:27 -0700
Subject: [PATCH] Fix live caller audio latency and choppiness
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Reduce capture chunk from 4096 to 640 samples (256ms → 40ms)
- Replace BufferSource scheduling with AudioWorklet playback ring buffer
- Add 80ms jitter buffer with linear interpolation upsampling
- Reduce host mic and live caller stream blocksizes from 4096/2048 to 1024
- Replace librosa.resample with numpy index-based (nearest-sample) resampling in send_audio_to_caller

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 backend/services/audio.py          |   4 +-
 backend/services/caller_service.py |   7 +-
 frontend/call-in.html              |   2 +-
 frontend/index.html                |   2 +-
 frontend/js/call-in.js             | 101 +++++++++++++++++++++++------
 5 files changed, 89 insertions(+), 27 deletions(-)
diff --git a/backend/services/audio.py b/backend/services/audio.py
index 4d76cf5..05dbb0e 100644
--- a/backend/services/audio.py
+++ b/backend/services/audio.py
@@ -366,7 +366,7 @@ class AudioService:
             channels=num_channels,
             dtype=np.float32,
             callback=callback,
-            blocksize=2048,
+            blocksize=1024,
         )
         self._live_caller_stream.start()
         print(f"[Audio] Live caller stream started on ch {self.live_caller_channel} @ {device_sr}Hz")
@@ -438,7 +438,7 @@ class AudioService:
                 channels=max_channels,
                 samplerate=device_sr,
                 dtype=np.float32,
-                blocksize=4096,
+                blocksize=1024,
                 callback=callback,
             )
             self._host_stream.start()
diff --git a/backend/services/caller_service.py b/backend/services/caller_service.py
index 9ac870c..75d282e 100644
--- a/backend/services/caller_service.py
+++ b/backend/services/caller_service.py
@@ -119,9 +119,12 @@ class CallerService:
         try:
             if sample_rate != 16000:
                 import numpy as np
-                import librosa
                 audio = np.frombuffer(pcm_data, dtype=np.int16).astype(np.float32) / 32768.0
-                audio = librosa.resample(audio, orig_sr=sample_rate, target_sr=16000)
+                ratio = 16000 / sample_rate
+                out_len = int(len(audio) * ratio)
+                indices = (np.arange(out_len) / ratio).astype(int)
+                indices = np.clip(indices, 0, len(audio) - 1)
+                audio = audio[indices]
                 pcm_data = (audio * 32767).astype(np.int16).tobytes()
             await ws.send_bytes(pcm_data)
         except Exception as e:
diff --git a/frontend/call-in.html b/frontend/call-in.html
index b662c22..aa72937 100644
--- a/frontend/call-in.html
+++ b/frontend/call-in.html
@@ -150,6 +150,6 @@
         </div>
     </div>
 
-    <script src="/js/call-in.js"></script>
+    <script src="/js/call-in.js?v=2"></script>
 </body>
 </html>
diff --git a/frontend/index.html b/frontend/index.html
index b15f4b9..e704219 100644
--- a/frontend/index.html
+++ b/frontend/index.html
@@ -207,6 +207,6 @@
         </div>
     </div>
 
-    <script src="/js/app.js?v=11"></script>
+    <script src="/js/app.js?v=12"></script>
 </body>
 </html>
diff --git a/frontend/js/call-in.js b/frontend/js/call-in.js
index dbf4990..a7c4b76 100644
--- a/frontend/js/call-in.js
+++ b/frontend/js/call-in.js
@@ -8,7 +8,7 @@ let ws = null;
 let audioCtx = null;
 let micStream = null;
 let workletNode = null;
-let nextPlayTime = 0;
+let playbackNode = null;
 let callerId = null;
 
 const callBtn = document.getElementById('call-btn');
@@ -39,19 +39,19 @@ async function startCall() {
         // Set up AudioContext
         audioCtx = new AudioContext({ sampleRate: 48000 });
 
-        // Register worklet processor inline via blob
+        // Register worklet processors inline via blob
         const processorCode = `
+// --- Capture processor: downsample to 16kHz, emit small chunks ---
 class CallerProcessor extends AudioWorkletProcessor {
     constructor() {
         super();
         this.buffer = [];
-        this.targetSamples = 4096; // ~256ms at 16kHz
+        this.targetSamples = 640; // 40ms at 16kHz — low latency
     }
     process(inputs) {
         const input = inputs[0][0];
         if (!input) return true;
 
-        // Downsample from sampleRate to 16000
         const ratio = sampleRate / 16000;
         for (let i = 0; i < input.length; i += ratio) {
             const idx = Math.floor(i);
@@ -60,7 +60,7 @@ class CallerProcessor extends AudioWorkletProcessor {
             }
         }
 
-        if (this.buffer.length >= this.targetSamples) {
+        while (this.buffer.length >= this.targetSamples) {
             const chunk = this.buffer.splice(0, this.targetSamples);
             const int16 = new Int16Array(chunk.length);
             for (let i = 0; i < chunk.length; i++) {
@@ -73,6 +73,70 @@ class CallerProcessor extends AudioWorkletProcessor {
     }
 }
 registerProcessor('caller-processor', CallerProcessor);
+
+// --- Playback processor: ring buffer with 16kHz->sampleRate upsampling ---
+class PlaybackProcessor extends AudioWorkletProcessor {
+    // Buffers 16kHz samples received on the message port and renders them at the context sampleRate.
+    constructor() {
+        super();
+        this.ringSize = 16000 * 3; // 3s ring buffer at 16kHz
+        this.ring = new Float32Array(this.ringSize);
+        this.writePos = 0;
+        this.readPos = 0; // ring index of the next unread source sample
+        this.readFrac = 0; // sub-sample offset in [0, 1) past readPos, kept across blocks
+        this.available = 0;
+        this.started = false;
+        this.jitterMs = 80; // buffer 80ms before starting playback
+        this.jitterSamples = Math.floor(16000 * this.jitterMs / 1000);
+        this.port.onmessage = ({ data }) => {
+            for (let i = 0; i < data.length; i++) {
+                this.ring[this.writePos] = data[i];
+                this.writePos = (this.writePos + 1) % this.ringSize;
+            }
+            this.available += data.length;
+            if (this.available > this.ringSize) {
+                // Overflow — only the newest ringSize samples survive; the oldest one sits at writePos
+                this.available = this.ringSize;
+                this.readPos = this.writePos;
+            }
+        };
+    }
+    process(inputs, outputs) {
+        const output = outputs[0][0];
+        if (!output) return true;
+        // Wait for jitter buffer to fill before starting (or restarting after an underrun)
+        if (!this.started && this.available < this.jitterSamples) {
+            output.fill(0);
+            return true;
+        }
+        this.started = true;
+        const ratio = 16000 / sampleRate;
+        // Exact source position after this block; consuming a rounded-up whole count each block would outpace the 16kHz input
+        const endPos = this.readFrac + output.length * ratio;
+        const consumed = Math.floor(endPos);
+        // The last output sample interpolates with the source sample after its floor index, so that one must be buffered too
+        const srcNeeded = Math.max(consumed, Math.floor(this.readFrac + (output.length - 1) * ratio) + 2);
+        if (this.available >= srcNeeded) {
+            for (let i = 0; i < output.length; i++) {
+                const srcPos = this.readFrac + i * ratio;
+                const idx = Math.floor(srcPos);
+                const frac = srcPos - idx;
+                const p0 = (this.readPos + idx) % this.ringSize;
+                const p1 = (p0 + 1) % this.ringSize;
+                output[i] = this.ring[p0] * (1 - frac) + this.ring[p1] * frac;
+            }
+            this.readFrac = endPos - consumed; // carry the leftover fraction into the next block
+            this.readPos = (this.readPos + consumed) % this.ringSize;
+            this.available -= consumed;
+        } else {
+            // Underrun — silence, reset jitter buffer
+            output.fill(0);
+            this.started = false;
+        }
+        return true;
+    }
+}
+registerProcessor('playback-processor', PlaybackProcessor);
 `;
         const blob = new Blob([processorCode], { type: 'application/javascript' });
         const blobUrl = URL.createObjectURL(blob);
@@ -120,6 +184,10 @@ registerProcessor('caller-processor', CallerProcessor);
         source.connect(workletNode);
         // Don't connect worklet to destination — we don't want to hear our own mic
 
+        // Set up playback worklet for received audio
+        playbackNode = new AudioWorkletNode(audioCtx, 'playback-processor');
+        playbackNode.connect(audioCtx.destination);
+
         // Show mic meter
         const analyser = audioCtx.createAnalyser();
         analyser.fftSize = 256;
@@ -144,7 +212,6 @@ function handleControlMessage(msg) {
         setStatus(`Waiting in queue (position ${msg.position})...`, false);
     } else if (msg.status === 'on_air') {
         setStatus('ON AIR', true);
-        nextPlayTime = audioCtx.currentTime;
     } else if (msg.status === 'disconnected') {
         setStatus('Disconnected', false);
         cleanup();
@@ -152,27 +219,15 @@ function handleControlMessage(msg) {
 }
 
 function handleAudioData(buffer) {
-    if (!audioCtx) return;
+    if (!playbackNode) return;
 
+    // Convert Int16 PCM to Float32 (divide by 32768) and transfer the buffer to the playback worklet
     const int16 = new Int16Array(buffer);
     const float32 = new Float32Array(int16.length);
     for (let i = 0; i < int16.length; i++) {
         float32[i] = int16[i] / 32768;
     }
-
-    const audioBuf = audioCtx.createBuffer(1, float32.length, 16000);
-    audioBuf.copyToChannel(float32, 0);
-
-    const source = audioCtx.createBufferSource();
-    source.buffer = audioBuf;
-    source.connect(audioCtx.destination);
-
-    const now = audioCtx.currentTime;
-    if (nextPlayTime < now) {
-        nextPlayTime = now;
-    }
-    source.start(nextPlayTime);
-    nextPlayTime += audioBuf.duration;
+    playbackNode.port.postMessage(float32, [float32.buffer]);
 }
 
 function hangUp() {
@@ -188,6 +243,10 @@ function cleanup() {
         workletNode.disconnect();
         workletNode = null;
     }
+    if (playbackNode) {
+        playbackNode.disconnect();
+        playbackNode = null;
+    }
     if (micStream) {
         micStream.getTracks().forEach(t => t.stop());
         micStream = null;