Fix live caller audio latency and choppiness

- Reduce capture chunk from 4096 to 640 samples (256ms → 40ms)
- Replace BufferSource scheduling with AudioWorklet playback ring buffer
- Add 80ms jitter buffer with linear interpolation upsampling
- Reduce host mic and live caller stream blocksizes from 4096/2048 to 1024
- Replace librosa.resample with numpy interpolation in send_audio_to_caller

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-05 16:32:27 -07:00
parent ab36ad8d5b
commit 7aed4d9c34
5 changed files with 89 additions and 27 deletions
@@ -366,7 +366,7 @@ class AudioService:
            channels=num_channels,
            dtype=np.float32,
            callback=callback,
-            blocksize=2048,
+            blocksize=1024,
        )
        self._live_caller_stream.start()
        print(f"[Audio] Live caller stream started on ch {self.live_caller_channel} @ {device_sr}Hz")
@@ -438,7 +438,7 @@ class AudioService:
                channels=max_channels,
                samplerate=device_sr,
                dtype=np.float32,
-                blocksize=4096,
+                blocksize=1024,
                callback=callback,
            )
            self._host_stream.start()
@@ -119,9 +119,12 @@ class CallerService:
        try:
            if sample_rate != 16000:
                import numpy as np
                import librosa
                audio = np.frombuffer(pcm_data, dtype=np.int16).astype(np.float32) / 32768.0
-                audio = librosa.resample(audio, orig_sr=sample_rate, target_sr=16000)
+                ratio = 16000 / sample_rate
                out_len = int(len(audio) * ratio)
                indices = (np.arange(out_len) / ratio).astype(int)
                indices = np.clip(indices, 0, len(audio) - 1)
                audio = audio[indices]
                pcm_data = (audio * 32767).astype(np.int16).tobytes()
            await ws.send_bytes(pcm_data)
        except Exception as e:
@@ -150,6 +150,6 @@
        </div>
    </div>
-    <script src="/js/call-in.js"></script>
+    <script src="/js/call-in.js?v=2"></script>
 </body>
 </html>
@@ -207,6 +207,6 @@
        </div>
    </div>
-    <script src="/js/app.js?v=11"></script>
+    <script src="/js/app.js?v=12"></script>
 </body>
 </html>
@@ -8,7 +8,7 @@ let ws = null;
 let audioCtx = null;
 let micStream = null;
 let workletNode = null;
-let nextPlayTime = 0;
+let playbackNode = null;
 let callerId = null;
 const callBtn = document.getElementById('call-btn');
@@ -39,19 +39,19 @@ async function startCall() {
        // Set up AudioContext
        audioCtx = new AudioContext({ sampleRate: 48000 });
-        // Register worklet processor inline via blob
+        // Register worklet processors inline via blob
        const processorCode = `
 // --- Capture processor: downsample to 16kHz, emit small chunks ---
 class CallerProcessor extends AudioWorkletProcessor {
    constructor() {
        super();
        this.buffer = [];
-        this.targetSamples = 4096; // ~256ms at 16kHz
+        this.targetSamples = 640; // 40ms at 16kHz — low latency
    }
    process(inputs) {
        const input = inputs[0][0];
        if (!input) return true;
        // Downsample from sampleRate to 16000
        const ratio = sampleRate / 16000;
        for (let i = 0; i < input.length; i += ratio) {
            const idx = Math.floor(i);
@@ -60,7 +60,7 @@ class CallerProcessor extends AudioWorkletProcessor {
            }
        }
-        if (this.buffer.length >= this.targetSamples) {
+        while (this.buffer.length >= this.targetSamples) {
            const chunk = this.buffer.splice(0, this.targetSamples);
            const int16 = new Int16Array(chunk.length);
            for (let i = 0; i < chunk.length; i++) {
@@ -73,6 +73,70 @@ class CallerProcessor extends AudioWorkletProcessor {
    }
 }
 registerProcessor('caller-processor', CallerProcessor);
 // --- Playback processor: ring buffer with 16kHz->sampleRate upsampling ---
 /**
  * Playback worklet: buffers 16 kHz mono Float32 samples posted to its port in
  * a ring buffer and renders them at the context sample rate using linear
  * interpolation.
  *
  * The fractional read position is carried from one render quantum to the
  * next, so the stream is consumed at exactly 16000 samples per second of
  * output. Rounding the per-block consumption up instead would drain the
  * jitter buffer faster than it is filled and cause periodic underruns.
  */
 class PlaybackProcessor extends AudioWorkletProcessor {
    constructor() {
        super();
        this.ringSize = 16000 * 3; // 3s ring buffer at 16kHz
        this.ring = new Float32Array(this.ringSize);
        this.writePos = 0;
        this.readPos = 0;
        // Sub-sample offset (0 <= readFrac < 1) of the next output sample
        // relative to readPos; keeps resampling phase continuous across blocks.
        this.readFrac = 0;
        this.available = 0;
        this.started = false;
        this.jitterMs = 80; // buffer 80ms before starting playback
        this.jitterSamples = Math.floor(16000 * this.jitterMs / 1000);

        // Each message carries an array-like of 16 kHz samples to append.
        this.port.onmessage = (e) => {
            const data = e.data;
            for (let i = 0; i < data.length; i++) {
                this.ring[this.writePos] = data[i];
                this.writePos = (this.writePos + 1) % this.ringSize;
            }
            this.available += data.length;
            if (this.available > this.ringSize) {
                // Overflow — the writer lapped the reader. Keep only the newest
                // ringSize samples; the oldest surviving sample sits at writePos.
                this.available = this.ringSize;
                this.readPos = this.writePos;
            }
        };
    }

    /**
     * Fills the first channel of the first output with upsampled audio.
     * Emits silence until the jitter buffer has filled, and again (while
     * re-arming the jitter buffer) whenever too few samples are buffered.
     *
     * @param {Float32Array[][]} inputs - unused
     * @param {Float32Array[][]} outputs - outputs[0][0] receives the audio
     * @returns {boolean} always true, to keep the processor alive
     */
    process(inputs, outputs) {
        const output = outputs[0][0];
        if (!output) return true;

        // Wait for jitter buffer to fill before starting
        if (!this.started) {
            if (this.available < this.jitterSamples) {
                output.fill(0);
                return true;
            }
            this.started = true;
        }

        const ratio = 16000 / sampleRate;
        // Source position one past the last output sample of this block.
        const endPos = this.readFrac + output.length * ratio;
        const consumed = Math.floor(endPos);
        // The last output sample interpolates between floor(lastPos) and
        // floor(lastPos) + 1, so both of those samples must be buffered.
        const lastPos = this.readFrac + (output.length - 1) * ratio;
        const srcNeeded = Math.max(Math.floor(lastPos) + 2, consumed);

        if (this.available >= srcNeeded) {
            for (let i = 0; i < output.length; i++) {
                const srcPos = this.readFrac + i * ratio;
                const idx = Math.floor(srcPos);
                const frac = srcPos - idx;
                const p0 = (this.readPos + idx) % this.ringSize;
                const p1 = (p0 + 1) % this.ringSize;
                output[i] = this.ring[p0] * (1 - frac) + this.ring[p1] * frac;
            }
            // Advance by whole samples only; the remainder carries over.
            this.readFrac = endPos - consumed;
            this.readPos = (this.readPos + consumed) % this.ringSize;
            this.available -= consumed;
        } else {
            // Underrun — silence, reset jitter buffer
            output.fill(0);
            this.started = false;
        }
        return true;
    }
 }
 registerProcessor('playback-processor', PlaybackProcessor);
 `;
        const blob = new Blob([processorCode], { type: 'application/javascript' });
        const blobUrl = URL.createObjectURL(blob);
@@ -120,6 +184,10 @@ registerProcessor('caller-processor', CallerProcessor);
        source.connect(workletNode);
        // Don't connect worklet to destination — we don't want to hear our own mic
        // Set up playback worklet for received audio
        playbackNode = new AudioWorkletNode(audioCtx, 'playback-processor');
        playbackNode.connect(audioCtx.destination);
        // Show mic meter
        const analyser = audioCtx.createAnalyser();
        analyser.fftSize = 256;
@@ -144,7 +212,6 @@ function handleControlMessage(msg) {
        setStatus(`Waiting in queue (position ${msg.position})...`, false);
    } else if (msg.status === 'on_air') {
        setStatus('ON AIR', true);
        nextPlayTime = audioCtx.currentTime;
    } else if (msg.status === 'disconnected') {
        setStatus('Disconnected', false);
        cleanup();
@@ -152,27 +219,15 @@ function handleControlMessage(msg) {
 }
 function handleAudioData(buffer) {
-    if (!audioCtx) return;
+    if (!playbackNode) return;
    // Convert Int16 PCM to Float32 and send to playback worklet
    const int16 = new Int16Array(buffer);
    const float32 = new Float32Array(int16.length);
    for (let i = 0; i < int16.length; i++) {
        float32[i] = int16[i] / 32768;
    }
-
+    playbackNode.port.postMessage(float32, [float32.buffer]);
    const audioBuf = audioCtx.createBuffer(1, float32.length, 16000);
    audioBuf.copyToChannel(float32, 0);
    const source = audioCtx.createBufferSource();
    source.buffer = audioBuf;
    source.connect(audioCtx.destination);
    const now = audioCtx.currentTime;
    if (nextPlayTime < now) {
        nextPlayTime = now;
    }
    source.start(nextPlayTime);
    nextPlayTime += audioBuf.duration;
 }
 function hangUp() {
@@ -188,6 +243,10 @@ function cleanup() {
        workletNode.disconnect();
        workletNode = null;
    }
    if (playbackNode) {
        playbackNode.disconnect();
        playbackNode = null;
    }
    if (micStream) {
        micStream.getTracks().forEach(t => t.stop());
        micStream = null;