Fix live caller audio latency and choppiness
- Reduce capture chunk from 4096 to 640 samples (256ms → 40ms)
- Replace BufferSource scheduling with AudioWorklet playback ring buffer
- Add 80ms jitter buffer with linear interpolation upsampling
- Reduce host mic and live caller stream blocksizes from 4096/2048 to 1024
- Replace librosa.resample with numpy index-based (nearest-sample) resampling in send_audio_to_caller

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -366,7 +366,7 @@ class AudioService:
|
|||||||
channels=num_channels,
|
channels=num_channels,
|
||||||
dtype=np.float32,
|
dtype=np.float32,
|
||||||
callback=callback,
|
callback=callback,
|
||||||
blocksize=2048,
|
blocksize=1024,
|
||||||
)
|
)
|
||||||
self._live_caller_stream.start()
|
self._live_caller_stream.start()
|
||||||
print(f"[Audio] Live caller stream started on ch {self.live_caller_channel} @ {device_sr}Hz")
|
print(f"[Audio] Live caller stream started on ch {self.live_caller_channel} @ {device_sr}Hz")
|
||||||
@@ -438,7 +438,7 @@ class AudioService:
|
|||||||
channels=max_channels,
|
channels=max_channels,
|
||||||
samplerate=device_sr,
|
samplerate=device_sr,
|
||||||
dtype=np.float32,
|
dtype=np.float32,
|
||||||
blocksize=4096,
|
blocksize=1024,
|
||||||
callback=callback,
|
callback=callback,
|
||||||
)
|
)
|
||||||
self._host_stream.start()
|
self._host_stream.start()
|
||||||
|
|||||||
@@ -119,9 +119,12 @@ class CallerService:
|
|||||||
try:
|
try:
|
||||||
if sample_rate != 16000:
|
if sample_rate != 16000:
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import librosa
|
|
||||||
audio = np.frombuffer(pcm_data, dtype=np.int16).astype(np.float32) / 32768.0
|
audio = np.frombuffer(pcm_data, dtype=np.int16).astype(np.float32) / 32768.0
|
||||||
audio = librosa.resample(audio, orig_sr=sample_rate, target_sr=16000)
|
ratio = 16000 / sample_rate
|
||||||
|
out_len = int(len(audio) * ratio)
|
||||||
|
indices = (np.arange(out_len) / ratio).astype(int)
|
||||||
|
indices = np.clip(indices, 0, len(audio) - 1)
|
||||||
|
audio = audio[indices]
|
||||||
pcm_data = (audio * 32767).astype(np.int16).tobytes()
|
pcm_data = (audio * 32767).astype(np.int16).tobytes()
|
||||||
await ws.send_bytes(pcm_data)
|
await ws.send_bytes(pcm_data)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
|||||||
@@ -150,6 +150,6 @@
|
|||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
<script src="/js/call-in.js"></script>
|
<script src="/js/call-in.js?v=2"></script>
|
||||||
</body>
|
</body>
|
||||||
</html>
|
</html>
|
||||||
|
|||||||
@@ -207,6 +207,6 @@
|
|||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
<script src="/js/app.js?v=11"></script>
|
<script src="/js/app.js?v=12"></script>
|
||||||
</body>
|
</body>
|
||||||
</html>
|
</html>
|
||||||
|
|||||||
@@ -8,7 +8,7 @@ let ws = null;
|
|||||||
let audioCtx = null;
|
let audioCtx = null;
|
||||||
let micStream = null;
|
let micStream = null;
|
||||||
let workletNode = null;
|
let workletNode = null;
|
||||||
let nextPlayTime = 0;
|
let playbackNode = null;
|
||||||
let callerId = null;
|
let callerId = null;
|
||||||
|
|
||||||
const callBtn = document.getElementById('call-btn');
|
const callBtn = document.getElementById('call-btn');
|
||||||
@@ -39,19 +39,19 @@ async function startCall() {
|
|||||||
// Set up AudioContext
|
// Set up AudioContext
|
||||||
audioCtx = new AudioContext({ sampleRate: 48000 });
|
audioCtx = new AudioContext({ sampleRate: 48000 });
|
||||||
|
|
||||||
// Register worklet processor inline via blob
|
// Register worklet processors inline via blob
|
||||||
const processorCode = `
|
const processorCode = `
|
||||||
|
// --- Capture processor: downsample to 16kHz, emit small chunks ---
|
||||||
class CallerProcessor extends AudioWorkletProcessor {
|
class CallerProcessor extends AudioWorkletProcessor {
|
||||||
constructor() {
|
constructor() {
|
||||||
super();
|
super();
|
||||||
this.buffer = [];
|
this.buffer = [];
|
||||||
this.targetSamples = 4096; // ~256ms at 16kHz
|
this.targetSamples = 640; // 40ms at 16kHz — low latency
|
||||||
}
|
}
|
||||||
process(inputs) {
|
process(inputs) {
|
||||||
const input = inputs[0][0];
|
const input = inputs[0][0];
|
||||||
if (!input) return true;
|
if (!input) return true;
|
||||||
|
|
||||||
// Downsample from sampleRate to 16000
|
|
||||||
const ratio = sampleRate / 16000;
|
const ratio = sampleRate / 16000;
|
||||||
for (let i = 0; i < input.length; i += ratio) {
|
for (let i = 0; i < input.length; i += ratio) {
|
||||||
const idx = Math.floor(i);
|
const idx = Math.floor(i);
|
||||||
@@ -60,7 +60,7 @@ class CallerProcessor extends AudioWorkletProcessor {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (this.buffer.length >= this.targetSamples) {
|
while (this.buffer.length >= this.targetSamples) {
|
||||||
const chunk = this.buffer.splice(0, this.targetSamples);
|
const chunk = this.buffer.splice(0, this.targetSamples);
|
||||||
const int16 = new Int16Array(chunk.length);
|
const int16 = new Int16Array(chunk.length);
|
||||||
for (let i = 0; i < chunk.length; i++) {
|
for (let i = 0; i < chunk.length; i++) {
|
||||||
@@ -73,6 +73,70 @@ class CallerProcessor extends AudioWorkletProcessor {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
registerProcessor('caller-processor', CallerProcessor);
|
registerProcessor('caller-processor', CallerProcessor);
|
||||||
|
|
||||||
|
// --- Playback processor: ring buffer with 16kHz->sampleRate upsampling ---
|
||||||
|
/**
 * Playback processor: buffers 16 kHz mono samples posted to its port in a
 * ring buffer and renders them at the context rate using linear
 * interpolation.
 *
 * Messages: an array-like of float samples (the caller posts a Float32Array).
 * Output: channel 0 of the first output only.
 *
 * The read position is tracked as an integer index (readPos) plus a
 * fractional phase (readFrac) that is carried from one render block to the
 * next. Rounding the per-block consumption up to a whole sample instead
 * would play the stream slightly fast, drain the jitter buffer, and restart
 * the interpolation phase at every block boundary.
 *
 * NOTE: this source lives inside a template literal, so it must not contain
 * backtick characters or dollar-brace sequences.
 */
class PlaybackProcessor extends AudioWorkletProcessor {
    constructor() {
        super();
        this.ringSize = 16000 * 3; // 3s ring buffer at 16kHz
        this.ring = new Float32Array(this.ringSize);
        this.writePos = 0;   // next ring index to write
        this.readPos = 0;    // ring index of the oldest unread sample
        this.readFrac = 0;   // fractional offset past readPos, in [0, 1)
        this.available = 0;  // whole unread samples currently buffered
        this.started = false;
        this.jitterMs = 80;  // buffer 80ms before starting playback
        this.jitterSamples = Math.floor(16000 * this.jitterMs / 1000);

        this.port.onmessage = (e) => {
            const data = e.data;
            // Ignore anything without a usable length; adding undefined to
            // the counter would turn it into NaN and silence playback for good.
            if (!data || !(data.length > 0)) return;

            for (let i = 0; i < data.length; i++) {
                this.ring[this.writePos] = data[i];
                this.writePos = (this.writePos + 1) % this.ringSize;
            }
            this.available += data.length;
            if (this.available > this.ringSize) {
                // Overflow: the oldest samples were overwritten. The ring is
                // now full, so the oldest surviving sample sits at writePos.
                this.available = this.ringSize;
                this.readPos = this.writePos;
                this.readFrac = 0;
            }
        };
    }

    /**
     * Render one block of output.
     *
     * @param {Float32Array[][]} inputs - unused.
     * @param {Float32Array[][]} outputs - outputs[0][0] receives the audio.
     * @returns {boolean} always true, to keep the processor alive.
     */
    process(inputs, outputs) {
        const output = outputs[0][0];
        if (!output) return true;

        // Wait for jitter buffer to fill before starting
        if (!this.started) {
            if (this.available < this.jitterSamples) {
                output.fill(0);
                return true;
            }
            this.started = true;
        }

        // sampleRate is the global provided by the AudioWorklet scope.
        const ratio = 16000 / sampleRate;

        // Source position (relative to readPos) reached once this block is done.
        const endPos = this.readFrac + output.length * ratio;
        const advance = Math.floor(endPos);

        // The last output sample reads ring[lastIdx] and ring[lastIdx + 1];
        // both must be real buffered data, hence lastIdx + 2 samples.
        const lastIdx = Math.floor(this.readFrac + (output.length - 1) * ratio);
        const srcNeeded = Math.max(advance, lastIdx + 2);

        if (this.available >= srcNeeded) {
            for (let i = 0; i < output.length; i++) {
                const srcPos = this.readFrac + i * ratio;
                const idx = Math.floor(srcPos);
                const frac = srcPos - idx;
                const p0 = (this.readPos + idx) % this.ringSize;
                const p1 = (p0 + 1) % this.ringSize;
                output[i] = this.ring[p0] * (1 - frac) + this.ring[p1] * frac;
            }
            // Consume only the whole samples passed; keep the remainder as
            // phase so average consumption is exactly 16000 samples/second.
            this.readPos = (this.readPos + advance) % this.ringSize;
            this.readFrac = endPos - advance;
            this.available -= advance;
        } else {
            // Underrun — silence, reset jitter buffer
            output.fill(0);
            this.started = false;
        }
        return true;
    }
}
|
||||||
|
registerProcessor('playback-processor', PlaybackProcessor);
|
||||||
`;
|
`;
|
||||||
const blob = new Blob([processorCode], { type: 'application/javascript' });
|
const blob = new Blob([processorCode], { type: 'application/javascript' });
|
||||||
const blobUrl = URL.createObjectURL(blob);
|
const blobUrl = URL.createObjectURL(blob);
|
||||||
@@ -120,6 +184,10 @@ registerProcessor('caller-processor', CallerProcessor);
|
|||||||
source.connect(workletNode);
|
source.connect(workletNode);
|
||||||
// Don't connect worklet to destination — we don't want to hear our own mic
|
// Don't connect worklet to destination — we don't want to hear our own mic
|
||||||
|
|
||||||
|
// Set up playback worklet for received audio
|
||||||
|
playbackNode = new AudioWorkletNode(audioCtx, 'playback-processor');
|
||||||
|
playbackNode.connect(audioCtx.destination);
|
||||||
|
|
||||||
// Show mic meter
|
// Show mic meter
|
||||||
const analyser = audioCtx.createAnalyser();
|
const analyser = audioCtx.createAnalyser();
|
||||||
analyser.fftSize = 256;
|
analyser.fftSize = 256;
|
||||||
@@ -144,7 +212,6 @@ function handleControlMessage(msg) {
|
|||||||
setStatus(`Waiting in queue (position ${msg.position})...`, false);
|
setStatus(`Waiting in queue (position ${msg.position})...`, false);
|
||||||
} else if (msg.status === 'on_air') {
|
} else if (msg.status === 'on_air') {
|
||||||
setStatus('ON AIR', true);
|
setStatus('ON AIR', true);
|
||||||
nextPlayTime = audioCtx.currentTime;
|
|
||||||
} else if (msg.status === 'disconnected') {
|
} else if (msg.status === 'disconnected') {
|
||||||
setStatus('Disconnected', false);
|
setStatus('Disconnected', false);
|
||||||
cleanup();
|
cleanup();
|
||||||
@@ -152,27 +219,15 @@ function handleControlMessage(msg) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
function handleAudioData(buffer) {
|
function handleAudioData(buffer) {
|
||||||
if (!audioCtx) return;
|
if (!playbackNode) return;
|
||||||
|
|
||||||
|
// Convert Int16 PCM to Float32 and send to playback worklet
|
||||||
const int16 = new Int16Array(buffer);
|
const int16 = new Int16Array(buffer);
|
||||||
const float32 = new Float32Array(int16.length);
|
const float32 = new Float32Array(int16.length);
|
||||||
for (let i = 0; i < int16.length; i++) {
|
for (let i = 0; i < int16.length; i++) {
|
||||||
float32[i] = int16[i] / 32768;
|
float32[i] = int16[i] / 32768;
|
||||||
}
|
}
|
||||||
|
playbackNode.port.postMessage(float32, [float32.buffer]);
|
||||||
const audioBuf = audioCtx.createBuffer(1, float32.length, 16000);
|
|
||||||
audioBuf.copyToChannel(float32, 0);
|
|
||||||
|
|
||||||
const source = audioCtx.createBufferSource();
|
|
||||||
source.buffer = audioBuf;
|
|
||||||
source.connect(audioCtx.destination);
|
|
||||||
|
|
||||||
const now = audioCtx.currentTime;
|
|
||||||
if (nextPlayTime < now) {
|
|
||||||
nextPlayTime = now;
|
|
||||||
}
|
|
||||||
source.start(nextPlayTime);
|
|
||||||
nextPlayTime += audioBuf.duration;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
function hangUp() {
|
function hangUp() {
|
||||||
@@ -188,6 +243,10 @@ function cleanup() {
|
|||||||
workletNode.disconnect();
|
workletNode.disconnect();
|
||||||
workletNode = null;
|
workletNode = null;
|
||||||
}
|
}
|
||||||
|
if (playbackNode) {
|
||||||
|
playbackNode.disconnect();
|
||||||
|
playbackNode = null;
|
||||||
|
}
|
||||||
if (micStream) {
|
if (micStream) {
|
||||||
micStream.getTracks().forEach(t => t.stop());
|
micStream.getTracks().forEach(t => t.stop());
|
||||||
micStream = null;
|
micStream = null;
|
||||||
|
|||||||
Reference in New Issue
Block a user