Replace queue with ring buffer for jitter absorption in live caller audio

- Server: ring buffer with a 150ms pre-buffer eliminates gaps caused by timing mismatches
- Browser playback: 150ms jitter buffer (up from 80ms) for network jitter
- Capture chunks: 960 samples/60ms (better network efficiency)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-05 16:37:50 -07:00
parent 7aed4d9c34
commit 4d97ea9099
3 changed files with 53 additions and 26 deletions

View File

@@ -55,7 +55,7 @@ class AudioService:
# Live caller routing state # Live caller routing state
self._live_caller_stream: Optional[sd.OutputStream] = None self._live_caller_stream: Optional[sd.OutputStream] = None
self._live_caller_queue: Optional[queue.Queue] = None self._live_caller_write: Optional[Callable] = None
# Sample rates # Sample rates
self.input_sample_rate = 16000 # For Whisper self.input_sample_rate = 16000 # For Whisper
@@ -325,15 +325,13 @@ class AudioService:
self._caller_stop_event.set() self._caller_stop_event.set()
def _start_live_caller_stream(self): def _start_live_caller_stream(self):
"""Start persistent output stream for live caller audio""" """Start persistent output stream with ring buffer jitter absorption"""
if self._live_caller_stream is not None: if self._live_caller_stream is not None:
return return
if self.output_device is None: if self.output_device is None:
return return
self._live_caller_queue = queue.Queue()
device_info = sd.query_devices(self.output_device) device_info = sd.query_devices(self.output_device)
num_channels = device_info['max_output_channels'] num_channels = device_info['max_output_channels']
device_sr = int(device_info['default_samplerate']) device_sr = int(device_info['default_samplerate'])
@@ -343,22 +341,52 @@ class AudioService:
self._live_caller_num_channels = num_channels self._live_caller_num_channels = num_channels
self._live_caller_channel_idx = channel_idx self._live_caller_channel_idx = channel_idx
# Ring buffer: 3 seconds capacity, 150ms pre-buffer before playback starts
ring_size = int(device_sr * 3)
ring = np.zeros(ring_size, dtype=np.float32)
prebuffer_samples = int(device_sr * 0.15)
# Mutable state shared between writer (main thread) and reader (audio callback)
# CPython GIL makes individual int reads/writes atomic
state = {"write_pos": 0, "read_pos": 0, "avail": 0, "started": False}
def write_audio(data):
n = len(data)
wp = state["write_pos"]
if wp + n <= ring_size:
ring[wp:wp + n] = data
else:
first = ring_size - wp
ring[wp:] = data[:first]
ring[:n - first] = data[first:]
state["write_pos"] = (wp + n) % ring_size
state["avail"] += n
def callback(outdata, frames, time_info, status): def callback(outdata, frames, time_info, status):
outdata.fill(0) outdata.fill(0)
written = 0 avail = state["avail"]
while written < frames:
try: if not state["started"]:
chunk = self._live_caller_queue.get_nowait() if avail >= prebuffer_samples:
end = min(written + len(chunk), frames) state["started"] = True
count = end - written else:
outdata[written:end, channel_idx] = chunk[:count] return
if count < len(chunk):
# Put remainder back (rare) if avail < frames:
leftover = chunk[count:] # Underrun — stop and re-buffer
self._live_caller_queue.put(leftover) state["started"] = False
written = end return
except Exception:
break rp = state["read_pos"]
if rp + frames <= ring_size:
outdata[:frames, channel_idx] = ring[rp:rp + frames]
else:
first = ring_size - rp
outdata[:first, channel_idx] = ring[rp:]
outdata[first:frames, channel_idx] = ring[:frames - first]
state["read_pos"] = (rp + frames) % ring_size
state["avail"] -= frames
self._live_caller_write = write_audio
self._live_caller_stream = sd.OutputStream( self._live_caller_stream = sd.OutputStream(
device=self.output_device, device=self.output_device,
@@ -369,7 +397,7 @@ class AudioService:
blocksize=1024, blocksize=1024,
) )
self._live_caller_stream.start() self._live_caller_stream.start()
print(f"[Audio] Live caller stream started on ch {self.live_caller_channel} @ {device_sr}Hz") print(f"[Audio] Live caller stream started on ch {self.live_caller_channel} @ {device_sr}Hz (prebuffer {prebuffer_samples} samples)")
def _stop_live_caller_stream(self): def _stop_live_caller_stream(self):
"""Stop persistent live caller output stream""" """Stop persistent live caller output stream"""
@@ -377,7 +405,7 @@ class AudioService:
self._live_caller_stream.stop() self._live_caller_stream.stop()
self._live_caller_stream.close() self._live_caller_stream.close()
self._live_caller_stream = None self._live_caller_stream = None
self._live_caller_queue = None self._live_caller_write = None
print("[Audio] Live caller stream stopped") print("[Audio] Live caller stream stopped")
def route_real_caller_audio(self, pcm_data: bytes, sample_rate: int): def route_real_caller_audio(self, pcm_data: bytes, sample_rate: int):
@@ -385,14 +413,12 @@ class AudioService:
if self.output_device is None: if self.output_device is None:
return return
# Ensure persistent stream is running
if self._live_caller_stream is None: if self._live_caller_stream is None:
self._start_live_caller_stream() self._start_live_caller_stream()
try: try:
audio = np.frombuffer(pcm_data, dtype=np.int16).astype(np.float32) / 32768.0 audio = np.frombuffer(pcm_data, dtype=np.int16).astype(np.float32) / 32768.0
# Simple decimation/interpolation instead of librosa
device_sr = self._live_caller_device_sr device_sr = self._live_caller_device_sr
if sample_rate != device_sr: if sample_rate != device_sr:
ratio = device_sr / sample_rate ratio = device_sr / sample_rate
@@ -401,7 +427,8 @@ class AudioService:
indices = np.clip(indices, 0, len(audio) - 1) indices = np.clip(indices, 0, len(audio) - 1)
audio = audio[indices] audio = audio[indices]
self._live_caller_queue.put(audio) if self._live_caller_write:
self._live_caller_write(audio)
except Exception as e: except Exception as e:
print(f"Real caller audio routing error: {e}") print(f"Real caller audio routing error: {e}")

View File

@@ -150,6 +150,6 @@
</div> </div>
</div> </div>
<script src="/js/call-in.js?v=2"></script> <script src="/js/call-in.js?v=3"></script>
</body> </body>
</html> </html>

View File

@@ -46,7 +46,7 @@ class CallerProcessor extends AudioWorkletProcessor {
constructor() { constructor() {
super(); super();
this.buffer = []; this.buffer = [];
this.targetSamples = 640; // 40ms at 16kHz — low latency this.targetSamples = 960; // 60ms at 16kHz
} }
process(inputs) { process(inputs) {
const input = inputs[0][0]; const input = inputs[0][0];
@@ -84,7 +84,7 @@ class PlaybackProcessor extends AudioWorkletProcessor {
this.readPos = 0; this.readPos = 0;
this.available = 0; this.available = 0;
this.started = false; this.started = false;
this.jitterMs = 80; // buffer 80ms before starting playback this.jitterMs = 150; // buffer 150ms before starting playback
this.jitterSamples = Math.floor(16000 * this.jitterMs / 1000); this.jitterSamples = Math.floor(16000 * this.jitterMs / 1000);
this.port.onmessage = (e) => { this.port.onmessage = (e) => {