From ab36ad8d5b9475c735be7b3535538401f002a086 Mon Sep 17 00:00:00 2001
From: tcpsyn
Date: Thu, 5 Feb 2026 16:24:27 -0700
Subject: [PATCH] Fix choppy audio and hanging when taking live callers

- Use persistent callback-based output stream instead of opening/closing per chunk
- Replace librosa.resample with simple decimation in real-time audio callbacks
- Move host stream initialization to background thread to avoid blocking
- Change live caller channel default to 9

Co-Authored-By: Claude Opus 4.6
---
 backend/services/audio.py | 164 +++++++++++++++++++++++++++------------
 frontend/index.html       |   2 +-
 frontend/js/app.js        |   4 +-
 3 files changed, 116 insertions(+), 54 deletions(-)

diff --git a/backend/services/audio.py b/backend/services/audio.py
index 97d2c56..4d76cf5 100644
--- a/backend/services/audio.py
+++ b/backend/services/audio.py
@@ -24,7 +24,7 @@ class AudioService:

         self.output_device: Optional[int] = None  # Single output device (multi-channel)
         self.caller_channel: int = 1  # Channel for caller TTS
-        self.live_caller_channel: int = 4  # Channel for live caller audio
+        self.live_caller_channel: int = 9  # Channel for live caller audio
         self.music_channel: int = 2  # Channel for music
         self.sfx_channel: int = 3  # Channel for SFX
         self.phone_filter: bool = False  # Phone filter on caller voices
@@ -53,6 +53,10 @@ class AudioService:
         self._host_stream: Optional[sd.InputStream] = None
         self._host_send_callback: Optional[Callable] = None

+        # Live caller routing state
+        self._live_caller_stream: Optional[sd.OutputStream] = None
+        self._live_caller_queue: Optional[queue.Queue] = None
+
         # Sample rates
         self.input_sample_rate = 16000  # For Whisper
         self.output_sample_rate = 24000  # For TTS
@@ -320,38 +324,91 @@ class AudioService:
         """Stop any playing caller audio"""
         self._caller_stop_event.set()

-    def route_real_caller_audio(self, pcm_data: bytes, sample_rate: int):
-        """Route real caller PCM audio to the configured live caller Loopback channel"""
-        import librosa
+    def _start_live_caller_stream(self):
+        """Start persistent output stream for live caller audio"""
+        if self._live_caller_stream is not None:
+            return

         if self.output_device is None:
             return

+        caller_queue = queue.Queue()
+        self._live_caller_queue = caller_queue
+
+        device_info = sd.query_devices(self.output_device)
+        num_channels = device_info['max_output_channels']
+        device_sr = int(device_info['default_samplerate'])
+        channel_idx = min(self.live_caller_channel, num_channels) - 1
+
+        self._live_caller_device_sr = device_sr
+        self._live_caller_num_channels = num_channels
+        self._live_caller_channel_idx = channel_idx
+
+        # Tail of a chunk that did not fit in the previous block. It must be
+        # played before anything newer, so it is held here rather than being
+        # re-queued behind chunks that arrived later (which reorders audio).
+        pending = None
+
+        def callback(outdata, frames, time_info, status):
+            nonlocal pending
+            outdata.fill(0)
+            written = 0
+            while written < frames:
+                if pending is not None:
+                    chunk, pending = pending, None
+                else:
+                    try:
+                        chunk = caller_queue.get_nowait()
+                    except queue.Empty:
+                        break
+                count = min(len(chunk), frames - written)
+                outdata[written:written + count, channel_idx] = chunk[:count]
+                if count < len(chunk):
+                    pending = chunk[count:]
+                written += count
+
+        self._live_caller_stream = sd.OutputStream(
+            device=self.output_device,
+            samplerate=device_sr,
+            channels=num_channels,
+            dtype=np.float32,
+            callback=callback,
+            blocksize=2048,
+        )
+        self._live_caller_stream.start()
+        print(f"[Audio] Live caller stream started on ch {self.live_caller_channel} @ {device_sr}Hz")
+
+    def _stop_live_caller_stream(self):
+        """Stop persistent live caller output stream"""
+        if self._live_caller_stream:
+            self._live_caller_stream.stop()
+            self._live_caller_stream.close()
+            self._live_caller_stream = None
+            self._live_caller_queue = None
+            print("[Audio] Live caller stream stopped")
+
+    def route_real_caller_audio(self, pcm_data: bytes, sample_rate: int):
+        """Route real caller PCM audio to the configured live caller Loopback channel"""
+        if self.output_device is None:
+            return
+
         try:
-            # Convert bytes to float32
+            # Ensure persistent stream is running (inside try so a device error is logged, not raised)
+            if self._live_caller_stream is None:
+                self._start_live_caller_stream()
+
             audio = np.frombuffer(pcm_data, dtype=np.int16).astype(np.float32) / 32768.0

-            device_info = sd.query_devices(self.output_device)
-            num_channels = device_info['max_output_channels']
-            device_sr = int(device_info['default_samplerate'])
-            channel_idx = min(self.live_caller_channel, num_channels) - 1
-
-            # Resample to device sample rate if needed
+            # Simple decimation/interpolation instead of librosa
+            device_sr = self._live_caller_device_sr
             if sample_rate != device_sr:
-                audio = librosa.resample(audio, orig_sr=sample_rate, target_sr=device_sr)
+                ratio = device_sr / sample_rate
+                out_len = int(len(audio) * ratio)
+                indices = (np.arange(out_len) / ratio).astype(int)
+                indices = np.clip(indices, 0, len(audio) - 1)
+                audio = audio[indices]

-            # Create multi-channel output
-            multi_ch = np.zeros((len(audio), num_channels), dtype=np.float32)
-            multi_ch[:, channel_idx] = audio
-
-            # Write to output device
-            with sd.OutputStream(
-                device=self.output_device,
-                samplerate=device_sr,
-                channels=num_channels,
-                dtype=np.float32,
-            ) as stream:
-                stream.write(multi_ch)
+            self._live_caller_queue.put(audio)

         except Exception as e:
             print(f"Real caller audio routing error: {e}")
@@ -366,44 +423,49 @@ class AudioService:

         self._host_send_callback = send_callback

-        device_info = sd.query_devices(self.input_device)
-        max_channels = device_info['max_input_channels']
-        device_sr = int(device_info['default_samplerate'])
-        record_channel = min(self.input_channel, max_channels) - 1
+        def _start():
+            device_info = sd.query_devices(self.input_device)
+            max_channels = device_info['max_input_channels']
+            device_sr = int(device_info['default_samplerate'])
+            record_channel = min(self.input_channel, max_channels) - 1
+            # Fractional ratio so device rates that are not a multiple of
+            # 16 kHz (e.g. 44.1 kHz) still come out at 16 kHz
+            ratio = device_sr / 16000

-        import librosa
+            def callback(indata, frames, time_info, status):
+                if not self._host_send_callback:
+                    return
+                mono = indata[:, record_channel]
+                # Nearest-sample resampling to 16kHz (no librosa in the callback)
+                if ratio != 1:
+                    picks = (np.arange(int(len(mono) / ratio)) * ratio).astype(int)
+                    mono = mono[picks]
+                # Clip so out-of-range samples saturate instead of wrapping in int16
+                pcm = (np.clip(mono, -1.0, 1.0) * 32767).astype(np.int16).tobytes()
+                self._host_send_callback(pcm)

-        def callback(indata, frames, time_info, status):
-            if not self._host_send_callback:
-                return
-            # Extract the configured input channel
-            mono = indata[:, record_channel].copy()
-            # Resample to 16kHz if needed
-            if device_sr != 16000:
-                mono = librosa.resample(mono, orig_sr=device_sr, target_sr=16000)
-            # Convert float32 to int16 PCM
-            pcm = (mono * 32767).astype(np.int16).tobytes()
-            self._host_send_callback(pcm)
+            self._host_stream = sd.InputStream(
+                device=self.input_device,
+                channels=max_channels,
+                samplerate=device_sr,
+                dtype=np.float32,
+                blocksize=4096,
+                callback=callback,
+            )
+            self._host_stream.start()
+            print(f"[Audio] Host mic streaming started (device {self.input_device} ch {self.input_channel} @ {device_sr}Hz)")

-        self._host_stream = sd.InputStream(
-            device=self.input_device,
-            channels=max_channels,
-            samplerate=device_sr,
-            dtype=np.float32,
-            blocksize=4096,
-            callback=callback,
-        )
-        self._host_stream.start()
-        print(f"[Audio] Host mic streaming started (device {self.input_device} ch {self.input_channel} @ {device_sr}Hz)")
+        threading.Thread(target=_start, daemon=True).start()

     def stop_host_stream(self):
-        """Stop host mic streaming"""
+        """Stop host mic streaming and live caller output"""
         if self._host_stream:
             self._host_stream.stop()
             self._host_stream.close()
             self._host_stream = None
             self._host_send_callback = None
             print("[Audio] Host mic streaming stopped")
+        self._stop_live_caller_stream()

     # --- Music Playback ---

diff --git a/frontend/index.html b/frontend/index.html
index 8f9e2a7..b15f4b9 100644
--- a/frontend/index.html
+++ b/frontend/index.html
@@ -128,7 +128,7 @@
- +
diff --git a/frontend/js/app.js b/frontend/js/app.js index 06aeac3..8806571 100644 --- a/frontend/js/app.js +++ b/frontend/js/app.js @@ -229,7 +229,7 @@ async function loadAudioDevices() { if (inputCh) inputCh.value = settings.input_channel || 1; if (callerCh) callerCh.value = settings.caller_channel || 1; - if (liveCallerCh) liveCallerCh.value = settings.live_caller_channel || 4; + if (liveCallerCh) liveCallerCh.value = settings.live_caller_channel || 9; if (musicCh) musicCh.value = settings.music_channel || 2; if (sfxCh) sfxCh.value = settings.sfx_channel || 3; @@ -265,7 +265,7 @@ async function saveAudioDevices() { input_channel: inputChannel ? parseInt(inputChannel) : 1, output_device: outputDevice ? parseInt(outputDevice) : null, caller_channel: callerChannel ? parseInt(callerChannel) : 1, - live_caller_channel: liveCallerChannel ? parseInt(liveCallerChannel) : 4, + live_caller_channel: liveCallerChannel ? parseInt(liveCallerChannel) : 9, music_channel: musicChannel ? parseInt(musicChannel) : 2, sfx_channel: sfxChannel ? parseInt(sfxChannel) : 3, phone_filter: phoneFilterChecked