Fix choppy audio and hanging when taking live callers

- Use a persistent callback-based output stream instead of opening/closing one per chunk
- Replace librosa.resample with simple decimation in real-time audio callbacks
- Move host stream initialization to a background thread to avoid blocking
- Change live caller channel default to 9

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-05 16:24:27 -07:00
parent bcd0d96185
commit ab36ad8d5b
3 changed files with 105 additions and 54 deletions
@@ -24,7 +24,7 @@ class AudioService:
        self.output_device: Optional[int] = None  # Single output device (multi-channel)
        self.caller_channel: int = 1   # Channel for caller TTS
-        self.live_caller_channel: int = 4  # Channel for live caller audio
+        self.live_caller_channel: int = 9  # Channel for live caller audio
        self.music_channel: int = 2    # Channel for music
        self.sfx_channel: int = 3      # Channel for SFX
        self.phone_filter: bool = False  # Phone filter on caller voices
@@ -53,6 +53,10 @@ class AudioService:
        self._host_stream: Optional[sd.InputStream] = None
        self._host_send_callback: Optional[Callable] = None
        # Live caller routing state
        self._live_caller_stream: Optional[sd.OutputStream] = None
        self._live_caller_queue: Optional[queue.Queue] = None
        # Sample rates
        self.input_sample_rate = 16000  # For Whisper
        self.output_sample_rate = 24000  # For TTS
@@ -320,38 +324,84 @@ class AudioService:
        """Stop any playing caller audio"""
        self._caller_stop_event.set()
-    def route_real_caller_audio(self, pcm_data: bytes, sample_rate: int):
+    def _start_live_caller_stream(self):
-        """Route real caller PCM audio to the configured live caller Loopback channel"""
+        """Start persistent output stream for live caller audio"""
-        import librosa
+        if self._live_caller_stream is not None:
            return
        if self.output_device is None:
            return
        self._live_caller_queue = queue.Queue()
        device_info = sd.query_devices(self.output_device)
        num_channels = device_info['max_output_channels']
        device_sr = int(device_info['default_samplerate'])
        channel_idx = min(self.live_caller_channel, num_channels) - 1
        self._live_caller_device_sr = device_sr
        self._live_caller_num_channels = num_channels
        self._live_caller_channel_idx = channel_idx
        # Tail of a chunk that did not fit into the previous output block.
        # It is held here (not re-queued) so it plays *before* anything that
        # is already waiting in the queue; Queue.put() would append it to the
        # back of the FIFO and reorder the audio.
        carry = []

        def callback(outdata, frames, time_info, status):
            """Fill one output block with queued live caller samples.

            Writes up to ``frames`` samples into column ``channel_idx`` of
            ``outdata``; every other channel, and any part of the block for
            which no audio is available, is left as silence.

            Args:
                outdata: Output buffer, indexed as [frame, channel].
                frames: Number of frames requested for this block.
                time_info: Unused.
                status: Unused.
            """
            outdata.fill(0)
            # Snapshot the queue: it is reset to None when the stream is stopped.
            live_queue = self._live_caller_queue
            written = 0
            while written < frames:
                if carry:
                    # Finish the partially played chunk first to keep order.
                    chunk = carry.pop()
                elif live_queue is None:
                    break
                else:
                    try:
                        chunk = live_queue.get_nowait()
                    except queue.Empty:
                        # Nothing buffered: the rest of the block stays silent.
                        break
                count = min(len(chunk), frames - written)
                outdata[written:written + count, channel_idx] = chunk[:count]
                if count < len(chunk):
                    carry.append(chunk[count:])
                written += count
        self._live_caller_stream = sd.OutputStream(
            device=self.output_device,
            samplerate=device_sr,
            channels=num_channels,
            dtype=np.float32,
            callback=callback,
            blocksize=2048,
        )
        self._live_caller_stream.start()
        print(f"[Audio] Live caller stream started on ch {self.live_caller_channel} @ {device_sr}Hz")
    def _stop_live_caller_stream(self):
        """Stop persistent live caller output stream"""
        if self._live_caller_stream:
            self._live_caller_stream.stop()
            self._live_caller_stream.close()
            self._live_caller_stream = None
            self._live_caller_queue = None
            print("[Audio] Live caller stream stopped")
    def route_real_caller_audio(self, pcm_data: bytes, sample_rate: int):
        """Route real caller PCM audio to the configured live caller Loopback channel"""
        if self.output_device is None:
            return
        # Ensure persistent stream is running
        if self._live_caller_stream is None:
            self._start_live_caller_stream()
        try:
            # Convert bytes to float32
            audio = np.frombuffer(pcm_data, dtype=np.int16).astype(np.float32) / 32768.0
-            device_info = sd.query_devices(self.output_device)
+            # Simple decimation/interpolation instead of librosa
-            num_channels = device_info['max_output_channels']
+            device_sr = self._live_caller_device_sr
            device_sr = int(device_info['default_samplerate'])
            channel_idx = min(self.live_caller_channel, num_channels) - 1
            # Resample to device sample rate if needed
            if sample_rate != device_sr:
-                audio = librosa.resample(audio, orig_sr=sample_rate, target_sr=device_sr)
+                ratio = device_sr / sample_rate
                out_len = int(len(audio) * ratio)
                indices = (np.arange(out_len) / ratio).astype(int)
                indices = np.clip(indices, 0, len(audio) - 1)
                audio = audio[indices]
-            # Create multi-channel output
+            self._live_caller_queue.put(audio)
            multi_ch = np.zeros((len(audio), num_channels), dtype=np.float32)
            multi_ch[:, channel_idx] = audio
            # Write to output device
            with sd.OutputStream(
                device=self.output_device,
                samplerate=device_sr,
                channels=num_channels,
                dtype=np.float32,
            ) as stream:
                stream.write(multi_ch)
        except Exception as e:
            print(f"Real caller audio routing error: {e}")
@@ -366,44 +416,45 @@ class AudioService:
        self._host_send_callback = send_callback
-        device_info = sd.query_devices(self.input_device)
+        def _start():
-        max_channels = device_info['max_input_channels']
+            device_info = sd.query_devices(self.input_device)
-        device_sr = int(device_info['default_samplerate'])
+            max_channels = device_info['max_input_channels']
-        record_channel = min(self.input_channel, max_channels) - 1
+            device_sr = int(device_info['default_samplerate'])
            record_channel = min(self.input_channel, max_channels) - 1
            step = max(1, int(device_sr / 16000))
-        import librosa
+            def callback(indata, frames, time_info, status):
                if not self._host_send_callback:
                    return
                mono = indata[:, record_channel]
                # Simple decimation to ~16kHz
                if step > 1:
                    mono = mono[::step]
                pcm = (mono * 32767).astype(np.int16).tobytes()
                self._host_send_callback(pcm)
-        def callback(indata, frames, time_info, status):
+            self._host_stream = sd.InputStream(
-            if not self._host_send_callback:
+                device=self.input_device,
-                return
+                channels=max_channels,
-            # Extract the configured input channel
+                samplerate=device_sr,
-            mono = indata[:, record_channel].copy()
+                dtype=np.float32,
-            # Resample to 16kHz if needed
+                blocksize=4096,
-            if device_sr != 16000:
+                callback=callback,
-                mono = librosa.resample(mono, orig_sr=device_sr, target_sr=16000)
+            )
-            # Convert float32 to int16 PCM
+            self._host_stream.start()
-            pcm = (mono * 32767).astype(np.int16).tobytes()
+            print(f"[Audio] Host mic streaming started (device {self.input_device} ch {self.input_channel} @ {device_sr}Hz)")
            self._host_send_callback(pcm)
-        self._host_stream = sd.InputStream(
+        threading.Thread(target=_start, daemon=True).start()
            device=self.input_device,
            channels=max_channels,
            samplerate=device_sr,
            dtype=np.float32,
            blocksize=4096,
            callback=callback,
        )
        self._host_stream.start()
        print(f"[Audio] Host mic streaming started (device {self.input_device} ch {self.input_channel} @ {device_sr}Hz)")
    def stop_host_stream(self):
-        """Stop host mic streaming"""
+        """Stop host mic streaming and live caller output"""
        if self._host_stream:
            self._host_stream.stop()
            self._host_stream.close()
            self._host_stream = None
            self._host_send_callback = None
            print("[Audio] Host mic streaming stopped")
        self._stop_live_caller_stream()
    # --- Music Playback ---
@@ -128,7 +128,7 @@
                    </div>
                    <div class="channel-row">
                        <label>Caller Ch <input type="number" id="caller-channel" value="1" min="1" max="16" class="channel-input"></label>
-                        <label>Live Ch <input type="number" id="live-caller-channel" value="4" min="1" max="16" class="channel-input"></label>
+                        <label>Live Ch <input type="number" id="live-caller-channel" value="9" min="1" max="16" class="channel-input"></label>
                        <label>Music Ch <input type="number" id="music-channel" value="2" min="1" max="16" class="channel-input"></label>
                        <label>SFX Ch <input type="number" id="sfx-channel" value="3" min="1" max="16" class="channel-input"></label>
                    </div>
@@ -229,7 +229,7 @@ async function loadAudioDevices() {
        if (inputCh) inputCh.value = settings.input_channel || 1;
        if (callerCh) callerCh.value = settings.caller_channel || 1;
-        if (liveCallerCh) liveCallerCh.value = settings.live_caller_channel || 4;
+        if (liveCallerCh) liveCallerCh.value = settings.live_caller_channel || 9;
        if (musicCh) musicCh.value = settings.music_channel || 2;
        if (sfxCh) sfxCh.value = settings.sfx_channel || 3;
@@ -265,7 +265,7 @@ async function saveAudioDevices() {
            input_channel: inputChannel ? parseInt(inputChannel) : 1,
            output_device: outputDevice ? parseInt(outputDevice) : null,
            caller_channel: callerChannel ? parseInt(callerChannel) : 1,
-            live_caller_channel: liveCallerChannel ? parseInt(liveCallerChannel) : 4,
+            live_caller_channel: liveCallerChannel ? parseInt(liveCallerChannel) : 9,
            music_channel: musicChannel ? parseInt(musicChannel) : 2,
            sfx_channel: sfxChannel ? parseInt(sfxChannel) : 3,
            phone_filter: phoneFilterChecked