From ab36ad8d5b9475c735be7b3535538401f002a086 Mon Sep 17 00:00:00 2001
From: tcpsyn <tcpsyn@gmail.com>
Date: Thu, 5 Feb 2026 16:24:27 -0700
Subject: [PATCH] Fix choppy audio and hanging when taking live callers

- Use persistent callback-based output stream instead of opening/closing per chunk
- Replace librosa.resample with cheap nearest-sample resampling in the real-time audio paths
- Move host stream initialization to background thread to avoid blocking
- Change live caller channel default to 9

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 backend/services/audio.py | 153 +++++++++++++++++++++++++-------------
 frontend/index.html       |   2 +-
 frontend/js/app.js        |   4 +-
 3 files changed, 105 insertions(+), 54 deletions(-)

diff --git a/backend/services/audio.py b/backend/services/audio.py
index 97d2c56..4d76cf5 100644
--- a/backend/services/audio.py
+++ b/backend/services/audio.py
@@ -24,7 +24,7 @@ class AudioService:
 
         self.output_device: Optional[int] = None  # Single output device (multi-channel)
         self.caller_channel: int = 1   # Channel for caller TTS
-        self.live_caller_channel: int = 4  # Channel for live caller audio
+        self.live_caller_channel: int = 9  # Channel for live caller audio
         self.music_channel: int = 2    # Channel for music
         self.sfx_channel: int = 3      # Channel for SFX
         self.phone_filter: bool = False  # Phone filter on caller voices
@@ -53,6 +53,10 @@ class AudioService:
         self._host_stream: Optional[sd.InputStream] = None
         self._host_send_callback: Optional[Callable] = None
 
+        # Live caller routing state
+        self._live_caller_stream: Optional[sd.OutputStream] = None
+        self._live_caller_queue: Optional[queue.Queue] = None
+
         # Sample rates
         self.input_sample_rate = 16000  # For Whisper
         self.output_sample_rate = 24000  # For TTS
@@ -320,38 +324,84 @@ class AudioService:
         """Stop any playing caller audio"""
         self._caller_stop_event.set()
 
-    def route_real_caller_audio(self, pcm_data: bytes, sample_rate: int):
-        """Route real caller PCM audio to the configured live caller Loopback channel"""
-        import librosa
+    def _start_live_caller_stream(self):
+        """Open the persistent multi-channel output stream that plays queued live caller audio"""
+        if self._live_caller_stream is not None:
+            return
 
         if self.output_device is None:
             return
 
+        audio_queue = self._live_caller_queue = queue.Queue()
+        # Unplayed tail of the chunk the previous callback had to split (0 or 1 items)
+        pending = []
+
+        device_info = sd.query_devices(self.output_device)
+        num_channels = device_info['max_output_channels']
+        device_sr = int(device_info['default_samplerate'])
+        channel_idx = min(self.live_caller_channel, num_channels) - 1
+
+        self._live_caller_device_sr = device_sr
+        self._live_caller_num_channels = num_channels
+        self._live_caller_channel_idx = channel_idx
+
+        def callback(outdata, frames, time_info, status):
+            outdata.fill(0)
+            written = 0
+            while written < frames:
+                try:
+                    # Finish the split chunk first so samples stay in order
+                    chunk = pending.pop() if pending else audio_queue.get_nowait()
+                except queue.Empty:
+                    break  # underrun: the rest of this block stays silent
+                count = min(len(chunk), frames - written)
+                outdata[written:written + count, channel_idx] = chunk[:count]
+                if count < len(chunk):
+                    pending.append(chunk[count:])  # re-queuing at the tail would reorder audio
+                written += count
+
+        self._live_caller_stream = sd.OutputStream(
+            device=self.output_device,
+            samplerate=device_sr,
+            channels=num_channels,
+            dtype=np.float32,
+            callback=callback,
+            blocksize=2048,
+        )
+        self._live_caller_stream.start()
+        print(f"[Audio] Live caller stream started on ch {self.live_caller_channel} @ {device_sr}Hz")
+
+    def _stop_live_caller_stream(self):
+        """Tear down the persistent live caller output stream, if one is open"""
+        if not self._live_caller_stream:
+            return
+        self._live_caller_stream.stop()
+        self._live_caller_stream.close()
+        self._live_caller_stream = self._live_caller_queue = None
+        print("[Audio] Live caller stream stopped")
+
+    def route_real_caller_audio(self, pcm_data: bytes, sample_rate: int):
+        """Queue real caller PCM (int16 bytes at sample_rate) for the live caller Loopback channel"""
+        if self.output_device is None:
+            return
+
         try:
-            # Convert bytes to float32
+            # Start the stream lazily, inside the try so a device failure is logged, not raised
+            if self._live_caller_stream is None:
+                self._start_live_caller_stream()
+
             audio = np.frombuffer(pcm_data, dtype=np.int16).astype(np.float32) / 32768.0
 
-            device_info = sd.query_devices(self.output_device)
-            num_channels = device_info['max_output_channels']
-            device_sr = int(device_info['default_samplerate'])
-            channel_idx = min(self.live_caller_channel, num_channels) - 1
-
-            # Resample to device sample rate if needed
+            # Nearest-sample resampling to the device rate (cheap enough for real time)
+            device_sr = self._live_caller_device_sr
             if sample_rate != device_sr:
-                audio = librosa.resample(audio, orig_sr=sample_rate, target_sr=device_sr)
+                ratio = device_sr / sample_rate
+                out_len = int(len(audio) * ratio)
+                indices = (np.arange(out_len) / ratio).astype(int)
+                indices = np.clip(indices, 0, len(audio) - 1)
+                audio = audio[indices]
 
-            # Create multi-channel output
-            multi_ch = np.zeros((len(audio), num_channels), dtype=np.float32)
-            multi_ch[:, channel_idx] = audio
-
-            # Write to output device
-            with sd.OutputStream(
-                device=self.output_device,
-                samplerate=device_sr,
-                channels=num_channels,
-                dtype=np.float32,
-            ) as stream:
-                stream.write(multi_ch)
+            self._live_caller_queue.put(audio)
 
         except Exception as e:
             print(f"Real caller audio routing error: {e}")
@@ -366,44 +416,45 @@ class AudioService:
 
         self._host_send_callback = send_callback
 
-        device_info = sd.query_devices(self.input_device)
-        max_channels = device_info['max_input_channels']
-        device_sr = int(device_info['default_samplerate'])
-        record_channel = min(self.input_channel, max_channels) - 1
+        def _start():  # runs on a worker thread so opening the device never blocks the caller
+            device_info = sd.query_devices(self.input_device)
+            max_channels = device_info['max_input_channels']
+            device_sr = int(device_info['default_samplerate'])
+            record_channel = min(self.input_channel, max_channels) - 1
+            target_sr = 16000  # PCM rate handed to the send callback
 
-        import librosa
+            def callback(indata, frames, time_info, status):
+                if not self._host_send_callback:
+                    return
+                mono = indata[:, record_channel]
+                # Index-based resample: exact for any device rate (a fixed stride breaks 44.1 kHz)
+                if device_sr != target_sr:
+                    mono = mono[np.arange(frames * target_sr // device_sr) * device_sr // target_sr]
+                pcm = (mono * 32767).astype(np.int16).tobytes()
+                self._host_send_callback(pcm)
 
-        def callback(indata, frames, time_info, status):
-            if not self._host_send_callback:
-                return
-            # Extract the configured input channel
-            mono = indata[:, record_channel].copy()
-            # Resample to 16kHz if needed
-            if device_sr != 16000:
-                mono = librosa.resample(mono, orig_sr=device_sr, target_sr=16000)
-            # Convert float32 to int16 PCM
-            pcm = (mono * 32767).astype(np.int16).tobytes()
-            self._host_send_callback(pcm)
+            self._host_stream = sd.InputStream(
+                device=self.input_device,
+                channels=max_channels,
+                samplerate=device_sr,
+                dtype=np.float32,
+                blocksize=4096,
+                callback=callback,
+            )
+            self._host_stream.start()
+            print(f"[Audio] Host mic streaming started (device {self.input_device} ch {self.input_channel} @ {device_sr}Hz)")
 
-        self._host_stream = sd.InputStream(
-            device=self.input_device,
-            channels=max_channels,
-            samplerate=device_sr,
-            dtype=np.float32,
-            blocksize=4096,
-            callback=callback,
-        )
-        self._host_stream.start()
-        print(f"[Audio] Host mic streaming started (device {self.input_device} ch {self.input_channel} @ {device_sr}Hz)")
+        threading.Thread(target=_start, daemon=True).start()
 
     def stop_host_stream(self):
-        """Stop host mic streaming"""
+        """Stop host mic streaming and the persistent live caller output stream"""
         if self._host_stream:
             self._host_stream.stop()
             self._host_stream.close()
             self._host_stream = None
             self._host_send_callback = None
             print("[Audio] Host mic streaming stopped")
+        self._stop_live_caller_stream()  # outside the `if`: runs even when no mic stream is open
 
     # --- Music Playback ---
 
diff --git a/frontend/index.html b/frontend/index.html
index 8f9e2a7..b15f4b9 100644
--- a/frontend/index.html
+++ b/frontend/index.html
@@ -128,7 +128,7 @@
                     </div>
                     <div class="channel-row">
                         <label>Caller Ch <input type="number" id="caller-channel" value="1" min="1" max="16" class="channel-input"></label>
-                        <label>Live Ch <input type="number" id="live-caller-channel" value="4" min="1" max="16" class="channel-input"></label>
+                        <label>Live Ch <input type="number" id="live-caller-channel" value="9" min="1" max="16" class="channel-input"></label>
                         <label>Music Ch <input type="number" id="music-channel" value="2" min="1" max="16" class="channel-input"></label>
                         <label>SFX Ch <input type="number" id="sfx-channel" value="3" min="1" max="16" class="channel-input"></label>
                     </div>
diff --git a/frontend/js/app.js b/frontend/js/app.js
index 06aeac3..8806571 100644
--- a/frontend/js/app.js
+++ b/frontend/js/app.js
@@ -229,7 +229,7 @@ async function loadAudioDevices() {
 
         if (inputCh) inputCh.value = settings.input_channel || 1;
         if (callerCh) callerCh.value = settings.caller_channel || 1;
-        if (liveCallerCh) liveCallerCh.value = settings.live_caller_channel || 4;
+        if (liveCallerCh) liveCallerCh.value = settings.live_caller_channel || 9;
         if (musicCh) musicCh.value = settings.music_channel || 2;
         if (sfxCh) sfxCh.value = settings.sfx_channel || 3;
 
@@ -265,7 +265,7 @@ async function saveAudioDevices() {
             input_channel: inputChannel ? parseInt(inputChannel) : 1,
             output_device: outputDevice ? parseInt(outputDevice) : null,
             caller_channel: callerChannel ? parseInt(callerChannel) : 1,
-            live_caller_channel: liveCallerChannel ? parseInt(liveCallerChannel) : 4,
+            live_caller_channel: liveCallerChannel ? parseInt(liveCallerChannel) : 9,
             music_channel: musicChannel ? parseInt(musicChannel) : 2,
             sfx_channel: sfxChannel ? parseInt(sfxChannel) : 3,
             phone_filter: phoneFilterChecked