Fix choppy audio and hanging when taking live callers

- Use persistent callback-based output stream instead of opening/closing per chunk
- Replace librosa.resample with simple decimation in real-time audio callbacks
- Move host stream initialization to background thread to avoid blocking
- Change live caller channel default to 9

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-05 16:24:27 -07:00
parent bcd0d96185
commit ab36ad8d5b
3 changed files with 105 additions and 54 deletions

View File

@@ -24,7 +24,7 @@ class AudioService:
self.output_device: Optional[int] = None # Single output device (multi-channel)
self.caller_channel: int = 1 # Channel for caller TTS
self.live_caller_channel: int = 4 # Channel for live caller audio
self.live_caller_channel: int = 9 # Channel for live caller audio
self.music_channel: int = 2 # Channel for music
self.sfx_channel: int = 3 # Channel for SFX
self.phone_filter: bool = False # Phone filter on caller voices
@@ -53,6 +53,10 @@ class AudioService:
self._host_stream: Optional[sd.InputStream] = None
self._host_send_callback: Optional[Callable] = None
# Live caller routing state
self._live_caller_stream: Optional[sd.OutputStream] = None
self._live_caller_queue: Optional[queue.Queue] = None
# Sample rates
self.input_sample_rate = 16000 # For Whisper
self.output_sample_rate = 24000 # For TTS
@@ -320,38 +324,84 @@ class AudioService:
"""Stop any playing caller audio"""
self._caller_stop_event.set()
def route_real_caller_audio(self, pcm_data: bytes, sample_rate: int):
"""Route real caller PCM audio to the configured live caller Loopback channel"""
import librosa
def _start_live_caller_stream(self):
    """Start persistent output stream for live caller audio.

    Does nothing when a stream is already running or when no output
    device is configured. Otherwise opens one callback-driven
    ``sd.OutputStream`` on the output device at the device's default
    sample rate, and records on ``self`` the queue, sample rate, channel
    count and channel index that the rest of the live caller path reads.

    Producers put 1-D float32 sample arrays on ``self._live_caller_queue``;
    the stream callback copies them, in arrival order, into the configured
    live caller channel and leaves every other channel silent.
    """
    if self._live_caller_stream is not None:
        return
    if self.output_device is None:
        return

    device_info = sd.query_devices(self.output_device)
    num_channels = device_info['max_output_channels']
    device_sr = int(device_info['default_samplerate'])
    # Channels are configured 1-based; clamp into the device's range so a
    # setting below 1 cannot turn into index -1 (the last channel).
    channel_idx = max(0, min(self.live_caller_channel, num_channels) - 1)

    # The callback closes over this local queue rather than reading the
    # attribute, so it keeps working even if the attribute is reset to
    # None while a callback is still in flight.
    chunk_queue = queue.Queue()
    # Tail of a chunk that did not fit into the previous output block.
    pending = None

    self._live_caller_queue = chunk_queue
    self._live_caller_device_sr = device_sr
    self._live_caller_num_channels = num_channels
    self._live_caller_channel_idx = channel_idx

    def callback(outdata, frames, time_info, status):
        nonlocal pending
        # Start from silence; whatever the queue cannot fill stays zero.
        outdata.fill(0)
        written = 0
        while written < frames:
            if pending is not None:
                chunk = pending
                pending = None
            else:
                try:
                    chunk = chunk_queue.get_nowait()
                except queue.Empty:
                    break
            count = min(len(chunk), frames - written)
            outdata[written:written + count, channel_idx] = chunk[:count]
            written += count
            if count < len(chunk):
                # Keep the remainder for the next block. Re-queueing it
                # would place it behind newer chunks and play the audio
                # out of order.
                pending = chunk[count:]

    self._live_caller_stream = sd.OutputStream(
        device=self.output_device,
        samplerate=device_sr,
        channels=num_channels,
        dtype=np.float32,
        callback=callback,
        blocksize=2048,
    )
    self._live_caller_stream.start()
    print(f"[Audio] Live caller stream started on ch {self.live_caller_channel} @ {device_sr}Hz")
def _stop_live_caller_stream(self):
    """Stop persistent live caller output stream.

    Does nothing when no stream is running. Otherwise stops and closes
    the stream and clears both the stream and its sample queue from
    ``self``.
    """
    stream = self._live_caller_stream
    if not stream:
        return
    # Detach state before touching the device, so a failure while stopping
    # cannot leave a dead stream reference behind that blocks a restart.
    self._live_caller_stream = None
    self._live_caller_queue = None
    try:
        stream.stop()
    finally:
        # Release the device even when stop() raised.
        stream.close()
    print("[Audio] Live caller stream stopped")
def route_real_caller_audio(self, pcm_data: bytes, sample_rate: int):
    """Route real caller PCM audio to the configured live caller Loopback channel.

    Args:
        pcm_data: Raw 16-bit signed PCM samples (decoded as ``np.int16``).
        sample_rate: Sample rate of ``pcm_data`` in Hz.

    The samples are converted to float32 in [-1, 1), resampled to the
    live caller stream's sample rate by nearest-sample index mapping, and
    put on the live caller queue for the persistent output stream to
    play. Returns ``None``. Does nothing when no output device is
    configured; any error is printed rather than raised.
    """
    if self.output_device is None:
        return

    try:
        # Ensure persistent stream is running. This is inside the try so
        # that a device-open failure is reported here instead of being
        # raised into the caller.
        if self._live_caller_stream is None:
            self._start_live_caller_stream()
        chunk_queue = self._live_caller_queue
        if self._live_caller_stream is None or chunk_queue is None:
            return

        # int16 decoding needs whole 2-byte samples; drop a dangling
        # trailing byte instead of losing the entire chunk.
        usable = len(pcm_data) - (len(pcm_data) % 2)
        if usable == 0:
            return

        # Convert bytes to float32
        audio = np.frombuffer(pcm_data[:usable], dtype=np.int16).astype(np.float32) / 32768.0

        # Simple decimation/interpolation instead of librosa
        device_sr = self._live_caller_device_sr
        if sample_rate != device_sr:
            ratio = device_sr / sample_rate
            out_len = int(len(audio) * ratio)
            # Map each output position back to its source sample.
            indices = (np.arange(out_len) / ratio).astype(int)
            indices = np.clip(indices, 0, len(audio) - 1)
            audio = audio[indices]

        # An empty chunk carries no audio; skip queueing it.
        if len(audio):
            chunk_queue.put(audio)
    except Exception as e:
        print(f"Real caller audio routing error: {e}")
@@ -366,44 +416,45 @@ class AudioService:
self._host_send_callback = send_callback
device_info = sd.query_devices(self.input_device)
max_channels = device_info['max_input_channels']
device_sr = int(device_info['default_samplerate'])
record_channel = min(self.input_channel, max_channels) - 1
def _start():
device_info = sd.query_devices(self.input_device)
max_channels = device_info['max_input_channels']
device_sr = int(device_info['default_samplerate'])
record_channel = min(self.input_channel, max_channels) - 1
step = max(1, int(device_sr / 16000))
import librosa
def callback(indata, frames, time_info, status):
if not self._host_send_callback:
return
mono = indata[:, record_channel]
# Simple decimation to ~16kHz
if step > 1:
mono = mono[::step]
pcm = (mono * 32767).astype(np.int16).tobytes()
self._host_send_callback(pcm)
def callback(indata, frames, time_info, status):
if not self._host_send_callback:
return
# Extract the configured input channel
mono = indata[:, record_channel].copy()
# Resample to 16kHz if needed
if device_sr != 16000:
mono = librosa.resample(mono, orig_sr=device_sr, target_sr=16000)
# Convert float32 to int16 PCM
pcm = (mono * 32767).astype(np.int16).tobytes()
self._host_send_callback(pcm)
self._host_stream = sd.InputStream(
device=self.input_device,
channels=max_channels,
samplerate=device_sr,
dtype=np.float32,
blocksize=4096,
callback=callback,
)
self._host_stream.start()
print(f"[Audio] Host mic streaming started (device {self.input_device} ch {self.input_channel} @ {device_sr}Hz)")
self._host_stream = sd.InputStream(
device=self.input_device,
channels=max_channels,
samplerate=device_sr,
dtype=np.float32,
blocksize=4096,
callback=callback,
)
self._host_stream.start()
print(f"[Audio] Host mic streaming started (device {self.input_device} ch {self.input_channel} @ {device_sr}Hz)")
threading.Thread(target=_start, daemon=True).start()
def stop_host_stream(self):
    """Stop host mic streaming and live caller output.

    Stops and closes the host input stream if one exists, clears the
    host send callback, and then stops the live caller output stream.
    """
    stream = self._host_stream
    self._host_stream = None
    # Clear the callback even when no stream object exists yet: the host
    # input callback returns early when it is unset, so a stream that is
    # still being brought up stops forwarding audio.
    self._host_send_callback = None
    if stream:
        try:
            stream.stop()
        finally:
            # Release the device even when stop() raised.
            stream.close()
        print("[Audio] Host mic streaming stopped")
    # Live caller output can be running without a host stream, so always
    # ask it to stop.
    self._stop_live_caller_stream()
# --- Music Playback ---

View File

@@ -128,7 +128,7 @@
</div>
<div class="channel-row">
<label>Caller Ch <input type="number" id="caller-channel" value="1" min="1" max="16" class="channel-input"></label>
<label>Live Ch <input type="number" id="live-caller-channel" value="4" min="1" max="16" class="channel-input"></label>
<label>Live Ch <input type="number" id="live-caller-channel" value="9" min="1" max="16" class="channel-input"></label>
<label>Music Ch <input type="number" id="music-channel" value="2" min="1" max="16" class="channel-input"></label>
<label>SFX Ch <input type="number" id="sfx-channel" value="3" min="1" max="16" class="channel-input"></label>
</div>

View File

@@ -229,7 +229,7 @@ async function loadAudioDevices() {
if (inputCh) inputCh.value = settings.input_channel || 1;
if (callerCh) callerCh.value = settings.caller_channel || 1;
if (liveCallerCh) liveCallerCh.value = settings.live_caller_channel || 4;
if (liveCallerCh) liveCallerCh.value = settings.live_caller_channel || 9;
if (musicCh) musicCh.value = settings.music_channel || 2;
if (sfxCh) sfxCh.value = settings.sfx_channel || 3;
@@ -265,7 +265,7 @@ async function saveAudioDevices() {
input_channel: inputChannel ? parseInt(inputChannel) : 1,
output_device: outputDevice ? parseInt(outputDevice) : null,
caller_channel: callerChannel ? parseInt(callerChannel) : 1,
live_caller_channel: liveCallerChannel ? parseInt(liveCallerChannel) : 4,
live_caller_channel: liveCallerChannel ? parseInt(liveCallerChannel) : 9,
music_channel: musicChannel ? parseInt(musicChannel) : 2,
sfx_channel: sfxChannel ? parseInt(sfxChannel) : 3,
phone_filter: phoneFilterChecked