Fix choppy audio and hanging when taking live callers
- Use a persistent callback-based output stream instead of opening/closing a stream per chunk
- Replace librosa.resample with simple decimation in real-time audio callbacks
- Move host stream initialization to a background thread to avoid blocking
- Change the live caller channel default from 4 to 9

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -24,7 +24,7 @@ class AudioService:
|
|||||||
|
|
||||||
self.output_device: Optional[int] = None # Single output device (multi-channel)
|
self.output_device: Optional[int] = None # Single output device (multi-channel)
|
||||||
self.caller_channel: int = 1 # Channel for caller TTS
|
self.caller_channel: int = 1 # Channel for caller TTS
|
||||||
self.live_caller_channel: int = 4 # Channel for live caller audio
|
self.live_caller_channel: int = 9 # Channel for live caller audio
|
||||||
self.music_channel: int = 2 # Channel for music
|
self.music_channel: int = 2 # Channel for music
|
||||||
self.sfx_channel: int = 3 # Channel for SFX
|
self.sfx_channel: int = 3 # Channel for SFX
|
||||||
self.phone_filter: bool = False # Phone filter on caller voices
|
self.phone_filter: bool = False # Phone filter on caller voices
|
||||||
@@ -53,6 +53,10 @@ class AudioService:
|
|||||||
self._host_stream: Optional[sd.InputStream] = None
|
self._host_stream: Optional[sd.InputStream] = None
|
||||||
self._host_send_callback: Optional[Callable] = None
|
self._host_send_callback: Optional[Callable] = None
|
||||||
|
|
||||||
|
# Live caller routing state
|
||||||
|
self._live_caller_stream: Optional[sd.OutputStream] = None
|
||||||
|
self._live_caller_queue: Optional[queue.Queue] = None
|
||||||
|
|
||||||
# Sample rates
|
# Sample rates
|
||||||
self.input_sample_rate = 16000 # For Whisper
|
self.input_sample_rate = 16000 # For Whisper
|
||||||
self.output_sample_rate = 24000 # For TTS
|
self.output_sample_rate = 24000 # For TTS
|
||||||
@@ -320,38 +324,84 @@ class AudioService:
|
|||||||
"""Stop any playing caller audio"""
|
"""Stop any playing caller audio"""
|
||||||
self._caller_stop_event.set()
|
self._caller_stop_event.set()
|
||||||
|
|
||||||
def route_real_caller_audio(self, pcm_data: bytes, sample_rate: int):
|
def _start_live_caller_stream(self):
|
||||||
"""Route real caller PCM audio to the configured live caller Loopback channel"""
|
"""Start persistent output stream for live caller audio"""
|
||||||
import librosa
|
if self._live_caller_stream is not None:
|
||||||
|
return
|
||||||
|
|
||||||
if self.output_device is None:
|
if self.output_device is None:
|
||||||
return
|
return
|
||||||
|
|
||||||
|
self._live_caller_queue = queue.Queue()
|
||||||
|
|
||||||
|
device_info = sd.query_devices(self.output_device)
|
||||||
|
num_channels = device_info['max_output_channels']
|
||||||
|
device_sr = int(device_info['default_samplerate'])
|
||||||
|
channel_idx = min(self.live_caller_channel, num_channels) - 1
|
||||||
|
|
||||||
|
self._live_caller_device_sr = device_sr
|
||||||
|
self._live_caller_num_channels = num_channels
|
||||||
|
self._live_caller_channel_idx = channel_idx
|
||||||
|
|
||||||
|
def callback(outdata, frames, time_info, status):
|
||||||
|
outdata.fill(0)
|
||||||
|
written = 0
|
||||||
|
while written < frames:
|
||||||
|
try:
|
||||||
|
chunk = self._live_caller_queue.get_nowait()
|
||||||
|
end = min(written + len(chunk), frames)
|
||||||
|
count = end - written
|
||||||
|
outdata[written:end, channel_idx] = chunk[:count]
|
||||||
|
if count < len(chunk):
|
||||||
|
# Put remainder back (rare)
|
||||||
|
leftover = chunk[count:]
|
||||||
|
self._live_caller_queue.put(leftover)
|
||||||
|
written = end
|
||||||
|
except Exception:
|
||||||
|
break
|
||||||
|
|
||||||
|
self._live_caller_stream = sd.OutputStream(
|
||||||
|
device=self.output_device,
|
||||||
|
samplerate=device_sr,
|
||||||
|
channels=num_channels,
|
||||||
|
dtype=np.float32,
|
||||||
|
callback=callback,
|
||||||
|
blocksize=2048,
|
||||||
|
)
|
||||||
|
self._live_caller_stream.start()
|
||||||
|
print(f"[Audio] Live caller stream started on ch {self.live_caller_channel} @ {device_sr}Hz")
|
||||||
|
|
||||||
|
def _stop_live_caller_stream(self):
|
||||||
|
"""Stop persistent live caller output stream"""
|
||||||
|
if self._live_caller_stream:
|
||||||
|
self._live_caller_stream.stop()
|
||||||
|
self._live_caller_stream.close()
|
||||||
|
self._live_caller_stream = None
|
||||||
|
self._live_caller_queue = None
|
||||||
|
print("[Audio] Live caller stream stopped")
|
||||||
|
|
||||||
|
def route_real_caller_audio(self, pcm_data: bytes, sample_rate: int):
|
||||||
|
"""Route real caller PCM audio to the configured live caller Loopback channel"""
|
||||||
|
if self.output_device is None:
|
||||||
|
return
|
||||||
|
|
||||||
|
# Ensure persistent stream is running
|
||||||
|
if self._live_caller_stream is None:
|
||||||
|
self._start_live_caller_stream()
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Convert bytes to float32
|
|
||||||
audio = np.frombuffer(pcm_data, dtype=np.int16).astype(np.float32) / 32768.0
|
audio = np.frombuffer(pcm_data, dtype=np.int16).astype(np.float32) / 32768.0
|
||||||
|
|
||||||
device_info = sd.query_devices(self.output_device)
|
# Simple decimation/interpolation instead of librosa
|
||||||
num_channels = device_info['max_output_channels']
|
device_sr = self._live_caller_device_sr
|
||||||
device_sr = int(device_info['default_samplerate'])
|
|
||||||
channel_idx = min(self.live_caller_channel, num_channels) - 1
|
|
||||||
|
|
||||||
# Resample to device sample rate if needed
|
|
||||||
if sample_rate != device_sr:
|
if sample_rate != device_sr:
|
||||||
audio = librosa.resample(audio, orig_sr=sample_rate, target_sr=device_sr)
|
ratio = device_sr / sample_rate
|
||||||
|
out_len = int(len(audio) * ratio)
|
||||||
|
indices = (np.arange(out_len) / ratio).astype(int)
|
||||||
|
indices = np.clip(indices, 0, len(audio) - 1)
|
||||||
|
audio = audio[indices]
|
||||||
|
|
||||||
# Create multi-channel output
|
self._live_caller_queue.put(audio)
|
||||||
multi_ch = np.zeros((len(audio), num_channels), dtype=np.float32)
|
|
||||||
multi_ch[:, channel_idx] = audio
|
|
||||||
|
|
||||||
# Write to output device
|
|
||||||
with sd.OutputStream(
|
|
||||||
device=self.output_device,
|
|
||||||
samplerate=device_sr,
|
|
||||||
channels=num_channels,
|
|
||||||
dtype=np.float32,
|
|
||||||
) as stream:
|
|
||||||
stream.write(multi_ch)
|
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"Real caller audio routing error: {e}")
|
print(f"Real caller audio routing error: {e}")
|
||||||
@@ -366,44 +416,45 @@ class AudioService:
|
|||||||
|
|
||||||
self._host_send_callback = send_callback
|
self._host_send_callback = send_callback
|
||||||
|
|
||||||
device_info = sd.query_devices(self.input_device)
|
def _start():
|
||||||
max_channels = device_info['max_input_channels']
|
device_info = sd.query_devices(self.input_device)
|
||||||
device_sr = int(device_info['default_samplerate'])
|
max_channels = device_info['max_input_channels']
|
||||||
record_channel = min(self.input_channel, max_channels) - 1
|
device_sr = int(device_info['default_samplerate'])
|
||||||
|
record_channel = min(self.input_channel, max_channels) - 1
|
||||||
|
step = max(1, int(device_sr / 16000))
|
||||||
|
|
||||||
import librosa
|
def callback(indata, frames, time_info, status):
|
||||||
|
if not self._host_send_callback:
|
||||||
|
return
|
||||||
|
mono = indata[:, record_channel]
|
||||||
|
# Simple decimation to ~16kHz
|
||||||
|
if step > 1:
|
||||||
|
mono = mono[::step]
|
||||||
|
pcm = (mono * 32767).astype(np.int16).tobytes()
|
||||||
|
self._host_send_callback(pcm)
|
||||||
|
|
||||||
def callback(indata, frames, time_info, status):
|
self._host_stream = sd.InputStream(
|
||||||
if not self._host_send_callback:
|
device=self.input_device,
|
||||||
return
|
channels=max_channels,
|
||||||
# Extract the configured input channel
|
samplerate=device_sr,
|
||||||
mono = indata[:, record_channel].copy()
|
dtype=np.float32,
|
||||||
# Resample to 16kHz if needed
|
blocksize=4096,
|
||||||
if device_sr != 16000:
|
callback=callback,
|
||||||
mono = librosa.resample(mono, orig_sr=device_sr, target_sr=16000)
|
)
|
||||||
# Convert float32 to int16 PCM
|
self._host_stream.start()
|
||||||
pcm = (mono * 32767).astype(np.int16).tobytes()
|
print(f"[Audio] Host mic streaming started (device {self.input_device} ch {self.input_channel} @ {device_sr}Hz)")
|
||||||
self._host_send_callback(pcm)
|
|
||||||
|
|
||||||
self._host_stream = sd.InputStream(
|
threading.Thread(target=_start, daemon=True).start()
|
||||||
device=self.input_device,
|
|
||||||
channels=max_channels,
|
|
||||||
samplerate=device_sr,
|
|
||||||
dtype=np.float32,
|
|
||||||
blocksize=4096,
|
|
||||||
callback=callback,
|
|
||||||
)
|
|
||||||
self._host_stream.start()
|
|
||||||
print(f"[Audio] Host mic streaming started (device {self.input_device} ch {self.input_channel} @ {device_sr}Hz)")
|
|
||||||
|
|
||||||
def stop_host_stream(self):
|
def stop_host_stream(self):
|
||||||
"""Stop host mic streaming"""
|
"""Stop host mic streaming and live caller output"""
|
||||||
if self._host_stream:
|
if self._host_stream:
|
||||||
self._host_stream.stop()
|
self._host_stream.stop()
|
||||||
self._host_stream.close()
|
self._host_stream.close()
|
||||||
self._host_stream = None
|
self._host_stream = None
|
||||||
self._host_send_callback = None
|
self._host_send_callback = None
|
||||||
print("[Audio] Host mic streaming stopped")
|
print("[Audio] Host mic streaming stopped")
|
||||||
|
self._stop_live_caller_stream()
|
||||||
|
|
||||||
# --- Music Playback ---
|
# --- Music Playback ---
|
||||||
|
|
||||||
|
|||||||
@@ -128,7 +128,7 @@
|
|||||||
</div>
|
</div>
|
||||||
<div class="channel-row">
|
<div class="channel-row">
|
||||||
<label>Caller Ch <input type="number" id="caller-channel" value="1" min="1" max="16" class="channel-input"></label>
|
<label>Caller Ch <input type="number" id="caller-channel" value="1" min="1" max="16" class="channel-input"></label>
|
||||||
<label>Live Ch <input type="number" id="live-caller-channel" value="4" min="1" max="16" class="channel-input"></label>
|
<label>Live Ch <input type="number" id="live-caller-channel" value="9" min="1" max="16" class="channel-input"></label>
|
||||||
<label>Music Ch <input type="number" id="music-channel" value="2" min="1" max="16" class="channel-input"></label>
|
<label>Music Ch <input type="number" id="music-channel" value="2" min="1" max="16" class="channel-input"></label>
|
||||||
<label>SFX Ch <input type="number" id="sfx-channel" value="3" min="1" max="16" class="channel-input"></label>
|
<label>SFX Ch <input type="number" id="sfx-channel" value="3" min="1" max="16" class="channel-input"></label>
|
||||||
</div>
|
</div>
|
||||||
|
|||||||
@@ -229,7 +229,7 @@ async function loadAudioDevices() {
|
|||||||
|
|
||||||
if (inputCh) inputCh.value = settings.input_channel || 1;
|
if (inputCh) inputCh.value = settings.input_channel || 1;
|
||||||
if (callerCh) callerCh.value = settings.caller_channel || 1;
|
if (callerCh) callerCh.value = settings.caller_channel || 1;
|
||||||
if (liveCallerCh) liveCallerCh.value = settings.live_caller_channel || 4;
|
if (liveCallerCh) liveCallerCh.value = settings.live_caller_channel || 9;
|
||||||
if (musicCh) musicCh.value = settings.music_channel || 2;
|
if (musicCh) musicCh.value = settings.music_channel || 2;
|
||||||
if (sfxCh) sfxCh.value = settings.sfx_channel || 3;
|
if (sfxCh) sfxCh.value = settings.sfx_channel || 3;
|
||||||
|
|
||||||
@@ -265,7 +265,7 @@ async function saveAudioDevices() {
|
|||||||
input_channel: inputChannel ? parseInt(inputChannel) : 1,
|
input_channel: inputChannel ? parseInt(inputChannel) : 1,
|
||||||
output_device: outputDevice ? parseInt(outputDevice) : null,
|
output_device: outputDevice ? parseInt(outputDevice) : null,
|
||||||
caller_channel: callerChannel ? parseInt(callerChannel) : 1,
|
caller_channel: callerChannel ? parseInt(callerChannel) : 1,
|
||||||
live_caller_channel: liveCallerChannel ? parseInt(liveCallerChannel) : 4,
|
live_caller_channel: liveCallerChannel ? parseInt(liveCallerChannel) : 9,
|
||||||
music_channel: musicChannel ? parseInt(musicChannel) : 2,
|
music_channel: musicChannel ? parseInt(musicChannel) : 2,
|
||||||
sfx_channel: sfxChannel ? parseInt(sfxChannel) : 3,
|
sfx_channel: sfxChannel ? parseInt(sfxChannel) : 3,
|
||||||
phone_filter: phoneFilterChecked
|
phone_filter: phoneFilterChecked
|
||||||
|
|||||||
Reference in New Issue
Block a user