Files
ai-podcast/backend/services/audio.py
tcpsyn 0bdac16250 Upgrade Whisper to distil-large-v3, fix caller identity confusion, sort clips list
- Whisper base → distil-large-v3 for much better live transcription accuracy
- Add context hints to transcription (caller name, screening status)
- Increase beam_size 3→5 for better decoding
- Add explicit role clarification in caller system prompt so LLM knows Luke is the host
- Prefix host messages with [Host Luke] in LLM conversation
- Fix upload_clips episode list sorting (natural numeric order)
- Episodes 26-28 transcripts, data updates, misc fixes

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-05 12:46:51 -07:00

1271 lines
50 KiB
Python

"""Server-side audio service for Loopback routing"""
import sounddevice as sd
import numpy as np
import threading
import queue
import json
from pathlib import Path
from typing import Optional, Callable
import wave
import time
# Settings file path
SETTINGS_FILE = Path(__file__).parent.parent.parent / "audio_settings.json"
# REAPER state file for dialog region markers
REAPER_STATE_FILE = "/tmp/reaper_state.txt"
def _write_reaper_state(state: str):
"""Write state to file. Uses a thread so it's safe from audio callbacks."""
def _write():
try:
with open(REAPER_STATE_FILE, "w") as f:
f.write(state)
except OSError:
pass
threading.Thread(target=_write, daemon=True).start()
class AudioService:
"""Manages audio I/O with multi-channel support for Loopback routing"""
def __init__(self):
# Device configuration
self.input_device: Optional[int] = 13 # Radio Voice Mic (loopback input)
self.input_channel: int = 1 # 1-indexed channel
self.output_device: Optional[int] = 12 # Radio Voice Mic (loopback output)
self.caller_channel: int = 3 # Channel for caller TTS
self.live_caller_channel: int = 9 # Channel for live caller audio
self.music_channel: int = 5 # Channel for music
self.sfx_channel: int = 3 # Channel for SFX
self.ad_channel: int = 11 # Channel for ads
self.ident_channel: int = 15 # Channel for idents (stereo: ch 15+16)
self.monitor_device: Optional[int] = 14 # Babyface Pro (headphone monitoring)
self.monitor_channel: int = 1 # Channel for mic monitoring on monitor device
self.phone_filter: bool = False # Phone filter on caller voices
# Ad playback state
self._ad_stream: Optional[sd.OutputStream] = None
self._ad_data: Optional[np.ndarray] = None
self._ad_resampled: Optional[np.ndarray] = None
self._ad_position: int = 0
self._ad_playing: bool = False
# Ident playback state
self._ident_stream: Optional[sd.OutputStream] = None
self._ident_data: Optional[np.ndarray] = None
self._ident_resampled: Optional[np.ndarray] = None
self._ident_position: int = 0
self._ident_playing: bool = False
# Recording state
self._recording = False
self._record_thread: Optional[threading.Thread] = None
self._audio_queue: queue.Queue = queue.Queue()
self._recorded_audio: list = []
self._record_device_sr: int = 48000
# Music playback state
self._music_stream: Optional[sd.OutputStream] = None
self._music_data: Optional[np.ndarray] = None
self._music_resampled: Optional[np.ndarray] = None
self._music_position: int = 0
self._music_playing: bool = False
self._music_volume: float = 0.3
self._music_loop: bool = True
# Music crossfade state
self._crossfade_active: bool = False
self._crossfade_old_data: Optional[np.ndarray] = None
self._crossfade_old_position: int = 0
self._crossfade_progress: float = 0.0
self._crossfade_samples: int = 0
self._crossfade_step: float = 0.0
# Caller playback state
self._caller_stop_event = threading.Event()
self._caller_thread: Optional[threading.Thread] = None
# Host mic streaming state
self._host_stream: Optional[sd.InputStream] = None
self._host_send_callback: Optional[Callable] = None
self._host_device_sr: int = 48000
# Live caller routing state
self._live_caller_stream: Optional[sd.OutputStream] = None
self._live_caller_write: Optional[Callable] = None
# Sample rates
self.input_sample_rate = 16000 # For Whisper
self.output_sample_rate = 24000 # For TTS
# Mic monitor (input → monitor device passthrough)
self._monitor_stream: Optional[sd.OutputStream] = None
self._monitor_write: Optional[Callable] = None
# Stem recording (opt-in, attached via API)
self.stem_recorder = None
self._stem_mic_stream: Optional[sd.InputStream] = None
# Load saved settings
self._load_settings()
def _load_settings(self):
"""Load settings from disk"""
if SETTINGS_FILE.exists():
try:
with open(SETTINGS_FILE) as f:
data = json.load(f)
self.input_device = data.get("input_device")
self.input_channel = data.get("input_channel", 1)
self.output_device = data.get("output_device")
self.caller_channel = data.get("caller_channel", 1)
self.live_caller_channel = data.get("live_caller_channel", 4)
self.music_channel = data.get("music_channel", 2)
self.sfx_channel = data.get("sfx_channel", 3)
self.ad_channel = data.get("ad_channel", 11)
self.ident_channel = data.get("ident_channel", 15)
self.monitor_device = data.get("monitor_device")
self.monitor_channel = data.get("monitor_channel", 1)
self.phone_filter = data.get("phone_filter", False)
print(f"Loaded audio settings: input={self.input_device}, output={self.output_device}, monitor={self.monitor_device}, phone_filter={self.phone_filter}")
except Exception as e:
print(f"Failed to load audio settings: {e}")
def _save_settings(self):
"""Save settings to disk"""
try:
data = {
"input_device": self.input_device,
"input_channel": self.input_channel,
"output_device": self.output_device,
"caller_channel": self.caller_channel,
"live_caller_channel": self.live_caller_channel,
"music_channel": self.music_channel,
"sfx_channel": self.sfx_channel,
"ad_channel": self.ad_channel,
"ident_channel": self.ident_channel,
"monitor_device": self.monitor_device,
"monitor_channel": self.monitor_channel,
"phone_filter": self.phone_filter,
}
with open(SETTINGS_FILE, "w") as f:
json.dump(data, f, indent=2)
print(f"Saved audio settings")
except Exception as e:
print(f"Failed to save audio settings: {e}")
def list_devices(self) -> list[dict]:
"""List all available audio devices"""
devices = sd.query_devices()
result = []
for i, d in enumerate(devices):
result.append({
"id": i,
"name": d["name"],
"inputs": d["max_input_channels"],
"outputs": d["max_output_channels"],
"default_sr": d["default_samplerate"]
})
return result
def set_devices(
self,
input_device: Optional[int] = None,
input_channel: Optional[int] = None,
output_device: Optional[int] = None,
caller_channel: Optional[int] = None,
live_caller_channel: Optional[int] = None,
music_channel: Optional[int] = None,
sfx_channel: Optional[int] = None,
ad_channel: Optional[int] = None,
ident_channel: Optional[int] = None,
monitor_device: Optional[int] = None,
monitor_channel: Optional[int] = None,
phone_filter: Optional[bool] = None
):
"""Configure audio devices and channels"""
if input_device is not None:
self.input_device = input_device
if input_channel is not None:
self.input_channel = input_channel
if output_device is not None:
self.output_device = output_device
if caller_channel is not None:
self.caller_channel = caller_channel
if live_caller_channel is not None:
self.live_caller_channel = live_caller_channel
if music_channel is not None:
self.music_channel = music_channel
if sfx_channel is not None:
self.sfx_channel = sfx_channel
if ad_channel is not None:
self.ad_channel = ad_channel
if ident_channel is not None:
self.ident_channel = ident_channel
if monitor_device is not None:
self.monitor_device = monitor_device
if monitor_channel is not None:
self.monitor_channel = monitor_channel
if phone_filter is not None:
self.phone_filter = phone_filter
# Persist to disk
self._save_settings()
def get_device_settings(self) -> dict:
"""Get current device configuration"""
return {
"input_device": self.input_device,
"input_channel": self.input_channel,
"output_device": self.output_device,
"caller_channel": self.caller_channel,
"live_caller_channel": self.live_caller_channel,
"music_channel": self.music_channel,
"sfx_channel": self.sfx_channel,
"ad_channel": self.ad_channel,
"ident_channel": self.ident_channel,
"monitor_device": self.monitor_device,
"monitor_channel": self.monitor_channel,
"phone_filter": self.phone_filter,
}
# --- Recording ---
def start_recording(self) -> bool:
"""Start recording from input device"""
if self._recording:
return False
if self.input_device is None:
print("No input device configured")
return False
self._recording = True
self._recorded_audio = []
if self._host_stream is not None:
# Host stream already capturing — piggyback on it
self._record_device_sr = self._host_device_sr
print(f"Recording started (piggybacking on host stream @ {self._host_device_sr}Hz)")
return True
self._record_thread = threading.Thread(target=self._record_worker)
self._record_thread.start()
print(f"Recording started from device {self.input_device}")
return True
def stop_recording(self) -> bytes:
"""Stop recording and return audio data resampled to 16kHz for Whisper"""
import librosa
if not self._recording:
return b""
self._recording = False
if self._record_thread:
self._record_thread.join(timeout=2.0)
self._record_thread = None
else:
# Piggybacking on host stream — give callback a moment to finish
time.sleep(0.05)
if not self._recorded_audio:
piggyback = self._host_stream is not None
# Check what other streams might be active
active_streams = []
if self._music_stream:
active_streams.append("music")
if self._live_caller_stream:
active_streams.append("live_caller")
if self._host_stream:
active_streams.append("host")
streams_info = f", active_streams=[{','.join(active_streams)}]" if active_streams else ""
print(f"Recording stopped: NO audio chunks captured (piggyback={piggyback}, device={self.input_device}, ch={self.input_channel}{streams_info})")
return b""
# Combine all chunks
audio = np.concatenate(self._recorded_audio)
device_sr = getattr(self, '_record_device_sr', 48000)
print(f"Recording stopped: {len(audio)} samples @ {device_sr}Hz ({len(audio)/device_sr:.2f}s), chunks={len(self._recorded_audio)}, peak={np.abs(audio).max():.4f}")
# Resample to 16kHz for Whisper
if device_sr != 16000:
audio = librosa.resample(audio, orig_sr=device_sr, target_sr=16000)
print(f"Resampled to 16kHz: {len(audio)} samples")
# Convert to bytes (16-bit PCM)
audio_int16 = (audio * 32767).astype(np.int16)
return audio_int16.tobytes()
def _record_worker(self):
"""Background thread for recording from specific channel"""
try:
# Get device info
device_info = sd.query_devices(self.input_device)
max_channels = device_info['max_input_channels']
device_sr = int(device_info['default_samplerate'])
record_channel = min(self.input_channel, max_channels) - 1
if max_channels == 0:
print(f"Recording error: device {self.input_device} has no input channels")
self._recording = False
return
# Store device sample rate for later resampling
self._record_device_sr = device_sr
stream_ready = threading.Event()
callback_count = [0]
def callback(indata, frames, time_info, status):
if status:
print(f"Record status: {status}")
callback_count[0] += 1
if not stream_ready.is_set():
stream_ready.set()
if self._recording:
self._recorded_audio.append(indata[:, record_channel].copy())
if self.stem_recorder:
self.stem_recorder.write("host", indata[:, record_channel].copy(), device_sr)
print(f"Recording: opening stream on device {self.input_device} ch {self.input_channel} @ {device_sr}Hz ({max_channels} ch)")
with sd.InputStream(
device=self.input_device,
channels=max_channels,
samplerate=device_sr, # Use device's native rate
dtype=np.float32,
callback=callback,
blocksize=1024
):
# Wait for stream to actually start capturing
if not stream_ready.wait(timeout=1.0):
print(f"Recording WARNING: stream opened but callback not firing after 1s")
while self._recording:
time.sleep(0.05)
print(f"Recording: stream closed, {callback_count[0]} callbacks fired, {len(self._recorded_audio)} chunks captured")
except Exception as e:
print(f"Recording error: {e}")
import traceback
traceback.print_exc()
self._recording = False
# --- Caller TTS Playback ---
def _apply_fade(self, audio: np.ndarray, sample_rate: int, fade_ms: int = 15) -> np.ndarray:
"""Apply fade-in and fade-out to avoid clicks"""
fade_samples = int(sample_rate * fade_ms / 1000)
if len(audio) < fade_samples * 2:
return audio
# Fade in
fade_in = np.linspace(0, 1, fade_samples)
audio[:fade_samples] *= fade_in
# Fade out
fade_out = np.linspace(1, 0, fade_samples)
audio[-fade_samples:] *= fade_out
return audio
def play_caller_audio(self, audio_bytes: bytes, sample_rate: int = 24000):
"""Play caller TTS audio to specific channel of output device (interruptible)"""
import librosa
# Stop any existing caller audio
self.stop_caller_audio()
self._caller_stop_event.clear()
# Convert bytes to numpy
audio = np.frombuffer(audio_bytes, dtype=np.int16).astype(np.float32) / 32768.0
if self.output_device is None:
print("No output device configured, using default")
audio = self._apply_fade(audio, sample_rate)
with sd.OutputStream(samplerate=sample_rate, channels=1, dtype=np.float32) as stream:
stream.write(audio.reshape(-1, 1))
return
try:
# Get device info and resample to device's native rate
device_info = sd.query_devices(self.output_device)
num_channels = device_info['max_output_channels']
device_sr = int(device_info['default_samplerate'])
channel_idx = min(self.caller_channel, num_channels) - 1
# Resample if needed
if sample_rate != device_sr:
audio = librosa.resample(audio, orig_sr=sample_rate, target_sr=device_sr)
# Apply fade to prevent clicks
audio = self._apply_fade(audio, device_sr)
# Create multi-channel output with audio only on target channel
multi_ch = np.zeros((len(audio), num_channels), dtype=np.float32)
multi_ch[:, channel_idx] = audio
print(f"Playing caller audio to device {self.output_device} ch {self.caller_channel} @ {device_sr}Hz")
# Play in chunks so we can interrupt
chunk_size = int(device_sr * 0.1) # 100ms chunks
pos = 0
with sd.OutputStream(
device=self.output_device,
samplerate=device_sr,
channels=num_channels,
dtype=np.float32
) as stream:
while pos < len(multi_ch) and not self._caller_stop_event.is_set():
end = min(pos + chunk_size, len(multi_ch))
stream.write(multi_ch[pos:end])
# Record each chunk as it plays so hangups cut the stem too
if self.stem_recorder:
self.stem_recorder.write_sporadic("caller", audio[pos:end].copy(), device_sr)
pos = end
if self._caller_stop_event.is_set():
print("Caller audio stopped early")
else:
print(f"Played caller audio: {len(audio)/device_sr:.2f}s")
except Exception as e:
print(f"Caller playback error: {e}")
def stop_caller_audio(self):
"""Stop any playing caller audio"""
self._caller_stop_event.set()
def _start_live_caller_stream(self):
"""Start persistent output stream with ring buffer jitter absorption"""
if self._live_caller_stream is not None:
return
if self.output_device is None:
return
device_info = sd.query_devices(self.output_device)
num_channels = device_info['max_output_channels']
device_sr = int(device_info['default_samplerate'])
channel_idx = min(self.live_caller_channel, num_channels) - 1
self._live_caller_device_sr = device_sr
self._live_caller_num_channels = num_channels
self._live_caller_channel_idx = channel_idx
# Ring buffer: 3 seconds capacity, 80ms pre-buffer before playback starts
ring_size = int(device_sr * 3)
ring = np.zeros(ring_size, dtype=np.float32)
prebuffer_samples = int(device_sr * 0.08)
# Mutable state shared between writer (main thread) and reader (audio callback)
# CPython GIL makes individual int reads/writes atomic
state = {"write_pos": 0, "read_pos": 0, "avail": 0, "started": False}
def write_audio(data):
n = len(data)
wp = state["write_pos"]
if wp + n <= ring_size:
ring[wp:wp + n] = data
else:
first = ring_size - wp
ring[wp:] = data[:first]
ring[:n - first] = data[first:]
state["write_pos"] = (wp + n) % ring_size
state["avail"] += n
def callback(outdata, frames, time_info, status):
outdata.fill(0)
avail = state["avail"]
if not state["started"]:
if avail >= prebuffer_samples:
state["started"] = True
else:
return
if avail < frames:
# Underrun — stop and re-buffer
state["started"] = False
return
rp = state["read_pos"]
if rp + frames <= ring_size:
outdata[:frames, channel_idx] = ring[rp:rp + frames]
else:
first = ring_size - rp
outdata[:first, channel_idx] = ring[rp:]
outdata[first:frames, channel_idx] = ring[:frames - first]
state["read_pos"] = (rp + frames) % ring_size
state["avail"] -= frames
self._live_caller_write = write_audio
self._live_caller_stream = sd.OutputStream(
device=self.output_device,
samplerate=device_sr,
channels=num_channels,
dtype=np.float32,
callback=callback,
blocksize=1024,
)
self._live_caller_stream.start()
print(f"[Audio] Live caller stream started on ch {self.live_caller_channel} @ {device_sr}Hz (prebuffer {prebuffer_samples} samples)")
def _stop_live_caller_stream(self):
"""Stop persistent live caller output stream"""
if self._live_caller_stream:
stream = self._live_caller_stream
self._live_caller_stream = None
self._live_caller_write = None
self._close_stream(stream)
print("[Audio] Live caller stream stopped")
def route_real_caller_audio(self, pcm_data: bytes, sample_rate: int):
"""Route real caller PCM audio to the configured live caller Loopback channel"""
if self.output_device is None:
return
if self._live_caller_stream is None:
self._start_live_caller_stream()
try:
audio = np.frombuffer(pcm_data, dtype=np.int16).astype(np.float32) / 32768.0
device_sr = self._live_caller_device_sr
if sample_rate != device_sr:
ratio = device_sr / sample_rate
out_len = int(len(audio) * ratio)
indices = (np.arange(out_len) / ratio).astype(int)
indices = np.clip(indices, 0, len(audio) - 1)
audio = audio[indices]
# Stem recording: live caller
if self.stem_recorder:
self.stem_recorder.write_sporadic("caller", audio.copy(), device_sr)
if self._live_caller_write:
self._live_caller_write(audio)
except Exception as e:
print(f"Real caller audio routing error: {e}")
# --- Host Mic Streaming ---
def start_host_stream(self, send_callback: Callable):
"""Start continuous host mic capture for streaming to real callers"""
if self._host_stream is not None:
self._host_send_callback = send_callback
return
if self.input_device is None:
print("[Audio] No input device configured for host streaming")
return
# Close stem_mic if active — this stream's callback handles stem recording too
if self._stem_mic_stream is not None:
stream = self._stem_mic_stream
self._stem_mic_stream = None
self._close_stream(stream)
print("[Audio] Closed stem_mic (host stream takes over)")
self._host_send_callback = send_callback
def _start():
device_info = sd.query_devices(self.input_device)
max_channels = device_info['max_input_channels']
device_sr = int(device_info['default_samplerate'])
record_channel = min(self.input_channel, max_channels) - 1
step = max(1, int(device_sr / 16000))
# Buffer host mic to send ~100ms chunks (reduces WebSocket frame rate)
host_accum = []
host_accum_samples = [0]
send_threshold = 1600 # 100ms at 16kHz
# Start mic monitor if monitor device is configured
self._start_monitor(device_sr)
def callback(indata, frames, time_info, status):
# Capture for push-to-talk recording if active
if self._recording and self._recorded_audio is not None:
self._recorded_audio.append(indata[:, record_channel].copy())
# Stem recording: host mic
if self.stem_recorder:
self.stem_recorder.write("host", indata[:, record_channel].copy(), device_sr)
# Mic monitor: send to headphone device
if self._monitor_write:
self._monitor_write(indata[:, record_channel].copy())
if not self._host_send_callback:
return
mono = indata[:, record_channel]
# Downsample to ~16kHz with averaging (anti-aliased)
if step > 1:
n = len(mono) // step * step
mono = mono[:n].reshape(-1, step).mean(axis=1)
host_accum.append(mono.copy())
host_accum_samples[0] += len(mono)
if host_accum_samples[0] >= send_threshold:
combined = np.concatenate(host_accum)
pcm = (combined * 32767).astype(np.int16).tobytes()
host_accum.clear()
host_accum_samples[0] = 0
self._host_send_callback(pcm)
self._host_device_sr = device_sr
self._host_stream = sd.InputStream(
device=self.input_device,
channels=max_channels,
samplerate=device_sr,
dtype=np.float32,
blocksize=1024,
callback=callback,
)
self._host_stream.start()
print(f"[Audio] Host mic streaming started (device {self.input_device} ch {self.input_channel} @ {device_sr}Hz)")
threading.Thread(target=_start, daemon=True).start()
def stop_host_stream(self):
"""Stop host mic streaming and live caller output"""
if self._host_stream:
stream = self._host_stream
self._host_stream = None
self._host_send_callback = None
self._close_stream(stream)
print("[Audio] Host mic streaming stopped")
self._stop_monitor()
self._stop_live_caller_stream()
# --- Mic Monitor (input → headphone device) ---
def _start_monitor(self, input_sr: int):
"""Start mic monitor stream that routes input to monitor device"""
if self._monitor_stream is not None:
return
if self.monitor_device is None:
return
device_info = sd.query_devices(self.monitor_device)
num_channels = device_info['max_output_channels']
device_sr = int(device_info['default_samplerate'])
channel_idx = min(self.monitor_channel, num_channels) - 1
# Ring buffer for cross-device routing
ring_size = int(device_sr * 2)
ring = np.zeros(ring_size, dtype=np.float32)
state = {"write_pos": 0, "read_pos": 0, "avail": 0}
# Precompute resample ratio (input device sr → monitor device sr)
resample_ratio = device_sr / input_sr
def write_audio(data):
# Resample if sample rates differ
if abs(resample_ratio - 1.0) > 0.01:
n_out = int(len(data) * resample_ratio)
indices = np.linspace(0, len(data) - 1, n_out).astype(int)
data = data[indices]
n = len(data)
wp = state["write_pos"]
if wp + n <= ring_size:
ring[wp:wp + n] = data
else:
first = ring_size - wp
ring[wp:] = data[:first]
ring[:n - first] = data[first:]
state["write_pos"] = (wp + n) % ring_size
state["avail"] += n
def callback(outdata, frames, time_info, status):
outdata.fill(0)
avail = state["avail"]
if avail < frames:
return
rp = state["read_pos"]
if rp + frames <= ring_size:
outdata[:frames, channel_idx] = ring[rp:rp + frames]
else:
first = ring_size - rp
outdata[:first, channel_idx] = ring[rp:]
outdata[first:frames, channel_idx] = ring[:frames - first]
state["read_pos"] = (rp + frames) % ring_size
state["avail"] -= frames
self._monitor_write = write_audio
self._monitor_stream = sd.OutputStream(
device=self.monitor_device,
samplerate=device_sr,
channels=num_channels,
dtype=np.float32,
blocksize=1024,
callback=callback,
)
self._monitor_stream.start()
print(f"[Audio] Mic monitor started (device {self.monitor_device} ch {self.monitor_channel} @ {device_sr}Hz)")
def _stop_monitor(self):
"""Stop mic monitor stream"""
if self._monitor_stream:
stream = self._monitor_stream
self._monitor_stream = None
self._monitor_write = None
self._close_stream(stream)
print("[Audio] Mic monitor stopped")
# --- Music Playback ---
def load_music(self, file_path: str) -> bool:
"""Load a music file for playback"""
path = Path(file_path)
if not path.exists():
print(f"Music file not found: {file_path}")
return False
try:
import librosa
audio, sr = librosa.load(str(path), sr=self.output_sample_rate, mono=True)
self._music_data = audio.astype(np.float32)
self._music_position = 0
print(f"Loaded music: {path.name} ({len(audio)/sr:.1f}s)")
return True
except Exception as e:
print(f"Failed to load music: {e}")
return False
def crossfade_to(self, file_path: str, duration: float = 3.0):
"""Crossfade from current music track to a new one"""
import librosa
if not self._music_playing or self._music_resampled is None:
if self.load_music(file_path):
self.play_music()
return
# Load the new track
path = Path(file_path)
if not path.exists():
print(f"Music file not found: {file_path}")
return
try:
audio, sr = librosa.load(str(path), sr=self.output_sample_rate, mono=True)
new_data = audio.astype(np.float32)
except Exception as e:
print(f"Failed to load music for crossfade: {e}")
return
# Get device sample rate for resampling
if self.output_device is not None:
device_info = sd.query_devices(self.output_device)
device_sr = int(device_info['default_samplerate'])
else:
device_sr = self.output_sample_rate
if self.output_sample_rate != device_sr:
new_resampled = librosa.resample(new_data, orig_sr=self.output_sample_rate, target_sr=device_sr)
else:
new_resampled = new_data.copy()
# Swap: current becomes old, new becomes current
self._crossfade_old_data = self._music_resampled
self._crossfade_old_position = self._music_position
self._music_resampled = new_resampled
self._music_data = new_data
self._music_position = 0
# Configure crossfade timing
self._crossfade_samples = int(device_sr * duration)
self._crossfade_progress = 0.0
self._crossfade_step = 1.0 / self._crossfade_samples if self._crossfade_samples > 0 else 1.0
self._crossfade_active = True
print(f"Crossfading to {path.name} over {duration}s")
def play_music(self):
"""Start music playback to specific channel"""
import librosa
if self._music_data is None:
print("No music loaded")
return
if self._music_playing:
self.stop_music()
self._music_playing = True
self._music_position = 0
if self.output_device is None:
print("No output device configured, using default")
num_channels = 2
device = None
device_sr = self.output_sample_rate
channel_idx = 0
else:
device_info = sd.query_devices(self.output_device)
num_channels = device_info['max_output_channels']
device_sr = int(device_info['default_samplerate'])
device = self.output_device
channel_idx = min(self.music_channel, num_channels) - 1
# Resample music to device sample rate if needed
if self.output_sample_rate != device_sr:
self._music_resampled = librosa.resample(
self._music_data, orig_sr=self.output_sample_rate, target_sr=device_sr
)
else:
self._music_resampled = self._music_data.copy()
# Apply fade-in at start of track
fade_samples = int(device_sr * 0.015) # 15ms fade
if len(self._music_resampled) > fade_samples:
fade_in = np.linspace(0, 1, fade_samples).astype(np.float32)
self._music_resampled[:fade_samples] *= fade_in
def callback(outdata, frames, time_info, status):
outdata.fill(0)
if not self._music_playing or self._music_resampled is None:
return
# Read new track samples
end_pos = self._music_position + frames
if end_pos <= len(self._music_resampled):
new_samples = self._music_resampled[self._music_position:end_pos].copy()
self._music_position = end_pos
else:
remaining = len(self._music_resampled) - self._music_position
new_samples = np.zeros(frames, dtype=np.float32)
if remaining > 0:
new_samples[:remaining] = self._music_resampled[self._music_position:]
if self._music_loop:
wrap_frames = frames - remaining
if wrap_frames > 0:
new_samples[remaining:] = self._music_resampled[:wrap_frames]
self._music_position = wrap_frames
else:
self._music_position = len(self._music_resampled)
if remaining <= 0:
self._music_playing = False
if self._crossfade_active and self._crossfade_old_data is not None:
# Read old track samples
old_end = self._crossfade_old_position + frames
if old_end <= len(self._crossfade_old_data):
old_samples = self._crossfade_old_data[self._crossfade_old_position:old_end]
self._crossfade_old_position = old_end
else:
old_remaining = len(self._crossfade_old_data) - self._crossfade_old_position
old_samples = np.zeros(frames, dtype=np.float32)
if old_remaining > 0:
old_samples[:old_remaining] = self._crossfade_old_data[self._crossfade_old_position:]
self._crossfade_old_position = len(self._crossfade_old_data)
# Compute fade curves for this chunk
start_progress = self._crossfade_progress
end_progress = min(1.0, start_progress + self._crossfade_step * frames)
fade_in = np.linspace(start_progress, end_progress, frames, dtype=np.float32)
fade_out = 1.0 - fade_in
mono_out = (old_samples * fade_out + new_samples * fade_in) * self._music_volume
outdata[:, channel_idx] = mono_out
if self.stem_recorder:
self.stem_recorder.write_sporadic("music", mono_out.copy(), device_sr)
self._crossfade_progress = end_progress
if self._crossfade_progress >= 1.0:
self._crossfade_active = False
self._crossfade_old_data = None
print("Crossfade complete")
else:
mono_out = new_samples * self._music_volume
outdata[:, channel_idx] = mono_out
if self.stem_recorder:
self.stem_recorder.write_sporadic("music", mono_out.copy(), device_sr)
try:
self._music_stream = sd.OutputStream(
device=device,
channels=num_channels,
samplerate=device_sr,
dtype=np.float32,
callback=callback,
blocksize=2048
)
self._music_stream.start()
print(f"Music playback started on ch {self.music_channel} @ {device_sr}Hz")
except Exception as e:
print(f"Music playback error: {e}")
self._music_playing = False
def _close_stream(self, stream):
"""Safely close a sounddevice stream, ignoring double-close errors"""
if stream is None:
return
try:
stream.stop()
except Exception:
pass
try:
stream.close()
except Exception:
pass
def stop_music(self, fade_duration: float = 2.0):
"""Stop music playback with fade out"""
if not self._music_playing or not self._music_stream:
self._music_playing = False
stream = self._music_stream
self._music_stream = None
self._close_stream(stream)
self._music_position = 0
return
if fade_duration <= 0:
self._music_playing = False
stream = self._music_stream
self._music_stream = None
self._close_stream(stream)
self._music_position = 0
print("Music stopped")
return
import threading
original_volume = self._music_volume
steps = 20
step_time = fade_duration / steps
# Capture stream reference locally so the fade thread closes THIS stream,
# not whatever self._music_stream points to later
fade_stream = self._music_stream
self._music_stream = None
def _fade():
for i in range(steps):
if not self._music_playing:
break
self._music_volume = original_volume * (1 - (i + 1) / steps)
import time
time.sleep(step_time)
self._music_playing = False
self._close_stream(fade_stream)
self._music_position = 0
self._music_volume = original_volume
print("Music faded out and stopped")
threading.Thread(target=_fade, daemon=True).start()
def play_ad(self, file_path: str):
"""Load and play an ad file once (no loop) on the ad channel"""
import librosa
path = Path(file_path)
if not path.exists():
print(f"Ad file not found: {file_path}")
return
self.stop_ad()
self.stop_ident()
try:
audio, sr = librosa.load(str(path), sr=self.output_sample_rate, mono=True)
self._ad_data = audio.astype(np.float32)
except Exception as e:
print(f"Failed to load ad: {e}")
return
self._ad_playing = True
self._ad_position = 0
_write_reaper_state("ad")
if self.output_device is None:
num_channels = 2
device = None
device_sr = self.output_sample_rate
channel_idx = 0
else:
device_info = sd.query_devices(self.output_device)
num_channels = device_info['max_output_channels']
device_sr = int(device_info['default_samplerate'])
device = self.output_device
channel_idx = min(self.ad_channel, num_channels) - 1
if self.output_sample_rate != device_sr:
self._ad_resampled = librosa.resample(
self._ad_data, orig_sr=self.output_sample_rate, target_sr=device_sr
).astype(np.float32)
else:
self._ad_resampled = self._ad_data
def callback(outdata, frames, time_info, status):
outdata[:] = 0
if not self._ad_playing or self._ad_resampled is None:
return
remaining = len(self._ad_resampled) - self._ad_position
if remaining >= frames:
chunk = self._ad_resampled[self._ad_position:self._ad_position + frames]
outdata[:, channel_idx] = chunk
if self.stem_recorder:
self.stem_recorder.write_sporadic("ads", chunk.copy(), device_sr)
self._ad_position += frames
else:
if remaining > 0:
outdata[:remaining, channel_idx] = self._ad_resampled[self._ad_position:]
# Ad finished — no loop
self._ad_playing = False
_write_reaper_state("dialog")
try:
self._ad_stream = sd.OutputStream(
device=device,
channels=num_channels,
samplerate=device_sr,
dtype=np.float32,
callback=callback,
blocksize=2048
)
self._ad_stream.start()
print(f"Ad playback started on ch {self.ad_channel} @ {device_sr}Hz")
except Exception as e:
print(f"Ad playback error: {e}")
self._ad_playing = False
def stop_ad(self):
"""Stop ad playback"""
was_playing = self._ad_playing
self._ad_playing = False
if was_playing:
_write_reaper_state("dialog")
if self._ad_stream:
stream = self._ad_stream
self._ad_stream = None
self._close_stream(stream)
self._ad_position = 0
def play_ident(self, file_path: str):
"""Load and play an ident file once (no loop) in stereo on ident_channel/ident_channel+1"""
import librosa
path = Path(file_path)
if not path.exists():
print(f"Ident file not found: {file_path}")
return
self.stop_ident()
self.stop_ad()
try:
audio, sr = librosa.load(str(path), sr=self.output_sample_rate, mono=False)
if audio.ndim == 1:
# Mono file — duplicate to stereo
audio = np.stack([audio, audio])
audio = audio.astype(np.float32) # shape: (2, samples)
self._ident_data = audio
except Exception as e:
print(f"Failed to load ident: {e}")
return
self._ident_playing = True
self._ident_position = 0
_write_reaper_state("ident")
print(f"Ident loaded: shape={self._ident_data.shape}, max={np.max(np.abs(self._ident_data)):.4f}")
if self.output_device is None:
num_channels = 2
device = None
device_sr = self.output_sample_rate
ch_l = 0
ch_r = 1
else:
device_info = sd.query_devices(self.output_device)
num_channels = device_info['max_output_channels']
device_sr = int(device_info['default_samplerate'])
device = self.output_device
ch_l = min(self.ident_channel, num_channels) - 1
ch_r = min(self.ident_channel + 1, num_channels) - 1
if self.output_sample_rate != device_sr:
self._ident_resampled = np.stack([
librosa.resample(self._ident_data[0], orig_sr=self.output_sample_rate, target_sr=device_sr),
librosa.resample(self._ident_data[1], orig_sr=self.output_sample_rate, target_sr=device_sr),
]).astype(np.float32)
else:
self._ident_resampled = self._ident_data
_cb_count = [0]
def callback(outdata, frames, time_info, status):
outdata[:] = 0
if not self._ident_playing or self._ident_resampled is None:
if _cb_count[0] == 0:
print(f"Ident callback: not playing (playing={self._ident_playing}, data={'yes' if self._ident_resampled is not None else 'no'})")
return
n_samples = self._ident_resampled.shape[1]
remaining = n_samples - self._ident_position
if remaining >= frames:
chunk_l = self._ident_resampled[0, self._ident_position:self._ident_position + frames]
chunk_r = self._ident_resampled[1, self._ident_position:self._ident_position + frames]
outdata[:, ch_l] = chunk_l
outdata[:, ch_r] = chunk_r
_cb_count[0] += 1
if _cb_count[0] == 1:
print(f"Ident callback delivering audio: ch_l={ch_l}, ch_r={ch_r}, max={max(np.max(np.abs(chunk_l)), np.max(np.abs(chunk_r))):.4f}")
if self.stem_recorder:
mono_mix = (chunk_l + chunk_r) * 0.5
self.stem_recorder.write_sporadic("idents", mono_mix.copy(), device_sr)
self._ident_position += frames
else:
if remaining > 0:
outdata[:remaining, ch_l] = self._ident_resampled[0, self._ident_position:]
outdata[:remaining, ch_r] = self._ident_resampled[1, self._ident_position:]
self._ident_playing = False
_write_reaper_state("dialog")
try:
self._ident_stream = sd.OutputStream(
device=device,
channels=num_channels,
samplerate=device_sr,
dtype=np.float32,
callback=callback,
blocksize=2048
)
self._ident_stream.start()
print(f"Ident playback started on ch {ch_l+1}/{ch_r+1} (idx {ch_l}/{ch_r}) of {num_channels} channels @ {device_sr}Hz, device={device}")
except Exception as e:
print(f"Ident playback error: {e}")
self._ident_playing = False
def stop_ident(self):
"""Stop ident playback"""
was_playing = self._ident_playing
self._ident_playing = False
if was_playing:
_write_reaper_state("dialog")
if self._ident_stream:
stream = self._ident_stream
self._ident_stream = None
self._close_stream(stream)
self._ident_position = 0
def set_music_volume(self, volume: float):
"""Set music volume (0.0 to 1.0)"""
self._music_volume = max(0.0, min(1.0, volume))
def is_music_playing(self) -> bool:
"""Check if music is currently playing"""
return self._music_playing
# --- SFX Playback ---
def play_sfx(self, file_path: str):
"""Play a sound effect to specific channel using dedicated stream"""
path = Path(file_path)
if not path.exists():
print(f"SFX file not found: {file_path}")
return
try:
import librosa
if self.output_device is None:
audio, sr = librosa.load(str(path), sr=None, mono=True)
audio = audio.astype(np.float32)
audio = self._apply_fade(audio, sr)
def play():
# Use a dedicated stream instead of sd.play()
with sd.OutputStream(samplerate=sr, channels=1, dtype=np.float32) as stream:
stream.write(audio.reshape(-1, 1))
else:
device_info = sd.query_devices(self.output_device)
num_channels = device_info['max_output_channels']
device_sr = int(device_info['default_samplerate'])
channel_idx = min(self.sfx_channel, num_channels) - 1
audio, _ = librosa.load(str(path), sr=device_sr, mono=True)
audio = audio.astype(np.float32)
audio = self._apply_fade(audio, device_sr)
# Stem recording: sfx
if self.stem_recorder:
self.stem_recorder.write_sporadic("sfx", audio.copy(), device_sr)
multi_ch = np.zeros((len(audio), num_channels), dtype=np.float32)
multi_ch[:, channel_idx] = audio
def play():
# Use dedicated stream to avoid interrupting other audio
with sd.OutputStream(
device=self.output_device,
samplerate=device_sr,
channels=num_channels,
dtype=np.float32
) as stream:
stream.write(multi_ch)
threading.Thread(target=play, daemon=True).start()
print(f"Playing SFX: {path.name} on ch {self.sfx_channel}")
except Exception as e:
print(f"SFX playback error: {e}")
# --- Stem Mic Capture ---
def start_stem_mic(self):
"""Start a persistent mic capture stream for stem recording.
Skips if _host_stream is already active (it writes to the host stem too)."""
if self._stem_mic_stream is not None:
return
if self._host_stream is not None:
print("[StemRecorder] Host stream already capturing mic, skipping stem_mic")
return
if self.input_device is None:
print("[StemRecorder] No input device configured, skipping host mic capture")
return
device_info = sd.query_devices(self.input_device)
max_channels = device_info['max_input_channels']
device_sr = int(device_info['default_samplerate'])
record_channel = min(self.input_channel, max_channels) - 1
self._start_monitor(device_sr)
def callback(indata, frames, time_info, status):
if self.stem_recorder:
self.stem_recorder.write("host", indata[:, record_channel].copy(), device_sr)
if self._monitor_write:
self._monitor_write(indata[:, record_channel].copy())
self._stem_mic_stream = sd.InputStream(
device=self.input_device,
channels=max_channels,
samplerate=device_sr,
dtype=np.float32,
blocksize=1024,
callback=callback,
)
self._stem_mic_stream.start()
print(f"[StemRecorder] Host mic capture started (device {self.input_device} ch {self.input_channel} @ {device_sr}Hz)")
def stop_stem_mic(self):
"""Stop the persistent stem mic capture."""
if self._stem_mic_stream:
stream = self._stem_mic_stream
self._stem_mic_stream = None
self._close_stream(stream)
print("[StemRecorder] Host mic capture stopped")
self._stop_monitor()
# Global instance
audio_service = AudioService()