Add post-production pipeline: stem recorder, postprod script, recording UI
New stem recording system captures 5 time-aligned WAV files (host, caller, music, sfx, ads) during live shows. Standalone postprod.py processes stems into broadcast-ready MP3 with gap removal, voice compression, music ducking, and EBU R128 loudness normalization. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -78,6 +78,9 @@ class AudioService:
|
||||
self.input_sample_rate = 16000 # For Whisper
|
||||
self.output_sample_rate = 24000 # For TTS
|
||||
|
||||
# Stem recording (opt-in, attached via API)
|
||||
self.stem_recorder = None
|
||||
|
||||
# Load saved settings
|
||||
self._load_settings()
|
||||
|
||||
@@ -355,6 +358,10 @@ class AudioService:
|
||||
# Apply fade to prevent clicks
|
||||
audio = self._apply_fade(audio, device_sr)
|
||||
|
||||
# Stem recording: caller TTS
|
||||
if self.stem_recorder:
|
||||
self.stem_recorder.write("caller", audio.copy(), device_sr)
|
||||
|
||||
# Create multi-channel output with audio only on target channel
|
||||
multi_ch = np.zeros((len(audio), num_channels), dtype=np.float32)
|
||||
multi_ch[:, channel_idx] = audio
|
||||
@@ -491,6 +498,10 @@ class AudioService:
|
||||
indices = np.clip(indices, 0, len(audio) - 1)
|
||||
audio = audio[indices]
|
||||
|
||||
# Stem recording: live caller
|
||||
if self.stem_recorder:
|
||||
self.stem_recorder.write("caller", audio.copy(), device_sr)
|
||||
|
||||
if self._live_caller_write:
|
||||
self._live_caller_write(audio)
|
||||
|
||||
@@ -524,6 +535,10 @@ class AudioService:
|
||||
if self._recording and self._recorded_audio is not None:
|
||||
self._recorded_audio.append(indata[:, record_channel].copy())
|
||||
|
||||
# Stem recording: host mic
|
||||
if self.stem_recorder:
|
||||
self.stem_recorder.write("host", indata[:, record_channel].copy(), device_sr)
|
||||
|
||||
if not self._host_send_callback:
|
||||
return
|
||||
mono = indata[:, record_channel]
|
||||
@@ -721,7 +736,10 @@ class AudioService:
|
||||
fade_in = np.linspace(start_progress, end_progress, frames, dtype=np.float32)
|
||||
fade_out = 1.0 - fade_in
|
||||
|
||||
outdata[:, channel_idx] = (old_samples * fade_out + new_samples * fade_in) * self._music_volume
|
||||
mono_out = (old_samples * fade_out + new_samples * fade_in) * self._music_volume
|
||||
outdata[:, channel_idx] = mono_out
|
||||
if self.stem_recorder:
|
||||
self.stem_recorder.write("music", mono_out.copy(), device_sr)
|
||||
self._crossfade_progress = end_progress
|
||||
|
||||
if self._crossfade_progress >= 1.0:
|
||||
@@ -729,7 +747,10 @@ class AudioService:
|
||||
self._crossfade_old_data = None
|
||||
print("Crossfade complete")
|
||||
else:
|
||||
outdata[:, channel_idx] = new_samples * self._music_volume
|
||||
mono_out = new_samples * self._music_volume
|
||||
outdata[:, channel_idx] = mono_out
|
||||
if self.stem_recorder:
|
||||
self.stem_recorder.write("music", mono_out.copy(), device_sr)
|
||||
|
||||
try:
|
||||
self._music_stream = sd.OutputStream(
|
||||
@@ -836,7 +857,10 @@ class AudioService:
|
||||
|
||||
remaining = len(self._ad_resampled) - self._ad_position
|
||||
if remaining >= frames:
|
||||
outdata[:, channel_idx] = self._ad_resampled[self._ad_position:self._ad_position + frames]
|
||||
chunk = self._ad_resampled[self._ad_position:self._ad_position + frames]
|
||||
outdata[:, channel_idx] = chunk
|
||||
if self.stem_recorder:
|
||||
self.stem_recorder.write("ads", chunk.copy(), device_sr)
|
||||
self._ad_position += frames
|
||||
else:
|
||||
if remaining > 0:
|
||||
@@ -904,6 +928,10 @@ class AudioService:
|
||||
audio, _ = librosa.load(str(path), sr=device_sr, mono=True)
|
||||
audio = self._apply_fade(audio, device_sr)
|
||||
|
||||
# Stem recording: sfx
|
||||
if self.stem_recorder:
|
||||
self.stem_recorder.write("sfx", audio.copy(), device_sr)
|
||||
|
||||
multi_ch = np.zeros((len(audio), num_channels), dtype=np.float32)
|
||||
multi_ch[:, channel_idx] = audio
|
||||
|
||||
|
||||
@@ -7,21 +7,28 @@ from ..config import settings
|
||||
|
||||
# Available OpenRouter models
|
||||
OPENROUTER_MODELS = [
|
||||
# Best for natural dialog (ranked)
|
||||
"minimax/minimax-m2-her",
|
||||
"mistralai/mistral-small-creative",
|
||||
"x-ai/grok-4-fast",
|
||||
"deepseek/deepseek-v3.2",
|
||||
# Updated standard models
|
||||
"anthropic/claude-haiku-4.5",
|
||||
"anthropic/claude-sonnet-4-5",
|
||||
"google/gemini-2.5-flash",
|
||||
"openai/gpt-4o-mini",
|
||||
"openai/gpt-4o",
|
||||
# Legacy
|
||||
"anthropic/claude-3-haiku",
|
||||
"anthropic/claude-3.5-sonnet",
|
||||
"google/gemini-flash-1.5",
|
||||
"google/gemini-pro-1.5",
|
||||
"meta-llama/llama-3.1-8b-instruct",
|
||||
"mistralai/mistral-7b-instruct",
|
||||
]
|
||||
|
||||
# Fast models to try as fallbacks (cheap, fast, good enough for conversation)
|
||||
FALLBACK_MODELS = [
|
||||
"google/gemini-flash-1.5",
|
||||
"mistralai/mistral-small-creative",
|
||||
"google/gemini-2.5-flash",
|
||||
"openai/gpt-4o-mini",
|
||||
"meta-llama/llama-3.1-8b-instruct",
|
||||
]
|
||||
|
||||
|
||||
@@ -103,21 +110,22 @@ class LLMService:
|
||||
async def generate(
|
||||
self,
|
||||
messages: list[dict],
|
||||
system_prompt: Optional[str] = None
|
||||
system_prompt: Optional[str] = None,
|
||||
max_tokens: Optional[int] = None
|
||||
) -> str:
|
||||
if system_prompt:
|
||||
messages = [{"role": "system", "content": system_prompt}] + messages
|
||||
|
||||
if self.provider == "openrouter":
|
||||
return await self._call_openrouter_with_fallback(messages)
|
||||
return await self._call_openrouter_with_fallback(messages, max_tokens=max_tokens)
|
||||
else:
|
||||
return await self._call_ollama(messages)
|
||||
return await self._call_ollama(messages, max_tokens=max_tokens)
|
||||
|
||||
async def _call_openrouter_with_fallback(self, messages: list[dict]) -> str:
|
||||
async def _call_openrouter_with_fallback(self, messages: list[dict], max_tokens: Optional[int] = None) -> str:
|
||||
"""Try primary model, then fallback models. Always returns a response."""
|
||||
|
||||
# Try primary model first
|
||||
result = await self._call_openrouter_once(messages, self.openrouter_model)
|
||||
result = await self._call_openrouter_once(messages, self.openrouter_model, max_tokens=max_tokens)
|
||||
if result is not None:
|
||||
return result
|
||||
|
||||
@@ -126,7 +134,7 @@ class LLMService:
|
||||
if model == self.openrouter_model:
|
||||
continue # Already tried
|
||||
print(f"[LLM] Falling back to {model}...")
|
||||
result = await self._call_openrouter_once(messages, model, timeout=10.0)
|
||||
result = await self._call_openrouter_once(messages, model, timeout=10.0, max_tokens=max_tokens)
|
||||
if result is not None:
|
||||
return result
|
||||
|
||||
@@ -134,7 +142,7 @@ class LLMService:
|
||||
print("[LLM] All models failed, using canned response")
|
||||
return "Sorry, I totally blanked out for a second. What were you saying?"
|
||||
|
||||
async def _call_openrouter_once(self, messages: list[dict], model: str, timeout: float = 15.0) -> str | None:
|
||||
async def _call_openrouter_once(self, messages: list[dict], model: str, timeout: float = 15.0, max_tokens: Optional[int] = None) -> str | None:
|
||||
"""Single attempt to call OpenRouter. Returns None on failure (not a fallback string)."""
|
||||
try:
|
||||
response = await self.client.post(
|
||||
@@ -146,7 +154,11 @@ class LLMService:
|
||||
json={
|
||||
"model": model,
|
||||
"messages": messages,
|
||||
"max_tokens": 150,
|
||||
"max_tokens": max_tokens or 150,
|
||||
"temperature": 0.8,
|
||||
"top_p": 0.92,
|
||||
"frequency_penalty": 0.5,
|
||||
"presence_penalty": 0.3,
|
||||
},
|
||||
timeout=timeout,
|
||||
)
|
||||
@@ -164,7 +176,7 @@ class LLMService:
|
||||
print(f"[LLM] {model} error: {e}")
|
||||
return None
|
||||
|
||||
async def _call_ollama(self, messages: list[dict]) -> str:
|
||||
async def _call_ollama(self, messages: list[dict], max_tokens: Optional[int] = None) -> str:
|
||||
"""Call Ollama API"""
|
||||
try:
|
||||
async with httpx.AsyncClient() as client:
|
||||
@@ -175,7 +187,7 @@ class LLMService:
|
||||
"messages": messages,
|
||||
"stream": False,
|
||||
"options": {
|
||||
"num_predict": 100,
|
||||
"num_predict": max_tokens or 100,
|
||||
"temperature": 0.8,
|
||||
"top_p": 0.9,
|
||||
"repeat_penalty": 1.3,
|
||||
|
||||
86
backend/services/stem_recorder.py
Normal file
86
backend/services/stem_recorder.py
Normal file
@@ -0,0 +1,86 @@
|
||||
"""Records separate audio stems during a live show for post-production"""
|
||||
|
||||
import threading
import time
from pathlib import Path

import numpy as np
import soundfile as sf
from scipy import signal as scipy_signal
|
||||
|
||||
# Stem names double as the output WAV filenames (<name>.wav in output_dir).
STEM_NAMES = ["host", "caller", "music", "sfx", "ads"]
|
||||
|
||||
|
||||
class StemRecorder:
    """Writes five time-aligned mono WAV stems (host, caller, music, sfx, ads).

    Each stem is kept aligned to wall-clock time: before every write, silence
    is inserted to bring that stem up to the position implied by the elapsed
    time since start(), so intermittent sources (sfx, ads) stay in sync with
    continuous ones (host mic).

    write() is called from audio-stream callbacks — presumably on several
    threads at once — while start()/stop() come from a control thread, so a
    lock serializes access to the open files; a write() racing stop() would
    otherwise dereference a closed or already-removed file handle and raise
    inside a real-time callback.
    """

    def __init__(self, output_dir: str | Path, sample_rate: int = 48000):
        """Create a recorder that writes stems under *output_dir* at *sample_rate* Hz."""
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)
        self.sample_rate = sample_rate
        self._files: dict[str, sf.SoundFile] = {}    # open WAV handle per stem
        self._write_positions: dict[str, int] = {}   # samples written per stem
        self._start_time: float = 0.0                # wall clock at start()
        self._running = False
        self._lock = threading.Lock()                # guards _files / _write_positions / _running

    def start(self):
        """Open one float32 mono WAV per stem and start the shared clock."""
        with self._lock:
            self._start_time = time.time()
            self._running = True
            for name in STEM_NAMES:
                path = self.output_dir / f"{name}.wav"
                self._files[name] = sf.SoundFile(
                    str(path), mode="w",
                    samplerate=self.sample_rate,
                    channels=1, subtype="FLOAT",
                )
                self._write_positions[name] = 0
        print(f"[StemRecorder] Recording started -> {self.output_dir}")

    def write(self, stem_name: str, audio_data: np.ndarray, source_sr: int):
        """Append mono samples to a stem, resampling and gap-filling as needed.

        Silently ignores unknown stem names and calls made while stopped so
        that audio callbacks never raise. *audio_data* is a 1-D float array
        at *source_sr* Hz.
        """
        # Cheap unlocked fast path for the common reject cases.
        if not self._running or stem_name not in STEM_NAMES:
            return

        # Resample outside the lock: it is the expensive part and touches no
        # shared state.
        if source_sr != self.sample_rate:
            num_samples = int(len(audio_data) * self.sample_rate / source_sr)
            if num_samples <= 0:
                return
            audio_data = scipy_signal.resample(audio_data, num_samples).astype(np.float32)

        with self._lock:
            # Re-check under the lock: stop() may have closed the files
            # between the fast path above and this point.
            if not self._running or stem_name not in self._files:
                return

            # Fill silence gap based on elapsed time so intermittent stems
            # stay time-aligned with continuous ones.
            elapsed = time.time() - self._start_time
            expected_pos = int(elapsed * self.sample_rate)
            current_pos = self._write_positions[stem_name]
            if expected_pos > current_pos:
                gap = expected_pos - current_pos
                self._files[stem_name].write(np.zeros(gap, dtype=np.float32))
                self._write_positions[stem_name] = expected_pos

            self._files[stem_name].write(audio_data.astype(np.float32))
            self._write_positions[stem_name] += len(audio_data)

    def stop(self) -> dict[str, str]:
        """Pad all stems to equal length, close the files, and return their paths.

        Returns {} if the recorder was not running. Holding the lock for the
        whole teardown guarantees no concurrent write() touches a file while
        it is padded, closed, and removed.
        """
        with self._lock:
            if not self._running:
                return {}
            self._running = False

            # Pad all stems to the same length so post-production can mix
            # them sample-for-sample.
            max_pos = max(self._write_positions.values()) if self._write_positions else 0
            for name in STEM_NAMES:
                pos = self._write_positions[name]
                if pos < max_pos:
                    self._files[name].write(np.zeros(max_pos - pos, dtype=np.float32))

            # Close all files and collect their paths.
            paths = {}
            for name in STEM_NAMES:
                self._files[name].close()
                paths[name] = str(self.output_dir / f"{name}.wav")

            self._files.clear()
            self._write_positions.clear()

        print(f"[StemRecorder] Recording stopped. {max_pos} samples ({max_pos/self.sample_rate:.1f}s)")
        return paths
|
||||
@@ -598,7 +598,7 @@ async def generate_speech_inworld(text: str, voice_id: str) -> tuple[np.ndarray,
|
||||
payload = {
|
||||
"text": text,
|
||||
"voice_id": voice,
|
||||
"model_id": "inworld-tts-1.5-mini",
|
||||
"model_id": "inworld-tts-1.5-max",
|
||||
"audio_config": {
|
||||
"encoding": "LINEAR16",
|
||||
"sample_rate_hertz": 48000,
|
||||
|
||||
Reference in New Issue
Block a user