Add post-production pipeline: stem recorder, postprod script, recording UI

New stem recording system captures 5 time-aligned WAV files (host, caller,
music, sfx, ads) during live shows. Standalone postprod.py processes stems
into broadcast-ready MP3 with gap removal, voice compression, music ducking,
and EBU R128 loudness normalization.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-08 17:53:32 -07:00
parent 356bf145b8
commit 7d88c76f90
12 changed files with 1528 additions and 363 deletions

File diff suppressed because it is too large Load Diff

View File

@@ -78,6 +78,9 @@ class AudioService:
self.input_sample_rate = 16000 # For Whisper
self.output_sample_rate = 24000 # For TTS
# Stem recording (opt-in, attached via API)
self.stem_recorder = None
# Load saved settings
self._load_settings()
@@ -355,6 +358,10 @@ class AudioService:
# Apply fade to prevent clicks
audio = self._apply_fade(audio, device_sr)
# Stem recording: caller TTS
if self.stem_recorder:
self.stem_recorder.write("caller", audio.copy(), device_sr)
# Create multi-channel output with audio only on target channel
multi_ch = np.zeros((len(audio), num_channels), dtype=np.float32)
multi_ch[:, channel_idx] = audio
@@ -491,6 +498,10 @@ class AudioService:
indices = np.clip(indices, 0, len(audio) - 1)
audio = audio[indices]
# Stem recording: live caller
if self.stem_recorder:
self.stem_recorder.write("caller", audio.copy(), device_sr)
if self._live_caller_write:
self._live_caller_write(audio)
@@ -524,6 +535,10 @@ class AudioService:
if self._recording and self._recorded_audio is not None:
self._recorded_audio.append(indata[:, record_channel].copy())
# Stem recording: host mic
if self.stem_recorder:
self.stem_recorder.write("host", indata[:, record_channel].copy(), device_sr)
if not self._host_send_callback:
return
mono = indata[:, record_channel]
@@ -721,7 +736,10 @@ class AudioService:
fade_in = np.linspace(start_progress, end_progress, frames, dtype=np.float32)
fade_out = 1.0 - fade_in
outdata[:, channel_idx] = (old_samples * fade_out + new_samples * fade_in) * self._music_volume
mono_out = (old_samples * fade_out + new_samples * fade_in) * self._music_volume
outdata[:, channel_idx] = mono_out
if self.stem_recorder:
self.stem_recorder.write("music", mono_out.copy(), device_sr)
self._crossfade_progress = end_progress
if self._crossfade_progress >= 1.0:
@@ -729,7 +747,10 @@ class AudioService:
self._crossfade_old_data = None
print("Crossfade complete")
else:
outdata[:, channel_idx] = new_samples * self._music_volume
mono_out = new_samples * self._music_volume
outdata[:, channel_idx] = mono_out
if self.stem_recorder:
self.stem_recorder.write("music", mono_out.copy(), device_sr)
try:
self._music_stream = sd.OutputStream(
@@ -836,7 +857,10 @@ class AudioService:
remaining = len(self._ad_resampled) - self._ad_position
if remaining >= frames:
outdata[:, channel_idx] = self._ad_resampled[self._ad_position:self._ad_position + frames]
chunk = self._ad_resampled[self._ad_position:self._ad_position + frames]
outdata[:, channel_idx] = chunk
if self.stem_recorder:
self.stem_recorder.write("ads", chunk.copy(), device_sr)
self._ad_position += frames
else:
if remaining > 0:
@@ -904,6 +928,10 @@ class AudioService:
audio, _ = librosa.load(str(path), sr=device_sr, mono=True)
audio = self._apply_fade(audio, device_sr)
# Stem recording: sfx
if self.stem_recorder:
self.stem_recorder.write("sfx", audio.copy(), device_sr)
multi_ch = np.zeros((len(audio), num_channels), dtype=np.float32)
multi_ch[:, channel_idx] = audio

View File

@@ -7,21 +7,28 @@ from ..config import settings
# Available OpenRouter models
OPENROUTER_MODELS = [
# Best for natural dialog (ranked)
"minimax/minimax-m2-her",
"mistralai/mistral-small-creative",
"x-ai/grok-4-fast",
"deepseek/deepseek-v3.2",
# Updated standard models
"anthropic/claude-haiku-4.5",
"anthropic/claude-sonnet-4-5",
"google/gemini-2.5-flash",
"openai/gpt-4o-mini",
"openai/gpt-4o",
# Legacy
"anthropic/claude-3-haiku",
"anthropic/claude-3.5-sonnet",
"google/gemini-flash-1.5",
"google/gemini-pro-1.5",
"meta-llama/llama-3.1-8b-instruct",
"mistralai/mistral-7b-instruct",
]
# Fast models to try as fallbacks (cheap, fast, good enough for conversation)
FALLBACK_MODELS = [
"google/gemini-flash-1.5",
"mistralai/mistral-small-creative",
"google/gemini-2.5-flash",
"openai/gpt-4o-mini",
"meta-llama/llama-3.1-8b-instruct",
]
@@ -103,21 +110,22 @@ class LLMService:
async def generate(
self,
messages: list[dict],
system_prompt: Optional[str] = None
system_prompt: Optional[str] = None,
max_tokens: Optional[int] = None
) -> str:
if system_prompt:
messages = [{"role": "system", "content": system_prompt}] + messages
if self.provider == "openrouter":
return await self._call_openrouter_with_fallback(messages)
return await self._call_openrouter_with_fallback(messages, max_tokens=max_tokens)
else:
return await self._call_ollama(messages)
return await self._call_ollama(messages, max_tokens=max_tokens)
async def _call_openrouter_with_fallback(self, messages: list[dict]) -> str:
async def _call_openrouter_with_fallback(self, messages: list[dict], max_tokens: Optional[int] = None) -> str:
"""Try primary model, then fallback models. Always returns a response."""
# Try primary model first
result = await self._call_openrouter_once(messages, self.openrouter_model)
result = await self._call_openrouter_once(messages, self.openrouter_model, max_tokens=max_tokens)
if result is not None:
return result
@@ -126,7 +134,7 @@ class LLMService:
if model == self.openrouter_model:
continue # Already tried
print(f"[LLM] Falling back to {model}...")
result = await self._call_openrouter_once(messages, model, timeout=10.0)
result = await self._call_openrouter_once(messages, model, timeout=10.0, max_tokens=max_tokens)
if result is not None:
return result
@@ -134,7 +142,7 @@ class LLMService:
print("[LLM] All models failed, using canned response")
return "Sorry, I totally blanked out for a second. What were you saying?"
async def _call_openrouter_once(self, messages: list[dict], model: str, timeout: float = 15.0) -> str | None:
async def _call_openrouter_once(self, messages: list[dict], model: str, timeout: float = 15.0, max_tokens: Optional[int] = None) -> str | None:
"""Single attempt to call OpenRouter. Returns None on failure (not a fallback string)."""
try:
response = await self.client.post(
@@ -146,7 +154,11 @@ class LLMService:
json={
"model": model,
"messages": messages,
"max_tokens": 150,
"max_tokens": max_tokens or 150,
"temperature": 0.8,
"top_p": 0.92,
"frequency_penalty": 0.5,
"presence_penalty": 0.3,
},
timeout=timeout,
)
@@ -164,7 +176,7 @@ class LLMService:
print(f"[LLM] {model} error: {e}")
return None
async def _call_ollama(self, messages: list[dict]) -> str:
async def _call_ollama(self, messages: list[dict], max_tokens: Optional[int] = None) -> str:
"""Call Ollama API"""
try:
async with httpx.AsyncClient() as client:
@@ -175,7 +187,7 @@ class LLMService:
"messages": messages,
"stream": False,
"options": {
"num_predict": 100,
"num_predict": max_tokens or 100,
"temperature": 0.8,
"top_p": 0.9,
"repeat_penalty": 1.3,

View File

@@ -0,0 +1,86 @@
"""Records separate audio stems during a live show for post-production"""
import threading
import time
from pathlib import Path

import numpy as np
import soundfile as sf
from scipy import signal as scipy_signal
STEM_NAMES = ["host", "caller", "music", "sfx", "ads"]
class StemRecorder:
    """Records five time-aligned mono WAV stems (host, caller, music, sfx, ads)
    during a live show so they can be remixed in post-production.

    Each stem is kept aligned to wall-clock time: before every write, the gap
    between the stem's current write position and the elapsed time since
    start() is filled with silence, so all stems share a common timeline.

    write() is called from audio-callback threads while start()/stop() run on
    a control thread, so all file access is serialized with a lock; otherwise
    stop() could close a SoundFile between write()'s running-check and the
    actual file write, crashing the audio callback.
    """

    def __init__(self, output_dir: str | Path, sample_rate: int = 48000):
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)
        self.sample_rate = sample_rate
        # One open SoundFile per stem name while recording.
        self._files: dict[str, sf.SoundFile] = {}
        # Samples written so far per stem (at self.sample_rate).
        self._write_positions: dict[str, int] = {}
        self._start_time: float = 0.0
        self._running = False
        # Serializes file access between audio callbacks and start()/stop().
        self._lock = threading.Lock()

    def start(self):
        """Open one WAV file per stem and begin the shared timeline.

        Calling start() while already recording is a no-op — restarting would
        leak the open file handles and reset write positions mid-show.
        """
        with self._lock:
            if self._running:
                return
            self._start_time = time.time()
            for name in STEM_NAMES:
                path = self.output_dir / f"{name}.wav"
                f = sf.SoundFile(
                    str(path), mode="w",
                    samplerate=self.sample_rate,
                    channels=1, subtype="FLOAT",
                )
                self._files[name] = f
                self._write_positions[name] = 0
            self._running = True
        print(f"[StemRecorder] Recording started -> {self.output_dir}")

    def write(self, stem_name: str, audio_data: np.ndarray, source_sr: int):
        """Append mono audio to a stem, resampling and gap-filling as needed.

        Silently ignores writes when not recording or for unknown stem names,
        so audio callbacks can call this unconditionally.
        """
        # Cheap early-out without the lock; re-checked under the lock below.
        if not self._running or stem_name not in self._files:
            return
        # Resample to target rate if needed (outside the lock — CPU-heavy).
        if source_sr != self.sample_rate:
            num_samples = int(len(audio_data) * self.sample_rate / source_sr)
            if num_samples > 0:
                audio_data = scipy_signal.resample(audio_data, num_samples).astype(np.float32)
            else:
                return
        with self._lock:
            if not self._running or stem_name not in self._files:
                return
            # Fill silence gap based on elapsed time so stems stay aligned.
            elapsed = time.time() - self._start_time
            expected_pos = int(elapsed * self.sample_rate)
            current_pos = self._write_positions[stem_name]
            if expected_pos > current_pos:
                gap = expected_pos - current_pos
                silence = np.zeros(gap, dtype=np.float32)
                self._files[stem_name].write(silence)
                self._write_positions[stem_name] = expected_pos
            self._files[stem_name].write(audio_data.astype(np.float32))
            self._write_positions[stem_name] += len(audio_data)

    def stop(self) -> dict[str, str]:
        """Stop recording, pad all stems to equal length, and close the files.

        Returns a mapping of stem name -> WAV path, or {} if not recording.
        """
        with self._lock:
            if not self._running:
                return {}
            self._running = False
            # Pad all stems to the same length
            max_pos = max(self._write_positions.values()) if self._write_positions else 0
            for name in STEM_NAMES:
                pos = self._write_positions[name]
                if pos < max_pos:
                    silence = np.zeros(max_pos - pos, dtype=np.float32)
                    self._files[name].write(silence)
            # Close all files
            paths = {}
            for name in STEM_NAMES:
                self._files[name].close()
                paths[name] = str(self.output_dir / f"{name}.wav")
            self._files.clear()
            self._write_positions.clear()
        print(f"[StemRecorder] Recording stopped. {max_pos} samples ({max_pos/self.sample_rate:.1f}s)")
        return paths

View File

@@ -598,7 +598,7 @@ async def generate_speech_inworld(text: str, voice_id: str) -> tuple[np.ndarray,
payload = {
"text": text,
"voice_id": voice,
"model_id": "inworld-tts-1.5-mini",
"model_id": "inworld-tts-1.5-max",
"audio_config": {
"encoding": "LINEAR16",
"sample_rate_hertz": 48000,

View File

@@ -1 +1,143 @@
{"regulars": []}
{
"regulars": [
{
"id": "be244306",
"name": "Dale",
"gender": "male",
"age": 44,
"job": "runs a food truck",
"location": "unknown",
"personality_traits": [],
"call_history": [
{
"summary": "Briefly explain the universe's expansion to a child who wants to know what happens when it stops expanding. Please don't suggest unusual topics; keep the explanation simple.",
"timestamp": 1770515097.24686
},
{
"summary": "Dale updates the host on explaining the universe's expansion to his buddy's kid, who now worries if it could \"pop,\" but shifts to his temptation to bet on Super Bowl 60 predictions after reading an article, critiquing a prior caller's gambling mindset while reflecting emotionally on his brother Eddie's fruitless horse-betting habit and his own exhaustion from long taco truck shifts in the cold desert. He ultimately considers a small, affordable wager on the Chiefs as a low-stakes thrill.",
"timestamp": 1770522741.049846
}
],
"last_call": 1770522741.049846,
"created_at": 1770515097.24686
},
{
"id": "584767e8",
"name": "Carl",
"gender": "male",
"age": 36,
"job": "is a firefighter",
"location": "unknown",
"personality_traits": [],
"call_history": [
{
"summary": "Carl, a firefighter from Lordsburg, New Mexico, called to confess his 20-year gambling addiction, which began with casual poker games at the station and escalated to frequent casino visits and online sessions, draining his finances and leaving him with overdue bills and the fear of losing his home. Emotionally raw, he admitted the habit's destructive hold\u2014like an unquenchable fire\u2014and his pride in avoiding help, but agreed to consider support groups and an 800 hotline after the host suggested productive alternatives like gym workouts or extra volunteer shifts.",
"timestamp": 1770522170.1887732
},
{
"summary": "Here is a 1-2 sentence summary of the radio call:\n\nThe caller, Carl, discusses his progress in overcoming his gambling addiction, including rewatching The Sopranos, but the host, Luke, disagrees with Carl's high opinion of the show's ending, leading to a back-and-forth debate between the two about the merits and predictability of the Sopranos finale.",
"timestamp": 1770573289.82847
}
],
"last_call": 1770573289.828471,
"created_at": 1770522170.1887732
},
{
"id": "d97cb6f9",
"name": "Carla",
"gender": "female",
"age": 26,
"job": "is a vet tech",
"location": "unknown",
"personality_traits": [],
"call_history": [
{
"summary": "Carla, separated from her husband but not yet divorced, vented about her intrusive in-laws who relentlessly call and dictate her life\u2014from finances and household matters to her clothing choices\u2014while her spineless spouse relays their demands, making her feel trapped in a one-sided war. With her own parents unavailable (father deceased, mother distant), she leans on her bickering but honest sister for support, underscoring her deep frustration and sense of isolation.",
"timestamp": 1770522530.8554251
},
{
"summary": "Carla dismissed celebrity science theories like Terrence Howard's after watching Neil deGrasse Tyson's critique, then marveled at JWST's exoplanet discoveries before sharing her relief at finally cutting off her toxic in-laws amid her ongoing divorce. She expressed deep heartbreak over actor James Ransone's suicide at 46, reflecting on life's fragility, her late father's death, and the need to eliminate family drama, leaving her contemplative and planning a solo desert drive for clarity.",
"timestamp": 1770526316.004708
}
],
"last_call": 1770526316.004709,
"created_at": 1770522530.855426
},
{
"id": "5ccaea00",
"name": "Jerome",
"gender": "male",
"age": 52,
"job": "works at a cemetery",
"location": "unknown",
"personality_traits": [],
"call_history": [
{
"summary": "Jerome called in to discuss Neil deGrasse Tyson's dismissal of Terrence Howard's unconventional scientific theories, agreeing they don't hold up to real science, before opening up about his emotional turmoil over an unanswered text from his ex, Laura, following a recent blowout that left him questioning his life choices while drinking mezcal in his truck late at night. He reflected on their breakup due to his workaholic tendencies at the cemetery and her desire for more, but found hope in his child's insightful comment about the stars from the Silo books, suggesting they might both be better off apart.",
"timestamp": 1770522903.5809002
},
{
"summary": "Here is a 1-2 sentence summary of the call:\n\nThe caller, Jerome, recounts a humorous customer service interaction where a woman came to the cemetery he works at late at night frantically trying to find her husband's plot, leading to an amusing back-and-forth.",
"timestamp": 1770523944.299309
}
],
"last_call": 1770523944.29931,
"created_at": 1770522903.5809002
},
{
"id": "49147bd5",
"name": "Keith",
"gender": "male",
"age": 61,
"job": "south of Silver City",
"location": "in unknown",
"personality_traits": [],
"call_history": [
{
"summary": "The caller, Luke, kicked off by sharing a humorous clip of Terrence Howard's Tree of Life Theory being critiqued by Neil deGrasse Tyson, which left Howard visibly hurt, before pivoting to economic woes, blaming overspending and Federal Reserve money printing for devaluing the currency and harming everyday people. He advocated abolishing the Fed, echoing Ron Paul's ideas, to let markets stabilize money, potentially boosting innovation and new industries in rural spots like Silver City despite uncertain local impacts.",
"timestamp": 1770524506.3390348
},
{
"summary": "Here is a 1-2 sentence summary of the call:\n\nThe caller, who works at a bank, has been reflecting on his tendency to blame the government and economic system for his problems, rather than taking responsibility for his own role. He had an epiphany while eating leftover enchiladas in his minivan, realizing he needs to be more proactive instead of just complaining.",
"timestamp": 1770574890.1296651
}
],
"last_call": 1770574890.1296651,
"created_at": 1770524506.339036
},
{
"id": "4f4612c7",
"name": "Dale",
"gender": "male",
"age": 38,
"job": "is a cop, 12 years on the force",
"location": "unknown",
"personality_traits": [],
"call_history": [
{
"summary": "Dale from Globe called in to express skepticism about Terrence Howard's Tree of Life theory, arguing it lacks peer-reviewed experiments and scientific consensus, much like how he trusts quantum entanglement based on reliable sources without reading every paper himself. The conversation shifted to an emotional discussion of his grief over Uncle Hector, the man who raised him like a father but changed after a stroke, leaving Dale feeling a profound loss without closure, though he found solace in the host's validation and hope for lucid moments ahead.",
"timestamp": 1770526114.530777
}
],
"last_call": 1770526114.5307782,
"created_at": 1770526114.5307782
},
{
"id": "60053b38",
"name": "Lorraine",
"gender": "female",
"age": 42,
"job": "New Mexico",
"location": "in unknown",
"personality_traits": [],
"call_history": [
{
"summary": "Here is a 1-2 sentence summary of the call:\n\nThe caller has an outstanding warrant for a DUI charge from a few years ago that they have been avoiding dealing with, which has been causing them a lot of stress and guilt. The host encourages the caller to take responsibility and go to the sheriff's office to get the warrant cleared up, as driving drunk is extremely dangerous and unacceptable.",
"timestamp": 1770573956.570584
}
],
"last_call": 1770573956.570584,
"created_at": 1770573956.570584
}
]
}

View File

@@ -75,6 +75,19 @@ header button {
50% { opacity: 0.7; }
}
.rec-btn {
font-weight: 700;
text-transform: uppercase;
letter-spacing: 0.05em;
background: #555 !important;
transition: background 0.2s;
}
.rec-btn.recording {
background: #cc2222 !important;
animation: on-air-pulse 2s ease-in-out infinite;
}
.new-session-btn {
background: var(--accent) !important;
}
@@ -85,17 +98,29 @@ header button {
font-weight: normal;
}
.caller-background {
details.caller-background {
font-size: 0.85rem;
color: var(--text-muted);
padding: 10px;
background: var(--bg);
border-radius: var(--radius);
margin-bottom: 12px;
line-height: 1.4;
}
.caller-background.hidden {
details.caller-background summary {
cursor: pointer;
padding: 8px 10px;
font-weight: bold;
color: var(--text);
font-size: 0.8rem;
}
details.caller-background > div {
padding: 0 10px 10px;
white-space: pre-wrap;
}
details.caller-background.hidden {
display: none;
}

View File

@@ -12,6 +12,7 @@
<h1>Luke at The Roost</h1>
<div class="header-buttons">
<button id="on-air-btn" class="on-air-btn off">OFF AIR</button>
<button id="rec-btn" class="rec-btn" title="Record stems for post-production">REC</button>
<button id="new-session-btn" class="new-session-btn">New Session</button>
<button id="export-session-btn">Export</button>
<button id="settings-btn">Settings</button>
@@ -49,7 +50,10 @@
</label>
</div>
<div id="call-status" class="call-status">No active call</div>
<div id="caller-background" class="caller-background hidden"></div>
<details id="caller-background-details" class="caller-background hidden">
<summary>Caller Background</summary>
<div id="caller-background"></div>
</details>
<button id="hangup-btn" class="hangup-btn" disabled>Hang Up</button>
</section>

View File

@@ -85,6 +85,31 @@ function initEventListeners() {
});
}
// Stem recording toggle
const recBtn = document.getElementById('rec-btn');
if (recBtn) {
let stemRecording = false;
recBtn.addEventListener('click', async () => {
try {
if (!stemRecording) {
const res = await safeFetch('/api/recording/start', { method: 'POST' });
stemRecording = true;
recBtn.classList.add('recording');
recBtn.textContent = '⏺ REC';
log('Stem recording started: ' + res.dir);
} else {
const res = await safeFetch('/api/recording/stop', { method: 'POST' });
stemRecording = false;
recBtn.classList.remove('recording');
recBtn.textContent = 'REC';
log('Stem recording stopped');
}
} catch (err) {
log('Recording error: ' + err.message);
}
});
}
// Export session
document.getElementById('export-session-btn')?.addEventListener('click', exportSession);
@@ -400,11 +425,12 @@ async function startCall(key, name) {
if (aiInfo) aiInfo.classList.remove('hidden');
if (aiName) aiName.textContent = name;
// Show caller background
// Show caller background in disclosure triangle
const bgDetails = document.getElementById('caller-background-details');
const bgEl = document.getElementById('caller-background');
if (bgEl && data.background) {
if (bgDetails && bgEl && data.background) {
bgEl.textContent = data.background;
bgEl.classList.remove('hidden');
bgDetails.classList.remove('hidden');
}
document.querySelectorAll('.caller-btn').forEach(btn => {
@@ -428,8 +454,8 @@ async function newSession() {
conversationSince = 0;
// Hide caller background
const bgEl = document.getElementById('caller-background');
if (bgEl) bgEl.classList.add('hidden');
const bgDetails = document.getElementById('caller-background-details');
if (bgDetails) bgDetails.classList.add('hidden');
// Reload callers to get new session ID
await loadCallers();
@@ -455,8 +481,8 @@ async function hangup() {
document.querySelectorAll('.caller-btn').forEach(btn => btn.classList.remove('active'));
// Hide caller background
const bgEl = document.getElementById('caller-background');
if (bgEl) bgEl.classList.add('hidden');
const bgDetails2 = document.getElementById('caller-background-details');
if (bgDetails2) bgDetails2.classList.add('hidden');
// Hide AI caller indicator
document.getElementById('ai-caller-info')?.classList.add('hidden');

367
postprod.py Normal file
View File

@@ -0,0 +1,367 @@
#!/usr/bin/env python3
"""Post-production pipeline for AI podcast stems.
Usage: python postprod.py recordings/2026-02-07_213000/ -o episode.mp3
Processes 5 aligned WAV stems (host, caller, music, sfx, ads) into a
broadcast-ready MP3 with gap removal, voice compression, music ducking,
and loudness normalization.
"""
import argparse
import subprocess
import sys
import tempfile
from pathlib import Path
import numpy as np
import soundfile as sf
STEM_NAMES = ["host", "caller", "music", "sfx", "ads"]
def load_stems(stems_dir: Path) -> tuple[dict[str, np.ndarray], int]:
    """Load the five stem WAVs from *stems_dir*, padded to a common length.

    Missing stems are replaced with silence; stereo files are downmixed to
    mono so downstream processing can assume 1-D arrays. Exits the process
    when no stem at all is found. Mismatched sample rates are only warned
    about — the first stem's rate is returned as the reference.

    Returns (stems dict keyed by STEM_NAMES, reference sample rate).
    """
    stems = {}
    sample_rate = None
    for name in STEM_NAMES:
        path = stems_dir / f"{name}.wav"
        if not path.exists():
            print(f" {name}.wav not found, creating empty stem")
            stems[name] = None
            continue
        data, sr = sf.read(str(path), dtype="float32")
        # Downmix multi-channel recordings so every stem is a mono 1-D array;
        # sf.read returns (frames, channels) for anything non-mono.
        if data.ndim > 1:
            data = data.mean(axis=1).astype(np.float32)
        if sample_rate is None:
            sample_rate = sr
        elif sr != sample_rate:
            print(f" WARNING: {name}.wav has sample rate {sr}, expected {sample_rate}")
        stems[name] = data
        print(f" {name}: {len(data)} samples ({len(data)/sr:.1f}s)")
    if sample_rate is None:
        print("ERROR: No valid stems found")
        sys.exit(1)
    # Pad all stems to same length (missing stems become pure silence)
    max_len = max(len(s) for s in stems.values() if s is not None)
    for name in STEM_NAMES:
        if stems[name] is None:
            stems[name] = np.zeros(max_len, dtype=np.float32)
        elif len(stems[name]) < max_len:
            stems[name] = np.pad(stems[name], (0, max_len - len(stems[name])))
    return stems, sample_rate
def compute_rms(audio: np.ndarray, window_samples: int) -> np.ndarray:
    """Return the RMS level of each full *window_samples*-sized window.

    Trailing samples that don't fill a complete window are discarded.
    If the signal is shorter than one window, a single-element zero
    array is returned.
    """
    full_windows, remainder = divmod(len(audio), window_samples)
    if full_windows == 0:
        return np.array([0.0])
    usable = audio[:len(audio) - remainder] if remainder else audio
    frames = usable.reshape(full_windows, window_samples)
    return np.sqrt((frames ** 2).mean(axis=1))
def remove_gaps(stems: dict[str, np.ndarray], sr: int,
                threshold_s: float = 1.5, crossfade_ms: float = 30) -> dict[str, np.ndarray]:
    """Cut long silences out of the dialog-driven stems.

    Silence is detected on the summed host+caller dialog using 50 ms RMS
    windows against an adaptive threshold. Silent runs of at least
    *threshold_s* seconds are removed from host, caller, sfx and ads, with
    a short fade at each join. The music stem is NOT cut — it is trimmed to
    the new duration and given a 2 s fade-out instead.

    Returns a new stems dict; the input dict is returned unchanged when no
    gap qualifies for removal.
    """
    window_ms = 50
    window_samples = int(sr * window_ms / 1000)
    crossfade_samples = int(sr * crossfade_ms / 1000)
    # Silence detection runs on the combined dialog, not individual stems.
    dialog = stems["host"] + stems["caller"]
    rms = compute_rms(dialog, window_samples)
    # Threshold: -60dB or adaptive based on mean RMS
    mean_rms = np.mean(rms[rms > 0]) if np.any(rms > 0) else 1e-4
    silence_thresh = min(mean_rms * 0.05, 0.001)
    # Find silent regions
    is_silent = rms < silence_thresh
    min_silent_windows = int(threshold_s / (window_ms / 1000))
    # Build list of regions to cut (in samples)
    cuts = []
    i = 0
    while i < len(is_silent):
        if is_silent[i]:
            start = i
            while i < len(is_silent) and is_silent[i]:
                i += 1
            length = i - start
            if length >= min_silent_windows:
                # Keep a small buffer at edges
                # (one window kept on each side so speech onsets survive)
                cut_start = (start + 1) * window_samples
                cut_end = (i - 1) * window_samples
                if cut_end > cut_start + crossfade_samples * 2:
                    cuts.append((cut_start, cut_end))
        else:
            i += 1
    if not cuts:
        print(" No gaps to remove")
        return stems
    total_cut = sum(end - start for start, end in cuts) / sr
    print(f" Removing {len(cuts)} gaps ({total_cut:.1f}s total)")
    # Apply cuts to dialog stems (host, caller, sfx, ads) — not music
    cut_stems = ["host", "caller", "sfx", "ads"]
    result = {}
    for name in cut_stems:
        audio = stems[name]
        pieces = []
        prev_end = 0
        for cut_start, cut_end in cuts:
            if prev_end < cut_start:
                piece = audio[prev_end:cut_start].copy()
                # Apply crossfade at join point
                # NOTE(review): this fades the new piece in and the previous
                # piece's tail out, then concatenates — an abutted fade, not
                # an overlap-add crossfade, so the join dips briefly to silence.
                if pieces and len(piece) > crossfade_samples:
                    fade_in = np.linspace(0, 1, crossfade_samples, dtype=np.float32)
                    piece[:crossfade_samples] *= fade_in
                if len(pieces) > 0 and len(pieces[-1]) > crossfade_samples:
                    fade_out = np.linspace(1, 0, crossfade_samples, dtype=np.float32)
                    pieces[-1][-crossfade_samples:] *= fade_out
                pieces.append(piece)
            prev_end = cut_end
        # Keep whatever follows the final cut, faded in the same way.
        if prev_end < len(audio):
            piece = audio[prev_end:].copy()
            if pieces and len(piece) > crossfade_samples:
                fade_in = np.linspace(0, 1, crossfade_samples, dtype=np.float32)
                piece[:crossfade_samples] *= fade_in
            if len(pieces) > 0 and len(pieces[-1]) > crossfade_samples:
                fade_out = np.linspace(1, 0, crossfade_samples, dtype=np.float32)
                pieces[-1][-crossfade_samples:] *= fade_out
            pieces.append(piece)
        result[name] = np.concatenate(pieces) if pieces else np.array([], dtype=np.float32)
    # Trim music to match new duration, with fade-out at end
    new_len = len(result["host"])
    music = stems["music"][:new_len].copy() if len(stems["music"]) >= new_len else np.pad(stems["music"], (0, max(0, new_len - len(stems["music"]))))
    fade_samples = int(sr * 2)  # 2s fade out
    if len(music) > fade_samples:
        fade_out = np.linspace(1, 0, fade_samples, dtype=np.float32)
        music[-fade_samples:] *= fade_out
    result["music"] = music
    return result
def compress_voice(audio: np.ndarray, sr: int, tmp_dir: Path,
                   stem_name: str) -> np.ndarray:
    """Run a voice stem through ffmpeg's acompressor filter.

    Round-trips the audio through temporary WAV files in *tmp_dir*. If
    ffmpeg fails for any reason, the original audio is returned unchanged.
    """
    source_wav = tmp_dir / f"{stem_name}_pre_comp.wav"
    target_wav = tmp_dir / f"{stem_name}_post_comp.wav"
    sf.write(str(source_wav), audio, sr)
    proc = subprocess.run(
        [
            "ffmpeg", "-y", "-i", str(source_wav),
            "-af", "acompressor=threshold=-24dB:ratio=3:attack=5:release=100:makeup=6dB",
            str(target_wav),
        ],
        capture_output=True,
        text=True,
    )
    if proc.returncode != 0:
        print(f" WARNING: compression failed for {stem_name}: {proc.stderr[:200]}")
        return audio
    compressed, _ = sf.read(str(target_wav), dtype="float32")
    return compressed
def apply_ducking(music: np.ndarray, dialog: np.ndarray, sr: int,
                  duck_db: float = -12, attack_ms: float = 200,
                  release_ms: float = 500) -> np.ndarray:
    """Lower the music bed while speech is present in the dialog track.

    Speech is detected per 50 ms window via RMS against an adaptive
    threshold. A per-window gain target (duck_db during speech, unity
    otherwise) is smoothed with one-pole attack/release filters, expanded
    to sample level, and multiplied into the music.
    """
    window_ms = 50
    window_samples = int(sr * window_ms / 1000)
    rms = compute_rms(dialog, window_samples)
    # Speech threshold: 10% of the mean non-zero window RMS.
    mean_rms = np.mean(rms[rms > 0]) if np.any(rms > 0) else 1e-4
    speech_thresh = mean_rms * 0.1
    # Per-window gain target: ducked while speech is detected.
    duck_gain = 10 ** (duck_db / 20)
    is_speech = rms > speech_thresh
    target_gain = np.where(is_speech, duck_gain, 1.0).astype(np.float32)
    # One-pole smoothing: gain drops use the attack constant, rises use release.
    attack_windows = max(1, int(attack_ms / window_ms))
    release_windows = max(1, int(release_ms / window_ms))
    smoothed = np.ones_like(target_gain)
    for idx in range(1, len(target_gain)):
        falling = target_gain[idx] < smoothed[idx - 1]
        alpha = 1.0 / (attack_windows if falling else release_windows)
        smoothed[idx] = smoothed[idx - 1] + alpha * (target_gain[idx] - smoothed[idx - 1])
    # Expand the window-level envelope to per-sample gain matching music length;
    # any uncovered tail stays at unity gain.
    gain_samples = np.repeat(smoothed, window_samples)
    if len(gain_samples) < len(music):
        gain_samples = np.pad(gain_samples, (0, len(music) - len(gain_samples)), constant_values=1.0)
    else:
        gain_samples = gain_samples[:len(music)]
    return music * gain_samples
def mix_stems(stems: dict[str, np.ndarray],
              levels: dict[str, float] | None = None) -> np.ndarray:
    """Sum the five stems into a hard-clipped stereo mix.

    *levels* maps stem name -> gain in dB (defaults: music -6, sfx -3,
    voices and ads at unity). Stems shorter than the longest are
    zero-padded. Returns float32 stereo of shape (N, 2) with the mono mix
    duplicated into both channels.
    """
    if levels is None:
        levels = {"host": 0, "caller": 0, "music": -6, "sfx": -3, "ads": 0}
    gains = {name: 10 ** (db / 20) for name, db in levels.items()}
    # Mix length is the longest stem.
    target_len = max(len(s) for s in stems.values())
    # Accumulate in float64 to limit rounding error while summing.
    mono = np.zeros(target_len, dtype=np.float64)
    for name in STEM_NAMES:
        track = stems[name]
        if len(track) < target_len:
            track = np.pad(track, (0, target_len - len(track)))
        mono += track.astype(np.float64) * gains.get(name, 1.0)
    # Hard-clip to [-1, 1] and duplicate mono into left and right.
    clipped = np.clip(mono, -1.0, 1.0).astype(np.float32)
    return np.column_stack([clipped, clipped])
def normalize_and_export(audio: np.ndarray, sr: int, output_path: Path,
                         target_lufs: float = -16, bitrate: str = "128k",
                         tmp_dir: Path | None = None):
    """Two-pass EBU R128 loudness-normalize *audio* and export it as MP3.

    Pass 1 measures loudness with ffmpeg's loudnorm filter (stats are
    printed as JSON on stderr); pass 2 applies linear normalization using
    the measured values, followed by a -1 dB limiter, and encodes to
    *output_path* at *bitrate*. Exits the process if the export fails.

    If *tmp_dir* is omitted, a fresh scratch directory is created (the
    original raised TypeError on the Path join when tmp_dir was None).
    """
    import json

    if tmp_dir is None:
        tmp_dir = Path(tempfile.mkdtemp(prefix="postprod_"))
    tmp_wav = tmp_dir / "pre_loudnorm.wav"
    sf.write(str(tmp_wav), audio, sr)
    # Pass 1: measure loudness
    measure_cmd = [
        "ffmpeg", "-y", "-i", str(tmp_wav),
        "-af", f"loudnorm=I={target_lufs}:TP=-1:LRA=11:print_format=json",
        "-f", "null", "-",
    ]
    result = subprocess.run(measure_cmd, capture_output=True, text=True)
    stderr = result.stderr
    # Parse loudnorm output: the stats are the last JSON object in stderr.
    json_start = stderr.rfind("{")
    json_end = stderr.rfind("}") + 1
    if json_start >= 0 and json_end > json_start:
        stats = json.loads(stderr[json_start:json_end])
    else:
        # Fall back to nominal values so pass 2 still produces output.
        print(" WARNING: couldn't parse loudnorm stats, using defaults")
        stats = {
            "input_i": "-23", "input_tp": "-1", "input_lra": "11",
            "input_thresh": "-34",
        }
    # Pass 2: apply normalization + limiter + export MP3
    loudnorm_filter = (
        f"loudnorm=I={target_lufs}:TP=-1:LRA=11"
        f":measured_I={stats['input_i']}"
        f":measured_TP={stats['input_tp']}"
        f":measured_LRA={stats['input_lra']}"
        f":measured_thresh={stats['input_thresh']}"
        f":linear=true"
    )
    export_cmd = [
        "ffmpeg", "-y", "-i", str(tmp_wav),
        "-af", f"{loudnorm_filter},alimiter=limit=-1dB:level=false",
        "-ab", bitrate, "-ar", str(sr),
        str(output_path),
    ]
    result = subprocess.run(export_cmd, capture_output=True, text=True)
    if result.returncode != 0:
        print(f" ERROR: export failed: {result.stderr[:300]}")
        sys.exit(1)
def _build_parser():
    """Construct the argparse parser for the post-production CLI."""
    parser = argparse.ArgumentParser(description="Post-production for AI podcast stems")
    parser.add_argument("stems_dir", type=Path, help="Directory containing stem WAV files")
    parser.add_argument("-o", "--output", type=str, default="episode.mp3", help="Output filename")
    parser.add_argument("--gap-threshold", type=float, default=1.5, help="Min silence to cut (seconds)")
    parser.add_argument("--duck-amount", type=float, default=-12, help="Music duck in dB")
    parser.add_argument("--target-lufs", type=float, default=-16, help="Target loudness (LUFS)")
    parser.add_argument("--bitrate", type=str, default="128k", help="MP3 bitrate")
    parser.add_argument("--no-gap-removal", action="store_true", help="Skip gap removal")
    parser.add_argument("--no-compression", action="store_true", help="Skip voice compression")
    parser.add_argument("--no-ducking", action="store_true", help="Skip music ducking")
    parser.add_argument("--dry-run", action="store_true", help="Show what would be done")
    return parser


def main():
    """CLI entry point: run the six-step stem-to-MP3 post-production pipeline."""
    args = _build_parser().parse_args()

    src_dir = args.stems_dir
    if not src_dir.exists():
        print(f"ERROR: directory not found: {src_dir}")
        sys.exit(1)

    # A relative output filename lands next to the stems.
    out = Path(args.output)
    if not out.is_absolute():
        out = src_dir / out

    # Summarize the configuration before doing any work.
    gap_desc = "skip" if args.no_gap_removal else f"threshold={args.gap_threshold}s"
    duck_desc = "skip" if args.no_ducking else f"{args.duck_amount}dB"
    print(f"Post-production: {src_dir} -> {out}")
    print(f" Gap removal: {gap_desc}")
    print(f" Compression: {'skip' if args.no_compression else 'on'}")
    print(f" Ducking: {duck_desc}")
    print(f" Loudness: {args.target_lufs} LUFS, bitrate: {args.bitrate}")
    if args.dry_run:
        print("Dry run — exiting")
        return

    # Step 1: load the stem WAVs.
    print("\n[1/6] Loading stems...")
    stems, sr = load_stems(src_dir)

    # Step 2: cut long silences across all stems in lockstep.
    print("\n[2/6] Gap removal...")
    if args.no_gap_removal:
        print(" Skipped")
    else:
        stems = remove_gaps(stems, sr, threshold_s=args.gap_threshold)

    # Step 3: compress the two voice stems (skipping all-silent ones).
    print("\n[3/6] Voice compression...")
    if args.no_compression:
        print(" Skipped")
    else:
        with tempfile.TemporaryDirectory() as scratch:
            scratch_dir = Path(scratch)
            for voice in ["host", "caller"]:
                if np.any(stems[voice] != 0):
                    print(f" Compressing {voice}...")
                    stems[voice] = compress_voice(stems[voice], sr, scratch_dir, voice)

    # Step 4: duck music under dialog (only when both are non-silent).
    print("\n[4/6] Music ducking...")
    if args.no_ducking:
        print(" Skipped")
    else:
        dialog = stems["host"] + stems["caller"]
        if np.any(dialog != 0) and np.any(stems["music"] != 0):
            stems["music"] = apply_ducking(stems["music"], dialog, sr, duck_db=args.duck_amount)
            print(" Applied")
        else:
            print(" No dialog or music to duck")

    # Step 5: sum stems to a stereo mix.
    print("\n[5/6] Mixing...")
    stereo = mix_stems(stems)
    print(f" Mixed to stereo: {len(stereo)} samples ({len(stereo)/sr:.1f}s)")

    # Step 6: two-pass loudness normalization and MP3 export.
    print("\n[6/6] Loudness normalization + export...")
    with tempfile.TemporaryDirectory() as scratch:
        normalize_and_export(stereo, sr, out,
                             target_lufs=args.target_lufs,
                             bitrate=args.bitrate,
                             tmp_dir=Path(scratch))
    print(f"\nDone! Output: {out}")


if __name__ == "__main__":
    main()

View File

@@ -18,9 +18,34 @@ import sys
import base64
from pathlib import Path
import ssl
import requests
import urllib3
from requests.adapters import HTTPAdapter
from urllib3.util.ssl_ import create_urllib3_context
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
from dotenv import load_dotenv
class TLSAdapter(HTTPAdapter):
"""Adapter to handle servers with older TLS configurations."""
def init_poolmanager(self, *args, **kwargs):
ctx = create_urllib3_context()
ctx.set_ciphers('DEFAULT@SECLEVEL=1')
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE
kwargs['ssl_context'] = ctx
return super().init_poolmanager(*args, **kwargs)
def send(self, *args, **kwargs):
kwargs['verify'] = False
return super().send(*args, **kwargs)
# Use a session with TLS compatibility for all Castopod requests
_session = requests.Session()
_session.mount('https://', TLSAdapter())
# Load environment variables
load_dotenv(Path(__file__).parent / ".env")
@@ -156,41 +181,45 @@ Respond with ONLY valid JSON, no markdown or explanation."""
def create_episode(audio_path: str, metadata: dict, episode_number: int) -> dict:
"""Create episode on Castopod."""
"""Create episode on Castopod using curl (handles large file uploads better)."""
print("[3/5] Creating episode on Castopod...")
headers = get_auth_header()
credentials = base64.b64encode(
f"{CASTOPOD_USERNAME}:{CASTOPOD_PASSWORD}".encode()
).decode()
slug = re.sub(r'[^a-z0-9]+', '-', metadata["title"].lower()).strip('-')
# Upload audio and create episode
with open(audio_path, "rb") as f:
files = {
"audio_file": (Path(audio_path).name, f, "audio/mpeg")
}
data = {
"title": metadata["title"],
"slug": slug,
"description": metadata["description"],
"parental_advisory": "explicit",
"type": "full",
"podcast_id": str(PODCAST_ID),
"created_by": "1",
"updated_by": "1",
"episode_number": str(episode_number),
}
response = requests.post(
cmd = [
"curl", "-sk", "-X", "POST",
f"{CASTOPOD_URL}/api/rest/v1/episodes",
headers=headers,
files=files,
data=data
)
"-H", f"Authorization: Basic {credentials}",
"-F", f"audio_file=@{audio_path};type=audio/mpeg",
"-F", f"title={metadata['title']}",
"-F", f"slug={slug}",
"-F", f"description={metadata['description']}",
"-F", "parental_advisory=explicit",
"-F", "type=full",
"-F", f"podcast_id={PODCAST_ID}",
"-F", "created_by=1",
"-F", "updated_by=1",
"-F", f"episode_number={episode_number}",
]
if response.status_code not in (200, 201):
print(f"Error creating episode: {response.status_code} {response.text}")
result = subprocess.run(cmd, capture_output=True, text=True, timeout=300)
if result.returncode != 0:
print(f"Error uploading: {result.stderr}")
sys.exit(1)
try:
episode = json.loads(result.stdout)
except json.JSONDecodeError:
print(f"Error parsing response: {result.stdout[:500]}")
sys.exit(1)
if "id" not in episode:
print(f"Error creating episode: {result.stdout[:500]}")
sys.exit(1)
episode = response.json()
print(f" Created episode ID: {episode['id']}")
print(f" Slug: {episode['slug']}")
@@ -203,13 +232,13 @@ def publish_episode(episode_id: int) -> dict:
headers = get_auth_header()
response = requests.post(
response = _session.post(
f"{CASTOPOD_URL}/api/rest/v1/episodes/{episode_id}/publish",
headers=headers,
data={
"publication_method": "now",
"created_by": "1"
}
},
)
if response.status_code != 200:
@@ -316,9 +345,9 @@ def get_next_episode_number() -> int:
"""Get the next episode number from Castopod."""
headers = get_auth_header()
response = requests.get(
response = _session.get(
f"{CASTOPOD_URL}/api/rest/v1/episodes",
headers=headers
headers=headers,
)
if response.status_code != 200:

View File

@@ -100,7 +100,7 @@
<div class="hiw-step-number">1</div>
<div class="hiw-step-content">
<h3>A Person Is Born</h3>
<p>Every caller starts as a blank slate. The system generates a complete identity: name, age, job, hometown, and personality. Each caller gets a unique speaking style — some ramble, some are blunt, some deflect with humor. They have relationships, vehicles, opinions, memories, and reasons for being up this late.</p>
<p>Every caller starts as a blank slate. The system generates a complete identity: name, age, job, hometown, and personality. Each caller gets a unique speaking style — some ramble, some are blunt, some deflect with humor. They have relationships, vehicles, strong food opinions, nostalgic memories, and reasons for being up this late. They know what they were watching on TV, what errand they ran today, and what song was on the radio before they called.</p>
<div class="hiw-detail-grid">
<div class="hiw-detail">
<span class="hiw-detail-label">Unique Names</span>
@@ -108,7 +108,7 @@
</div>
<div class="hiw-detail">
<span class="hiw-detail-label">Personality Layers</span>
<span class="hiw-detail-value">20+</span>
<span class="hiw-detail-value">30+</span>
</div>
<div class="hiw-detail">
<span class="hiw-detail-label">Towns with Real Knowledge</span>
@@ -126,7 +126,7 @@
<div class="hiw-step-number">2</div>
<div class="hiw-step-content">
<h3>They Know Their World</h3>
<p>Callers know real facts about where they live — the restaurants, the highways, the local gossip. When a caller says they're from Lordsburg, they actually know about the Hidalgo Hotel and the drive to Deming. The system pulls in real-time news so callers can reference things that actually happened today.</p>
<p>Callers know real facts about where they live — the restaurants, the highways, the local gossip. When a caller says they're from Lordsburg, they actually know about the Shakespeare ghost town and the drive to Deming. They know the current weather outside their window, what day of the week it is, whether it's monsoon season or chile harvest. They have strong opinions about where to get the best green chile and get nostalgic about how their town used to be. The system also pulls in real-time news so callers can reference things that actually happened today.</p>
</div>
</div>
@@ -152,7 +152,7 @@
<div class="hiw-step-number">4</div>
<div class="hiw-step-content">
<h3>The Conversation Is Real</h3>
<p>Luke talks to each caller using push-to-talk, just like a real radio show. His voice is transcribed in real time, sent to an AI that responds in character, and then converted to speech using a voice engine — all in a few seconds. The AI doesn't just answer questions; it reacts, gets emotional, goes on tangents, and remembers what was said earlier in the show.</p>
<p>Luke talks to each caller using push-to-talk, just like a real radio show. His voice is transcribed in real time, sent to an AI that responds in character, and then converted to speech using a voice engine — all in a few seconds. The AI doesn't just answer questions; it reacts, gets emotional, goes on tangents, and remembers what was said earlier in the show. Callers even react to previous callers — "Hey Luke, I heard that guy Tony earlier and I got to say, he's full of it." It makes the show feel like a living community, not isolated calls.</p>
</div>
</div>
@@ -207,7 +207,14 @@
<svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2"><circle cx="12" cy="12" r="10"/><polyline points="12 6 12 12 16 14"/></svg>
</div>
<h3>Real Time</h3>
<p>Everything happens live. Caller generation, voice synthesis, news lookups, phone routing — all in real time during the show. There's no post-production trickery on the caller side. What you hear is what happened.</p>
<p>Everything happens live. Caller generation, voice synthesis, news lookups, weather checks, phone routing — all in real time during the show. There's no post-production trickery on the caller side. What you hear is what happened.</p>
</div>
<div class="hiw-feature">
<div class="hiw-feature-icon">
<svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2"><path d="M17 21v-2a4 4 0 0 0-4-4H5a4 4 0 0 0-4 4v2"/><circle cx="9" cy="7" r="4"/><path d="M23 21v-2a4 4 0 0 0-3-3.87"/><path d="M16 3.13a4 4 0 0 1 0 7.75"/></svg>
</div>
<h3>They Listen to Each Other</h3>
<p>Callers aren't isolated — they hear what happened earlier in the show. A caller might disagree with the last guy, back someone up, or call in specifically because of something another caller said. The show builds on itself.</p>
</div>
</div>
</section>