Add post-production pipeline: stem recorder, postprod script, recording UI

New stem recording system captures 5 time-aligned WAV files (host, caller,
music, sfx, ads) during live shows. Standalone postprod.py processes stems
into broadcast-ready MP3 with gap removal, voice compression, music ducking,
and EBU R128 loudness normalization.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-08 17:53:32 -07:00
parent 356bf145b8
commit 7d88c76f90
12 changed files with 1528 additions and 363 deletions

File diff suppressed because it is too large Load Diff

View File

@@ -78,6 +78,9 @@ class AudioService:
self.input_sample_rate = 16000 # For Whisper
self.output_sample_rate = 24000 # For TTS
# Stem recording (opt-in, attached via API)
self.stem_recorder = None
# Load saved settings
self._load_settings()
@@ -355,6 +358,10 @@ class AudioService:
# Apply fade to prevent clicks
audio = self._apply_fade(audio, device_sr)
# Stem recording: caller TTS
if self.stem_recorder:
self.stem_recorder.write("caller", audio.copy(), device_sr)
# Create multi-channel output with audio only on target channel
multi_ch = np.zeros((len(audio), num_channels), dtype=np.float32)
multi_ch[:, channel_idx] = audio
@@ -491,6 +498,10 @@ class AudioService:
indices = np.clip(indices, 0, len(audio) - 1)
audio = audio[indices]
# Stem recording: live caller
if self.stem_recorder:
self.stem_recorder.write("caller", audio.copy(), device_sr)
if self._live_caller_write:
self._live_caller_write(audio)
@@ -524,6 +535,10 @@ class AudioService:
if self._recording and self._recorded_audio is not None:
self._recorded_audio.append(indata[:, record_channel].copy())
# Stem recording: host mic
if self.stem_recorder:
self.stem_recorder.write("host", indata[:, record_channel].copy(), device_sr)
if not self._host_send_callback:
return
mono = indata[:, record_channel]
@@ -721,7 +736,10 @@ class AudioService:
fade_in = np.linspace(start_progress, end_progress, frames, dtype=np.float32)
fade_out = 1.0 - fade_in
outdata[:, channel_idx] = (old_samples * fade_out + new_samples * fade_in) * self._music_volume
mono_out = (old_samples * fade_out + new_samples * fade_in) * self._music_volume
outdata[:, channel_idx] = mono_out
if self.stem_recorder:
self.stem_recorder.write("music", mono_out.copy(), device_sr)
self._crossfade_progress = end_progress
if self._crossfade_progress >= 1.0:
@@ -729,7 +747,10 @@ class AudioService:
self._crossfade_old_data = None
print("Crossfade complete")
else:
outdata[:, channel_idx] = new_samples * self._music_volume
mono_out = new_samples * self._music_volume
outdata[:, channel_idx] = mono_out
if self.stem_recorder:
self.stem_recorder.write("music", mono_out.copy(), device_sr)
try:
self._music_stream = sd.OutputStream(
@@ -836,7 +857,10 @@ class AudioService:
remaining = len(self._ad_resampled) - self._ad_position
if remaining >= frames:
outdata[:, channel_idx] = self._ad_resampled[self._ad_position:self._ad_position + frames]
chunk = self._ad_resampled[self._ad_position:self._ad_position + frames]
outdata[:, channel_idx] = chunk
if self.stem_recorder:
self.stem_recorder.write("ads", chunk.copy(), device_sr)
self._ad_position += frames
else:
if remaining > 0:
@@ -904,6 +928,10 @@ class AudioService:
audio, _ = librosa.load(str(path), sr=device_sr, mono=True)
audio = self._apply_fade(audio, device_sr)
# Stem recording: sfx
if self.stem_recorder:
self.stem_recorder.write("sfx", audio.copy(), device_sr)
multi_ch = np.zeros((len(audio), num_channels), dtype=np.float32)
multi_ch[:, channel_idx] = audio

View File

@@ -7,21 +7,28 @@ from ..config import settings
# Available OpenRouter models
OPENROUTER_MODELS = [
# Best for natural dialog (ranked)
"minimax/minimax-m2-her",
"mistralai/mistral-small-creative",
"x-ai/grok-4-fast",
"deepseek/deepseek-v3.2",
# Updated standard models
"anthropic/claude-haiku-4.5",
"anthropic/claude-sonnet-4-5",
"google/gemini-2.5-flash",
"openai/gpt-4o-mini",
"openai/gpt-4o",
# Legacy
"anthropic/claude-3-haiku",
"anthropic/claude-3.5-sonnet",
"google/gemini-flash-1.5",
"google/gemini-pro-1.5",
"meta-llama/llama-3.1-8b-instruct",
"mistralai/mistral-7b-instruct",
]
# Fast models to try as fallbacks (cheap, fast, good enough for conversation)
FALLBACK_MODELS = [
"google/gemini-flash-1.5",
"mistralai/mistral-small-creative",
"google/gemini-2.5-flash",
"openai/gpt-4o-mini",
"meta-llama/llama-3.1-8b-instruct",
]
@@ -103,21 +110,22 @@ class LLMService:
async def generate(
self,
messages: list[dict],
system_prompt: Optional[str] = None
system_prompt: Optional[str] = None,
max_tokens: Optional[int] = None
) -> str:
if system_prompt:
messages = [{"role": "system", "content": system_prompt}] + messages
if self.provider == "openrouter":
return await self._call_openrouter_with_fallback(messages)
return await self._call_openrouter_with_fallback(messages, max_tokens=max_tokens)
else:
return await self._call_ollama(messages)
return await self._call_ollama(messages, max_tokens=max_tokens)
async def _call_openrouter_with_fallback(self, messages: list[dict]) -> str:
async def _call_openrouter_with_fallback(self, messages: list[dict], max_tokens: Optional[int] = None) -> str:
"""Try primary model, then fallback models. Always returns a response."""
# Try primary model first
result = await self._call_openrouter_once(messages, self.openrouter_model)
result = await self._call_openrouter_once(messages, self.openrouter_model, max_tokens=max_tokens)
if result is not None:
return result
@@ -126,7 +134,7 @@ class LLMService:
if model == self.openrouter_model:
continue # Already tried
print(f"[LLM] Falling back to {model}...")
result = await self._call_openrouter_once(messages, model, timeout=10.0)
result = await self._call_openrouter_once(messages, model, timeout=10.0, max_tokens=max_tokens)
if result is not None:
return result
@@ -134,7 +142,7 @@ class LLMService:
print("[LLM] All models failed, using canned response")
return "Sorry, I totally blanked out for a second. What were you saying?"
async def _call_openrouter_once(self, messages: list[dict], model: str, timeout: float = 15.0) -> str | None:
async def _call_openrouter_once(self, messages: list[dict], model: str, timeout: float = 15.0, max_tokens: Optional[int] = None) -> str | None:
"""Single attempt to call OpenRouter. Returns None on failure (not a fallback string)."""
try:
response = await self.client.post(
@@ -146,7 +154,11 @@ class LLMService:
json={
"model": model,
"messages": messages,
"max_tokens": 150,
"max_tokens": max_tokens or 150,
"temperature": 0.8,
"top_p": 0.92,
"frequency_penalty": 0.5,
"presence_penalty": 0.3,
},
timeout=timeout,
)
@@ -164,7 +176,7 @@ class LLMService:
print(f"[LLM] {model} error: {e}")
return None
async def _call_ollama(self, messages: list[dict]) -> str:
async def _call_ollama(self, messages: list[dict], max_tokens: Optional[int] = None) -> str:
"""Call Ollama API"""
try:
async with httpx.AsyncClient() as client:
@@ -175,7 +187,7 @@ class LLMService:
"messages": messages,
"stream": False,
"options": {
"num_predict": 100,
"num_predict": max_tokens or 100,
"temperature": 0.8,
"top_p": 0.9,
"repeat_penalty": 1.3,

View File

@@ -0,0 +1,86 @@
"""Records separate audio stems during a live show for post-production"""
import threading
import time
from pathlib import Path

import numpy as np
import soundfile as sf
from scipy import signal as scipy_signal
STEM_NAMES = ["host", "caller", "music", "sfx", "ads"]
class StemRecorder:
    """Records five time-aligned mono WAV stems (host, caller, music, sfx, ads)
    during a live show so they can be remixed in post-production.

    Each stem is kept aligned to wall-clock time: before every write, the gap
    between the stem's current write position and the elapsed time since
    start() is filled with silence, so all stems share a common timeline.

    write() is called from audio-callback threads while start()/stop() run on
    a control thread, so all file access is serialized with a lock; otherwise
    stop() could close a SoundFile between write()'s running-check and the
    actual file write, crashing the audio callback.
    """

    def __init__(self, output_dir: str | Path, sample_rate: int = 48000):
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)
        self.sample_rate = sample_rate
        # One open SoundFile per stem name while recording.
        self._files: dict[str, sf.SoundFile] = {}
        # Samples written so far per stem (at self.sample_rate).
        self._write_positions: dict[str, int] = {}
        self._start_time: float = 0.0
        self._running = False
        # Serializes file access between audio callbacks and start()/stop().
        self._lock = threading.Lock()

    def start(self):
        """Open one WAV file per stem and begin the shared timeline.

        Calling start() while already recording is a no-op — restarting would
        leak the open file handles and reset write positions mid-show.
        """
        with self._lock:
            if self._running:
                return
            self._start_time = time.time()
            for name in STEM_NAMES:
                path = self.output_dir / f"{name}.wav"
                f = sf.SoundFile(
                    str(path), mode="w",
                    samplerate=self.sample_rate,
                    channels=1, subtype="FLOAT",
                )
                self._files[name] = f
                self._write_positions[name] = 0
            self._running = True
        print(f"[StemRecorder] Recording started -> {self.output_dir}")

    def write(self, stem_name: str, audio_data: np.ndarray, source_sr: int):
        """Append mono audio to a stem, resampling and gap-filling as needed.

        Silently ignores writes when not recording or for unknown stem names,
        so audio callbacks can call this unconditionally.
        """
        # Cheap early-out without the lock; re-checked under the lock below.
        if not self._running or stem_name not in self._files:
            return
        # Resample to target rate if needed (outside the lock — CPU-heavy).
        if source_sr != self.sample_rate:
            num_samples = int(len(audio_data) * self.sample_rate / source_sr)
            if num_samples > 0:
                audio_data = scipy_signal.resample(audio_data, num_samples).astype(np.float32)
            else:
                return
        with self._lock:
            if not self._running or stem_name not in self._files:
                return
            # Fill silence gap based on elapsed time so stems stay aligned.
            elapsed = time.time() - self._start_time
            expected_pos = int(elapsed * self.sample_rate)
            current_pos = self._write_positions[stem_name]
            if expected_pos > current_pos:
                gap = expected_pos - current_pos
                silence = np.zeros(gap, dtype=np.float32)
                self._files[stem_name].write(silence)
                self._write_positions[stem_name] = expected_pos
            self._files[stem_name].write(audio_data.astype(np.float32))
            self._write_positions[stem_name] += len(audio_data)

    def stop(self) -> dict[str, str]:
        """Stop recording, pad all stems to equal length, and close the files.

        Returns a mapping of stem name -> WAV path, or {} if not recording.
        """
        with self._lock:
            if not self._running:
                return {}
            self._running = False
            # Pad all stems to the same length
            max_pos = max(self._write_positions.values()) if self._write_positions else 0
            for name in STEM_NAMES:
                pos = self._write_positions[name]
                if pos < max_pos:
                    silence = np.zeros(max_pos - pos, dtype=np.float32)
                    self._files[name].write(silence)
            # Close all files
            paths = {}
            for name in STEM_NAMES:
                self._files[name].close()
                paths[name] = str(self.output_dir / f"{name}.wav")
            self._files.clear()
            self._write_positions.clear()
        print(f"[StemRecorder] Recording stopped. {max_pos} samples ({max_pos/self.sample_rate:.1f}s)")
        return paths

View File

@@ -598,7 +598,7 @@ async def generate_speech_inworld(text: str, voice_id: str) -> tuple[np.ndarray,
payload = {
"text": text,
"voice_id": voice,
"model_id": "inworld-tts-1.5-mini",
"model_id": "inworld-tts-1.5-max",
"audio_config": {
"encoding": "LINEAR16",
"sample_rate_hertz": 48000,

View File

@@ -1 +1,143 @@
{"regulars": []}
{
"regulars": [
{
"id": "be244306",
"name": "Dale",
"gender": "male",
"age": 44,
"job": "runs a food truck",
"location": "unknown",
"personality_traits": [],
"call_history": [
{
"summary": "Briefly explain the universe's expansion to a child who wants to know what happens when it stops expanding. Please don't suggest unusual topics; keep the explanation simple.",
"timestamp": 1770515097.24686
},
{
"summary": "Dale updates the host on explaining the universe's expansion to his buddy's kid, who now worries if it could \"pop,\" but shifts to his temptation to bet on Super Bowl 60 predictions after reading an article, critiquing a prior caller's gambling mindset while reflecting emotionally on his brother Eddie's fruitless horse-betting habit and his own exhaustion from long taco truck shifts in the cold desert. He ultimately considers a small, affordable wager on the Chiefs as a low-stakes thrill.",
"timestamp": 1770522741.049846
}
],
"last_call": 1770522741.049846,
"created_at": 1770515097.24686
},
{
"id": "584767e8",
"name": "Carl",
"gender": "male",
"age": 36,
"job": "is a firefighter",
"location": "unknown",
"personality_traits": [],
"call_history": [
{
"summary": "Carl, a firefighter from Lordsburg, New Mexico, called to confess his 20-year gambling addiction, which began with casual poker games at the station and escalated to frequent casino visits and online sessions, draining his finances and leaving him with overdue bills and the fear of losing his home. Emotionally raw, he admitted the habit's destructive hold\u2014like an unquenchable fire\u2014and his pride in avoiding help, but agreed to consider support groups and an 800 hotline after the host suggested productive alternatives like gym workouts or extra volunteer shifts.",
"timestamp": 1770522170.1887732
},
{
"summary": "Here is a 1-2 sentence summary of the radio call:\n\nThe caller, Carl, discusses his progress in overcoming his gambling addiction, including rewatching The Sopranos, but the host, Luke, disagrees with Carl's high opinion of the show's ending, leading to a back-and-forth debate between the two about the merits and predictability of the Sopranos finale.",
"timestamp": 1770573289.82847
}
],
"last_call": 1770573289.828471,
"created_at": 1770522170.1887732
},
{
"id": "d97cb6f9",
"name": "Carla",
"gender": "female",
"age": 26,
"job": "is a vet tech",
"location": "unknown",
"personality_traits": [],
"call_history": [
{
"summary": "Carla, separated from her husband but not yet divorced, vented about her intrusive in-laws who relentlessly call and dictate her life\u2014from finances and household matters to her clothing choices\u2014while her spineless spouse relays their demands, making her feel trapped in a one-sided war. With her own parents unavailable (father deceased, mother distant), she leans on her bickering but honest sister for support, underscoring her deep frustration and sense of isolation.",
"timestamp": 1770522530.8554251
},
{
"summary": "Carla dismissed celebrity science theories like Terrence Howard's after watching Neil deGrasse Tyson's critique, then marveled at JWST's exoplanet discoveries before sharing her relief at finally cutting off her toxic in-laws amid her ongoing divorce. She expressed deep heartbreak over actor James Ransone's suicide at 46, reflecting on life's fragility, her late father's death, and the need to eliminate family drama, leaving her contemplative and planning a solo desert drive for clarity.",
"timestamp": 1770526316.004708
}
],
"last_call": 1770526316.004709,
"created_at": 1770522530.855426
},
{
"id": "5ccaea00",
"name": "Jerome",
"gender": "male",
"age": 52,
"job": "works at a cemetery",
"location": "unknown",
"personality_traits": [],
"call_history": [
{
"summary": "Jerome called in to discuss Neil deGrasse Tyson's dismissal of Terrence Howard's unconventional scientific theories, agreeing they don't hold up to real science, before opening up about his emotional turmoil over an unanswered text from his ex, Laura, following a recent blowout that left him questioning his life choices while drinking mezcal in his truck late at night. He reflected on their breakup due to his workaholic tendencies at the cemetery and her desire for more, but found hope in his child's insightful comment about the stars from the Silo books, suggesting they might both be better off apart.",
"timestamp": 1770522903.5809002
},
{
"summary": "Here is a 1-2 sentence summary of the call:\n\nThe caller, Jerome, recounts a humorous customer service interaction where a woman came to the cemetery he works at late at night frantically trying to find her husband's plot, leading to an amusing back-and-forth.",
"timestamp": 1770523944.299309
}
],
"last_call": 1770523944.29931,
"created_at": 1770522903.5809002
},
{
"id": "49147bd5",
"name": "Keith",
"gender": "male",
"age": 61,
"job": "south of Silver City",
"location": "in unknown",
"personality_traits": [],
"call_history": [
{
"summary": "The caller, Luke, kicked off by sharing a humorous clip of Terrence Howard's Tree of Life Theory being critiqued by Neil deGrasse Tyson, which left Howard visibly hurt, before pivoting to economic woes, blaming overspending and Federal Reserve money printing for devaluing the currency and harming everyday people. He advocated abolishing the Fed, echoing Ron Paul's ideas, to let markets stabilize money, potentially boosting innovation and new industries in rural spots like Silver City despite uncertain local impacts.",
"timestamp": 1770524506.3390348
},
{
"summary": "Here is a 1-2 sentence summary of the call:\n\nThe caller, who works at a bank, has been reflecting on his tendency to blame the government and economic system for his problems, rather than taking responsibility for his own role. He had an epiphany while eating leftover enchiladas in his minivan, realizing he needs to be more proactive instead of just complaining.",
"timestamp": 1770574890.1296651
}
],
"last_call": 1770574890.1296651,
"created_at": 1770524506.339036
},
{
"id": "4f4612c7",
"name": "Dale",
"gender": "male",
"age": 38,
"job": "is a cop, 12 years on the force",
"location": "unknown",
"personality_traits": [],
"call_history": [
{
"summary": "Dale from Globe called in to express skepticism about Terrence Howard's Tree of Life theory, arguing it lacks peer-reviewed experiments and scientific consensus, much like how he trusts quantum entanglement based on reliable sources without reading every paper himself. The conversation shifted to an emotional discussion of his grief over Uncle Hector, the man who raised him like a father but changed after a stroke, leaving Dale feeling a profound loss without closure, though he found solace in the host's validation and hope for lucid moments ahead.",
"timestamp": 1770526114.530777
}
],
"last_call": 1770526114.5307782,
"created_at": 1770526114.5307782
},
{
"id": "60053b38",
"name": "Lorraine",
"gender": "female",
"age": 42,
"job": "New Mexico",
"location": "in unknown",
"personality_traits": [],
"call_history": [
{
"summary": "Here is a 1-2 sentence summary of the call:\n\nThe caller has an outstanding warrant for a DUI charge from a few years ago that they have been avoiding dealing with, which has been causing them a lot of stress and guilt. The host encourages the caller to take responsibility and go to the sheriff's office to get the warrant cleared up, as driving drunk is extremely dangerous and unacceptable.",
"timestamp": 1770573956.570584
}
],
"last_call": 1770573956.570584,
"created_at": 1770573956.570584
}
]
}

View File

@@ -75,6 +75,19 @@ header button {
50% { opacity: 0.7; }
}
.rec-btn {
font-weight: 700;
text-transform: uppercase;
letter-spacing: 0.05em;
background: #555 !important;
transition: background 0.2s;
}
.rec-btn.recording {
background: #cc2222 !important;
animation: on-air-pulse 2s ease-in-out infinite;
}
.new-session-btn {
background: var(--accent) !important;
}
@@ -85,17 +98,29 @@ header button {
font-weight: normal;
}
.caller-background {
details.caller-background {
font-size: 0.85rem;
color: var(--text-muted);
padding: 10px;
background: var(--bg);
border-radius: var(--radius);
margin-bottom: 12px;
line-height: 1.4;
}
.caller-background.hidden {
details.caller-background summary {
cursor: pointer;
padding: 8px 10px;
font-weight: bold;
color: var(--text);
font-size: 0.8rem;
}
details.caller-background > div {
padding: 0 10px 10px;
white-space: pre-wrap;
}
details.caller-background.hidden {
display: none;
}

View File

@@ -12,6 +12,7 @@
<h1>Luke at The Roost</h1>
<div class="header-buttons">
<button id="on-air-btn" class="on-air-btn off">OFF AIR</button>
<button id="rec-btn" class="rec-btn" title="Record stems for post-production">REC</button>
<button id="new-session-btn" class="new-session-btn">New Session</button>
<button id="export-session-btn">Export</button>
<button id="settings-btn">Settings</button>
@@ -49,7 +50,10 @@
</label>
</div>
<div id="call-status" class="call-status">No active call</div>
<div id="caller-background" class="caller-background hidden"></div>
<details id="caller-background-details" class="caller-background hidden">
<summary>Caller Background</summary>
<div id="caller-background"></div>
</details>
<button id="hangup-btn" class="hangup-btn" disabled>Hang Up</button>
</section>

View File

@@ -85,6 +85,31 @@ function initEventListeners() {
});
}
// Stem recording toggle
const recBtn = document.getElementById('rec-btn');
if (recBtn) {
let stemRecording = false;
recBtn.addEventListener('click', async () => {
try {
if (!stemRecording) {
const res = await safeFetch('/api/recording/start', { method: 'POST' });
stemRecording = true;
recBtn.classList.add('recording');
recBtn.textContent = '⏺ REC';
log('Stem recording started: ' + res.dir);
} else {
const res = await safeFetch('/api/recording/stop', { method: 'POST' });
stemRecording = false;
recBtn.classList.remove('recording');
recBtn.textContent = 'REC';
log('Stem recording stopped');
}
} catch (err) {
log('Recording error: ' + err.message);
}
});
}
// Export session
document.getElementById('export-session-btn')?.addEventListener('click', exportSession);
@@ -400,11 +425,12 @@ async function startCall(key, name) {
if (aiInfo) aiInfo.classList.remove('hidden');
if (aiName) aiName.textContent = name;
// Show caller background
// Show caller background in disclosure triangle
const bgDetails = document.getElementById('caller-background-details');
const bgEl = document.getElementById('caller-background');
if (bgEl && data.background) {
if (bgDetails && bgEl && data.background) {
bgEl.textContent = data.background;
bgEl.classList.remove('hidden');
bgDetails.classList.remove('hidden');
}
document.querySelectorAll('.caller-btn').forEach(btn => {
@@ -428,8 +454,8 @@ async function newSession() {
conversationSince = 0;
// Hide caller background
const bgEl = document.getElementById('caller-background');
if (bgEl) bgEl.classList.add('hidden');
const bgDetails = document.getElementById('caller-background-details');
if (bgDetails) bgDetails.classList.add('hidden');
// Reload callers to get new session ID
await loadCallers();
@@ -455,8 +481,8 @@ async function hangup() {
document.querySelectorAll('.caller-btn').forEach(btn => btn.classList.remove('active'));
// Hide caller background
const bgEl = document.getElementById('caller-background');
if (bgEl) bgEl.classList.add('hidden');
const bgDetails2 = document.getElementById('caller-background-details');
if (bgDetails2) bgDetails2.classList.add('hidden');
// Hide AI caller indicator
document.getElementById('ai-caller-info')?.classList.add('hidden');

367
postprod.py Normal file
View File

@@ -0,0 +1,367 @@
#!/usr/bin/env python3
"""Post-production pipeline for AI podcast stems.
Usage: python postprod.py recordings/2026-02-07_213000/ -o episode.mp3
Processes 5 aligned WAV stems (host, caller, music, sfx, ads) into a
broadcast-ready MP3 with gap removal, voice compression, music ducking,
and loudness normalization.
"""
import argparse
import subprocess
import sys
import tempfile
from pathlib import Path
import numpy as np
import soundfile as sf
STEM_NAMES = ["host", "caller", "music", "sfx", "ads"]
def load_stems(stems_dir: Path) -> tuple[dict[str, np.ndarray], int]:
    """Load the five stem WAVs from *stems_dir*, padded to a common length.

    Missing stems are replaced with silence; stereo files are downmixed to
    mono so downstream processing can assume 1-D arrays. Exits the process
    when no stem at all is found. Mismatched sample rates are only warned
    about — the first stem's rate is returned as the reference.

    Returns (stems dict keyed by STEM_NAMES, reference sample rate).
    """
    stems = {}
    sample_rate = None
    for name in STEM_NAMES:
        path = stems_dir / f"{name}.wav"
        if not path.exists():
            print(f" {name}.wav not found, creating empty stem")
            stems[name] = None
            continue
        data, sr = sf.read(str(path), dtype="float32")
        # Downmix multi-channel recordings so every stem is a mono 1-D array;
        # sf.read returns (frames, channels) for anything non-mono.
        if data.ndim > 1:
            data = data.mean(axis=1).astype(np.float32)
        if sample_rate is None:
            sample_rate = sr
        elif sr != sample_rate:
            print(f" WARNING: {name}.wav has sample rate {sr}, expected {sample_rate}")
        stems[name] = data
        print(f" {name}: {len(data)} samples ({len(data)/sr:.1f}s)")
    if sample_rate is None:
        print("ERROR: No valid stems found")
        sys.exit(1)
    # Pad all stems to same length (missing stems become pure silence)
    max_len = max(len(s) for s in stems.values() if s is not None)
    for name in STEM_NAMES:
        if stems[name] is None:
            stems[name] = np.zeros(max_len, dtype=np.float32)
        elif len(stems[name]) < max_len:
            stems[name] = np.pad(stems[name], (0, max_len - len(stems[name])))
    return stems, sample_rate
def compute_rms(audio: np.ndarray, window_samples: int) -> np.ndarray:
    """Return the RMS level of each full *window_samples*-sized window.

    Trailing samples that don't fill a complete window are discarded.
    If the signal is shorter than one window, a single-element zero
    array is returned.
    """
    full_windows, remainder = divmod(len(audio), window_samples)
    if full_windows == 0:
        return np.array([0.0])
    usable = audio[:len(audio) - remainder] if remainder else audio
    frames = usable.reshape(full_windows, window_samples)
    return np.sqrt((frames ** 2).mean(axis=1))
def remove_gaps(stems: dict[str, np.ndarray], sr: int,
                threshold_s: float = 1.5, crossfade_ms: float = 30) -> dict[str, np.ndarray]:
    """Cut long silences out of the dialog-driven stems.

    Silence is detected on the summed host+caller dialog using 50 ms RMS
    windows against an adaptive threshold. Silent runs of at least
    *threshold_s* seconds are removed from host, caller, sfx and ads, with
    a short fade at each join. The music stem is NOT cut — it is trimmed to
    the new duration and given a 2 s fade-out instead.

    Returns a new stems dict; the input dict is returned unchanged when no
    gap qualifies for removal.
    """
    window_ms = 50
    window_samples = int(sr * window_ms / 1000)
    crossfade_samples = int(sr * crossfade_ms / 1000)
    # Silence detection runs on the combined dialog, not individual stems.
    dialog = stems["host"] + stems["caller"]
    rms = compute_rms(dialog, window_samples)
    # Threshold: -60dB or adaptive based on mean RMS
    mean_rms = np.mean(rms[rms > 0]) if np.any(rms > 0) else 1e-4
    silence_thresh = min(mean_rms * 0.05, 0.001)
    # Find silent regions
    is_silent = rms < silence_thresh
    min_silent_windows = int(threshold_s / (window_ms / 1000))
    # Build list of regions to cut (in samples)
    cuts = []
    i = 0
    while i < len(is_silent):
        if is_silent[i]:
            start = i
            while i < len(is_silent) and is_silent[i]:
                i += 1
            length = i - start
            if length >= min_silent_windows:
                # Keep a small buffer at edges
                # (one window kept on each side so speech onsets survive)
                cut_start = (start + 1) * window_samples
                cut_end = (i - 1) * window_samples
                if cut_end > cut_start + crossfade_samples * 2:
                    cuts.append((cut_start, cut_end))
        else:
            i += 1
    if not cuts:
        print(" No gaps to remove")
        return stems
    total_cut = sum(end - start for start, end in cuts) / sr
    print(f" Removing {len(cuts)} gaps ({total_cut:.1f}s total)")
    # Apply cuts to dialog stems (host, caller, sfx, ads) — not music
    cut_stems = ["host", "caller", "sfx", "ads"]
    result = {}
    for name in cut_stems:
        audio = stems[name]
        pieces = []
        prev_end = 0
        for cut_start, cut_end in cuts:
            if prev_end < cut_start:
                piece = audio[prev_end:cut_start].copy()
                # Apply crossfade at join point
                # NOTE(review): this fades the new piece in and the previous
                # piece's tail out, then concatenates — an abutted fade, not
                # an overlap-add crossfade, so the join dips briefly to silence.
                if pieces and len(piece) > crossfade_samples:
                    fade_in = np.linspace(0, 1, crossfade_samples, dtype=np.float32)
                    piece[:crossfade_samples] *= fade_in
                if len(pieces) > 0 and len(pieces[-1]) > crossfade_samples:
                    fade_out = np.linspace(1, 0, crossfade_samples, dtype=np.float32)
                    pieces[-1][-crossfade_samples:] *= fade_out
                pieces.append(piece)
            prev_end = cut_end
        # Keep whatever follows the final cut, faded in the same way.
        if prev_end < len(audio):
            piece = audio[prev_end:].copy()
            if pieces and len(piece) > crossfade_samples:
                fade_in = np.linspace(0, 1, crossfade_samples, dtype=np.float32)
                piece[:crossfade_samples] *= fade_in
            if len(pieces) > 0 and len(pieces[-1]) > crossfade_samples:
                fade_out = np.linspace(1, 0, crossfade_samples, dtype=np.float32)
                pieces[-1][-crossfade_samples:] *= fade_out
            pieces.append(piece)
        result[name] = np.concatenate(pieces) if pieces else np.array([], dtype=np.float32)
    # Trim music to match new duration, with fade-out at end
    new_len = len(result["host"])
    music = stems["music"][:new_len].copy() if len(stems["music"]) >= new_len else np.pad(stems["music"], (0, max(0, new_len - len(stems["music"]))))
    fade_samples = int(sr * 2)  # 2s fade out
    if len(music) > fade_samples:
        fade_out = np.linspace(1, 0, fade_samples, dtype=np.float32)
        music[-fade_samples:] *= fade_out
    result["music"] = music
    return result
def compress_voice(audio: np.ndarray, sr: int, tmp_dir: Path,
                   stem_name: str) -> np.ndarray:
    """Run a voice stem through ffmpeg's acompressor filter.

    Round-trips the audio through temporary WAV files in *tmp_dir*. If
    ffmpeg fails for any reason, the original audio is returned unchanged.
    """
    source_wav = tmp_dir / f"{stem_name}_pre_comp.wav"
    target_wav = tmp_dir / f"{stem_name}_post_comp.wav"
    sf.write(str(source_wav), audio, sr)
    proc = subprocess.run(
        [
            "ffmpeg", "-y", "-i", str(source_wav),
            "-af", "acompressor=threshold=-24dB:ratio=3:attack=5:release=100:makeup=6dB",
            str(target_wav),
        ],
        capture_output=True,
        text=True,
    )
    if proc.returncode != 0:
        print(f" WARNING: compression failed for {stem_name}: {proc.stderr[:200]}")
        return audio
    compressed, _ = sf.read(str(target_wav), dtype="float32")
    return compressed
def apply_ducking(music: np.ndarray, dialog: np.ndarray, sr: int,
                  duck_db: float = -12, attack_ms: float = 200,
                  release_ms: float = 500) -> np.ndarray:
    """Lower the music bed while speech is present in the dialog track.

    Speech is detected per 50 ms window via RMS against an adaptive
    threshold. A per-window gain target (duck_db during speech, unity
    otherwise) is smoothed with one-pole attack/release filters, expanded
    to sample level, and multiplied into the music.
    """
    window_ms = 50
    window_samples = int(sr * window_ms / 1000)
    rms = compute_rms(dialog, window_samples)
    # Speech threshold: 10% of the mean non-zero window RMS.
    mean_rms = np.mean(rms[rms > 0]) if np.any(rms > 0) else 1e-4
    speech_thresh = mean_rms * 0.1
    # Per-window gain target: ducked while speech is detected.
    duck_gain = 10 ** (duck_db / 20)
    is_speech = rms > speech_thresh
    target_gain = np.where(is_speech, duck_gain, 1.0).astype(np.float32)
    # One-pole smoothing: gain drops use the attack constant, rises use release.
    attack_windows = max(1, int(attack_ms / window_ms))
    release_windows = max(1, int(release_ms / window_ms))
    smoothed = np.ones_like(target_gain)
    for idx in range(1, len(target_gain)):
        falling = target_gain[idx] < smoothed[idx - 1]
        alpha = 1.0 / (attack_windows if falling else release_windows)
        smoothed[idx] = smoothed[idx - 1] + alpha * (target_gain[idx] - smoothed[idx - 1])
    # Expand the window-level envelope to per-sample gain matching music length;
    # any uncovered tail stays at unity gain.
    gain_samples = np.repeat(smoothed, window_samples)
    if len(gain_samples) < len(music):
        gain_samples = np.pad(gain_samples, (0, len(music) - len(gain_samples)), constant_values=1.0)
    else:
        gain_samples = gain_samples[:len(music)]
    return music * gain_samples
def mix_stems(stems: dict[str, np.ndarray],
              levels: dict[str, float] | None = None) -> np.ndarray:
    """Sum the five stems into a hard-clipped stereo mix.

    *levels* maps stem name -> gain in dB (defaults: music -6, sfx -3,
    voices and ads at unity). Stems shorter than the longest are
    zero-padded. Returns float32 stereo of shape (N, 2) with the mono mix
    duplicated into both channels.
    """
    if levels is None:
        levels = {"host": 0, "caller": 0, "music": -6, "sfx": -3, "ads": 0}
    gains = {name: 10 ** (db / 20) for name, db in levels.items()}
    # Mix length is the longest stem.
    target_len = max(len(s) for s in stems.values())
    # Accumulate in float64 to limit rounding error while summing.
    mono = np.zeros(target_len, dtype=np.float64)
    for name in STEM_NAMES:
        track = stems[name]
        if len(track) < target_len:
            track = np.pad(track, (0, target_len - len(track)))
        mono += track.astype(np.float64) * gains.get(name, 1.0)
    # Hard-clip to [-1, 1] and duplicate mono into left and right.
    clipped = np.clip(mono, -1.0, 1.0).astype(np.float32)
    return np.column_stack([clipped, clipped])
def normalize_and_export(audio: np.ndarray, sr: int, output_path: Path,
                         target_lufs: float = -16, bitrate: str = "128k",
                         tmp_dir: Path | None = None):
    """Two-pass EBU R128 loudness-normalize *audio* and export it as MP3.

    Pass 1 measures loudness with ffmpeg's loudnorm filter (stats are
    printed as JSON on stderr); pass 2 applies linear normalization using
    the measured values, followed by a -1 dB limiter, and encodes to
    *output_path* at *bitrate*. Exits the process if the export fails.

    If *tmp_dir* is omitted, a fresh scratch directory is created (the
    original raised TypeError on the Path join when tmp_dir was None).
    """
    import json

    if tmp_dir is None:
        tmp_dir = Path(tempfile.mkdtemp(prefix="postprod_"))
    tmp_wav = tmp_dir / "pre_loudnorm.wav"
    sf.write(str(tmp_wav), audio, sr)
    # Pass 1: measure loudness
    measure_cmd = [
        "ffmpeg", "-y", "-i", str(tmp_wav),
        "-af", f"loudnorm=I={target_lufs}:TP=-1:LRA=11:print_format=json",
        "-f", "null", "-",
    ]
    result = subprocess.run(measure_cmd, capture_output=True, text=True)
    stderr = result.stderr
    # Parse loudnorm output: the stats are the last JSON object in stderr.
    json_start = stderr.rfind("{")
    json_end = stderr.rfind("}") + 1
    if json_start >= 0 and json_end > json_start:
        stats = json.loads(stderr[json_start:json_end])
    else:
        # Fall back to nominal values so pass 2 still produces output.
        print(" WARNING: couldn't parse loudnorm stats, using defaults")
        stats = {
            "input_i": "-23", "input_tp": "-1", "input_lra": "11",
            "input_thresh": "-34",
        }
    # Pass 2: apply normalization + limiter + export MP3
    loudnorm_filter = (
        f"loudnorm=I={target_lufs}:TP=-1:LRA=11"
        f":measured_I={stats['input_i']}"
        f":measured_TP={stats['input_tp']}"
        f":measured_LRA={stats['input_lra']}"
        f":measured_thresh={stats['input_thresh']}"
        f":linear=true"
    )
    export_cmd = [
        "ffmpeg", "-y", "-i", str(tmp_wav),
        "-af", f"{loudnorm_filter},alimiter=limit=-1dB:level=false",
        "-ab", bitrate, "-ar", str(sr),
        str(output_path),
    ]
    result = subprocess.run(export_cmd, capture_output=True, text=True)
    if result.returncode != 0:
        print(f" ERROR: export failed: {result.stderr[:300]}")
        sys.exit(1)
def _build_parser():
    """Construct the argparse parser for the post-production CLI."""
    parser = argparse.ArgumentParser(description="Post-production for AI podcast stems")
    parser.add_argument("stems_dir", type=Path, help="Directory containing stem WAV files")
    parser.add_argument("-o", "--output", type=str, default="episode.mp3", help="Output filename")
    parser.add_argument("--gap-threshold", type=float, default=1.5, help="Min silence to cut (seconds)")
    parser.add_argument("--duck-amount", type=float, default=-12, help="Music duck in dB")
    parser.add_argument("--target-lufs", type=float, default=-16, help="Target loudness (LUFS)")
    parser.add_argument("--bitrate", type=str, default="128k", help="MP3 bitrate")
    parser.add_argument("--no-gap-removal", action="store_true", help="Skip gap removal")
    parser.add_argument("--no-compression", action="store_true", help="Skip voice compression")
    parser.add_argument("--no-ducking", action="store_true", help="Skip music ducking")
    parser.add_argument("--dry-run", action="store_true", help="Show what would be done")
    return parser


def main():
    """CLI entry point: run the six-step stem-to-MP3 post-production pipeline."""
    args = _build_parser().parse_args()

    src_dir = args.stems_dir
    if not src_dir.exists():
        print(f"ERROR: directory not found: {src_dir}")
        sys.exit(1)

    # A relative output filename lands next to the stems.
    out = Path(args.output)
    if not out.is_absolute():
        out = src_dir / out

    # Summarize the configuration before doing any work.
    gap_desc = "skip" if args.no_gap_removal else f"threshold={args.gap_threshold}s"
    duck_desc = "skip" if args.no_ducking else f"{args.duck_amount}dB"
    print(f"Post-production: {src_dir} -> {out}")
    print(f" Gap removal: {gap_desc}")
    print(f" Compression: {'skip' if args.no_compression else 'on'}")
    print(f" Ducking: {duck_desc}")
    print(f" Loudness: {args.target_lufs} LUFS, bitrate: {args.bitrate}")
    if args.dry_run:
        print("Dry run — exiting")
        return

    # Step 1: load the stem WAVs.
    print("\n[1/6] Loading stems...")
    stems, sr = load_stems(src_dir)

    # Step 2: cut long silences across all stems in lockstep.
    print("\n[2/6] Gap removal...")
    if args.no_gap_removal:
        print(" Skipped")
    else:
        stems = remove_gaps(stems, sr, threshold_s=args.gap_threshold)

    # Step 3: compress the two voice stems (skipping all-silent ones).
    print("\n[3/6] Voice compression...")
    if args.no_compression:
        print(" Skipped")
    else:
        with tempfile.TemporaryDirectory() as scratch:
            scratch_dir = Path(scratch)
            for voice in ["host", "caller"]:
                if np.any(stems[voice] != 0):
                    print(f" Compressing {voice}...")
                    stems[voice] = compress_voice(stems[voice], sr, scratch_dir, voice)

    # Step 4: duck music under dialog (only when both are non-silent).
    print("\n[4/6] Music ducking...")
    if args.no_ducking:
        print(" Skipped")
    else:
        dialog = stems["host"] + stems["caller"]
        if np.any(dialog != 0) and np.any(stems["music"] != 0):
            stems["music"] = apply_ducking(stems["music"], dialog, sr, duck_db=args.duck_amount)
            print(" Applied")
        else:
            print(" No dialog or music to duck")

    # Step 5: sum stems to a stereo mix.
    print("\n[5/6] Mixing...")
    stereo = mix_stems(stems)
    print(f" Mixed to stereo: {len(stereo)} samples ({len(stereo)/sr:.1f}s)")

    # Step 6: two-pass loudness normalization and MP3 export.
    print("\n[6/6] Loudness normalization + export...")
    with tempfile.TemporaryDirectory() as scratch:
        normalize_and_export(stereo, sr, out,
                             target_lufs=args.target_lufs,
                             bitrate=args.bitrate,
                             tmp_dir=Path(scratch))
    print(f"\nDone! Output: {out}")


if __name__ == "__main__":
    main()

View File

@@ -18,9 +18,34 @@ import sys
import base64
from pathlib import Path
import ssl
import requests
import urllib3
from requests.adapters import HTTPAdapter
from urllib3.util.ssl_ import create_urllib3_context
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
from dotenv import load_dotenv
class TLSAdapter(HTTPAdapter):
"""Adapter to handle servers with older TLS configurations."""
def init_poolmanager(self, *args, **kwargs):
ctx = create_urllib3_context()
ctx.set_ciphers('DEFAULT@SECLEVEL=1')
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE
kwargs['ssl_context'] = ctx
return super().init_poolmanager(*args, **kwargs)
def send(self, *args, **kwargs):
kwargs['verify'] = False
return super().send(*args, **kwargs)
# Use a session with TLS compatibility for all Castopod requests
_session = requests.Session()
_session.mount('https://', TLSAdapter())
# Load environment variables
load_dotenv(Path(__file__).parent / ".env")
@@ -156,41 +181,45 @@ Respond with ONLY valid JSON, no markdown or explanation."""
def create_episode(audio_path: str, metadata: dict, episode_number: int) -> dict:
"""Create episode on Castopod."""
"""Create episode on Castopod using curl (handles large file uploads better)."""
print("[3/5] Creating episode on Castopod...")
headers = get_auth_header()
credentials = base64.b64encode(
f"{CASTOPOD_USERNAME}:{CASTOPOD_PASSWORD}".encode()
).decode()
slug = re.sub(r'[^a-z0-9]+', '-', metadata["title"].lower()).strip('-')
# Upload audio and create episode
with open(audio_path, "rb") as f:
files = {
"audio_file": (Path(audio_path).name, f, "audio/mpeg")
}
data = {
"title": metadata["title"],
"slug": slug,
"description": metadata["description"],
"parental_advisory": "explicit",
"type": "full",
"podcast_id": str(PODCAST_ID),
"created_by": "1",
"updated_by": "1",
"episode_number": str(episode_number),
}
response = requests.post(
cmd = [
"curl", "-sk", "-X", "POST",
f"{CASTOPOD_URL}/api/rest/v1/episodes",
headers=headers,
files=files,
data=data
)
"-H", f"Authorization: Basic {credentials}",
"-F", f"audio_file=@{audio_path};type=audio/mpeg",
"-F", f"title={metadata['title']}",
"-F", f"slug={slug}",
"-F", f"description={metadata['description']}",
"-F", "parental_advisory=explicit",
"-F", "type=full",
"-F", f"podcast_id={PODCAST_ID}",
"-F", "created_by=1",
"-F", "updated_by=1",
"-F", f"episode_number={episode_number}",
]
if response.status_code not in (200, 201):
print(f"Error creating episode: {response.status_code} {response.text}")
result = subprocess.run(cmd, capture_output=True, text=True, timeout=300)
if result.returncode != 0:
print(f"Error uploading: {result.stderr}")
sys.exit(1)
try:
episode = json.loads(result.stdout)
except json.JSONDecodeError:
print(f"Error parsing response: {result.stdout[:500]}")
sys.exit(1)
if "id" not in episode:
print(f"Error creating episode: {result.stdout[:500]}")
sys.exit(1)
episode = response.json()
print(f" Created episode ID: {episode['id']}")
print(f" Slug: {episode['slug']}")
@@ -203,13 +232,13 @@ def publish_episode(episode_id: int) -> dict:
headers = get_auth_header()
response = requests.post(
response = _session.post(
f"{CASTOPOD_URL}/api/rest/v1/episodes/{episode_id}/publish",
headers=headers,
data={
"publication_method": "now",
"created_by": "1"
}
},
)
if response.status_code != 200:
@@ -316,9 +345,9 @@ def get_next_episode_number() -> int:
"""Get the next episode number from Castopod."""
headers = get_auth_header()
response = requests.get(
response = _session.get(
f"{CASTOPOD_URL}/api/rest/v1/episodes",
headers=headers
headers=headers,
)
if response.status_code != 200:

View File

@@ -100,7 +100,7 @@
<div class="hiw-step-number">1</div>
<div class="hiw-step-content">
<h3>A Person Is Born</h3>
<p>Every caller starts as a blank slate. The system generates a complete identity: name, age, job, hometown, and personality. Each caller gets a unique speaking style — some ramble, some are blunt, some deflect with humor. They have relationships, vehicles, opinions, memories, and reasons for being up this late.</p>
<p>Every caller starts as a blank slate. The system generates a complete identity: name, age, job, hometown, and personality. Each caller gets a unique speaking style — some ramble, some are blunt, some deflect with humor. They have relationships, vehicles, strong food opinions, nostalgic memories, and reasons for being up this late. They know what they were watching on TV, what errand they ran today, and what song was on the radio before they called.</p>
<div class="hiw-detail-grid">
<div class="hiw-detail">
<span class="hiw-detail-label">Unique Names</span>
@@ -108,7 +108,7 @@
</div>
<div class="hiw-detail">
<span class="hiw-detail-label">Personality Layers</span>
<span class="hiw-detail-value">20+</span>
<span class="hiw-detail-value">30+</span>
</div>
<div class="hiw-detail">
<span class="hiw-detail-label">Towns with Real Knowledge</span>
@@ -126,7 +126,7 @@
<div class="hiw-step-number">2</div>
<div class="hiw-step-content">
<h3>They Know Their World</h3>
<p>Callers know real facts about where they live — the restaurants, the highways, the local gossip. When a caller says they're from Lordsburg, they actually know about the Hidalgo Hotel and the drive to Deming. The system pulls in real-time news so callers can reference things that actually happened today.</p>
<p>Callers know real facts about where they live — the restaurants, the highways, the local gossip. When a caller says they're from Lordsburg, they actually know about the Shakespeare ghost town and the drive to Deming. They know the current weather outside their window, what day of the week it is, whether it's monsoon season or chile harvest. They have strong opinions about where to get the best green chile and get nostalgic about how their town used to be. The system also pulls in real-time news so callers can reference things that actually happened today.</p>
</div>
</div>
@@ -152,7 +152,7 @@
<div class="hiw-step-number">4</div>
<div class="hiw-step-content">
<h3>The Conversation Is Real</h3>
<p>Luke talks to each caller using push-to-talk, just like a real radio show. His voice is transcribed in real time, sent to an AI that responds in character, and then converted to speech using a voice engine — all in a few seconds. The AI doesn't just answer questions; it reacts, gets emotional, goes on tangents, and remembers what was said earlier in the show.</p>
<p>Luke talks to each caller using push-to-talk, just like a real radio show. His voice is transcribed in real time, sent to an AI that responds in character, and then converted to speech using a voice engine — all in a few seconds. The AI doesn't just answer questions; it reacts, gets emotional, goes on tangents, and remembers what was said earlier in the show. Callers even react to previous callers — "Hey Luke, I heard that guy Tony earlier and I got to say, he's full of it." It makes the show feel like a living community, not isolated calls.</p>
</div>
</div>
@@ -207,7 +207,14 @@
<svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2"><circle cx="12" cy="12" r="10"/><polyline points="12 6 12 12 16 14"/></svg>
</div>
<h3>Real Time</h3>
<p>Everything happens live. Caller generation, voice synthesis, news lookups, phone routing — all in real time during the show. There's no post-production trickery on the caller side. What you hear is what happened.</p>
<p>Everything happens live. Caller generation, voice synthesis, news lookups, weather checks, phone routing — all in real time during the show. There's no post-production trickery on the caller side. What you hear is what happened.</p>
</div>
<div class="hiw-feature">
<div class="hiw-feature-icon">
<svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2"><path d="M17 21v-2a4 4 0 0 0-4-4H5a4 4 0 0 0-4 4v2"/><circle cx="9" cy="7" r="4"/><path d="M23 21v-2a4 4 0 0 0-3-3.87"/><path d="M16 3.13a4 4 0 0 1 0 7.75"/></svg>
</div>
<h3>They Listen to Each Other</h3>
<p>Callers aren't isolated — they hear what happened earlier in the show. A caller might disagree with the last guy, back someone up, or call in specifically because of something another caller said. The show builds on itself.</p>
</div>
</div>
</section>