Postprod improvements: denoise, phone EQ, ad muting, ducking, voice mappings

- Add host mic noise reduction (afftdn + anlmdn)
- Add phone EQ bandpass on caller stem
- Mute music during ads with 2s lookahead/tail
- Increase ducking release to 3s to reduce pumping
- Add Inworld voice mappings for all regular callers
- Recording toggle endpoint, stem sync fixes

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-12 03:59:08 -07:00
parent 75f15ba2d2
commit 95c2d06435
6 changed files with 216 additions and 96 deletions

View File

@@ -61,23 +61,30 @@ def compute_rms(audio: np.ndarray, window_samples: int) -> np.ndarray:
def remove_gaps(stems: dict[str, np.ndarray], sr: int,
threshold_s: float = 1.5, crossfade_ms: float = 30) -> dict[str, np.ndarray]:
threshold_s: float = 2.0, max_gap_s: float = 8.0,
crossfade_ms: float = 30, pad_s: float = 0.5) -> dict[str, np.ndarray]:
window_ms = 50
window_samples = int(sr * window_ms / 1000)
crossfade_samples = int(sr * crossfade_ms / 1000)
dialog = stems["host"] + stems["caller"]
rms = compute_rms(dialog, window_samples)
# Detect gaps in everything except music (which always plays).
# This catches TTS latency gaps while protecting ad breaks and SFX transitions.
content = stems["host"] + stems["caller"] + stems["sfx"] + stems["ads"]
rms = compute_rms(content, window_samples)
# Threshold: -60dB or adaptive based on mean RMS
mean_rms = np.mean(rms[rms > 0]) if np.any(rms > 0) else 1e-4
silence_thresh = min(mean_rms * 0.05, 0.001)
# Threshold: percentile-based to sit above the mic noise floor
nonzero_rms = rms[rms > 0]
if len(nonzero_rms) == 0:
print(" No audio detected")
return stems
noise_floor = np.percentile(nonzero_rms, 20)
silence_thresh = noise_floor * 3
# Find silent regions
is_silent = rms < silence_thresh
min_silent_windows = int(threshold_s / (window_ms / 1000))
max_silent_windows = int(max_gap_s / (window_ms / 1000))
# Build list of regions to cut (in samples)
# Only cut gaps between 1.5-8s — targets TTS latency, not long breaks
cuts = []
i = 0
while i < len(is_silent):
@@ -86,10 +93,11 @@ def remove_gaps(stems: dict[str, np.ndarray], sr: int,
while i < len(is_silent) and is_silent[i]:
i += 1
length = i - start
if length >= min_silent_windows:
# Keep a small buffer at edges
cut_start = (start + 1) * window_samples
cut_end = (i - 1) * window_samples
if min_silent_windows <= length <= max_silent_windows:
# Leave pad_s of silence so the edit sounds natural
pad_samples = int(pad_s * sr)
cut_start = (start + 1) * window_samples + pad_samples
cut_end = (i - 1) * window_samples - pad_samples
if cut_end > cut_start + crossfade_samples * 2:
cuts.append((cut_start, cut_end))
else:
@@ -102,18 +110,18 @@ def remove_gaps(stems: dict[str, np.ndarray], sr: int,
total_cut = sum(end - start for start, end in cuts) / sr
print(f" Removing {len(cuts)} gaps ({total_cut:.1f}s total)")
# Apply cuts to dialog stems (host, caller, sfx, ads) — not music
cut_stems = ["host", "caller", "sfx", "ads"]
# Cut dialog/sfx/ads at gap points. Leave music uncut — just trim to fit.
result = {}
for name in cut_stems:
for name in STEM_NAMES:
if name == "music":
continue # handled below
audio = stems[name]
pieces = []
prev_end = 0
for cut_start, cut_end in cuts:
if prev_end < cut_start:
piece = audio[prev_end:cut_start].copy()
# Apply crossfade at join point
if pieces and len(piece) > crossfade_samples:
fade_in = np.linspace(0, 1, crossfade_samples, dtype=np.float32)
piece[:crossfade_samples] *= fade_in
@@ -135,18 +143,49 @@ def remove_gaps(stems: dict[str, np.ndarray], sr: int,
result[name] = np.concatenate(pieces) if pieces else np.array([], dtype=np.float32)
# Trim music to match new duration, with fade-out at end
# Music: leave uncut, just trim to match new duration with fade-out
new_len = len(result["host"])
music = stems["music"][:new_len].copy() if len(stems["music"]) >= new_len else np.pad(stems["music"], (0, max(0, new_len - len(stems["music"]))))
fade_samples = int(sr * 2) # 2s fade out
music = stems["music"]
if len(music) >= new_len:
music = music[:new_len].copy()
else:
music = np.pad(music, (0, new_len - len(music)))
fade_samples = int(sr * 3)
if len(music) > fade_samples:
fade_out = np.linspace(1, 0, fade_samples, dtype=np.float32)
music[-fade_samples:] *= fade_out
music[-fade_samples:] *= np.linspace(1, 0, fade_samples, dtype=np.float32)
result["music"] = music
return result
def denoise(audio: np.ndarray, sr: int, tmp_dir: Path) -> np.ndarray:
"""High-quality noise reduction using ffmpeg afftdn (adaptive Wiener filter)."""
in_path = tmp_dir / "host_pre_denoise.wav"
out_path = tmp_dir / "host_post_denoise.wav"
sf.write(str(in_path), audio, sr)
# afftdn: adaptive FFT denoiser with Wiener filter
# nt=w - Wiener filter (best quality)
# om=o - output cleaned signal
# nr=10 - noise reduction in dB (10 = moderate, preserves voice naturalness)
# nf=-30 - noise floor estimate in dB
# anlmdn: non-local means denoiser for residual broadband noise
# s=4 - patch size
# p=0.002 - strength (gentle to avoid artifacts)
af = (
"afftdn=nt=w:om=o:nr=12:nf=-30,"
"anlmdn=s=4:p=0.002"
)
cmd = ["ffmpeg", "-y", "-i", str(in_path), "-af", af, str(out_path)]
result = subprocess.run(cmd, capture_output=True, text=True)
if result.returncode != 0:
print(f" WARNING: denoise failed: {result.stderr[:200]}")
return audio
denoised, _ = sf.read(str(out_path), dtype="float32")
return denoised
def compress_voice(audio: np.ndarray, sr: int, tmp_dir: Path,
stem_name: str) -> np.ndarray:
in_path = tmp_dir / f"{stem_name}_pre_comp.wav"
@@ -156,7 +195,7 @@ def compress_voice(audio: np.ndarray, sr: int, tmp_dir: Path,
cmd = [
"ffmpeg", "-y", "-i", str(in_path),
"-af", "acompressor=threshold=-24dB:ratio=3:attack=5:release=100:makeup=6dB",
"-af", "acompressor=threshold=-24dB:ratio=2.5:attack=10:release=800:makeup=6dB",
str(out_path),
]
result = subprocess.run(cmd, capture_output=True, text=True)
@@ -168,9 +207,32 @@ def compress_voice(audio: np.ndarray, sr: int, tmp_dir: Path,
return compressed
def phone_eq(audio: np.ndarray, sr: int, tmp_dir: Path) -> np.ndarray:
"""Apply telephone EQ to make caller sound like a phone call."""
in_path = tmp_dir / "caller_pre_phone.wav"
out_path = tmp_dir / "caller_post_phone.wav"
sf.write(str(in_path), audio, sr)
# Bandpass 300-3400Hz (telephone bandwidth) + slight mid boost for presence
af = (
"highpass=f=300:poles=2,"
"lowpass=f=3400:poles=2,"
"equalizer=f=1000:t=q:w=0.8:g=4"
)
cmd = ["ffmpeg", "-y", "-i", str(in_path), "-af", af, str(out_path)]
result = subprocess.run(cmd, capture_output=True, text=True)
if result.returncode != 0:
print(f" WARNING: phone EQ failed: {result.stderr[:200]}")
return audio
filtered, _ = sf.read(str(out_path), dtype="float32")
return filtered
def apply_ducking(music: np.ndarray, dialog: np.ndarray, sr: int,
duck_db: float = -12, attack_ms: float = 200,
release_ms: float = 500) -> np.ndarray:
duck_db: float = -20, attack_ms: float = 200,
release_ms: float = 3000,
mute_signal: np.ndarray | None = None) -> np.ndarray:
window_ms = 50
window_samples = int(sr * window_ms / 1000)
rms = compute_rms(dialog, window_samples)
@@ -184,6 +246,22 @@ def apply_ducking(music: np.ndarray, dialog: np.ndarray, sr: int,
is_speech = rms > speech_thresh
target_gain = np.where(is_speech, duck_gain, 1.0).astype(np.float32)
# Mute music completely during ads with lookahead and tail
if mute_signal is not None:
mute_rms = compute_rms(mute_signal, window_samples)
mute_thresh = np.mean(mute_rms[mute_rms > 0]) * 0.1 if np.any(mute_rms > 0) else 1e-4
is_ads = mute_rms > mute_thresh
# Expand ad regions: 2s before (fade out music before ad) and 2s after (don't resume immediately)
lookahead_windows = int(2000 / window_ms)
tail_windows = int(2000 / window_ms)
expanded_ads = is_ads.copy()
for i in range(len(is_ads)):
if is_ads[i]:
start = max(0, i - lookahead_windows)
end = min(len(expanded_ads), i + tail_windows + 1)
expanded_ads[start:end] = True
target_gain[expanded_ads] = 0.0
# Smooth the envelope
attack_windows = max(1, int(attack_ms / window_ms))
release_windows = max(1, int(release_ms / window_ms))
@@ -206,10 +284,30 @@ def apply_ducking(music: np.ndarray, dialog: np.ndarray, sr: int,
return music * gain_samples
def match_voice_levels(stems: dict[str, np.ndarray], target_rms: float = 0.1) -> dict[str, np.ndarray]:
"""Normalize host, caller, and ads stems to the same RMS level."""
for name in ["host", "caller", "ads"]:
audio = stems[name]
# Only measure non-silent portions
active = audio[np.abs(audio) > 0.001]
if len(active) == 0:
continue
current_rms = np.sqrt(np.mean(active ** 2))
if current_rms < 1e-6:
continue
gain = target_rms / current_rms
# Clamp gain to avoid extreme boosts on very quiet stems
gain = min(gain, 10.0)
stems[name] = np.clip(audio * gain, -1.0, 1.0).astype(np.float32)
db_change = 20 * np.log10(gain) if gain > 0 else 0
print(f" {name}: RMS {current_rms:.4f} -> {target_rms:.4f} ({db_change:+.1f}dB)")
return stems
def mix_stems(stems: dict[str, np.ndarray],
levels: dict[str, float] | None = None) -> np.ndarray:
if levels is None:
levels = {"host": 0, "caller": 0, "music": -6, "sfx": -3, "ads": 0}
levels = {"host": 0, "caller": 0, "music": -6, "sfx": -6, "ads": 0}
gains = {name: 10 ** (db / 20) for name, db in levels.items()}
@@ -282,8 +380,8 @@ def main():
parser = argparse.ArgumentParser(description="Post-production for AI podcast stems")
parser.add_argument("stems_dir", type=Path, help="Directory containing stem WAV files")
parser.add_argument("-o", "--output", type=str, default="episode.mp3", help="Output filename")
parser.add_argument("--gap-threshold", type=float, default=1.5, help="Min silence to cut (seconds)")
parser.add_argument("--duck-amount", type=float, default=-12, help="Music duck in dB")
parser.add_argument("--gap-threshold", type=float, default=2.0, help="Min silence to cut (seconds)")
parser.add_argument("--duck-amount", type=float, default=-20, help="Music duck in dB")
parser.add_argument("--target-lufs", type=float, default=-16, help="Target loudness (LUFS)")
parser.add_argument("--bitrate", type=str, default="128k", help="MP3 bitrate")
parser.add_argument("--no-gap-removal", action="store_true", help="Skip gap removal")
@@ -313,18 +411,27 @@ def main():
return
# Step 1: Load
print("\n[1/6] Loading stems...")
print("\n[1/9] Loading stems...")
stems, sr = load_stems(stems_dir)
# Step 2: Gap removal
print("\n[2/6] Gap removal...")
print("\n[2/9] Gap removal...")
if not args.no_gap_removal:
stems = remove_gaps(stems, sr, threshold_s=args.gap_threshold)
else:
print(" Skipped")
# Step 3: Voice compression
print("\n[3/6] Voice compression...")
# Step 3: Host mic noise reduction
print("\n[3/9] Host mic noise reduction...")
if np.any(stems["host"] != 0):
with tempfile.TemporaryDirectory() as tmp:
stems["host"] = denoise(stems["host"], sr, Path(tmp))
print(" Applied")
else:
print(" No host audio")
# Step 4: Voice compression
print("\n[4/9] Voice compression...")
if not args.no_compression:
with tempfile.TemporaryDirectory() as tmp:
tmp_dir = Path(tmp)
@@ -335,25 +442,39 @@ def main():
else:
print(" Skipped")
# Step 4: Music ducking
print("\n[4/6] Music ducking...")
# Step 5: Phone EQ on caller
print("\n[5/9] Phone EQ on caller...")
if np.any(stems["caller"] != 0):
with tempfile.TemporaryDirectory() as tmp:
stems["caller"] = phone_eq(stems["caller"], sr, Path(tmp))
print(" Applied")
else:
print(" No caller audio")
# Step 6: Match voice levels
print("\n[6/9] Matching voice levels...")
stems = match_voice_levels(stems)
# Step 7: Music ducking
print("\n[7/9] Music ducking...")
if not args.no_ducking:
dialog = stems["host"] + stems["caller"]
if np.any(dialog != 0) and np.any(stems["music"] != 0):
stems["music"] = apply_ducking(stems["music"], dialog, sr, duck_db=args.duck_amount)
stems["music"] = apply_ducking(stems["music"], dialog, sr, duck_db=args.duck_amount,
mute_signal=stems["ads"])
print(" Applied")
else:
print(" No dialog or music to duck")
else:
print(" Skipped")
# Step 5: Mix
print("\n[5/6] Mixing...")
# Step 8: Mix
print("\n[8/9] Mixing...")
stereo = mix_stems(stems)
print(f" Mixed to stereo: {len(stereo)} samples ({len(stereo)/sr:.1f}s)")
# Step 6: Normalize + export
print("\n[6/6] Loudness normalization + export...")
# Step 9: Normalize + export
print("\n[9/9] Loudness normalization + export...")
with tempfile.TemporaryDirectory() as tmp:
normalize_and_export(stereo, sr, output_path,
target_lufs=args.target_lufs,