Postprod improvements: denoise, phone EQ, ad muting, ducking, voice mappings
- Add host mic noise reduction (afftdn + anlmdn)
- Add phone EQ bandpass on caller stem
- Mute music during ads with 2s lookahead/tail
- Increase ducking release to 3s to reduce pumping
- Add Inworld voice mappings for all regular callers
- Add recording toggle endpoint; fix stem sync

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
197
postprod.py
197
postprod.py
@@ -61,23 +61,30 @@ def compute_rms(audio: np.ndarray, window_samples: int) -> np.ndarray:
|
||||
|
||||
|
||||
def remove_gaps(stems: dict[str, np.ndarray], sr: int,
|
||||
threshold_s: float = 1.5, crossfade_ms: float = 30) -> dict[str, np.ndarray]:
|
||||
threshold_s: float = 2.0, max_gap_s: float = 8.0,
|
||||
crossfade_ms: float = 30, pad_s: float = 0.5) -> dict[str, np.ndarray]:
|
||||
window_ms = 50
|
||||
window_samples = int(sr * window_ms / 1000)
|
||||
crossfade_samples = int(sr * crossfade_ms / 1000)
|
||||
|
||||
dialog = stems["host"] + stems["caller"]
|
||||
rms = compute_rms(dialog, window_samples)
|
||||
# Detect gaps in everything except music (which always plays).
|
||||
# This catches TTS latency gaps while protecting ad breaks and SFX transitions.
|
||||
content = stems["host"] + stems["caller"] + stems["sfx"] + stems["ads"]
|
||||
rms = compute_rms(content, window_samples)
|
||||
|
||||
# Threshold: -60dB or adaptive based on mean RMS
|
||||
mean_rms = np.mean(rms[rms > 0]) if np.any(rms > 0) else 1e-4
|
||||
silence_thresh = min(mean_rms * 0.05, 0.001)
|
||||
# Threshold: percentile-based to sit above the mic noise floor
|
||||
nonzero_rms = rms[rms > 0]
|
||||
if len(nonzero_rms) == 0:
|
||||
print(" No audio detected")
|
||||
return stems
|
||||
noise_floor = np.percentile(nonzero_rms, 20)
|
||||
silence_thresh = noise_floor * 3
|
||||
|
||||
# Find silent regions
|
||||
is_silent = rms < silence_thresh
|
||||
min_silent_windows = int(threshold_s / (window_ms / 1000))
|
||||
max_silent_windows = int(max_gap_s / (window_ms / 1000))
|
||||
|
||||
# Build list of regions to cut (in samples)
|
||||
# Only cut gaps between 1.5-8s — targets TTS latency, not long breaks
|
||||
cuts = []
|
||||
i = 0
|
||||
while i < len(is_silent):
|
||||
@@ -86,10 +93,11 @@ def remove_gaps(stems: dict[str, np.ndarray], sr: int,
|
||||
while i < len(is_silent) and is_silent[i]:
|
||||
i += 1
|
||||
length = i - start
|
||||
if length >= min_silent_windows:
|
||||
# Keep a small buffer at edges
|
||||
cut_start = (start + 1) * window_samples
|
||||
cut_end = (i - 1) * window_samples
|
||||
if min_silent_windows <= length <= max_silent_windows:
|
||||
# Leave pad_s of silence so the edit sounds natural
|
||||
pad_samples = int(pad_s * sr)
|
||||
cut_start = (start + 1) * window_samples + pad_samples
|
||||
cut_end = (i - 1) * window_samples - pad_samples
|
||||
if cut_end > cut_start + crossfade_samples * 2:
|
||||
cuts.append((cut_start, cut_end))
|
||||
else:
|
||||
@@ -102,18 +110,18 @@ def remove_gaps(stems: dict[str, np.ndarray], sr: int,
|
||||
total_cut = sum(end - start for start, end in cuts) / sr
|
||||
print(f" Removing {len(cuts)} gaps ({total_cut:.1f}s total)")
|
||||
|
||||
# Apply cuts to dialog stems (host, caller, sfx, ads) — not music
|
||||
cut_stems = ["host", "caller", "sfx", "ads"]
|
||||
# Cut dialog/sfx/ads at gap points. Leave music uncut — just trim to fit.
|
||||
result = {}
|
||||
|
||||
for name in cut_stems:
|
||||
for name in STEM_NAMES:
|
||||
if name == "music":
|
||||
continue # handled below
|
||||
audio = stems[name]
|
||||
pieces = []
|
||||
prev_end = 0
|
||||
for cut_start, cut_end in cuts:
|
||||
if prev_end < cut_start:
|
||||
piece = audio[prev_end:cut_start].copy()
|
||||
# Apply crossfade at join point
|
||||
if pieces and len(piece) > crossfade_samples:
|
||||
fade_in = np.linspace(0, 1, crossfade_samples, dtype=np.float32)
|
||||
piece[:crossfade_samples] *= fade_in
|
||||
@@ -135,18 +143,49 @@ def remove_gaps(stems: dict[str, np.ndarray], sr: int,
|
||||
|
||||
result[name] = np.concatenate(pieces) if pieces else np.array([], dtype=np.float32)
|
||||
|
||||
# Trim music to match new duration, with fade-out at end
|
||||
# Music: leave uncut, just trim to match new duration with fade-out
|
||||
new_len = len(result["host"])
|
||||
music = stems["music"][:new_len].copy() if len(stems["music"]) >= new_len else np.pad(stems["music"], (0, max(0, new_len - len(stems["music"]))))
|
||||
fade_samples = int(sr * 2) # 2s fade out
|
||||
music = stems["music"]
|
||||
if len(music) >= new_len:
|
||||
music = music[:new_len].copy()
|
||||
else:
|
||||
music = np.pad(music, (0, new_len - len(music)))
|
||||
fade_samples = int(sr * 3)
|
||||
if len(music) > fade_samples:
|
||||
fade_out = np.linspace(1, 0, fade_samples, dtype=np.float32)
|
||||
music[-fade_samples:] *= fade_out
|
||||
music[-fade_samples:] *= np.linspace(1, 0, fade_samples, dtype=np.float32)
|
||||
result["music"] = music
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def denoise(audio: np.ndarray, sr: int, tmp_dir: Path) -> np.ndarray:
    """High-quality noise reduction using ffmpeg afftdn (adaptive Wiener filter).

    Writes *audio* to a temporary WAV, runs it through an ffmpeg denoise
    filter chain, and reads the cleaned result back. If ffmpeg fails, the
    original audio is returned unchanged after logging a warning.

    Args:
        audio: Mono float32 samples (host mic stem).
        sr: Sample rate in Hz.
        tmp_dir: Scratch directory for the intermediate WAV files.

    Returns:
        Denoised float32 samples, or the input audio on ffmpeg failure.
    """
    in_path = tmp_dir / "host_pre_denoise.wav"
    out_path = tmp_dir / "host_post_denoise.wav"
    sf.write(str(in_path), audio, sr)

    # afftdn: adaptive FFT denoiser with Wiener filter
    #   nt=w   - Wiener filter (best quality)
    #   om=o   - output cleaned signal
    #   nr=12  - noise reduction in dB (moderate, preserves voice naturalness)
    #   nf=-30 - noise floor estimate in dB
    # anlmdn: non-local means denoiser for residual broadband noise
    #   s=4     - patch size
    #   p=0.002 - strength (gentle to avoid artifacts)
    af = (
        "afftdn=nt=w:om=o:nr=12:nf=-30,"
        "anlmdn=s=4:p=0.002"
    )
    cmd = ["ffmpeg", "-y", "-i", str(in_path), "-af", af, str(out_path)]
    result = subprocess.run(cmd, capture_output=True, text=True)
    if result.returncode != 0:
        print(f"  WARNING: denoise failed: {result.stderr[:200]}")
        return audio

    denoised, _ = sf.read(str(out_path), dtype="float32")
    return denoised
|
||||
|
||||
|
||||
def compress_voice(audio: np.ndarray, sr: int, tmp_dir: Path,
|
||||
stem_name: str) -> np.ndarray:
|
||||
in_path = tmp_dir / f"{stem_name}_pre_comp.wav"
|
||||
@@ -156,7 +195,7 @@ def compress_voice(audio: np.ndarray, sr: int, tmp_dir: Path,
|
||||
|
||||
cmd = [
|
||||
"ffmpeg", "-y", "-i", str(in_path),
|
||||
"-af", "acompressor=threshold=-24dB:ratio=3:attack=5:release=100:makeup=6dB",
|
||||
"-af", "acompressor=threshold=-24dB:ratio=2.5:attack=10:release=800:makeup=6dB",
|
||||
str(out_path),
|
||||
]
|
||||
result = subprocess.run(cmd, capture_output=True, text=True)
|
||||
@@ -168,9 +207,32 @@ def compress_voice(audio: np.ndarray, sr: int, tmp_dir: Path,
|
||||
return compressed
|
||||
|
||||
|
||||
def phone_eq(audio: np.ndarray, sr: int, tmp_dir: Path) -> np.ndarray:
    """Apply telephone EQ to make caller sound like a phone call.

    Band-limits the signal to the 300-3400 Hz telephone passband and adds
    a small 1 kHz presence boost via ffmpeg. On ffmpeg failure the input
    is returned untouched after logging a warning.

    Args:
        audio: Mono float32 samples (caller stem).
        sr: Sample rate in Hz.
        tmp_dir: Scratch directory for the intermediate WAV files.

    Returns:
        EQ-filtered float32 samples, or the input audio on ffmpeg failure.
    """
    src_wav = tmp_dir / "caller_pre_phone.wav"
    dst_wav = tmp_dir / "caller_post_phone.wav"
    sf.write(str(src_wav), audio, sr)

    # Bandpass 300-3400Hz (telephone bandwidth) + slight mid boost for presence
    filter_chain = ",".join([
        "highpass=f=300:poles=2",
        "lowpass=f=3400:poles=2",
        "equalizer=f=1000:t=q:w=0.8:g=4",
    ])
    proc = subprocess.run(
        ["ffmpeg", "-y", "-i", str(src_wav), "-af", filter_chain, str(dst_wav)],
        capture_output=True,
        text=True,
    )
    if proc.returncode != 0:
        print(f"  WARNING: phone EQ failed: {proc.stderr[:200]}")
        return audio

    filtered, _ = sf.read(str(dst_wav), dtype="float32")
    return filtered
|
||||
|
||||
|
||||
def apply_ducking(music: np.ndarray, dialog: np.ndarray, sr: int,
|
||||
duck_db: float = -12, attack_ms: float = 200,
|
||||
release_ms: float = 500) -> np.ndarray:
|
||||
duck_db: float = -20, attack_ms: float = 200,
|
||||
release_ms: float = 3000,
|
||||
mute_signal: np.ndarray | None = None) -> np.ndarray:
|
||||
window_ms = 50
|
||||
window_samples = int(sr * window_ms / 1000)
|
||||
rms = compute_rms(dialog, window_samples)
|
||||
@@ -184,6 +246,22 @@ def apply_ducking(music: np.ndarray, dialog: np.ndarray, sr: int,
|
||||
is_speech = rms > speech_thresh
|
||||
target_gain = np.where(is_speech, duck_gain, 1.0).astype(np.float32)
|
||||
|
||||
# Mute music completely during ads with lookahead and tail
|
||||
if mute_signal is not None:
|
||||
mute_rms = compute_rms(mute_signal, window_samples)
|
||||
mute_thresh = np.mean(mute_rms[mute_rms > 0]) * 0.1 if np.any(mute_rms > 0) else 1e-4
|
||||
is_ads = mute_rms > mute_thresh
|
||||
# Expand ad regions: 2s before (fade out music before ad) and 2s after (don't resume immediately)
|
||||
lookahead_windows = int(2000 / window_ms)
|
||||
tail_windows = int(2000 / window_ms)
|
||||
expanded_ads = is_ads.copy()
|
||||
for i in range(len(is_ads)):
|
||||
if is_ads[i]:
|
||||
start = max(0, i - lookahead_windows)
|
||||
end = min(len(expanded_ads), i + tail_windows + 1)
|
||||
expanded_ads[start:end] = True
|
||||
target_gain[expanded_ads] = 0.0
|
||||
|
||||
# Smooth the envelope
|
||||
attack_windows = max(1, int(attack_ms / window_ms))
|
||||
release_windows = max(1, int(release_ms / window_ms))
|
||||
@@ -206,10 +284,30 @@ def apply_ducking(music: np.ndarray, dialog: np.ndarray, sr: int,
|
||||
return music * gain_samples
|
||||
|
||||
|
||||
def match_voice_levels(stems: dict[str, np.ndarray], target_rms: float = 0.1) -> dict[str, np.ndarray]:
    """Normalize host, caller, and ads stems to the same RMS level.

    RMS is measured over non-silent samples only (|x| > 0.001), the boost
    is capped at 10x (+20 dB) so near-silent stems aren't over-amplified,
    and the scaled audio is clipped to [-1, 1]. Mutates ``stems`` in place
    and returns it.
    """
    for stem_key in ("host", "caller", "ads"):
        samples = stems[stem_key]
        # Measure only voiced material so long pauses don't skew the RMS.
        voiced = samples[np.abs(samples) > 0.001]
        if len(voiced) == 0:
            continue
        measured_rms = np.sqrt(np.mean(voiced ** 2))
        if measured_rms < 1e-6:
            continue
        # Clamp gain to avoid extreme boosts on very quiet stems.
        scale = min(target_rms / measured_rms, 10.0)
        stems[stem_key] = np.clip(samples * scale, -1.0, 1.0).astype(np.float32)
        delta_db = 20 * np.log10(scale) if scale > 0 else 0
        print(f"  {stem_key}: RMS {measured_rms:.4f} -> {target_rms:.4f} ({delta_db:+.1f}dB)")
    return stems
|
||||
|
||||
|
||||
def mix_stems(stems: dict[str, np.ndarray],
|
||||
levels: dict[str, float] | None = None) -> np.ndarray:
|
||||
if levels is None:
|
||||
levels = {"host": 0, "caller": 0, "music": -6, "sfx": -3, "ads": 0}
|
||||
levels = {"host": 0, "caller": 0, "music": -6, "sfx": -6, "ads": 0}
|
||||
|
||||
gains = {name: 10 ** (db / 20) for name, db in levels.items()}
|
||||
|
||||
@@ -282,8 +380,8 @@ def main():
|
||||
parser = argparse.ArgumentParser(description="Post-production for AI podcast stems")
|
||||
parser.add_argument("stems_dir", type=Path, help="Directory containing stem WAV files")
|
||||
parser.add_argument("-o", "--output", type=str, default="episode.mp3", help="Output filename")
|
||||
parser.add_argument("--gap-threshold", type=float, default=1.5, help="Min silence to cut (seconds)")
|
||||
parser.add_argument("--duck-amount", type=float, default=-12, help="Music duck in dB")
|
||||
parser.add_argument("--gap-threshold", type=float, default=2.0, help="Min silence to cut (seconds)")
|
||||
parser.add_argument("--duck-amount", type=float, default=-20, help="Music duck in dB")
|
||||
parser.add_argument("--target-lufs", type=float, default=-16, help="Target loudness (LUFS)")
|
||||
parser.add_argument("--bitrate", type=str, default="128k", help="MP3 bitrate")
|
||||
parser.add_argument("--no-gap-removal", action="store_true", help="Skip gap removal")
|
||||
@@ -313,18 +411,27 @@ def main():
|
||||
return
|
||||
|
||||
# Step 1: Load
|
||||
print("\n[1/6] Loading stems...")
|
||||
print("\n[1/9] Loading stems...")
|
||||
stems, sr = load_stems(stems_dir)
|
||||
|
||||
# Step 2: Gap removal
|
||||
print("\n[2/6] Gap removal...")
|
||||
print("\n[2/9] Gap removal...")
|
||||
if not args.no_gap_removal:
|
||||
stems = remove_gaps(stems, sr, threshold_s=args.gap_threshold)
|
||||
else:
|
||||
print(" Skipped")
|
||||
|
||||
# Step 3: Voice compression
|
||||
print("\n[3/6] Voice compression...")
|
||||
# Step 3: Host mic noise reduction
|
||||
print("\n[3/9] Host mic noise reduction...")
|
||||
if np.any(stems["host"] != 0):
|
||||
with tempfile.TemporaryDirectory() as tmp:
|
||||
stems["host"] = denoise(stems["host"], sr, Path(tmp))
|
||||
print(" Applied")
|
||||
else:
|
||||
print(" No host audio")
|
||||
|
||||
# Step 4: Voice compression
|
||||
print("\n[4/9] Voice compression...")
|
||||
if not args.no_compression:
|
||||
with tempfile.TemporaryDirectory() as tmp:
|
||||
tmp_dir = Path(tmp)
|
||||
@@ -335,25 +442,39 @@ def main():
|
||||
else:
|
||||
print(" Skipped")
|
||||
|
||||
# Step 4: Music ducking
|
||||
print("\n[4/6] Music ducking...")
|
||||
# Step 5: Phone EQ on caller
|
||||
print("\n[5/9] Phone EQ on caller...")
|
||||
if np.any(stems["caller"] != 0):
|
||||
with tempfile.TemporaryDirectory() as tmp:
|
||||
stems["caller"] = phone_eq(stems["caller"], sr, Path(tmp))
|
||||
print(" Applied")
|
||||
else:
|
||||
print(" No caller audio")
|
||||
|
||||
# Step 6: Match voice levels
|
||||
print("\n[6/9] Matching voice levels...")
|
||||
stems = match_voice_levels(stems)
|
||||
|
||||
# Step 7: Music ducking
|
||||
print("\n[7/9] Music ducking...")
|
||||
if not args.no_ducking:
|
||||
dialog = stems["host"] + stems["caller"]
|
||||
if np.any(dialog != 0) and np.any(stems["music"] != 0):
|
||||
stems["music"] = apply_ducking(stems["music"], dialog, sr, duck_db=args.duck_amount)
|
||||
stems["music"] = apply_ducking(stems["music"], dialog, sr, duck_db=args.duck_amount,
|
||||
mute_signal=stems["ads"])
|
||||
print(" Applied")
|
||||
else:
|
||||
print(" No dialog or music to duck")
|
||||
else:
|
||||
print(" Skipped")
|
||||
|
||||
# Step 5: Mix
|
||||
print("\n[5/6] Mixing...")
|
||||
# Step 8: Mix
|
||||
print("\n[8/9] Mixing...")
|
||||
stereo = mix_stems(stems)
|
||||
print(f" Mixed to stereo: {len(stereo)} samples ({len(stereo)/sr:.1f}s)")
|
||||
|
||||
# Step 6: Normalize + export
|
||||
print("\n[6/6] Loudness normalization + export...")
|
||||
# Step 9: Normalize + export
|
||||
print("\n[9/9] Loudness normalization + export...")
|
||||
with tempfile.TemporaryDirectory() as tmp:
|
||||
normalize_and_export(stereo, sr, output_path,
|
||||
target_lufs=args.target_lufs,
|
||||
|
||||
Reference in New Issue
Block a user