Postprod improvements: denoise, phone EQ, ad muting, ducking, voice mappings
- Add host mic noise reduction (afftdn + anlmdn)
- Add phone EQ bandpass on caller stem
- Mute music during ads with 2s lookahead/tail
- Increase ducking release to 3s to reduce pumping
- Add Inworld voice mappings for all regular callers
- Recording toggle endpoint, stem sync fixes

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -2377,8 +2377,8 @@ async def set_on_air(state: dict):
|
|||||||
def _run_postprod():
|
def _run_postprod():
|
||||||
try:
|
try:
|
||||||
result = subprocess.run(
|
result = subprocess.run(
|
||||||
[python, "postprod.py", str(stems_dir), "-o", str(output_file)],
|
[python, "postprod.py", str(stems_dir), "-o", "episode.mp3"],
|
||||||
capture_output=True, text=True, timeout=300,
|
capture_output=True, text=True, timeout=600,
|
||||||
)
|
)
|
||||||
if result.returncode == 0:
|
if result.returncode == 0:
|
||||||
add_log(f"Post-production complete -> {output_file}")
|
add_log(f"Post-production complete -> {output_file}")
|
||||||
@@ -3927,44 +3927,37 @@ async def server_status():
|
|||||||
|
|
||||||
# --- Stem Recording ---
|
# --- Stem Recording ---
|
||||||
|
|
||||||
@app.post("/api/recording/start")
|
@app.post("/api/recording/toggle")
|
||||||
async def start_stem_recording():
|
async def toggle_stem_recording():
|
||||||
if audio_service.stem_recorder is not None:
|
"""Toggle recording on/off. Also toggles on-air state."""
|
||||||
raise HTTPException(400, "Recording already in progress")
|
|
||||||
from datetime import datetime
|
|
||||||
dir_name = datetime.now().strftime("%Y-%m-%d_%H%M%S")
|
|
||||||
recordings_dir = Path("recordings") / dir_name
|
|
||||||
import sounddevice as sd
|
|
||||||
device_info = sd.query_devices(audio_service.output_device) if audio_service.output_device is not None else None
|
|
||||||
sr = int(device_info["default_samplerate"]) if device_info else 48000
|
|
||||||
recorder = StemRecorder(recordings_dir, sample_rate=sr)
|
|
||||||
recorder.start()
|
|
||||||
audio_service.stem_recorder = recorder
|
|
||||||
audio_service.start_stem_mic()
|
|
||||||
add_log(f"Stem recording started -> {recordings_dir}")
|
|
||||||
# Auto go on-air
|
|
||||||
global _show_on_air
|
global _show_on_air
|
||||||
if not _show_on_air:
|
|
||||||
_show_on_air = True
|
|
||||||
_start_host_audio_sender()
|
|
||||||
audio_service.start_host_stream(_host_audio_sync_callback)
|
|
||||||
threading.Thread(target=_update_on_air_cdn, args=(True,), daemon=True).start()
|
|
||||||
add_log("Show auto-set to ON AIR")
|
|
||||||
return {"status": "recording", "dir": str(recordings_dir), "on_air": _show_on_air}
|
|
||||||
|
|
||||||
|
|
||||||
@app.post("/api/recording/stop")
|
|
||||||
async def stop_stem_recording():
|
|
||||||
if audio_service.stem_recorder is None:
|
if audio_service.stem_recorder is None:
|
||||||
raise HTTPException(400, "No recording in progress")
|
# START recording
|
||||||
|
from datetime import datetime
|
||||||
|
dir_name = datetime.now().strftime("%Y-%m-%d_%H%M%S")
|
||||||
|
recordings_dir = Path("recordings") / dir_name
|
||||||
|
import sounddevice as sd
|
||||||
|
device_info = sd.query_devices(audio_service.output_device) if audio_service.output_device is not None else None
|
||||||
|
sr = int(device_info["default_samplerate"]) if device_info else 48000
|
||||||
|
recorder = StemRecorder(recordings_dir, sample_rate=sr)
|
||||||
|
recorder.start()
|
||||||
|
audio_service.stem_recorder = recorder
|
||||||
|
audio_service.start_stem_mic()
|
||||||
|
add_log(f"Stem recording started -> {recordings_dir}")
|
||||||
|
if not _show_on_air:
|
||||||
|
_show_on_air = True
|
||||||
|
_start_host_audio_sender()
|
||||||
|
audio_service.start_host_stream(_host_audio_sync_callback)
|
||||||
|
threading.Thread(target=_update_on_air_cdn, args=(True,), daemon=True).start()
|
||||||
|
add_log("Show auto-set to ON AIR")
|
||||||
|
return {"on_air": _show_on_air, "recording": True}
|
||||||
|
# STOP recording
|
||||||
audio_service.stop_stem_mic()
|
audio_service.stop_stem_mic()
|
||||||
stems_dir = audio_service.stem_recorder.output_dir
|
stems_dir = audio_service.stem_recorder.output_dir
|
||||||
paths = audio_service.stem_recorder.stop()
|
paths = audio_service.stem_recorder.stop()
|
||||||
audio_service.stem_recorder = None
|
audio_service.stem_recorder = None
|
||||||
add_log(f"Stem recording stopped. Running post-production...")
|
add_log(f"Stem recording stopped. Running post-production...")
|
||||||
|
|
||||||
# Auto go off-air
|
|
||||||
global _show_on_air
|
|
||||||
if _show_on_air:
|
if _show_on_air:
|
||||||
_show_on_air = False
|
_show_on_air = False
|
||||||
audio_service.stop_host_stream()
|
audio_service.stop_host_stream()
|
||||||
@@ -3978,8 +3971,8 @@ async def stop_stem_recording():
|
|||||||
def _run_postprod():
|
def _run_postprod():
|
||||||
try:
|
try:
|
||||||
result = subprocess.run(
|
result = subprocess.run(
|
||||||
[python, "postprod.py", str(stems_dir), "-o", str(output_file)],
|
[python, "postprod.py", str(stems_dir), "-o", "episode.mp3"],
|
||||||
capture_output=True, text=True, timeout=300,
|
capture_output=True, text=True, timeout=600,
|
||||||
)
|
)
|
||||||
if result.returncode == 0:
|
if result.returncode == 0:
|
||||||
add_log(f"Post-production complete -> {output_file}")
|
add_log(f"Post-production complete -> {output_file}")
|
||||||
@@ -3989,7 +3982,7 @@ async def stop_stem_recording():
|
|||||||
add_log(f"Post-production error: {e}")
|
add_log(f"Post-production error: {e}")
|
||||||
|
|
||||||
threading.Thread(target=_run_postprod, daemon=True).start()
|
threading.Thread(target=_run_postprod, daemon=True).start()
|
||||||
return {"status": "stopped", "stems": paths, "processing": str(output_file), "on_air": _show_on_air}
|
return {"on_air": _show_on_air, "recording": False}
|
||||||
|
|
||||||
|
|
||||||
@app.post("/api/recording/process")
|
@app.post("/api/recording/process")
|
||||||
|
|||||||
@@ -361,10 +361,6 @@ class AudioService:
|
|||||||
# Apply fade to prevent clicks
|
# Apply fade to prevent clicks
|
||||||
audio = self._apply_fade(audio, device_sr)
|
audio = self._apply_fade(audio, device_sr)
|
||||||
|
|
||||||
# Stem recording: caller TTS
|
|
||||||
if self.stem_recorder:
|
|
||||||
self.stem_recorder.write_sporadic("caller", audio.copy(), device_sr)
|
|
||||||
|
|
||||||
# Create multi-channel output with audio only on target channel
|
# Create multi-channel output with audio only on target channel
|
||||||
multi_ch = np.zeros((len(audio), num_channels), dtype=np.float32)
|
multi_ch = np.zeros((len(audio), num_channels), dtype=np.float32)
|
||||||
multi_ch[:, channel_idx] = audio
|
multi_ch[:, channel_idx] = audio
|
||||||
@@ -384,6 +380,9 @@ class AudioService:
|
|||||||
while pos < len(multi_ch) and not self._caller_stop_event.is_set():
|
while pos < len(multi_ch) and not self._caller_stop_event.is_set():
|
||||||
end = min(pos + chunk_size, len(multi_ch))
|
end = min(pos + chunk_size, len(multi_ch))
|
||||||
stream.write(multi_ch[pos:end])
|
stream.write(multi_ch[pos:end])
|
||||||
|
# Record each chunk as it plays so hangups cut the stem too
|
||||||
|
if self.stem_recorder:
|
||||||
|
self.stem_recorder.write_sporadic("caller", audio[pos:end].copy(), device_sr)
|
||||||
pos = end
|
pos = end
|
||||||
|
|
||||||
if self._caller_stop_event.is_set():
|
if self._caller_stop_event.is_set():
|
||||||
@@ -752,7 +751,7 @@ class AudioService:
|
|||||||
mono_out = (old_samples * fade_out + new_samples * fade_in) * self._music_volume
|
mono_out = (old_samples * fade_out + new_samples * fade_in) * self._music_volume
|
||||||
outdata[:, channel_idx] = mono_out
|
outdata[:, channel_idx] = mono_out
|
||||||
if self.stem_recorder:
|
if self.stem_recorder:
|
||||||
self.stem_recorder.write("music", mono_out.copy(), device_sr)
|
self.stem_recorder.write_sporadic("music", mono_out.copy(), device_sr)
|
||||||
self._crossfade_progress = end_progress
|
self._crossfade_progress = end_progress
|
||||||
|
|
||||||
if self._crossfade_progress >= 1.0:
|
if self._crossfade_progress >= 1.0:
|
||||||
@@ -763,7 +762,7 @@ class AudioService:
|
|||||||
mono_out = new_samples * self._music_volume
|
mono_out = new_samples * self._music_volume
|
||||||
outdata[:, channel_idx] = mono_out
|
outdata[:, channel_idx] = mono_out
|
||||||
if self.stem_recorder:
|
if self.stem_recorder:
|
||||||
self.stem_recorder.write("music", mono_out.copy(), device_sr)
|
self.stem_recorder.write_sporadic("music", mono_out.copy(), device_sr)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
self._music_stream = sd.OutputStream(
|
self._music_stream = sd.OutputStream(
|
||||||
@@ -873,7 +872,7 @@ class AudioService:
|
|||||||
chunk = self._ad_resampled[self._ad_position:self._ad_position + frames]
|
chunk = self._ad_resampled[self._ad_position:self._ad_position + frames]
|
||||||
outdata[:, channel_idx] = chunk
|
outdata[:, channel_idx] = chunk
|
||||||
if self.stem_recorder:
|
if self.stem_recorder:
|
||||||
self.stem_recorder.write("ads", chunk.copy(), device_sr)
|
self.stem_recorder.write_sporadic("ads", chunk.copy(), device_sr)
|
||||||
self._ad_position += frames
|
self._ad_position += frames
|
||||||
else:
|
else:
|
||||||
if remaining > 0:
|
if remaining > 0:
|
||||||
|
|||||||
@@ -86,18 +86,28 @@ DEFAULT_VITS_SPEAKER = "p225"
|
|||||||
# Dennis, Dominus, Edward, Elizabeth, Hades, Hana, Julia, Luna, Mark, Olivia,
|
# Dennis, Dominus, Edward, Elizabeth, Hades, Hana, Julia, Luna, Mark, Olivia,
|
||||||
# Pixie, Priya, Ronald, Sarah, Shaun, Theodore, Timothy, Wendy
|
# Pixie, Priya, Ronald, Sarah, Shaun, Theodore, Timothy, Wendy
|
||||||
INWORLD_VOICES = {
|
INWORLD_VOICES = {
|
||||||
# Male voices - each caller gets a unique voice matching their personality
|
# Original voice IDs
|
||||||
"VR6AewLTigWG4xSOukaG": "Edward", # Tony - fast-talking, emphatic, streetwise
|
"VR6AewLTigWG4xSOukaG": "Edward", # Tony - fast-talking, emphatic, streetwise
|
||||||
"TxGEqnHWrfWFTfGW9XjX": "Shaun", # Rick - friendly, dynamic, conversational
|
"TxGEqnHWrfWFTfGW9XjX": "Shaun", # Rick - friendly, dynamic, conversational
|
||||||
"pNInz6obpgDQGcFmaJgB": "Alex", # Dennis - energetic, expressive, mildly nasal
|
"pNInz6obpgDQGcFmaJgB": "Alex", # Dennis - energetic, expressive, mildly nasal
|
||||||
"ODq5zmih8GrVes37Dizd": "Craig", # Earl - older British, refined, articulate
|
"ODq5zmih8GrVes37Dizd": "Craig", # Earl - older British, refined, articulate
|
||||||
"IKne3meq5aSn9XLyUdCD": "Timothy", # Marcus - lively, upbeat American
|
"IKne3meq5aSn9XLyUdCD": "Timothy", # Marcus/Jerome - lively, upbeat American
|
||||||
# Female voices - each caller gets a unique voice matching their personality
|
|
||||||
"jBpfuIE2acCO8z3wKNLl": "Hana", # Jasmine - bright, expressive young female
|
"jBpfuIE2acCO8z3wKNLl": "Hana", # Jasmine - bright, expressive young female
|
||||||
"EXAVITQu4vr4xnSDxMaL": "Ashley", # Megan - warm, natural female
|
"EXAVITQu4vr4xnSDxMaL": "Ashley", # Megan - warm, natural female
|
||||||
"21m00Tcm4TlvDq8ikWAM": "Wendy", # Tanya - posh, middle-aged British
|
"21m00Tcm4TlvDq8ikWAM": "Wendy", # Tanya - posh, middle-aged British
|
||||||
"XB0fDUnXU5powFXDhCwa": "Sarah", # Carla - fast-talking, questioning tone
|
"XB0fDUnXU5powFXDhCwa": "Sarah", # Carla - fast-talking, questioning tone
|
||||||
"pFZP5JQG7iQjIQuC4Bku": "Deborah", # Brenda - gentle, elegant
|
"pFZP5JQG7iQjIQuC4Bku": "Deborah", # Brenda (original) - gentle, elegant
|
||||||
|
# Regular caller voice IDs (backfilled)
|
||||||
|
"onwK4e9ZLuTAKqWW03F9": "Ronald", # Bobby - repo man
|
||||||
|
"FGY2WhTYpPnrIDTdsKH5": "Julia", # Carla (regular) - Jersey mom
|
||||||
|
"CwhRBWXzGAHq8TQ4Fs17": "Mark", # Leon - male caller
|
||||||
|
"SOYHLrjzK2X1ezoPC6cr": "Carter", # Carl - male caller
|
||||||
|
"N2lVS1w4EtoT3dr4eOWO": "Clive", # Reggie - male caller
|
||||||
|
"hpp4J3VqNfWAUOO0d1Us": "Olivia", # Brenda (regular) - ambulance driver
|
||||||
|
"nPczCjzI2devNBz1zQrb": "Theodore", # Keith - male caller
|
||||||
|
"JBFqnCBsd6RMkjVDRZzb": "Blake", # Andre - male caller
|
||||||
|
"TX3LPaxmHKxFdv7VOQHJ": "Dennis", # Rick (regular) - male caller
|
||||||
|
"cgSgspJ2msm6clMCkdW9": "Priya", # Megan (regular) - female caller
|
||||||
}
|
}
|
||||||
DEFAULT_INWORLD_VOICE = "Dennis"
|
DEFAULT_INWORLD_VOICE = "Dennis"
|
||||||
|
|
||||||
|
|||||||
@@ -60,9 +60,13 @@
|
|||||||
{
|
{
|
||||||
"summary": "Jerome, a police officer in Texas, called from a DQ parking lot worried about AI writing police reports after his son sent him an article suggesting it might replace him. Through the conversation, he moved from fear about accountability and accuracy in criminal cases to acknowledging that AI handling routine paperwork (like cattle complaints) could free him up to do more meaningful police work in his understaffed county, though he remains uncertain about where this technology will lead.",
|
"summary": "Jerome, a police officer in Texas, called from a DQ parking lot worried about AI writing police reports after his son sent him an article suggesting it might replace him. Through the conversation, he moved from fear about accountability and accuracy in criminal cases to acknowledging that AI handling routine paperwork (like cattle complaints) could free him up to do more meaningful police work in his understaffed county, though he remains uncertain about where this technology will lead.",
|
||||||
"timestamp": 1770692087.560522
|
"timestamp": 1770692087.560522
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"summary": "The caller described a turbulent couple of weeks, mentioning an issue with AI writing police reports, which he suggested was just the beginning of a larger problem. He seemed concerned about the developments and wanted to discuss the topic further with the host.",
|
||||||
|
"timestamp": 1770892192.893108
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"last_call": 1770692087.560523,
|
"last_call": 1770892192.89311,
|
||||||
"created_at": 1770692087.560523,
|
"created_at": 1770692087.560523,
|
||||||
"voice": "IKne3meq5aSn9XLyUdCD"
|
"voice": "IKne3meq5aSn9XLyUdCD"
|
||||||
},
|
},
|
||||||
|
|||||||
@@ -101,17 +101,10 @@ function initEventListeners() {
|
|||||||
if (recBtn) {
|
if (recBtn) {
|
||||||
recBtn.addEventListener('click', async () => {
|
recBtn.addEventListener('click', async () => {
|
||||||
try {
|
try {
|
||||||
if (!stemRecording) {
|
const res = await safeFetch('/api/recording/toggle', { method: 'POST' });
|
||||||
const res = await safeFetch('/api/recording/start', { method: 'POST' });
|
updateRecBtn(res.recording);
|
||||||
updateRecBtn(true);
|
if (onAirBtn) updateOnAirBtn(onAirBtn, res.on_air);
|
||||||
if (onAirBtn) updateOnAirBtn(onAirBtn, res.on_air);
|
log(res.recording ? 'Recording started + ON AIR' : 'Recording stopped + OFF AIR');
|
||||||
log('Recording started + ON AIR: ' + res.dir);
|
|
||||||
} else {
|
|
||||||
const res = await safeFetch('/api/recording/stop', { method: 'POST' });
|
|
||||||
updateRecBtn(false);
|
|
||||||
if (onAirBtn) updateOnAirBtn(onAirBtn, res.on_air);
|
|
||||||
log('Recording stopped + OFF AIR');
|
|
||||||
}
|
|
||||||
} catch (err) {
|
} catch (err) {
|
||||||
log('Recording error: ' + err.message);
|
log('Recording error: ' + err.message);
|
||||||
}
|
}
|
||||||
|
|||||||
197
postprod.py
197
postprod.py
@@ -61,23 +61,30 @@ def compute_rms(audio: np.ndarray, window_samples: int) -> np.ndarray:
|
|||||||
|
|
||||||
|
|
||||||
def remove_gaps(stems: dict[str, np.ndarray], sr: int,
|
def remove_gaps(stems: dict[str, np.ndarray], sr: int,
|
||||||
threshold_s: float = 1.5, crossfade_ms: float = 30) -> dict[str, np.ndarray]:
|
threshold_s: float = 2.0, max_gap_s: float = 8.0,
|
||||||
|
crossfade_ms: float = 30, pad_s: float = 0.5) -> dict[str, np.ndarray]:
|
||||||
window_ms = 50
|
window_ms = 50
|
||||||
window_samples = int(sr * window_ms / 1000)
|
window_samples = int(sr * window_ms / 1000)
|
||||||
crossfade_samples = int(sr * crossfade_ms / 1000)
|
crossfade_samples = int(sr * crossfade_ms / 1000)
|
||||||
|
|
||||||
dialog = stems["host"] + stems["caller"]
|
# Detect gaps in everything except music (which always plays).
|
||||||
rms = compute_rms(dialog, window_samples)
|
# This catches TTS latency gaps while protecting ad breaks and SFX transitions.
|
||||||
|
content = stems["host"] + stems["caller"] + stems["sfx"] + stems["ads"]
|
||||||
|
rms = compute_rms(content, window_samples)
|
||||||
|
|
||||||
# Threshold: -60dB or adaptive based on mean RMS
|
# Threshold: percentile-based to sit above the mic noise floor
|
||||||
mean_rms = np.mean(rms[rms > 0]) if np.any(rms > 0) else 1e-4
|
nonzero_rms = rms[rms > 0]
|
||||||
silence_thresh = min(mean_rms * 0.05, 0.001)
|
if len(nonzero_rms) == 0:
|
||||||
|
print(" No audio detected")
|
||||||
|
return stems
|
||||||
|
noise_floor = np.percentile(nonzero_rms, 20)
|
||||||
|
silence_thresh = noise_floor * 3
|
||||||
|
|
||||||
# Find silent regions
|
|
||||||
is_silent = rms < silence_thresh
|
is_silent = rms < silence_thresh
|
||||||
min_silent_windows = int(threshold_s / (window_ms / 1000))
|
min_silent_windows = int(threshold_s / (window_ms / 1000))
|
||||||
|
max_silent_windows = int(max_gap_s / (window_ms / 1000))
|
||||||
|
|
||||||
# Build list of regions to cut (in samples)
|
# Only cut gaps between threshold_s and max_gap_s (2-8s by default) — targets TTS latency, not long breaks
|
||||||
cuts = []
|
cuts = []
|
||||||
i = 0
|
i = 0
|
||||||
while i < len(is_silent):
|
while i < len(is_silent):
|
||||||
@@ -86,10 +93,11 @@ def remove_gaps(stems: dict[str, np.ndarray], sr: int,
|
|||||||
while i < len(is_silent) and is_silent[i]:
|
while i < len(is_silent) and is_silent[i]:
|
||||||
i += 1
|
i += 1
|
||||||
length = i - start
|
length = i - start
|
||||||
if length >= min_silent_windows:
|
if min_silent_windows <= length <= max_silent_windows:
|
||||||
# Keep a small buffer at edges
|
# Leave pad_s of silence so the edit sounds natural
|
||||||
cut_start = (start + 1) * window_samples
|
pad_samples = int(pad_s * sr)
|
||||||
cut_end = (i - 1) * window_samples
|
cut_start = (start + 1) * window_samples + pad_samples
|
||||||
|
cut_end = (i - 1) * window_samples - pad_samples
|
||||||
if cut_end > cut_start + crossfade_samples * 2:
|
if cut_end > cut_start + crossfade_samples * 2:
|
||||||
cuts.append((cut_start, cut_end))
|
cuts.append((cut_start, cut_end))
|
||||||
else:
|
else:
|
||||||
@@ -102,18 +110,18 @@ def remove_gaps(stems: dict[str, np.ndarray], sr: int,
|
|||||||
total_cut = sum(end - start for start, end in cuts) / sr
|
total_cut = sum(end - start for start, end in cuts) / sr
|
||||||
print(f" Removing {len(cuts)} gaps ({total_cut:.1f}s total)")
|
print(f" Removing {len(cuts)} gaps ({total_cut:.1f}s total)")
|
||||||
|
|
||||||
# Apply cuts to dialog stems (host, caller, sfx, ads) — not music
|
# Cut dialog/sfx/ads at gap points. Leave music uncut — just trim to fit.
|
||||||
cut_stems = ["host", "caller", "sfx", "ads"]
|
|
||||||
result = {}
|
result = {}
|
||||||
|
|
||||||
for name in cut_stems:
|
for name in STEM_NAMES:
|
||||||
|
if name == "music":
|
||||||
|
continue # handled below
|
||||||
audio = stems[name]
|
audio = stems[name]
|
||||||
pieces = []
|
pieces = []
|
||||||
prev_end = 0
|
prev_end = 0
|
||||||
for cut_start, cut_end in cuts:
|
for cut_start, cut_end in cuts:
|
||||||
if prev_end < cut_start:
|
if prev_end < cut_start:
|
||||||
piece = audio[prev_end:cut_start].copy()
|
piece = audio[prev_end:cut_start].copy()
|
||||||
# Apply crossfade at join point
|
|
||||||
if pieces and len(piece) > crossfade_samples:
|
if pieces and len(piece) > crossfade_samples:
|
||||||
fade_in = np.linspace(0, 1, crossfade_samples, dtype=np.float32)
|
fade_in = np.linspace(0, 1, crossfade_samples, dtype=np.float32)
|
||||||
piece[:crossfade_samples] *= fade_in
|
piece[:crossfade_samples] *= fade_in
|
||||||
@@ -135,18 +143,49 @@ def remove_gaps(stems: dict[str, np.ndarray], sr: int,
|
|||||||
|
|
||||||
result[name] = np.concatenate(pieces) if pieces else np.array([], dtype=np.float32)
|
result[name] = np.concatenate(pieces) if pieces else np.array([], dtype=np.float32)
|
||||||
|
|
||||||
# Trim music to match new duration, with fade-out at end
|
# Music: leave uncut, just trim to match new duration with fade-out
|
||||||
new_len = len(result["host"])
|
new_len = len(result["host"])
|
||||||
music = stems["music"][:new_len].copy() if len(stems["music"]) >= new_len else np.pad(stems["music"], (0, max(0, new_len - len(stems["music"]))))
|
music = stems["music"]
|
||||||
fade_samples = int(sr * 2) # 2s fade out
|
if len(music) >= new_len:
|
||||||
|
music = music[:new_len].copy()
|
||||||
|
else:
|
||||||
|
music = np.pad(music, (0, new_len - len(music)))
|
||||||
|
fade_samples = int(sr * 3)
|
||||||
if len(music) > fade_samples:
|
if len(music) > fade_samples:
|
||||||
fade_out = np.linspace(1, 0, fade_samples, dtype=np.float32)
|
music[-fade_samples:] *= np.linspace(1, 0, fade_samples, dtype=np.float32)
|
||||||
music[-fade_samples:] *= fade_out
|
|
||||||
result["music"] = music
|
result["music"] = music
|
||||||
|
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
|
||||||
|
def denoise(audio: np.ndarray, sr: int, tmp_dir: Path) -> np.ndarray:
|
||||||
|
"""High-quality noise reduction using ffmpeg afftdn (adaptive Wiener filter)."""
|
||||||
|
in_path = tmp_dir / "host_pre_denoise.wav"
|
||||||
|
out_path = tmp_dir / "host_post_denoise.wav"
|
||||||
|
sf.write(str(in_path), audio, sr)
|
||||||
|
|
||||||
|
# afftdn: adaptive FFT denoiser with Wiener filter
|
||||||
|
# nt=w - noise type: white noise
|
||||||
|
# om=o - output cleaned signal
|
||||||
|
# nr=12 - noise reduction in dB (moderate, preserves voice naturalness)
|
||||||
|
# nf=-30 - noise floor estimate in dB
|
||||||
|
# anlmdn: non-local means denoiser for residual broadband noise
|
||||||
|
# s=4 - denoising strength (gentle to avoid artifacts)
|
||||||
|
# p=0.002 - patch radius duration in seconds (2 ms)
|
||||||
|
af = (
|
||||||
|
"afftdn=nt=w:om=o:nr=12:nf=-30,"
|
||||||
|
"anlmdn=s=4:p=0.002"
|
||||||
|
)
|
||||||
|
cmd = ["ffmpeg", "-y", "-i", str(in_path), "-af", af, str(out_path)]
|
||||||
|
result = subprocess.run(cmd, capture_output=True, text=True)
|
||||||
|
if result.returncode != 0:
|
||||||
|
print(f" WARNING: denoise failed: {result.stderr[:200]}")
|
||||||
|
return audio
|
||||||
|
|
||||||
|
denoised, _ = sf.read(str(out_path), dtype="float32")
|
||||||
|
return denoised
|
||||||
|
|
||||||
|
|
||||||
def compress_voice(audio: np.ndarray, sr: int, tmp_dir: Path,
|
def compress_voice(audio: np.ndarray, sr: int, tmp_dir: Path,
|
||||||
stem_name: str) -> np.ndarray:
|
stem_name: str) -> np.ndarray:
|
||||||
in_path = tmp_dir / f"{stem_name}_pre_comp.wav"
|
in_path = tmp_dir / f"{stem_name}_pre_comp.wav"
|
||||||
@@ -156,7 +195,7 @@ def compress_voice(audio: np.ndarray, sr: int, tmp_dir: Path,
|
|||||||
|
|
||||||
cmd = [
|
cmd = [
|
||||||
"ffmpeg", "-y", "-i", str(in_path),
|
"ffmpeg", "-y", "-i", str(in_path),
|
||||||
"-af", "acompressor=threshold=-24dB:ratio=3:attack=5:release=100:makeup=6dB",
|
"-af", "acompressor=threshold=-24dB:ratio=2.5:attack=10:release=800:makeup=6dB",
|
||||||
str(out_path),
|
str(out_path),
|
||||||
]
|
]
|
||||||
result = subprocess.run(cmd, capture_output=True, text=True)
|
result = subprocess.run(cmd, capture_output=True, text=True)
|
||||||
@@ -168,9 +207,32 @@ def compress_voice(audio: np.ndarray, sr: int, tmp_dir: Path,
|
|||||||
return compressed
|
return compressed
|
||||||
|
|
||||||
|
|
||||||
|
def phone_eq(audio: np.ndarray, sr: int, tmp_dir: Path) -> np.ndarray:
|
||||||
|
"""Apply telephone EQ to make caller sound like a phone call."""
|
||||||
|
in_path = tmp_dir / "caller_pre_phone.wav"
|
||||||
|
out_path = tmp_dir / "caller_post_phone.wav"
|
||||||
|
sf.write(str(in_path), audio, sr)
|
||||||
|
|
||||||
|
# Bandpass 300-3400Hz (telephone bandwidth) + slight mid boost for presence
|
||||||
|
af = (
|
||||||
|
"highpass=f=300:poles=2,"
|
||||||
|
"lowpass=f=3400:poles=2,"
|
||||||
|
"equalizer=f=1000:t=q:w=0.8:g=4"
|
||||||
|
)
|
||||||
|
cmd = ["ffmpeg", "-y", "-i", str(in_path), "-af", af, str(out_path)]
|
||||||
|
result = subprocess.run(cmd, capture_output=True, text=True)
|
||||||
|
if result.returncode != 0:
|
||||||
|
print(f" WARNING: phone EQ failed: {result.stderr[:200]}")
|
||||||
|
return audio
|
||||||
|
|
||||||
|
filtered, _ = sf.read(str(out_path), dtype="float32")
|
||||||
|
return filtered
|
||||||
|
|
||||||
|
|
||||||
def apply_ducking(music: np.ndarray, dialog: np.ndarray, sr: int,
|
def apply_ducking(music: np.ndarray, dialog: np.ndarray, sr: int,
|
||||||
duck_db: float = -12, attack_ms: float = 200,
|
duck_db: float = -20, attack_ms: float = 200,
|
||||||
release_ms: float = 500) -> np.ndarray:
|
release_ms: float = 3000,
|
||||||
|
mute_signal: np.ndarray | None = None) -> np.ndarray:
|
||||||
window_ms = 50
|
window_ms = 50
|
||||||
window_samples = int(sr * window_ms / 1000)
|
window_samples = int(sr * window_ms / 1000)
|
||||||
rms = compute_rms(dialog, window_samples)
|
rms = compute_rms(dialog, window_samples)
|
||||||
@@ -184,6 +246,22 @@ def apply_ducking(music: np.ndarray, dialog: np.ndarray, sr: int,
|
|||||||
is_speech = rms > speech_thresh
|
is_speech = rms > speech_thresh
|
||||||
target_gain = np.where(is_speech, duck_gain, 1.0).astype(np.float32)
|
target_gain = np.where(is_speech, duck_gain, 1.0).astype(np.float32)
|
||||||
|
|
||||||
|
# Mute music completely during ads with lookahead and tail
|
||||||
|
if mute_signal is not None:
|
||||||
|
mute_rms = compute_rms(mute_signal, window_samples)
|
||||||
|
mute_thresh = np.mean(mute_rms[mute_rms > 0]) * 0.1 if np.any(mute_rms > 0) else 1e-4
|
||||||
|
is_ads = mute_rms > mute_thresh
|
||||||
|
# Expand ad regions: 2s before (fade out music before ad) and 2s after (don't resume immediately)
|
||||||
|
lookahead_windows = int(2000 / window_ms)
|
||||||
|
tail_windows = int(2000 / window_ms)
|
||||||
|
expanded_ads = is_ads.copy()
|
||||||
|
for i in range(len(is_ads)):
|
||||||
|
if is_ads[i]:
|
||||||
|
start = max(0, i - lookahead_windows)
|
||||||
|
end = min(len(expanded_ads), i + tail_windows + 1)
|
||||||
|
expanded_ads[start:end] = True
|
||||||
|
target_gain[expanded_ads] = 0.0
|
||||||
|
|
||||||
# Smooth the envelope
|
# Smooth the envelope
|
||||||
attack_windows = max(1, int(attack_ms / window_ms))
|
attack_windows = max(1, int(attack_ms / window_ms))
|
||||||
release_windows = max(1, int(release_ms / window_ms))
|
release_windows = max(1, int(release_ms / window_ms))
|
||||||
@@ -206,10 +284,30 @@ def apply_ducking(music: np.ndarray, dialog: np.ndarray, sr: int,
|
|||||||
return music * gain_samples
|
return music * gain_samples
|
||||||
|
|
||||||
|
|
||||||
|
def match_voice_levels(stems: dict[str, np.ndarray], target_rms: float = 0.1) -> dict[str, np.ndarray]:
|
||||||
|
"""Normalize host, caller, and ads stems to the same RMS level."""
|
||||||
|
for name in ["host", "caller", "ads"]:
|
||||||
|
audio = stems[name]
|
||||||
|
# Only measure non-silent portions
|
||||||
|
active = audio[np.abs(audio) > 0.001]
|
||||||
|
if len(active) == 0:
|
||||||
|
continue
|
||||||
|
current_rms = np.sqrt(np.mean(active ** 2))
|
||||||
|
if current_rms < 1e-6:
|
||||||
|
continue
|
||||||
|
gain = target_rms / current_rms
|
||||||
|
# Clamp gain to avoid extreme boosts on very quiet stems
|
||||||
|
gain = min(gain, 10.0)
|
||||||
|
stems[name] = np.clip(audio * gain, -1.0, 1.0).astype(np.float32)
|
||||||
|
db_change = 20 * np.log10(gain) if gain > 0 else 0
|
||||||
|
print(f" {name}: RMS {current_rms:.4f} -> {target_rms:.4f} ({db_change:+.1f}dB)")
|
||||||
|
return stems
|
||||||
|
|
||||||
|
|
||||||
def mix_stems(stems: dict[str, np.ndarray],
|
def mix_stems(stems: dict[str, np.ndarray],
|
||||||
levels: dict[str, float] | None = None) -> np.ndarray:
|
levels: dict[str, float] | None = None) -> np.ndarray:
|
||||||
if levels is None:
|
if levels is None:
|
||||||
levels = {"host": 0, "caller": 0, "music": -6, "sfx": -3, "ads": 0}
|
levels = {"host": 0, "caller": 0, "music": -6, "sfx": -6, "ads": 0}
|
||||||
|
|
||||||
gains = {name: 10 ** (db / 20) for name, db in levels.items()}
|
gains = {name: 10 ** (db / 20) for name, db in levels.items()}
|
||||||
|
|
||||||
@@ -282,8 +380,8 @@ def main():
|
|||||||
parser = argparse.ArgumentParser(description="Post-production for AI podcast stems")
|
parser = argparse.ArgumentParser(description="Post-production for AI podcast stems")
|
||||||
parser.add_argument("stems_dir", type=Path, help="Directory containing stem WAV files")
|
parser.add_argument("stems_dir", type=Path, help="Directory containing stem WAV files")
|
||||||
parser.add_argument("-o", "--output", type=str, default="episode.mp3", help="Output filename")
|
parser.add_argument("-o", "--output", type=str, default="episode.mp3", help="Output filename")
|
||||||
parser.add_argument("--gap-threshold", type=float, default=1.5, help="Min silence to cut (seconds)")
|
parser.add_argument("--gap-threshold", type=float, default=2.0, help="Min silence to cut (seconds)")
|
||||||
parser.add_argument("--duck-amount", type=float, default=-12, help="Music duck in dB")
|
parser.add_argument("--duck-amount", type=float, default=-20, help="Music duck in dB")
|
||||||
parser.add_argument("--target-lufs", type=float, default=-16, help="Target loudness (LUFS)")
|
parser.add_argument("--target-lufs", type=float, default=-16, help="Target loudness (LUFS)")
|
||||||
parser.add_argument("--bitrate", type=str, default="128k", help="MP3 bitrate")
|
parser.add_argument("--bitrate", type=str, default="128k", help="MP3 bitrate")
|
||||||
parser.add_argument("--no-gap-removal", action="store_true", help="Skip gap removal")
|
parser.add_argument("--no-gap-removal", action="store_true", help="Skip gap removal")
|
||||||
@@ -313,18 +411,27 @@ def main():
|
|||||||
return
|
return
|
||||||
|
|
||||||
# Step 1: Load
|
# Step 1: Load
|
||||||
print("\n[1/6] Loading stems...")
|
print("\n[1/9] Loading stems...")
|
||||||
stems, sr = load_stems(stems_dir)
|
stems, sr = load_stems(stems_dir)
|
||||||
|
|
||||||
# Step 2: Gap removal
|
# Step 2: Gap removal
|
||||||
print("\n[2/6] Gap removal...")
|
print("\n[2/9] Gap removal...")
|
||||||
if not args.no_gap_removal:
|
if not args.no_gap_removal:
|
||||||
stems = remove_gaps(stems, sr, threshold_s=args.gap_threshold)
|
stems = remove_gaps(stems, sr, threshold_s=args.gap_threshold)
|
||||||
else:
|
else:
|
||||||
print(" Skipped")
|
print(" Skipped")
|
||||||
|
|
||||||
# Step 3: Voice compression
|
# Step 3: Host mic noise reduction
|
||||||
print("\n[3/6] Voice compression...")
|
print("\n[3/9] Host mic noise reduction...")
|
||||||
|
if np.any(stems["host"] != 0):
|
||||||
|
with tempfile.TemporaryDirectory() as tmp:
|
||||||
|
stems["host"] = denoise(stems["host"], sr, Path(tmp))
|
||||||
|
print(" Applied")
|
||||||
|
else:
|
||||||
|
print(" No host audio")
|
||||||
|
|
||||||
|
# Step 4: Voice compression
|
||||||
|
print("\n[4/9] Voice compression...")
|
||||||
if not args.no_compression:
|
if not args.no_compression:
|
||||||
with tempfile.TemporaryDirectory() as tmp:
|
with tempfile.TemporaryDirectory() as tmp:
|
||||||
tmp_dir = Path(tmp)
|
tmp_dir = Path(tmp)
|
||||||
@@ -335,25 +442,39 @@ def main():
|
|||||||
else:
|
else:
|
||||||
print(" Skipped")
|
print(" Skipped")
|
||||||
|
|
||||||
# Step 4: Music ducking
|
# Step 5: Phone EQ on caller
|
||||||
print("\n[4/6] Music ducking...")
|
print("\n[5/9] Phone EQ on caller...")
|
||||||
|
if np.any(stems["caller"] != 0):
|
||||||
|
with tempfile.TemporaryDirectory() as tmp:
|
||||||
|
stems["caller"] = phone_eq(stems["caller"], sr, Path(tmp))
|
||||||
|
print(" Applied")
|
||||||
|
else:
|
||||||
|
print(" No caller audio")
|
||||||
|
|
||||||
|
# Step 6: Match voice levels
|
||||||
|
print("\n[6/9] Matching voice levels...")
|
||||||
|
stems = match_voice_levels(stems)
|
||||||
|
|
||||||
|
# Step 7: Music ducking
|
||||||
|
print("\n[7/9] Music ducking...")
|
||||||
if not args.no_ducking:
|
if not args.no_ducking:
|
||||||
dialog = stems["host"] + stems["caller"]
|
dialog = stems["host"] + stems["caller"]
|
||||||
if np.any(dialog != 0) and np.any(stems["music"] != 0):
|
if np.any(dialog != 0) and np.any(stems["music"] != 0):
|
||||||
stems["music"] = apply_ducking(stems["music"], dialog, sr, duck_db=args.duck_amount)
|
stems["music"] = apply_ducking(stems["music"], dialog, sr, duck_db=args.duck_amount,
|
||||||
|
mute_signal=stems["ads"])
|
||||||
print(" Applied")
|
print(" Applied")
|
||||||
else:
|
else:
|
||||||
print(" No dialog or music to duck")
|
print(" No dialog or music to duck")
|
||||||
else:
|
else:
|
||||||
print(" Skipped")
|
print(" Skipped")
|
||||||
|
|
||||||
# Step 5: Mix
|
# Step 8: Mix
|
||||||
print("\n[5/6] Mixing...")
|
print("\n[8/9] Mixing...")
|
||||||
stereo = mix_stems(stems)
|
stereo = mix_stems(stems)
|
||||||
print(f" Mixed to stereo: {len(stereo)} samples ({len(stereo)/sr:.1f}s)")
|
print(f" Mixed to stereo: {len(stereo)} samples ({len(stereo)/sr:.1f}s)")
|
||||||
|
|
||||||
# Step 6: Normalize + export
|
# Step 9: Normalize + export
|
||||||
print("\n[6/6] Loudness normalization + export...")
|
print("\n[9/9] Loudness normalization + export...")
|
||||||
with tempfile.TemporaryDirectory() as tmp:
|
with tempfile.TemporaryDirectory() as tmp:
|
||||||
normalize_and_export(stereo, sr, output_path,
|
normalize_and_export(stereo, sr, output_path,
|
||||||
target_lufs=args.target_lufs,
|
target_lufs=args.target_lufs,
|
||||||
|
|||||||
Reference in New Issue
Block a user