diff --git a/analyze_gaps.py b/analyze_gaps.py
new file mode 100644
index 0000000..e6dcbca
--- /dev/null
+++ b/analyze_gaps.py
@@ -0,0 +1,310 @@
+#!/usr/bin/env python3
+"""Analyze silence gaps in podcast stems to find optimal strip-silence thresholds.
+
+Usage: python analyze_gaps.py recordings/2026-03-17_235137/
+"""
+import math
+import sys
+from pathlib import Path
+
+import numpy as np
+import soundfile as sf
+
+BLOCK_SEC = 0.1
+SILENCE_DB = -30
+THRESHOLD = 10 ** (SILENCE_DB / 20)
+MIN_VOICE_SEC = 0.3
+MIN_GAP_SEC = 0.5  # gaps shorter than this are not reported
+SHORT_GAP_SEC = 5  # split point between "short" and "long" gaps
+
+# Thresholds currently set in reaper/strip_silence_dialog.lua.
+MIN_SILENCE_SEC = 6.0
+MIN_SILENCE_TRANSITION_SEC = 5.0
+MIN_SILENCE_DEVON_SEC = 3.0
+
+STEM_NAMES = ("host", "devon", "caller")
+CATEGORY_NAMES = (
+    "host_self",        # Host -> Host
+    "host_to_caller",   # Host -> Caller (TTS latency)
+    "caller_to_host",   # Caller -> Host
+    "host_to_devon",    # Host -> Devon (TTS latency)
+    "devon_to_host",    # Devon -> Host
+    "caller_to_devon",  # Caller -> Devon (interjection)
+    "devon_to_caller",  # Devon -> Caller
+    "other",
+)
+HISTOGRAM_BRACKETS = (
+    (0, 1), (1, 2), (2, 3), (3, 5), (5, 8),
+    (8, 12), (12, 18), (18, 30), (30, 60), (60, math.inf),
+)
+
+
+def load_stem(path: Path) -> tuple[np.ndarray, int]:
+    """Read a stem as float32 and return ``(samples, sample_rate)``.
+
+    Multi-channel files are reduced to their first channel.
+    """
+    audio, sr = sf.read(path, dtype="float32")
+    if audio.ndim > 1:
+        audio = audio[:, 0]
+    return audio, sr
+
+
+def _split_blocks(audio: np.ndarray, sr: int) -> np.ndarray:
+    """Reshape *audio* into BLOCK_SEC-long rows, dropping the partial tail."""
+    block_samples = int(sr * BLOCK_SEC)
+    if block_samples <= 0:
+        return np.empty((0, 1), dtype=audio.dtype)
+    n_blocks = len(audio) // block_samples
+    return audio[:n_blocks * block_samples].reshape(n_blocks, block_samples)
+
+
+def compute_rms_blocks(audio: np.ndarray, sr: int) -> np.ndarray:
+    """Return the RMS of each BLOCK_SEC block (``[0.0]`` if too short)."""
+    blocks = _split_blocks(audio, sr)
+    if blocks.shape[0] == 0:
+        return np.array([0.0])
+    return np.sqrt(np.mean(blocks ** 2, axis=1))
+
+
+def compute_peak_blocks(audio: np.ndarray, sr: int) -> np.ndarray:
+    """Return the peak of each BLOCK_SEC block (``[0.0]`` if too short)."""
+    blocks = _split_blocks(audio, sr)
+    if blocks.shape[0] == 0:
+        return np.array([0.0])
+    return np.max(np.abs(blocks), axis=1)
+
+
+def _make_gap(start_block: int, end_block: int, before, after) -> dict:
+    """Build a gap record; "dur" uses the block count to avoid float drift."""
+    return {
+        "start": start_block * BLOCK_SEC,
+        "end": end_block * BLOCK_SEC,
+        "dur": (end_block - start_block) * BLOCK_SEC,
+        "before": before or "?",
+        "after": after or "?",
+    }
+
+
+def detect_gaps(track_rms: dict, track_peak: dict) -> list[dict]:
+    """Find silences of at least MIN_GAP_SEC shared by every track.
+
+    Uses the same logic as the Lua script: a block is silent when the
+    loudest peak across tracks is below THRESHOLD, the speaker of a block
+    is the track with the highest RMS, and a silence only ends after
+    ceil(MIN_VOICE_SEC / BLOCK_SEC) consecutive non-silent blocks.
+
+    Args:
+        track_rms: per-track RMS blocks, keyed by track name.
+        track_peak: per-track peak blocks, same keys as *track_rms*.
+
+    Returns:
+        Gap dicts with "start", "end", "dur" (seconds) and "before" /
+        "after" speakers ("?" when unknown, "end" for trailing silence).
+    """
+    names = list(track_peak)
+    n_blocks = min(len(track_peak[name]) for name in names)
+    peaks = np.vstack([track_peak[name][:n_blocks] for name in names])
+    levels = np.vstack([track_rms[name][:n_blocks] for name in names])
+    silent = peaks.max(axis=0) < THRESHOLD
+    loudest = levels.argmax(axis=0)  # first track wins ties
+    audible = levels.max(axis=0) > 0
+    # math.ceil matches MIN_VOICE_BLOCKS in the Lua script; int() would
+    # truncate 0.3 / 0.1 == 2.9999999999999996 down to 2 blocks.
+    min_voice_blocks = math.ceil(MIN_VOICE_SEC / BLOCK_SEC)
+
+    gaps = []
+    in_silence = False
+    silence_start = 0
+    track_before = None
+    last_active = None
+    voice_run = 0
+    voice_run_track = None
+
+    for i in range(n_blocks):
+        all_silent = bool(silent[i])
+        best_track = names[loudest[i]] if audible[i] else None
+        if not all_silent:
+            last_active = best_track
+
+        if not in_silence:
+            if all_silent:
+                in_silence = True
+                silence_start = i
+                track_before = last_active
+                voice_run = 0
+                voice_run_track = None
+            continue
+
+        if all_silent:
+            # A burst shorter than min_voice_blocks does not end the silence.
+            voice_run = 0
+            voice_run_track = None
+            continue
+
+        if voice_run == 0:
+            voice_run_track = best_track
+        voice_run += 1
+        if voice_run < min_voice_blocks:
+            continue
+
+        voice_start = i - (voice_run - 1)
+        gap = _make_gap(silence_start, voice_start, track_before, voice_run_track)
+        if gap["dur"] >= MIN_GAP_SEC:
+            gaps.append(gap)
+        in_silence = False
+        voice_run = 0
+        voice_run_track = None
+
+    # Trailing silence
+    if in_silence:
+        gap = _make_gap(silence_start, n_blocks, track_before, "end")
+        if gap["dur"] >= MIN_GAP_SEC:
+            gaps.append(gap)
+    return gaps
+
+
+def categorize_gaps(gaps: list[dict]) -> dict[str, list[dict]]:
+    """Group gaps by speaker transition; unlisted pairs go to "other"."""
+    categories = {name: [] for name in CATEGORY_NAMES}
+    for gap in gaps:
+        before, after = gap["before"], gap["after"]
+        key = "host_self" if before == after == "host" else f"{before}_to_{after}"
+        categories.get(key, categories["other"]).append(gap)
+    return categories
+
+
+def current_threshold(gap: dict) -> float:
+    """Return the removal threshold the Lua script applies to *gap*."""
+    before, after = gap["before"], gap["after"]
+    if after == "end":
+        return MIN_SILENCE_SEC
+    if "devon" in (before, after):
+        return MIN_SILENCE_DEVON_SEC
+    if before != after and "?" not in (before, after):
+        return MIN_SILENCE_TRANSITION_SEC
+    return MIN_SILENCE_SEC
+
+
+def _split_short_long(gaps: list[dict]) -> tuple[list[float], list[float]]:
+    """Split gap durations into those below and at/above SHORT_GAP_SEC."""
+    durs = [g["dur"] for g in gaps]
+    short = [d for d in durs if d < SHORT_GAP_SEC]
+    long_ = [d for d in durs if d >= SHORT_GAP_SEC]
+    return short, long_
+
+
+def _print_category_stats(categories: dict[str, list[dict]]) -> None:
+    """Print range, averages and a histogram per non-empty category."""
+    for cat_name, cat_gaps in sorted(categories.items(), key=lambda x: -len(x[1])):
+        if not cat_gaps:
+            continue
+        durs = sorted(g["dur"] for g in cat_gaps)
+        print(f"\n--- {cat_name} ({len(cat_gaps)} gaps) ---")
+        print(f"  Range: {durs[0]:.1f}s - {durs[-1]:.1f}s")
+        print(f"  Median: {np.median(durs):.1f}s  Mean: {np.mean(durs):.1f}s")
+        if len(durs) >= 5:
+            print(f"  P25: {np.percentile(durs, 25):.1f}s  P75: {np.percentile(durs, 75):.1f}s")
+
+        # Histogram
+        print("  Distribution:")
+        for lo, hi in HISTOGRAM_BRACKETS:
+            count = sum(1 for d in durs if lo <= d < hi)
+            if count > 0:
+                label = f"{lo}-{hi}s" if math.isfinite(hi) else f"{lo}s+"
+                print(f"    {label:>8s}: {'#' * count} ({count})")
+
+
+def _print_suggested_thresholds(categories: dict[str, list[dict]]) -> None:
+    """Suggest thresholds at the break between short and long gaps."""
+    devon_gaps = (
+        categories["host_to_devon"] + categories["devon_to_host"]
+        + categories["caller_to_devon"] + categories["devon_to_caller"]
+    )
+    if devon_gaps:
+        short, long_ = _split_short_long(devon_gaps)
+        if short and long_:
+            suggested = (max(short) + min(long_)) / 2
+            print(f"Devon threshold: {suggested:.1f}s (short gaps: {len(short)} up to {max(short):.1f}s, long gaps: {len(long_)} from {min(long_):.1f}s)")
+        elif short:
+            print(f"Devon threshold: {max(short) + 1:.1f}s (all gaps are short, max {max(short):.1f}s)")
+        else:
+            print(f"Devon threshold: 3.0s (all gaps are long, min {min(long_):.1f}s)")
+
+    other_groups = (
+        ("Caller transition", categories["host_to_caller"] + categories["caller_to_host"]),
+        ("Same-speaker", categories["host_self"]),
+    )
+    for label, group in other_groups:
+        short, long_ = _split_short_long(group)
+        if short and long_:
+            suggested = (max(short) + min(long_)) / 2
+            print(f"{label} threshold: {suggested:.1f}s (short: {len(short)} up to {max(short):.1f}s, long: {len(long_)} from {min(long_):.1f}s)")
+        elif long_:
+            print(f"{label} threshold: {min(long_) - 1:.1f}s (all gaps >= {min(long_):.1f}s)")
+
+
+def _print_current_estimate(gaps: list[dict], duration: float) -> None:
+    """Estimate what the thresholds configured in the Lua script would cut."""
+    would_cut = [g["dur"] for g in gaps if g["dur"] >= current_threshold(g)]
+    cut_total = sum(would_cut)
+    print(
+        f"\nWith current thresholds (Devon={MIN_SILENCE_DEVON_SEC:g}s, "
+        f"transition={MIN_SILENCE_TRANSITION_SEC:g}s, others={MIN_SILENCE_SEC:g}s):"
+    )
+    print(f"  Would cut: ~{len(would_cut)} gaps, ~{cut_total:.0f}s ({cut_total/60:.1f} min)")
+    print(f"  Result: ~{(duration - cut_total)/60:.1f} min")
+
+
+def analyze(stems_dir: Path):
+    """Load the voice stems in *stems_dir* and print a gap report."""
+    stems_dir = Path(stems_dir)
+    voice_stems = {}
+    for name in STEM_NAMES:
+        path = stems_dir / f"{name}.wav"
+        if path.exists():
+            print(f"Loading {name}...", end=" ", flush=True)
+            audio, sr = load_stem(path)
+            voice_stems[name] = (audio, sr)
+            print(f"{len(audio)/sr:.0f}s @ {sr}Hz")
+
+    if not voice_stems:
+        print("No voice stems found")
+        return
+
+    # Block each stem at its own sample rate so block i covers the same
+    # time window on every track even when the files' rates differ.
+    duration = max(len(audio) / sr for audio, sr in voice_stems.values())
+    print(f"\nTotal duration: {duration/60:.1f} min")
+
+    track_rms = {n: compute_rms_blocks(a, sr) for n, (a, sr) in voice_stems.items()}
+    track_peak = {n: compute_peak_blocks(a, sr) for n, (a, sr) in voice_stems.items()}
+
+    gaps = detect_gaps(track_rms, track_peak)
+    if not gaps:
+        print("No gaps detected")
+        return
+    categories = categorize_gaps(gaps)
+
+    print(f"\n{'='*70}")
+    print(f"GAP ANALYSIS — {len(gaps)} gaps detected")
+    print(f"{'='*70}")
+
+    total_silence = sum(g["dur"] for g in gaps)
+    print(f"Total silence: {total_silence:.0f}s ({total_silence/60:.1f} min)")
+    print(f"Content after removal: ~{(duration - total_silence)/60:.1f} min")
+
+    _print_category_stats(categories)
+
+    print(f"\n{'='*70}")
+    print("SUGGESTED THRESHOLDS")
+    print(f"{'='*70}")
+    _print_suggested_thresholds(categories)
+    _print_current_estimate(gaps, duration)
+
+
+if __name__ == "__main__":
+    if len(sys.argv) < 2:
+        print("Usage: python analyze_gaps.py STEMS_DIR")
+        sys.exit(1)
+    analyze(Path(sys.argv[1]))
diff --git a/backend/main.py b/backend/main.py index 33cd6c1..a4054d0 100644 --- a/backend/main.py +++ b/backend/main.py @@ -8507,6 +8507,9 @@ GENRE_KEYWORDS = { "valentine": "Ballad", "romantic": "Ballad", "ballad": "Ballad", + "irish": "Irish", + "ireland": "Irish", + "patricks": "Irish", } diff --git a/reaper/strip_silence_dialog.lua b/reaper/strip_silence_dialog.lua index d8eac8e..ab0e09d 100644 --- a/reaper/strip_silence_dialog.lua +++ b/reaper/strip_silence_dialog.lua @@ -9,12 +9,15 @@ --------------------------------------------------------------------------- local SILENCE_DB = -30 -- dBFS — anything below this is "silence" local MIN_SILENCE_SEC = 6.0 -- same-speaker gaps: only remove silences longer than this -local MIN_SILENCE_TRANSITION_SEC = 2.5 -- cross-speaker gaps: shorter threshold for speaker transitions +local MAX_SILENCE_SEC = 999 -- no practical limit (IDENT/AD regions protect real breaks) +local MIN_SILENCE_TRANSITION_SEC = 5.0 -- cross-speaker gaps: threshold for caller TTS latency +local MIN_SILENCE_DEVON_SEC = 3.0 -- Devon gaps: interjections are prerendered (~2-3s gaps), conversational TTS is 6s+ +local DEVON_TRACK = 2 -- 1-indexed: Devon track number local MIN_VOICE_SEC = 0.3 -- ignore non-silent bursts shorter than this (filters transients) local KEEP_PAD_SEC = 0.5 -- leave this much silence on each side of a cut local BLOCK_SEC = 0.1 -- analysis block size (100ms) local SAMPLE_RATE = 48000 -local CHECK_TRACKS = {1, 2, 3, 4} -- 1-indexed: Host, 
Devon, Live Caller, AI Caller +local CHECK_TRACKS = {1, 2, 3, 4} -- 1-indexed: Host, Devon, AI Caller, Live Caller local IDENTS_TRACK = 6 -- 1-indexed: Idents track local ADS_TRACK = 7 -- 1-indexed: Ads track local MUSIC_TRACK = 8 -- 1-indexed: Music track @@ -25,7 +28,6 @@ local YIELD_INTERVAL = 200 -- yield to REAPER every N blocks (~20s of audio) local BLOCK_SAMPLES = math.floor(SAMPLE_RATE * BLOCK_SEC) local THRESHOLD = 10 ^ (SILENCE_DB / 20) local MIN_VOICE_BLOCKS = math.ceil(MIN_VOICE_SEC / BLOCK_SEC) - local function log(msg) reaper.ShowConsoleMsg("[PostProd] " .. msg .. "\n") end @@ -306,13 +308,17 @@ local function read_block_peak_rms(ta, project_time) end -- find_loudest_track: returns 1-based index of the loudest track at a given time, or 0 if silent +-- Uses RMS (not peak) for speaker identification — ambient mic noise has high peaks but low RMS local function find_loudest_track(track_audios, project_time) local best_peak = 0 + local best_rms = 0 local best_idx = 0 for i, ta in ipairs(track_audios) do - local peak, _ = read_block_peak_rms(ta, project_time) - if peak > best_peak then - best_peak = peak + local peak, sum_sq = read_block_peak_rms(ta, project_time) + if peak > best_peak then best_peak = peak end + local rms = math.sqrt(sum_sq / BLOCK_SAMPLES) + if rms > best_rms then + best_rms = rms best_idx = i end end @@ -340,12 +346,17 @@ local function find_silences(region, track_audios, rms_acc, progress_fn) while t < region.end_pos do local best_peak = 0 + local best_rms = 0 local best_sum = 0 local best_track = 0 for i, ta in ipairs(track_audios) do local peak, sum_sq = read_block_peak_rms(ta, t) - if peak > best_peak then - best_peak = peak + if peak > best_peak then best_peak = peak end + -- Use RMS for speaker identification (sustained energy, not transient peaks) + -- Host mic ambient noise has high peaks but low RMS; TTS speech has high RMS + local rms = math.sqrt(sum_sq / BLOCK_SAMPLES) + if rms > best_rms then + best_rms = rms best_sum = 
sum_sq best_track = i end @@ -375,8 +386,11 @@ local function find_silences(region, track_audios, rms_acc, progress_fn) local dur = voice_start - silence_start local track_after = voice_run_track local is_transition = track_before_silence ~= 0 and track_after ~= 0 and track_before_silence ~= track_after - local threshold = is_transition and MIN_SILENCE_TRANSITION_SEC or MIN_SILENCE_SEC - if dur >= threshold then + local devon_involved = track_before_silence == DEVON_TRACK or track_after == DEVON_TRACK + local threshold = devon_involved and MIN_SILENCE_DEVON_SEC + or (is_transition and MIN_SILENCE_TRANSITION_SEC or MIN_SILENCE_SEC) + + if dur >= threshold and dur <= MAX_SILENCE_SEC then table.insert(silences, { start_pos = silence_start, end_pos = voice_start, duration = dur, is_transition = is_transition, @@ -410,7 +424,7 @@ local function find_silences(region, track_audios, rms_acc, progress_fn) if in_silence then local dur = region.end_pos - silence_start - if dur >= MIN_SILENCE_SEC then + if dur >= MIN_SILENCE_SEC and dur <= MAX_SILENCE_SEC then table.insert(silences, {start_pos = silence_start, end_pos = region.end_pos, duration = dur}) end end @@ -547,6 +561,7 @@ local function phase1_strip_silence(dialog_regions) if (t + 1) == MUSIC_TRACK then goto next_track end local track = reaper.GetTrack(0, t) + -- Split and delete the silent portion from items that span r.start_pos local item = find_item_at(track, r.start_pos) if item then local right = reaper.SplitMediaItem(item, r.start_pos) @@ -556,10 +571,36 @@ local function phase1_strip_silence(dialog_regions) end end + -- Handle sparse track items that START within the removal range + -- (not found by find_item_at since they don't contain r.start_pos) + for j = reaper.CountTrackMediaItems(track) - 1, 0, -1 do + local check = reaper.GetTrackMediaItem(track, j) + local cpos = reaper.GetMediaItemInfo_Value(check, "D_POSITION") + if cpos >= r.start_pos and cpos < r.end_pos then + local clen = 
reaper.GetMediaItemInfo_Value(check, "D_LENGTH") + local cend = cpos + clen + if cend <= r.end_pos then + -- Entirely within removal — delete + reaper.DeleteTrackMediaItem(track, check) + else + -- Starts in removal but extends past — trim start to r.end_pos + local trim = r.end_pos - cpos + local take = reaper.GetActiveTake(check) + if take then + local offset = reaper.GetMediaItemTakeInfo_Value(take, "D_STARTOFFS") + reaper.SetMediaItemTakeInfo_Value(take, "D_STARTOFFS", offset + trim) + end + reaper.SetMediaItemInfo_Value(check, "D_LENGTH", cend - r.end_pos) + reaper.SetMediaItemInfo_Value(check, "D_POSITION", r.end_pos) + end + end + end + + -- Shift items AFTER the removal (use r.end_pos, not r.start_pos) for j = 0, reaper.CountTrackMediaItems(track) - 1 do local shift_item = reaper.GetTrackMediaItem(track, j) local pos = reaper.GetMediaItemInfo_Value(shift_item, "D_POSITION") - if pos >= r.start_pos then + if pos >= r.end_pos then reaper.SetMediaItemInfo_Value(shift_item, "D_POSITION", pos - remove_len) end end @@ -766,6 +807,56 @@ local function phase3_trim_music() local music_track = reaper.GetTrack(0, MUSIC_TRACK - 1) if not music_track then return end + -- Ensure music starts before first voice item. + -- Silence removal shifts voice/idents/ads but not music. If voice now starts before + -- music, nudge all non-music tracks forward so music has a lead-in. 
+ local first_voice_start = math.huge + for _, tidx in ipairs(CHECK_TRACKS) do + local tr = reaper.GetTrack(0, tidx - 1) + if tr and reaper.CountTrackMediaItems(tr) > 0 then + local item = reaper.GetTrackMediaItem(tr, 0) + local pos = reaper.GetMediaItemInfo_Value(item, "D_POSITION") + if pos < first_voice_start then first_voice_start = pos end + end + end + + local MUSIC_LEAD_SEC = 3.0 -- seconds of music before first voice + if first_voice_start < math.huge then + local first_music = reaper.GetTrackMediaItem(music_track, 0) + if first_music then + local music_start = reaper.GetMediaItemInfo_Value(first_music, "D_POSITION") + local desired_voice_start = music_start + MUSIC_LEAD_SEC + if first_voice_start < desired_voice_start then + local nudge = desired_voice_start - first_voice_start + -- Shift all non-music tracks forward + for t = 0, reaper.CountTracks(0) - 1 do + if (t + 1) == MUSIC_TRACK then goto skip_music end + local track = reaper.GetTrack(0, t) + for i = 0, reaper.CountTrackMediaItems(track) - 1 do + local item = reaper.GetTrackMediaItem(track, i) + local pos = reaper.GetMediaItemInfo_Value(item, "D_POSITION") + reaper.SetMediaItemInfo_Value(item, "D_POSITION", pos + nudge) + end + ::skip_music:: + end + -- Also shift all markers/regions forward + local _, num_markers, num_regions = reaper.CountProjectMarkers(0) + local total_m = num_markers + num_regions + for i = 0, total_m - 1 do + local retval, is_region, pos, rgnend, name, idx, color = reaper.EnumProjectMarkers3(0, i) + if retval then + if is_region then + reaper.SetProjectMarker3(0, idx, true, pos + nudge, rgnend + nudge, name, color) + else + reaper.SetProjectMarker3(0, idx, false, pos + nudge, 0, name, color) + end + end + end + log("Phase 3: Nudged non-music tracks forward " .. string.format("%.1f", nudge) .. "s for " .. MUSIC_LEAD_SEC .. "s music lead-in") + end + end + end + local last_end = 0 for _, tidx in ipairs(CHECK_TRACKS) do local tr = reaper.GetTrack(0, tidx - 1)