f3c91fc385
- Devon: more conversational when addressed directly (500 tokens, 3-5 sentences) - Devon: monitor prompt rewritten to encourage more contributions - Devon: polling interval 15s (was 30s), removed 2-message minimum - Whisper: no fuzzy name matching for 3-char names, require first letter match - fetch_music.py: post-fetch vocal detection filter using musicinfo tags - scan_music_vocals.py: new script to scan existing library for vocal tracks Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
123 lines
3.6 KiB
Python
123 lines
3.6 KiB
Python
"""Scan music directory for tracks that contain vocals/lyrics.
|
|
|
|
Uses Whisper to transcribe a sample from each track — if it picks up
actual words, the track likely has vocals.

Usage:
    python scan_music_vocals.py           # scan and report
    python scan_music_vocals.py --delete  # scan and delete vocal tracks
|
|
"""
|
|
|
|
import argparse
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
import librosa
|
|
import numpy as np
|
|
from faster_whisper import WhisperModel
|
|
|
|
# Folder named "music" located next to this script; main() scans its entries.
MUSIC_DIR = Path(__file__).parent / "music"

# Model identifier handed to WhisperModel when the scanner starts.
WHISPER_MODEL = "distil-large-v3"

# Phrases Whisper hallucinates on silence/instrumental audio. A transcription
# that consists solely of one of these is ignored rather than counted as lyrics.
HALLUCINATION_PHRASES = {
    "thank you",
    "thanks for watching",
    "subscribe",
    "like and subscribe",
    "please subscribe",
    "thank you for watching",
    "thanks for listening",
    "you",
    "the end",
    "bye",
    "okay",
}
|
|
|
|
|
|
def scan_track(model: WhisperModel, filepath: Path) -> tuple[bool, str]:
    """Check a single track for vocals.

    The file is loaded as 16 kHz mono audio, up to 30 seconds centred on the
    middle of the track are transcribed, and three or more transcribed words
    (after discarding known hallucination phrases) count as vocals.

    Args:
        model: Whisper model used to transcribe the sample.
        filepath: Audio file to inspect.

    Returns:
        ``(has_vocals, text)``. ``text`` is the kept transcription, an empty
        string when everything was discarded as a hallucination or was too
        short to judge, or a bracketed note (``"[load error: ...]"`` /
        ``"[too short]"``) when the track could not be analysed.
    """
    try:
        audio, sr = librosa.load(str(filepath), sr=16000, mono=True)
    except Exception as e:
        # Best effort: an unreadable file is reported, not allowed to abort the scan.
        return False, f"[load error: {e}]"

    if len(audio) / sr < 10:
        return False, "[too short]"

    # Sample 30s from the middle (most likely to have vocals): 15s each side.
    mid = len(audio) // 2
    half_window = int(15 * sr)
    sample = audio[max(0, mid - half_window):min(len(audio), mid + half_window)]

    segments, _ = model.transcribe(
        sample,
        beam_size=3,
        language="en",
        vad_filter=True,
        vad_parameters={"min_speech_duration_ms": 500},
    )

    # Drop hallucinated segments individually. Comparing only the whole raw
    # transcription misses output such as "Thank you. Thank you." or
    # "Thanks for watching!", because punctuation and repetition keep it from
    # matching any entry in HALLUCINATION_PHRASES.
    kept = []
    for segment in segments:
        piece = segment.text.strip()
        if piece and _normalize_phrase(piece) not in HALLUCINATION_PHRASES:
            kept.append(piece)
    text = " ".join(kept)

    # A phrase split across segments can still be a hallucination as a whole;
    # very short leftovers are not enough evidence either way.
    if _normalize_phrase(text) in HALLUCINATION_PHRASES or len(text) < 4:
        return False, ""

    # If Whisper found substantial text, it's likely vocals.
    has_vocals = len(text.split()) >= 3
    return has_vocals, text


def _normalize_phrase(text: str) -> str:
    """Lower-case *text*, replace punctuation with spaces, collapse whitespace."""
    cleaned = "".join(
        ch if ch.isalnum() or ch.isspace() else " " for ch in text.lower()
    )
    return " ".join(cleaned.split())
|
|
|
|
|
|
def main():
    """Scan every audio file in MUSIC_DIR and report, or delete, vocal tracks.

    Command line:
        --delete  Remove the tracks that were flagged as containing vocals.

    Progress and results are printed to stdout. Tracks that could not be
    analysed (load error, too short) are reported as skipped instead of "OK",
    and a failed deletion is reported without aborting the remaining ones.
    """
    parser = argparse.ArgumentParser(description="Scan music for vocal tracks")
    parser.add_argument("--delete", action="store_true", help="Delete tracks with vocals")
    args = parser.parse_args()

    # iterdir() raises FileNotFoundError on a missing directory; report it instead.
    if not MUSIC_DIR.is_dir():
        print(f"Music directory not found: {MUSIC_DIR}")
        return

    audio_suffixes = {".mp3", ".wav", ".ogg", ".flac"}
    audio_files = sorted(
        f for f in MUSIC_DIR.iterdir()
        # is_file() keeps directories that happen to carry an audio suffix out.
        if f.is_file() and f.suffix.lower() in audio_suffixes
    )

    if not audio_files:
        print("No audio files found in music/")
        return

    print(f"Loading Whisper {WHISPER_MODEL}...")
    model = WhisperModel(WHISPER_MODEL, device="cpu", compute_type="int8")

    total = len(audio_files)
    print(f"Scanning {total} tracks for vocals...\n")

    vocal_tracks = []
    skipped = 0
    for i, f in enumerate(audio_files, 1):
        print(f"[{i}/{total}] {f.name}...", end=" ", flush=True)
        has_vocals, text = scan_track(model, f)
        if has_vocals:
            print(f"VOCALS: {text[:80]}")
            vocal_tracks.append((f, text))
        elif text.startswith(("[load error", "[too short")):
            # scan_track's diagnostic notes: the track was never analysed,
            # so it must not be presented as a clean instrumental.
            print(f"SKIPPED {text}")
            skipped += 1
        else:
            print("OK")

    print(f"\n{'='*60}")
    print(f"Results: {len(vocal_tracks)} tracks with vocals out of {total}\n")
    if skipped:
        print(f"{skipped} tracks could not be analysed (see SKIPPED above).\n")

    if not vocal_tracks:
        if skipped:
            print("No vocals detected in the tracks that could be analysed.")
        else:
            print("All tracks appear to be instrumental!")
        return

    for f, text in vocal_tracks:
        print(f"  {f.name}")
        print(f"    Lyrics: {text[:120]}")
        print()

    if args.delete:
        print(f"Deleting {len(vocal_tracks)} vocal tracks...")
        failed = 0
        for f, _ in vocal_tracks:
            try:
                f.unlink()
            except OSError as e:
                # Keep going: one locked or missing file should not stop the rest.
                failed += 1
                print(f"  Failed to delete {f.name}: {e}")
            else:
                print(f"  Deleted: {f.name}")
        if failed:
            print(f"Done ({failed} could not be deleted).")
        else:
            print("Done.")
    else:
        print("Run with --delete to remove these tracks.")


# Run the scan only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|