# ai-podcast/fetch_music.py

"""Fetch instrumental background music from Jamendo for the radio show.

Pixabay has no public music API — this uses Jamendo's free API instead.
All tracks are Creative Commons licensed. Attribution is saved to music/CREDITS.txt.

Setup: Get a free client_id at https://devportal.jamendo.com
       Add JAMENDO_CLIENT_ID=your_id to .env

Usage:
    python fetch_music.py                    # download 20 tracks across all genres
    python fetch_music.py --genre jazz       # download jazz only
    python fetch_music.py --count 50         # download 50 tracks
    python fetch_music.py --list             # just list available tracks, don't download
"""

import argparse
import os
import re
import sys
from pathlib import Path

import httpx
from dotenv import load_dotenv

# Load settings from a local .env file up front; the module docstring asks
# for JAMENDO_CLIENT_ID to be placed there, and get_client_id() reads it
# from the environment later.
load_dotenv()

# Root of the Jamendo REST API; request URLs are built on top of it.
API_BASE = "https://api.jamendo.com/v3.0"

# Downloads go into a "music" folder next to this script, and the
# attribution log is kept inside that folder.
MUSIC_DIR = Path(__file__).parent.joinpath("music")
CREDITS_FILE = MUSIC_DIR.joinpath("CREDITS.txt")

# Map search tags to labels that _detect_genre() in main.py can match
# jazz, blues, funk, lo-fi are already in GENRE_KEYWORDS
# ambient, acoustic, chill would need to be added for auto-detection
GENRE_LABELS = dict(
    jazz="Jazz",
    lofi="Lo-Fi",
    blues="Blues",
    ambient="Ambient",
    acoustic="Acoustic",
    funk="Funk",
    chill="Chill",
)

# Genres good for a late-night radio show. These are exactly the search tags
# above, in declaration order.
GENRES = list(GENRE_LABELS)


def get_client_id():
    """Return the Jamendo client id taken from the environment.

    Reads the ``JAMENDO_CLIENT_ID`` environment variable. When it is unset
    or empty, setup instructions are printed and the process exits with
    status 1 instead of returning.
    """
    client_id = os.environ.get("JAMENDO_CLIENT_ID")
    if client_id:
        return client_id

    # Nothing usable configured: tell the user how to fix it, then stop.
    for hint in (
        "Error: JAMENDO_CLIENT_ID not found in .env",
        "Get one free at https://devportal.jamendo.com",
    ):
        print(hint)
    sys.exit(1)


# Characters removed from file names: < > : " / \ | ? *
_UNSAFE_FILENAME_CHARS = re.compile(r'[<>:"/\\|?*]')


def sanitize_filename(name: str) -> str:
    """Return *name* with unsafe file-name characters removed.

    Every occurrence of ``< > : " / \\ | ? *`` is deleted (not replaced),
    then leading and trailing whitespace is trimmed.
    """
    without_unsafe = _UNSAFE_FILENAME_CHARS.sub('', name)
    return without_unsafe.strip()


# Lower-cased tag values that mark a track as containing vocals.
_VOCAL_TAGS = frozenset({
    "vocals", "vocal", "singing", "singer", "voice", "lyrics",
    "rap", "hiphop", "hip-hop", "spoken", "spoken word",
})

# "feat." / "ft." credit markers. The leading word boundary keeps ordinary
# words that merely end in those letters ("soft.", "defeat.") from matching.
_FEATURING_RE = re.compile(r"\b(?:feat|ft)\.")

# Other title fragments that give away a vocal version. "remix vocal" is not
# listed separately because any title containing it also contains "vocal".
_VOCAL_NAME_HINTS = ("vocal", "(voice")


def _has_vocals(track: dict) -> bool:
    """Check musicinfo for vocal indicators — catches tracks Jamendo mis-tagged as instrumental.

    Three signals are checked, in order:

    1. ``musicinfo["vocalinstrumental"]`` equals ``"vocal"`` (case-insensitive).
    2. Any tag under ``musicinfo["tags"]`` is a known vocal tag. ``tags`` may
       be a dict of lists (e.g. genres / instruments / vartags) or a flat list.
    3. The track name contains a "feat." / "ft." credit or a vocal hint.

    Missing, ``None`` or unexpectedly typed fields are treated as "no signal"
    rather than raising.

    Args:
        track: One track record as returned by the Jamendo tracks endpoint.

    Returns:
        True if any vocal indicator is found, otherwise False.
    """
    info = track.get("musicinfo")
    if not isinstance(info, dict):
        info = {}

    # Check the vocalinstrumental field in musicinfo (separate from the API filter)
    declared = info.get("vocalinstrumental")
    if isinstance(declared, str) and declared.lower() == "vocal":
        return True

    # Normalise both accepted tag layouts to a list of tag lists.
    tags = info.get("tags")
    if isinstance(tags, dict):
        tag_groups = [group for group in tags.values() if isinstance(group, list)]
    elif isinstance(tags, list):
        tag_groups = [tags]
    else:
        tag_groups = []
    if any(
        isinstance(tag, str) and tag.lower() in _VOCAL_TAGS
        for group in tag_groups
        for tag in group
    ):
        return True

    # Check track name for vocal giveaways
    title = str(track.get("name") or "").lower()
    if _FEATURING_RE.search(title):
        return True
    return any(hint in title for hint in _VOCAL_NAME_HINTS)


def search_tracks(client: httpx.Client, client_id: str, genre: str, limit: int = 20) -> list[dict]:
    """Query Jamendo for instrumental tracks of one genre and drop vocal false positives.

    The API is asked for up to three times ``limit`` results (capped at 200)
    so that tracks rejected by ``_has_vocals`` can be replaced by later
    results. Results are consumed in the order the API returns them
    (requested as ``popularity_total``).

    Args:
        client: HTTP client used for the request.
        client_id: Jamendo API client id.
        genre: Tag passed to the API as ``fuzzytags``.
        limit: Maximum number of tracks to return.

    Returns:
        At most ``limit`` track dicts. An empty list is returned when the API
        response does not report ``status == "success"``.

    Raises:
        Whatever ``client.get`` / ``raise_for_status`` raise on transport or
        HTTP-status failures.
    """
    # Request more than needed so we can filter out vocal false positives
    # NOTE(review): 200 is presumably Jamendo's per-request maximum — confirm.
    fetch_limit = min(limit * 3, 200)
    params = {
        "client_id": client_id,
        "format": "json",
        "limit": fetch_limit,
        "vocalinstrumental": "instrumental",
        "fuzzytags": genre,
        # NOTE(review): presumably a duration window in seconds (1-5 min) — confirm.
        "durationbetween": "60_300",
        "include": "musicinfo+licenses",
        "order": "popularity_total",
    }

    resp = client.get(f"{API_BASE}/tracks/", params=params)
    resp.raise_for_status()
    data = resp.json()

    # Tolerate a payload without the usual envelope instead of raising KeyError.
    headers = data.get("headers") or {}
    if headers.get("status") != "success":
        print(f"  API error: {headers.get('error_message', 'unknown')}")
        return []

    # Post-filter: reject tracks with vocal indicators despite the API filter
    filtered = []
    skipped = 0
    for t in data.get("results", []):
        # Stop before examining more tracks than needed; this also makes a
        # non-positive limit return nothing.
        if len(filtered) >= limit:
            break
        if _has_vocals(t):
            print(f"    SKIP (vocals detected): {t.get('artist_name', '?')} - {t.get('name', '?')}")
            skipped += 1
            continue
        filtered.append(t)

    # Report only tracks that were actually rejected, not the surplus results
    # that were never looked at once the limit was reached.
    if skipped:
        print(f"  (filtered out {skipped} tracks with vocal indicators)")
    return filtered


def make_filename(track: dict, genre_tag: str) -> str:
    """Build the ``.mp3`` file name for *track*.

    The name is ``"<artist> - <title>.mp3"`` with both parts passed through
    ``sanitize_filename``. A ``" [<Label>]"`` suffix is inserted before the
    extension unless the genre tag or its label already appears
    (case-insensitively for the label) in the artist/title text.

    Args:
        track: Track record; ``artist_name`` and ``name`` are read, with
            ``"Unknown"`` / ``"Untitled"`` as fallbacks when the keys are absent.
        genre_tag: Search tag the track was found under.

    Returns:
        The file name, without any directory part.
    """
    artist = sanitize_filename(track.get("artist_name", "Unknown"))
    title = sanitize_filename(track.get("name", "Untitled"))
    label = GENRE_LABELS.get(genre_tag, genre_tag.title())

    stem = f"{artist} - {title}"
    searchable = f"{artist} {title}".lower()

    # Genre already detectable from artist/title: no explicit tag required.
    if genre_tag in searchable or label.lower() in searchable:
        return f"{stem}.mp3"
    return f"{stem} [{label}].mp3"


def download_track(client: httpx.Client, track: dict, filepath: Path, index: int, total: int) -> bool:
    """Download one track's audio to *filepath* and print a progress line.

    The payload is first written to a sibling ``<name>.part`` file and then
    moved onto *filepath*, so an interrupted write never leaves a truncated
    file at the final path (which a later run would treat as already
    downloaded).

    Args:
        client: HTTP client used for the download.
        track: Track record; ``audiodownload``, ``audiodownload_allowed``,
            ``name`` and ``duration`` are read.
        filepath: Destination path of the audio file.
        index: 1-based position of this track, used only for progress output.
        total: Total number of tracks, used only for progress output.

    Returns:
        True if the file was written; False if the track was skipped because
        it has no download URL or downloading is not allowed.

    Raises:
        Whatever ``client.get`` / ``raise_for_status`` raise on transport or
        HTTP-status failures, and ``OSError`` if the file cannot be written.
    """
    # .get() keeps the skip messages working even for records without a name.
    title = track.get("name", "?")

    url = track.get("audiodownload")
    if not url:
        print(f"  [{index}/{total}] SKIP (no download URL): {title}")
        return False

    if not track.get("audiodownload_allowed", True):
        print(f"  [{index}/{total}] SKIP (download not allowed): {title}")
        return False

    print(f"  [{index}/{total}] Downloading: {filepath.name}...", end=" ", flush=True)
    resp = client.get(url, follow_redirects=True)
    resp.raise_for_status()
    payload = resp.content

    # Write to a temporary sibling, then move it into place in one step.
    part_path = filepath.with_name(filepath.name + ".part")
    try:
        part_path.write_bytes(payload)
        part_path.replace(filepath)
    finally:
        # No-op after a successful replace; removes the leftover otherwise.
        part_path.unlink(missing_ok=True)

    size_mb = len(payload) / (1024 * 1024)
    # A missing/None duration must not raise after the file is already saved.
    dur = int(track.get("duration") or 0)
    print(f"{size_mb:.1f} MB, {dur // 60}:{dur % 60:02d}")
    return True


def save_credit(track: dict, filename: str):
    """Append an attribution line for *track* to the credits file.

    Each entry has the form
    ``<filename> | <artist> - <title> | <license url> | <share url>``.
    A two-line comment header is written first when the credits file is
    missing or empty. Nothing is written if an entry for exactly this
    *filename* already exists.

    The file is always read and written as UTF-8 so non-ASCII artist or track
    names do not depend on the platform's default encoding.

    Args:
        track: Track record; ``artist_name``, ``name``, ``license_ccurl`` and
            ``shareurl`` are read, each with a fallback when absent.
        filename: File name the track was saved under.
    """
    artist = track.get("artist_name", "Unknown")
    title = track.get("name", "Untitled")
    license_url = track.get("license_ccurl", "")
    share_url = track.get("shareurl", "")

    line = f"{filename} | {artist} - {title} | {license_url} | {share_url}\n"

    CREDITS_FILE.parent.mkdir(parents=True, exist_ok=True)
    # errors="replace" lets a credits file written earlier in another
    # encoding still be scanned for duplicates instead of raising.
    existing = (
        CREDITS_FILE.read_text(encoding="utf-8", errors="replace")
        if CREDITS_FILE.exists()
        else ""
    )

    # Match on the whole first field. A plain substring test would treat
    # "Artist - Song.mp3" as already credited whenever
    # "Other Artist - Song.mp3" is present.
    entry_prefix = f"{filename} | "
    if any(row.startswith(entry_prefix) for row in existing.splitlines()):
        return

    with open(CREDITS_FILE, "a", encoding="utf-8") as f:
        if not existing:
            f.write("# Music Credits (Jamendo - Creative Commons)\n")
            f.write("# File | Artist - Title | License | URL\n\n")
        elif not existing.endswith("\n"):
            # Keep one entry per line even if the file lacks a final newline.
            f.write("\n")
        f.write(line)


def _split_count(total: int, buckets: int) -> list[int]:
    """Split *total* into *buckets* non-negative integers that sum to *total*.

    Earlier buckets receive the leftover units, e.g. ``_split_count(3, 7)``
    is ``[1, 1, 1, 0, 0, 0, 0]``.
    """
    base, extra = divmod(total, buckets)
    return [base + (1 if i < extra else 0) for i in range(buckets)]


def _collect_tracks(client_id: str, genres: list[str], limits: list[int]) -> list[dict]:
    """Search each genre and return the de-duplicated tracks, tagged with ``_genre_tag``."""
    collected = []
    seen_ids = set()

    with httpx.Client(timeout=30) as api_client:
        for genre, limit in zip(genres, limits):
            if limit <= 0:
                continue
            print(f"Searching {genre}...", end=" ", flush=True)
            try:
                tracks = search_tracks(api_client, client_id, genre, limit)
            except httpx.HTTPError as exc:
                # One failing search should not abort the remaining genres.
                print(f"FAILED ({exc})")
                continue

            # Deduplicate across genres
            added = 0
            for t in tracks:
                if added >= limit:
                    break
                if t["id"] in seen_ids:
                    continue
                t["_genre_tag"] = genre
                collected.append(t)
                seen_ids.add(t["id"])
                added += 1
            print(f"{added} tracks")

    return collected


def _print_track_table(tracks: list[dict]) -> None:
    """Print the ``--list`` table: index, genre label, artist, title and duration."""
    print(f"\n{'#':<4} {'Genre':<10} {'Artist':<25} {'Title':<40} {'Duration':<8}")
    print("-" * 90)
    for i, t in enumerate(tracks, 1):
        seconds = int(t.get("duration") or 0)
        dur = f"{seconds // 60}:{seconds % 60:02d}"
        # Truncate so long names do not break the column layout.
        artist = str(t.get("artist_name", "Unknown"))[:24]
        title = str(t.get("name", "Untitled"))[:39]
        label = GENRE_LABELS.get(t["_genre_tag"], t["_genre_tag"])
        print(f"{i:<4} {label:<10} {artist:<25} {title:<40} {dur:<8}")
    print(f"\n{len(tracks)} tracks available")


def _download_all(tracks: list[dict]) -> None:
    """Download every track not already on disk, record credits, and print a summary."""
    downloaded = 0
    skipped_exists = 0
    skipped_error = 0
    total = len(tracks)

    with httpx.Client(timeout=120, follow_redirects=True) as dl_client:
        for i, track in enumerate(tracks, 1):
            filename = make_filename(track, track["_genre_tag"])
            filepath = MUSIC_DIR / filename

            if filepath.exists():
                print(f"  [{i}/{total}] EXISTS: {filename}")
                skipped_exists += 1
                continue

            try:
                if download_track(dl_client, track, filepath, i, total):
                    save_credit(track, filename)
                    downloaded += 1
                else:
                    skipped_error += 1
            except Exception as e:
                # Deliberately broad: this is the per-track boundary, and one
                # bad track must not stop the rest of the batch.
                print(f"  [{i}/{total}] ERROR: {e}")
                # Clean up partial download
                if filepath.exists():
                    filepath.unlink()
                skipped_error += 1

    print(f"\nDone: {downloaded} downloaded, {skipped_exists} existed, {skipped_error} skipped")


def main():
    """Command-line entry point: search Jamendo, then list or download tracks.

    ``--count`` is the total across all selected genres. It is split as
    evenly as possible, with earlier genres receiving the leftover tracks,
    so a count smaller than the number of genres searches only the first
    ``count`` genres instead of fetching one track from every genre.

    With ``--list`` the matching tracks are printed and nothing is written
    to disk. Otherwise tracks are downloaded into ``MUSIC_DIR`` and each
    successful download is recorded in the credits file.
    """
    parser = argparse.ArgumentParser(description="Download instrumental music from Jamendo")
    parser.add_argument("--genre", choices=GENRES, help="Download only this genre")
    parser.add_argument("--count", type=int, default=20, help="Total tracks to download (default: 20)")
    parser.add_argument("--list", action="store_true", help="List available tracks without downloading")
    args = parser.parse_args()

    if args.count < 1:
        parser.error("--count must be at least 1")

    client_id = get_client_id()

    genres = [args.genre] if args.genre else GENRES
    limits = _split_count(args.count, len(genres))
    all_tracks = _collect_tracks(client_id, genres, limits)

    if not all_tracks:
        print("No tracks found.")
        return

    if args.list:
        _print_track_table(all_tracks)
        return

    # Download phase. The directory is only created when files will be written.
    MUSIC_DIR.mkdir(exist_ok=True)
    _download_all(all_tracks)


# Run the downloader only when this file is executed as a script, so that
# importing the module does not parse arguments or start any downloads.
if __name__ == "__main__":
    main()