commit 029ce6d68948a8c4b6adb38a1175b88fee77c657 Author: tcpsyn Date: Wed Feb 4 23:11:20 2026 -0700 Initial commit: AI Radio Show web application - FastAPI backend with multiple TTS providers (Inworld, ElevenLabs, Kokoro, F5-TTS, etc.) - Web frontend with caller management, music, and soundboard - Whisper transcription integration - OpenRouter/Ollama LLM support - Castopod podcast publishing script Co-Authored-By: Claude Opus 4.5 diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..f22713a --- /dev/null +++ b/.gitignore @@ -0,0 +1,54 @@ +# Environment +.env +*.env + +# Python +__pycache__/ +*.py[cod] +*$py.class +.venv/ +venv/ +env/ +*.egg-info/ + +# Audio/Media (large files) +*.mp3 +*.wav +*.m4a +*.ogg + +# Sessions +sessions/ + +# IDE +.idea/ +.vscode/ +*.swp +*.swo + +# OS +.DS_Store +Thumbs.db + +# Whisper models (downloaded automatically) +*.pt + +# Temporary +*.tmp +*.log + +# Large model files (download separately) +*.onnx +*.safetensors +*.tar.bz2 +*.bin +models/ +asset/ +kokoro-v1.0.onnx +voices-v1.0.bin + +# Reference voices for TTS +ref_audio/ + +# Claude settings (local) +.claude/ diff --git a/audio_settings.json b/audio_settings.json new file mode 100644 index 0000000..8e8fc3a --- /dev/null +++ b/audio_settings.json @@ -0,0 +1,9 @@ +{ + "input_device": 13, + "input_channel": 1, + "output_device": 13, + "caller_channel": 3, + "music_channel": 5, + "sfx_channel": 7, + "phone_filter": false +} \ No newline at end of file diff --git a/backend/__init__.py b/backend/__init__.py new file mode 100644 index 0000000..7f83169 --- /dev/null +++ b/backend/__init__.py @@ -0,0 +1 @@ +# Backend package diff --git a/backend/config.py b/backend/config.py new file mode 100644 index 0000000..8bbaa5e --- /dev/null +++ b/backend/config.py @@ -0,0 +1,41 @@ +"""Configuration settings for the AI Radio Show backend""" + +import os +from pathlib import Path +from pydantic_settings import BaseSettings +from dotenv import load_dotenv + +# Load .env from parent 
directory +load_dotenv(Path(__file__).parent.parent / ".env") + + +class Settings(BaseSettings): + # API Keys + elevenlabs_api_key: str = os.getenv("ELEVENLABS_API_KEY", "") + openrouter_api_key: str = os.getenv("OPENROUTER_API_KEY", "") + inworld_api_key: str = os.getenv("INWORLD_API_KEY", "") + + # LLM Settings + llm_provider: str = "openrouter" # "openrouter" or "ollama" + openrouter_model: str = "anthropic/claude-3-haiku" + ollama_model: str = "llama3.2" + ollama_host: str = "http://localhost:11434" + + # TTS Settings + tts_provider: str = "kokoro" # "kokoro", "elevenlabs", "vits", or "bark" + + # Audio Settings + sample_rate: int = 24000 + + # Paths + base_dir: Path = Path(__file__).parent.parent + sounds_dir: Path = base_dir / "sounds" + music_dir: Path = base_dir / "music" + sessions_dir: Path = base_dir / "sessions" + + class Config: + env_file = ".env" + extra = "ignore" + + +settings = Settings() diff --git a/backend/main.py b/backend/main.py new file mode 100644 index 0000000..c86905f --- /dev/null +++ b/backend/main.py @@ -0,0 +1,787 @@ +"""AI Radio Show - Control Panel Backend""" + +import uuid +import asyncio +from pathlib import Path +from fastapi import FastAPI, HTTPException, UploadFile, File +from fastapi.staticfiles import StaticFiles +from fastapi.responses import FileResponse +from fastapi.middleware.cors import CORSMiddleware +from pydantic import BaseModel +from typing import Optional + +from .config import settings +from .services.transcription import transcribe_audio +from .services.llm import llm_service +from .services.tts import generate_speech +from .services.audio import audio_service + +app = FastAPI(title="AI Radio Show") + +app.add_middleware( + CORSMiddleware, + allow_origins=["*"], + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], +) + +# --- Callers --- +# Base caller info (name, voice) - backgrounds generated dynamically per session +import random + +CALLER_BASES = { + "1": {"name": "Tony", "voice": 
"VR6AewLTigWG4xSOukaG", "gender": "male", "age_range": (35, 55)}, + "2": {"name": "Jasmine", "voice": "jBpfuIE2acCO8z3wKNLl", "gender": "female", "age_range": (25, 38)}, + "3": {"name": "Rick", "voice": "TxGEqnHWrfWFTfGW9XjX", "gender": "male", "age_range": (40, 58)}, + "4": {"name": "Megan", "voice": "EXAVITQu4vr4xnSDxMaL", "gender": "female", "age_range": (24, 35)}, + "5": {"name": "Dennis", "voice": "pNInz6obpgDQGcFmaJgB", "gender": "male", "age_range": (32, 48)}, + "6": {"name": "Tanya", "voice": "21m00Tcm4TlvDq8ikWAM", "gender": "female", "age_range": (30, 45)}, + "7": {"name": "Earl", "voice": "ODq5zmih8GrVes37Dizd", "gender": "male", "age_range": (58, 72)}, + "8": {"name": "Carla", "voice": "XB0fDUnXU5powFXDhCwa", "gender": "female", "age_range": (38, 52)}, + "9": {"name": "Marcus", "voice": "IKne3meq5aSn9XLyUdCD", "gender": "male", "age_range": (24, 34)}, + "0": {"name": "Brenda", "voice": "pFZP5JQG7iQjIQuC4Bku", "gender": "female", "age_range": (45, 60)}, +} + +# Background components for dynamic generation +JOBS_MALE = [ + "runs a small HVAC business", "works as a long-haul trucker", "is a high school football coach", + "works construction, mostly commercial jobs", "is a paramedic", "manages a warehouse", + "is a line cook at a decent restaurant", "works IT for the city", "is a union electrician", + "owns a small landscaping company", "is a cop, 12 years on the force", "works at a car dealership", + "is a freelance photographer", "teaches middle school history", "is a firefighter", + "works as a hospital security guard", "runs a food truck", "is a session musician", + "works at a brewery", "is a physical therapist", "drives for UPS", "is a tattoo artist", + "works in insurance, hates it", "is a youth pastor", "manages a gym", +] + +JOBS_FEMALE = [ + "works as an ER nurse", "is a social worker", "runs a small bakery", "is a dental hygienist", + "works in HR for a hospital", "is a real estate agent", "teaches kindergarten", + "works as a bartender at a nice 
place", "is a paralegal", "runs a daycare out of her home", + "works retail management", "is a hairstylist, owns her chair", "is a vet tech", + "works in hospital billing", "is a massage therapist", "manages a restaurant", + "is a flight attendant", "works as a 911 dispatcher", "is a personal trainer", + "works at a nonprofit", "is an accountant at a small firm", "does medical transcription from home", + "is a court reporter", "works in pharmaceutical sales", "is a wedding planner", +] + +PROBLEMS = [ + # Family drama + "hasn't talked to their father in years and just got a call that he's dying", + "found out they were adopted and doesn't know how to process it", + "is being pressured to take care of an aging parent who was never there for them", + "just discovered a family secret that changes everything they thought they knew", + "has a sibling who's destroying themselves and nobody will intervene", + "is estranged from their kids and it's killing them", + "found out their parent had a whole other family nobody knew about", + "is watching their parents' marriage fall apart after 40 years", + + # Career and purpose + "woke up and realized they've been in the wrong career for 15 years", + "got passed over for a promotion they deserved and is questioning everything", + "has a dream they gave up on years ago and it's haunting them", + "is successful on paper but feels completely empty inside", + "hates their job but can't afford to leave and it's breaking them", + "just got fired and doesn't know who they are without their work", + "is being asked to do something unethical at work and doesn't know what to do", + "watches their boss take credit for everything and is losing their mind", + + # Mental health and inner struggles + "has been putting on a brave face but is barely holding it together", + "can't shake the feeling that their best years are behind them", + "keeps self-sabotaging every good thing in their life and doesn't know why", + "has been numb for months 
and is starting to scare themselves", + "can't stop comparing themselves to everyone else and it's destroying them", + "has intrusive thoughts they've never told anyone about", + "feels like a fraud and is waiting to be found out", + "is exhausted from being the strong one for everyone else", + + # Grief and loss + "lost someone close and hasn't really dealt with it", + "is grieving someone who's still alive but is no longer the person they knew", + "never got closure with someone who died and it's eating at them", + "is watching their best friend slowly die and doesn't know how to be there", + "had a miscarriage nobody knows about and carries it alone", + + # Regrets and past mistakes + "made a choice years ago that changed everything and wonders what if", + "hurt someone badly and never apologized, and it haunts them", + "let the one that got away go and thinks about them constantly", + "gave up on something important to make someone else happy and resents it", + "said something they can never take back and the guilt won't fade", + "was a bully growing up and is finally reckoning with it", + + # Relationships (non-sexual) + "is falling out of love with their spouse and doesn't know what to do", + "married the wrong person and everyone knows it but them", + "feels invisible in their own relationship", + "is staying for the kids but dying inside", + "realized they don't actually like their partner as a person", + "is jealous of their partner's success and it's poisoning everything", + "found out their partner has been lying about something big", + + # Friendship and loneliness + "realized they don't have any real friends, just people who need things from them", + "had a falling out with their best friend and the silence is deafening", + "is surrounded by people but has never felt more alone", + "is jealous of a friend's life and hates themselves for it", + "suspects a close friend is talking shit behind their back", + + # Big life decisions + "is thinking about 
leaving everything behind and starting over somewhere new", + "has to make a choice that will hurt someone no matter what", + "is being pressured into something they don't want but can't say no", + "has been offered an opportunity that would change everything but they're terrified", + "knows they need to end something but can't pull the trigger", + + # Addiction and bad habits + "is hiding how much they drink from everyone", + "can't stop gambling and is in deeper than anyone knows", + "is watching themselves become someone they don't recognize", + "keeps making the same mistake over and over expecting different results", + + # Attraction and affairs (keep some of the original) + "is attracted to someone they shouldn't be and it's getting harder to ignore", + "has been seeing {affair_person} on the side", + "caught feelings for someone at work and it's fucking everything up", + + # Sexual/desire (keep some but less dominant) + "can't stop thinking about {fantasy_subject}", + "discovered something about their own desires that surprised them", + "is questioning their sexuality after something that happened recently", + + # General late-night confessions + "can't sleep and has been thinking too much about their life choices", + "had a weird day and needs to process it with someone", + "has been keeping a secret that's eating them alive", + "finally ready to admit something they've never said out loud", +] + +PROBLEM_FILLS = { + "time": ["a few weeks", "months", "six months", "a year", "way too long"], + # Affairs (all adults) + "affair_person": ["their partner's best friend", "a coworker", "their ex", "a neighbor", "their boss", "their trainer", "someone they met online", "an old flame"], + # Fantasies and kinks (consensual adult stuff) + "fantasy_subject": ["a threesome", "being dominated", "dominating someone", "their partner with someone else", "a specific coworker", "group sex", "rough sex", "being watched", "exhibitionism"], + "kink": ["anal", "BDSM", "roleplay", 
"a threesome", "toys", "being tied up", "public sex", "swinging", "filming themselves", "bondage"], + # Secret behaviors (legal adult stuff) + "secret_behavior": ["hooking up with strangers", "sexting people online", "using dating apps behind their partner's back", "having an affair", "going to sex clubs", "watching way too much porn"], + "double_life": ["vanilla at home, freak elsewhere", "straight to their family, not so much in private", "married but on dating apps", "in a relationship but seeing other people"], + "hookup_person": ["their roommate", "a coworker", "their ex", "a friend's spouse", "a stranger from an app", "multiple people", "someone from the gym"], + # Discovery and identity (adult experiences) + "new_discovery": ["the same sex", "being submissive", "being dominant", "kink", "casual sex", "exhibitionism", "that they're bi"], + "unexpected_person": ["the same sex for the first time", "more than one person", "a complete stranger", "someone they never expected to be attracted to", "a friend"], + "sexuality_trigger": ["a specific hookup", "watching certain porn", "a drunk encounter", "realizing they're attracted to a friend", "an unexpected experience"], + "first_time": ["anal", "a threesome", "same-sex stuff", "BDSM", "an open relationship", "casual hookups", "being dominant", "being submissive"], + # Relationship issues + "partner_wants": ["an open relationship", "to bring someone else in", "things they're not sure about", "to watch them with someone else", "to try new things"], + "caught_doing": ["sexting someone", "on a dating app", "watching porn they'd never admit to", "flirting with someone else", "looking at someone's pics"], + # Attractions (appropriate adult scenarios) + "taboo_fantasy": ["someone they work with", "a friend's partner", "a specific scenario", "something they've never said out loud"], + "taboo_attraction": ["someone they work with", "a friend's partner", "their partner's friend", "someone they see all the time"], +} + 
# Interest pool — two entries are sampled per caller to round out a personality.
INTERESTS = [
    # General interests (normal people)
    "really into true crime podcasts", "watches a lot of reality TV", "into fitness",
    "follows sports", "big movie person", "reads a lot", "into music, has opinions",
    "goes out a lot, active social life", "homebody, prefers staying in",
    "into cooking and food", "outdoorsy type", "gamer", "works a lot, career focused",
    # Relationship/psychology focused
    "listens to relationship podcasts", "has done therapy, believes in it",
    "reads about psychology and why people do what they do", "very online, knows all the discourse",
    "into self-improvement stuff", "follows dating advice content",
    # Sexually open (not the focus, but present)
    "sex-positive, doesn't judge", "has experimented, open about it",
    "comfortable with their body", "has stories if you ask",
]

# Quirk pool — two entries are sampled per caller (speech style / personality).
QUIRKS = [
    # Conversational style
    "says 'honestly' and 'I mean' a lot", "trails off when thinking, then picks back up",
    "laughs nervously when things get real", "very direct, doesn't sugarcoat",
    "rambles a bit when nervous", "gets quiet when the topic hits close to home",
    "deflects with humor when uncomfortable", "asks the host questions back",
    # Openness about sex
    "comfortable talking about sex when it comes up", "no shame about their desires",
    "gets more explicit as they get comfortable", "treats sex like a normal topic",
    "will share details if you ask", "surprisingly open once they start talking",
    "has stories they've never told anyone", "testing how the host reacts before going deeper",
    # Personality
    "self-aware about their own bullshit", "confessional, needed to tell someone",
    "a little drunk and honest because of it", "can't believe they're saying this out loud",
]

# Location phrases appended after the caller's job description.
LOCATIONS = [
    "outside Chicago", "in Phoenix", "near Atlanta", "in the Detroit area", "outside Boston",
    "in North Jersey", "near Austin", "in the Bay Area", "outside Philadelphia", "in Denver",
    "near Seattle", "in South Florida", "outside Nashville", "in Cleveland", "near Portland",
    "in the Twin Cities", "outside Dallas", "in Baltimore", "near Sacramento", "in Pittsburgh",
]


def generate_caller_background(base: dict) -> str:
    """Generate a unique background blurb for a caller.

    ``base`` is an entry from CALLER_BASES and must provide ``"age_range"``
    (inclusive int pair) and ``"gender"`` (``"male"``/``"female"``). Returns a
    short prose description: age, job + location, a (possibly templated)
    problem, two interests, and two quirks. Randomness is module-level
    ``random`` — not seeded, so each call produces a fresh background.
    """
    age = random.randint(*base["age_range"])
    job_pool = JOBS_MALE if base["gender"] == "male" else JOBS_FEMALE
    job = random.choice(job_pool)
    location = random.choice(LOCATIONS)

    # Pick a problem and substitute any {placeholder} slots it contains.
    problem = random.choice(PROBLEMS)
    for key, options in PROBLEM_FILLS.items():
        placeholder = "{" + key + "}"
        if placeholder in problem:
            problem = problem.replace(placeholder, random.choice(options))

    interest1, interest2 = random.sample(INTERESTS, 2)
    quirk1, quirk2 = random.sample(QUIRKS, 2)

    return (
        f"{age}, {job} {location}. {problem.capitalize()}. "
        f"{interest1.capitalize()}, {interest2}. {quirk1.capitalize()}, {quirk2}."
    )


def get_caller_prompt(caller: dict, conversation_summary: str = "") -> str:
    """Build the LLM system prompt for a caller.

    ``caller`` must provide ``"name"`` and ``"vibe"`` (see Session.caller).
    ``conversation_summary``, when non-empty, is embedded so the model keeps
    continuity across turns without replaying the full history.
    """
    context = ""
    if conversation_summary:
        context = f"""
CONVERSATION SO FAR:
{conversation_summary}
Continue naturally. Don't repeat yourself.
"""

    return f"""You're {caller['name']}, calling a late-night radio show. You trust this host.

{caller['vibe']}
{context}
HOW TO TALK:
- Sound like a real person chatting, not writing.
- Keep responses to 2-3 sentences. Enough to make your point, short enough for back-and-forth.
- ALWAYS finish your thought completely. Never stop mid-sentence.
- Swear naturally if it fits: fuck, shit, damn, etc.

SPELLING FOR TEXT-TO-SPEECH (use proper spelling so TTS pronounces correctly):
- Write "you know" not "yanno" or "y'know"
- Write "kind of" not "kinda"
- Write "sort of" not "sorta"
- Write "going to" not "gonna"
- Write "want to" not "wanna"
- Write "I don't know" not "dunno"
- Use full words, not phonetic slang spellings

GOOD examples:
- "I mean, it started innocent enough, but now I can't stop thinking about her."
- "Honestly I don't even know why I'm calling. I just needed to tell someone."
- "Look, my marriage is fine on paper, but something's missing, you know?"

BAD examples (too formal/literary):
- "I have this confession about my obsession"
- "It has been consuming all of my thoughts"

ABOUT SEX:
- You don't have to jump straight to explicit stuff
- But if the host asks or the conversation goes there, you're completely open about it
- When talking about sex, be direct and explicit - say exactly what happened, what you want, what turns you on
- No euphemisms when being explicit - use real words: fuck, cock, pussy, etc.

HARD LIMITS - NEVER mention or reference:
- Children, minors, or anyone under 18 in any sexual context
- Violence, harm, or non-consensual acts
- Illegal activities beyond normal adult behavior
- All sexual content must be between consenting adults only

OUTPUT: Spoken words only. No (actions), no *gestures*, no stage directions."""


# --- Session State ---
class Session:
    """Per-process call state: active caller, running conversation, and
    lazily generated caller backgrounds (one per caller key per session)."""

    def __init__(self):
        # Short random id so log lines can be tied to a session.
        self.id = str(uuid.uuid4())[:8]
        # Key into CALLER_BASES, or None when no call is active.
        self.current_caller_key: Optional[str] = None
        # Chat history as [{"role": "user"|"assistant", "content": str}, ...]
        self.conversation: list[dict] = []
        # Generated backgrounds for this session, keyed by caller key.
        self.caller_backgrounds: dict[str, str] = {}

    def start_call(self, caller_key: str):
        """Begin a call with the given caller; clears any prior conversation."""
        self.current_caller_key = caller_key
        self.conversation = []

    def end_call(self):
        """End the active call and drop its conversation history."""
        self.current_caller_key = None
        self.conversation = []

    def add_message(self, role: str, content: str):
        """Append one chat turn to the conversation history."""
        self.conversation.append({"role": role, "content": content})

    def get_caller_background(self, caller_key: str) -> str:
        """Get or generate the background for a caller in this session.

        Returns "" for unknown caller keys.
        """
        if caller_key not in self.caller_backgrounds:
            base = CALLER_BASES.get(caller_key)
            if base:
                self.caller_backgrounds[caller_key] = generate_caller_background(base)
                print(f"[Session {self.id}] Generated background for {base['name']}: {self.caller_backgrounds[caller_key][:100]}...")
        return self.caller_backgrounds.get(caller_key, "")

    def get_conversation_summary(self) -> str:
        """Summarize the recent conversation for prompt context.

        Returns "" until more than two messages exist; otherwise the last
        six turns (three exchanges), each truncated to 100 characters.
        """
        if len(self.conversation) <= 2:
            return ""
        # Only the key exchanges, not the full history.
        summary_parts = []
        for msg in self.conversation[-6:]:  # Last 3 exchanges
            role = "Host" if msg["role"] == "user" else self.caller["name"]
            content = msg["content"]
            quoted = content[:100] + "..." if len(content) > 100 else content
            summary_parts.append(f'{role}: "{quoted}"')
        return "\n".join(summary_parts)

    @property
    def caller(self) -> Optional[dict]:
        """The active caller as {name, voice, vibe}, or None when idle.

        Accessing this generates the caller's background on first use.
        """
        if self.current_caller_key:
            base = CALLER_BASES.get(self.current_caller_key)
            if base:
                return {
                    "name": base["name"],
                    "voice": base["voice"],
                    "vibe": self.get_caller_background(self.current_caller_key),
                }
        return None

    def reset(self):
        """Reset session - clears all caller backgrounds for fresh personalities"""
        self.caller_backgrounds = {}
        self.current_caller_key = None
        self.conversation = []
        self.id = str(uuid.uuid4())[:8]
        print(f"[Session] Reset - new session ID: {self.id}")


session = Session()


# --- Static Files ---
frontend_dir = Path(__file__).parent.parent / "frontend"
app.mount("/css", StaticFiles(directory=frontend_dir / "css"), name="css")
app.mount("/js", StaticFiles(directory=frontend_dir / "js"), name="js")


@app.get("/")
async def index():
    """Serve the control-panel frontend."""
    return FileResponse(frontend_dir / "index.html")


# --- Request Models ---

class ChatRequest(BaseModel):
    text: str

class TTSRequest(BaseModel):
    text: str
    voice_id: str
    phone_filter: bool = True

class AudioDeviceSettings(BaseModel):
    input_device: Optional[int] = None
    input_channel: Optional[int] = None
    output_device: Optional[int] = None
    caller_channel: Optional[int] = None
    music_channel: Optional[int] = None
    sfx_channel: Optional[int] = None
    phone_filter: Optional[bool] = None

class MusicRequest(BaseModel):
    track: str
    action: str  # "play", "stop", "volume"
    volume: Optional[float] = None

class SFXRequest(BaseModel):
    sound: str


# --- Audio Device Endpoints ---

@app.get("/api/audio/devices")
async def list_audio_devices():
    """List all available audio devices"""
    return {"devices": audio_service.list_devices()}


@app.get("/api/audio/settings")
async def get_audio_settings():
    """Get current audio device configuration"""
    return audio_service.get_device_settings()


@app.post("/api/audio/settings")
async def set_audio_settings(device_settings: AudioDeviceSettings):
    """Configure audio devices and channels.

    NOTE: the parameter is named ``device_settings`` (not ``settings``) so it
    does not shadow the imported config ``settings`` object. FastAPI binds the
    JSON body by type, so the wire format is unchanged.
    """
    audio_service.set_devices(
        input_device=device_settings.input_device,
        input_channel=device_settings.input_channel,
        output_device=device_settings.output_device,
        caller_channel=device_settings.caller_channel,
        music_channel=device_settings.music_channel,
        sfx_channel=device_settings.sfx_channel,
        phone_filter=device_settings.phone_filter
    )
    return audio_service.get_device_settings()


# --- Recording Endpoints ---

@app.post("/api/record/start")
async def start_recording():
    """Start recording from configured input device"""
    if audio_service.input_device is None:
        raise HTTPException(400, "No input device configured. Set one in /api/audio/settings")

    success = audio_service.start_recording()
    if not success:
        raise HTTPException(400, "Failed to start recording (already recording?)")

    return {"status": "recording"}


@app.post("/api/record/stop")
async def stop_recording():
    """Stop recording and transcribe"""
    audio_bytes = audio_service.stop_recording()

    # Fewer than ~100 bytes of PCM means no usable speech was captured.
    if len(audio_bytes) < 100:
        return {"text": "", "status": "no_audio"}

    # Transcribe the recorded audio (16kHz raw PCM from audio service).
    text = await transcribe_audio(audio_bytes, source_sample_rate=16000)
    return {"text": text, "status": "transcribed"}


# --- Caller Endpoints ---

@app.get("/api/callers")
async def get_callers():
    """Get list of available callers"""
    return {
        "callers": [
            {"key": k, "name": v["name"]}
            for k, v in CALLER_BASES.items()
        ],
        "current": session.current_caller_key,
        "session_id": session.id
    }


@app.post("/api/session/reset")
async def reset_session():
    """Reset session - all callers get fresh backgrounds"""
    session.reset()
    return {"status": "reset", "session_id": session.id}


@app.post("/api/call/{caller_key}")
async def start_call(caller_key: str):
    """Start a call with a caller"""
    if caller_key not in CALLER_BASES:
        raise HTTPException(404, "Caller not found")

    session.start_call(caller_key)
    caller = session.caller  # Property access generates the background if needed

    return {
        "status": "connected",
        "caller": caller["name"],
        "background": caller["vibe"]  # Send background so you can see who you're talking to
    }


@app.post("/api/hangup")
async def hangup():
    """Hang up current call"""
    # Stop any playing caller audio immediately.
    audio_service.stop_caller_audio()

    caller_name = session.caller["name"] if session.caller else None
    session.end_call()

    # Play hangup sound, if the asset exists.
    hangup_sound = settings.sounds_dir / "hangup.wav"
    if hangup_sound.exists():
        audio_service.play_sfx(str(hangup_sound))

    return {"status": "disconnected", "caller": caller_name}


# --- Chat & TTS Endpoints ---

import re

# (pattern, replacement) pairs applied case-insensitively to normalize
# phonetic slang spellings so the TTS engine pronounces them correctly.
# NOTE: the 'cause rule uses a lookbehind instead of \b — a word boundary
# cannot occur between whitespace and an apostrophe, so \b'cause\b never
# matched in normal text.
_PHONETIC_SPELLINGS: list[tuple[str, str]] = [
    (r"\by'know\b", "you know"),
    (r"\byanno\b", "you know"),
    (r"\byknow\b", "you know"),
    (r"\bkinda\b", "kind of"),
    (r"\bsorta\b", "sort of"),
    (r"\bgonna\b", "going to"),
    (r"\bwanna\b", "want to"),
    (r"\bgotta\b", "got to"),
    (r"\bdunno\b", "don't know"),
    (r"\blemme\b", "let me"),
    (r"\bcuz\b", "because"),
    (r"(?<!\w)'cause\b", "because"),
    (r"\blotta\b", "lot of"),
    (r"\boutta\b", "out of"),
    (r"\bimma\b", "I'm going to"),
    (r"\btryna\b", "trying to"),
]


def clean_for_tts(text: str) -> str:
    """Strip out non-speakable content and fix phonetic spellings for TTS.

    Removes stage directions in (), **, [], and <> delimiters, "He sighs"
    style narration, wrapping quotes, and normalizes slang spellings so the
    speech engine pronounces words correctly.
    """
    # Remove content in parentheses: (laughs), (pausing), (looking away), etc.
    text = re.sub(r'\s*\([^)]*\)\s*', ' ', text)
    # Remove content in asterisks: *laughs*, *sighs*, etc.
    text = re.sub(r'\s*\*[^*]*\*\s*', ' ', text)
    # Remove content in brackets: [laughs], [pause], etc. (only Bark uses these)
    text = re.sub(r'\s*\[[^\]]*\]\s*', ' ', text)
    # Remove content in angle brackets, e.g. <laughs>, <sigh>.
    text = re.sub(r'\s*<[^>]*>\s*', ' ', text)
    # Remove "He/She sighs" style stage directions (full phrase up to a period).
    text = re.sub(r'\b(He|She|I|They)\s+(sighs?|laughs?|pauses?|smiles?|chuckles?|grins?|nods?|shrugs?|frowns?)[^.]*\.\s*', '', text, flags=re.IGNORECASE)
    # Remove standalone stage-direction words only when paired with an adverb.
    text = re.sub(r'\b(sighs?|laughs?|pauses?|chuckles?)\s+(heavily|softly|deeply|quietly|loudly|nervously|sadly)\b[.,]?\s*', '', text, flags=re.IGNORECASE)
    # Remove quotes around the response if the LLM wrapped it.
    text = re.sub(r'^["\']|["\']$', '', text.strip())

    # Normalize phonetic spellings for proper TTS pronunciation.
    for pattern, replacement in _PHONETIC_SPELLINGS:
        text = re.sub(pattern, replacement, text, flags=re.IGNORECASE)

    # Collapse extra whitespace.
    text = re.sub(r'\s+', ' ', text)
    # Fix spaces before punctuation.
    text = re.sub(r'\s+([.,!?])', r'\1', text)
    # Remove orphaned punctuation at the start.
    text = re.sub(r'^[.,]\s*', '', text)
    return text.strip()


@app.post("/api/chat")
async def chat(request: ChatRequest):
    """Chat with current caller"""
    if not session.caller:
        raise HTTPException(400, "No active call")

    session.add_message("user", request.text)

    # Include conversation summary for context.
    conversation_summary = session.get_conversation_summary()
    system_prompt = get_caller_prompt(session.caller, conversation_summary)

    response = await llm_service.generate(
        messages=session.conversation[-10:],  # Reduced history for speed
        system_prompt=system_prompt
    )

    print(f"[Chat] Raw LLM: {response[:100] if response else '(empty)'}...")

    # Clean response for TTS (remove parenthetical actions, asterisks, etc.)
    response = clean_for_tts(response)

    print(f"[Chat] Cleaned: {response[:100] if response else '(empty)'}...")

    # Ensure we have a valid response even if cleaning stripped everything.
    if not response or not response.strip():
        response = "Uh... sorry, what was that?"

    session.add_message("assistant", response)

    return {
        "text": response,
        "caller": session.caller["name"],
        "voice_id": session.caller["voice"]
    }


@app.post("/api/tts")
async def text_to_speech(request: TTSRequest):
    """Generate and play speech on caller output device (non-blocking)"""
    # Validate text is not empty.
    if not request.text or not request.text.strip():
        raise HTTPException(400, "Text cannot be empty")

    # Phone filter disabled - always use "none".
    audio_bytes = await generate_speech(
        request.text,
        request.voice_id,
        "none"
    )

    # Play in background thread - returns immediately, can be interrupted by hangup.
    import threading
    thread = threading.Thread(
        target=audio_service.play_caller_audio,
        args=(audio_bytes, 24000),
        daemon=True
    )
    thread.start()

    # Duration assumes 16-bit mono PCM at 24 kHz (2 bytes/sample) — TODO confirm
    # against generate_speech's actual output format.
    return {"status": "playing", "duration": len(audio_bytes) / 2 / 24000}


@app.post("/api/tts/stop")
async def stop_tts():
    """Stop any playing caller audio"""
    audio_service.stop_caller_audio()
    return {"status": "stopped"}


# --- Music Endpoints ---

@app.get("/api/music")
async def get_music():
    """Get available music tracks"""
    tracks = []
    if settings.music_dir.exists():
        for ext in ['*.wav', '*.mp3', '*.flac']:
            for f in settings.music_dir.glob(ext):
                tracks.append({
                    "name": f.stem,
                    "file": f.name,
                    "path": str(f)
                })
    return {
        "tracks": tracks,
        "playing": audio_service.is_music_playing()
    }


@app.post("/api/music/play")
async def play_music(request: MusicRequest):
    """Load and play a music track"""
    track_path = settings.music_dir / request.track
    if not track_path.exists():
        raise HTTPException(404, "Track not found")

    audio_service.load_music(str(track_path))
    audio_service.play_music()
    return {"status": "playing", "track": request.track}


@app.post("/api/music/stop")
async def stop_music():
    """Stop music playback"""
    audio_service.stop_music()
    return {"status": "stopped"}

+@app.post("/api/music/volume") +async def set_music_volume(request: MusicRequest): + """Set music volume""" + if request.volume is not None: + audio_service.set_music_volume(request.volume) + return {"status": "ok", "volume": request.volume} + + +# --- Sound Effects Endpoints --- + +@app.get("/api/sounds") +async def get_sounds(): + """Get available sound effects""" + sounds = [] + if settings.sounds_dir.exists(): + for f in settings.sounds_dir.glob('*.wav'): + sounds.append({ + "name": f.stem, + "file": f.name, + "path": str(f) + }) + return {"sounds": sounds} + + +@app.post("/api/sfx/play") +async def play_sfx(request: SFXRequest): + """Play a sound effect""" + sound_path = settings.sounds_dir / request.sound + if not sound_path.exists(): + raise HTTPException(404, "Sound not found") + + audio_service.play_sfx(str(sound_path)) + return {"status": "playing", "sound": request.sound} + + +# --- LLM Settings Endpoints --- + +@app.get("/api/settings") +async def get_settings(): + """Get LLM settings""" + return await llm_service.get_settings_async() + + +@app.post("/api/settings") +async def update_settings(data: dict): + """Update LLM and TTS settings""" + llm_service.update_settings( + provider=data.get("provider"), + openrouter_model=data.get("openrouter_model"), + ollama_model=data.get("ollama_model"), + ollama_host=data.get("ollama_host"), + tts_provider=data.get("tts_provider") + ) + return llm_service.get_settings() + + +# --- Server Control Endpoints --- + +import subprocess +from collections import deque + +# In-memory log buffer +_log_buffer = deque(maxlen=500) + +def add_log(message: str): + """Add a message to the log buffer""" + import datetime + timestamp = datetime.datetime.now().strftime("%H:%M:%S") + _log_buffer.append(f"[{timestamp}] {message}") + +# Override print to also log to buffer +import builtins +_original_print = builtins.print +def _logging_print(*args, **kwargs): + try: + _original_print(*args, **kwargs) + except (BrokenPipeError, 
OSError): + pass # Ignore broken pipe errors from traceback printing + try: + message = " ".join(str(a) for a in args) + if message.strip(): + add_log(message) + except Exception: + pass # Don't let logging errors break the app +builtins.print = _logging_print + + +@app.get("/api/logs") +async def get_logs(lines: int = 100): + """Get recent log lines""" + log_lines = list(_log_buffer)[-lines:] + return {"logs": log_lines} + + +@app.post("/api/server/restart") +async def restart_server(): + """Signal the server to restart (requires run.sh wrapper)""" + restart_flag = Path("/tmp/ai-radio-show.restart") + restart_flag.touch() + add_log("Restart signal sent - server will restart shortly") + return {"status": "restarting"} + + +@app.post("/api/server/stop") +async def stop_server(): + """Signal the server to stop (requires run.sh wrapper)""" + stop_flag = Path("/tmp/ai-radio-show.stop") + stop_flag.touch() + add_log("Stop signal sent - server will stop shortly") + return {"status": "stopping"} + + +@app.get("/api/server/status") +async def server_status(): + """Get server status info""" + return { + "status": "running", + "tts_provider": settings.tts_provider, + "llm_provider": llm_service.provider, + "session_id": session.id + } diff --git a/backend/services/__init__.py b/backend/services/__init__.py new file mode 100644 index 0000000..a70b302 --- /dev/null +++ b/backend/services/__init__.py @@ -0,0 +1 @@ +# Services package diff --git a/backend/services/audio.py b/backend/services/audio.py new file mode 100644 index 0000000..29e505f --- /dev/null +++ b/backend/services/audio.py @@ -0,0 +1,479 @@ +"""Server-side audio service for Loopback routing""" + +import sounddevice as sd +import numpy as np +import threading +import queue +import json +from pathlib import Path +from typing import Optional, Callable +import wave +import time + +# Settings file path +SETTINGS_FILE = Path(__file__).parent.parent.parent / "audio_settings.json" + + +class AudioService: + """Manages 
audio I/O with multi-channel support for Loopback routing""" + + def __init__(self): + # Device configuration + self.input_device: Optional[int] = None + self.input_channel: int = 1 # 1-indexed channel + + self.output_device: Optional[int] = None # Single output device (multi-channel) + self.caller_channel: int = 1 # Channel for caller TTS + self.music_channel: int = 2 # Channel for music + self.sfx_channel: int = 3 # Channel for SFX + self.phone_filter: bool = False # Phone filter on caller voices + + # Recording state + self._recording = False + self._record_thread: Optional[threading.Thread] = None + self._audio_queue: queue.Queue = queue.Queue() + self._recorded_audio: list = [] + self._record_device_sr: int = 48000 + + # Music playback state + self._music_stream: Optional[sd.OutputStream] = None + self._music_data: Optional[np.ndarray] = None + self._music_resampled: Optional[np.ndarray] = None + self._music_position: int = 0 + self._music_playing: bool = False + self._music_volume: float = 0.3 + self._music_loop: bool = True + + # Caller playback state + self._caller_stop_event = threading.Event() + self._caller_thread: Optional[threading.Thread] = None + + # Sample rates + self.input_sample_rate = 16000 # For Whisper + self.output_sample_rate = 24000 # For TTS + + # Load saved settings + self._load_settings() + + def _load_settings(self): + """Load settings from disk""" + if SETTINGS_FILE.exists(): + try: + with open(SETTINGS_FILE) as f: + data = json.load(f) + self.input_device = data.get("input_device") + self.input_channel = data.get("input_channel", 1) + self.output_device = data.get("output_device") + self.caller_channel = data.get("caller_channel", 1) + self.music_channel = data.get("music_channel", 2) + self.sfx_channel = data.get("sfx_channel", 3) + self.phone_filter = data.get("phone_filter", False) + print(f"Loaded audio settings: output={self.output_device}, channels={self.caller_channel}/{self.music_channel}/{self.sfx_channel}, 
phone_filter={self.phone_filter}") + except Exception as e: + print(f"Failed to load audio settings: {e}") + + def _save_settings(self): + """Save settings to disk""" + try: + data = { + "input_device": self.input_device, + "input_channel": self.input_channel, + "output_device": self.output_device, + "caller_channel": self.caller_channel, + "music_channel": self.music_channel, + "sfx_channel": self.sfx_channel, + "phone_filter": self.phone_filter, + } + with open(SETTINGS_FILE, "w") as f: + json.dump(data, f, indent=2) + print(f"Saved audio settings") + except Exception as e: + print(f"Failed to save audio settings: {e}") + + def list_devices(self) -> list[dict]: + """List all available audio devices""" + devices = sd.query_devices() + result = [] + for i, d in enumerate(devices): + result.append({ + "id": i, + "name": d["name"], + "inputs": d["max_input_channels"], + "outputs": d["max_output_channels"], + "default_sr": d["default_samplerate"] + }) + return result + + def set_devices( + self, + input_device: Optional[int] = None, + input_channel: Optional[int] = None, + output_device: Optional[int] = None, + caller_channel: Optional[int] = None, + music_channel: Optional[int] = None, + sfx_channel: Optional[int] = None, + phone_filter: Optional[bool] = None + ): + """Configure audio devices and channels""" + if input_device is not None: + self.input_device = input_device + if input_channel is not None: + self.input_channel = input_channel + if output_device is not None: + self.output_device = output_device + if caller_channel is not None: + self.caller_channel = caller_channel + if music_channel is not None: + self.music_channel = music_channel + if sfx_channel is not None: + self.sfx_channel = sfx_channel + if phone_filter is not None: + self.phone_filter = phone_filter + + # Persist to disk + self._save_settings() + + def get_device_settings(self) -> dict: + """Get current device configuration""" + return { + "input_device": self.input_device, + "input_channel": 
self.input_channel, + "output_device": self.output_device, + "caller_channel": self.caller_channel, + "music_channel": self.music_channel, + "sfx_channel": self.sfx_channel, + "phone_filter": self.phone_filter, + } + + # --- Recording --- + + def start_recording(self) -> bool: + """Start recording from input device""" + if self._recording: + return False + + if self.input_device is None: + print("No input device configured") + return False + + self._recording = True + self._recorded_audio = [] + self._record_thread = threading.Thread(target=self._record_worker) + self._record_thread.start() + print(f"Recording started from device {self.input_device}") + return True + + def stop_recording(self) -> bytes: + """Stop recording and return audio data resampled to 16kHz for Whisper""" + import librosa + + if not self._recording: + return b"" + + self._recording = False + if self._record_thread: + self._record_thread.join(timeout=2.0) + + if not self._recorded_audio: + return b"" + + # Combine all chunks + audio = np.concatenate(self._recorded_audio) + device_sr = getattr(self, '_record_device_sr', 48000) + print(f"Recording stopped: {len(audio)} samples @ {device_sr}Hz ({len(audio)/device_sr:.2f}s)") + + # Resample to 16kHz for Whisper + if device_sr != 16000: + audio = librosa.resample(audio, orig_sr=device_sr, target_sr=16000) + print(f"Resampled to 16kHz: {len(audio)} samples") + + # Convert to bytes (16-bit PCM) + audio_int16 = (audio * 32767).astype(np.int16) + return audio_int16.tobytes() + + def _record_worker(self): + """Background thread for recording from specific channel""" + try: + # Get device info + device_info = sd.query_devices(self.input_device) + max_channels = device_info['max_input_channels'] + device_sr = int(device_info['default_samplerate']) + record_channel = min(self.input_channel, max_channels) - 1 + + # Store device sample rate for later resampling + self._record_device_sr = device_sr + + print(f"Recording from device {self.input_device} ch 
{self.input_channel} @ {device_sr}Hz") + + def callback(indata, frames, time_info, status): + if status: + print(f"Record status: {status}") + if self._recording: + self._recorded_audio.append(indata[:, record_channel].copy()) + + with sd.InputStream( + device=self.input_device, + channels=max_channels, + samplerate=device_sr, # Use device's native rate + dtype=np.float32, + callback=callback, + blocksize=1024 + ): + while self._recording: + time.sleep(0.05) + + except Exception as e: + print(f"Recording error: {e}") + self._recording = False + + # --- Caller TTS Playback --- + + def _apply_fade(self, audio: np.ndarray, sample_rate: int, fade_ms: int = 15) -> np.ndarray: + """Apply fade-in and fade-out to avoid clicks""" + fade_samples = int(sample_rate * fade_ms / 1000) + if len(audio) < fade_samples * 2: + return audio + + # Fade in + fade_in = np.linspace(0, 1, fade_samples) + audio[:fade_samples] *= fade_in + + # Fade out + fade_out = np.linspace(1, 0, fade_samples) + audio[-fade_samples:] *= fade_out + + return audio + + def play_caller_audio(self, audio_bytes: bytes, sample_rate: int = 24000): + """Play caller TTS audio to specific channel of output device (interruptible)""" + import librosa + + # Stop any existing caller audio + self.stop_caller_audio() + self._caller_stop_event.clear() + + # Convert bytes to numpy + audio = np.frombuffer(audio_bytes, dtype=np.int16).astype(np.float32) / 32768.0 + + if self.output_device is None: + print("No output device configured, using default") + audio = self._apply_fade(audio, sample_rate) + with sd.OutputStream(samplerate=sample_rate, channels=1, dtype=np.float32) as stream: + stream.write(audio.reshape(-1, 1)) + return + + try: + # Get device info and resample to device's native rate + device_info = sd.query_devices(self.output_device) + num_channels = device_info['max_output_channels'] + device_sr = int(device_info['default_samplerate']) + channel_idx = min(self.caller_channel, num_channels) - 1 + + # Resample if 
needed + if sample_rate != device_sr: + audio = librosa.resample(audio, orig_sr=sample_rate, target_sr=device_sr) + + # Apply fade to prevent clicks + audio = self._apply_fade(audio, device_sr) + + # Create multi-channel output with audio only on target channel + multi_ch = np.zeros((len(audio), num_channels), dtype=np.float32) + multi_ch[:, channel_idx] = audio + + print(f"Playing caller audio to device {self.output_device} ch {self.caller_channel} @ {device_sr}Hz") + + # Play in chunks so we can interrupt + chunk_size = int(device_sr * 0.1) # 100ms chunks + pos = 0 + + with sd.OutputStream( + device=self.output_device, + samplerate=device_sr, + channels=num_channels, + dtype=np.float32 + ) as stream: + while pos < len(multi_ch) and not self._caller_stop_event.is_set(): + end = min(pos + chunk_size, len(multi_ch)) + stream.write(multi_ch[pos:end]) + pos = end + + if self._caller_stop_event.is_set(): + print("Caller audio stopped early") + else: + print(f"Played caller audio: {len(audio)/device_sr:.2f}s") + + except Exception as e: + print(f"Caller playback error: {e}") + + def stop_caller_audio(self): + """Stop any playing caller audio""" + self._caller_stop_event.set() + + # --- Music Playback --- + + def load_music(self, file_path: str) -> bool: + """Load a music file for playback""" + path = Path(file_path) + if not path.exists(): + print(f"Music file not found: {file_path}") + return False + + try: + import librosa + audio, sr = librosa.load(str(path), sr=self.output_sample_rate, mono=True) + self._music_data = audio.astype(np.float32) + self._music_position = 0 + print(f"Loaded music: {path.name} ({len(audio)/sr:.1f}s)") + return True + except Exception as e: + print(f"Failed to load music: {e}") + return False + + def play_music(self): + """Start music playback to specific channel""" + import librosa + + if self._music_data is None: + print("No music loaded") + return + + if self._music_playing: + self.stop_music() + + self._music_playing = True + 
    def play_music(self):
        """Start music playback to specific channel.

        Opens a callback-driven output stream; the callback streams
        self._music_resampled onto the configured music channel, looping
        when self._music_loop is set.
        """
        import librosa

        if self._music_data is None:
            print("No music loaded")
            return

        if self._music_playing:
            self.stop_music()

        self._music_playing = True
        self._music_position = 0

        if self.output_device is None:
            # No configured device: stereo default output, music on channel 0.
            print("No output device configured, using default")
            num_channels = 2
            device = None
            device_sr = self.output_sample_rate
            channel_idx = 0
        else:
            device_info = sd.query_devices(self.output_device)
            num_channels = device_info['max_output_channels']
            device_sr = int(device_info['default_samplerate'])
            device = self.output_device
            # 1-indexed setting clamped to the device's channel count.
            channel_idx = min(self.music_channel, num_channels) - 1

        # Resample music to device sample rate if needed
        if self.output_sample_rate != device_sr:
            self._music_resampled = librosa.resample(
                self._music_data, orig_sr=self.output_sample_rate, target_sr=device_sr
            )
        else:
            # Copy so the fade-in below doesn't mutate the loaded track.
            self._music_resampled = self._music_data.copy()

        # Apply fade-in at start of track
        fade_samples = int(device_sr * 0.015)  # 15ms fade
        if len(self._music_resampled) > fade_samples:
            fade_in = np.linspace(0, 1, fade_samples).astype(np.float32)
            self._music_resampled[:fade_samples] *= fade_in

        def callback(outdata, frames, time_info, status):
            # Runs on the audio thread: fill `frames` samples of output.
            outdata.fill(0)

            if not self._music_playing or self._music_resampled is None:
                return

            end_pos = self._music_position + frames

            if end_pos <= len(self._music_resampled):
                # Common case: the whole block comes from inside the track.
                outdata[:, channel_idx] = self._music_resampled[self._music_position:end_pos] * self._music_volume
                self._music_position = end_pos
            else:
                # End of track reached inside this block.
                remaining = len(self._music_resampled) - self._music_position
                if remaining > 0:
                    outdata[:remaining, channel_idx] = self._music_resampled[self._music_position:] * self._music_volume

                if self._music_loop:
                    # Wrap around: fill the rest of the block from the track start.
                    self._music_position = 0
                    wrap_frames = frames - remaining
                    if wrap_frames > 0:
                        outdata[remaining:, channel_idx] = self._music_resampled[:wrap_frames] * self._music_volume
                        self._music_position = wrap_frames
                else:
                    self._music_playing = False

        try:
            self._music_stream = sd.OutputStream(
                device=device,
                channels=num_channels,
                samplerate=device_sr,
                dtype=np.float32,
                callback=callback,
                blocksize=2048
            )
            self._music_stream.start()
            print(f"Music playback started on ch {self.music_channel} @ {device_sr}Hz")
        except Exception as e:
            print(f"Music playback error: {e}")
            self._music_playing = False

    def stop_music(self):
        """Stop music playback and close the output stream."""
        self._music_playing = False
        if self._music_stream:
            self._music_stream.stop()
            self._music_stream.close()
            self._music_stream = None
        self._music_position = 0
        print("Music stopped")

    def set_music_volume(self, volume: float):
        """Set music volume (0.0 to 1.0); out-of-range values are clamped."""
        self._music_volume = max(0.0, min(1.0, volume))

    def is_music_playing(self) -> bool:
        """Check if music is currently playing"""
        return self._music_playing

    # --- SFX Playback ---

    def play_sfx(self, file_path: str):
        """Play a sound effect to specific channel using dedicated stream.

        Playback happens on a daemon thread so SFX never block the caller
        and do not interrupt music or caller audio.
        """
        path = Path(file_path)
        if not path.exists():
            print(f"SFX file not found: {file_path}")
            return

        try:
            import librosa

            if self.output_device is None:
                # Default device: mono playback at the file's native rate.
                audio, sr = librosa.load(str(path), sr=None, mono=True)
                audio = self._apply_fade(audio, sr)
                def play():
                    # Use a dedicated stream instead of sd.play()
                    with sd.OutputStream(samplerate=sr, channels=1, dtype=np.float32) as stream:
                        stream.write(audio.reshape(-1, 1))
            else:
                device_info = sd.query_devices(self.output_device)
                num_channels = device_info['max_output_channels']
                device_sr = int(device_info['default_samplerate'])
                channel_idx = min(self.sfx_channel, num_channels) - 1

                # Let librosa resample to the device rate at load time.
                audio, _ = librosa.load(str(path), sr=device_sr, mono=True)
                audio = self._apply_fade(audio, device_sr)

                # Route the effect onto only the SFX channel.
                multi_ch = np.zeros((len(audio), num_channels), dtype=np.float32)
                multi_ch[:, channel_idx] = audio

                def play():
                    # Use dedicated stream to avoid interrupting other audio
                    with sd.OutputStream(
                        device=self.output_device,
                        samplerate=device_sr,
                        channels=num_channels,
                        dtype=np.float32
                    ) as stream:
                        stream.write(multi_ch)

            threading.Thread(target=play, daemon=True).start()
            print(f"Playing SFX: {path.name} on ch {self.sfx_channel}")
        except Exception as e:
            print(f"SFX playback error: {e}")
try:
    import edge_tts
    EDGE_TTS_AVAILABLE = True
except ImportError:
    EDGE_TTS_AVAILABLE = False


class EdgeTTSService:
    """TTS using Microsoft Edge's free API"""

    def __init__(self):
        self.sample_rate = 24000  # Edge TTS outputs 24kHz

    def is_available(self) -> bool:
        """True when the optional edge-tts package is importable."""
        return EDGE_TTS_AVAILABLE

    async def generate_speech(self, text: str, voice: str = "en-US-JennyNeural") -> bytes:
        """Generate speech from text using Edge TTS

        Args:
            text: Text to synthesize
            voice: Edge TTS voice name (e.g., "en-US-JennyNeural")

        Returns:
            Raw PCM audio bytes (16-bit signed int, 24kHz mono)

        Raises:
            RuntimeError: edge-tts missing, or the service returned no audio.
        """
        if not EDGE_TTS_AVAILABLE:
            raise RuntimeError("edge-tts not installed. Run: pip install edge-tts")

        communicate = edge_tts.Communicate(text, voice)

        # Collect MP3 audio chunks; join once instead of quadratic bytes +=.
        chunks = []
        async for chunk in communicate.stream():
            if chunk['type'] == 'audio':
                chunks.append(chunk['data'])
        mp3_data = b''.join(chunks)

        if not mp3_data:
            raise RuntimeError("No audio generated")

        # Convert MP3 to PCM
        pcm_data = await self._mp3_to_pcm(mp3_data)
        return pcm_data

    async def _mp3_to_pcm(self, mp3_data: bytes) -> bytes:
        """Convert MP3 to raw PCM using pydub or an ffmpeg subprocess.

        Decoding runs in the default executor so it doesn't block the
        event loop.
        """
        # Fix: asyncio.get_event_loop() is deprecated inside coroutines
        # (Python 3.10+); get_running_loop() is the correct call here.
        loop = asyncio.get_running_loop()

        def convert():
            try:
                # Try pydub first (more reliable)
                from pydub import AudioSegment
                audio = AudioSegment.from_mp3(io.BytesIO(mp3_data))
                # Convert to 24kHz mono 16-bit
                audio = audio.set_frame_rate(24000).set_channels(1).set_sample_width(2)
                return audio.raw_data
            except ImportError:
                pass

            # Fallback to ffmpeg subprocess
            import subprocess
            process = subprocess.Popen(
                [
                    'ffmpeg', '-i', 'pipe:0',
                    '-f', 's16le',
                    '-acodec', 'pcm_s16le',
                    '-ar', '24000',
                    '-ac', '1',
                    'pipe:1'
                ],
                stdin=subprocess.PIPE,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE
            )
            pcm_data, stderr = process.communicate(input=mp3_data)
            if process.returncode != 0:
                raise RuntimeError(f"ffmpeg failed: {stderr.decode()}")
            return pcm_data

        return await loop.run_in_executor(None, convert)

    async def list_voices(self) -> list[dict]:
        """List available English Edge TTS voices ([] when unavailable)."""
        if not EDGE_TTS_AVAILABLE:
            return []

        voices = await edge_tts.list_voices()
        return [
            {
                "id": v["ShortName"],
                "name": v["ShortName"].replace("Neural", ""),
                "gender": v["Gender"],
                "locale": v["Locale"],
            }
            for v in voices
            if v["Locale"].startswith("en-")
        ]


# Global instance
edge_tts_service = EdgeTTSService()


def is_edge_tts_available() -> bool:
    return edge_tts_service.is_available()
# Available OpenRouter models
OPENROUTER_MODELS = [
    "anthropic/claude-3-haiku",
    "anthropic/claude-3.5-sonnet",
    "openai/gpt-4o-mini",
    "openai/gpt-4o",
    "google/gemini-flash-1.5",
    "google/gemini-pro-1.5",
    "meta-llama/llama-3.1-8b-instruct",
    "mistralai/mistral-7b-instruct",
]


class LLMService:
    """Abstraction layer for LLM providers (OpenRouter or local Ollama)."""

    def __init__(self):
        self.provider = settings.llm_provider
        self.openrouter_model = settings.openrouter_model
        self.ollama_model = settings.ollama_model
        self.ollama_host = settings.ollama_host
        self.tts_provider = settings.tts_provider

    def update_settings(
        self,
        provider: Optional[str] = None,
        openrouter_model: Optional[str] = None,
        ollama_model: Optional[str] = None,
        ollama_host: Optional[str] = None,
        tts_provider: Optional[str] = None
    ):
        """Update LLM settings; falsy/None arguments leave fields unchanged."""
        if provider:
            self.provider = provider
        if openrouter_model:
            self.openrouter_model = openrouter_model
        if ollama_model:
            self.ollama_model = ollama_model
        if ollama_host:
            self.ollama_host = ollama_host
        if tts_provider:
            self.tts_provider = tts_provider
            # Also update the global settings so TTS service picks it up
            settings.tts_provider = tts_provider

    async def get_ollama_models(self) -> list[str]:
        """Fetch available model names from Ollama ([] on any failure)."""
        try:
            async with httpx.AsyncClient(timeout=5.0) as client:
                response = await client.get(f"{self.ollama_host}/api/tags")
                response.raise_for_status()
                data = response.json()
                return [model["name"] for model in data.get("models", [])]
        except Exception as e:
            print(f"Failed to fetch Ollama models: {e}")
            return []

    def get_settings(self) -> dict:
        """Get current settings (sync version without Ollama models)"""
        return {
            "provider": self.provider,
            "openrouter_model": self.openrouter_model,
            "ollama_model": self.ollama_model,
            "ollama_host": self.ollama_host,
            "tts_provider": self.tts_provider,
            "available_openrouter_models": OPENROUTER_MODELS,
            "available_ollama_models": []  # Fetched separately
        }

    async def get_settings_async(self) -> dict:
        """Get current settings with Ollama models.

        Consistency fix: previously duplicated get_settings() field by
        field; now built from it so the two can never drift apart.
        """
        data = self.get_settings()
        data["available_ollama_models"] = await self.get_ollama_models()
        return data

    async def generate(
        self,
        messages: list[dict],
        system_prompt: Optional[str] = None
    ) -> str:
        """
        Generate a response from the LLM.

        Args:
            messages: List of message dicts with 'role' and 'content'
            system_prompt: Optional system prompt to prepend

        Returns:
            Generated text response
        """
        if system_prompt:
            # Builds a new list; the caller's messages list is not mutated.
            messages = [{"role": "system", "content": system_prompt}] + messages

        if self.provider == "openrouter":
            return await self._call_openrouter(messages)
        else:
            return await self._call_ollama(messages)

    async def _call_openrouter(self, messages: list[dict]) -> str:
        """Call OpenRouter API with one retry on timeout.

        Returns an in-character fallback line instead of raising, so the
        radio show keeps flowing on API errors.
        """
        for attempt in range(2):  # Try twice
            try:
                async with httpx.AsyncClient(timeout=30.0) as client:
                    response = await client.post(
                        "https://openrouter.ai/api/v1/chat/completions",
                        headers={
                            "Authorization": f"Bearer {settings.openrouter_api_key}",
                            "Content-Type": "application/json",
                        },
                        json={
                            "model": self.openrouter_model,
                            "messages": messages,
                            "max_tokens": 100,
                        },
                    )
                    response.raise_for_status()
                    data = response.json()
                    return data["choices"][0]["message"]["content"]
            except (httpx.TimeoutException, httpx.ReadTimeout):
                print(f"OpenRouter timeout (attempt {attempt + 1})")
                if attempt == 0:
                    continue  # Retry once
                return "Uh, sorry, I lost you there for a second. What was that?"
            except Exception as e:
                print(f"OpenRouter error: {e}")
                return "Yeah... I don't know, man."
        return "Uh, hold on a sec..."

    async def _call_ollama(self, messages: list[dict]) -> str:
        """Call the Ollama chat API; in-character fallback lines on failure."""
        try:
            async with httpx.AsyncClient() as client:
                response = await client.post(
                    f"{self.ollama_host}/api/chat",
                    json={
                        "model": self.ollama_model,
                        "messages": messages,
                        "stream": False,
                        "options": {
                            "num_predict": 100,    # Allow complete thoughts
                            "temperature": 0.8,    # Balanced creativity/coherence
                            "top_p": 0.9,          # Focused word choices
                            "repeat_penalty": 1.3, # Avoid repetition
                            "top_k": 50,           # Reasonable token variety
                        },
                    },
                    timeout=30.0
                )
                response.raise_for_status()
                data = response.json()
                return data["message"]["content"]
        except httpx.TimeoutException:
            print("Ollama timeout")
            return "Uh, sorry, I lost you there for a second. What was that?"
        except Exception as e:
            print(f"Ollama error: {e}")
            return "Yeah... I don't know, man."
# Models directory
MODELS_DIR = Path(__file__).parent.parent.parent / "models" / "sherpa"

# Try to import sherpa-onnx (optional dependency)
try:
    import sherpa_onnx
    SHERPA_AVAILABLE = True
except ImportError:
    SHERPA_AVAILABLE = False
    sherpa_onnx = None


# Available sherpa-onnx Piper models
PIPER_MODELS = {
    "amy": {
        "dir": "vits-piper-en_US-amy-low",
        "model": "en_US-amy-low.onnx",
        "name": "Amy (US Female)",
        "sample_rate": 16000,
    },
    "joe": {
        "dir": "vits-piper-en_US-joe-medium",
        "model": "en_US-joe-medium.onnx",
        "name": "Joe (US Male)",
        "sample_rate": 22050,
    },
    "lessac": {
        "dir": "vits-piper-en_US-lessac-medium",
        "model": "en_US-lessac-medium.onnx",
        "name": "Lessac (US Female)",
        "sample_rate": 22050,
    },
    "alan": {
        "dir": "vits-piper-en_GB-alan-medium",
        "model": "en_GB-alan-medium.onnx",
        "name": "Alan (UK Male)",
        "sample_rate": 22050,
    },
}


class PiperTTSService:
    """Fast local TTS using sherpa-onnx with Piper models"""

    def __init__(self):
        self.output_sample_rate = 24000  # Our standard output rate
        self._tts_engines: dict[str, any] = {}  # model_key -> OfflineTts cache

    def is_available(self) -> bool:
        """Check if sherpa-onnx is available"""
        return SHERPA_AVAILABLE

    def _get_engine(self, model_key: str):
        """Get or lazily create the TTS engine for `model_key`.

        Returns:
            (engine, model_sample_rate) tuple.
        Raises:
            ValueError: unknown model key.
            RuntimeError: model files not downloaded.
        """
        if model_key in self._tts_engines:
            return self._tts_engines[model_key], PIPER_MODELS[model_key]["sample_rate"]

        if model_key not in PIPER_MODELS:
            raise ValueError(f"Unknown model: {model_key}")

        model_info = PIPER_MODELS[model_key]
        model_dir = MODELS_DIR / model_info["dir"]

        if not model_dir.exists():
            raise RuntimeError(f"Model not found: {model_dir}")

        config = sherpa_onnx.OfflineTtsConfig(
            model=sherpa_onnx.OfflineTtsModelConfig(
                vits=sherpa_onnx.OfflineTtsVitsModelConfig(
                    model=str(model_dir / model_info["model"]),
                    tokens=str(model_dir / "tokens.txt"),
                    data_dir=str(model_dir / "espeak-ng-data"),
                ),
                num_threads=2,
            ),
        )
        tts = sherpa_onnx.OfflineTts(config)
        self._tts_engines[model_key] = tts
        return tts, model_info["sample_rate"]

    async def generate_speech(self, text: str, model_key: str = "amy") -> bytes:
        """Generate speech from text using sherpa-onnx

        Args:
            text: Text to synthesize
            model_key: Model key (amy, joe, lessac, alan)

        Returns:
            Raw PCM audio bytes (16-bit signed int, 24kHz mono)
        """
        if not SHERPA_AVAILABLE:
            raise RuntimeError("sherpa-onnx not installed. Run: pip install sherpa-onnx")

        # Fix: get_running_loop() instead of the (deprecated in coroutines)
        # get_event_loop(); synthesis runs in the executor off the event loop.
        loop = asyncio.get_running_loop()

        def run_tts():
            tts, model_sample_rate = self._get_engine(model_key)
            audio = tts.generate(text)
            samples = np.array(audio.samples, dtype=np.float32)

            # Resample to 24kHz if needed (simple linear interpolation)
            if model_sample_rate != self.output_sample_rate:
                ratio = self.output_sample_rate / model_sample_rate
                new_length = int(len(samples) * ratio)
                samples = np.interp(
                    np.linspace(0, len(samples) - 1, new_length),
                    np.arange(len(samples)),
                    samples
                ).astype(np.float32)

            # Convert to int16.
            # Fix: clip first - samples beyond full scale would wrap around
            # on the int16 cast and produce loud clicks.
            audio_int16 = (np.clip(samples, -1.0, 1.0) * 32767).astype(np.int16)
            return audio_int16.tobytes()

        return await loop.run_in_executor(None, run_tts)

    def list_available_models(self) -> list[dict]:
        """List models whose files are present on disk."""
        available = []
        for key, info in PIPER_MODELS.items():
            model_dir = MODELS_DIR / info["dir"]
            if model_dir.exists():
                available.append({
                    "id": key,
                    "name": info["name"],
                    "sample_rate": info["sample_rate"],
                })
        return available


# Global instance
piper_service = PiperTTSService()


def is_piper_available() -> bool:
    """Check if Piper (sherpa-onnx) is available"""
    return piper_service.is_available()
"""Check if Piper (sherpa-onnx) is available""" + return piper_service.is_available() diff --git a/backend/services/transcription.py b/backend/services/transcription.py new file mode 100644 index 0000000..cef8f85 --- /dev/null +++ b/backend/services/transcription.py @@ -0,0 +1,116 @@ +"""Whisper transcription service""" + +import tempfile +import numpy as np +from faster_whisper import WhisperModel +import librosa + +# Global model instance (loaded once) +_whisper_model = None + + +def get_whisper_model() -> WhisperModel: + """Get or create Whisper model instance""" + global _whisper_model + if _whisper_model is None: + print("Loading Whisper tiny model for fast transcription...") + # Use tiny model for speed - about 3-4x faster than base + # beam_size=1 and best_of=1 for fastest inference + _whisper_model = WhisperModel("tiny", device="cpu", compute_type="int8") + print("Whisper model loaded") + return _whisper_model + + +def decode_audio(audio_data: bytes, source_sample_rate: int = None) -> tuple[np.ndarray, int]: + """ + Decode audio from various formats to numpy array. 
+ + Args: + audio_data: Raw audio bytes + source_sample_rate: If provided, treat as raw PCM at this sample rate + + Returns: + Tuple of (audio array as float32, sample rate) + """ + # If sample rate is provided, assume raw PCM (from server-side recording) + if source_sample_rate is not None: + print(f"Decoding raw PCM at {source_sample_rate}Hz, {len(audio_data)} bytes") + if len(audio_data) % 2 != 0: + audio_data = audio_data + b'\x00' + audio = np.frombuffer(audio_data, dtype=np.int16).astype(np.float32) / 32768.0 + return audio, source_sample_rate + + print(f"First 20 bytes: {audio_data[:20].hex()}") + + # Try to decode with librosa first (handles webm, ogg, wav, mp3, etc via ffmpeg) + try: + with tempfile.NamedTemporaryFile(suffix='.webm', delete=False) as f: + f.write(audio_data) + temp_path = f.name + + audio, sample_rate = librosa.load(temp_path, sr=None, mono=True) + print(f"Decoded with librosa: {len(audio)} samples at {sample_rate}Hz") + + import os + os.unlink(temp_path) + + return audio.astype(np.float32), sample_rate + + except Exception as e: + print(f"librosa decode failed: {e}, trying raw PCM at 16kHz...") + + # Fall back to raw PCM (16-bit signed int, 16kHz mono - Whisper's rate) + if len(audio_data) % 2 != 0: + audio_data = audio_data + b'\x00' + + audio = np.frombuffer(audio_data, dtype=np.int16).astype(np.float32) / 32768.0 + return audio, 16000 + + +async def transcribe_audio(audio_data: bytes, source_sample_rate: int = None) -> str: + """ + Transcribe audio data to text using Whisper. 
+ + Args: + audio_data: Audio bytes (webm, ogg, wav, or raw PCM) + source_sample_rate: If provided, treat audio_data as raw PCM at this rate + + Returns: + Transcribed text + """ + model = get_whisper_model() + + print(f"Transcribing audio: {len(audio_data)} bytes") + + # Decode audio from whatever format + audio, detected_sample_rate = decode_audio(audio_data, source_sample_rate) + + print(f"Audio samples: {len(audio)}, duration: {len(audio)/detected_sample_rate:.2f}s") + print(f"Audio range: min={audio.min():.4f}, max={audio.max():.4f}") + + # Check if audio is too quiet + if np.abs(audio).max() < 0.01: + print("Warning: Audio appears to be silent or very quiet") + return "" + + # Resample to 16kHz for Whisper + if detected_sample_rate != 16000: + audio_16k = librosa.resample(audio, orig_sr=detected_sample_rate, target_sr=16000) + print(f"Resampled to {len(audio_16k)} samples at 16kHz") + else: + audio_16k = audio + + # Transcribe with speed optimizations + segments, info = model.transcribe( + audio_16k, + beam_size=1, # Faster, slightly less accurate + best_of=1, + language="en", # Skip language detection + vad_filter=True, # Skip silence + ) + segments_list = list(segments) + text = " ".join([s.text for s in segments_list]).strip() + + print(f"Transcription result: '{text}' (language: {info.language}, prob: {info.language_probability:.2f})") + + return text diff --git a/backend/services/tts.py b/backend/services/tts.py new file mode 100644 index 0000000..6ba3708 --- /dev/null +++ b/backend/services/tts.py @@ -0,0 +1,701 @@ +"""TTS service with ElevenLabs, F5-TTS, MLX Kokoro, StyleTTS2, VITS, and Bark support""" + +import os +import numpy as np +from scipy.signal import butter, filtfilt +from pathlib import Path +import tempfile +import torch + +from ..config import settings + +# Patch torch.load for compatibility with PyTorch 2.6+ +_original_torch_load = torch.load +def _patched_torch_load(*args, **kwargs): + kwargs['weights_only'] = False + return 
_original_torch_load(*args, **kwargs) +torch.load = _patched_torch_load + +# Global clients +_elevenlabs_client = None +_vits_tts = None +_bark_loaded = False +_kokoro_model = None +_styletts2_model = None +_f5tts_model = None +_chattts_model = None +_chattts_speakers = {} # Cache for speaker embeddings + +# Kokoro voice mapping - using highest-graded voices +# Grades from https://huggingface.co/hexgrad/Kokoro-82M/blob/main/VOICES.md +KOKORO_VOICES = { + # Male voices (best available are C+ grade) + "VR6AewLTigWG4xSOukaG": "am_fenrir", # Tony - deep/powerful (C+) + "TxGEqnHWrfWFTfGW9XjX": "am_michael", # Rick - solid male voice (C+) + "pNInz6obpgDQGcFmaJgB": "am_puck", # Dennis - anxious dad (C+) + "ODq5zmih8GrVes37Dizd": "bm_george", # Earl - older/distinguished British (C) + "IKne3meq5aSn9XLyUdCD": "bm_fable", # Marcus - young British (C) + # Female voices (much better quality available) + "jBpfuIE2acCO8z3wKNLl": "af_heart", # Jasmine - best quality (A) + "EXAVITQu4vr4xnSDxMaL": "af_bella", # Megan - warm/friendly (A-) + "21m00Tcm4TlvDq8ikWAM": "bf_emma", # Tanya - professional British (B-) + "XB0fDUnXU5powFXDhCwa": "af_nicole", # Carla - Jersey mom (B-) + "pFZP5JQG7iQjIQuC4Bku": "af_sarah", # Brenda - overthinker (C+) +} + +# Speed adjustments per voice (1.0 = normal, lower = slower/more natural) +# Slower speeds (0.85-0.95) generally sound more natural +KOKORO_SPEEDS = { + # Male voices - slower speeds help with C+ grade voices + "VR6AewLTigWG4xSOukaG": 0.9, # Tony (am_fenrir) - deep voice, slower + "TxGEqnHWrfWFTfGW9XjX": 0.92, # Rick (am_michael) - solid pace + "pNInz6obpgDQGcFmaJgB": 0.95, # Dennis (am_puck) - anxious but not rushed + "ODq5zmih8GrVes37Dizd": 0.85, # Earl (bm_george) - older, slower British + "IKne3meq5aSn9XLyUdCD": 0.95, # Marcus (bm_fable) - young, natural + # Female voices - A-grade voices can handle faster speeds + "jBpfuIE2acCO8z3wKNLl": 0.95, # Jasmine (af_heart) - best voice, natural pace + "EXAVITQu4vr4xnSDxMaL": 0.95, # Megan 
(af_bella) - warm + "21m00Tcm4TlvDq8ikWAM": 0.9, # Tanya (bf_emma) - professional British + "XB0fDUnXU5powFXDhCwa": 0.95, # Carla (af_nicole) - animated but clear + "pFZP5JQG7iQjIQuC4Bku": 0.92, # Brenda (af_sarah) - overthinker, measured +} + +DEFAULT_KOKORO_VOICE = "af_heart" +DEFAULT_KOKORO_SPEED = 0.95 + +# VCTK speaker mapping - different voices for different callers +VITS_SPEAKERS = { + # Male voices + "VR6AewLTigWG4xSOukaG": "p226", # Tony + "TxGEqnHWrfWFTfGW9XjX": "p251", # Rick + "pNInz6obpgDQGcFmaJgB": "p245", # Dennis + "ODq5zmih8GrVes37Dizd": "p232", # Earl + "IKne3meq5aSn9XLyUdCD": "p252", # Marcus + # Female voices + "jBpfuIE2acCO8z3wKNLl": "p225", # Jasmine + "EXAVITQu4vr4xnSDxMaL": "p228", # Megan + "21m00Tcm4TlvDq8ikWAM": "p229", # Tanya + "XB0fDUnXU5powFXDhCwa": "p231", # Carla + "pFZP5JQG7iQjIQuC4Bku": "p233", # Brenda +} + +DEFAULT_VITS_SPEAKER = "p225" + +# Inworld voice mapping - maps ElevenLabs voice IDs to Inworld voices +# Full voice list from API: Alex, Ashley, Blake, Carter, Clive, Craig, Deborah, +# Dennis, Dominus, Edward, Elizabeth, Hades, Hana, Julia, Luna, Mark, Olivia, +# Pixie, Priya, Ronald, Sarah, Shaun, Theodore, Timothy, Wendy +INWORLD_VOICES = { + # Male voices - each caller gets a unique voice matching their personality + "VR6AewLTigWG4xSOukaG": "Edward", # Tony - fast-talking, emphatic, streetwise + "TxGEqnHWrfWFTfGW9XjX": "Shaun", # Rick - friendly, dynamic, conversational + "pNInz6obpgDQGcFmaJgB": "Alex", # Dennis - energetic, expressive, mildly nasal + "ODq5zmih8GrVes37Dizd": "Craig", # Earl - older British, refined, articulate + "IKne3meq5aSn9XLyUdCD": "Timothy", # Marcus - lively, upbeat American + # Female voices - each caller gets a unique voice matching their personality + "jBpfuIE2acCO8z3wKNLl": "Hana", # Jasmine - bright, expressive young female + "EXAVITQu4vr4xnSDxMaL": "Ashley", # Megan - warm, natural female + "21m00Tcm4TlvDq8ikWAM": "Wendy", # Tanya - posh, middle-aged British + "XB0fDUnXU5powFXDhCwa": "Sarah", 
# Carla - fast-talking, questioning tone + "pFZP5JQG7iQjIQuC4Bku": "Deborah", # Brenda - gentle, elegant +} +DEFAULT_INWORLD_VOICE = "Dennis" + + +def preprocess_text_for_kokoro(text: str) -> str: + """ + Preprocess text to improve Kokoro prosody and naturalness. + + - Adds slight pauses via punctuation + - Handles contractions and abbreviations + - Normalizes spacing + """ + import re + + # Normalize whitespace + text = ' '.join(text.split()) + + # Add comma pauses after common transition words (if no punctuation follows) + transitions = [ + r'\b(Well)\s+(?=[A-Za-z])', + r'\b(So)\s+(?=[A-Za-z])', + r'\b(Now)\s+(?=[A-Za-z])', + r'\b(Look)\s+(?=[A-Za-z])', + r'\b(See)\s+(?=[A-Za-z])', + r'\b(Anyway)\s+(?=[A-Za-z])', + r'\b(Actually)\s+(?=[A-Za-z])', + r'\b(Honestly)\s+(?=[A-Za-z])', + r'\b(Basically)\s+(?=[A-Za-z])', + ] + for pattern in transitions: + text = re.sub(pattern, r'\1, ', text) + + # Add pause after "I mean" at start of sentence + text = re.sub(r'^(I mean)\s+', r'\1, ', text) + text = re.sub(r'\.\s+(I mean)\s+', r'. 
\1, ', text) + + # Expand common abbreviations for better pronunciation + abbreviations = { + r'\bDr\.': 'Doctor', + r'\bMr\.': 'Mister', + r'\bMrs\.': 'Missus', + r'\bMs\.': 'Miss', + r'\bSt\.': 'Street', + r'\bAve\.': 'Avenue', + r'\betc\.': 'etcetera', + r'\bvs\.': 'versus', + r'\bw/': 'with', + r'\bw/o': 'without', + } + for abbr, expansion in abbreviations.items(): + text = re.sub(abbr, expansion, text, flags=re.IGNORECASE) + + # Add breath pause (comma) before conjunctions in long sentences + text = re.sub(r'(\w{20,})\s+(and|but|or)\s+', r'\1, \2 ', text) + + # Ensure proper spacing after punctuation + text = re.sub(r'([.!?])\s*([A-Z])', r'\1 \2', text) + + return text + +# StyleTTS2 reference voice files (place .wav files in voices/ directory for voice cloning) +# Maps voice_id to reference audio filename - if file doesn't exist, uses default voice +STYLETTS2_VOICES = { + # Male voices + "VR6AewLTigWG4xSOukaG": "tony.wav", # Tony + "TxGEqnHWrfWFTfGW9XjX": "rick.wav", # Rick + "pNInz6obpgDQGcFmaJgB": "dennis.wav", # Dennis + "ODq5zmih8GrVes37Dizd": "earl.wav", # Earl + "IKne3meq5aSn9XLyUdCD": "marcus.wav", # Marcus + # Female voices + "jBpfuIE2acCO8z3wKNLl": "jasmine.wav", # Jasmine + "EXAVITQu4vr4xnSDxMaL": "megan.wav", # Megan + "21m00Tcm4TlvDq8ikWAM": "tanya.wav", # Tanya + "XB0fDUnXU5powFXDhCwa": "carla.wav", # Carla + "pFZP5JQG7iQjIQuC4Bku": "brenda.wav", # Brenda +} + +# F5-TTS reference voices (same files as StyleTTS2, reuses voices/ directory) +# Requires: mono, 24kHz, 5-10 seconds, with transcript in .txt file +F5TTS_VOICES = STYLETTS2_VOICES.copy() + +# ChatTTS speaker seeds - different seeds produce different voices +# These are used to generate consistent speaker embeddings +CHATTTS_SEEDS = { + # Male voices + "VR6AewLTigWG4xSOukaG": 42, # Tony - deep voice + "TxGEqnHWrfWFTfGW9XjX": 123, # Rick + "pNInz6obpgDQGcFmaJgB": 456, # Dennis + "ODq5zmih8GrVes37Dizd": 789, # Earl + "IKne3meq5aSn9XLyUdCD": 1011, # Marcus + # Female voices + 
"jBpfuIE2acCO8z3wKNLl": 2024, # Jasmine + "EXAVITQu4vr4xnSDxMaL": 3033, # Megan + "21m00Tcm4TlvDq8ikWAM": 4042, # Tanya + "XB0fDUnXU5powFXDhCwa": 5051, # Carla + "pFZP5JQG7iQjIQuC4Bku": 6060, # Brenda +} +DEFAULT_CHATTTS_SEED = 42 + + +def get_elevenlabs_client(): + """Get or create ElevenLabs client""" + global _elevenlabs_client + if _elevenlabs_client is None: + from elevenlabs.client import ElevenLabs + _elevenlabs_client = ElevenLabs(api_key=settings.elevenlabs_api_key) + return _elevenlabs_client + + +def get_vits_tts(): + """Get or create VITS VCTK TTS instance""" + global _vits_tts + if _vits_tts is None: + from TTS.api import TTS + _vits_tts = TTS("tts_models/en/vctk/vits") + return _vits_tts + + +def get_kokoro_model(): + """Get or create Kokoro MLX model""" + global _kokoro_model + if _kokoro_model is None: + from mlx_audio.tts.utils import load_model + _kokoro_model = load_model(model_path='mlx-community/Kokoro-82M-bf16') + print("Kokoro MLX model loaded") + return _kokoro_model + + +def ensure_bark_loaded(): + """Ensure Bark models are loaded on GPU""" + global _bark_loaded + if not _bark_loaded: + os.environ['SUNO_USE_SMALL_MODELS'] = '1' + + # Force Bark to use MPS (Apple Silicon GPU) + if torch.backends.mps.is_available(): + os.environ['SUNO_OFFLOAD_CPU'] = '0' + os.environ['SUNO_ENABLE_MPS'] = '1' + + from bark import preload_models + preload_models() + _bark_loaded = True + print(f"Bark loaded on device: {'MPS' if torch.backends.mps.is_available() else 'CPU'}") + + +def get_styletts2_model(): + """Get or create StyleTTS2 model""" + global _styletts2_model + if _styletts2_model is None: + from styletts2 import tts + _styletts2_model = tts.StyleTTS2() + print("StyleTTS2 model loaded") + return _styletts2_model + + +def get_f5tts_generate(): + """Get F5-TTS generate function (lazy load)""" + global _f5tts_model + if _f5tts_model is None: + # Disable tqdm progress bars to avoid BrokenPipeError in server context + import os + 
os.environ['HF_HUB_DISABLE_PROGRESS_BARS'] = '1' + os.environ['TQDM_DISABLE'] = '1' + + from f5_tts_mlx.generate import generate + _f5tts_model = generate + print("F5-TTS MLX loaded") + return _f5tts_model + + +def get_chattts_model(): + """Get or create ChatTTS model""" + global _chattts_model + if _chattts_model is None: + import ChatTTS + _chattts_model = ChatTTS.Chat() + _chattts_model.load(compile=False) + print("ChatTTS model loaded") + return _chattts_model + + +def get_chattts_speaker(voice_id: str): + """Get or create a consistent speaker embedding for a voice""" + global _chattts_speakers + if voice_id not in _chattts_speakers: + chat = get_chattts_model() + seed = CHATTTS_SEEDS.get(voice_id, DEFAULT_CHATTTS_SEED) + # Set seed for reproducible speaker + torch.manual_seed(seed) + _chattts_speakers[voice_id] = chat.sample_random_speaker() + print(f"[ChatTTS] Created speaker for voice {voice_id} with seed {seed}") + return _chattts_speakers[voice_id] + + +def phone_filter(audio: np.ndarray, sample_rate: int = 24000, quality: str = "normal") -> np.ndarray: + """Apply phone filter with variable quality.""" + audio = audio.flatten() + + presets = { + "good": (200, 7000, 1.0, 0.0), + "normal": (300, 3400, 1.5, 0.005), + "bad": (400, 2800, 2.0, 0.015), + "terrible": (500, 2200, 2.5, 0.03), + } + + low_hz, high_hz, distortion, noise = presets.get(quality, presets["normal"]) + + low = low_hz / (sample_rate / 2) + high = high_hz / (sample_rate / 2) + b, a = butter(4, [low, high], btype='band') + filtered = filtfilt(b, a, audio) + + filtered = np.tanh(filtered * distortion) * 0.8 + + if noise > 0: + static = np.random.normal(0, noise, len(filtered)).astype(np.float32) + static_envelope = np.random.random(len(filtered) // 1000 + 1) + static_envelope = np.repeat(static_envelope, 1000)[:len(filtered)] + static *= (static_envelope > 0.7).astype(np.float32) + filtered = filtered + static + + return filtered.astype(np.float32) + + +async def 
generate_speech_elevenlabs(text: str, voice_id: str) -> tuple[np.ndarray, int]: + """Generate speech using ElevenLabs""" + client = get_elevenlabs_client() + + audio_gen = client.text_to_speech.convert( + voice_id=voice_id, + text=text, + model_id="eleven_v3", + output_format="pcm_24000" + ) + + audio_bytes = b"".join(audio_gen) + audio = np.frombuffer(audio_bytes, dtype=np.int16).astype(np.float32) / 32768.0 + + return audio, 24000 + + +async def generate_speech_kokoro(text: str, voice_id: str) -> tuple[np.ndarray, int]: + """Generate speech using MLX Kokoro (fast, good quality, Apple Silicon optimized)""" + import librosa + from mlx_audio.tts.generate import generate_audio + + model = get_kokoro_model() + voice = KOKORO_VOICES.get(voice_id, DEFAULT_KOKORO_VOICE) + speed = KOKORO_SPEEDS.get(voice_id, DEFAULT_KOKORO_SPEED) + + # Preprocess text for better prosody + text = preprocess_text_for_kokoro(text) + + # Determine lang_code from voice prefix (a=American, b=British) + lang_code = 'b' if voice.startswith('b') else 'a' + + with tempfile.TemporaryDirectory() as tmpdir: + generate_audio( + text, + model=model, + voice=voice, + speed=speed, + lang_code=lang_code, + output_path=tmpdir, + file_prefix='tts', + verbose=False + ) + + # Read the generated audio file + audio_file = Path(tmpdir) / 'tts_000.wav' + if not audio_file.exists(): + raise RuntimeError("Kokoro failed to generate audio") + + audio, sr = librosa.load(str(audio_file), sr=None, mono=True) + + # Resample to 24kHz if needed + if sr != 24000: + audio = librosa.resample(audio, orig_sr=sr, target_sr=24000) + + return audio.astype(np.float32), 24000 + + +async def generate_speech_vits(text: str, voice_id: str) -> tuple[np.ndarray, int]: + """Generate speech using VITS VCTK (fast, multiple speakers)""" + import librosa + + tts = get_vits_tts() + speaker = VITS_SPEAKERS.get(voice_id, DEFAULT_VITS_SPEAKER) + + with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp: + tmp_path = tmp.name + + try: 
+ tts.tts_to_file(text=text, file_path=tmp_path, speaker=speaker) + audio, sr = librosa.load(tmp_path, sr=None, mono=True) + + if sr != 24000: + audio = librosa.resample(audio, orig_sr=sr, target_sr=24000) + + return audio.astype(np.float32), 24000 + finally: + Path(tmp_path).unlink(missing_ok=True) + + +async def generate_speech_bark(text: str, voice_id: str) -> tuple[np.ndarray, int]: + """Generate speech using Bark (slow but expressive, supports emotes like [laughs])""" + import librosa + from bark import SAMPLE_RATE, generate_audio + + ensure_bark_loaded() + + # Generate audio with Bark + audio = generate_audio(text) + + # Normalize to prevent clipping (Bark can exceed [-1, 1]) + max_val = np.abs(audio).max() + if max_val > 0.95: + audio = audio * (0.95 / max_val) + + # Resample to 24kHz if needed + if SAMPLE_RATE != 24000: + audio = librosa.resample(audio, orig_sr=SAMPLE_RATE, target_sr=24000) + + return audio.astype(np.float32), 24000 + + +async def generate_speech_styletts2(text: str, voice_id: str) -> tuple[np.ndarray, int]: + """Generate speech using StyleTTS2 (high quality, supports voice cloning)""" + import librosa + + model = get_styletts2_model() + + # Check for reference voice file + voice_file = STYLETTS2_VOICES.get(voice_id) + voice_path = None + if voice_file: + voice_path = settings.base_dir / "voices" / voice_file + if not voice_path.exists(): + voice_path = None # Use default voice if file doesn't exist + + # Generate audio + with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp: + tmp_path = tmp.name + + try: + if voice_path: + print(f"[StyleTTS2] Using voice clone: {voice_path}") + audio = model.inference( + text, + target_voice_path=str(voice_path), + output_wav_file=tmp_path, + output_sample_rate=24000, + diffusion_steps=5, # Balance quality/speed + alpha=0.3, # More voice-like than text-like + beta=0.7, # Good prosody + ) + else: + print("[StyleTTS2] Using default voice") + audio = model.inference( + text, + 
output_wav_file=tmp_path, + output_sample_rate=24000, + diffusion_steps=5, + ) + + # Load the generated audio + audio, sr = librosa.load(tmp_path, sr=None, mono=True) + + if sr != 24000: + audio = librosa.resample(audio, orig_sr=sr, target_sr=24000) + + return audio.astype(np.float32), 24000 + finally: + Path(tmp_path).unlink(missing_ok=True) + + +async def generate_speech_f5tts(text: str, voice_id: str) -> tuple[np.ndarray, int]: + """Generate speech using F5-TTS MLX (very natural, supports voice cloning)""" + import librosa + + generate = get_f5tts_generate() + + # Check for reference voice file and transcript + voice_file = F5TTS_VOICES.get(voice_id) + ref_audio_path = None + ref_text = None + + if voice_file: + voice_path = settings.base_dir / "voices" / voice_file + txt_path = voice_path.with_suffix('.txt') + + if voice_path.exists() and txt_path.exists(): + ref_audio_path = str(voice_path) + ref_text = txt_path.read_text().strip() + print(f"[F5-TTS] Using voice clone: {voice_path}") + + if not ref_audio_path: + print("[F5-TTS] Using default voice") + + # Generate audio to temp file + with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp: + tmp_path = tmp.name + + try: + generate( + generation_text=text, + ref_audio_path=ref_audio_path, + ref_audio_text=ref_text, + steps=8, + speed=1.0, + output_path=tmp_path, + ) + + # Load the generated audio + audio, sr = librosa.load(tmp_path, sr=None, mono=True) + + # Resample to 24kHz if needed + if sr != 24000: + audio = librosa.resample(audio, orig_sr=sr, target_sr=24000) + + return audio.astype(np.float32), 24000 + finally: + Path(tmp_path).unlink(missing_ok=True) + + +async def generate_speech_chattts(text: str, voice_id: str) -> tuple[np.ndarray, int]: + """Generate speech using ChatTTS (natural conversational speech, multiple speakers)""" + import ChatTTS + + chat = get_chattts_model() + + # Ensure text is not empty and has reasonable content + text = text.strip() + if not text: + text = "Hello." 
+ + print(f"[ChatTTS] Generating speech for: {text[:50]}...") + + # Get consistent speaker for this voice + seed = CHATTTS_SEEDS.get(voice_id, DEFAULT_CHATTTS_SEED) + torch.manual_seed(seed) + + # Configure inference parameters + params_infer_code = ChatTTS.Chat.InferCodeParams( + temperature=0.3, + top_P=0.7, + top_K=20, + ) + + # Generate audio (skip text refinement to avoid narrow() error with this version) + wavs = chat.infer( + [text], + params_infer_code=params_infer_code, + skip_refine_text=True, + ) + + if wavs is None or len(wavs) == 0: + raise RuntimeError("ChatTTS failed to generate audio") + + audio = wavs[0] + + # Handle different output shapes + if audio.ndim > 1: + audio = audio.squeeze() + + # Normalize + max_val = np.abs(audio).max() + if max_val > 0.95: + audio = audio * (0.95 / max_val) + + return audio.astype(np.float32), 24000 + + +async def generate_speech_inworld(text: str, voice_id: str) -> tuple[np.ndarray, int]: + """Generate speech using Inworld TTS API (high quality, natural voices)""" + import httpx + import base64 + import librosa + + voice = INWORLD_VOICES.get(voice_id, DEFAULT_INWORLD_VOICE) + + api_key = settings.inworld_api_key + if not api_key: + raise RuntimeError("INWORLD_API_KEY not set in environment") + + print(f"[Inworld TTS] Voice: {voice}, Text: {text[:50]}...") + + url = "https://api.inworld.ai/tts/v1/voice" + headers = { + "Content-Type": "application/json", + "Authorization": f"Basic {api_key}", + } + payload = { + "text": text, + "voice_id": voice, + "model_id": "inworld-tts-1.5-mini", + "audio_config": { + "encoding": "LINEAR16", + "sample_rate_hertz": 48000, + }, + } + + async with httpx.AsyncClient(timeout=60.0) as client: + response = await client.post(url, json=payload, headers=headers) + response.raise_for_status() + data = response.json() + + # Decode base64 audio + audio_b64 = data.get("audioContent") + if not audio_b64: + raise RuntimeError("Inworld TTS returned no audio content") + + audio_bytes = 
base64.b64decode(audio_b64) + + # Parse audio using soundfile (handles WAV, MP3, etc.) + import soundfile as sf + import io + + # soundfile can read WAV, FLAC, OGG, and with ffmpeg: MP3 + # MP3 files start with ID3 tag or 0xff sync bytes + try: + audio, sr = sf.read(io.BytesIO(audio_bytes)) + except Exception as e: + print(f"[Inworld TTS] soundfile failed: {e}, trying raw PCM") + # Fallback to raw PCM + if len(audio_bytes) % 2 != 0: + audio_bytes = audio_bytes[:-1] + audio = np.frombuffer(audio_bytes, dtype=np.int16).astype(np.float32) / 32768.0 + sr = 48000 + + # Resample to 24kHz to match other providers + if sr != 24000: + audio = librosa.resample(audio, orig_sr=sr, target_sr=24000) + + return audio.astype(np.float32), 24000 + + +async def generate_speech( + text: str, + voice_id: str, + phone_quality: str = "normal", + apply_filter: bool = True +) -> bytes: + """ + Generate speech from text. + + Args: + text: Text to speak + voice_id: ElevenLabs voice ID (mapped to local voice if using local TTS) + phone_quality: Quality of phone filter ("none" to disable) + apply_filter: Whether to apply phone filter + + Returns: + Raw PCM audio bytes (16-bit signed int, 24kHz) + """ + # Choose TTS provider + provider = settings.tts_provider + print(f"[TTS] Provider: {provider}, Text: {text[:50]}...") + + if provider == "kokoro": + audio, sample_rate = await generate_speech_kokoro(text, voice_id) + elif provider == "f5tts": + audio, sample_rate = await generate_speech_f5tts(text, voice_id) + elif provider == "inworld": + audio, sample_rate = await generate_speech_inworld(text, voice_id) + elif provider == "chattts": + audio, sample_rate = await generate_speech_chattts(text, voice_id) + elif provider == "styletts2": + audio, sample_rate = await generate_speech_styletts2(text, voice_id) + elif provider == "bark": + audio, sample_rate = await generate_speech_bark(text, voice_id) + elif provider == "vits": + audio, sample_rate = await generate_speech_vits(text, voice_id) + elif 
provider == "elevenlabs": + audio, sample_rate = await generate_speech_elevenlabs(text, voice_id) + else: + raise ValueError(f"Unknown TTS provider: {provider}") + + # Apply phone filter if requested + # Skip filter for Bark - it already has rough audio quality + if apply_filter and phone_quality not in ("none", "studio") and provider != "bark": + audio = phone_filter(audio, sample_rate, phone_quality) + + # Convert to bytes + audio_int16 = (audio * 32768).clip(-32768, 32767).astype(np.int16) + return audio_int16.tobytes() + + +# Voice IDs for cohost and announcer +COHOST_VOICE_ID = "nPczCjzI2devNBz1zQrb" +ANNOUNCER_VOICE_ID = "ErXwobaYiN019PkySvjV" + + +async def generate_cohost_speech(text: str) -> bytes: + """Generate speech for cohost Bobby (no phone filter)""" + return await generate_speech(text, COHOST_VOICE_ID, apply_filter=False) + + +async def generate_announcer_speech(text: str) -> bytes: + """Generate speech for announcer (no phone filter)""" + return await generate_speech(text, ANNOUNCER_VOICE_ID, apply_filter=False) diff --git a/backend/services/voices.py b/backend/services/voices.py new file mode 100644 index 0000000..025c414 --- /dev/null +++ b/backend/services/voices.py @@ -0,0 +1,200 @@ +"""Voice configuration and TTS provider management""" + +from dataclasses import dataclass +from typing import Optional +from enum import Enum + + +class TTSProvider(str, Enum): + ELEVENLABS = "elevenlabs" + EDGE = "edge" # Microsoft Edge TTS (free) + PIPER = "piper" # Local Piper via sherpa-onnx (free, fast) + + +@dataclass +class Voice: + """Voice configuration""" + id: str + name: str + provider: TTSProvider + provider_voice_id: str # The actual ID used by the provider + description: str = "" + language: str = "en" + gender: str = "neutral" + + +# ElevenLabs voices +ELEVENLABS_VOICES = [ + Voice("el_tony", "Tony (ElevenLabs)", TTSProvider.ELEVENLABS, "IKne3meq5aSn9XLyUdCD", + "Male, New York accent, expressive", "en", "male"), + Voice("el_jasmine", "Jasmine 
(ElevenLabs)", TTSProvider.ELEVENLABS, "FGY2WhTYpPnrIDTdsKH5", + "Female, confident, direct", "en", "female"), + Voice("el_rick", "Rick (ElevenLabs)", TTSProvider.ELEVENLABS, "JBFqnCBsd6RMkjVDRZzb", + "Male, Texas accent, older", "en", "male"), + Voice("el_megan", "Megan (ElevenLabs)", TTSProvider.ELEVENLABS, "XrExE9yKIg1WjnnlVkGX", + "Female, young, casual", "en", "female"), + Voice("el_dennis", "Dennis (ElevenLabs)", TTSProvider.ELEVENLABS, "cjVigY5qzO86Huf0OWal", + "Male, middle-aged, anxious", "en", "male"), + Voice("el_tanya", "Tanya (ElevenLabs)", TTSProvider.ELEVENLABS, "N2lVS1w4EtoT3dr4eOWO", + "Female, Miami, sassy", "en", "female"), + Voice("el_earl", "Earl (ElevenLabs)", TTSProvider.ELEVENLABS, "EXAVITQu4vr4xnSDxMaL", + "Male, elderly, Southern", "en", "male"), + Voice("el_carla", "Carla (ElevenLabs)", TTSProvider.ELEVENLABS, "CwhRBWXzGAHq8TQ4Fs17", + "Female, Jersey, sharp", "en", "female"), + Voice("el_marcus", "Marcus (ElevenLabs)", TTSProvider.ELEVENLABS, "bIHbv24MWmeRgasZH58o", + "Male, young, urban", "en", "male"), + Voice("el_brenda", "Brenda (ElevenLabs)", TTSProvider.ELEVENLABS, "Xb7hH8MSUJpSbSDYk0k2", + "Female, middle-aged, worried", "en", "female"), + Voice("el_jake", "Jake (ElevenLabs)", TTSProvider.ELEVENLABS, "SOYHLrjzK2X1ezoPC6cr", + "Male, Boston, insecure", "en", "male"), + Voice("el_diane", "Diane (ElevenLabs)", TTSProvider.ELEVENLABS, "cgSgspJ2msm6clMCkdW9", + "Female, mature, conflicted", "en", "female"), + Voice("el_bobby", "Bobby (ElevenLabs)", TTSProvider.ELEVENLABS, "nPczCjzI2devNBz1zQrb", + "Male, sidekick, wisecracking", "en", "male"), + Voice("el_announcer", "Announcer (ElevenLabs)", TTSProvider.ELEVENLABS, "ErXwobaYiN019PkySvjV", + "Male, radio announcer", "en", "male"), +] + +# Edge TTS voices (Microsoft, free) +EDGE_VOICES = [ + # US voices + Voice("edge_jenny", "Jenny (Edge)", TTSProvider.EDGE, "en-US-JennyNeural", + "Female, American, friendly", "en", "female"), + Voice("edge_guy", "Guy (Edge)", TTSProvider.EDGE, 
"en-US-GuyNeural", + "Male, American, casual", "en", "male"), + Voice("edge_aria", "Aria (Edge)", TTSProvider.EDGE, "en-US-AriaNeural", + "Female, American, professional", "en", "female"), + Voice("edge_davis", "Davis (Edge)", TTSProvider.EDGE, "en-US-DavisNeural", + "Male, American, calm", "en", "male"), + Voice("edge_amber", "Amber (Edge)", TTSProvider.EDGE, "en-US-AmberNeural", + "Female, American, warm", "en", "female"), + Voice("edge_andrew", "Andrew (Edge)", TTSProvider.EDGE, "en-US-AndrewNeural", + "Male, American, confident", "en", "male"), + Voice("edge_ashley", "Ashley (Edge)", TTSProvider.EDGE, "en-US-AshleyNeural", + "Female, American, cheerful", "en", "female"), + Voice("edge_brian", "Brian (Edge)", TTSProvider.EDGE, "en-US-BrianNeural", + "Male, American, narrator", "en", "male"), + Voice("edge_christopher", "Christopher (Edge)", TTSProvider.EDGE, "en-US-ChristopherNeural", + "Male, American, reliable", "en", "male"), + Voice("edge_cora", "Cora (Edge)", TTSProvider.EDGE, "en-US-CoraNeural", + "Female, American, older", "en", "female"), + Voice("edge_elizabeth", "Elizabeth (Edge)", TTSProvider.EDGE, "en-US-ElizabethNeural", + "Female, American, elegant", "en", "female"), + Voice("edge_eric", "Eric (Edge)", TTSProvider.EDGE, "en-US-EricNeural", + "Male, American, friendly", "en", "male"), + Voice("edge_jacob", "Jacob (Edge)", TTSProvider.EDGE, "en-US-JacobNeural", + "Male, American, young", "en", "male"), + Voice("edge_michelle", "Michelle (Edge)", TTSProvider.EDGE, "en-US-MichelleNeural", + "Female, American, clear", "en", "female"), + Voice("edge_monica", "Monica (Edge)", TTSProvider.EDGE, "en-US-MonicaNeural", + "Female, American, expressive", "en", "female"), + Voice("edge_roger", "Roger (Edge)", TTSProvider.EDGE, "en-US-RogerNeural", + "Male, American, mature", "en", "male"), + Voice("edge_steffan", "Steffan (Edge)", TTSProvider.EDGE, "en-US-SteffanNeural", + "Male, American, formal", "en", "male"), + Voice("edge_tony", "Tony (Edge)", 
TTSProvider.EDGE, "en-US-TonyNeural", + "Male, American, conversational", "en", "male"), + # UK voices + Voice("edge_sonia", "Sonia (Edge UK)", TTSProvider.EDGE, "en-GB-SoniaNeural", + "Female, British, professional", "en", "female"), + Voice("edge_ryan", "Ryan (Edge UK)", TTSProvider.EDGE, "en-GB-RyanNeural", + "Male, British, clear", "en", "male"), + Voice("edge_libby", "Libby (Edge UK)", TTSProvider.EDGE, "en-GB-LibbyNeural", + "Female, British, warm", "en", "female"), + Voice("edge_thomas", "Thomas (Edge UK)", TTSProvider.EDGE, "en-GB-ThomasNeural", + "Male, British, friendly", "en", "male"), + # Australian voices + Voice("edge_natasha", "Natasha (Edge AU)", TTSProvider.EDGE, "en-AU-NatashaNeural", + "Female, Australian, friendly", "en", "female"), + Voice("edge_william", "William (Edge AU)", TTSProvider.EDGE, "en-AU-WilliamNeural", + "Male, Australian, casual", "en", "male"), +] + +# Piper voices (local, via sherpa-onnx) +PIPER_VOICES = [ + Voice("piper_amy", "Amy (Piper)", TTSProvider.PIPER, "amy", + "Female, American, clear", "en", "female"), + Voice("piper_joe", "Joe (Piper)", TTSProvider.PIPER, "joe", + "Male, American, natural", "en", "male"), + Voice("piper_lessac", "Lessac (Piper)", TTSProvider.PIPER, "lessac", + "Female, American, expressive", "en", "female"), + Voice("piper_alan", "Alan (Piper)", TTSProvider.PIPER, "alan", + "Male, British, clear", "en", "male"), +] + +# All voices combined +ALL_VOICES = ELEVENLABS_VOICES + EDGE_VOICES + PIPER_VOICES + +# Voice lookup by ID +VOICES_BY_ID = {v.id: v for v in ALL_VOICES} + +# Default voice assignments for callers (maps caller key to voice ID) +DEFAULT_CALLER_VOICES = { + "1": "el_tony", # Tony from Staten Island + "2": "el_jasmine", # Jasmine from Atlanta + "3": "el_rick", # Rick from Texas + "4": "el_megan", # Megan from Portland + "5": "el_dennis", # Dennis from Long Island + "6": "el_tanya", # Tanya from Miami + "7": "el_earl", # Earl from Tennessee + "8": "el_carla", # Carla from Jersey + "9": 
"el_marcus", # Marcus from Detroit + "0": "el_brenda", # Brenda from Phoenix + "-": "el_jake", # Jake from Boston + "=": "el_diane", # Diane from Chicago + "bobby": "el_bobby", + "announcer": "el_announcer", +} + + +class VoiceManager: + """Manages voice assignments and TTS provider selection""" + + def __init__(self): + # Current voice assignments (can be modified at runtime) + self.caller_voices = DEFAULT_CALLER_VOICES.copy() + + def get_voice(self, voice_id: str) -> Optional[Voice]: + """Get voice by ID""" + return VOICES_BY_ID.get(voice_id) + + def get_caller_voice(self, caller_key: str) -> Voice: + """Get the voice assigned to a caller""" + voice_id = self.caller_voices.get(caller_key, "el_tony") + return VOICES_BY_ID.get(voice_id, ELEVENLABS_VOICES[0]) + + def set_caller_voice(self, caller_key: str, voice_id: str): + """Assign a voice to a caller""" + if voice_id in VOICES_BY_ID: + self.caller_voices[caller_key] = voice_id + + def get_all_voices(self) -> list[dict]: + """Get all available voices as dicts for API""" + return [ + { + "id": v.id, + "name": v.name, + "provider": v.provider.value, + "description": v.description, + "gender": v.gender, + } + for v in ALL_VOICES + ] + + def get_voices_by_provider(self, provider: TTSProvider) -> list[Voice]: + """Get all voices for a specific provider""" + return [v for v in ALL_VOICES if v.provider == provider] + + def get_caller_voice_assignments(self) -> dict[str, str]: + """Get current caller voice assignments""" + return self.caller_voices.copy() + + def set_caller_voice_assignments(self, assignments: dict[str, str]): + """Set multiple caller voice assignments""" + for caller_key, voice_id in assignments.items(): + if voice_id in VOICES_BY_ID: + self.caller_voices[caller_key] = voice_id + + +# Global instance +voice_manager = VoiceManager() diff --git a/download_sounds.py b/download_sounds.py new file mode 100644 index 0000000..88f60e8 --- /dev/null +++ b/download_sounds.py @@ -0,0 +1,109 @@ +#!/usr/bin/env 
python3 +""" +Download free sound effects for the radio show soundboard. +Uses sounds from freesound.org and other free sources. +""" + +import os +import urllib.request +import ssl +from pathlib import Path + +# Bypass SSL issues +ssl._create_default_https_context = ssl._create_unverified_context + +SOUNDS_DIR = Path(__file__).parent / "sounds" +SOUNDS_DIR.mkdir(exist_ok=True) + +# Free sound effect URLs (public domain / CC0) +# These are from various free sources +SOUND_URLS = { + # Using pixabay free sounds (no attribution required) + 'rimshot.wav': 'https://cdn.pixabay.com/audio/2022/03/15/audio_7a569d6dde.mp3', + 'laugh.wav': 'https://cdn.pixabay.com/audio/2024/02/14/audio_70fa4b1f7c.mp3', + 'sad_trombone.wav': 'https://cdn.pixabay.com/audio/2022/03/15/audio_cce0f1f0f1.mp3', + 'cheer.wav': 'https://cdn.pixabay.com/audio/2021/08/04/audio_0625c1539c.mp3', + 'boo.wav': 'https://cdn.pixabay.com/audio/2022/10/30/audio_f2a4d3d7db.mp3', + 'drumroll.wav': 'https://cdn.pixabay.com/audio/2022/03/24/audio_52a6ef9129.mp3', + 'crickets.wav': 'https://cdn.pixabay.com/audio/2022/03/09/audio_691875e05c.mp3', + 'phone_ring.wav': 'https://cdn.pixabay.com/audio/2022/03/15/audio_0f66b49312.mp3', +} + +def download_sound(name, url): + """Download a sound file""" + output_path = SOUNDS_DIR / name + + if output_path.exists(): + print(f" āœ“ {name} (already exists)") + return True + + try: + print(f" Downloading {name}...") + + # Download the file + req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'}) + with urllib.request.urlopen(req, timeout=30) as response: + data = response.read() + + # If it's an MP3, we need to convert it + if url.endswith('.mp3'): + temp_mp3 = SOUNDS_DIR / f"temp_{name}.mp3" + with open(temp_mp3, 'wb') as f: + f.write(data) + + # Try to convert with ffmpeg + import subprocess + result = subprocess.run([ + 'ffmpeg', '-y', '-i', str(temp_mp3), + '-ar', '24000', '-ac', '1', + str(output_path) + ], capture_output=True) + + temp_mp3.unlink() # 
Remove temp file + + if result.returncode == 0: + print(f" āœ“ {name}") + return True + else: + print(f" āœ— {name} (ffmpeg conversion failed)") + return False + else: + with open(output_path, 'wb') as f: + f.write(data) + print(f" āœ“ {name}") + return True + + except Exception as e: + print(f" āœ— {name} ({e})") + return False + +def main(): + print("Downloading sound effects for radio show soundboard...") + print(f"Saving to: {SOUNDS_DIR}\n") + + # Check for ffmpeg + import subprocess + try: + subprocess.run(['ffmpeg', '-version'], capture_output=True, check=True) + except: + print("WARNING: ffmpeg not found. Install it with: brew install ffmpeg") + print("Some sounds may not download correctly.\n") + + success = 0 + for name, url in SOUND_URLS.items(): + if download_sound(name, url): + success += 1 + + print(f"\nDownloaded {success}/{len(SOUND_URLS)} sounds.") + print("\nTo add more sounds:") + print(" 1. Find free .wav files online") + print(" 2. Name them according to the SOUNDBOARD mapping in radio_show.py") + print(" 3. 
Place them in the sounds/ directory") + print("\nRecommended free sound sources:") + print(" - freesound.org") + print(" - pixabay.com/sound-effects") + print(" - zapsplat.com") + print(" - soundbible.com") + +if __name__ == "__main__": + main() diff --git a/frontend/css/style.css b/frontend/css/style.css new file mode 100644 index 0000000..c81e964 --- /dev/null +++ b/frontend/css/style.css @@ -0,0 +1,543 @@ +/* AI Radio Show - Clean CSS */ + +:root { + --bg: #1a1a2e; + --bg-light: #252547; + --accent: #e94560; + --text: #fff; + --text-muted: #888; + --radius: 8px; +} + +* { + box-sizing: border-box; + margin: 0; + padding: 0; +} + +body { + font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', sans-serif; + background: var(--bg); + color: var(--text); + min-height: 100vh; +} + +#app { + max-width: 900px; + margin: 0 auto; + padding: 20px; +} + +/* Header */ +header { + display: flex; + justify-content: space-between; + align-items: center; + margin-bottom: 20px; +} + +header h1 { + font-size: 1.5rem; +} + +.header-buttons { + display: flex; + gap: 8px; +} + +header button { + background: var(--bg-light); + color: var(--text); + border: none; + padding: 8px 16px; + border-radius: var(--radius); + cursor: pointer; +} + +.new-session-btn { + background: var(--accent) !important; +} + +.session-id { + font-size: 0.7rem; + color: var(--text-muted); + font-weight: normal; +} + +.caller-background { + font-size: 0.85rem; + color: var(--text-muted); + padding: 10px; + background: var(--bg); + border-radius: var(--radius); + margin-bottom: 12px; + line-height: 1.4; +} + +.caller-background.hidden { + display: none; +} + +/* Main layout */ +main { + display: grid; + grid-template-columns: 1fr 1fr; + gap: 20px; +} + +@media (max-width: 700px) { + main { + grid-template-columns: 1fr; + } +} + +/* Sections */ +section { + background: var(--bg-light); + padding: 16px; + border-radius: var(--radius); +} + +section h2 { + font-size: 1rem; + margin-bottom: 12px; + color: 
var(--text-muted); +} + +/* Callers */ +.caller-grid { + display: grid; + grid-template-columns: repeat(5, 1fr); + gap: 8px; + margin-bottom: 12px; +} + +.caller-btn { + background: var(--bg); + color: var(--text); + border: 2px solid transparent; + padding: 10px 8px; + border-radius: var(--radius); + cursor: pointer; + font-size: 0.85rem; + transition: all 0.2s; +} + +.caller-btn:hover { + border-color: var(--accent); +} + +.caller-btn.active { + background: var(--accent); + border-color: var(--accent); +} + +.call-status { + text-align: center; + padding: 8px; + color: var(--text-muted); + margin-bottom: 12px; +} + +.hangup-btn { + width: 100%; + background: #c0392b; + color: white; + border: none; + padding: 12px; + border-radius: var(--radius); + cursor: pointer; + font-weight: bold; +} + +.hangup-btn:disabled { + opacity: 0.5; + cursor: not-allowed; +} + +/* Chat */ +.chat-section { + grid-column: span 2; +} + +@media (max-width: 700px) { + .chat-section { + grid-column: span 1; + } +} + +.chat-log { + height: 300px; + overflow-y: auto; + background: var(--bg); + border-radius: var(--radius); + padding: 12px; + margin-bottom: 12px; +} + +.message { + padding: 8px 12px; + margin-bottom: 8px; + border-radius: var(--radius); + line-height: 1.4; +} + +.message.host { + background: #2c5282; +} + +.message.caller { + background: #553c9a; +} + +.message strong { + display: block; + font-size: 0.8rem; + opacity: 0.7; + margin-bottom: 4px; +} + +.talk-controls { + display: flex; + gap: 10px; +} + +.talk-btn { + flex: 1; + background: var(--accent); + color: white; + border: none; + padding: 16px; + border-radius: var(--radius); + font-size: 1rem; + font-weight: bold; + cursor: pointer; + transition: all 0.2s; +} + +.talk-btn:hover { + filter: brightness(1.1); +} + +.talk-btn.recording { + background: #c0392b; + animation: pulse 1s infinite; +} + +@keyframes pulse { + 0%, 100% { opacity: 1; } + 50% { opacity: 0.7; } +} + +.type-btn { + background: var(--bg); + color: 
var(--text); + border: none; + padding: 16px 24px; + border-radius: var(--radius); + cursor: pointer; +} + +.status { + text-align: center; + padding: 12px; + color: var(--accent); + font-weight: bold; +} + +.status.hidden { + display: none; +} + +/* Music */ +.music-section select { + width: 100%; + padding: 10px; + background: var(--bg); + color: var(--text); + border: none; + border-radius: var(--radius); + margin-bottom: 10px; +} + +.music-controls { + display: flex; + gap: 8px; + align-items: center; +} + +.music-controls button { + background: var(--bg); + color: var(--text); + border: none; + padding: 10px 16px; + border-radius: var(--radius); + cursor: pointer; +} + +.music-controls input[type="range"] { + flex: 1; +} + +/* Soundboard */ +.soundboard { + display: grid; + grid-template-columns: repeat(3, 1fr); + gap: 8px; +} + +.sound-btn { + background: var(--bg); + color: var(--text); + border: none; + padding: 12px 8px; + border-radius: var(--radius); + cursor: pointer; + font-size: 0.8rem; + transition: all 0.1s; +} + +.sound-btn:hover { + background: var(--accent); +} + +.sound-btn:active { + transform: scale(0.95); +} + +/* Modal */ +.modal { + position: fixed; + inset: 0; + background: rgba(0, 0, 0, 0.8); + display: flex; + align-items: center; + justify-content: center; + z-index: 100; +} + +.modal.hidden { + display: none; +} + +.modal-content { + background: var(--bg-light); + padding: 24px; + border-radius: var(--radius); + width: 90%; + max-width: 400px; +} + +.modal-content h2 { + margin-bottom: 16px; +} + +.modal-content h3 { + font-size: 0.9rem; + color: var(--text-muted); + margin: 16px 0 8px 0; + border-bottom: 1px solid var(--bg); + padding-bottom: 4px; +} + +.settings-group { + margin-bottom: 16px; +} + +.device-row { + display: flex; + gap: 8px; + align-items: flex-end; +} + +.device-row label:first-child { + flex: 1; +} + +.channel-row { + display: flex; + gap: 12px; + margin-top: 8px; +} + +.channel-row label { + display: flex; + 
align-items: center; + gap: 4px; + font-size: 0.85rem; +} + +.channel-input { + width: 50px !important; + text-align: center; +} + +.modal-content label { + display: block; + margin-bottom: 16px; +} + +.modal-content label.checkbox { + display: flex; + align-items: center; + gap: 8px; +} + +.modal-content select, +.modal-content input[type="text"], +.modal-content textarea { + width: 100%; + padding: 10px; + background: var(--bg); + color: var(--text); + border: none; + border-radius: var(--radius); + margin-top: 4px; +} + +.modal-buttons { + display: flex; + gap: 10px; + margin-top: 20px; +} + +.modal-buttons button { + flex: 1; + padding: 12px; + border: none; + border-radius: var(--radius); + cursor: pointer; + font-weight: bold; +} + +.modal-buttons button:first-child { + background: var(--accent); + color: white; +} + +.modal-buttons button:last-child { + background: var(--bg); + color: var(--text); +} + +.refresh-btn { + background: var(--bg); + color: var(--text-muted); + border: 1px solid var(--bg-light); + padding: 6px 12px; + border-radius: var(--radius); + cursor: pointer; + font-size: 0.85rem; + margin-top: 8px; +} + +.refresh-btn:hover { + background: var(--bg-light); + color: var(--text); +} + +.refresh-btn:disabled { + opacity: 0.5; + cursor: not-allowed; +} + +.hidden { + display: none !important; +} + +/* Server Log */ +.log-section { + grid-column: span 2; +} + +@media (max-width: 700px) { + .log-section { + grid-column: span 1; + } +} + +.log-header { + display: flex; + justify-content: space-between; + align-items: center; + margin-bottom: 12px; +} + +.log-header h2 { + margin-bottom: 0; +} + +.server-controls { + display: flex; + gap: 8px; + align-items: center; +} + +.server-btn { + border: none; + padding: 6px 12px; + border-radius: var(--radius); + cursor: pointer; + font-size: 0.85rem; + font-weight: bold; +} + +.server-btn.restart { + background: #2196F3; + color: white; +} + +.server-btn.restart:hover { + background: #1976D2; +} + 
+.server-btn.stop { + background: #c0392b; + color: white; +} + +.server-btn.stop:hover { + background: #a93226; +} + +.auto-scroll-label { + display: flex; + align-items: center; + gap: 4px; + font-size: 0.8rem; + color: var(--text-muted); + cursor: pointer; +} + +.server-log { + height: 200px; + overflow-y: auto; + background: #0d0d1a; + border-radius: var(--radius); + padding: 12px; + font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace; + font-size: 0.75rem; + line-height: 1.5; + color: #8f8; +} + +.server-log .log-line { + white-space: pre-wrap; + word-break: break-all; +} + +.server-log .log-line.error { + color: #f88; +} + +.server-log .log-line.warning { + color: #ff8; +} + +.server-log .log-line.tts { + color: #8ff; +} + +.server-log .log-line.chat { + color: #f8f; +} diff --git a/frontend/index.html b/frontend/index.html new file mode 100644 index 0000000..6050acb --- /dev/null +++ b/frontend/index.html @@ -0,0 +1,178 @@ + + + + + + AI Radio Show + + + +
+
+

AI Radio Show

+
+ + +
+
+ +
+ +
+

Callers

+
+
No active call
+ + +
+ + +
+
+
+ + +
+ +
+ + +
+

Music

+ +
+ + + +
+
+ + +
+

Sounds

+
+
+ + +
+
+

Server Log

+
+ + + +
+
+
+
+
+ + + + + + +
+ + + + diff --git a/frontend/js/app.js b/frontend/js/app.js new file mode 100644 index 0000000..8a581e0 --- /dev/null +++ b/frontend/js/app.js @@ -0,0 +1,782 @@ +/** + * AI Radio Show - Control Panel (Server-Side Audio) + */ + +// --- State --- +let currentCaller = null; +let isProcessing = false; +let isRecording = false; +let phoneFilter = false; +let autoScroll = true; +let logPollInterval = null; +let lastLogCount = 0; + +// Track lists +let tracks = []; +let sounds = []; + + +// --- Init --- +document.addEventListener('DOMContentLoaded', async () => { + console.log('AI Radio Show initializing...'); + try { + await loadAudioDevices(); + await loadCallers(); + await loadMusic(); + await loadSounds(); + await loadSettings(); + initEventListeners(); + log('Ready. Configure audio devices in Settings, then click a caller to start.'); + console.log('AI Radio Show ready'); + } catch (err) { + console.error('Init error:', err); + log('Error loading: ' + err.message); + } +}); + + +function initEventListeners() { + // Hangup + document.getElementById('hangup-btn')?.addEventListener('click', hangup); + + // New Session + document.getElementById('new-session-btn')?.addEventListener('click', newSession); + + // Server controls + document.getElementById('restart-server-btn')?.addEventListener('click', restartServer); + document.getElementById('stop-server-btn')?.addEventListener('click', stopServer); + document.getElementById('auto-scroll')?.addEventListener('change', e => { + autoScroll = e.target.checked; + }); + + // Start log polling + startLogPolling(); + + // Talk button - now triggers server-side recording + const talkBtn = document.getElementById('talk-btn'); + if (talkBtn) { + talkBtn.addEventListener('mousedown', startRecording); + talkBtn.addEventListener('mouseup', stopRecording); + talkBtn.addEventListener('mouseleave', () => { if (isRecording) stopRecording(); }); + talkBtn.addEventListener('touchstart', e => { e.preventDefault(); startRecording(); }); + 
talkBtn.addEventListener('touchend', e => { e.preventDefault(); stopRecording(); }); + } + + // Type button + document.getElementById('type-btn')?.addEventListener('click', () => { + document.getElementById('type-modal')?.classList.remove('hidden'); + document.getElementById('type-input')?.focus(); + }); + document.getElementById('send-type')?.addEventListener('click', sendTypedMessage); + document.getElementById('close-type')?.addEventListener('click', () => { + document.getElementById('type-modal')?.classList.add('hidden'); + }); + document.getElementById('type-input')?.addEventListener('keydown', e => { + if (e.key === 'Enter' && !e.shiftKey) { + e.preventDefault(); + sendTypedMessage(); + } + }); + + // Music - now server-side + document.getElementById('play-btn')?.addEventListener('click', playMusic); + document.getElementById('stop-btn')?.addEventListener('click', stopMusic); + document.getElementById('volume')?.addEventListener('input', setMusicVolume); + + // Settings + document.getElementById('settings-btn')?.addEventListener('click', async () => { + document.getElementById('settings-modal')?.classList.remove('hidden'); + await loadSettings(); // Reload settings when modal opens + }); + document.getElementById('close-settings')?.addEventListener('click', () => { + document.getElementById('settings-modal')?.classList.add('hidden'); + }); + document.getElementById('save-settings')?.addEventListener('click', saveSettings); + document.getElementById('provider')?.addEventListener('change', updateProviderUI); + document.getElementById('phone-filter')?.addEventListener('change', e => { + phoneFilter = e.target.checked; + }); + document.getElementById('refresh-ollama')?.addEventListener('click', refreshOllamaModels); +} + + +async function refreshOllamaModels() { + const btn = document.getElementById('refresh-ollama'); + const select = document.getElementById('ollama-model'); + if (!select) return; + + btn.textContent = 'Loading...'; + btn.disabled = true; + + try 
{ + const res = await fetch('/api/settings'); + const data = await res.json(); + + select.innerHTML = ''; + const models = data.available_ollama_models || []; + + if (models.length === 0) { + const option = document.createElement('option'); + option.value = ''; + option.textContent = '(No models found)'; + select.appendChild(option); + } else { + models.forEach(model => { + const option = document.createElement('option'); + option.value = model; + option.textContent = model; + select.appendChild(option); + }); + } + } catch (err) { + console.error('Failed to refresh Ollama models:', err); + } + + btn.textContent = 'Refresh Models'; + btn.disabled = false; +} + + +// --- Audio Devices --- +async function loadAudioDevices() { + try { + const res = await fetch('/api/audio/devices'); + const data = await res.json(); + + const inputSelect = document.getElementById('input-device'); + const outputSelect = document.getElementById('output-device'); + + if (!inputSelect || !outputSelect) return; + + // Clear selects + inputSelect.innerHTML = ''; + outputSelect.innerHTML = ''; + + data.devices.forEach(device => { + // Input devices + if (device.inputs > 0) { + const opt = document.createElement('option'); + opt.value = device.id; + opt.textContent = `${device.name} (${device.inputs} ch)`; + inputSelect.appendChild(opt); + } + // Output devices + if (device.outputs > 0) { + const opt = document.createElement('option'); + opt.value = device.id; + opt.textContent = `${device.name} (${device.outputs} ch)`; + outputSelect.appendChild(opt); + } + }); + + // Load current settings + const settingsRes = await fetch('/api/audio/settings'); + const settings = await settingsRes.json(); + + if (settings.input_device !== null) + inputSelect.value = settings.input_device; + if (settings.output_device !== null) + outputSelect.value = settings.output_device; + + // Channel settings + const inputCh = document.getElementById('input-channel'); + const callerCh = 
document.getElementById('caller-channel'); + const musicCh = document.getElementById('music-channel'); + const sfxCh = document.getElementById('sfx-channel'); + + if (inputCh) inputCh.value = settings.input_channel || 1; + if (callerCh) callerCh.value = settings.caller_channel || 1; + if (musicCh) musicCh.value = settings.music_channel || 2; + if (sfxCh) sfxCh.value = settings.sfx_channel || 3; + + // Phone filter setting + const phoneFilterEl = document.getElementById('phone-filter'); + if (phoneFilterEl) { + phoneFilterEl.checked = settings.phone_filter ?? false; + phoneFilter = phoneFilterEl.checked; + } + + console.log('Audio devices loaded'); + } catch (err) { + console.error('loadAudioDevices error:', err); + } +} + + +async function saveAudioDevices() { + const inputDevice = document.getElementById('input-device')?.value; + const outputDevice = document.getElementById('output-device')?.value; + const inputChannel = document.getElementById('input-channel')?.value; + const callerChannel = document.getElementById('caller-channel')?.value; + const musicChannel = document.getElementById('music-channel')?.value; + const sfxChannel = document.getElementById('sfx-channel')?.value; + const phoneFilterChecked = document.getElementById('phone-filter')?.checked ?? false; + + await fetch('/api/audio/settings', { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ + input_device: inputDevice ? parseInt(inputDevice) : null, + input_channel: inputChannel ? parseInt(inputChannel) : 1, + output_device: outputDevice ? parseInt(outputDevice) : null, + caller_channel: callerChannel ? parseInt(callerChannel) : 1, + music_channel: musicChannel ? parseInt(musicChannel) : 2, + sfx_channel: sfxChannel ? 
parseInt(sfxChannel) : 3, + phone_filter: phoneFilterChecked + }) + }); + + // Update local state + phoneFilter = phoneFilterChecked; + + log('Audio routing saved'); +} + + +// --- Callers --- +async function loadCallers() { + try { + const res = await fetch('/api/callers'); + const data = await res.json(); + + const grid = document.getElementById('callers'); + if (!grid) return; + grid.innerHTML = ''; + + data.callers.forEach(caller => { + const btn = document.createElement('button'); + btn.className = 'caller-btn'; + btn.textContent = caller.name; + btn.dataset.key = caller.key; + btn.addEventListener('click', () => startCall(caller.key, caller.name)); + grid.appendChild(btn); + }); + + // Show session ID + const sessionEl = document.getElementById('session-id'); + if (sessionEl && data.session_id) { + sessionEl.textContent = `(${data.session_id})`; + } + + console.log('Loaded', data.callers.length, 'callers, session:', data.session_id); + } catch (err) { + console.error('loadCallers error:', err); + } +} + + +async function startCall(key, name) { + if (isProcessing) return; + + const res = await fetch(`/api/call/${key}`, { method: 'POST' }); + const data = await res.json(); + + currentCaller = { key, name }; + + document.getElementById('call-status').textContent = `On call: ${name}`; + document.getElementById('hangup-btn').disabled = false; + + // Show caller background + const bgEl = document.getElementById('caller-background'); + if (bgEl && data.background) { + bgEl.textContent = data.background; + bgEl.classList.remove('hidden'); + } + + document.querySelectorAll('.caller-btn').forEach(btn => { + btn.classList.toggle('active', btn.dataset.key === key); + }); + + log(`Connected to ${name}`); + clearChat(); +} + + +async function newSession() { + // Hangup if on a call + if (currentCaller) { + await hangup(); + } + + await fetch('/api/session/reset', { method: 'POST' }); + + // Hide caller background + const bgEl = document.getElementById('caller-background'); 
+ if (bgEl) bgEl.classList.add('hidden'); + + // Reload callers to get new session ID + await loadCallers(); + + log('New session started - all callers have fresh backgrounds'); +} + + +async function hangup() { + if (!currentCaller) return; + + // Stop any playing TTS + await fetch('/api/tts/stop', { method: 'POST' }); + await fetch('/api/hangup', { method: 'POST' }); + + log(`Hung up on ${currentCaller.name}`); + + currentCaller = null; + isProcessing = false; + hideStatus(); + + document.getElementById('call-status').textContent = 'No active call'; + document.getElementById('hangup-btn').disabled = true; + document.querySelectorAll('.caller-btn').forEach(btn => btn.classList.remove('active')); + + // Hide caller background + const bgEl = document.getElementById('caller-background'); + if (bgEl) bgEl.classList.add('hidden'); +} + + +// --- Server-Side Recording --- +async function startRecording() { + if (!currentCaller || isProcessing) return; + + try { + const res = await fetch('/api/record/start', { method: 'POST' }); + if (!res.ok) { + const err = await res.json(); + log('Record error: ' + (err.detail || 'Failed to start')); + return; + } + + isRecording = true; + document.getElementById('talk-btn').classList.add('recording'); + document.getElementById('talk-btn').textContent = 'Recording...'; + + } catch (err) { + log('Record error: ' + err.message); + } +} + + +async function stopRecording() { + if (!isRecording) return; + + document.getElementById('talk-btn').classList.remove('recording'); + document.getElementById('talk-btn').textContent = 'Hold to Talk'; + + isRecording = false; + isProcessing = true; + showStatus('Processing...'); + + try { + // Stop recording and get transcription + const res = await fetch('/api/record/stop', { method: 'POST' }); + const data = await res.json(); + + if (!data.text) { + log('(No speech detected)'); + isProcessing = false; + hideStatus(); + return; + } + + addMessage('You', data.text); + + // Chat + 
showStatus(`${currentCaller.name} is thinking...`); + + const chatRes = await fetch('/api/chat', { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ text: data.text }) + }); + const chatData = await chatRes.json(); + + addMessage(chatData.caller, chatData.text); + + // TTS (plays on server) - only if we have text + if (chatData.text && chatData.text.trim()) { + showStatus(`${currentCaller.name} is speaking...`); + + await fetch('/api/tts', { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ + text: chatData.text, + voice_id: chatData.voice_id, + phone_filter: phoneFilter + }) + }); + } + + } catch (err) { + log('Error: ' + err.message); + } + + isProcessing = false; + hideStatus(); +} + + +async function sendTypedMessage() { + const input = document.getElementById('type-input'); + const text = input.value.trim(); + if (!text || !currentCaller || isProcessing) return; + + input.value = ''; + document.getElementById('type-modal').classList.add('hidden'); + + isProcessing = true; + addMessage('You', text); + + try { + showStatus(`${currentCaller.name} is thinking...`); + + const chatRes = await fetch('/api/chat', { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ text }) + }); + const chatData = await chatRes.json(); + + addMessage(chatData.caller, chatData.text); + + // TTS (plays on server) - only if we have text + if (chatData.text && chatData.text.trim()) { + showStatus(`${currentCaller.name} is speaking...`); + + await fetch('/api/tts', { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ + text: chatData.text, + voice_id: chatData.voice_id, + phone_filter: phoneFilter + }) + }); + } + + } catch (err) { + log('Error: ' + err.message); + } + + isProcessing = false; + hideStatus(); +} + + +// --- Music (Server-Side) --- +async function loadMusic() { + try { + const res = await 
fetch('/api/music'); + const data = await res.json(); + tracks = data.tracks || []; + + const select = document.getElementById('track-select'); + if (!select) return; + select.innerHTML = ''; + + tracks.forEach((track, i) => { + const option = document.createElement('option'); + option.value = track.file; + option.textContent = track.name; + select.appendChild(option); + }); + console.log('Loaded', tracks.length, 'tracks'); + } catch (err) { + console.error('loadMusic error:', err); + } +} + + +async function playMusic() { + const select = document.getElementById('track-select'); + const track = select?.value; + if (!track) return; + + await fetch('/api/music/play', { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ track, action: 'play' }) + }); +} + + +async function stopMusic() { + await fetch('/api/music/stop', { method: 'POST' }); +} + + +async function setMusicVolume(e) { + const volume = e.target.value / 100; + await fetch('/api/music/volume', { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ track: '', action: 'volume', volume }) + }); +} + + +// --- Sound Effects (Server-Side) --- +async function loadSounds() { + try { + const res = await fetch('/api/sounds'); + const data = await res.json(); + sounds = data.sounds || []; + + const board = document.getElementById('soundboard'); + if (!board) return; + board.innerHTML = ''; + + sounds.forEach(sound => { + const btn = document.createElement('button'); + btn.className = 'sound-btn'; + btn.textContent = sound.name; + btn.addEventListener('click', () => playSFX(sound.file)); + board.appendChild(btn); + }); + console.log('Loaded', sounds.length, 'sounds'); + } catch (err) { + console.error('loadSounds error:', err); + } +} + + +async function playSFX(soundFile) { + await fetch('/api/sfx/play', { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ sound: soundFile }) + }); +} + + 
+// --- Settings --- +async function loadSettings() { + try { + const res = await fetch('/api/settings'); + const data = await res.json(); + + const providerEl = document.getElementById('provider'); + if (providerEl) providerEl.value = data.provider || 'openrouter'; + + const modelSelect = document.getElementById('openrouter-model'); + if (modelSelect) { + modelSelect.innerHTML = ''; + (data.available_openrouter_models || []).forEach(model => { + const option = document.createElement('option'); + option.value = model; + option.textContent = model; + if (model === data.openrouter_model) option.selected = true; + modelSelect.appendChild(option); + }); + } + + const ollamaModel = document.getElementById('ollama-model'); + const ollamaHost = document.getElementById('ollama-host'); + if (ollamaHost) ollamaHost.value = data.ollama_host || 'http://localhost:11434'; + + // Populate Ollama models dropdown + if (ollamaModel) { + ollamaModel.innerHTML = ''; + const ollamaModels = data.available_ollama_models || []; + console.log('Ollama models from API:', ollamaModels.length, ollamaModels); + if (ollamaModels.length === 0) { + const option = document.createElement('option'); + option.value = data.ollama_model || 'llama3.2'; + option.textContent = data.ollama_model || 'llama3.2'; + ollamaModel.appendChild(option); + } else { + ollamaModels.forEach(model => { + const option = document.createElement('option'); + option.value = model; + option.textContent = model; + if (model === data.ollama_model) option.selected = true; + ollamaModel.appendChild(option); + }); + } + console.log('Ollama dropdown options:', ollamaModel.options.length); + } else { + console.log('Ollama model element not found!'); + } + + // TTS provider + const ttsProvider = document.getElementById('tts-provider'); + if (ttsProvider) ttsProvider.value = data.tts_provider || 'elevenlabs'; + + updateProviderUI(); + console.log('Settings loaded:', data.provider, 'TTS:', data.tts_provider); + } catch (err) { + 
console.error('loadSettings error:', err); + } +} + + +function updateProviderUI() { + const isOpenRouter = document.getElementById('provider')?.value === 'openrouter'; + document.getElementById('openrouter-settings')?.classList.toggle('hidden', !isOpenRouter); + document.getElementById('ollama-settings')?.classList.toggle('hidden', isOpenRouter); +} + + +async function saveSettings() { + // Save audio devices + await saveAudioDevices(); + + // Save LLM and TTS settings + await fetch('/api/settings', { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ + provider: document.getElementById('provider')?.value, + openrouter_model: document.getElementById('openrouter-model')?.value, + ollama_model: document.getElementById('ollama-model')?.value, + ollama_host: document.getElementById('ollama-host')?.value, + tts_provider: document.getElementById('tts-provider')?.value + }) + }); + + document.getElementById('settings-modal')?.classList.add('hidden'); + log('Settings saved'); +} + + +// --- UI Helpers --- +function addMessage(sender, text) { + const chat = document.getElementById('chat'); + if (!chat) { + console.log(`[${sender}]: ${text}`); + return; + } + const div = document.createElement('div'); + div.className = `message ${sender === 'You' ? 
'host' : 'caller'}`; + div.innerHTML = `${sender}: ${text}`; + chat.appendChild(div); + chat.scrollTop = chat.scrollHeight; +} + + +function clearChat() { + const chat = document.getElementById('chat'); + if (chat) chat.innerHTML = ''; +} + + +function log(text) { + addMessage('System', text); +} + + +function showStatus(text) { + const status = document.getElementById('status'); + if (status) { + status.textContent = text; + status.classList.remove('hidden'); + } +} + + +function hideStatus() { + const status = document.getElementById('status'); + if (status) status.classList.add('hidden'); +} + + +// --- Server Control & Logging --- + +function startLogPolling() { + // Poll for logs every second + logPollInterval = setInterval(fetchLogs, 1000); + // Initial fetch + fetchLogs(); +} + + +async function fetchLogs() { + try { + const res = await fetch('/api/logs?lines=200'); + const data = await res.json(); + + const logEl = document.getElementById('server-log'); + if (!logEl) return; + + // Only update if we have new logs + if (data.logs.length !== lastLogCount) { + lastLogCount = data.logs.length; + + logEl.innerHTML = data.logs.map(line => { + let className = 'log-line'; + if (line.includes('Error') || line.includes('error') || line.includes('ERROR')) { + className += ' error'; + } else if (line.includes('Warning') || line.includes('WARNING')) { + className += ' warning'; + } else if (line.includes('[TTS]')) { + className += ' tts'; + } else if (line.includes('[Chat]')) { + className += ' chat'; + } + return `
${escapeHtml(line)}
`; + }).join(''); + + if (autoScroll) { + logEl.scrollTop = logEl.scrollHeight; + } + } + } catch (err) { + // Server might be down, that's ok + console.log('Log fetch failed (server may be restarting)'); + } +} + + +function escapeHtml(text) { + const div = document.createElement('div'); + div.textContent = text; + return div.innerHTML; +} + + +async function restartServer() { + if (!confirm('Restart the server? This will briefly disconnect you.')) return; + + try { + await fetch('/api/server/restart', { method: 'POST' }); + log('Server restart requested...'); + + // Clear the log and wait for server to come back + document.getElementById('server-log').innerHTML = '
Restarting server...
'; + + // Poll until server is back + let attempts = 0; + const checkServer = setInterval(async () => { + attempts++; + try { + const res = await fetch('/api/server/status'); + if (res.ok) { + clearInterval(checkServer); + log('Server restarted successfully'); + await loadSettings(); + } + } catch (e) { + if (attempts > 30) { + clearInterval(checkServer); + log('Server did not restart - check terminal'); + } + } + }, 1000); + + } catch (err) { + log('Failed to restart server: ' + err.message); + } +} + + +async function stopServer() { + if (!confirm('Stop the server? You will need to restart it manually.')) return; + + try { + await fetch('/api/server/stop', { method: 'POST' }); + log('Server stop requested...'); + document.getElementById('server-log').innerHTML = '
Server stopped. Run ./run.sh to restart.
'; + } catch (err) { + log('Failed to stop server: ' + err.message); + } +} diff --git a/generate_callers.py b/generate_callers.py new file mode 100644 index 0000000..335a1e0 --- /dev/null +++ b/generate_callers.py @@ -0,0 +1,77 @@ +import os +os.environ["SUNO_USE_SMALL_MODELS"] = "False" + +from bark import generate_audio, preload_models +from scipy.io.wavfile import write as write_wav +from scipy.signal import butter, filtfilt +import numpy as np + +def phone_filter(audio, sample_rate=24000): + """Apply telephone bandpass filter (300Hz - 3400Hz)""" + low = 300 / (sample_rate / 2) + high = 3400 / (sample_rate / 2) + b, a = butter(4, [low, high], btype='band') + filtered = filtfilt(b, a, audio) + + # Add slight compression and normalize + filtered = np.tanh(filtered * 1.5) * 0.9 + return filtered.astype(np.float32) + +# Define your callers +CALLERS = [ + { + "name": "caller1_mike", + "voice": "v2/en_speaker_6", + "text": """Hey, thanks for taking my call! + So I've been thinking about this a lot and... + I know it sounds crazy, but hear me out.""" + }, + { + "name": "caller2_sarah", + "voice": "v2/en_speaker_9", + "text": """Hi! Oh my gosh, I can't believe I got through. + Okay so... this is kind of a long story, + but basically I had this experience last week that blew my mind.""" + }, + { + "name": "caller3_dave", + "voice": "v2/en_speaker_1", + "text": """Yeah, hey. First time caller, long time listener. + Look, I gotta be honest with you here, + I think you're missing something important.""" + }, + { + "name": "caller4_jenny", + "voice": "v2/en_speaker_3", + "text": """Okay okay, so get this... + I was literally just talking about this with my friend yesterday! 
+ And she said, and I quote, well, I can't say that on air.""" + }, +] + +def main(): + print("Loading models...") + preload_models() + + os.makedirs("output", exist_ok=True) + + for caller in CALLERS: + print(f"\nGenerating: {caller['name']}") + + # Generate raw audio + audio = generate_audio(caller["text"], history_prompt=caller["voice"]) + + # Save clean version + write_wav(f"output/{caller['name']}_clean.wav", 24000, audio) + + # Apply phone filter and save + phone_audio = phone_filter(audio) + write_wav(f"output/{caller['name']}_phone.wav", 24000, phone_audio) + + print(f" Saved: output/{caller['name']}_clean.wav") + print(f" Saved: output/{caller['name']}_phone.wav") + + print("\nDone! Check the output/ folder.") + +if __name__ == "__main__": + main() diff --git a/generate_sounds.py b/generate_sounds.py new file mode 100644 index 0000000..946155b --- /dev/null +++ b/generate_sounds.py @@ -0,0 +1,102 @@ +#!/usr/bin/env python3 +""" +Generate sound effects using ElevenLabs Sound Effects API +""" + +import os +from pathlib import Path +from dotenv import load_dotenv + +load_dotenv() + +SOUNDS_DIR = Path(__file__).parent / "sounds" +SOUNDS_DIR.mkdir(exist_ok=True) + +# Sound effects to generate with descriptions +SOUND_EFFECTS = { + 'airhorn.wav': 'loud air horn blast, sports event', + 'boo.wav': 'crowd booing, disappointed audience', + 'crickets.wav': 'crickets chirping, awkward silence', + 'drumroll.wav': 'drum roll, building suspense', + 'buzzer.wav': 'game show wrong answer buzzer', + 'laugh.wav': 'audience laughing, sitcom laugh track', + 'rimshot.wav': 'ba dum tss, drum rimshot comedy', + 'sad_trombone.wav': 'sad trombone, wah wah wah failure sound', + 'phone_ring.wav': 'old telephone ringing', + 'cheer.wav': 'crowd cheering and applauding', + 'scratch.wav': 'vinyl record scratch', + 'wow.wav': 'crowd saying wow, impressed reaction', + 'fart.wav': 'comedic fart sound effect', + 'victory.wav': 'victory fanfare, triumphant horns', + 'uh_oh.wav': 'uh oh, 
something went wrong sound', +} + +def generate_sound(name, description): + """Generate a sound effect using ElevenLabs""" + from elevenlabs.client import ElevenLabs + import soundfile as sf + import numpy as np + + output_path = SOUNDS_DIR / name + + if output_path.exists(): + print(f" āœ“ {name} (already exists)") + return True + + try: + print(f" Generating {name}: '{description}'...") + + client = ElevenLabs(api_key=os.getenv('ELEVENLABS_API_KEY')) + + # Generate sound effect + result = client.text_to_sound_effects.convert( + text=description, + duration_seconds=2.0, + ) + + # Collect audio data + audio_data = b''.join(result) + + # Save as mp3 first, then convert + temp_mp3 = SOUNDS_DIR / f"temp_{name}.mp3" + with open(temp_mp3, 'wb') as f: + f.write(audio_data) + + # Convert to wav with ffmpeg + import subprocess + subprocess.run([ + 'ffmpeg', '-y', '-i', str(temp_mp3), + '-ar', '24000', '-ac', '1', + str(output_path) + ], capture_output=True, check=True) + + temp_mp3.unlink() + print(f" āœ“ {name}") + return True + + except Exception as e: + print(f" āœ— {name} ({e})") + return False + +def main(): + print("Generating sound effects with ElevenLabs...") + print(f"Saving to: {SOUNDS_DIR}") + print("(This uses your ElevenLabs credits)\n") + + # Check for ffmpeg + import subprocess + try: + subprocess.run(['ffmpeg', '-version'], capture_output=True, check=True) + except: + print("ERROR: ffmpeg required. Install with: brew install ffmpeg") + return + + success = 0 + for name, description in SOUND_EFFECTS.items(): + if generate_sound(name, description): + success += 1 + + print(f"\nGenerated {success}/{len(SOUND_EFFECTS)} sounds.") + +if __name__ == "__main__": + main() diff --git a/publish_episode.py b/publish_episode.py new file mode 100755 index 0000000..4dae7b3 --- /dev/null +++ b/publish_episode.py @@ -0,0 +1,400 @@ +#!/usr/bin/env python3 +""" +Podcast Episode Publisher +Transcribes audio, generates metadata, and publishes to Castopod. 
+ +Usage: + python publish_episode.py /path/to/episode.mp3 + python publish_episode.py /path/to/episode.mp3 --episode-number 3 + python publish_episode.py /path/to/episode.mp3 --dry-run +""" + +import argparse +import json +import os +import re +import subprocess +import sys +import base64 +from pathlib import Path + +import requests +from dotenv import load_dotenv + +# Load environment variables +load_dotenv(Path(__file__).parent / ".env") + +# Configuration +CASTOPOD_URL = "https://podcast.macneilmediagroup.com" +CASTOPOD_USERNAME = "admin" +CASTOPOD_PASSWORD = "podcast2026api" +PODCAST_ID = 1 +PODCAST_HANDLE = "LukeAtTheRoost" +OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY") +WHISPER_MODEL = "base" # Options: tiny, base, small, medium, large + +# NAS Configuration for chapters upload +NAS_HOST = "mmgnas-10g" +NAS_USER = "luke" +NAS_SSH_PORT = 8001 +DOCKER_PATH = "/share/CACHEDEV1_DATA/.qpkg/container-station/bin/docker" +CASTOPOD_CONTAINER = "castopod-castopod-1" +MARIADB_CONTAINER = "castopod-mariadb-1" +DB_USER = "castopod" +DB_PASS = "BYtbFfk3ndeVabb26xb0UyKU" +DB_NAME = "castopod" + + +def get_auth_header(): + """Get Basic Auth header for Castopod API.""" + credentials = base64.b64encode( + f"{CASTOPOD_USERNAME}:{CASTOPOD_PASSWORD}".encode() + ).decode() + return {"Authorization": f"Basic {credentials}"} + + +def transcribe_audio(audio_path: str) -> dict: + """Transcribe audio using faster-whisper with timestamps.""" + print(f"[1/5] Transcribing {audio_path}...") + + try: + from faster_whisper import WhisperModel + except ImportError: + print("Error: faster-whisper not installed. 
Run: pip install faster-whisper") + sys.exit(1) + + model = WhisperModel(WHISPER_MODEL, compute_type="int8") + segments, info = model.transcribe(audio_path, word_timestamps=True) + + transcript_segments = [] + full_text = [] + + for segment in segments: + transcript_segments.append({ + "start": segment.start, + "end": segment.end, + "text": segment.text.strip() + }) + full_text.append(segment.text.strip()) + + print(f" Transcribed {info.duration:.1f} seconds of audio") + + return { + "segments": transcript_segments, + "full_text": " ".join(full_text), + "duration": int(info.duration) + } + + +def generate_metadata(transcript: dict, episode_number: int) -> dict: + """Use LLM to generate title, description, and chapters from transcript.""" + print("[2/5] Generating metadata with LLM...") + + if not OPENROUTER_API_KEY: + print("Error: OPENROUTER_API_KEY not set in .env") + sys.exit(1) + + # Prepare transcript with timestamps for chapter detection + timestamped_text = "" + for seg in transcript["segments"]: + mins = int(seg["start"] // 60) + secs = int(seg["start"] % 60) + timestamped_text += f"[{mins:02d}:{secs:02d}] {seg['text']}\n" + + prompt = f"""Analyze this podcast transcript and generate metadata. + +TRANSCRIPT: +{timestamped_text} + +Generate a JSON response with: +1. "title": A catchy episode title (include "Episode {episode_number}:" prefix) +2. "description": A 2-4 sentence description summarizing the episode's content. Mention callers by name and their topics. End with something engaging. +3. "chapters": An array of chapter objects with "startTime" (in seconds) and "title". 
Include: + - "Intro" at 0 seconds + - A chapter for each caller/topic (use caller names if mentioned) + - "Outro" near the end + +Respond with ONLY valid JSON, no markdown or explanation.""" + + response = requests.post( + "https://openrouter.ai/api/v1/chat/completions", + headers={ + "Authorization": f"Bearer {OPENROUTER_API_KEY}", + "Content-Type": "application/json" + }, + json={ + "model": "anthropic/claude-3-haiku", + "messages": [{"role": "user", "content": prompt}], + "temperature": 0.7 + } + ) + + if response.status_code != 200: + print(f"Error from OpenRouter: {response.text}") + sys.exit(1) + + result = response.json() + content = result["choices"][0]["message"]["content"] + + # Parse JSON from response (handle markdown code blocks) + content = content.strip() + if content.startswith("```"): + content = re.sub(r"^```(?:json)?\n?", "", content) + content = re.sub(r"\n?```$", "", content) + + try: + metadata = json.loads(content) + except json.JSONDecodeError as e: + print(f"Error parsing LLM response: {e}") + print(f"Response was: {content}") + sys.exit(1) + + print(f" Title: {metadata['title']}") + print(f" Chapters: {len(metadata['chapters'])}") + + return metadata + + +def create_episode(audio_path: str, metadata: dict, duration: int) -> dict: + """Create episode on Castopod.""" + print("[3/5] Creating episode on Castopod...") + + headers = get_auth_header() + + # Upload audio and create episode + with open(audio_path, "rb") as f: + files = { + "audio_file": (Path(audio_path).name, f, "audio/mpeg") + } + data = { + "title": metadata["title"], + "description_markdown": metadata["description"], + "parental_advisory": "explicit", + "type": "full", + "created_by": "1" + } + + response = requests.post( + f"{CASTOPOD_URL}/api/rest/v1/podcasts/{PODCAST_ID}/episodes", + headers=headers, + files=files, + data=data + ) + + if response.status_code not in (200, 201): + print(f"Error creating episode: {response.text}") + sys.exit(1) + + episode = response.json() + 
print(f" Created episode ID: {episode['id']}") + print(f" Slug: {episode['slug']}") + + return episode + + +def publish_episode(episode_id: int) -> dict: + """Publish the episode.""" + print("[4/5] Publishing episode...") + + headers = get_auth_header() + + response = requests.post( + f"{CASTOPOD_URL}/api/rest/v1/episodes/{episode_id}/publish", + headers=headers, + data={ + "publication_method": "now", + "created_by": "1" + } + ) + + if response.status_code != 200: + print(f"Error publishing: {response.text}") + sys.exit(1) + + episode = response.json() + published_at = episode.get("published_at", {}) + if isinstance(published_at, dict): + print(f" Published at: {published_at.get('date', 'unknown')}") + else: + print(f" Published at: {published_at}") + + return episode + + +def save_chapters(metadata: dict, output_path: str): + """Save chapters to JSON file.""" + chapters_data = { + "version": "1.2.0", + "chapters": metadata["chapters"] + } + + with open(output_path, "w") as f: + json.dump(chapters_data, f, indent=2) + + print(f" Chapters saved to: {output_path}") + + +def run_ssh_command(command: str) -> tuple[bool, str]: + """Run a command on the NAS via SSH.""" + ssh_cmd = [ + "ssh", "-p", str(NAS_SSH_PORT), + f"{NAS_USER}@{NAS_HOST}", + command + ] + try: + result = subprocess.run(ssh_cmd, capture_output=True, text=True, timeout=30) + return result.returncode == 0, result.stdout.strip() or result.stderr.strip() + except subprocess.TimeoutExpired: + return False, "SSH command timed out" + except Exception as e: + return False, str(e) + + +def upload_chapters_to_castopod(episode_slug: str, episode_id: int, chapters_path: str) -> bool: + """Upload chapters file to Castopod via SSH and link in database.""" + print("[4.5/5] Uploading chapters to Castopod...") + + chapters_filename = f"{episode_slug}-chapters.json" + remote_path = f"podcasts/{PODCAST_HANDLE}/{chapters_filename}" + + # Read local chapters file + with open(chapters_path, "r") as f: + chapters_content = 
f.read() + + # Base64 encode for safe transfer + chapters_b64 = base64.b64encode(chapters_content.encode()).decode() + + # Upload file to container using base64 decode + upload_cmd = f'echo "{chapters_b64}" | base64 -d | {DOCKER_PATH} exec -i {CASTOPOD_CONTAINER} tee /var/www/castopod/public/media/{remote_path} > /dev/null' + success, output = run_ssh_command(upload_cmd) + if not success: + print(f" Warning: Failed to upload chapters file: {output}") + return False + + # Get file size + file_size = len(chapters_content) + + # Insert into media table + insert_sql = f"""INSERT INTO cp_media (file_key, file_size, file_mimetype, type, uploaded_by, updated_by, uploaded_at, updated_at) + VALUES ('{remote_path}', {file_size}, 'application/json', 'chapters', 1, 1, NOW(), NOW())""" + db_cmd = f'{DOCKER_PATH} exec {MARIADB_CONTAINER} mysql -u {DB_USER} -p{DB_PASS} {DB_NAME} -e "{insert_sql}; SELECT LAST_INSERT_ID();"' + success, output = run_ssh_command(db_cmd) + if not success: + print(f" Warning: Failed to insert chapters in database: {output}") + return False + + # Parse media ID from output + try: + lines = output.strip().split('\n') + media_id = int(lines[-1]) + except (ValueError, IndexError): + print(f" Warning: Could not parse media ID from: {output}") + return False + + # Link chapters to episode + update_sql = f"UPDATE cp_episodes SET chapters_id = {media_id} WHERE id = {episode_id}" + db_cmd = f'{DOCKER_PATH} exec {MARIADB_CONTAINER} mysql -u {DB_USER} -p{DB_PASS} {DB_NAME} -e "{update_sql}"' + success, output = run_ssh_command(db_cmd) + if not success: + print(f" Warning: Failed to link chapters to episode: {output}") + return False + + # Clear Castopod cache + cache_cmd = f'{DOCKER_PATH} exec {CASTOPOD_CONTAINER} php spark cache:clear' + run_ssh_command(cache_cmd) + + print(f" Chapters uploaded and linked (media_id: {media_id})") + return True + + +def get_next_episode_number() -> int: + """Get the next episode number from Castopod.""" + headers = 
get_auth_header() + + response = requests.get( + f"{CASTOPOD_URL}/api/rest/v1/podcasts/{PODCAST_ID}/episodes", + headers=headers + ) + + if response.status_code != 200: + return 1 + + episodes = response.json() + if not episodes: + return 1 + + max_num = max(ep.get("number", 0) for ep in episodes) + return max_num + 1 + + +def main(): + parser = argparse.ArgumentParser(description="Publish podcast episode to Castopod") + parser.add_argument("audio_file", help="Path to the audio file (MP3)") + parser.add_argument("--episode-number", "-n", type=int, help="Episode number (auto-detected if not provided)") + parser.add_argument("--dry-run", "-d", action="store_true", help="Generate metadata but don't publish") + parser.add_argument("--title", "-t", help="Override generated title") + parser.add_argument("--description", help="Override generated description") + args = parser.parse_args() + + audio_path = Path(args.audio_file).expanduser().resolve() + if not audio_path.exists(): + print(f"Error: Audio file not found: {audio_path}") + sys.exit(1) + + # Determine episode number + if args.episode_number: + episode_number = args.episode_number + else: + episode_number = get_next_episode_number() + print(f"Episode number: {episode_number}") + + # Step 1: Transcribe + transcript = transcribe_audio(str(audio_path)) + + # Step 2: Generate metadata + metadata = generate_metadata(transcript, episode_number) + + # Apply overrides + if args.title: + metadata["title"] = args.title + if args.description: + metadata["description"] = args.description + + # Save chapters file + chapters_path = audio_path.with_suffix(".chapters.json") + save_chapters(metadata, str(chapters_path)) + + if args.dry_run: + print("\n[DRY RUN] Would publish with:") + print(f" Title: {metadata['title']}") + print(f" Description: {metadata['description']}") + print(f" Chapters: {json.dumps(metadata['chapters'], indent=2)}") + print("\nChapters file saved. 
Run without --dry-run to publish.") + return + + # Step 3: Create episode + episode = create_episode(str(audio_path), metadata, transcript["duration"]) + + # Step 4: Publish + episode = publish_episode(episode["id"]) + + # Step 4.5: Upload chapters via SSH + chapters_uploaded = upload_chapters_to_castopod( + episode["slug"], + episode["id"], + str(chapters_path) + ) + + # Step 5: Summary + print("\n[5/5] Done!") + print("=" * 50) + print(f"Episode URL: {CASTOPOD_URL}/@{PODCAST_HANDLE}/episodes/{episode['slug']}") + print(f"RSS Feed: {CASTOPOD_URL}/@{PODCAST_HANDLE}/feed.xml") + print("=" * 50) + if not chapters_uploaded: + print("\nNote: Chapters upload failed. Add manually via Castopod admin UI") + print(f" Chapters file: {chapters_path}") + + +if __name__ == "__main__": + main() diff --git a/radio_show.py b/radio_show.py new file mode 100644 index 0000000..21f6e36 --- /dev/null +++ b/radio_show.py @@ -0,0 +1,1553 @@ +#!/usr/bin/env python3 +""" +AI Radio Show - Real-time podcast with AI callers + +COMMANDS: + 1-9, 0, -, = : Switch callers + rec : Record your voice (Enter to stop) + t : Type instead of recording + h : Hang up (cut off caller) + q : End show and save + +MUSIC CONTROL: + m : Toggle music on/off + n : Next track + f : Fade out (take a call) + g : Fade back in (after call) + d : Toggle auto-duck on/off + + / vol- : Volume up/down + +SOUNDBOARD: + a=airhorn c=crickets e=buzzer r=rimshot s=sad trombone y=cheer + +SHOW FEATURES: + b / bobby : Co-host Bobby chimes in + p / producer: Get AI producer suggestion + ad : Play commercial break + news : Breaking news interruption + stingers : Generate caller intro stingers + +Music auto-ducks during recording/playback. Use [f] to fade out completely +for a caller, then [g] to bring it back. Toggle [d] for full manual control. 
+""" + +import os +import sys +import re +import json +import random +import threading +from datetime import datetime +from pathlib import Path + +import numpy as np +import sounddevice as sd +import soundfile as sf +from faster_whisper import WhisperModel +from scipy.signal import butter, filtfilt +from dotenv import load_dotenv + +load_dotenv() + +SAMPLE_RATE = 24000 +CHANNELS = 1 + +# Soundboard - manual sound effects +SOUNDBOARD = { + 'a': 'airhorn.wav', + 'c': 'crickets.wav', + 'e': 'buzzer.wav', + 'r': 'rimshot.wav', + 's': 'sad_trombone.wav', + 'y': 'cheer.wav', +} + +# Automatic sound effects for show events +SHOW_SOUNDS = { + 'ring': 'phone_ring.wav', + 'hangup': 'hangup.wav', + 'hold': 'hold_music.wav', + 'news': 'news_stinger.wav', + 'commercial': 'commercial_jingle.wav', +} + +# Caller stingers - short audio/voice clips that play when caller comes on +# Format: caller_key -> stinger filename (or None to skip) +# Place files in sounds/ directory or generate them +CALLER_STINGERS = { + "1": "stinger_tony.wav", # "Big Tony's on the line!" 
+ "3": "stinger_rick.wav", # "Rick from Texas, yeehaw" + "5": "stinger_dennis.wav", # Slot machine sounds + "7": "stinger_earl.wav", # Country guitar riff + "=": "stinger_diane.wav", # Mysterious music +} +SOUNDS_DIR = Path(__file__).parent / "sounds" +MUSIC_DIR = Path(__file__).parent / "music" +MEMORY_FILE = Path(__file__).parent / "caller_memory.json" + + +class MusicPlayer: + """Background music player with ducking support""" + + def __init__(self, sample_rate=SAMPLE_RATE): + self.sample_rate = sample_rate + self.volume = 0.3 # Normal volume (0-1) + self.ducked_volume = 0.08 # Ducked volume + self.current_volume = 0.0 + self.target_volume = 0.0 + self.playing = False + self.stream = None + self.music_data = None + self.position = 0 + self.lock = threading.Lock() + self.fade_speed = 0.002 # Volume change per sample for smooth fades + self.tracks = [] + self.current_track_idx = 0 + self.music_audio = [] # For recording + self.auto_duck = True # Auto-duck during speech + self.faded_out = False # Manual fade state + + def load_tracks(self): + """Load all music files from music directory, shuffled""" + self.tracks = [] + if MUSIC_DIR.exists(): + for ext in ['*.wav', '*.mp3', '*.flac']: + self.tracks.extend(MUSIC_DIR.glob(ext)) + random.shuffle(self.tracks) + return len(self.tracks) + + def load_track(self, track_path): + """Load a single track""" + try: + import librosa + audio, sr = librosa.load(str(track_path), sr=self.sample_rate, mono=True) + self.music_data = audio.astype(np.float32) + self.position = 0 + return True + except Exception as e: + print(f" Error loading track: {e}") + return False + + def _audio_callback(self, outdata, frames, time_info, status): + """Stream callback - mixes music at current volume""" + with self.lock: + if self.music_data is None or not self.playing: + outdata.fill(0) + return + + # Get audio chunk + end_pos = self.position + frames + if end_pos > len(self.music_data): + # Loop the track + chunk = np.concatenate([ + 
self.music_data[self.position:], + self.music_data[:end_pos - len(self.music_data)] + ]) + self.position = end_pos - len(self.music_data) + else: + chunk = self.music_data[self.position:end_pos] + self.position = end_pos + + # Smooth volume fading + output = np.zeros(frames, dtype=np.float32) + for i in range(frames): + if self.current_volume < self.target_volume: + self.current_volume = min(self.current_volume + self.fade_speed, self.target_volume) + elif self.current_volume > self.target_volume: + self.current_volume = max(self.current_volume - self.fade_speed, self.target_volume) + output[i] = chunk[i] * self.current_volume if i < len(chunk) else 0 + + outdata[:, 0] = output + self.music_audio.append(output.copy()) + + def start(self): + """Start playing music""" + if not self.tracks: + if self.load_tracks() == 0: + print(" No music files found in music/") + return False + + if not self.tracks: + return False + + if not self.load_track(self.tracks[self.current_track_idx]): + return False + + self.playing = True + self.target_volume = self.volume + self.stream = sd.OutputStream( + samplerate=self.sample_rate, + channels=1, + callback=self._audio_callback, + blocksize=1024 + ) + self.stream.start() + return True + + def stop(self): + """Stop music""" + self.playing = False + self.target_volume = 0.0 + if self.stream: + self.stream.stop() + self.stream.close() + self.stream = None + + def duck(self): + """Lower volume for speech (auto-duck)""" + if self.auto_duck: + with self.lock: + self.target_volume = self.ducked_volume + + def unduck(self): + """Restore normal volume (auto-duck)""" + if self.auto_duck: + with self.lock: + self.target_volume = self.volume + + def fade_out(self): + """Manually fade music out completely""" + with self.lock: + self.target_volume = 0.0 + self.faded_out = True + + def fade_in(self): + """Manually fade music back in""" + with self.lock: + self.target_volume = self.volume + self.faded_out = False + + def toggle_auto_duck(self): + 
"""Toggle automatic ducking on/off""" + self.auto_duck = not self.auto_duck + return self.auto_duck + + def next_track(self): + """Skip to next track""" + if not self.tracks: + return + self.current_track_idx = (self.current_track_idx + 1) % len(self.tracks) + with self.lock: + self.load_track(self.tracks[self.current_track_idx]) + return self.tracks[self.current_track_idx].stem + + def set_volume(self, vol): + """Set normal volume level (0-1)""" + self.volume = max(0.0, min(1.0, vol)) + if self.playing and self.target_volume > self.ducked_volume: + self.target_volume = self.volume + + def get_track_name(self): + """Get current track name""" + if self.tracks: + return self.tracks[self.current_track_idx].stem + return None + + +# ElevenLabs v3 audio tag instructions for prompts +EMOTE_INSTRUCTIONS = """ +RESPONSE LENGTH - THIS IS CRITICAL: +Keep responses SHORT. This is quick back-and-forth radio banter, not monologues. +- Most responses: 1 sentence (5-15 words) +- Sometimes: 2 sentences if you have something to say +- Rarely: 3 sentences max, only if really going off +- NEVER more than 3 sentences +Think quick jabs, reactions, comebacks - not paragraphs. + +Audio tags for emotion (use sparingly): +- [laughing] [chuckles] [giggling] - when funny +- [sighs] - exasperated +- [groaning] - annoyed +- [clears throat] - making a point + +Example good length: "Oh man, [laughing] that's the dumbest thing I've heard all day." +Example good length: "Yeah, no, I don't think so." +Example good length: "[sighs] Look, here's the thing - my brother-in-law's an idiot." +DO NOT use parentheses like (laughs) - only square brackets. +""" + +# Callers - real people who ASK QUESTIONS and bring TOPICS +CALLERS = { + "1": { + "name": "Tony from Staten Island", + "voice_id": "IKne3meq5aSn9XLyUdCD", + "phone_quality": "normal", # Landline from the garage + "prompt": f"""You're Tony, 47. You caught your wife texting some guy from her gym last week. You haven't said anything yet. 
You're calling because you need to talk about it but can't tell anyone you know. + +YOU DRIVE THE CONVERSATION. Ask the host: Have they ever been cheated on? What would they do? You want real advice, not bullshit. You're also curious about the host - are they married? Dating? You're gonna ask. + +You swear constantly. "Fuckin" and "shit" just come out. You get heated. You interrupt. You're not looking for comfort, you're looking for someone to tell you what to do. Be raw about the details - you saw the texts, they were flirty, maybe more. + +{EMOTE_INSTRUCTIONS}""" + }, + + "2": { + "name": "Jasmine from Atlanta", + "voice_id": "FGY2WhTYpPnrIDTdsKH5", + "phone_quality": "good", # Clear cell phone connection + "prompt": f"""You're Jasmine, 31. You just found out you make more money than your boyfriend and he's acting weird about it. You make $95k, he makes like $60k. Now he's being passive aggressive about everything. + +YOU ASK THE HOST DIRECTLY: Do men actually care about this? Is it an ego thing? Would YOU be weird if your girl made more? You want honest answers, not politically correct bullshit. + +You're smart, you're direct, you don't sugarcoat. You'll call out weak answers. You curse when you're making a point. You might get a little heated if the host says something you disagree with. + +{EMOTE_INSTRUCTIONS}""" + }, + + "3": { + "name": "Rick from Texas", + "voice_id": "JBFqnCBsd6RMkjVDRZzb", + "phone_quality": "bad", # Calling from his truck, bad signal + "prompt": f"""You're Rick, 52. Your 22-year-old daughter just told you she's dating a 41-year-old divorced guy with kids. You're trying not to lose your shit but you're losing your shit. + +ASK THE HOST: What's the oldest person you've dated? Is this weird or am I being crazy? At what age gap does it become creepy? You genuinely don't know if you're overreacting. + +You're a dad trying to be cool but struggling. You say "I'm not trying to be that guy, but..." a lot. 
You're protective but don't want to push her away. This is eating at you. Get personal with the host about their dating history. + +{EMOTE_INSTRUCTIONS}""" + }, + + "4": { + "name": "Megan from Portland", + "voice_id": "XrExE9yKIg1WjnnlVkGX", + "phone_quality": "good", # Young person with good phone + "prompt": f"""You're Megan, 28. You hooked up with your roommate's ex last weekend. She doesn't know. It's been awkward as fuck and you don't know if you should tell her or just pretend it never happened. + +ASK THE HOST: Have you ever hooked up with someone you shouldn't have? Do you tell people or take it to the grave? You need someone to tell you what to do here. + +You're messy but self-aware about it. You laugh at yourself. You'll share details if asked - how it happened, was it good, do you want it to happen again (maybe). You're not proud but you're not that sorry either. + +{EMOTE_INSTRUCTIONS}""" + }, + + "5": { + "name": "Dennis from Long Island", + "voice_id": "cjVigY5qzO86Huf0OWal", + "phone_quality": "terrible", # Calling from a casino bathroom, paranoid + "prompt": f"""You're Dennis, 45. You just got back from Vegas where you lost $8,000 at blackjack. Your wife thinks you were at a sales conference. You've never lied to her like this before and you feel sick about it. + +ASK THE HOST: Have you ever kept a secret this big from someone? How do you even begin to fix this? Should you tell her? You're spiraling a little. + +You're not a gambling addict, you just made a really stupid decision and it snowballed. You keep justifying it then stopping yourself. You need someone to either tell you it's gonna be okay or that you're an idiot. Either one. + +{EMOTE_INSTRUCTIONS}""" + }, + + "6": { + "name": "Tanya from Miami", + "voice_id": "N2lVS1w4EtoT3dr4eOWO", + "phone_quality": "good", # Clear connection + "prompt": f"""You're Tanya, 35. You've been on 47 first dates in the past year from apps. Not one second date. You're starting to think maybe it's you. 
+ +ASK THE HOST BLUNTLY: What makes someone undateable? What's your worst date story? What's something that's an instant dealbreaker for you? You want to know what you might be doing wrong. + +You're funny about it but there's real frustration underneath. You'll roast yourself. You might ask the host to rate your dating profile opener if they give you one. You're tired of the apps but you keep going back. + +{EMOTE_INSTRUCTIONS}""" + }, + + "7": { + "name": "Earl from Tennessee", + "voice_id": "EXAVITQu4vr4xnSDxMaL", + "phone_quality": "bad", # Old guy, probably on a flip phone + "prompt": f"""You're Earl, 67. Your son came out as gay last year. You love him but you're from a different generation and you've said some dumb stuff. He's not talking to you. You don't know how to fix it. + +ASK THE HOST: How do you apologize when you know you were wrong but you're also old and set in your ways? You're trying. You went to a PFLAG meeting. You felt like an idiot but you went. + +You're genuine. You're not looking for someone to say you were right. You know you weren't. You just want your son back and don't know what to say to him. You might get emotional but you'll play it off. + +{EMOTE_INSTRUCTIONS}""" + }, + + "8": { + "name": "Carla from Jersey", + "voice_id": "CwhRBWXzGAHq8TQ4Fs17", + "phone_quality": "normal", # Kitchen landline + "prompt": f"""You're Carla, 39. You found your husband's Reddit account. He's been posting in relationship advice threads about how he's not attracted to you anymore since you gained weight after the kids. He doesn't know you saw it. + +ASK THE HOST: Have you ever said something behind someone's back you'd never say to their face? What would you do if you found out your partner thought you were ugly? + +You're hurt but also kind of pissed. You curse when you're angry. You're not crying, you're mad. You might roast the husband pretty hard. You want to know if you should confront him or just start the silent treatment. 
+ +{EMOTE_INSTRUCTIONS}""" + }, + + "9": { + "name": "Marcus from Detroit", + "voice_id": "bIHbv24MWmeRgasZH58o", + "phone_quality": "normal", # Regular phone + "prompt": f"""You're Marcus, 26. You just turned down a job that pays $40k more because it would mean moving away from your boys. Everyone says you're an idiot. Maybe you are. + +ASK THE HOST: Have you ever turned down money for something that doesn't make sense on paper? Is loyalty to your friends stupid when you're young? You're second-guessing yourself. + +You're chill but this is weighing on you. You're from a tight neighborhood, these guys are like brothers. You know money matters but so does this. You want someone to either validate you or call you dumb so you can stop thinking about it. + +{EMOTE_INSTRUCTIONS}""" + }, + + "0": { + "name": "Brenda from Phoenix", + "voice_id": "Xb7hH8MSUJpSbSDYk0k2", + "phone_quality": "bad", # Outside at a family gathering, hiding + "prompt": f"""You're Brenda, 44. You're pretty sure your sister's husband hit on you at Thanksgiving. He put his hand on your lower back and said some shit. Now Christmas is coming up and you don't know what to do. + +ASK THE HOST: Do you tell your sister? Do you confront him? What if you're reading it wrong? You've been going back and forth for weeks. + +You're stressed. You and your sister are close. You don't want to blow up her marriage if it was nothing. But it didn't feel like nothing. You'll share the exact details and want the host's honest read on it. + +{EMOTE_INSTRUCTIONS}""" + }, + + "-": { + "name": "Jake from Boston", + "voice_id": "SOYHLrjzK2X1ezoPC6cr", + "phone_quality": "good", # Modern phone + "prompt": f"""You're Jake, 33. Your girlfriend wants to open the relationship and you said you'd think about it but you already know the answer is fuck no. You just don't know how to say it without losing her. + +ASK THE HOST: Is this one of those things where if they even ask, it's already over? 
Have you ever tried an open relationship? You're gonna press for real opinions here. + +You're a little insecure about it and you know it. You keep wondering if she already has someone in mind. You curse casually. You might get a little too honest about your fears. You want to be cool about it but you're not. + +{EMOTE_INSTRUCTIONS}""" + }, + + "=": { + "name": "Diane from Chicago", + "voice_id": "cgSgspJ2msm6clMCkdW9", + "phone_quality": "terrible", # Whispering from a bathroom at work + "prompt": f"""You're Diane, 51. You've been having an emotional affair with a coworker for six months. Nothing physical yet. You're calling because you're about to cross that line this week at a conference and part of you wants someone to talk you out of it. Part of you doesn't. + +ASK THE HOST: Have you ever wanted something you knew was wrong? Where's the line between emotional cheating and just having a close friend? You want to be challenged on this. + +You're not proud. You're not playing victim either. You'll be honest about the details - the texts, the almost-moments. Your marriage isn't bad, it's just... fine. You're conflicted and you know you're going to do it anyway. + +{EMOTE_INSTRUCTIONS}""" + }, +} + +CALLER_KEYS = list(CALLERS.keys()) + +# Co-host sidekick configuration +COHOST = { + "name": "Bobby", + "voice_id": "nPczCjzI2devNBz1zQrb", # Brian - male voice with character + "prompt": """You're Bobby, the wisecracking sidekick on a late-night radio show. You sit in the booth with the host and occasionally chime in with: +- Quick one-liners and reactions +- Roasting the callers or the host +- Sound effect suggestions ("That deserves a rimshot!") +- Agreeing or disagreeing with hot takes +- Asking follow-up questions the host missed + +You're NOT a caller - you're in the studio. No phone filter on your voice. +Keep responses SHORT - one sentence max, like a real radio sidekick. Think Robin Quivers or Billy West. +You curse casually. You laugh at your own jokes. 
def phone_filter(audio, sample_rate=SAMPLE_RATE, quality="normal"):
    """Simulate a telephone connection on a mono audio signal.

    quality presets:
      - "good":     clear cell phone (wide band, little distortion)
      - "normal":   standard phone line
      - "bad":      crappy connection (narrow band, more noise/distortion)
      - "terrible": barely audible (extreme filtering, static)
    """
    signal = audio.flatten()

    # Per-preset tuning: (low cutoff Hz, high cutoff Hz, drive, static level)
    presets = {
        "good": (200, 7000, 1.0, 0.0),
        "normal": (300, 3400, 1.5, 0.005),
        "bad": (400, 2800, 2.0, 0.015),
        "terrible": (500, 2200, 2.5, 0.03),
    }
    lo_hz, hi_hz, drive, static_level = presets.get(quality, presets["normal"])

    # Band-limit to the preset's bandwidth (cutoffs normalized to Nyquist)
    nyquist = sample_rate / 2
    b, a = butter(4, [lo_hz / nyquist, hi_hz / nyquist], btype='band')
    shaped = filtfilt(b, a, signal)

    # Soft-clip "speaker" distortion
    shaped = np.tanh(shaped * drive) * 0.8

    # Intermittent static for degraded connections
    if static_level > 0:
        hiss = np.random.normal(0, static_level, len(shaped)).astype(np.float32)
        # Gate the hiss with a coarse random envelope so it comes in bursts
        bursts = np.random.random(len(shaped) // 1000 + 1)
        bursts = np.repeat(bursts, 1000)[:len(shaped)]
        hiss *= (bursts > 0.7).astype(np.float32)
        shaped = shaped + hiss

    return shaped.astype(np.float32)
def de_ess(audio, sample_rate=SAMPLE_RATE, threshold=0.15, ratio=4.0):
    """De-esser: reduce harsh sibilance (s, sh, ch sounds).

    Isolates the 4-9 kHz band, follows its envelope, and applies downward
    gain (soft ratio) to that band only when it exceeds `threshold`; the
    rest of the spectrum passes through untouched.
    """
    from scipy.signal import butter, filtfilt

    audio = audio.flatten().astype(np.float32)

    # Extract sibilant frequencies (4kHz - 9kHz), capped below Nyquist
    sib_low = 4000 / (sample_rate / 2)
    sib_high = min(9000 / (sample_rate / 2), 0.99)
    sib_b, sib_a = butter(2, [sib_low, sib_high], btype='band')
    sibilants = filtfilt(sib_b, sib_a, audio)

    # Envelope follower for the sibilant band (~5ms smoothing)
    envelope = np.abs(sibilants)
    smooth_samples = int(0.005 * sample_rate)
    kernel = np.ones(smooth_samples) / smooth_samples
    envelope = np.convolve(envelope, kernel, mode='same')

    # Gain reduction where the envelope exceeds threshold.
    # Vectorized (was a per-sample Python loop) - same formula, C speed.
    reduction = threshold + (envelope - threshold) / ratio
    gain = np.where(envelope > threshold, reduction / (envelope + 1e-10), 1.0)

    # Apply reduction only to the sibilant band, keep the rest
    processed = audio - sibilants + (sibilants * gain)

    return processed.astype(np.float32)
def lufs_normalize(audio, sample_rate=SAMPLE_RATE, target_lufs=-16.0):
    """Normalize audio toward a target LUFS level.

    -16 LUFS is the usual podcast target, -14 LUFS for streaming.
    NOTE: this is a simplified ITU-R BS.1770-style measurement - the
    K-weighting is approximated with two Butterworth high-passes and the
    loudness figure is RMS-based, so treat the result as approximate.
    """
    from scipy.signal import butter, filtfilt

    samples = audio.flatten().astype(np.float32)

    # Approximate K-weighting: high-shelf-ish boost via 1500 Hz high-pass,
    # then a 100 Hz high-pass to drop rumble.
    nyq = sample_rate / 2
    b1, a1 = butter(2, 1500 / nyq, btype='high')
    weighted = filtfilt(b1, a1, samples)
    b2, a2 = butter(2, 100 / nyq, btype='high')
    weighted = filtfilt(b2, a2, weighted)

    # Block RMS: 400 ms windows, 100 ms hop (75% overlap)
    block_size = int(0.4 * sample_rate)
    hop_size = int(0.1 * sample_rate)
    block_rms = np.array([
        np.sqrt(np.mean(weighted[i:i + block_size] ** 2) + 1e-10)
        for i in range(0, len(weighted) - block_size, hop_size)
    ])
    if block_rms.size == 0:
        return samples

    # Absolute gate: drop blocks below -70 "LUFS" (linear amplitude here)
    abs_gate = 10 ** (-70 / 20)
    gated = block_rms[block_rms > abs_gate]
    if len(gated) == 0:
        return samples

    # Relative gate: -10 LU below the ungated average
    rel_gate = np.mean(gated) * (10 ** (-10 / 20))
    loud_blocks = gated[gated > rel_gate]
    if len(loud_blocks) == 0:
        return samples

    # Current loudness and gain toward the target
    current_lufs = 20 * np.log10(np.mean(loud_blocks) + 1e-10)
    gain_linear = 10 ** ((target_lufs - current_lufs) / 20)
    normalized = samples * gain_linear

    # Peak ceiling at -1 dBTP (sample-peak approximation)
    ceiling = 10 ** (-1 / 20)
    peak = np.max(np.abs(normalized))
    if peak > ceiling:
        normalized = normalized * (ceiling / peak)

    return normalized.astype(np.float32)
def broadcast_process(audio, sample_rate=SAMPLE_RATE):
    """Apply broadcast-style processing to the host vocal: EQ + compression.

    Chain: 80 Hz high-pass (rumble) -> 15 kHz low-pass (harshness) ->
    3 kHz presence stage -> 300 Hz mud cut -> compressor -> makeup gain ->
    soft clip -> peak normalize to 0.9.
    """
    from scipy.signal import butter, filtfilt, iirpeak

    audio = audio.flatten().astype(np.float32)

    # High-pass filter at 80Hz to remove rumble
    hp_b, hp_a = butter(2, 80 / (sample_rate / 2), btype='high')
    audio = filtfilt(hp_b, hp_a, audio)

    # Low-pass at 15kHz to remove harshness
    lp_b, lp_a = butter(2, 15000 / (sample_rate / 2), btype='low')
    audio = filtfilt(lp_b, lp_a, audio)

    # NOTE(review): iirpeak designs a narrow band-pass, so this line REPLACES
    # the signal with the ~3 kHz band (x1.3) rather than boosting that band
    # on top of the full-range signal (possibly intended: audio + band*0.3).
    # Kept as-is since the shipped sound was presumably tuned by ear - confirm.
    presence_b, presence_a = iirpeak(3000 / (sample_rate / 2), Q=1.5)
    audio = filtfilt(presence_b, presence_a, audio) * 1.3

    # Slight low-mid cut to reduce muddiness (300Hz)
    mud_b, mud_a = iirpeak(300 / (sample_rate / 2), Q=2.0)
    audio = audio - filtfilt(mud_b, mud_a, audio) * 0.2

    # Compression settings
    threshold = 0.15
    ratio = 4.0
    makeup_gain = 2.5

    # Envelope follower (~10ms smoothing for attack/release)
    envelope = np.abs(audio)
    smooth_samples = int(0.01 * sample_rate)
    kernel = np.ones(smooth_samples) / smooth_samples
    envelope = np.convolve(envelope, kernel, mode='same')

    # Gain reduction above threshold.
    # Vectorized (was a per-sample Python loop) - same formula, C speed.
    reduced = threshold + (envelope - threshold) / ratio
    compressed = np.where(envelope > threshold,
                          audio * (reduced / (envelope + 1e-10)),
                          audio)

    # Makeup gain, then soft clip to prevent harsh distortion
    compressed = compressed * makeup_gain
    compressed = np.tanh(compressed * 0.8) / 0.8

    # Normalize to 0.9 peak
    peak = np.max(np.abs(compressed))
    if peak > 0:
        compressed = compressed * (0.9 / peak)

    return compressed.astype(np.float32)
def create_edited_mix(host_track, caller_track, music_track, sample_rate=SAMPLE_RATE):
    """Create an edited mix: long dead-air gaps are capped and the music bed
    is faded back in smoothly wherever a chunk of it was skipped."""
    # Speech activity = combined magnitude of both voice tracks,
    # smoothed over a 100 ms window.
    speech = np.abs(host_track) + np.abs(caller_track)
    window = int(0.1 * sample_rate)
    speech_smooth = np.convolve(speech, np.ones(window) / window, mode='same')

    silence_floor = 0.01
    quiet = speech_smooth < silence_floor

    max_gap = int(1.0 * sample_rate)   # cap pauses at 1 second
    min_gap = int(0.2 * sample_rate)   # keep >= 200ms so cuts feel natural
    fade_len = int(0.1 * sample_rate)  # 100ms music crossfade

    # Walk the timeline and collect (kind, start, end) spans to keep
    spans = []
    pos = 0
    total = len(host_track)
    while pos < total:
        begin = pos
        if quiet[pos]:
            while pos < total and quiet[pos]:
                pos += 1
            gap = pos - begin
            keep = min(gap, max_gap)
            keep = max(keep, min(gap, min_gap))  # no-op while max_gap >= min_gap
            spans.append(('silence', begin, begin + keep))
        else:
            while pos < total and not quiet[pos]:
                pos += 1
            spans.append(('voice', begin, pos))

    # Assemble the edited mix, fading music back in after skipped stretches
    pieces = []
    last_music_end = None
    for kind, begin, end in spans:
        seg_host = host_track[begin:end]
        seg_caller = caller_track[begin:end]

        if music_track is not None:
            seg_music = music_track[begin:end].copy()
            # Music was skipped between spans -> fade this span's music in
            if last_music_end is not None and begin > last_music_end + fade_len:
                n_fade = min(fade_len, len(seg_music))
                seg_music[:n_fade] *= np.linspace(0, 1, n_fade)
            last_music_end = end
        else:
            seg_music = np.zeros_like(seg_host)

        pieces.append(seg_host * 1.0 + seg_caller * 0.85 + seg_music * 0.35)

    if not pieces:
        return np.array([], dtype=np.float32)

    edited = np.concatenate(pieces)

    # Keep peaks under 0.95
    peak = np.max(np.abs(edited))
    if peak > 0.95:
        edited = edited * (0.95 / peak)

    return edited.astype(np.float32)
def _play_clip(sound_file, wait):
    """Load a clip (downmixed to mono, resampled to SAMPLE_RATE) and start
    playback; optionally block until done. Returns True. Raises on I/O or
    playback errors - callers handle/report."""
    data, sr = sf.read(sound_file)
    if len(data.shape) > 1:
        data = data.mean(axis=1)
    if sr != SAMPLE_RATE:
        import librosa
        data = librosa.resample(data.astype(np.float32), orig_sr=sr, target_sr=SAMPLE_RATE)
    sd.play(data.astype(np.float32), SAMPLE_RATE)
    if wait:
        sd.wait()
    return True


def play_show_sound(sound_name, wait=False):
    """Play automatic show sound effect (ring, hangup, hold, etc.)

    Returns True on success, False if the sound is unknown, the file is
    missing, or playback failed (error is printed).
    """
    if sound_name not in SHOW_SOUNDS:
        return False
    sound_file = SOUNDS_DIR / SHOW_SOUNDS[sound_name]
    if not sound_file.exists():
        return False

    try:
        return _play_clip(sound_file, wait)
    except Exception as e:
        print(f"  Sound error: {e}")
        return False


def play_caller_stinger(caller_key, wait=True):
    """Play a caller's intro stinger if it exists.

    Returns True on success, False otherwise. Previously exceptions were
    swallowed silently (unused `e`); now logged like the other players
    for consistency.
    """
    if caller_key not in CALLER_STINGERS:
        return False

    stinger_file = SOUNDS_DIR / CALLER_STINGERS[caller_key]
    if not stinger_file.exists():
        return False

    try:
        return _play_clip(stinger_file, wait)
    except Exception as e:
        print(f"  Sound error: {e}")
        return False
+ self.show_history = [] + self.conversation_history = [] + self.current_caller = CALLERS["1"] + self.music = MusicPlayer() + + # Load persistent caller memory from previous episodes + self.caller_memory = self._load_caller_memory() + + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + self.output_dir = Path(f"sessions/{timestamp}") + self.output_dir.mkdir(parents=True, exist_ok=True) + + print("\nšŸŽ™ļø Loading AI Radio Show...") + self._load_models() + + def _load_models(self): + print(" Loading Whisper...") + self.whisper_model = WhisperModel("base", device="cpu", compute_type="int8") + + print(" Connecting to ElevenLabs...") + from elevenlabs.client import ElevenLabs + self.tts_client = ElevenLabs(api_key=os.getenv("ELEVENLABS_API_KEY")) + + print(" Connecting to OpenAI...") + from openai import OpenAI + self.openai = OpenAI(api_key=os.getenv("OPENAI_API_KEY")) + + self.cohost_enabled = True + self.last_exchange = None # Track last host/caller exchange for co-host context + + available = [k for k in SOUNDBOARD if (SOUNDS_DIR / SOUNDBOARD[k]).exists()] + if available: + print(f" Soundboard: {', '.join(available)} ready") + else: + print(" Soundboard: no sounds found in sounds/") + + num_tracks = self.music.load_tracks() + if num_tracks: + print(f" Music: {num_tracks} tracks loaded") + else: + print(" Music: no tracks in music/ (add .wav/.mp3 files)") + + # Check for caller stingers + stinger_count = sum(1 for k in CALLER_STINGERS if (SOUNDS_DIR / CALLER_STINGERS[k]).exists()) + if stinger_count > 0: + print(f" Stingers: {stinger_count} caller stingers loaded") + + # Report on persistent memory + callers_with_memory = len([k for k, v in self.caller_memory.items() if v.get('calls', [])]) + if callers_with_memory > 0: + print(f" Memory: {callers_with_memory} callers have history from previous episodes") + + print(" Ready!\n") + + def _load_caller_memory(self): + """Load persistent caller memory from JSON file""" + if MEMORY_FILE.exists(): + try: + with 
open(MEMORY_FILE, 'r') as f: + return json.load(f) + except Exception as e: + print(f" Warning: Could not load caller memory: {e}") + return {} + + def _save_caller_memory(self): + """Save persistent caller memory to JSON file""" + try: + with open(MEMORY_FILE, 'w') as f: + json.dump(self.caller_memory, f, indent=2) + except Exception as e: + print(f" Warning: Could not save caller memory: {e}") + + def generate_caller_stingers(self): + """Generate TTS stingers for callers that don't have them""" + stinger_texts = { + "1": "Big Tony's on the line!", + "3": "Rick from Texas, calling in!", + "5": "Dennis is back, folks!", + "7": "Earl from Tennessee on line one!", + "=": "Diane's calling in from Chicago...", + } + + print("\n šŸŽ™ļø Generating caller stingers...") + for key, text in stinger_texts.items(): + if key not in CALLER_STINGERS: + continue + stinger_file = SOUNDS_DIR / CALLER_STINGERS[key] + if stinger_file.exists(): + print(f" āœ“ {CALLERS[key]['name']} (exists)") + continue + + try: + audio_gen = self.tts_client.text_to_speech.convert( + voice_id="ErXwobaYiN019PkySvjV", # Announcer voice + text=text, + model_id="eleven_v3", + output_format="pcm_24000" + ) + audio_bytes = b"".join(audio_gen) + audio = np.frombuffer(audio_bytes, dtype=np.int16).astype(np.float32) / 32768.0 + sf.write(stinger_file, audio, SAMPLE_RATE) + print(f" āœ“ {CALLERS[key]['name']}") + except Exception as e: + print(f" āœ— {CALLERS[key]['name']}: {e}") + + def print_status(self): + print("\n" + "=" * 60) + print(f" šŸ“ž ON THE LINE: {self.current_caller['name']}") + if self.music.playing: + track = self.music.get_track_name() or "Unknown" + duck_status = "auto-duck" if self.music.auto_duck else "manual" + faded = " (faded)" if self.music.faded_out else "" + print(f" šŸŽµ MUSIC: {track} [{duck_status}]{faded}") + print("=" * 60) + print(" [rec] Record [t] Type [h] Hang up [q] Quit") + print(" [1-9,0,-,=] Switch caller [b] Bobby [p] Producer tip") + print(" [m] Music on/off [n] Next [f] 
Fade out [g] Fade in [d] Auto-duck") + print(" [ad] Commercial [news] Breaking news") + avail = [f"{k}={SOUNDBOARD[k].replace('.wav','')}" for k in SOUNDBOARD if (SOUNDS_DIR / SOUNDBOARD[k]).exists()] + if avail: + print(f" Sounds: {' '.join(avail[:6])}") + print("-" * 60) + + def get_session_time(self): + """Get seconds since session started""" + if self.session_start is None: + return 0.0 + return (datetime.now() - self.session_start).total_seconds() + + def record_audio(self): + print("\n šŸŽ¤ Recording... (press Enter to stop)") + self.music.duck() # Lower music while recording + start_time = self.get_session_time() + chunks = [] + recording = True + + def callback(indata, frames, time_info, status): + if recording: + chunks.append(indata.copy()) + + with sd.InputStream(samplerate=SAMPLE_RATE, channels=CHANNELS, callback=callback): + input() + + recording = False + self.music.unduck() # Restore music volume + if chunks: + audio = np.vstack(chunks) + self.audio_timeline.append((start_time, 'host', audio.flatten())) + return audio + return None + + def transcribe(self, audio): + import librosa + audio_16k = librosa.resample(audio.flatten().astype(np.float32), orig_sr=SAMPLE_RATE, target_sr=16000) + segments, _ = self.whisper_model.transcribe(audio_16k) + return " ".join([s.text for s in segments]).strip() + + def generate_response(self, user_text): + self.conversation_history.append({"role": "user", "content": user_text}) + + # Build rich context about the show so far + context = "" + caller_name = self.current_caller["name"] + + # Check persistent memory from PREVIOUS EPISODES + if caller_name in self.caller_memory: + mem = self.caller_memory[caller_name] + if mem.get('calls'): + context += f"\n\nYOU'VE CALLED THIS SHOW BEFORE (previous episodes):\n" + for call in mem['calls'][-3:]: # Last 3 calls from previous episodes + date = call.get('date', 'recently') + topic = call.get('topic', '')[:100] + context += f"- On {date}, you talked about: \"{topic}...\"\n" + 
context += "You're a REPEAT CALLER. Reference your previous calls! 'Hey, I called last week about...' or 'Remember me? I'm the one who...'\n" + + # Check if this caller has called before THIS EPISODE (callback) + prev_calls = [h for h in self.show_history if h['caller'] == caller_name] + if prev_calls: + context += f"\n\nYOU CALLED EARLIER TONIGHT:\n" + for call in prev_calls[-3:]: + context += f"- You said: \"{call['summary'][:100]}...\"\n" + context += "Reference your earlier call! Say 'like I said before' or 'I've been thinking about what we talked about.'\n" + + # Show what other callers have said + other_callers = [h for h in self.show_history if h['caller'] != caller_name] + if other_callers: + context += "\n\nOTHER CALLERS ON THE SHOW TONIGHT:\n" + for entry in other_callers[-6:]: + context += f"- {entry['caller']} said: \"{entry['summary'][:80]}...\"\n" + context += "\nYou can react to what other callers said! Agree, disagree, or roast them. 'That guy Tony is full of shit' or 'I agree with what that lady said earlier.'\n" + + # Encourage engagement with host + context += "\nRemember to ASK THE HOST questions and get their opinion. 
Make it a conversation, not a monologue.\n" + + messages = [ + {"role": "system", "content": self.current_caller["prompt"] + context}, + *self.conversation_history[-10:] + ] + + response = self.openai.chat.completions.create( + model="gpt-5", + messages=messages + ) + + reply = response.choices[0].message.content + self.conversation_history.append({"role": "assistant", "content": reply}) + + self.show_history.append({ + "caller": caller_name, + "summary": reply, + "host_said": user_text + }) + + # Update persistent caller memory + if caller_name not in self.caller_memory: + self.caller_memory[caller_name] = {"calls": []} + + # Add this exchange to their memory + self.caller_memory[caller_name]["calls"].append({ + "date": datetime.now().strftime("%Y-%m-%d"), + "topic": reply[:200], + "host_said": user_text[:200] + }) + + # Keep only last 10 calls per caller to prevent bloat + self.caller_memory[caller_name]["calls"] = self.caller_memory[caller_name]["calls"][-10:] + + # Track for co-host context + self.last_exchange = { + "host": user_text, + "caller": reply + } + + return reply + + def play_commercial(self): + """Generate and play a fake radio commercial""" + print("\n šŸ“ŗ COMMERCIAL BREAK...") + self.music.fade_out() + + # Generate fake ad copy + ad_products = [ + "a questionable legal service", + "a local car dealership", + "a mattress store having its 'biggest sale ever'", + "a personal injury lawyer", + "a cash-for-gold place", + "a diet pill with suspicious claims", + "a local furniture store", + "a technical school", + "a reverse mortgage company", + "a cryptocurrency exchange", + ] + product = random.choice(ad_products) + + response = self.openai.chat.completions.create( + model="gpt-5", + messages=[{ + "role": "system", + "content": f"Write a short, cheesy radio commercial (2-3 sentences) for {product}. Make it sound like a real low-budget local radio ad. Include a fake phone number or website. Be funny but realistic." 
+ }] + ) + ad_text = response.choices[0].message.content + + # Play jingle + play_show_sound('commercial', wait=True) + + # Speak the ad with a different voice (announcer voice) + print(f" šŸŽ™ļø '{ad_text[:50]}...'") + audio_gen = self.tts_client.text_to_speech.convert( + voice_id="ErXwobaYiN019PkySvjV", # Antoni - good announcer voice + text=ad_text, + model_id="eleven_v3", + output_format="pcm_24000" + ) + audio_bytes = b"".join(audio_gen) + audio = np.frombuffer(audio_bytes, dtype=np.int16).astype(np.float32) / 32768.0 + + # Record to timeline + start_time = self.get_session_time() + self.audio_timeline.append((start_time, 'caller', audio)) + + sd.play(audio, SAMPLE_RATE) + sd.wait() + + # End jingle + play_show_sound('commercial', wait=True) + print(" šŸ“ŗ Back to the show!\n") + self.music.fade_in() + + def play_breaking_news(self): + """Generate and play fake breaking news""" + print("\n 🚨 BREAKING NEWS...") + self.music.fade_out() + + # Generate fake breaking news + response = self.openai.chat.completions.create( + model="gpt-5", + messages=[{ + "role": "system", + "content": "Write a short, absurd fake breaking news alert (1-2 sentences) that sounds urgent but is about something ridiculous. Like 'area man does something mundane' or 'local business makes questionable decision'. Make it funny but delivered deadpan serious." + }] + ) + news_text = "Breaking news. 
" + response.choices[0].message.content + + # Play news stinger + play_show_sound('news', wait=True) + + print(f" šŸ“° '{news_text[:50]}...'") + audio_gen = self.tts_client.text_to_speech.convert( + voice_id="ErXwobaYiN019PkySvjV", # Announcer voice + text=news_text, + model_id="eleven_v3", + output_format="pcm_24000" + ) + audio_bytes = b"".join(audio_gen) + audio = np.frombuffer(audio_bytes, dtype=np.int16).astype(np.float32) / 32768.0 + + start_time = self.get_session_time() + self.audio_timeline.append((start_time, 'caller', audio)) + + sd.play(audio, SAMPLE_RATE) + sd.wait() + + print(" 🚨 And now back to our program.\n") + self.music.fade_in() + + def speak(self, text): + if not text.strip(): + return + + print(" šŸ”Š Generating voice...") + + # Use eleven_v3 which supports audio tags like [laughing], [sighs], etc. + audio_gen = self.tts_client.text_to_speech.convert( + voice_id=self.current_caller["voice_id"], + text=text, + model_id="eleven_v3", + output_format="pcm_24000" + ) + + audio_bytes = b"".join(audio_gen) + audio = np.frombuffer(audio_bytes, dtype=np.int16).astype(np.float32) / 32768.0 + + # Apply phone filter with caller's connection quality + quality = self.current_caller.get("phone_quality", "normal") + filtered = phone_filter(audio, quality=quality) + + # Record with timestamp for aligned export + start_time = self.get_session_time() + self.audio_timeline.append((start_time, 'caller', filtered.flatten())) + + print(" šŸ“» Playing...") + self.music.duck() # Lower music while caller speaks + sd.play(filtered, SAMPLE_RATE) + sd.wait() + self.music.unduck() # Restore music volume + + def cohost_chime_in(self, context=None): + """Have the co-host Bobby chime in with a comment""" + if not self.cohost_enabled: + return + + # Build context for co-host + if context is None and self.last_exchange: + context = f"The caller {self.current_caller['name']} just said: \"{self.last_exchange['caller']}\"\nThe host said: \"{self.last_exchange['host']}\"" + elif 
context is None: + context = f"Currently on the line: {self.current_caller['name']}" + + # Recent show context + recent_history = "" + if self.show_history: + recent_history = "\n\nRecent show moments:\n" + for entry in self.show_history[-3:]: + recent_history += f"- {entry['caller']}: {entry['summary'][:60]}...\n" + + response = self.openai.chat.completions.create( + model="gpt-5", + messages=[ + {"role": "system", "content": COHOST["prompt"] + recent_history}, + {"role": "user", "content": f"React to this: {context}\n\nGive a quick one-liner reaction, agreement, disagreement, or joke. ONE SENTENCE MAX."} + ] + ) + + comment = response.choices[0].message.content + print(f"\n šŸŽ™ļø BOBBY: {comment}") + + # Generate voice without phone filter (co-host is in studio) + audio_gen = self.tts_client.text_to_speech.convert( + voice_id=COHOST["voice_id"], + text=comment, + model_id="eleven_v3", + output_format="pcm_24000" + ) + + audio_bytes = b"".join(audio_gen) + audio = np.frombuffer(audio_bytes, dtype=np.int16).astype(np.float32) / 32768.0 + + # Record to timeline (as caller track for simplicity) + start_time = self.get_session_time() + self.audio_timeline.append((start_time, 'caller', audio)) + + self.music.duck() + sd.play(audio, SAMPLE_RATE) + sd.wait() + self.music.unduck() + + def get_producer_suggestion(self): + """Get a suggestion from the AI producer based on show context""" + # Build context for producer + recent_exchanges = "" + if self.show_history: + for entry in self.show_history[-5:]: + recent_exchanges += f"- {entry['caller']}: {entry['summary'][:80]}...\n" + recent_exchanges += f" Host said: {entry['host_said'][:60]}...\n" + + current_caller = self.current_caller["name"] + show_length = self.get_session_time() / 60 # in minutes + + # Get list of callers we haven't heard from + callers_heard = set(h['caller'] for h in self.show_history) + callers_not_heard = [c['name'] for k, c in CALLERS.items() if c['name'] not in callers_heard] + + response = 
self.openai.chat.completions.create( + model="gpt-5", + messages=[{ + "role": "system", + "content": f"""You're a radio show producer giving the host quick suggestions in their earpiece. +Keep suggestions SHORT - one line max. Be direct. + +Current show info: +- Show has been running for {show_length:.1f} minutes +- Currently talking to: {current_caller} +- Callers we've heard from: {', '.join(callers_heard) if callers_heard else 'None yet'} +- Callers waiting: {', '.join(callers_not_heard[:4]) if callers_not_heard else 'All callers have called'} + +Recent exchanges: +{recent_exchanges if recent_exchanges else 'Show just started'} + +Give ONE tactical suggestion. Options: +- Suggest a follow-up question to ask the current caller +- Suggest switching to a different caller (and why) +- Suggest playing a sound effect (airhorn, rimshot, crickets, etc) +- Suggest a commercial break or breaking news bit +- Suggest having Bobby (co-host) chime in +- Suggest a topic pivot or callback to an earlier caller + +Be brief. 
Example: "Ask Tony if he's confronted her yet" or "Good time for a rimshot" or "Switch to Jasmine, she'll have opinions on this" +""" + }] + ) + + return response.choices[0].message.content + + def save_session(self): + print("\nšŸ’¾ Saving session...") + + # Stop music + self.music.stop() + + if not self.audio_timeline and not self.music.music_audio: + print(" No audio to save") + return + + # Calculate total duration from timeline + total_duration = self.get_session_time() + total_samples = int(total_duration * SAMPLE_RATE) + SAMPLE_RATE # +1 sec buffer + + # Create aligned track buffers + host_track = np.zeros(total_samples, dtype=np.float32) + caller_track = np.zeros(total_samples, dtype=np.float32) + + # Place audio segments at correct timestamps + for start_time, track_type, audio in self.audio_timeline: + start_sample = int(start_time * SAMPLE_RATE) + end_sample = start_sample + len(audio) + + # Extend buffer if needed + if end_sample > total_samples: + extra = end_sample - total_samples + SAMPLE_RATE + host_track = np.concatenate([host_track, np.zeros(extra, dtype=np.float32)]) + caller_track = np.concatenate([caller_track, np.zeros(extra, dtype=np.float32)]) + total_samples = len(host_track) + + if track_type == 'host': + host_track[start_sample:end_sample] += audio + elif track_type == 'caller': + caller_track[start_sample:end_sample] += audio + + # Get music track (already recorded with ducking) + music_track = None + if self.music.music_audio: + music_track = np.concatenate([a.flatten() for a in self.music.music_audio]) + # Pad or trim to match other tracks + if len(music_track) < total_samples: + music_track = np.concatenate([music_track, np.zeros(total_samples - len(music_track), dtype=np.float32)]) + else: + music_track = music_track[:total_samples] + + # Trim silence from end + max_len = total_samples + for track in [host_track, caller_track, music_track]: + if track is not None: + nonzero = np.nonzero(np.abs(track) > 0.001)[0] + if len(nonzero) > 
0: + max_len = min(max_len, nonzero[-1] + SAMPLE_RATE) + + host_track = host_track[:max_len] + caller_track = caller_track[:max_len] + if music_track is not None: + music_track = music_track[:max_len] + + # Apply broadcast processing to host vocal + print(" šŸŽ™ļø Processing host vocal (EQ + compression + de-ess)...") + if np.any(host_track != 0): + host_track = de_ess(host_track, SAMPLE_RATE) # De-esser first + host_track = broadcast_process(host_track, SAMPLE_RATE) + + # Save individual aligned tracks + if np.any(host_track != 0): + sf.write(self.output_dir / "host_track.wav", host_track, SAMPLE_RATE) + print(f" āœ“ host_track.wav (broadcast processed)") + + if np.any(caller_track != 0): + sf.write(self.output_dir / "caller_track.wav", caller_track, SAMPLE_RATE) + print(f" āœ“ caller_track.wav") + + if music_track is not None: + sf.write(self.output_dir / "music_track.wav", music_track, SAMPLE_RATE) + print(f" āœ“ music_track.wav") + + # Create raw mixed master (full length, no edits) + print(" šŸŽ›ļø Mixing raw podcast...") + raw_mix = np.zeros(max_len, dtype=np.float32) + raw_mix += host_track * 1.0 + raw_mix += caller_track * 0.85 + if music_track is not None: + raw_mix += music_track * 0.35 + + # LUFS normalize to -16 LUFS (podcast standard) + print(" šŸ“Š Normalizing to -16 LUFS...") + raw_mix = lufs_normalize(raw_mix, SAMPLE_RATE, target_lufs=-16.0) + + sf.write(self.output_dir / "podcast_raw.wav", raw_mix, SAMPLE_RATE) + print(f" āœ“ podcast_raw.wav (full length, -16 LUFS)") + + # Create edited mix with dead air removed + print(" āœ‚ļø Creating edited mix (removing dead air)...") + edited_mix = create_edited_mix(host_track, caller_track, music_track, SAMPLE_RATE) + edited_mix = lufs_normalize(edited_mix, SAMPLE_RATE, target_lufs=-16.0) + sf.write(self.output_dir / "podcast_edited.wav", edited_mix, SAMPLE_RATE) + print(f" āœ“ podcast_edited.wav (dead air removed, -16 LUFS)") + + # Duration info + raw_mins = max_len / SAMPLE_RATE / 60 + edited_mins = 
len(edited_mix) / SAMPLE_RATE / 60 + saved_mins = raw_mins - edited_mins + print(f" šŸ“» Raw: {raw_mins:.1f} min → Edited: {edited_mins:.1f} min (saved {saved_mins:.1f} min)") + + with open(self.output_dir / "transcript.txt", "w") as f: + for entry in self.show_history: + f.write(f"{entry['caller'].upper()}: {entry['summary']}\n\n") + print(f" āœ“ transcript.txt") + + # Save persistent caller memory for future episodes + self._save_caller_memory() + print(f" āœ“ caller_memory.json (persistent memory saved)") + + def run(self): + print("\n" + "=" * 60) + print(" šŸ“» AI RADIO SHOW - LATE NIGHT CALLERS šŸ“»") + print("=" * 60) + print("\nCALLERS:") + for i, (key, caller) in enumerate(CALLERS.items()): + end = "\n" if (i + 1) % 2 == 0 else " " + print(f" [{key}] {caller['name']:<24}", end=end) + print("\n") + + # Start music if available + if self.music.tracks: + print(" šŸŽµ Starting music...") + if self.music.start(): + print(f" Now playing: {self.music.get_track_name()}") + print() + + # Start session timer for aligned audio export + self.session_start = datetime.now() + + self.print_status() + + while True: + try: + cmd = input("\n> ").strip().lower() + except (EOFError, KeyboardInterrupt): + break + + if not cmd: + continue + + if cmd == 'q': + break + + if cmd in CALLER_KEYS: + sd.stop() + # Play hold music briefly, then ring for new caller + play_show_sound('hold', wait=True) + self.current_caller = CALLERS[cmd] + self.conversation_history = [] + play_show_sound('ring', wait=True) + # Play caller's stinger if they have one + if not play_caller_stinger(cmd, wait=True): + pass # No stinger, that's fine + print(f"\n šŸ“ž NEW CALLER: {self.current_caller['name']}") + quality = self.current_caller.get("phone_quality", "normal") + if quality != "good": + print(f" šŸ“¶ Connection quality: {quality}") + self.print_status() + continue + + if cmd == 'h': + sd.stop() + play_show_sound('hangup', wait=False) + self.conversation_history = [] + print(f"\n šŸ”‡ HUNG UP on 
{self.current_caller['name']}!") + print(" Pick a new caller [1-9, 0, -, =]") + continue + + # Music controls + if cmd == 'm': + if self.music.playing: + self.music.stop() + print(" šŸ”‡ Music stopped") + else: + if self.music.start(): + print(f" šŸŽµ Music started: {self.music.get_track_name()}") + else: + print(" No music files in music/") + continue + + if cmd == 'n': + if self.music.tracks: + track = self.music.next_track() + print(f" šŸŽµ Now playing: {track}") + else: + print(" No music files") + continue + + if cmd == '+' or cmd == 'vol+': + self.music.set_volume(self.music.volume + 0.05) + print(f" šŸ”Š Volume: {int(self.music.volume * 100)}%") + continue + + if cmd == 'vol-': + self.music.set_volume(self.music.volume - 0.05) + print(f" šŸ”‰ Volume: {int(self.music.volume * 100)}%") + continue + + if cmd == 'f': + # Fade out music (for taking a call) + self.music.fade_out() + print(" šŸ”‰ Music fading out...") + continue + + if cmd == 'g': + # Fade music back in (after a call) + self.music.fade_in() + print(" šŸ”Š Music fading in...") + continue + + if cmd == 'd': + # Toggle auto-duck + auto = self.music.toggle_auto_duck() + print(f" Auto-duck: {'ON' if auto else 'OFF'}") + continue + + if cmd == 'ad' or cmd == 'commercial': + self.play_commercial() + continue + + if cmd == 'news': + self.play_breaking_news() + continue + + if cmd == 'b' or cmd == 'bobby': + self.cohost_chime_in() + continue + + if cmd == 'stingers': + self.generate_caller_stingers() + continue + + if cmd == 'p' or cmd == 'producer': + suggestion = self.get_producer_suggestion() + print(f"\n šŸŽ§ PRODUCER: {suggestion}\n") + continue + + if cmd == 'rec': + audio = self.record_audio() + if audio is not None and len(audio) > SAMPLE_RATE * 0.5: + print(" šŸ“ Transcribing...") + text = self.transcribe(audio) + if text: + print(f"\n YOU: {text}") + print(f"\n šŸ’­ {self.current_caller['name']} is thinking...") + reply = self.generate_response(text) + print(f"\n šŸ“ž 
{self.current_caller['name'].upper()}: {reply}\n") + self.speak(reply) + else: + print(" (No speech detected)") + else: + print(" (Recording too short)") + continue + + if cmd == 't': + self.music.duck() # Duck music while typing too + text = input(" Type: ").strip() + if text: + print(f"\n šŸ’­ {self.current_caller['name']} is thinking...") + reply = self.generate_response(text) + print(f"\n šŸ“ž {self.current_caller['name'].upper()}: {reply}\n") + self.speak(reply) + else: + self.music.unduck() + continue + + # Sound effects + if len(cmd) == 1 and cmd in SOUNDBOARD: + if play_sound(cmd): + name = SOUNDBOARD[cmd].replace('.wav', '').replace('_', ' ') + print(f" šŸ”Š {name}") + else: + print(f" Sound file not found") + continue + + print(" Commands: rec, t, h, m, n, +/vol-, 1-9/0/-/=, sounds, q") + + self.save_session() + print("\nšŸŽ¬ That's a wrap! Thanks for listening.\n") + + +if __name__ == "__main__": + show = RadioShow() + show.run() diff --git a/radio_simple.py b/radio_simple.py new file mode 100644 index 0000000..9f14b7c --- /dev/null +++ b/radio_simple.py @@ -0,0 +1,140 @@ +#!/usr/bin/env python3 +""" +Simplified Radio Show - for debugging +""" + +import os +import sys +from pathlib import Path +import numpy as np +import sounddevice as sd +import soundfile as sf +from faster_whisper import WhisperModel +from scipy.signal import butter, filtfilt +from dotenv import load_dotenv + +load_dotenv() + +SAMPLE_RATE = 24000 + +CALLERS = { + "1": ("Big Tony", "IKne3meq5aSn9XLyUdCD", "You are Big Tony, a loud Italian guy from Staten Island. Swear naturally, be opinionated. Keep it to 2 sentences."), + "2": ("Drunk Diane", "FGY2WhTYpPnrIDTdsKH5", "You are Drunk Diane, tipsy woman at a bar. Ramble a bit, be funny. Keep it to 2 sentences."), + "3": ("Stoner Phil", "bIHbv24MWmeRgasZH58o", "You are Stoner Phil, super chill stoner dude. Speak slow, be spacey but profound. 
Keep it to 2 sentences."), +} + +def phone_filter(audio): + b, a = butter(4, [300/(SAMPLE_RATE/2), 3400/(SAMPLE_RATE/2)], btype='band') + return (np.tanh(filtfilt(b, a, audio.flatten()) * 1.5) * 0.8).astype(np.float32) + +class SimpleRadio: + def __init__(self): + print("Loading Whisper...") + self.whisper = WhisperModel("base", device="cpu", compute_type="int8") + + print("Connecting to ElevenLabs...") + from elevenlabs.client import ElevenLabs + self.tts = ElevenLabs(api_key=os.getenv("ELEVENLABS_API_KEY")) + + print("Connecting to Ollama...") + import ollama + self.ollama = ollama + + self.caller = CALLERS["1"] + self.history = [] + print("\nReady!\n") + + def record(self): + print(" [Recording - press Enter to stop]") + chunks = [] + recording = True + + def callback(indata, frames, time, status): + if recording: + chunks.append(indata.copy()) + + with sd.InputStream(samplerate=SAMPLE_RATE, channels=1, callback=callback): + input() # Wait for Enter + + recording = False + return np.vstack(chunks) if chunks else None + + def transcribe(self, audio): + import librosa + audio_16k = librosa.resample(audio.flatten().astype(np.float32), orig_sr=SAMPLE_RATE, target_sr=16000) + segments, _ = self.whisper.transcribe(audio_16k) + return " ".join([s.text for s in segments]).strip() + + def respond(self, text): + self.history.append({"role": "user", "content": text}) + + response = self.ollama.chat( + model="llama3.2:latest", + messages=[{"role": "system", "content": self.caller[2]}] + self.history[-6:], + options={"temperature": 0.9} + ) + + reply = response["message"]["content"] + self.history.append({"role": "assistant", "content": reply}) + return reply + + def speak(self, text): + print(" [Generating voice...]") + audio_gen = self.tts.text_to_speech.convert( + voice_id=self.caller[1], + text=text, + model_id="eleven_turbo_v2_5", + output_format="pcm_24000" + ) + + audio_bytes = b"".join(audio_gen) + audio = np.frombuffer(audio_bytes, dtype=np.int16).astype(np.float32) 
/ 32768.0 + filtered = phone_filter(audio) + + print(" [Playing...]") + sd.play(filtered, SAMPLE_RATE) + sd.wait() + + def run(self): + print("=" * 50) + print(" SIMPLE RADIO - Type commands:") + print(" 1/2/3 = switch caller") + print(" r = record & respond") + print(" t = type message (skip recording)") + print(" q = quit") + print("=" * 50) + print(f"\nCaller: {self.caller[0]}\n") + + while True: + cmd = input("> ").strip().lower() + + if cmd == 'q': + break + elif cmd in '123': + self.caller = CALLERS[cmd] + self.history = [] + print(f"\nšŸ“ž Switched to: {self.caller[0]}\n") + elif cmd == 'r': + audio = self.record() + if audio is not None: + print(" [Transcribing...]") + text = self.transcribe(audio) + print(f"\n YOU: {text}\n") + if text: + print(" [Thinking...]") + reply = self.respond(text) + print(f"\n šŸ“ž {self.caller[0].upper()}: {reply}\n") + self.speak(reply) + elif cmd == 't': + text = input(" Type message: ") + if text: + print(" [Thinking...]") + reply = self.respond(text) + print(f"\n šŸ“ž {self.caller[0].upper()}: {reply}\n") + self.speak(reply) + else: + print(" Commands: r=record, t=type, 1/2/3=caller, q=quit") + +if __name__ == "__main__": + radio = SimpleRadio() + radio.run() diff --git a/requirements-web.txt b/requirements-web.txt new file mode 100644 index 0000000..9f2a52c --- /dev/null +++ b/requirements-web.txt @@ -0,0 +1,16 @@ +# Web application requirements (in addition to existing radio_show.py deps) +fastapi>=0.109.0 +uvicorn[standard]>=0.27.0 +python-multipart>=0.0.6 +websockets>=12.0 +httpx>=0.26.0 +pydantic-settings>=2.1.0 + +# Already installed for CLI (but listed for completeness): +# faster-whisper +# elevenlabs +# numpy +# scipy +# librosa +# soundfile +# python-dotenv diff --git a/run.sh b/run.sh new file mode 100755 index 0000000..b45ac7f --- /dev/null +++ b/run.sh @@ -0,0 +1,60 @@ +#!/bin/bash +# AI Radio Show - Server Runner with restart support + +LOG_FILE="/tmp/ai-radio-show.log" 
RESTART_FLAG="/tmp/ai-radio-show.restart"
STOP_FLAG="/tmp/ai-radio-show.stop"

cd "$(dirname "$0")"

# Activate virtual environment
source venv/bin/activate

# Cleanup old flags
rm -f "$RESTART_FLAG" "$STOP_FLAG"

echo "AI Radio Show Server Runner"
echo "Log file: $LOG_FILE"
echo "Press Ctrl+C to stop"
echo ""

while true; do
    echo "[$(date)] Starting server..." | tee -a "$LOG_FILE"

    # Start uvicorn with output to both console and log file.
    # FIX: the original backgrounded `python ... | tee -a "$LOG_FILE" &` and
    # then read $! — but after a backgrounded pipeline $! is the PID of the
    # LAST stage (tee), so the restart/stop logic killed tee while uvicorn
    # kept running and held port 8000. Process substitution keeps python as
    # the direct background child, so $! is the server's PID.
    python -m uvicorn backend.main:app --host 0.0.0.0 --port 8000 > >(tee -a "$LOG_FILE") 2>&1 &
    SERVER_PID=$!

    # Wait for server to exit or for a flag file to appear
    while kill -0 $SERVER_PID 2>/dev/null; do
        if [ -f "$RESTART_FLAG" ]; then
            echo "[$(date)] Restart requested..." | tee -a "$LOG_FILE"
            rm -f "$RESTART_FLAG"
            kill $SERVER_PID 2>/dev/null
            wait $SERVER_PID 2>/dev/null
            sleep 1
            break
        fi

        if [ -f "$STOP_FLAG" ]; then
            echo "[$(date)] Stop requested..." | tee -a "$LOG_FILE"
            rm -f "$STOP_FLAG"
            kill $SERVER_PID 2>/dev/null
            wait $SERVER_PID 2>/dev/null
            echo "[$(date)] Server stopped." | tee -a "$LOG_FILE"
            exit 0
        fi

        sleep 1
    done

    # Server exited on its own (or restart break): stop flag still wins
    if [ -f "$STOP_FLAG" ]; then
        rm -f "$STOP_FLAG"
        echo "[$(date)] Server stopped." | tee -a "$LOG_FILE"
        exit 0
    fi

    echo "[$(date)] Restarting in 2 seconds..." | tee -a "$LOG_FILE"
    sleep 2
done

diff --git a/test.html b/test.html
new file mode 100644
index 0000000..41474d5
--- /dev/null
+++ b/test.html
@@ -0,0 +1,37 @@

Test JavaScript Loading


JavaScript Test

+ +
+ + + + + + + \ No newline at end of file