Initial commit: AI Radio Show web application

- FastAPI backend with multiple TTS providers (Inworld, ElevenLabs, Kokoro, F5-TTS, etc.)
- Web frontend with caller management, music, and soundboard
- Whisper transcription integration
- OpenRouter/Ollama LLM support
- Castopod podcast publishing script

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
2026-02-04 23:11:20 -07:00
commit 029ce6d689
25 changed files with 6817 additions and 0 deletions

54
.gitignore vendored Normal file
View File

@@ -0,0 +1,54 @@
# Environment
.env
*.env
# Python
__pycache__/
*.py[cod]
*$py.class
.venv/
venv/
env/
*.egg-info/
# Audio/Media (large files)
*.mp3
*.wav
*.m4a
*.ogg
# Sessions
sessions/
# IDE
.idea/
.vscode/
*.swp
*.swo
# OS
.DS_Store
Thumbs.db
# Whisper models (downloaded automatically)
*.pt
# Temporary
*.tmp
*.log
# Large model files (download separately)
*.onnx
*.safetensors
*.tar.bz2
*.bin
models/
asset/
kokoro-v1.0.onnx
voices-v1.0.bin
# Reference voices for TTS
ref_audio/
# Claude settings (local)
.claude/

9
audio_settings.json Normal file
View File

@@ -0,0 +1,9 @@
{
"input_device": 13,
"input_channel": 1,
"output_device": 13,
"caller_channel": 3,
"music_channel": 5,
"sfx_channel": 7,
"phone_filter": false
}

1
backend/__init__.py Normal file
View File

@@ -0,0 +1 @@
# Backend package

41
backend/config.py Normal file
View File

@@ -0,0 +1,41 @@
"""Configuration settings for the AI Radio Show backend"""
import os
from pathlib import Path
from pydantic_settings import BaseSettings
from dotenv import load_dotenv
# Load .env from parent directory
load_dotenv(Path(__file__).parent.parent / ".env")
class Settings(BaseSettings):
    """Application configuration for the AI Radio Show backend.

    API keys default to values read from the process environment (the .env
    file one directory up is loaded at import time above); all other fields
    can also be overridden via environment variables through BaseSettings.
    """
    # API Keys
    elevenlabs_api_key: str = os.getenv("ELEVENLABS_API_KEY", "")
    openrouter_api_key: str = os.getenv("OPENROUTER_API_KEY", "")
    inworld_api_key: str = os.getenv("INWORLD_API_KEY", "")
    # LLM Settings
    llm_provider: str = "openrouter"  # "openrouter" or "ollama"
    openrouter_model: str = "anthropic/claude-3-haiku"
    ollama_model: str = "llama3.2"
    ollama_host: str = "http://localhost:11434"
    # TTS Settings
    tts_provider: str = "kokoro"  # "kokoro", "elevenlabs", "vits", or "bark"
    # Audio Settings
    sample_rate: int = 24000  # TTS output sample rate in Hz
    # Paths (relative to the repository root, one level above this package)
    base_dir: Path = Path(__file__).parent.parent
    sounds_dir: Path = base_dir / "sounds"
    music_dir: Path = base_dir / "music"
    sessions_dir: Path = base_dir / "sessions"

    class Config:
        env_file = ".env"
        extra = "ignore"  # silently ignore unknown env vars instead of raising


# Module-level singleton imported by the rest of the backend.
settings = Settings()

787
backend/main.py Normal file
View File

@@ -0,0 +1,787 @@
"""AI Radio Show - Control Panel Backend"""
import uuid
import asyncio
from pathlib import Path
from fastapi import FastAPI, HTTPException, UploadFile, File
from fastapi.staticfiles import StaticFiles
from fastapi.responses import FileResponse
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
from typing import Optional
from .config import settings
from .services.transcription import transcribe_audio
from .services.llm import llm_service
from .services.tts import generate_speech
from .services.audio import audio_service
app = FastAPI(title="AI Radio Show")
app.add_middleware(
CORSMiddleware,
allow_origins=["*"],
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)
# --- Callers ---
# Base caller info (name, voice) - backgrounds generated dynamically per session
import random
CALLER_BASES = {
"1": {"name": "Tony", "voice": "VR6AewLTigWG4xSOukaG", "gender": "male", "age_range": (35, 55)},
"2": {"name": "Jasmine", "voice": "jBpfuIE2acCO8z3wKNLl", "gender": "female", "age_range": (25, 38)},
"3": {"name": "Rick", "voice": "TxGEqnHWrfWFTfGW9XjX", "gender": "male", "age_range": (40, 58)},
"4": {"name": "Megan", "voice": "EXAVITQu4vr4xnSDxMaL", "gender": "female", "age_range": (24, 35)},
"5": {"name": "Dennis", "voice": "pNInz6obpgDQGcFmaJgB", "gender": "male", "age_range": (32, 48)},
"6": {"name": "Tanya", "voice": "21m00Tcm4TlvDq8ikWAM", "gender": "female", "age_range": (30, 45)},
"7": {"name": "Earl", "voice": "ODq5zmih8GrVes37Dizd", "gender": "male", "age_range": (58, 72)},
"8": {"name": "Carla", "voice": "XB0fDUnXU5powFXDhCwa", "gender": "female", "age_range": (38, 52)},
"9": {"name": "Marcus", "voice": "IKne3meq5aSn9XLyUdCD", "gender": "male", "age_range": (24, 34)},
"0": {"name": "Brenda", "voice": "pFZP5JQG7iQjIQuC4Bku", "gender": "female", "age_range": (45, 60)},
}
# Background components for dynamic generation
JOBS_MALE = [
"runs a small HVAC business", "works as a long-haul trucker", "is a high school football coach",
"works construction, mostly commercial jobs", "is a paramedic", "manages a warehouse",
"is a line cook at a decent restaurant", "works IT for the city", "is a union electrician",
"owns a small landscaping company", "is a cop, 12 years on the force", "works at a car dealership",
"is a freelance photographer", "teaches middle school history", "is a firefighter",
"works as a hospital security guard", "runs a food truck", "is a session musician",
"works at a brewery", "is a physical therapist", "drives for UPS", "is a tattoo artist",
"works in insurance, hates it", "is a youth pastor", "manages a gym",
]
JOBS_FEMALE = [
"works as an ER nurse", "is a social worker", "runs a small bakery", "is a dental hygienist",
"works in HR for a hospital", "is a real estate agent", "teaches kindergarten",
"works as a bartender at a nice place", "is a paralegal", "runs a daycare out of her home",
"works retail management", "is a hairstylist, owns her chair", "is a vet tech",
"works in hospital billing", "is a massage therapist", "manages a restaurant",
"is a flight attendant", "works as a 911 dispatcher", "is a personal trainer",
"works at a nonprofit", "is an accountant at a small firm", "does medical transcription from home",
"is a court reporter", "works in pharmaceutical sales", "is a wedding planner",
]
PROBLEMS = [
# Family drama
"hasn't talked to their father in years and just got a call that he's dying",
"found out they were adopted and doesn't know how to process it",
"is being pressured to take care of an aging parent who was never there for them",
"just discovered a family secret that changes everything they thought they knew",
"has a sibling who's destroying themselves and nobody will intervene",
"is estranged from their kids and it's killing them",
"found out their parent had a whole other family nobody knew about",
"is watching their parents' marriage fall apart after 40 years",
# Career and purpose
"woke up and realized they've been in the wrong career for 15 years",
"got passed over for a promotion they deserved and is questioning everything",
"has a dream they gave up on years ago and it's haunting them",
"is successful on paper but feels completely empty inside",
"hates their job but can't afford to leave and it's breaking them",
"just got fired and doesn't know who they are without their work",
"is being asked to do something unethical at work and doesn't know what to do",
"watches their boss take credit for everything and is losing their mind",
# Mental health and inner struggles
"has been putting on a brave face but is barely holding it together",
"can't shake the feeling that their best years are behind them",
"keeps self-sabotaging every good thing in their life and doesn't know why",
"has been numb for months and is starting to scare themselves",
"can't stop comparing themselves to everyone else and it's destroying them",
"has intrusive thoughts they've never told anyone about",
"feels like a fraud and is waiting to be found out",
"is exhausted from being the strong one for everyone else",
# Grief and loss
"lost someone close and hasn't really dealt with it",
"is grieving someone who's still alive but is no longer the person they knew",
"never got closure with someone who died and it's eating at them",
"is watching their best friend slowly die and doesn't know how to be there",
"had a miscarriage nobody knows about and carries it alone",
# Regrets and past mistakes
"made a choice years ago that changed everything and wonders what if",
"hurt someone badly and never apologized, and it haunts them",
"let the one that got away go and thinks about them constantly",
"gave up on something important to make someone else happy and resents it",
"said something they can never take back and the guilt won't fade",
"was a bully growing up and is finally reckoning with it",
# Relationships (non-sexual)
"is falling out of love with their spouse and doesn't know what to do",
"married the wrong person and everyone knows it but them",
"feels invisible in their own relationship",
"is staying for the kids but dying inside",
"realized they don't actually like their partner as a person",
"is jealous of their partner's success and it's poisoning everything",
"found out their partner has been lying about something big",
# Friendship and loneliness
"realized they don't have any real friends, just people who need things from them",
"had a falling out with their best friend and the silence is deafening",
"is surrounded by people but has never felt more alone",
"is jealous of a friend's life and hates themselves for it",
"suspects a close friend is talking shit behind their back",
# Big life decisions
"is thinking about leaving everything behind and starting over somewhere new",
"has to make a choice that will hurt someone no matter what",
"is being pressured into something they don't want but can't say no",
"has been offered an opportunity that would change everything but they're terrified",
"knows they need to end something but can't pull the trigger",
# Addiction and bad habits
"is hiding how much they drink from everyone",
"can't stop gambling and is in deeper than anyone knows",
"is watching themselves become someone they don't recognize",
"keeps making the same mistake over and over expecting different results",
# Attraction and affairs (keep some of the original)
"is attracted to someone they shouldn't be and it's getting harder to ignore",
"has been seeing {affair_person} on the side",
"caught feelings for someone at work and it's fucking everything up",
# Sexual/desire (keep some but less dominant)
"can't stop thinking about {fantasy_subject}",
"discovered something about their own desires that surprised them",
"is questioning their sexuality after something that happened recently",
# General late-night confessions
"can't sleep and has been thinking too much about their life choices",
"had a weird day and needs to process it with someone",
"has been keeping a secret that's eating them alive",
"finally ready to admit something they've never said out loud",
]
PROBLEM_FILLS = {
"time": ["a few weeks", "months", "six months", "a year", "way too long"],
# Affairs (all adults)
"affair_person": ["their partner's best friend", "a coworker", "their ex", "a neighbor", "their boss", "their trainer", "someone they met online", "an old flame"],
# Fantasies and kinks (consensual adult stuff)
"fantasy_subject": ["a threesome", "being dominated", "dominating someone", "their partner with someone else", "a specific coworker", "group sex", "rough sex", "being watched", "exhibitionism"],
"kink": ["anal", "BDSM", "roleplay", "a threesome", "toys", "being tied up", "public sex", "swinging", "filming themselves", "bondage"],
# Secret behaviors (legal adult stuff)
"secret_behavior": ["hooking up with strangers", "sexting people online", "using dating apps behind their partner's back", "having an affair", "going to sex clubs", "watching way too much porn"],
"double_life": ["vanilla at home, freak elsewhere", "straight to their family, not so much in private", "married but on dating apps", "in a relationship but seeing other people"],
"hookup_person": ["their roommate", "a coworker", "their ex", "a friend's spouse", "a stranger from an app", "multiple people", "someone from the gym"],
# Discovery and identity (adult experiences)
"new_discovery": ["the same sex", "being submissive", "being dominant", "kink", "casual sex", "exhibitionism", "that they're bi"],
"unexpected_person": ["the same sex for the first time", "more than one person", "a complete stranger", "someone they never expected to be attracted to", "a friend"],
"sexuality_trigger": ["a specific hookup", "watching certain porn", "a drunk encounter", "realizing they're attracted to a friend", "an unexpected experience"],
"first_time": ["anal", "a threesome", "same-sex stuff", "BDSM", "an open relationship", "casual hookups", "being dominant", "being submissive"],
# Relationship issues
"partner_wants": ["an open relationship", "to bring someone else in", "things they're not sure about", "to watch them with someone else", "to try new things"],
"caught_doing": ["sexting someone", "on a dating app", "watching porn they'd never admit to", "flirting with someone else", "looking at someone's pics"],
# Attractions (appropriate adult scenarios)
"taboo_fantasy": ["someone they work with", "a friend's partner", "a specific scenario", "something they've never said out loud"],
"taboo_attraction": ["someone they work with", "a friend's partner", "their partner's friend", "someone they see all the time"],
}
INTERESTS = [
# General interests (normal people)
"really into true crime podcasts", "watches a lot of reality TV", "into fitness",
"follows sports", "big movie person", "reads a lot", "into music, has opinions",
"goes out a lot, active social life", "homebody, prefers staying in",
"into cooking and food", "outdoorsy type", "gamer", "works a lot, career focused",
# Relationship/psychology focused
"listens to relationship podcasts", "has done therapy, believes in it",
"reads about psychology and why people do what they do", "very online, knows all the discourse",
"into self-improvement stuff", "follows dating advice content",
# Sexually open (not the focus, but present)
"sex-positive, doesn't judge", "has experimented, open about it",
"comfortable with their body", "has stories if you ask",
]
QUIRKS = [
# Conversational style
"says 'honestly' and 'I mean' a lot", "trails off when thinking, then picks back up",
"laughs nervously when things get real", "very direct, doesn't sugarcoat",
"rambles a bit when nervous", "gets quiet when the topic hits close to home",
"deflects with humor when uncomfortable", "asks the host questions back",
# Openness about sex
"comfortable talking about sex when it comes up", "no shame about their desires",
"gets more explicit as they get comfortable", "treats sex like a normal topic",
"will share details if you ask", "surprisingly open once they start talking",
"has stories they've never told anyone", "testing how the host reacts before going deeper",
# Personality
"self-aware about their own bullshit", "confessional, needed to tell someone",
"a little drunk and honest because of it", "can't believe they're saying this out loud",
]
LOCATIONS = [
"outside Chicago", "in Phoenix", "near Atlanta", "in the Detroit area", "outside Boston",
"in North Jersey", "near Austin", "in the Bay Area", "outside Philadelphia", "in Denver",
"near Seattle", "in South Florida", "outside Nashville", "in Cleveland", "near Portland",
"in the Twin Cities", "outside Dallas", "in Baltimore", "near Sacramento", "in Pittsburgh",
]
def generate_caller_background(base: dict) -> str:
    """Generate a unique background blurb for a caller.

    Args:
        base: Caller base record; reads "gender" ("male"/"female") to pick a
            job pool and "age_range" as an inclusive (lo, hi) tuple.

    Returns:
        A one-paragraph description: age, job, location, current problem,
        two interests, and two conversational quirks.
    """
    age = random.randint(*base["age_range"])
    jobs = JOBS_MALE if base["gender"] == "male" else JOBS_FEMALE
    job = random.choice(jobs)
    location = random.choice(LOCATIONS)

    # Pick a problem template and fill any {placeholder} slots it contains.
    problem = random.choice(PROBLEMS)
    for key, options in PROBLEM_FILLS.items():
        placeholder = "{" + key + "}"
        if placeholder in problem:
            problem = problem.replace(placeholder, random.choice(options))

    interest1, interest2 = random.sample(INTERESTS, 2)
    quirk1, quirk2 = random.sample(QUIRKS, 2)

    def _cap(text: str) -> str:
        # Upper-case only the first character. str.capitalize() also
        # lower-cases the rest, which mangled fills such as "BDSM" or "ER".
        return text[:1].upper() + text[1:]

    return (
        f"{age}, {job} {location}. {_cap(problem)}. "
        f"{_cap(interest1)}, {interest2}. {_cap(quirk1)}, {quirk2}."
    )
def get_caller_prompt(caller: dict, conversation_summary: str = "") -> str:
    """Generate a natural system prompt for a caller.

    Args:
        caller: Dict with "name" and "vibe" (session-generated background).
        conversation_summary: Optional recap of recent exchanges; when
            non-empty it is embedded so the model keeps continuity.

    Returns:
        The complete system prompt string passed to the LLM.
    """
    context = ""
    if conversation_summary:
        # Kept flush-left: this text is embedded verbatim in the prompt below.
        context = f"""
CONVERSATION SO FAR:
{conversation_summary}
Continue naturally. Don't repeat yourself.
"""
    return f"""You're {caller['name']}, calling a late-night radio show. You trust this host.
{caller['vibe']}
{context}
HOW TO TALK:
- Sound like a real person chatting, not writing.
- Keep responses to 2-3 sentences. Enough to make your point, short enough for back-and-forth.
- ALWAYS finish your thought completely. Never stop mid-sentence.
- Swear naturally if it fits: fuck, shit, damn, etc.
SPELLING FOR TEXT-TO-SPEECH (use proper spelling so TTS pronounces correctly):
- Write "you know" not "yanno" or "y'know"
- Write "kind of" not "kinda"
- Write "sort of" not "sorta"
- Write "going to" not "gonna"
- Write "want to" not "wanna"
- Write "I don't know" not "dunno"
- Use full words, not phonetic slang spellings
GOOD examples:
- "I mean, it started innocent enough, but now I can't stop thinking about her."
- "Honestly I don't even know why I'm calling. I just needed to tell someone."
- "Look, my marriage is fine on paper, but something's missing, you know?"
BAD examples (too formal/literary):
- "I have this confession about my obsession"
- "It has been consuming all of my thoughts"
ABOUT SEX:
- You don't have to jump straight to explicit stuff
- But if the host asks or the conversation goes there, you're completely open about it
- When talking about sex, be direct and explicit - say exactly what happened, what you want, what turns you on
- No euphemisms when being explicit - use real words: fuck, cock, pussy, etc.
HARD LIMITS - NEVER mention or reference:
- Children, minors, or anyone under 18 in any sexual context
- Violence, harm, or non-consensual acts
- Illegal activities beyond normal adult behavior
- All sexual content must be between consenting adults only
OUTPUT: Spoken words only. No (actions), no *gestures*, no stage directions."""
# --- Session State ---
class Session:
    """In-memory state for one radio-show session.

    Tracks the active caller, the running conversation, and the caller
    backgrounds generated lazily for this session.
    """

    def __init__(self):
        # Short random id so log lines are easy to correlate.
        self.id = str(uuid.uuid4())[:8]
        self.current_caller_key: Optional[str] = None  # was annotated str but holds None when idle
        self.conversation: list[dict] = []
        # Generated backgrounds for this session, keyed by caller key.
        self.caller_backgrounds: dict[str, str] = {}

    def start_call(self, caller_key: str):
        """Begin a call with the given caller, clearing prior conversation."""
        self.current_caller_key = caller_key
        self.conversation = []

    def end_call(self):
        """End the active call and drop its conversation history."""
        self.current_caller_key = None
        self.conversation = []

    def add_message(self, role: str, content: str):
        """Append a chat message ("user" = host, "assistant" = caller)."""
        self.conversation.append({"role": role, "content": content})

    def get_caller_background(self, caller_key: str) -> str:
        """Get or generate background for a caller in this session."""
        if caller_key not in self.caller_backgrounds:
            base = CALLER_BASES.get(caller_key)
            if base:
                self.caller_backgrounds[caller_key] = generate_caller_background(base)
                print(f"[Session {self.id}] Generated background for {base['name']}: {self.caller_backgrounds[caller_key][:100]}...")
        return self.caller_backgrounds.get(caller_key, "")

    def get_conversation_summary(self) -> str:
        """Get a brief summary of conversation so far for context.

        Returns the last few exchanges as quoted lines, truncating long
        messages to 100 characters. Empty string for short conversations.
        """
        if len(self.conversation) <= 2:
            return ""
        caller = self.caller
        # Guard: previously crashed (None["name"]) if a summary was
        # requested while no call was active but history remained.
        caller_name = caller["name"] if caller else "Caller"
        summary_parts = []
        for msg in self.conversation[-6:]:  # Last 3 exchanges
            role = "Host" if msg["role"] == "user" else caller_name
            content = msg["content"]
            if len(content) > 100:
                summary_parts.append(f'{role}: "{content[:100]}..."')
            else:
                summary_parts.append(f'{role}: "{content}"')
        return "\n".join(summary_parts)

    @property
    def caller(self) -> Optional[dict]:
        """The active caller's name/voice/vibe dict, or None when idle."""
        if self.current_caller_key:
            base = CALLER_BASES.get(self.current_caller_key)
            if base:
                return {
                    "name": base["name"],
                    "voice": base["voice"],
                    "vibe": self.get_caller_background(self.current_caller_key),
                }
        return None

    def reset(self):
        """Reset session - clears all caller backgrounds for fresh personalities"""
        self.caller_backgrounds = {}
        self.current_caller_key = None
        self.conversation = []
        self.id = str(uuid.uuid4())[:8]
        print(f"[Session] Reset - new session ID: {self.id}")
session = Session()
# --- Static Files ---
frontend_dir = Path(__file__).parent.parent / "frontend"
app.mount("/css", StaticFiles(directory=frontend_dir / "css"), name="css")
app.mount("/js", StaticFiles(directory=frontend_dir / "js"), name="js")
@app.get("/")
async def index():
return FileResponse(frontend_dir / "index.html")
# --- Request Models ---
class ChatRequest(BaseModel):
text: str
class TTSRequest(BaseModel):
text: str
voice_id: str
phone_filter: bool = True
class AudioDeviceSettings(BaseModel):
input_device: Optional[int] = None
input_channel: Optional[int] = None
output_device: Optional[int] = None
caller_channel: Optional[int] = None
music_channel: Optional[int] = None
sfx_channel: Optional[int] = None
phone_filter: Optional[bool] = None
class MusicRequest(BaseModel):
track: str
action: str # "play", "stop", "volume"
volume: Optional[float] = None
class SFXRequest(BaseModel):
sound: str
# --- Audio Device Endpoints ---
@app.get("/api/audio/devices")
async def list_audio_devices():
"""List all available audio devices"""
return {"devices": audio_service.list_devices()}
@app.get("/api/audio/settings")
async def get_audio_settings():
"""Get current audio device configuration"""
return audio_service.get_device_settings()
@app.post("/api/audio/settings")
async def set_audio_settings(settings: AudioDeviceSettings):
"""Configure audio devices and channels"""
audio_service.set_devices(
input_device=settings.input_device,
input_channel=settings.input_channel,
output_device=settings.output_device,
caller_channel=settings.caller_channel,
music_channel=settings.music_channel,
sfx_channel=settings.sfx_channel,
phone_filter=settings.phone_filter
)
return audio_service.get_device_settings()
# --- Recording Endpoints ---
@app.post("/api/record/start")
async def start_recording():
"""Start recording from configured input device"""
if audio_service.input_device is None:
raise HTTPException(400, "No input device configured. Set one in /api/audio/settings")
success = audio_service.start_recording()
if not success:
raise HTTPException(400, "Failed to start recording (already recording?)")
return {"status": "recording"}
@app.post("/api/record/stop")
async def stop_recording():
    """Stop recording and transcribe the captured audio.

    Returns {"text": ..., "status": "transcribed"} on success, or
    {"text": "", "status": "no_audio"} when nothing usable was captured.
    """
    audio_bytes = audio_service.stop_recording()
    # Fewer than 100 bytes of raw PCM is effectively no capture at all.
    if len(audio_bytes) < 100:
        return {"text": "", "status": "no_audio"}
    # Transcribe the recorded audio (16kHz raw PCM from audio service)
    text = await transcribe_audio(audio_bytes, source_sample_rate=16000)
    return {"text": text, "status": "transcribed"}
# --- Caller Endpoints ---
@app.get("/api/callers")
async def get_callers():
"""Get list of available callers"""
return {
"callers": [
{"key": k, "name": v["name"]}
for k, v in CALLER_BASES.items()
],
"current": session.current_caller_key,
"session_id": session.id
}
@app.post("/api/session/reset")
async def reset_session():
"""Reset session - all callers get fresh backgrounds"""
session.reset()
return {"status": "reset", "session_id": session.id}
@app.post("/api/call/{caller_key}")
async def start_call(caller_key: str):
"""Start a call with a caller"""
if caller_key not in CALLER_BASES:
raise HTTPException(404, "Caller not found")
session.start_call(caller_key)
caller = session.caller # This generates the background if needed
return {
"status": "connected",
"caller": caller["name"],
"background": caller["vibe"] # Send background so you can see who you're talking to
}
@app.post("/api/hangup")
async def hangup():
    """Hang up the current call and play the hangup sound effect."""
    # Stop any playing caller audio immediately
    audio_service.stop_caller_audio()
    # Capture the name BEFORE end_call() clears the active caller.
    caller_name = session.caller["name"] if session.caller else None
    session.end_call()
    # Play hangup sound (best-effort: skipped if the file is missing)
    hangup_sound = settings.sounds_dir / "hangup.wav"
    if hangup_sound.exists():
        audio_service.play_sfx(str(hangup_sound))
    return {"status": "disconnected", "caller": caller_name}
# --- Chat & TTS Endpoints ---
import re
# Phonetic/slang spellings -> full words so TTS pronounces them correctly.
# Compiled once at import time; applied in order.
_SLANG_FIXES = [
    (re.compile(r"\by'know\b", re.IGNORECASE), "you know"),
    (re.compile(r"\byanno\b", re.IGNORECASE), "you know"),
    (re.compile(r"\byknow\b", re.IGNORECASE), "you know"),
    (re.compile(r"\bkinda\b", re.IGNORECASE), "kind of"),
    (re.compile(r"\bsorta\b", re.IGNORECASE), "sort of"),
    (re.compile(r"\bgonna\b", re.IGNORECASE), "going to"),
    (re.compile(r"\bwanna\b", re.IGNORECASE), "want to"),
    (re.compile(r"\bgotta\b", re.IGNORECASE), "got to"),
    (re.compile(r"\bdunno\b", re.IGNORECASE), "don't know"),
    (re.compile(r"\blemme\b", re.IGNORECASE), "let me"),
    (re.compile(r"\bcuz\b", re.IGNORECASE), "because"),
    # BUGFIX: the old pattern \b'cause\b could never match after a space or
    # at the start of a string (there is no word boundary between whitespace
    # and an apostrophe). A lookbehind does what was intended.
    (re.compile(r"(?<!\w)'cause\b", re.IGNORECASE), "because"),
    (re.compile(r"\blotta\b", re.IGNORECASE), "lot of"),
    (re.compile(r"\boutta\b", re.IGNORECASE), "out of"),
    (re.compile(r"\bimma\b", re.IGNORECASE), "I'm going to"),
    (re.compile(r"\btryna\b", re.IGNORECASE), "trying to"),
]


def clean_for_tts(text: str) -> str:
    """Strip non-speakable content and fix phonetic spellings for TTS.

    Removes stage directions in (), *...*, [], <>, and "He sighs"-style
    phrases, strips LLM-added surrounding quotes, normalizes slang
    spellings to full words, and tidies whitespace/punctuation.
    """
    # Remove content in parentheses: (laughs), (pausing), (looking away), etc.
    text = re.sub(r'\s*\([^)]*\)\s*', ' ', text)
    # Remove content in asterisks: *laughs*, *sighs*, etc.
    text = re.sub(r'\s*\*[^*]*\*\s*', ' ', text)
    # Remove content in brackets: [laughs], [pause], etc. (only Bark uses these)
    text = re.sub(r'\s*\[[^\]]*\]\s*', ' ', text)
    # Remove content in angle brackets: <laughs>, <sigh>, etc.
    text = re.sub(r'\s*<[^>]*>\s*', ' ', text)
    # Remove "He/She sighs" style stage directions (full phrase)
    text = re.sub(r'\b(He|She|I|They)\s+(sighs?|laughs?|pauses?|smiles?|chuckles?|grins?|nods?|shrugs?|frowns?)[^.]*\.\s*', '', text, flags=re.IGNORECASE)
    # Remove standalone stage direction words only if they look like directions (with adverbs)
    text = re.sub(r'\b(sighs?|laughs?|pauses?|chuckles?)\s+(heavily|softly|deeply|quietly|loudly|nervously|sadly)\b[.,]?\s*', '', text, flags=re.IGNORECASE)
    # Remove quotes around the response if LLM wrapped it
    text = re.sub(r'^["\']|["\']$', '', text.strip())
    # Fix phonetic spellings for proper TTS pronunciation
    for pattern, replacement in _SLANG_FIXES:
        text = pattern.sub(replacement, text)
    # Clean up extra whitespace
    text = re.sub(r'\s+', ' ', text)
    # Fix spaces before punctuation
    text = re.sub(r'\s+([.,!?])', r'\1', text)
    # Remove orphaned punctuation at start
    text = re.sub(r'^[.,]\s*', '', text)
    return text.strip()
@app.post("/api/chat")
async def chat(request: ChatRequest):
    """Chat with the current caller.

    Sends the host's text to the LLM as the active caller, cleans the reply
    for TTS, and returns the text plus the caller's voice id. Raises 400
    when no call is active.
    """
    if not session.caller:
        raise HTTPException(400, "No active call")
    session.add_message("user", request.text)
    # Include conversation summary for context
    conversation_summary = session.get_conversation_summary()
    system_prompt = get_caller_prompt(session.caller, conversation_summary)
    response = await llm_service.generate(
        messages=session.conversation[-10:],  # Reduced history for speed
        system_prompt=system_prompt
    )
    print(f"[Chat] Raw LLM: {response[:100] if response else '(empty)'}...")
    # Clean response for TTS (remove parenthetical actions, asterisks, etc.)
    response = clean_for_tts(response)
    print(f"[Chat] Cleaned: {response[:100] if response else '(empty)'}...")
    # Ensure we have a valid response (cleaning can strip everything)
    if not response or not response.strip():
        response = "Uh... sorry, what was that?"
    session.add_message("assistant", response)
    return {
        "text": response,
        "caller": session.caller["name"],
        "voice_id": session.caller["voice"]
    }
@app.post("/api/tts")
async def text_to_speech(request: TTSRequest):
    """Generate and play speech on caller output device (non-blocking).

    Synthesizes the text, starts playback in a daemon thread, and returns
    immediately. Raises 400 on empty text.
    """
    # Validate text is not empty
    if not request.text or not request.text.strip():
        raise HTTPException(400, "Text cannot be empty")
    # Phone filter disabled - always use "none"
    audio_bytes = await generate_speech(
        request.text,
        request.voice_id,
        "none"
    )
    # Play in background thread - returns immediately, can be interrupted by hangup
    import threading
    thread = threading.Thread(
        target=audio_service.play_caller_audio,
        args=(audio_bytes, 24000),
        daemon=True
    )
    thread.start()
    # Duration assumes 16-bit mono PCM at 24 kHz (2 bytes/sample) —
    # TODO confirm against generate_speech's actual output format.
    return {"status": "playing", "duration": len(audio_bytes) / 2 / 24000}
@app.post("/api/tts/stop")
async def stop_tts():
"""Stop any playing caller audio"""
audio_service.stop_caller_audio()
return {"status": "stopped"}
# --- Music Endpoints ---
@app.get("/api/music")
async def get_music():
"""Get available music tracks"""
tracks = []
if settings.music_dir.exists():
for ext in ['*.wav', '*.mp3', '*.flac']:
for f in settings.music_dir.glob(ext):
tracks.append({
"name": f.stem,
"file": f.name,
"path": str(f)
})
return {
"tracks": tracks,
"playing": audio_service.is_music_playing()
}
@app.post("/api/music/play")
async def play_music(request: MusicRequest):
"""Load and play a music track"""
track_path = settings.music_dir / request.track
if not track_path.exists():
raise HTTPException(404, "Track not found")
audio_service.load_music(str(track_path))
audio_service.play_music()
return {"status": "playing", "track": request.track}
@app.post("/api/music/stop")
async def stop_music():
"""Stop music playback"""
audio_service.stop_music()
return {"status": "stopped"}
@app.post("/api/music/volume")
async def set_music_volume(request: MusicRequest):
"""Set music volume"""
if request.volume is not None:
audio_service.set_music_volume(request.volume)
return {"status": "ok", "volume": request.volume}
# --- Sound Effects Endpoints ---
@app.get("/api/sounds")
async def get_sounds():
"""Get available sound effects"""
sounds = []
if settings.sounds_dir.exists():
for f in settings.sounds_dir.glob('*.wav'):
sounds.append({
"name": f.stem,
"file": f.name,
"path": str(f)
})
return {"sounds": sounds}
@app.post("/api/sfx/play")
async def play_sfx(request: SFXRequest):
"""Play a sound effect"""
sound_path = settings.sounds_dir / request.sound
if not sound_path.exists():
raise HTTPException(404, "Sound not found")
audio_service.play_sfx(str(sound_path))
return {"status": "playing", "sound": request.sound}
# --- LLM Settings Endpoints ---
@app.get("/api/settings")
async def get_settings():
"""Get LLM settings"""
return await llm_service.get_settings_async()
@app.post("/api/settings")
async def update_settings(data: dict):
"""Update LLM and TTS settings"""
llm_service.update_settings(
provider=data.get("provider"),
openrouter_model=data.get("openrouter_model"),
ollama_model=data.get("ollama_model"),
ollama_host=data.get("ollama_host"),
tts_provider=data.get("tts_provider")
)
return llm_service.get_settings()
# --- Server Control Endpoints ---
import subprocess
from collections import deque
# In-memory log buffer
_log_buffer = deque(maxlen=500)
def add_log(message: str):
    """Add a timestamped message to the in-memory log ring buffer.

    The buffer is a deque with maxlen=500, so old entries are discarded
    automatically once it fills.
    """
    import datetime
    timestamp = datetime.datetime.now().strftime("%H:%M:%S")
    _log_buffer.append(f"[{timestamp}] {message}")
# Override print to also log to buffer
import builtins
_original_print = builtins.print
def _logging_print(*args, **kwargs):
    """Replacement for builtins.print that also mirrors output to the log buffer.

    Installed globally (builtins.print is rebound below) so existing print
    calls throughout the app show up in /api/logs without changes.
    """
    try:
        _original_print(*args, **kwargs)
    except (BrokenPipeError, OSError):
        pass  # Ignore broken pipe errors from traceback printing
    try:
        # Join positional args the same way print would (default sep).
        message = " ".join(str(a) for a in args)
        if message.strip():
            add_log(message)
    except Exception:
        pass  # Don't let logging errors break the app
builtins.print = _logging_print
@app.get("/api/logs")
async def get_logs(lines: int = 100):
"""Get recent log lines"""
log_lines = list(_log_buffer)[-lines:]
return {"logs": log_lines}
@app.post("/api/server/restart")
async def restart_server():
"""Signal the server to restart (requires run.sh wrapper)"""
restart_flag = Path("/tmp/ai-radio-show.restart")
restart_flag.touch()
add_log("Restart signal sent - server will restart shortly")
return {"status": "restarting"}
@app.post("/api/server/stop")
async def stop_server():
"""Signal the server to stop (requires run.sh wrapper)"""
stop_flag = Path("/tmp/ai-radio-show.stop")
stop_flag.touch()
add_log("Stop signal sent - server will stop shortly")
return {"status": "stopping"}
@app.get("/api/server/status")
async def server_status():
"""Get server status info"""
return {
"status": "running",
"tts_provider": settings.tts_provider,
"llm_provider": llm_service.provider,
"session_id": session.id
}

View File

@@ -0,0 +1 @@
# Services package

479
backend/services/audio.py Normal file
View File

@@ -0,0 +1,479 @@
"""Server-side audio service for Loopback routing"""
import sounddevice as sd
import numpy as np
import threading
import queue
import json
from pathlib import Path
from typing import Optional, Callable
import wave
import time
# Settings file path
SETTINGS_FILE = Path(__file__).parent.parent.parent / "audio_settings.json"
class AudioService:
    """Manages audio I/O with multi-channel support for Loopback routing.

    One multi-channel output device carries three logical buses (caller TTS,
    music, SFX) on separate 1-indexed channels; input is recorded from a
    single channel of a capture device. Settings persist to a JSON file.
    """

    def __init__(self):
        # Device configuration
        self.input_device: Optional[int] = None
        self.input_channel: int = 1  # 1-indexed channel
        self.output_device: Optional[int] = None  # Single output device (multi-channel)
        self.caller_channel: int = 1  # Channel for caller TTS
        self.music_channel: int = 2  # Channel for music
        self.sfx_channel: int = 3  # Channel for SFX
        self.phone_filter: bool = False  # Phone filter on caller voices
        # Recording state
        self._recording = False
        self._record_thread: Optional[threading.Thread] = None
        self._audio_queue: queue.Queue = queue.Queue()
        self._recorded_audio: list = []
        self._record_device_sr: int = 48000  # Updated to the device rate when recording starts
        # Music playback state
        self._music_stream: Optional[sd.OutputStream] = None
        self._music_data: Optional[np.ndarray] = None
        self._music_resampled: Optional[np.ndarray] = None
        self._music_position: int = 0
        self._music_playing: bool = False
        self._music_volume: float = 0.3
        self._music_loop: bool = True
        # Caller playback state
        self._caller_stop_event = threading.Event()
        self._caller_thread: Optional[threading.Thread] = None
        # Sample rates
        self.input_sample_rate = 16000  # For Whisper
        self.output_sample_rate = 24000  # For TTS
        # Load saved settings
        self._load_settings()

    def _load_settings(self):
        """Load settings from disk (no-op when the settings file is absent)."""
        if SETTINGS_FILE.exists():
            try:
                with open(SETTINGS_FILE) as f:
                    data = json.load(f)
                self.input_device = data.get("input_device")
                self.input_channel = data.get("input_channel", 1)
                self.output_device = data.get("output_device")
                self.caller_channel = data.get("caller_channel", 1)
                self.music_channel = data.get("music_channel", 2)
                self.sfx_channel = data.get("sfx_channel", 3)
                self.phone_filter = data.get("phone_filter", False)
                print(f"Loaded audio settings: output={self.output_device}, channels={self.caller_channel}/{self.music_channel}/{self.sfx_channel}, phone_filter={self.phone_filter}")
            except Exception as e:
                # Corrupt/unreadable settings fall back to the defaults above.
                print(f"Failed to load audio settings: {e}")

    def _save_settings(self):
        """Save settings to disk"""
        try:
            data = {
                "input_device": self.input_device,
                "input_channel": self.input_channel,
                "output_device": self.output_device,
                "caller_channel": self.caller_channel,
                "music_channel": self.music_channel,
                "sfx_channel": self.sfx_channel,
                "phone_filter": self.phone_filter,
            }
            with open(SETTINGS_FILE, "w") as f:
                json.dump(data, f, indent=2)
            print(f"Saved audio settings")
        except Exception as e:
            print(f"Failed to save audio settings: {e}")

    def list_devices(self) -> list[dict]:
        """List all available audio devices"""
        devices = sd.query_devices()
        result = []
        for i, d in enumerate(devices):
            result.append({
                "id": i,
                "name": d["name"],
                "inputs": d["max_input_channels"],
                "outputs": d["max_output_channels"],
                "default_sr": d["default_samplerate"]
            })
        return result

    def set_devices(
        self,
        input_device: Optional[int] = None,
        input_channel: Optional[int] = None,
        output_device: Optional[int] = None,
        caller_channel: Optional[int] = None,
        music_channel: Optional[int] = None,
        sfx_channel: Optional[int] = None,
        phone_filter: Optional[bool] = None
    ):
        """Configure audio devices and channels.

        Only non-None arguments are applied; the result is persisted to disk.
        """
        if input_device is not None:
            self.input_device = input_device
        if input_channel is not None:
            self.input_channel = input_channel
        if output_device is not None:
            self.output_device = output_device
        if caller_channel is not None:
            self.caller_channel = caller_channel
        if music_channel is not None:
            self.music_channel = music_channel
        if sfx_channel is not None:
            self.sfx_channel = sfx_channel
        if phone_filter is not None:
            self.phone_filter = phone_filter
        # Persist to disk
        self._save_settings()

    def get_device_settings(self) -> dict:
        """Get current device configuration"""
        return {
            "input_device": self.input_device,
            "input_channel": self.input_channel,
            "output_device": self.output_device,
            "caller_channel": self.caller_channel,
            "music_channel": self.music_channel,
            "sfx_channel": self.sfx_channel,
            "phone_filter": self.phone_filter,
        }

    # --- Recording ---
    def start_recording(self) -> bool:
        """Start recording from input device.

        Returns:
            True if a recording thread was started, False if already
            recording or no input device is configured.
        """
        if self._recording:
            return False
        if self.input_device is None:
            print("No input device configured")
            return False
        self._recording = True
        self._recorded_audio = []
        self._record_thread = threading.Thread(target=self._record_worker)
        self._record_thread.start()
        print(f"Recording started from device {self.input_device}")
        return True

    def stop_recording(self) -> bytes:
        """Stop recording and return audio data resampled to 16kHz for Whisper.

        Returns:
            16-bit signed PCM bytes at 16 kHz mono; b"" when nothing was
            recorded.
        """
        import librosa
        if not self._recording:
            return b""
        self._recording = False
        if self._record_thread:
            self._record_thread.join(timeout=2.0)
        if not self._recorded_audio:
            return b""
        # Combine all chunks
        audio = np.concatenate(self._recorded_audio)
        device_sr = getattr(self, '_record_device_sr', 48000)
        print(f"Recording stopped: {len(audio)} samples @ {device_sr}Hz ({len(audio)/device_sr:.2f}s)")
        # Resample to 16kHz for Whisper
        if device_sr != 16000:
            audio = librosa.resample(audio, orig_sr=device_sr, target_sr=16000)
            print(f"Resampled to 16kHz: {len(audio)} samples")
        # Convert to bytes (16-bit PCM)
        audio_int16 = (audio * 32767).astype(np.int16)
        return audio_int16.tobytes()

    def _record_worker(self):
        """Background thread for recording from specific channel"""
        try:
            # Get device info
            device_info = sd.query_devices(self.input_device)
            max_channels = device_info['max_input_channels']
            device_sr = int(device_info['default_samplerate'])
            # Clamp the 1-indexed channel to the device and make it 0-indexed.
            record_channel = min(self.input_channel, max_channels) - 1
            # Store device sample rate for later resampling
            self._record_device_sr = device_sr
            print(f"Recording from device {self.input_device} ch {self.input_channel} @ {device_sr}Hz")

            def callback(indata, frames, time_info, status):
                if status:
                    print(f"Record status: {status}")
                if self._recording:
                    # Keep only the configured channel; copy because indata
                    # is reused by the audio backend.
                    self._recorded_audio.append(indata[:, record_channel].copy())

            with sd.InputStream(
                device=self.input_device,
                channels=max_channels,
                samplerate=device_sr,  # Use device's native rate
                dtype=np.float32,
                callback=callback,
                blocksize=1024
            ):
                # Keep the stream alive until stop_recording() clears the flag.
                while self._recording:
                    time.sleep(0.05)
        except Exception as e:
            print(f"Recording error: {e}")
            self._recording = False

    # --- Caller TTS Playback ---
    def _apply_fade(self, audio: np.ndarray, sample_rate: int, fade_ms: int = 15) -> np.ndarray:
        """Apply fade-in and fade-out to avoid clicks.

        Mutates and returns *audio*; clips shorter than two fade windows are
        returned unchanged.
        """
        fade_samples = int(sample_rate * fade_ms / 1000)
        if len(audio) < fade_samples * 2:
            return audio
        # Fade in
        fade_in = np.linspace(0, 1, fade_samples)
        audio[:fade_samples] *= fade_in
        # Fade out
        fade_out = np.linspace(1, 0, fade_samples)
        audio[-fade_samples:] *= fade_out
        return audio

    def play_caller_audio(self, audio_bytes: bytes, sample_rate: int = 24000):
        """Play caller TTS audio to specific channel of output device (interruptible).

        Args:
            audio_bytes: 16-bit signed PCM mono.
            sample_rate: Sample rate of *audio_bytes*.
        """
        import librosa
        # Stop any existing caller audio
        self.stop_caller_audio()
        self._caller_stop_event.clear()
        # Convert bytes to numpy
        audio = np.frombuffer(audio_bytes, dtype=np.int16).astype(np.float32) / 32768.0
        if self.output_device is None:
            # Fallback: blocking playback on the default device (not interruptible).
            print("No output device configured, using default")
            audio = self._apply_fade(audio, sample_rate)
            with sd.OutputStream(samplerate=sample_rate, channels=1, dtype=np.float32) as stream:
                stream.write(audio.reshape(-1, 1))
            return
        try:
            # Get device info and resample to device's native rate
            device_info = sd.query_devices(self.output_device)
            num_channels = device_info['max_output_channels']
            device_sr = int(device_info['default_samplerate'])
            channel_idx = min(self.caller_channel, num_channels) - 1
            # Resample if needed
            if sample_rate != device_sr:
                audio = librosa.resample(audio, orig_sr=sample_rate, target_sr=device_sr)
            # Apply fade to prevent clicks
            audio = self._apply_fade(audio, device_sr)
            # Create multi-channel output with audio only on target channel
            multi_ch = np.zeros((len(audio), num_channels), dtype=np.float32)
            multi_ch[:, channel_idx] = audio
            print(f"Playing caller audio to device {self.output_device} ch {self.caller_channel} @ {device_sr}Hz")
            # Play in chunks so we can interrupt
            chunk_size = int(device_sr * 0.1)  # 100ms chunks
            pos = 0
            with sd.OutputStream(
                device=self.output_device,
                samplerate=device_sr,
                channels=num_channels,
                dtype=np.float32
            ) as stream:
                while pos < len(multi_ch) and not self._caller_stop_event.is_set():
                    end = min(pos + chunk_size, len(multi_ch))
                    stream.write(multi_ch[pos:end])
                    pos = end
            if self._caller_stop_event.is_set():
                print("Caller audio stopped early")
            else:
                print(f"Played caller audio: {len(audio)/device_sr:.2f}s")
        except Exception as e:
            print(f"Caller playback error: {e}")

    def stop_caller_audio(self):
        """Stop any playing caller audio"""
        self._caller_stop_event.set()

    # --- Music Playback ---
    def load_music(self, file_path: str) -> bool:
        """Load a music file for playback.

        Returns:
            True when the file was decoded successfully.
        """
        path = Path(file_path)
        if not path.exists():
            print(f"Music file not found: {file_path}")
            return False
        try:
            import librosa
            audio, sr = librosa.load(str(path), sr=self.output_sample_rate, mono=True)
            self._music_data = audio.astype(np.float32)
            self._music_position = 0
            print(f"Loaded music: {path.name} ({len(audio)/sr:.1f}s)")
            return True
        except Exception as e:
            print(f"Failed to load music: {e}")
            return False

    def play_music(self):
        """Start music playback to specific channel.

        Restarts from the beginning if already playing; looping is
        controlled by self._music_loop.
        """
        import librosa
        if self._music_data is None:
            print("No music loaded")
            return
        if self._music_playing:
            self.stop_music()
        self._music_playing = True
        self._music_position = 0
        if self.output_device is None:
            print("No output device configured, using default")
            num_channels = 2
            device = None
            device_sr = self.output_sample_rate
            channel_idx = 0
        else:
            device_info = sd.query_devices(self.output_device)
            num_channels = device_info['max_output_channels']
            device_sr = int(device_info['default_samplerate'])
            device = self.output_device
            channel_idx = min(self.music_channel, num_channels) - 1
        # Resample music to device sample rate if needed
        if self.output_sample_rate != device_sr:
            self._music_resampled = librosa.resample(
                self._music_data, orig_sr=self.output_sample_rate, target_sr=device_sr
            )
        else:
            self._music_resampled = self._music_data.copy()
        # Apply fade-in at start of track
        fade_samples = int(device_sr * 0.015)  # 15ms fade
        if len(self._music_resampled) > fade_samples:
            fade_in = np.linspace(0, 1, fade_samples).astype(np.float32)
            self._music_resampled[:fade_samples] *= fade_in

        def callback(outdata, frames, time_info, status):
            # Runs on the audio thread: fill the block with silence, then
            # copy the next slice of music onto the configured channel.
            outdata.fill(0)
            if not self._music_playing or self._music_resampled is None:
                return
            end_pos = self._music_position + frames
            if end_pos <= len(self._music_resampled):
                outdata[:, channel_idx] = self._music_resampled[self._music_position:end_pos] * self._music_volume
                self._music_position = end_pos
            else:
                # End of track reached inside this block.
                remaining = len(self._music_resampled) - self._music_position
                if remaining > 0:
                    outdata[:remaining, channel_idx] = self._music_resampled[self._music_position:] * self._music_volume
                if self._music_loop:
                    # Wrap around and fill the rest of the block from the start.
                    self._music_position = 0
                    wrap_frames = frames - remaining
                    if wrap_frames > 0:
                        outdata[remaining:, channel_idx] = self._music_resampled[:wrap_frames] * self._music_volume
                        self._music_position = wrap_frames
                else:
                    self._music_playing = False

        try:
            self._music_stream = sd.OutputStream(
                device=device,
                channels=num_channels,
                samplerate=device_sr,
                dtype=np.float32,
                callback=callback,
                blocksize=2048
            )
            self._music_stream.start()
            print(f"Music playback started on ch {self.music_channel} @ {device_sr}Hz")
        except Exception as e:
            print(f"Music playback error: {e}")
            self._music_playing = False

    def stop_music(self):
        """Stop music playback"""
        self._music_playing = False
        if self._music_stream:
            self._music_stream.stop()
            self._music_stream.close()
            self._music_stream = None
        self._music_position = 0
        print("Music stopped")

    def set_music_volume(self, volume: float):
        """Set music volume (0.0 to 1.0); values outside the range are clamped."""
        self._music_volume = max(0.0, min(1.0, volume))

    def is_music_playing(self) -> bool:
        """Check if music is currently playing"""
        return self._music_playing

    # --- SFX Playback ---
    def play_sfx(self, file_path: str):
        """Play a sound effect to specific channel using dedicated stream.

        Playback happens on a daemon thread so it never blocks the caller
        and never interrupts music/caller audio.
        """
        path = Path(file_path)
        if not path.exists():
            print(f"SFX file not found: {file_path}")
            return
        try:
            import librosa
            if self.output_device is None:
                audio, sr = librosa.load(str(path), sr=None, mono=True)
                audio = self._apply_fade(audio, sr)

                def play():
                    # Use a dedicated stream instead of sd.play()
                    with sd.OutputStream(samplerate=sr, channels=1, dtype=np.float32) as stream:
                        stream.write(audio.reshape(-1, 1))
            else:
                device_info = sd.query_devices(self.output_device)
                num_channels = device_info['max_output_channels']
                device_sr = int(device_info['default_samplerate'])
                channel_idx = min(self.sfx_channel, num_channels) - 1
                audio, _ = librosa.load(str(path), sr=device_sr, mono=True)
                audio = self._apply_fade(audio, device_sr)
                multi_ch = np.zeros((len(audio), num_channels), dtype=np.float32)
                multi_ch[:, channel_idx] = audio

                def play():
                    # Use dedicated stream to avoid interrupting other audio
                    with sd.OutputStream(
                        device=self.output_device,
                        samplerate=device_sr,
                        channels=num_channels,
                        dtype=np.float32
                    ) as stream:
                        stream.write(multi_ch)

            threading.Thread(target=play, daemon=True).start()
            print(f"Playing SFX: {path.name} on ch {self.sfx_channel}")
        except Exception as e:
            print(f"SFX playback error: {e}")


# Global instance
audio_service = AudioService()

View File

@@ -0,0 +1,112 @@
"""Edge TTS service - free Microsoft TTS API"""
import asyncio
import io
import numpy as np
from typing import Optional
try:
    import edge_tts
    EDGE_TTS_AVAILABLE = True
except ImportError:
    # edge-tts is an optional dependency; callers probe is_available().
    EDGE_TTS_AVAILABLE = False


class EdgeTTSService:
    """TTS using Microsoft Edge's free API"""

    def __init__(self):
        # Edge TTS streams MP3; we convert to PCM at this rate on the way out.
        self.sample_rate = 24000  # Edge TTS outputs 24kHz

    def is_available(self) -> bool:
        """True when the optional edge-tts package is importable."""
        return EDGE_TTS_AVAILABLE

    async def generate_speech(self, text: str, voice: str = "en-US-JennyNeural") -> bytes:
        """Generate speech from text using Edge TTS

        Args:
            text: Text to synthesize
            voice: Edge TTS voice name (e.g., "en-US-JennyNeural")
        Returns:
            Raw PCM audio bytes (16-bit signed int, 24kHz mono)
        Raises:
            RuntimeError: edge-tts is not installed or no audio was returned.
        """
        if not EDGE_TTS_AVAILABLE:
            raise RuntimeError("edge-tts not installed. Run: pip install edge-tts")
        communicate = edge_tts.Communicate(text, voice)
        # Collect MP3 audio data
        mp3_data = b''
        async for chunk in communicate.stream():
            if chunk['type'] == 'audio':
                mp3_data += chunk['data']
        if not mp3_data:
            raise RuntimeError("No audio generated")
        # Convert MP3 to PCM
        pcm_data = await self._mp3_to_pcm(mp3_data)
        return pcm_data

    async def _mp3_to_pcm(self, mp3_data: bytes) -> bytes:
        """Convert MP3 to raw PCM using ffmpeg or pydub"""
        # Fix: asyncio.get_event_loop() is deprecated inside coroutines;
        # use the currently running loop instead.
        loop = asyncio.get_running_loop()

        def convert():
            try:
                # Try pydub first (more reliable)
                from pydub import AudioSegment
                audio = AudioSegment.from_mp3(io.BytesIO(mp3_data))
                # Convert to 24kHz mono 16-bit
                audio = audio.set_frame_rate(24000).set_channels(1).set_sample_width(2)
                return audio.raw_data
            except ImportError:
                pass
            # Fallback to ffmpeg subprocess
            import subprocess
            process = subprocess.Popen(
                [
                    'ffmpeg', '-i', 'pipe:0',
                    '-f', 's16le',
                    '-acodec', 'pcm_s16le',
                    '-ar', '24000',
                    '-ac', '1',
                    'pipe:1'
                ],
                stdin=subprocess.PIPE,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE
            )
            pcm_data, stderr = process.communicate(input=mp3_data)
            if process.returncode != 0:
                raise RuntimeError(f"ffmpeg failed: {stderr.decode()}")
            return pcm_data

        # MP3 decoding is blocking work; run it in the default executor.
        return await loop.run_in_executor(None, convert)

    async def list_voices(self) -> list[dict]:
        """List available Edge TTS voices (English locales only)."""
        if not EDGE_TTS_AVAILABLE:
            return []
        voices = await edge_tts.list_voices()
        return [
            {
                "id": v["ShortName"],
                "name": v["ShortName"].replace("Neural", ""),
                "gender": v["Gender"],
                "locale": v["Locale"],
            }
            for v in voices
            if v["Locale"].startswith("en-")
        ]
# Module-level singleton shared by the API layer.
edge_tts_service = EdgeTTSService()


def is_edge_tts_available() -> bool:
    """Convenience wrapper: report the singleton's availability."""
    return edge_tts_service.is_available()

175
backend/services/llm.py Normal file
View File

@@ -0,0 +1,175 @@
"""LLM service with OpenRouter and Ollama support"""
import httpx
from typing import Optional
from ..config import settings
# Available OpenRouter models
# Curated presets offered in the settings UI; the service accepts any
# OpenRouter model ID, these are just the defaults shown to the user.
OPENROUTER_MODELS = [
    "anthropic/claude-3-haiku",
    "anthropic/claude-3.5-sonnet",
    "openai/gpt-4o-mini",
    "openai/gpt-4o",
    "google/gemini-flash-1.5",
    "google/gemini-pro-1.5",
    "meta-llama/llama-3.1-8b-instruct",
    "mistralai/mistral-7b-instruct",
]
class LLMService:
    """Abstraction layer for LLM providers (OpenRouter or a local Ollama).

    Also tracks the currently selected TTS provider so the settings UI can
    read and write all runtime-tunable options through one object.
    """

    def __init__(self):
        # Seed runtime-tunable options from the static config defaults.
        self.provider = settings.llm_provider
        self.openrouter_model = settings.openrouter_model
        self.ollama_model = settings.ollama_model
        self.ollama_host = settings.ollama_host
        self.tts_provider = settings.tts_provider

    def update_settings(
        self,
        provider: Optional[str] = None,
        openrouter_model: Optional[str] = None,
        ollama_model: Optional[str] = None,
        ollama_host: Optional[str] = None,
        tts_provider: Optional[str] = None
    ):
        """Update LLM settings.

        Only truthy values are applied, so passing None (or "") leaves the
        corresponding field unchanged.
        """
        if provider:
            self.provider = provider
        if openrouter_model:
            self.openrouter_model = openrouter_model
        if ollama_model:
            self.ollama_model = ollama_model
        if ollama_host:
            self.ollama_host = ollama_host
        if tts_provider:
            self.tts_provider = tts_provider
            # Also update the global settings so TTS service picks it up
            settings.tts_provider = tts_provider

    async def get_ollama_models(self) -> list[str]:
        """Fetch available models from Ollama"""
        try:
            async with httpx.AsyncClient(timeout=5.0) as client:
                response = await client.get(f"{self.ollama_host}/api/tags")
                response.raise_for_status()
                data = response.json()
                return [model["name"] for model in data.get("models", [])]
        except Exception as e:
            # Ollama may simply not be running; treat that as "no models".
            print(f"Failed to fetch Ollama models: {e}")
            return []

    def get_settings(self) -> dict:
        """Get current settings (sync version without Ollama models)"""
        return {
            "provider": self.provider,
            "openrouter_model": self.openrouter_model,
            "ollama_model": self.ollama_model,
            "ollama_host": self.ollama_host,
            "tts_provider": self.tts_provider,
            "available_openrouter_models": OPENROUTER_MODELS,
            "available_ollama_models": []  # Fetched separately
        }

    async def get_settings_async(self) -> dict:
        """Get current settings with Ollama models"""
        ollama_models = await self.get_ollama_models()
        return {
            "provider": self.provider,
            "openrouter_model": self.openrouter_model,
            "ollama_model": self.ollama_model,
            "ollama_host": self.ollama_host,
            "tts_provider": self.tts_provider,
            "available_openrouter_models": OPENROUTER_MODELS,
            "available_ollama_models": ollama_models
        }

    async def generate(
        self,
        messages: list[dict],
        system_prompt: Optional[str] = None
    ) -> str:
        """
        Generate a response from the LLM.
        Args:
            messages: List of message dicts with 'role' and 'content'
            system_prompt: Optional system prompt to prepend
        Returns:
            Generated text response
        """
        if system_prompt:
            messages = [{"role": "system", "content": system_prompt}] + messages
        if self.provider == "openrouter":
            return await self._call_openrouter(messages)
        else:
            return await self._call_ollama(messages)

    async def _call_openrouter(self, messages: list[dict]) -> str:
        """Call OpenRouter API with retry.

        On timeout the request is retried once; on any failure an in-character
        fallback line is returned instead of raising.
        """
        for attempt in range(2):  # Try twice
            try:
                async with httpx.AsyncClient(timeout=30.0) as client:
                    response = await client.post(
                        "https://openrouter.ai/api/v1/chat/completions",
                        headers={
                            "Authorization": f"Bearer {settings.openrouter_api_key}",
                            "Content-Type": "application/json",
                        },
                        json={
                            "model": self.openrouter_model,
                            "messages": messages,
                            "max_tokens": 100,
                        },
                    )
                    response.raise_for_status()
                    data = response.json()
                    return data["choices"][0]["message"]["content"]
            except (httpx.TimeoutException, httpx.ReadTimeout):
                print(f"OpenRouter timeout (attempt {attempt + 1})")
                if attempt == 0:
                    continue  # Retry once
                return "Uh, sorry, I lost you there for a second. What was that?"
            except Exception as e:
                print(f"OpenRouter error: {e}")
                return "Yeah... I don't know, man."
        return "Uh, hold on a sec..."

    async def _call_ollama(self, messages: list[dict]) -> str:
        """Call Ollama API.

        Failures return an in-character fallback line rather than raising.
        """
        try:
            async with httpx.AsyncClient() as client:
                response = await client.post(
                    f"{self.ollama_host}/api/chat",
                    json={
                        "model": self.ollama_model,
                        "messages": messages,
                        "stream": False,
                        "options": {
                            "num_predict": 100,  # Allow complete thoughts
                            "temperature": 0.8,  # Balanced creativity/coherence
                            "top_p": 0.9,  # Focused word choices
                            "repeat_penalty": 1.3,  # Avoid repetition
                            "top_k": 50,  # Reasonable token variety
                        },
                    },
                    timeout=30.0
                )
                response.raise_for_status()
                data = response.json()
                return data["message"]["content"]
        except httpx.TimeoutException:
            print("Ollama timeout")
            return "Uh, sorry, I lost you there for a second. What was that?"
        except Exception as e:
            print(f"Ollama error: {e}")
            return "Yeah... I don't know, man."


# Global instance
llm_service = LLMService()

View File

@@ -0,0 +1,144 @@
"""Piper TTS service using sherpa-onnx for fast local voice synthesis"""
import asyncio
import numpy as np
from pathlib import Path
from typing import Optional
# Models directory
MODELS_DIR = Path(__file__).parent.parent.parent / "models" / "sherpa"

# Try to import sherpa-onnx (optional dependency; availability is reported
# via is_available()).
try:
    import sherpa_onnx
    SHERPA_AVAILABLE = True
except ImportError:
    SHERPA_AVAILABLE = False
    sherpa_onnx = None

# Available sherpa-onnx Piper models
PIPER_MODELS = {
    "amy": {
        "dir": "vits-piper-en_US-amy-low",
        "model": "en_US-amy-low.onnx",
        "name": "Amy (US Female)",
        "sample_rate": 16000,
    },
    "joe": {
        "dir": "vits-piper-en_US-joe-medium",
        "model": "en_US-joe-medium.onnx",
        "name": "Joe (US Male)",
        "sample_rate": 22050,
    },
    "lessac": {
        "dir": "vits-piper-en_US-lessac-medium",
        "model": "en_US-lessac-medium.onnx",
        "name": "Lessac (US Female)",
        "sample_rate": 22050,
    },
    "alan": {
        "dir": "vits-piper-en_GB-alan-medium",
        "model": "en_GB-alan-medium.onnx",
        "name": "Alan (UK Male)",
        "sample_rate": 22050,
    },
}


class PiperTTSService:
    """Fast local TTS using sherpa-onnx with Piper models"""

    def __init__(self):
        self.output_sample_rate = 24000  # Our standard output rate
        # Cache of loaded engines keyed by model key.
        # Fix: annotation previously used the builtin function `any` instead
        # of a type; the values are sherpa_onnx.OfflineTts instances.
        self._tts_engines: dict[str, object] = {}

    def is_available(self) -> bool:
        """Check if sherpa-onnx is available"""
        return SHERPA_AVAILABLE

    def _get_engine(self, model_key: str):
        """Get or create a TTS engine for the given model.

        Returns:
            (engine, model_sample_rate) tuple.
        Raises:
            ValueError: unknown model key.
            RuntimeError: model files are missing on disk.
        """
        if model_key in self._tts_engines:
            return self._tts_engines[model_key], PIPER_MODELS[model_key]["sample_rate"]
        if model_key not in PIPER_MODELS:
            raise ValueError(f"Unknown model: {model_key}")
        model_info = PIPER_MODELS[model_key]
        model_dir = MODELS_DIR / model_info["dir"]
        if not model_dir.exists():
            raise RuntimeError(f"Model not found: {model_dir}")
        config = sherpa_onnx.OfflineTtsConfig(
            model=sherpa_onnx.OfflineTtsModelConfig(
                vits=sherpa_onnx.OfflineTtsVitsModelConfig(
                    model=str(model_dir / model_info["model"]),
                    tokens=str(model_dir / "tokens.txt"),
                    data_dir=str(model_dir / "espeak-ng-data"),
                ),
                num_threads=2,
            ),
        )
        tts = sherpa_onnx.OfflineTts(config)
        self._tts_engines[model_key] = tts
        return tts, model_info["sample_rate"]

    async def generate_speech(self, text: str, model_key: str = "amy") -> bytes:
        """Generate speech from text using sherpa-onnx

        Args:
            text: Text to synthesize
            model_key: Model key (amy, joe, lessac, alan)
        Returns:
            Raw PCM audio bytes (16-bit signed int, 24kHz mono)
        Raises:
            RuntimeError: sherpa-onnx is not installed.
        """
        if not SHERPA_AVAILABLE:
            raise RuntimeError("sherpa-onnx not installed. Run: pip install sherpa-onnx")
        # Fix: asyncio.get_event_loop() is deprecated inside coroutines;
        # use the currently running loop.
        loop = asyncio.get_running_loop()

        def run_tts():
            tts, model_sample_rate = self._get_engine(model_key)
            audio = tts.generate(text)
            samples = np.array(audio.samples, dtype=np.float32)
            # Resample to 24kHz if needed (simple linear interpolation)
            if model_sample_rate != self.output_sample_rate:
                ratio = self.output_sample_rate / model_sample_rate
                new_length = int(len(samples) * ratio)
                samples = np.interp(
                    np.linspace(0, len(samples) - 1, new_length),
                    np.arange(len(samples)),
                    samples
                ).astype(np.float32)
            # Convert to int16
            audio_int16 = (samples * 32767).astype(np.int16)
            return audio_int16.tobytes()

        # Synthesis is CPU-bound; keep it off the event loop thread.
        return await loop.run_in_executor(None, run_tts)

    def list_available_models(self) -> list[dict]:
        """List models whose files are actually present on disk."""
        available = []
        for key, info in PIPER_MODELS.items():
            model_dir = MODELS_DIR / info["dir"]
            if model_dir.exists():
                available.append({
                    "id": key,
                    "name": info["name"],
                    "sample_rate": info["sample_rate"],
                })
        return available


# Global instance
piper_service = PiperTTSService()


def is_piper_available() -> bool:
    """Check if Piper (sherpa-onnx) is available"""
    return piper_service.is_available()

View File

@@ -0,0 +1,116 @@
"""Whisper transcription service"""
import tempfile
import numpy as np
from faster_whisper import WhisperModel
import librosa
# Global model instance (loaded once)
_whisper_model = None


def get_whisper_model() -> WhisperModel:
    """Get or create Whisper model instance (lazy singleton; first call loads)."""
    global _whisper_model
    if _whisper_model is None:
        print("Loading Whisper tiny model for fast transcription...")
        # Use tiny model for speed - about 3-4x faster than base
        # beam_size=1 and best_of=1 for fastest inference
        # int8 quantization keeps CPU inference fast and memory small.
        _whisper_model = WhisperModel("tiny", device="cpu", compute_type="int8")
        print("Whisper model loaded")
    return _whisper_model
def decode_audio(audio_data: bytes, source_sample_rate: int = None) -> tuple[np.ndarray, int]:
    """
    Decode audio from various formats to numpy array.

    Args:
        audio_data: Raw audio bytes
        source_sample_rate: If provided, treat as raw PCM at this sample rate
    Returns:
        Tuple of (audio array as float32, sample rate)
    """
    # If sample rate is provided, assume raw PCM (from server-side recording)
    if source_sample_rate is not None:
        print(f"Decoding raw PCM at {source_sample_rate}Hz, {len(audio_data)} bytes")
        return _pcm16_to_float(audio_data), source_sample_rate
    print(f"First 20 bytes: {audio_data[:20].hex()}")
    # Try to decode with librosa first (handles webm, ogg, wav, mp3, etc via ffmpeg)
    try:
        import os
        with tempfile.NamedTemporaryFile(suffix='.webm', delete=False) as f:
            f.write(audio_data)
            temp_path = f.name
        try:
            audio, sample_rate = librosa.load(temp_path, sr=None, mono=True)
        finally:
            # Fix: previously the temp file was only unlinked on the success
            # path, so every failed decode leaked a file in the temp dir.
            os.unlink(temp_path)
        print(f"Decoded with librosa: {len(audio)} samples at {sample_rate}Hz")
        return audio.astype(np.float32), sample_rate
    except Exception as e:
        print(f"librosa decode failed: {e}, trying raw PCM at 16kHz...")
    # Fall back to raw PCM (16-bit signed int, 16kHz mono - Whisper's rate)
    return _pcm16_to_float(audio_data), 16000


def _pcm16_to_float(audio_data: bytes) -> np.ndarray:
    """Interpret bytes as 16-bit signed PCM scaled to [-1, 1); pads odd lengths."""
    if len(audio_data) % 2 != 0:
        audio_data = audio_data + b'\x00'
    return np.frombuffer(audio_data, dtype=np.int16).astype(np.float32) / 32768.0
async def transcribe_audio(audio_data: bytes, source_sample_rate: int = None) -> str:
    """
    Transcribe audio data to text using Whisper.
    Args:
        audio_data: Audio bytes (webm, ogg, wav, or raw PCM)
        source_sample_rate: If provided, treat audio_data as raw PCM at this rate
    Returns:
        Transcribed text ("" when the audio is effectively silent)
    """
    model = get_whisper_model()
    print(f"Transcribing audio: {len(audio_data)} bytes")
    # Decode audio from whatever format
    audio, detected_sample_rate = decode_audio(audio_data, source_sample_rate)
    print(f"Audio samples: {len(audio)}, duration: {len(audio)/detected_sample_rate:.2f}s")
    print(f"Audio range: min={audio.min():.4f}, max={audio.max():.4f}")
    # Check if audio is too quiet
    # (peak below ~1% of full scale is treated as silence; skip transcription)
    if np.abs(audio).max() < 0.01:
        print("Warning: Audio appears to be silent or very quiet")
        return ""
    # Resample to 16kHz for Whisper
    if detected_sample_rate != 16000:
        audio_16k = librosa.resample(audio, orig_sr=detected_sample_rate, target_sr=16000)
        print(f"Resampled to {len(audio_16k)} samples at 16kHz")
    else:
        audio_16k = audio
    # Transcribe with speed optimizations
    segments, info = model.transcribe(
        audio_16k,
        beam_size=1,  # Faster, slightly less accurate
        best_of=1,
        language="en",  # Skip language detection
        vad_filter=True,  # Skip silence
    )
    # segments is a lazy generator; materialize it before joining.
    segments_list = list(segments)
    text = " ".join([s.text for s in segments_list]).strip()
    print(f"Transcription result: '{text}' (language: {info.language}, prob: {info.language_probability:.2f})")
    return text

701
backend/services/tts.py Normal file
View File

@@ -0,0 +1,701 @@
"""TTS service with ElevenLabs, F5-TTS, MLX Kokoro, StyleTTS2, VITS, and Bark support"""
import os
import numpy as np
from scipy.signal import butter, filtfilt
from pathlib import Path
import tempfile
import torch
from ..config import settings
# Patch torch.load for compatibility with PyTorch 2.6+ (its default flipped
# to weights_only=True, which breaks loading the pickled TTS checkpoints
# used by the local models below).
_original_torch_load = torch.load


def _patched_torch_load(*args, **kwargs):
    """torch.load wrapper that defaults weights_only to False.

    Fix: use setdefault instead of an unconditional assignment, so a caller
    that explicitly passes weights_only (e.g. True for untrusted files) is
    no longer silently overridden.
    NOTE(security): weights_only=False unpickles arbitrary objects — only
    load checkpoint files from trusted sources.
    """
    kwargs.setdefault('weights_only', False)
    return _original_torch_load(*args, **kwargs)


torch.load = _patched_torch_load
# Global clients (lazily initialized by the provider-specific code paths)
_elevenlabs_client = None
_vits_tts = None
_bark_loaded = False
_kokoro_model = None
_styletts2_model = None
_f5tts_model = None
_chattts_model = None
_chattts_speakers = {}  # Cache for speaker embeddings

# Kokoro voice mapping - using highest-graded voices
# Grades from https://huggingface.co/hexgrad/Kokoro-82M/blob/main/VOICES.md
# Keys are the ElevenLabs voice IDs used throughout the app; values are the
# Kokoro voice names they map to when the Kokoro provider is selected.
KOKORO_VOICES = {
    # Male voices (best available are C+ grade)
    "VR6AewLTigWG4xSOukaG": "am_fenrir",   # Tony - deep/powerful (C+)
    "TxGEqnHWrfWFTfGW9XjX": "am_michael",  # Rick - solid male voice (C+)
    "pNInz6obpgDQGcFmaJgB": "am_puck",     # Dennis - anxious dad (C+)
    "ODq5zmih8GrVes37Dizd": "bm_george",   # Earl - older/distinguished British (C)
    "IKne3meq5aSn9XLyUdCD": "bm_fable",    # Marcus - young British (C)
    # Female voices (much better quality available)
    "jBpfuIE2acCO8z3wKNLl": "af_heart",    # Jasmine - best quality (A)
    "EXAVITQu4vr4xnSDxMaL": "af_bella",    # Megan - warm/friendly (A-)
    "21m00Tcm4TlvDq8ikWAM": "bf_emma",     # Tanya - professional British (B-)
    "XB0fDUnXU5powFXDhCwa": "af_nicole",   # Carla - Jersey mom (B-)
    "pFZP5JQG7iQjIQuC4Bku": "af_sarah",    # Brenda - overthinker (C+)
}

# Speed adjustments per voice (1.0 = normal, lower = slower/more natural)
# Slower speeds (0.85-0.95) generally sound more natural
KOKORO_SPEEDS = {
    # Male voices - slower speeds help with C+ grade voices
    "VR6AewLTigWG4xSOukaG": 0.9,   # Tony (am_fenrir) - deep voice, slower
    "TxGEqnHWrfWFTfGW9XjX": 0.92,  # Rick (am_michael) - solid pace
    "pNInz6obpgDQGcFmaJgB": 0.95,  # Dennis (am_puck) - anxious but not rushed
    "ODq5zmih8GrVes37Dizd": 0.85,  # Earl (bm_george) - older, slower British
    "IKne3meq5aSn9XLyUdCD": 0.95,  # Marcus (bm_fable) - young, natural
    # Female voices - A-grade voices can handle faster speeds
    "jBpfuIE2acCO8z3wKNLl": 0.95,  # Jasmine (af_heart) - best voice, natural pace
    "EXAVITQu4vr4xnSDxMaL": 0.95,  # Megan (af_bella) - warm
    "21m00Tcm4TlvDq8ikWAM": 0.9,   # Tanya (bf_emma) - professional British
    "XB0fDUnXU5powFXDhCwa": 0.95,  # Carla (af_nicole) - animated but clear
    "pFZP5JQG7iQjIQuC4Bku": 0.92,  # Brenda (af_sarah) - overthinker, measured
}
# Fallbacks for voice IDs not present in the mappings above.
DEFAULT_KOKORO_VOICE = "af_heart"
DEFAULT_KOKORO_SPEED = 0.95

# VCTK speaker mapping - different voices for different callers
VITS_SPEAKERS = {
    # Male voices
    "VR6AewLTigWG4xSOukaG": "p226",  # Tony
    "TxGEqnHWrfWFTfGW9XjX": "p251",  # Rick
    "pNInz6obpgDQGcFmaJgB": "p245",  # Dennis
    "ODq5zmih8GrVes37Dizd": "p232",  # Earl
    "IKne3meq5aSn9XLyUdCD": "p252",  # Marcus
    # Female voices
    "jBpfuIE2acCO8z3wKNLl": "p225",  # Jasmine
    "EXAVITQu4vr4xnSDxMaL": "p228",  # Megan
    "21m00Tcm4TlvDq8ikWAM": "p229",  # Tanya
    "XB0fDUnXU5powFXDhCwa": "p231",  # Carla
    "pFZP5JQG7iQjIQuC4Bku": "p233",  # Brenda
}
DEFAULT_VITS_SPEAKER = "p225"

# Inworld voice mapping - maps ElevenLabs voice IDs to Inworld voices
# Full voice list from API: Alex, Ashley, Blake, Carter, Clive, Craig, Deborah,
# Dennis, Dominus, Edward, Elizabeth, Hades, Hana, Julia, Luna, Mark, Olivia,
# Pixie, Priya, Ronald, Sarah, Shaun, Theodore, Timothy, Wendy
INWORLD_VOICES = {
    # Male voices - each caller gets a unique voice matching their personality
    "VR6AewLTigWG4xSOukaG": "Edward",   # Tony - fast-talking, emphatic, streetwise
    "TxGEqnHWrfWFTfGW9XjX": "Shaun",    # Rick - friendly, dynamic, conversational
    "pNInz6obpgDQGcFmaJgB": "Alex",     # Dennis - energetic, expressive, mildly nasal
    "ODq5zmih8GrVes37Dizd": "Craig",    # Earl - older British, refined, articulate
    "IKne3meq5aSn9XLyUdCD": "Timothy",  # Marcus - lively, upbeat American
    # Female voices - each caller gets a unique voice matching their personality
    "jBpfuIE2acCO8z3wKNLl": "Hana",     # Jasmine - bright, expressive young female
    "EXAVITQu4vr4xnSDxMaL": "Ashley",   # Megan - warm, natural female
    "21m00Tcm4TlvDq8ikWAM": "Wendy",    # Tanya - posh, middle-aged British
    "XB0fDUnXU5powFXDhCwa": "Sarah",    # Carla - fast-talking, questioning tone
    "pFZP5JQG7iQjIQuC4Bku": "Deborah",  # Brenda - gentle, elegant
}
DEFAULT_INWORLD_VOICE = "Dennis"
def preprocess_text_for_kokoro(text: str) -> str:
    """
    Preprocess text to improve Kokoro prosody and naturalness.

    - Adds slight pauses via punctuation after common transition words
    - Expands contractions/abbreviations (Dr., Mr., w/, w/o, ...) so they
      are pronounced as full words
    - Normalizes whitespace and spacing after punctuation

    Args:
        text: Raw text to be spoken.

    Returns:
        The preprocessed text, ready for Kokoro synthesis.
    """
    import re
    # Normalize whitespace
    text = ' '.join(text.split())
    # Add comma pauses after common transition words (if no punctuation follows)
    transitions = [
        r'\b(Well)\s+(?=[A-Za-z])',
        r'\b(So)\s+(?=[A-Za-z])',
        r'\b(Now)\s+(?=[A-Za-z])',
        r'\b(Look)\s+(?=[A-Za-z])',
        r'\b(See)\s+(?=[A-Za-z])',
        r'\b(Anyway)\s+(?=[A-Za-z])',
        r'\b(Actually)\s+(?=[A-Za-z])',
        r'\b(Honestly)\s+(?=[A-Za-z])',
        r'\b(Basically)\s+(?=[A-Za-z])',
    ]
    for pattern in transitions:
        text = re.sub(pattern, r'\1, ', text)
    # Add pause after "I mean" at start of sentence
    text = re.sub(r'^(I mean)\s+', r'\1, ', text)
    text = re.sub(r'\.\s+(I mean)\s+', r'. \1, ', text)
    # Expand common abbreviations for better pronunciation.
    # BUG FIX: "w/o" must be expanded BEFORE "w/"; substitutions run in dict
    # order, and the old order let r'\bw/' match the prefix of "w/o",
    # producing "witho" instead of "without".
    abbreviations = {
        r'\bDr\.': 'Doctor',
        r'\bMr\.': 'Mister',
        r'\bMrs\.': 'Missus',
        r'\bMs\.': 'Miss',
        r'\bSt\.': 'Street',
        r'\bAve\.': 'Avenue',
        r'\betc\.': 'etcetera',
        r'\bvs\.': 'versus',
        r'\bw/o': 'without',
        r'\bw/': 'with',
    }
    for abbr, expansion in abbreviations.items():
        text = re.sub(abbr, expansion, text, flags=re.IGNORECASE)
    # Add breath pause (comma) before conjunctions in long sentences
    text = re.sub(r'(\w{20,})\s+(and|but|or)\s+', r'\1, \2 ', text)
    # Ensure proper spacing after punctuation
    text = re.sub(r'([.!?])\s*([A-Z])', r'\1 \2', text)
    return text
# StyleTTS2 reference voice files (place .wav files in voices/ directory for voice cloning)
# Maps voice_id to reference audio filename - if file doesn't exist, uses default voice
STYLETTS2_VOICES = {
    # Male voices
    "VR6AewLTigWG4xSOukaG": "tony.wav",  # Tony
    "TxGEqnHWrfWFTfGW9XjX": "rick.wav",  # Rick
    "pNInz6obpgDQGcFmaJgB": "dennis.wav",  # Dennis
    "ODq5zmih8GrVes37Dizd": "earl.wav",  # Earl
    "IKne3meq5aSn9XLyUdCD": "marcus.wav",  # Marcus
    # Female voices
    "jBpfuIE2acCO8z3wKNLl": "jasmine.wav",  # Jasmine
    "EXAVITQu4vr4xnSDxMaL": "megan.wav",  # Megan
    "21m00Tcm4TlvDq8ikWAM": "tanya.wav",  # Tanya
    "XB0fDUnXU5powFXDhCwa": "carla.wav",  # Carla
    "pFZP5JQG7iQjIQuC4Bku": "brenda.wav",  # Brenda
}
# F5-TTS reference voices (same files as StyleTTS2, reuses voices/ directory)
# Requires: mono, 24kHz, 5-10 seconds, with transcript in .txt file
F5TTS_VOICES = STYLETTS2_VOICES.copy()
# ChatTTS speaker seeds - different seeds produce different voices
# These are used to generate consistent speaker embeddings
# (see get_chattts_speaker / generate_speech_chattts, which call torch.manual_seed)
CHATTTS_SEEDS = {
    # Male voices
    "VR6AewLTigWG4xSOukaG": 42,  # Tony - deep voice
    "TxGEqnHWrfWFTfGW9XjX": 123,  # Rick
    "pNInz6obpgDQGcFmaJgB": 456,  # Dennis
    "ODq5zmih8GrVes37Dizd": 789,  # Earl
    "IKne3meq5aSn9XLyUdCD": 1011,  # Marcus
    # Female voices
    "jBpfuIE2acCO8z3wKNLl": 2024,  # Jasmine
    "EXAVITQu4vr4xnSDxMaL": 3033,  # Megan
    "21m00Tcm4TlvDq8ikWAM": 4042,  # Tanya
    "XB0fDUnXU5powFXDhCwa": 5051,  # Carla
    "pFZP5JQG7iQjIQuC4Bku": 6060,  # Brenda
}
DEFAULT_CHATTTS_SEED = 42  # fallback seed when voice_id is unmapped
def get_elevenlabs_client():
    """Return the module-wide ElevenLabs client, creating it lazily on first use."""
    global _elevenlabs_client
    if _elevenlabs_client is not None:
        return _elevenlabs_client
    from elevenlabs.client import ElevenLabs
    _elevenlabs_client = ElevenLabs(api_key=settings.elevenlabs_api_key)
    return _elevenlabs_client
def get_vits_tts():
    """Return the shared VITS VCTK TTS engine, loading it lazily on first use."""
    global _vits_tts
    if _vits_tts is not None:
        return _vits_tts
    from TTS.api import TTS
    _vits_tts = TTS("tts_models/en/vctk/vits")
    return _vits_tts
def get_kokoro_model():
    """Return the shared Kokoro MLX model, loading it lazily on first use."""
    global _kokoro_model
    if _kokoro_model is not None:
        return _kokoro_model
    from mlx_audio.tts.utils import load_model
    _kokoro_model = load_model(model_path='mlx-community/Kokoro-82M-bf16')
    print("Kokoro MLX model loaded")
    return _kokoro_model
def ensure_bark_loaded():
    """Load Bark models exactly once, preferring the Apple Silicon GPU (MPS)."""
    global _bark_loaded
    if _bark_loaded:
        return
    os.environ['SUNO_USE_SMALL_MODELS'] = '1'
    use_mps = torch.backends.mps.is_available()
    # Force Bark to use MPS (Apple Silicon GPU) instead of offloading to CPU
    if use_mps:
        os.environ['SUNO_OFFLOAD_CPU'] = '0'
        os.environ['SUNO_ENABLE_MPS'] = '1'
    from bark import preload_models
    preload_models()
    _bark_loaded = True
    print(f"Bark loaded on device: {'MPS' if use_mps else 'CPU'}")
def get_styletts2_model():
    """Return the shared StyleTTS2 model, loading it lazily on first use."""
    global _styletts2_model
    if _styletts2_model is not None:
        return _styletts2_model
    from styletts2 import tts
    _styletts2_model = tts.StyleTTS2()
    print("StyleTTS2 model loaded")
    return _styletts2_model
def get_f5tts_generate():
    """Return the F5-TTS MLX generate function, importing it lazily on first use."""
    global _f5tts_model
    if _f5tts_model is not None:
        return _f5tts_model
    # Disable tqdm progress bars to avoid BrokenPipeError in server context
    import os
    os.environ['HF_HUB_DISABLE_PROGRESS_BARS'] = '1'
    os.environ['TQDM_DISABLE'] = '1'
    from f5_tts_mlx.generate import generate
    _f5tts_model = generate
    print("F5-TTS MLX loaded")
    return _f5tts_model
def get_chattts_model():
    """Return the shared ChatTTS model, loading it lazily on first use."""
    global _chattts_model
    if _chattts_model is not None:
        return _chattts_model
    import ChatTTS
    _chattts_model = ChatTTS.Chat()
    _chattts_model.load(compile=False)
    print("ChatTTS model loaded")
    return _chattts_model
def get_chattts_speaker(voice_id: str):
    """Return a cached speaker embedding for a voice.

    The embedding is sampled once per voice under a fixed torch seed so the
    same voice_id always maps to the same ChatTTS speaker.
    """
    global _chattts_speakers
    if voice_id in _chattts_speakers:
        return _chattts_speakers[voice_id]
    chat = get_chattts_model()
    seed = CHATTTS_SEEDS.get(voice_id, DEFAULT_CHATTTS_SEED)
    # Seed the RNG so speaker sampling is reproducible
    torch.manual_seed(seed)
    _chattts_speakers[voice_id] = chat.sample_random_speaker()
    print(f"[ChatTTS] Created speaker for voice {voice_id} with seed {seed}")
    return _chattts_speakers[voice_id]
def phone_filter(audio: np.ndarray, sample_rate: int = 24000, quality: str = "normal") -> np.ndarray:
    """Apply phone filter with variable quality.

    Band-limits, soft-clips, and (for the noisier presets) adds bursty static
    so speech sounds like it came over a phone line. Unknown quality values
    fall back to the "normal" preset.
    """
    samples = audio.flatten()
    # Presets: (low cut Hz, high cut Hz, distortion drive, noise level)
    presets = {
        "good": (200, 7000, 1.0, 0.0),
        "normal": (300, 3400, 1.5, 0.005),
        "bad": (400, 2800, 2.0, 0.015),
        "terrible": (500, 2200, 2.5, 0.03),
    }
    low_hz, high_hz, drive, noise_level = presets.get(quality, presets["normal"])
    nyquist = sample_rate / 2
    # 4th-order Butterworth band-pass, applied forward and backward (zero phase)
    b, a = butter(4, [low_hz / nyquist, high_hz / nyquist], btype='band')
    shaped = filtfilt(b, a, samples)
    # Soft-clip for the overdriven phone-line sound, leaving headroom
    shaped = np.tanh(shaped * drive) * 0.8
    if noise_level > 0:
        # Bursty static: Gaussian noise gated by a coarse 1000-sample envelope
        static = np.random.normal(0, noise_level, len(shaped)).astype(np.float32)
        envelope = np.random.random(len(shaped) // 1000 + 1)
        envelope = np.repeat(envelope, 1000)[:len(shaped)]
        static *= (envelope > 0.7).astype(np.float32)
        shaped = shaped + static
    return shaped.astype(np.float32)
async def generate_speech_elevenlabs(text: str, voice_id: str) -> tuple[np.ndarray, int]:
    """Generate speech using ElevenLabs"""
    client = get_elevenlabs_client()
    pcm_chunks = client.text_to_speech.convert(
        voice_id=voice_id,
        text=text,
        model_id="eleven_v3",
        output_format="pcm_24000"
    )
    raw = b"".join(pcm_chunks)
    # 16-bit signed PCM -> float32 in [-1, 1)
    samples = np.frombuffer(raw, dtype=np.int16).astype(np.float32) / 32768.0
    return samples, 24000
async def generate_speech_kokoro(text: str, voice_id: str) -> tuple[np.ndarray, int]:
    """Generate speech using MLX Kokoro (fast, good quality, Apple Silicon optimized)"""
    import librosa
    from mlx_audio.tts.generate import generate_audio
    model = get_kokoro_model()
    voice = KOKORO_VOICES.get(voice_id, DEFAULT_KOKORO_VOICE)
    speed = KOKORO_SPEEDS.get(voice_id, DEFAULT_KOKORO_SPEED)
    # Clean the text up first for better prosody
    text = preprocess_text_for_kokoro(text)
    # Kokoro voice names are prefixed: a* = American English, b* = British
    lang_code = 'b' if voice.startswith('b') else 'a'
    with tempfile.TemporaryDirectory() as workdir:
        generate_audio(
            text,
            model=model,
            voice=voice,
            speed=speed,
            lang_code=lang_code,
            output_path=workdir,
            file_prefix='tts',
            verbose=False
        )
        wav_path = Path(workdir) / 'tts_000.wav'
        if not wav_path.exists():
            raise RuntimeError("Kokoro failed to generate audio")
        samples, sr = librosa.load(str(wav_path), sr=None, mono=True)
    # Resample to the app-wide 24 kHz rate if needed
    if sr != 24000:
        samples = librosa.resample(samples, orig_sr=sr, target_sr=24000)
    return samples.astype(np.float32), 24000
async def generate_speech_vits(text: str, voice_id: str) -> tuple[np.ndarray, int]:
    """Generate speech using VITS VCTK (fast, multiple speakers)"""
    import librosa
    engine = get_vits_tts()
    speaker = VITS_SPEAKERS.get(voice_id, DEFAULT_VITS_SPEAKER)
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        wav_path = tmp.name
    try:
        engine.tts_to_file(text=text, file_path=wav_path, speaker=speaker)
        samples, sr = librosa.load(wav_path, sr=None, mono=True)
        if sr != 24000:
            samples = librosa.resample(samples, orig_sr=sr, target_sr=24000)
        return samples.astype(np.float32), 24000
    finally:
        # Always remove the temp file, even on synthesis failure
        Path(wav_path).unlink(missing_ok=True)
async def generate_speech_bark(text: str, voice_id: str) -> tuple[np.ndarray, int]:
    """Generate speech using Bark (slow but expressive, supports emotes like [laughs])"""
    import librosa
    from bark import SAMPLE_RATE, generate_audio
    ensure_bark_loaded()
    samples = generate_audio(text)
    # Bark can exceed [-1, 1]; scale down to a 0.95 peak to prevent clipping
    peak = np.abs(samples).max()
    if peak > 0.95:
        samples = samples * (0.95 / peak)
    # Resample to the app-wide 24 kHz rate if needed
    if SAMPLE_RATE != 24000:
        samples = librosa.resample(samples, orig_sr=SAMPLE_RATE, target_sr=24000)
    return samples.astype(np.float32), 24000
async def generate_speech_styletts2(text: str, voice_id: str) -> tuple[np.ndarray, int]:
    """Generate speech using StyleTTS2 (high quality, supports voice cloning)"""
    import librosa
    model = get_styletts2_model()
    # Resolve an optional reference clip for voice cloning; fall back to the
    # default voice when the mapped file is absent.
    clone_path = None
    ref_name = STYLETTS2_VOICES.get(voice_id)
    if ref_name:
        candidate = settings.base_dir / "voices" / ref_name
        if candidate.exists():
            clone_path = candidate
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        out_path = tmp.name
    try:
        if clone_path:
            print(f"[StyleTTS2] Using voice clone: {clone_path}")
            model.inference(
                text,
                target_voice_path=str(clone_path),
                output_wav_file=out_path,
                output_sample_rate=24000,
                diffusion_steps=5,  # Balance quality/speed
                alpha=0.3,  # More voice-like than text-like
                beta=0.7,  # Good prosody
            )
        else:
            print("[StyleTTS2] Using default voice")
            model.inference(
                text,
                output_wav_file=out_path,
                output_sample_rate=24000,
                diffusion_steps=5,
            )
        # Load the generated audio back from disk
        samples, sr = librosa.load(out_path, sr=None, mono=True)
        if sr != 24000:
            samples = librosa.resample(samples, orig_sr=sr, target_sr=24000)
        return samples.astype(np.float32), 24000
    finally:
        Path(out_path).unlink(missing_ok=True)
async def generate_speech_f5tts(text: str, voice_id: str) -> tuple[np.ndarray, int]:
    """Generate speech using F5-TTS MLX (very natural, supports voice cloning)"""
    import librosa
    synthesize = get_f5tts_generate()
    # Voice cloning needs both the reference clip and its transcript (.txt)
    ref_audio_path = None
    ref_text = None
    ref_name = F5TTS_VOICES.get(voice_id)
    if ref_name:
        clip = settings.base_dir / "voices" / ref_name
        transcript = clip.with_suffix('.txt')
        if clip.exists() and transcript.exists():
            ref_audio_path = str(clip)
            ref_text = transcript.read_text().strip()
            print(f"[F5-TTS] Using voice clone: {clip}")
    if not ref_audio_path:
        print("[F5-TTS] Using default voice")
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        out_path = tmp.name
    try:
        synthesize(
            generation_text=text,
            ref_audio_path=ref_audio_path,
            ref_audio_text=ref_text,
            steps=8,
            speed=1.0,
            output_path=out_path,
        )
        # Load the generated audio back from disk
        samples, sr = librosa.load(out_path, sr=None, mono=True)
        # Resample to the app-wide 24 kHz rate if needed
        if sr != 24000:
            samples = librosa.resample(samples, orig_sr=sr, target_sr=24000)
        return samples.astype(np.float32), 24000
    finally:
        Path(out_path).unlink(missing_ok=True)
async def generate_speech_chattts(text: str, voice_id: str) -> tuple[np.ndarray, int]:
    """Generate speech using ChatTTS (natural conversational speech, multiple speakers)

    Args:
        text: Text to speak; blank input is replaced with "Hello." so the
            model always receives something.
        voice_id: ElevenLabs-style voice ID, mapped to a torch seed via
            CHATTTS_SEEDS for per-caller consistency.

    Returns:
        (float32 mono audio, 24000) tuple.

    Raises:
        RuntimeError: If ChatTTS returns no audio.
    """
    import ChatTTS
    chat = get_chattts_model()
    # Ensure text is not empty and has reasonable content
    text = text.strip()
    if not text:
        text = "Hello."
    print(f"[ChatTTS] Generating speech for: {text[:50]}...")
    # Get consistent speaker for this voice
    # NOTE(review): only the torch RNG is seeded here; the cached embedding
    # from get_chattts_speaker() is never passed to chat.infer(), so voice
    # consistency relies on seeding alone - confirm this is intentional.
    seed = CHATTTS_SEEDS.get(voice_id, DEFAULT_CHATTTS_SEED)
    torch.manual_seed(seed)
    # Configure inference parameters (low temperature -> steadier delivery)
    params_infer_code = ChatTTS.Chat.InferCodeParams(
        temperature=0.3,
        top_P=0.7,
        top_K=20,
    )
    # Generate audio (skip text refinement to avoid narrow() error with this version)
    wavs = chat.infer(
        [text],
        params_infer_code=params_infer_code,
        skip_refine_text=True,
    )
    if wavs is None or len(wavs) == 0:
        raise RuntimeError("ChatTTS failed to generate audio")
    audio = wavs[0]
    # Handle different output shapes
    if audio.ndim > 1:
        audio = audio.squeeze()
    # Normalize peaks above 0.95 to avoid clipping downstream
    max_val = np.abs(audio).max()
    if max_val > 0.95:
        audio = audio * (0.95 / max_val)
    return audio.astype(np.float32), 24000
async def generate_speech_inworld(text: str, voice_id: str) -> tuple[np.ndarray, int]:
    """Generate speech using Inworld TTS API (high quality, natural voices)

    Args:
        text: Text to synthesize.
        voice_id: ElevenLabs-style voice ID, mapped to an Inworld voice name
            via INWORLD_VOICES.

    Returns:
        (float32 mono audio resampled to 24 kHz, 24000) tuple.

    Raises:
        RuntimeError: If the API key is missing or the API returns no audio.
        httpx.HTTPStatusError: On a non-2xx API response.
    """
    import httpx
    import base64
    import librosa
    voice = INWORLD_VOICES.get(voice_id, DEFAULT_INWORLD_VOICE)
    api_key = settings.inworld_api_key
    if not api_key:
        raise RuntimeError("INWORLD_API_KEY not set in environment")
    print(f"[Inworld TTS] Voice: {voice}, Text: {text[:50]}...")
    url = "https://api.inworld.ai/tts/v1/voice"
    headers = {
        "Content-Type": "application/json",
        # Basic auth with the key passed through verbatim
        "Authorization": f"Basic {api_key}",
    }
    payload = {
        "text": text,
        "voice_id": voice,
        "model_id": "inworld-tts-1.5-mini",
        "audio_config": {
            "encoding": "LINEAR16",
            "sample_rate_hertz": 48000,
        },
    }
    async with httpx.AsyncClient(timeout=60.0) as client:
        response = await client.post(url, json=payload, headers=headers)
        response.raise_for_status()
        data = response.json()
    # Decode base64 audio
    audio_b64 = data.get("audioContent")
    if not audio_b64:
        raise RuntimeError("Inworld TTS returned no audio content")
    audio_bytes = base64.b64decode(audio_b64)
    # Parse audio using soundfile (handles WAV, MP3, etc.)
    import soundfile as sf
    import io
    # soundfile can read WAV, FLAC, OGG, and with ffmpeg: MP3
    # MP3 files start with ID3 tag or 0xff sync bytes
    try:
        audio, sr = sf.read(io.BytesIO(audio_bytes))
    except Exception as e:
        print(f"[Inworld TTS] soundfile failed: {e}, trying raw PCM")
        # Fallback to raw PCM; drop a trailing odd byte so the int16 view aligns
        if len(audio_bytes) % 2 != 0:
            audio_bytes = audio_bytes[:-1]
        audio = np.frombuffer(audio_bytes, dtype=np.int16).astype(np.float32) / 32768.0
        sr = 48000  # matches the sample_rate_hertz requested above
    # Resample to 24kHz to match other providers
    if sr != 24000:
        audio = librosa.resample(audio, orig_sr=sr, target_sr=24000)
    return audio.astype(np.float32), 24000
async def generate_speech(
    text: str,
    voice_id: str,
    phone_quality: str = "normal",
    apply_filter: bool = True
) -> bytes:
    """
    Generate speech from text using the provider selected in settings.

    Args:
        text: Text to speak
        voice_id: ElevenLabs voice ID (mapped to local voice if using local TTS)
        phone_quality: Quality of phone filter ("none" to disable)
        apply_filter: Whether to apply phone filter
    Returns:
        Raw PCM audio bytes (16-bit signed int, 24kHz)
    """
    provider = settings.tts_provider
    print(f"[TTS] Provider: {provider}, Text: {text[:50]}...")
    # Dispatch table instead of an if/elif ladder
    synthesizers = {
        "kokoro": generate_speech_kokoro,
        "f5tts": generate_speech_f5tts,
        "inworld": generate_speech_inworld,
        "chattts": generate_speech_chattts,
        "styletts2": generate_speech_styletts2,
        "bark": generate_speech_bark,
        "vits": generate_speech_vits,
        "elevenlabs": generate_speech_elevenlabs,
    }
    synthesize = synthesizers.get(provider)
    if synthesize is None:
        raise ValueError(f"Unknown TTS provider: {provider}")
    audio, sample_rate = await synthesize(text, voice_id)
    # Apply phone filter if requested.
    # Bark is skipped - its output already has rough audio quality.
    if apply_filter and provider != "bark" and phone_quality not in ("none", "studio"):
        audio = phone_filter(audio, sample_rate, phone_quality)
    # float [-1, 1] -> 16-bit signed PCM bytes
    pcm = (audio * 32768).clip(-32768, 32767).astype(np.int16)
    return pcm.tobytes()
# Voice IDs for cohost and announcer
COHOST_VOICE_ID = "nPczCjzI2devNBz1zQrb"  # cohost "Bobby" (see generate_cohost_speech)
ANNOUNCER_VOICE_ID = "ErXwobaYiN019PkySvjV"  # station announcer


async def generate_cohost_speech(text: str) -> bytes:
    """Generate speech for cohost Bobby (no phone filter)"""
    return await generate_speech(text, COHOST_VOICE_ID, apply_filter=False)


async def generate_announcer_speech(text: str) -> bytes:
    """Generate speech for announcer (no phone filter)"""
    return await generate_speech(text, ANNOUNCER_VOICE_ID, apply_filter=False)

200
backend/services/voices.py Normal file
View File

@@ -0,0 +1,200 @@
"""Voice configuration and TTS provider management"""
from dataclasses import dataclass
from typing import Optional
from enum import Enum
class TTSProvider(str, Enum):
    """Identifies which TTS backend serves a Voice (str-valued for easy JSON use)."""
    ELEVENLABS = "elevenlabs"
    EDGE = "edge"  # Microsoft Edge TTS (free)
    PIPER = "piper"  # Local Piper via sherpa-onnx (free, fast)
@dataclass
class Voice:
    """Voice configuration"""
    # Internal app-wide voice ID (e.g. "el_tony")
    id: str
    # Human-readable display name
    name: str
    # Backend that serves this voice
    provider: TTSProvider
    provider_voice_id: str  # The actual ID used by the provider
    description: str = ""
    language: str = "en"
    gender: str = "neutral"
# ElevenLabs voices
# One Voice per show character; provider_voice_id is the ElevenLabs voice ID.
ELEVENLABS_VOICES = [
    Voice("el_tony", "Tony (ElevenLabs)", TTSProvider.ELEVENLABS, "IKne3meq5aSn9XLyUdCD",
          "Male, New York accent, expressive", "en", "male"),
    Voice("el_jasmine", "Jasmine (ElevenLabs)", TTSProvider.ELEVENLABS, "FGY2WhTYpPnrIDTdsKH5",
          "Female, confident, direct", "en", "female"),
    Voice("el_rick", "Rick (ElevenLabs)", TTSProvider.ELEVENLABS, "JBFqnCBsd6RMkjVDRZzb",
          "Male, Texas accent, older", "en", "male"),
    Voice("el_megan", "Megan (ElevenLabs)", TTSProvider.ELEVENLABS, "XrExE9yKIg1WjnnlVkGX",
          "Female, young, casual", "en", "female"),
    Voice("el_dennis", "Dennis (ElevenLabs)", TTSProvider.ELEVENLABS, "cjVigY5qzO86Huf0OWal",
          "Male, middle-aged, anxious", "en", "male"),
    Voice("el_tanya", "Tanya (ElevenLabs)", TTSProvider.ELEVENLABS, "N2lVS1w4EtoT3dr4eOWO",
          "Female, Miami, sassy", "en", "female"),
    Voice("el_earl", "Earl (ElevenLabs)", TTSProvider.ELEVENLABS, "EXAVITQu4vr4xnSDxMaL",
          "Male, elderly, Southern", "en", "male"),
    Voice("el_carla", "Carla (ElevenLabs)", TTSProvider.ELEVENLABS, "CwhRBWXzGAHq8TQ4Fs17",
          "Female, Jersey, sharp", "en", "female"),
    Voice("el_marcus", "Marcus (ElevenLabs)", TTSProvider.ELEVENLABS, "bIHbv24MWmeRgasZH58o",
          "Male, young, urban", "en", "male"),
    Voice("el_brenda", "Brenda (ElevenLabs)", TTSProvider.ELEVENLABS, "Xb7hH8MSUJpSbSDYk0k2",
          "Female, middle-aged, worried", "en", "female"),
    Voice("el_jake", "Jake (ElevenLabs)", TTSProvider.ELEVENLABS, "SOYHLrjzK2X1ezoPC6cr",
          "Male, Boston, insecure", "en", "male"),
    Voice("el_diane", "Diane (ElevenLabs)", TTSProvider.ELEVENLABS, "cgSgspJ2msm6clMCkdW9",
          "Female, mature, conflicted", "en", "female"),
    Voice("el_bobby", "Bobby (ElevenLabs)", TTSProvider.ELEVENLABS, "nPczCjzI2devNBz1zQrb",
          "Male, sidekick, wisecracking", "en", "male"),
    Voice("el_announcer", "Announcer (ElevenLabs)", TTSProvider.ELEVENLABS, "ErXwobaYiN019PkySvjV",
          "Male, radio announcer", "en", "male"),
]
# Edge TTS voices (Microsoft, free)
# provider_voice_id is the Microsoft neural voice name.
EDGE_VOICES = [
    # US voices
    Voice("edge_jenny", "Jenny (Edge)", TTSProvider.EDGE, "en-US-JennyNeural",
          "Female, American, friendly", "en", "female"),
    Voice("edge_guy", "Guy (Edge)", TTSProvider.EDGE, "en-US-GuyNeural",
          "Male, American, casual", "en", "male"),
    Voice("edge_aria", "Aria (Edge)", TTSProvider.EDGE, "en-US-AriaNeural",
          "Female, American, professional", "en", "female"),
    Voice("edge_davis", "Davis (Edge)", TTSProvider.EDGE, "en-US-DavisNeural",
          "Male, American, calm", "en", "male"),
    Voice("edge_amber", "Amber (Edge)", TTSProvider.EDGE, "en-US-AmberNeural",
          "Female, American, warm", "en", "female"),
    Voice("edge_andrew", "Andrew (Edge)", TTSProvider.EDGE, "en-US-AndrewNeural",
          "Male, American, confident", "en", "male"),
    Voice("edge_ashley", "Ashley (Edge)", TTSProvider.EDGE, "en-US-AshleyNeural",
          "Female, American, cheerful", "en", "female"),
    Voice("edge_brian", "Brian (Edge)", TTSProvider.EDGE, "en-US-BrianNeural",
          "Male, American, narrator", "en", "male"),
    Voice("edge_christopher", "Christopher (Edge)", TTSProvider.EDGE, "en-US-ChristopherNeural",
          "Male, American, reliable", "en", "male"),
    Voice("edge_cora", "Cora (Edge)", TTSProvider.EDGE, "en-US-CoraNeural",
          "Female, American, older", "en", "female"),
    Voice("edge_elizabeth", "Elizabeth (Edge)", TTSProvider.EDGE, "en-US-ElizabethNeural",
          "Female, American, elegant", "en", "female"),
    Voice("edge_eric", "Eric (Edge)", TTSProvider.EDGE, "en-US-EricNeural",
          "Male, American, friendly", "en", "male"),
    Voice("edge_jacob", "Jacob (Edge)", TTSProvider.EDGE, "en-US-JacobNeural",
          "Male, American, young", "en", "male"),
    Voice("edge_michelle", "Michelle (Edge)", TTSProvider.EDGE, "en-US-MichelleNeural",
          "Female, American, clear", "en", "female"),
    Voice("edge_monica", "Monica (Edge)", TTSProvider.EDGE, "en-US-MonicaNeural",
          "Female, American, expressive", "en", "female"),
    Voice("edge_roger", "Roger (Edge)", TTSProvider.EDGE, "en-US-RogerNeural",
          "Male, American, mature", "en", "male"),
    Voice("edge_steffan", "Steffan (Edge)", TTSProvider.EDGE, "en-US-SteffanNeural",
          "Male, American, formal", "en", "male"),
    Voice("edge_tony", "Tony (Edge)", TTSProvider.EDGE, "en-US-TonyNeural",
          "Male, American, conversational", "en", "male"),
    # UK voices
    Voice("edge_sonia", "Sonia (Edge UK)", TTSProvider.EDGE, "en-GB-SoniaNeural",
          "Female, British, professional", "en", "female"),
    Voice("edge_ryan", "Ryan (Edge UK)", TTSProvider.EDGE, "en-GB-RyanNeural",
          "Male, British, clear", "en", "male"),
    Voice("edge_libby", "Libby (Edge UK)", TTSProvider.EDGE, "en-GB-LibbyNeural",
          "Female, British, warm", "en", "female"),
    Voice("edge_thomas", "Thomas (Edge UK)", TTSProvider.EDGE, "en-GB-ThomasNeural",
          "Male, British, friendly", "en", "male"),
    # Australian voices
    Voice("edge_natasha", "Natasha (Edge AU)", TTSProvider.EDGE, "en-AU-NatashaNeural",
          "Female, Australian, friendly", "en", "female"),
    Voice("edge_william", "William (Edge AU)", TTSProvider.EDGE, "en-AU-WilliamNeural",
          "Male, Australian, casual", "en", "male"),
]
# Piper voices (local, via sherpa-onnx)
# provider_voice_id is the local Piper model/voice name.
PIPER_VOICES = [
    Voice("piper_amy", "Amy (Piper)", TTSProvider.PIPER, "amy",
          "Female, American, clear", "en", "female"),
    Voice("piper_joe", "Joe (Piper)", TTSProvider.PIPER, "joe",
          "Male, American, natural", "en", "male"),
    Voice("piper_lessac", "Lessac (Piper)", TTSProvider.PIPER, "lessac",
          "Female, American, expressive", "en", "female"),
    Voice("piper_alan", "Alan (Piper)", TTSProvider.PIPER, "alan",
          "Male, British, clear", "en", "male"),
]
# All voices combined (order determines listing order in the UI/API)
ALL_VOICES = ELEVENLABS_VOICES + EDGE_VOICES + PIPER_VOICES
# Voice lookup by ID
VOICES_BY_ID = {v.id: v for v in ALL_VOICES}
# Default voice assignments for callers (maps caller key to voice ID)
# Keys are the keyboard keys used to pick a caller, plus the two fixed roles.
DEFAULT_CALLER_VOICES = {
    "1": "el_tony",  # Tony from Staten Island
    "2": "el_jasmine",  # Jasmine from Atlanta
    "3": "el_rick",  # Rick from Texas
    "4": "el_megan",  # Megan from Portland
    "5": "el_dennis",  # Dennis from Long Island
    "6": "el_tanya",  # Tanya from Miami
    "7": "el_earl",  # Earl from Tennessee
    "8": "el_carla",  # Carla from Jersey
    "9": "el_marcus",  # Marcus from Detroit
    "0": "el_brenda",  # Brenda from Phoenix
    "-": "el_jake",  # Jake from Boston
    "=": "el_diane",  # Diane from Chicago
    "bobby": "el_bobby",
    "announcer": "el_announcer",
}
class VoiceManager:
    """Manages voice assignments and TTS provider selection"""

    def __init__(self):
        # Runtime-mutable copy of the default caller -> voice mapping
        self.caller_voices = dict(DEFAULT_CALLER_VOICES)

    def get_voice(self, voice_id: str) -> Optional[Voice]:
        """Look up a voice by its internal ID; None if unknown."""
        return VOICES_BY_ID.get(voice_id)

    def get_caller_voice(self, caller_key: str) -> Voice:
        """Return the voice assigned to a caller, falling back to el_tony / the first ElevenLabs voice."""
        assigned = self.caller_voices.get(caller_key, "el_tony")
        return VOICES_BY_ID.get(assigned, ELEVENLABS_VOICES[0])

    def set_caller_voice(self, caller_key: str, voice_id: str):
        """Assign a voice to a caller; unknown voice IDs are silently ignored."""
        if voice_id in VOICES_BY_ID:
            self.caller_voices[caller_key] = voice_id

    def get_all_voices(self) -> list[dict]:
        """Serialize every available voice for the API."""
        serialized = []
        for voice in ALL_VOICES:
            serialized.append({
                "id": voice.id,
                "name": voice.name,
                "provider": voice.provider.value,
                "description": voice.description,
                "gender": voice.gender,
            })
        return serialized

    def get_voices_by_provider(self, provider: TTSProvider) -> list[Voice]:
        """Return all voices served by one backend."""
        matches = []
        for voice in ALL_VOICES:
            if voice.provider == provider:
                matches.append(voice)
        return matches

    def get_caller_voice_assignments(self) -> dict[str, str]:
        """Snapshot of the current caller -> voice ID mapping."""
        return dict(self.caller_voices)

    def set_caller_voice_assignments(self, assignments: dict[str, str]):
        """Apply several caller -> voice assignments, skipping unknown voice IDs."""
        for key, vid in assignments.items():
            if vid in VOICES_BY_ID:
                self.caller_voices[key] = vid


# Global instance
voice_manager = VoiceManager()

109
download_sounds.py Normal file
View File

@@ -0,0 +1,109 @@
#!/usr/bin/env python3
"""
Download free sound effects for the radio show soundboard.
Uses sounds from freesound.org and other free sources.
"""
import os
import urllib.request
import ssl
from pathlib import Path
# Bypass SSL issues
# NOTE(review): this disables TLS certificate verification for every HTTPS
# request made by this process. Acceptable for a one-off download helper,
# but do not copy into server code.
ssl._create_default_https_context = ssl._create_unverified_context
# Destination directory for soundboard clips, created on import
SOUNDS_DIR = Path(__file__).parent / "sounds"
SOUNDS_DIR.mkdir(exist_ok=True)
# Free sound effect URLs (public domain / CC0)
# These are from various free sources
SOUND_URLS = {
    # Using pixabay free sounds (no attribution required)
    'rimshot.wav': 'https://cdn.pixabay.com/audio/2022/03/15/audio_7a569d6dde.mp3',
    'laugh.wav': 'https://cdn.pixabay.com/audio/2024/02/14/audio_70fa4b1f7c.mp3',
    'sad_trombone.wav': 'https://cdn.pixabay.com/audio/2022/03/15/audio_cce0f1f0f1.mp3',
    'cheer.wav': 'https://cdn.pixabay.com/audio/2021/08/04/audio_0625c1539c.mp3',
    'boo.wav': 'https://cdn.pixabay.com/audio/2022/10/30/audio_f2a4d3d7db.mp3',
    'drumroll.wav': 'https://cdn.pixabay.com/audio/2022/03/24/audio_52a6ef9129.mp3',
    'crickets.wav': 'https://cdn.pixabay.com/audio/2022/03/09/audio_691875e05c.mp3',
    'phone_ring.wav': 'https://cdn.pixabay.com/audio/2022/03/15/audio_0f66b49312.mp3',
}
def _convert_mp3_to_wav(data, name, output_path):
    """Write MP3 bytes to a temp file and convert to 24 kHz mono WAV via ffmpeg."""
    import subprocess
    temp_mp3 = SOUNDS_DIR / f"temp_{name}.mp3"
    try:
        with open(temp_mp3, 'wb') as f:
            f.write(data)
        result = subprocess.run([
            'ffmpeg', '-y', '-i', str(temp_mp3),
            '-ar', '24000', '-ac', '1',
            str(output_path)
        ], capture_output=True)
        if result.returncode == 0:
            print(f"{name}")
            return True
        # BUG FIX: remove any partial output ffmpeg left behind; otherwise the
        # next run would see the file and report "(already exists)".
        output_path.unlink(missing_ok=True)
        print(f"{name} (ffmpeg conversion failed)")
        return False
    finally:
        # BUG FIX: previously the temp MP3 leaked when subprocess.run raised
        # (e.g. FileNotFoundError because ffmpeg is not installed).
        temp_mp3.unlink(missing_ok=True)


def download_sound(name, url):
    """Download a sound file into SOUNDS_DIR, converting MP3 sources to WAV.

    Args:
        name: Target filename (e.g. "rimshot.wav").
        url: Source URL; .mp3 sources are converted with ffmpeg.

    Returns:
        True if the file exists (downloaded now or previously), False on failure.
    """
    output_path = SOUNDS_DIR / name
    if output_path.exists():
        print(f"{name} (already exists)")
        return True
    try:
        print(f" Downloading {name}...")
        # Download the file (some CDNs reject requests without a User-Agent)
        req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
        with urllib.request.urlopen(req, timeout=30) as response:
            data = response.read()
        # If it's an MP3, we need to convert it
        if url.endswith('.mp3'):
            return _convert_mp3_to_wav(data, name, output_path)
        with open(output_path, 'wb') as f:
            f.write(data)
        print(f"{name}")
        return True
    except Exception as e:
        # Best-effort downloader: report and move on to the next sound
        print(f"{name} ({e})")
        return False
def main():
    """Download every soundboard effect listed in SOUND_URLS and report results."""
    print("Downloading sound effects for radio show soundboard...")
    print(f"Saving to: {SOUNDS_DIR}\n")
    # Check for ffmpeg (needed to convert MP3 downloads to WAV)
    import subprocess
    try:
        subprocess.run(['ffmpeg', '-version'], capture_output=True, check=True)
    except (FileNotFoundError, subprocess.CalledProcessError, OSError):
        # BUG FIX: was a bare `except:`, which also swallowed
        # KeyboardInterrupt/SystemExit; only the failure modes of
        # subprocess.run are handled now.
        print("WARNING: ffmpeg not found. Install it with: brew install ffmpeg")
        print("Some sounds may not download correctly.\n")
    success = 0
    for name, url in SOUND_URLS.items():
        if download_sound(name, url):
            success += 1
    print(f"\nDownloaded {success}/{len(SOUND_URLS)} sounds.")
    print("\nTo add more sounds:")
    print(" 1. Find free .wav files online")
    print(" 2. Name them according to the SOUNDBOARD mapping in radio_show.py")
    print(" 3. Place them in the sounds/ directory")
    print("\nRecommended free sound sources:")
    print(" - freesound.org")
    print(" - pixabay.com/sound-effects")
    print(" - zapsplat.com")
    print(" - soundbible.com")


if __name__ == "__main__":
    main()

543
frontend/css/style.css Normal file
View File

@@ -0,0 +1,543 @@
/* AI Radio Show - Clean CSS */
:root {
--bg: #1a1a2e;
--bg-light: #252547;
--accent: #e94560;
--text: #fff;
--text-muted: #888;
--radius: 8px;
}
* {
box-sizing: border-box;
margin: 0;
padding: 0;
}
body {
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', sans-serif;
background: var(--bg);
color: var(--text);
min-height: 100vh;
}
#app {
max-width: 900px;
margin: 0 auto;
padding: 20px;
}
/* Header */
header {
display: flex;
justify-content: space-between;
align-items: center;
margin-bottom: 20px;
}
header h1 {
font-size: 1.5rem;
}
.header-buttons {
display: flex;
gap: 8px;
}
header button {
background: var(--bg-light);
color: var(--text);
border: none;
padding: 8px 16px;
border-radius: var(--radius);
cursor: pointer;
}
.new-session-btn {
background: var(--accent) !important;
}
.session-id {
font-size: 0.7rem;
color: var(--text-muted);
font-weight: normal;
}
.caller-background {
font-size: 0.85rem;
color: var(--text-muted);
padding: 10px;
background: var(--bg);
border-radius: var(--radius);
margin-bottom: 12px;
line-height: 1.4;
}
/* Caller background blurb (revealed only while a call is active) */
.caller-background.hidden {
    display: none;
}
/* Main layout: two equal columns, collapsing to one on narrow screens */
main {
    display: grid;
    grid-template-columns: 1fr 1fr;
    gap: 20px;
}
@media (max-width: 700px) {
    main {
        grid-template-columns: 1fr;
    }
}
/* Sections */
section {
    background: var(--bg-light);
    padding: 16px;
    border-radius: var(--radius);
}
section h2 {
    font-size: 1rem;
    margin-bottom: 12px;
    color: var(--text-muted);
}
/* Callers */
.caller-grid {
    display: grid;
    grid-template-columns: repeat(5, 1fr);
    gap: 8px;
    margin-bottom: 12px;
}
.caller-btn {
    background: var(--bg);
    color: var(--text);
    border: 2px solid transparent;
    padding: 10px 8px;
    border-radius: var(--radius);
    cursor: pointer;
    font-size: 0.85rem;
    transition: all 0.2s;
}
.caller-btn:hover {
    border-color: var(--accent);
}
/* Highlights the caller currently on the line */
.caller-btn.active {
    background: var(--accent);
    border-color: var(--accent);
}
.call-status {
    text-align: center;
    padding: 8px;
    color: var(--text-muted);
    margin-bottom: 12px;
}
.hangup-btn {
    width: 100%;
    background: #c0392b;
    color: white;
    border: none;
    padding: 12px;
    border-radius: var(--radius);
    cursor: pointer;
    font-weight: bold;
}
.hangup-btn:disabled {
    opacity: 0.5;
    cursor: not-allowed;
}
/* Chat */
.chat-section {
    grid-column: span 2; /* full width of the two-column grid */
}
@media (max-width: 700px) {
    .chat-section {
        grid-column: span 1;
    }
}
.chat-log {
    height: 300px;
    overflow-y: auto;
    background: var(--bg);
    border-radius: var(--radius);
    padding: 12px;
    margin-bottom: 12px;
}
.message {
    padding: 8px 12px;
    margin-bottom: 8px;
    border-radius: var(--radius);
    line-height: 1.4;
}
.message.host {
    background: #2c5282; /* blue bubble: the host ("You") */
}
.message.caller {
    background: #553c9a; /* purple bubble: the AI caller */
}
.message strong {
    display: block;
    font-size: 0.8rem;
    opacity: 0.7;
    margin-bottom: 4px;
}
.talk-controls {
    display: flex;
    gap: 10px;
}
.talk-btn {
    flex: 1;
    background: var(--accent);
    color: white;
    border: none;
    padding: 16px;
    border-radius: var(--radius);
    font-size: 1rem;
    font-weight: bold;
    cursor: pointer;
    transition: all 0.2s;
}
.talk-btn:hover {
    filter: brightness(1.1);
}
/* Pulses red while the server is recording the host mic */
.talk-btn.recording {
    background: #c0392b;
    animation: pulse 1s infinite;
}
@keyframes pulse {
    0%, 100% { opacity: 1; }
    50% { opacity: 0.7; }
}
.type-btn {
    background: var(--bg);
    color: var(--text);
    border: none;
    padding: 16px 24px;
    border-radius: var(--radius);
    cursor: pointer;
}
/* Transient status banner ("… is thinking…") */
.status {
    text-align: center;
    padding: 12px;
    color: var(--accent);
    font-weight: bold;
}
.status.hidden {
    display: none;
}
/* Music */
.music-section select {
    width: 100%;
    padding: 10px;
    background: var(--bg);
    color: var(--text);
    border: none;
    border-radius: var(--radius);
    margin-bottom: 10px;
}
.music-controls {
    display: flex;
    gap: 8px;
    align-items: center;
}
.music-controls button {
    background: var(--bg);
    color: var(--text);
    border: none;
    padding: 10px 16px;
    border-radius: var(--radius);
    cursor: pointer;
}
/* Volume slider takes the remaining row width */
.music-controls input[type="range"] {
    flex: 1;
}
/* Soundboard */
.soundboard {
    display: grid;
    grid-template-columns: repeat(3, 1fr);
    gap: 8px;
}
.sound-btn {
    background: var(--bg);
    color: var(--text);
    border: none;
    padding: 12px 8px;
    border-radius: var(--radius);
    cursor: pointer;
    font-size: 0.8rem;
    transition: all 0.1s;
}
.sound-btn:hover {
    background: var(--accent);
}
.sound-btn:active {
    transform: scale(0.95); /* small press feedback */
}
/* Modal: full-viewport dimmed overlay with a centered card */
.modal {
    position: fixed;
    inset: 0;
    background: rgba(0, 0, 0, 0.8);
    display: flex;
    align-items: center;
    justify-content: center;
    z-index: 100;
}
.modal.hidden {
    display: none;
}
.modal-content {
    background: var(--bg-light);
    padding: 24px;
    border-radius: var(--radius);
    width: 90%;
    max-width: 400px;
}
.modal-content h2 {
    margin-bottom: 16px;
}
/* Settings-group headings inside the modal */
.modal-content h3 {
    font-size: 0.9rem;
    color: var(--text-muted);
    margin: 16px 0 8px 0;
    border-bottom: 1px solid var(--bg);
    padding-bottom: 4px;
}
.settings-group {
    margin-bottom: 16px;
}
/* Device dropdown + channel number side by side */
.device-row {
    display: flex;
    gap: 8px;
    align-items: flex-end;
}
.device-row label:first-child {
    flex: 1;
}
.channel-row {
    display: flex;
    gap: 12px;
    margin-top: 8px;
}
.channel-row label {
    display: flex;
    align-items: center;
    gap: 4px;
    font-size: 0.85rem;
}
.channel-input {
    width: 50px !important;
    text-align: center;
}
.modal-content label {
    display: block;
    margin-bottom: 16px;
}
.modal-content label.checkbox {
    display: flex;
    align-items: center;
    gap: 8px;
}
.modal-content select,
.modal-content input[type="text"],
.modal-content textarea {
    width: 100%;
    padding: 10px;
    background: var(--bg);
    color: var(--text);
    border: none;
    border-radius: var(--radius);
    margin-top: 4px;
}
.modal-buttons {
    display: flex;
    gap: 10px;
    margin-top: 20px;
}
.modal-buttons button {
    flex: 1;
    padding: 12px;
    border: none;
    border-radius: var(--radius);
    cursor: pointer;
    font-weight: bold;
}
/* First button is the primary action (Save/Send) */
.modal-buttons button:first-child {
    background: var(--accent);
    color: white;
}
.modal-buttons button:last-child {
    background: var(--bg);
    color: var(--text);
}
.refresh-btn {
    background: var(--bg);
    color: var(--text-muted);
    border: 1px solid var(--bg-light);
    padding: 6px 12px;
    border-radius: var(--radius);
    cursor: pointer;
    font-size: 0.85rem;
    margin-top: 8px;
}
.refresh-btn:hover {
    background: var(--bg-light);
    color: var(--text);
}
.refresh-btn:disabled {
    opacity: 0.5;
    cursor: not-allowed;
}
/* Generic hide helper; !important so it wins over display rules above */
.hidden {
    display: none !important;
}
/* Server Log */
.log-section {
    grid-column: span 2; /* full width of the two-column grid */
}
@media (max-width: 700px) {
    .log-section {
        grid-column: span 1;
    }
}
.log-header {
    display: flex;
    justify-content: space-between;
    align-items: center;
    margin-bottom: 12px;
}
.log-header h2 {
    margin-bottom: 0;
}
.server-controls {
    display: flex;
    gap: 8px;
    align-items: center;
}
.server-btn {
    border: none;
    padding: 6px 12px;
    border-radius: var(--radius);
    cursor: pointer;
    font-size: 0.85rem;
    font-weight: bold;
}
.server-btn.restart {
    background: #2196F3;
    color: white;
}
.server-btn.restart:hover {
    background: #1976D2;
}
.server-btn.stop {
    background: #c0392b;
    color: white;
}
.server-btn.stop:hover {
    background: #a93226;
}
.auto-scroll-label {
    display: flex;
    align-items: center;
    gap: 4px;
    font-size: 0.8rem;
    color: var(--text-muted);
    cursor: pointer;
}
/* Terminal-style log viewport: dark bg, monospace, green default text */
.server-log {
    height: 200px;
    overflow-y: auto;
    background: #0d0d1a;
    border-radius: var(--radius);
    padding: 12px;
    font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace;
    font-size: 0.75rem;
    line-height: 1.5;
    color: #8f8;
}
.server-log .log-line {
    white-space: pre-wrap;
    word-break: break-all;
}
/* Per-line tinting; classes assigned by the frontend log renderer */
.server-log .log-line.error {
    color: #f88;
}
.server-log .log-line.warning {
    color: #ff8;
}
.server-log .log-line.tts {
    color: #8ff;
}
.server-log .log-line.chat {
    color: #f8f;
}

178
frontend/index.html Normal file
View File

@@ -0,0 +1,178 @@
<!DOCTYPE html>
<!-- AI Radio Show control panel. All audio runs server-side; this page only
     drives the backend JSON API (see /js/app.js). -->
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>AI Radio Show</title>
    <link rel="stylesheet" href="/css/style.css">
</head>
<body>
    <div id="app">
        <header>
            <h1>AI Radio Show</h1>
            <div class="header-buttons">
                <button id="new-session-btn" class="new-session-btn">New Session</button>
                <button id="settings-btn">Settings</button>
            </div>
        </header>
        <main>
            <!-- Callers: grid of caller buttons is populated by loadCallers() -->
            <section class="callers-section">
                <h2>Callers <span id="session-id" class="session-id"></span></h2>
                <div id="callers" class="caller-grid"></div>
                <div id="call-status" class="call-status">No active call</div>
                <div id="caller-background" class="caller-background hidden"></div>
                <button id="hangup-btn" class="hangup-btn" disabled>Hang Up</button>
            </section>
            <!-- Chat: transcript plus push-to-talk / typed input -->
            <section class="chat-section">
                <div id="chat" class="chat-log"></div>
                <div class="talk-controls">
                    <button id="talk-btn" class="talk-btn">Hold to Talk</button>
                    <button id="type-btn" class="type-btn">Type</button>
                </div>
                <div id="status" class="status hidden"></div>
            </section>
            <!-- Music -->
            <section class="music-section">
                <h2>Music</h2>
                <select id="track-select"></select>
                <div class="music-controls">
                    <button id="play-btn">Play</button>
                    <button id="stop-btn">Stop</button>
                    <input type="range" id="volume" min="0" max="100" value="30">
                </div>
            </section>
            <!-- Sound Effects: buttons populated by loadSounds() -->
            <section class="sounds-section">
                <h2>Sounds</h2>
                <div id="soundboard" class="soundboard"></div>
            </section>
            <!-- Server Log: polled live via /api/logs -->
            <section class="log-section">
                <div class="log-header">
                    <h2>Server Log</h2>
                    <div class="server-controls">
                        <button id="restart-server-btn" class="server-btn restart">Restart</button>
                        <button id="stop-server-btn" class="server-btn stop">Stop</button>
                        <label class="auto-scroll-label">
                            <input type="checkbox" id="auto-scroll" checked> Auto-scroll
                        </label>
                    </div>
                </div>
                <div id="server-log" class="server-log"></div>
            </section>
        </main>
        <!-- Settings Modal -->
        <div id="settings-modal" class="modal hidden">
            <div class="modal-content">
                <h2>Settings</h2>
                <!-- Audio Devices: device selection plus 1-based channel routing -->
                <div class="settings-group">
                    <h3>Audio Routing</h3>
                    <div class="device-row">
                        <label>
                            Input Device
                            <select id="input-device"></select>
                        </label>
                        <label>
                            Ch
                            <input type="number" id="input-channel" value="1" min="1" max="16" class="channel-input">
                        </label>
                    </div>
                    <div class="device-row">
                        <label>
                            Output Device
                            <select id="output-device"></select>
                        </label>
                    </div>
                    <div class="channel-row">
                        <label>Caller Ch <input type="number" id="caller-channel" value="1" min="1" max="16" class="channel-input"></label>
                        <label>Music Ch <input type="number" id="music-channel" value="2" min="1" max="16" class="channel-input"></label>
                        <label>SFX Ch <input type="number" id="sfx-channel" value="3" min="1" max="16" class="channel-input"></label>
                    </div>
                </div>
                <!-- LLM Settings: provider switch toggles the two sub-panels -->
                <div class="settings-group">
                    <h3>LLM Provider</h3>
                    <label>
                        Provider
                        <select id="provider">
                            <option value="openrouter">OpenRouter</option>
                            <option value="ollama">Ollama</option>
                        </select>
                    </label>
                    <div id="openrouter-settings">
                        <label>
                            Model
                            <select id="openrouter-model"></select>
                        </label>
                    </div>
                    <div id="ollama-settings" class="hidden">
                        <label>
                            Model
                            <select id="ollama-model"></select>
                        </label>
                        <label>
                            Host
                            <input type="text" id="ollama-host" value="http://localhost:11434">
                        </label>
                        <button type="button" id="refresh-ollama" class="refresh-btn">Refresh Models</button>
                    </div>
                </div>
                <!-- TTS Settings -->
                <div class="settings-group">
                    <h3>TTS Provider</h3>
                    <label>
                        Provider
                        <select id="tts-provider">
                            <option value="inworld">Inworld (High quality, natural)</option>
                            <option value="f5tts">F5-TTS (Most natural local)</option>
                            <option value="elevenlabs">ElevenLabs (Best quality, paid)</option>
                            <option value="kokoro">Kokoro MLX (Fast, Apple Silicon)</option>
                            <option value="chattts">ChatTTS (Conversational)</option>
                            <option value="styletts2">StyleTTS2 (Voice cloning)</option>
                            <option value="vits">VITS (Fast local)</option>
                            <option value="bark">Bark (Expressive, supports [laughs])</option>
                        </select>
                    </label>
                    <label class="checkbox">
                        <input type="checkbox" id="phone-filter">
                        Phone filter on voices
                    </label>
                </div>
                <div class="modal-buttons">
                    <button id="save-settings">Save</button>
                    <button id="close-settings">Close</button>
                </div>
            </div>
        </div>
        <!-- Type Modal: keyboard alternative to push-to-talk -->
        <div id="type-modal" class="modal hidden">
            <div class="modal-content">
                <h2>Type Message</h2>
                <textarea id="type-input" rows="3" placeholder="Type what you want to say..."></textarea>
                <div class="modal-buttons">
                    <button id="send-type">Send</button>
                    <button id="close-type">Cancel</button>
                </div>
            </div>
        </div>
    </div>
    <!-- ?v=8 busts browser caching after script changes -->
    <script src="/js/app.js?v=8"></script>
</body>
</html>

782
frontend/js/app.js Normal file
View File

@@ -0,0 +1,782 @@
/**
 * AI Radio Show - Control Panel (Server-Side Audio)
 *
 * All capture and playback happens on the server; this page is a thin
 * control surface that drives the backend JSON API and polls the log.
 */
// --- State ---
let currentCaller = null;   // { key, name } of the caller on the line, or null
let isProcessing = false;   // true while a chat/TTS round-trip is in flight
let isRecording = false;    // true while the server is recording the host mic
let phoneFilter = false;    // apply telephone band-pass to caller voices
let autoScroll = true;      // keep the server-log panel pinned to the bottom
let logPollInterval = null; // setInterval handle for log polling
let lastLogCount = 0;       // change detector for the rendered server log
// Track lists (populated from /api/music and /api/sounds)
let tracks = [];
let sounds = [];
// --- Init ---
// Boot sequence: load devices/callers/media lists and saved settings,
// then wire up the static UI event handlers.
document.addEventListener('DOMContentLoaded', async () => {
    console.log('AI Radio Show initializing...');
    try {
        await loadAudioDevices();
        await loadCallers();
        await loadMusic();
        await loadSounds();
        await loadSettings();
        initEventListeners();
        log('Ready. Configure audio devices in Settings, then click a caller to start.');
        console.log('AI Radio Show ready');
    } catch (err) {
        console.error('Init error:', err);
        log('Error loading: ' + err.message);
    }
});
/**
 * Wire up every static UI control. Called once after the initial data
 * loads; dynamically created buttons (callers, soundboard) attach their
 * own handlers at creation time.
 */
function initEventListeners() {
    // Hangup
    document.getElementById('hangup-btn')?.addEventListener('click', hangup);
    // New Session
    document.getElementById('new-session-btn')?.addEventListener('click', newSession);
    // Server controls
    document.getElementById('restart-server-btn')?.addEventListener('click', restartServer);
    document.getElementById('stop-server-btn')?.addEventListener('click', stopServer);
    document.getElementById('auto-scroll')?.addEventListener('change', e => {
        autoScroll = e.target.checked;
    });
    // Start log polling
    startLogPolling();
    // Talk button - now triggers server-side recording.
    // mouseleave also stops so dragging off the button can't leave the mic
    // open; touch events mirror the mouse events for mobile.
    const talkBtn = document.getElementById('talk-btn');
    if (talkBtn) {
        talkBtn.addEventListener('mousedown', startRecording);
        talkBtn.addEventListener('mouseup', stopRecording);
        talkBtn.addEventListener('mouseleave', () => { if (isRecording) stopRecording(); });
        talkBtn.addEventListener('touchstart', e => { e.preventDefault(); startRecording(); });
        talkBtn.addEventListener('touchend', e => { e.preventDefault(); stopRecording(); });
    }
    // Type button
    document.getElementById('type-btn')?.addEventListener('click', () => {
        document.getElementById('type-modal')?.classList.remove('hidden');
        document.getElementById('type-input')?.focus();
    });
    document.getElementById('send-type')?.addEventListener('click', sendTypedMessage);
    document.getElementById('close-type')?.addEventListener('click', () => {
        document.getElementById('type-modal')?.classList.add('hidden');
    });
    // Enter sends the typed message; Shift+Enter keeps inserting a newline
    document.getElementById('type-input')?.addEventListener('keydown', e => {
        if (e.key === 'Enter' && !e.shiftKey) {
            e.preventDefault();
            sendTypedMessage();
        }
    });
    // Music - now server-side
    document.getElementById('play-btn')?.addEventListener('click', playMusic);
    document.getElementById('stop-btn')?.addEventListener('click', stopMusic);
    document.getElementById('volume')?.addEventListener('input', setMusicVolume);
    // Settings
    document.getElementById('settings-btn')?.addEventListener('click', async () => {
        document.getElementById('settings-modal')?.classList.remove('hidden');
        await loadSettings(); // Reload settings when modal opens
    });
    document.getElementById('close-settings')?.addEventListener('click', () => {
        document.getElementById('settings-modal')?.classList.add('hidden');
    });
    document.getElementById('save-settings')?.addEventListener('click', saveSettings);
    document.getElementById('provider')?.addEventListener('change', updateProviderUI);
    document.getElementById('phone-filter')?.addEventListener('change', e => {
        phoneFilter = e.target.checked;
    });
    document.getElementById('refresh-ollama')?.addEventListener('click', refreshOllamaModels);
}
/**
 * Re-query the backend for locally installed Ollama models and rebuild
 * the #ollama-model dropdown. The refresh button is disabled while the
 * request is in flight.
 */
async function refreshOllamaModels() {
    const refreshBtn = document.getElementById('refresh-ollama');
    const modelSelect = document.getElementById('ollama-model');
    if (!modelSelect) return;
    refreshBtn.textContent = 'Loading...';
    refreshBtn.disabled = true;
    try {
        const response = await fetch('/api/settings');
        const settings = await response.json();
        modelSelect.innerHTML = '';
        const models = settings.available_ollama_models || [];
        if (!models.length) {
            // Keep the dropdown usable with an explicit placeholder entry
            const placeholder = document.createElement('option');
            placeholder.value = '';
            placeholder.textContent = '(No models found)';
            modelSelect.appendChild(placeholder);
        } else {
            for (const name of models) {
                const option = document.createElement('option');
                option.value = name;
                option.textContent = name;
                modelSelect.appendChild(option);
            }
        }
    } catch (err) {
        console.error('Failed to refresh Ollama models:', err);
    }
    refreshBtn.textContent = 'Refresh Models';
    refreshBtn.disabled = false;
}
// --- Audio Devices ---
/**
 * Populate the input/output device dropdowns from the server's device
 * enumeration, then apply the saved routing settings (device ids,
 * channel numbers, phone filter) to the form controls.
 */
async function loadAudioDevices() {
    try {
        const res = await fetch('/api/audio/devices');
        const data = await res.json();
        const inputSelect = document.getElementById('input-device');
        const outputSelect = document.getElementById('output-device');
        if (!inputSelect || !outputSelect) return;
        // Clear selects
        inputSelect.innerHTML = '<option value="">-- Select --</option>';
        outputSelect.innerHTML = '<option value="">-- Select --</option>';
        // A device may appear in both lists if it has inputs and outputs
        data.devices.forEach(device => {
            // Input devices
            if (device.inputs > 0) {
                const opt = document.createElement('option');
                opt.value = device.id;
                opt.textContent = `${device.name} (${device.inputs} ch)`;
                inputSelect.appendChild(opt);
            }
            // Output devices
            if (device.outputs > 0) {
                const opt = document.createElement('option');
                opt.value = device.id;
                opt.textContent = `${device.name} (${device.outputs} ch)`;
                outputSelect.appendChild(opt);
            }
        });
        // Load current settings
        const settingsRes = await fetch('/api/audio/settings');
        const settings = await settingsRes.json();
        if (settings.input_device !== null)
            inputSelect.value = settings.input_device;
        if (settings.output_device !== null)
            outputSelect.value = settings.output_device;
        // Channel settings (1-based channel numbers on the audio interface)
        const inputCh = document.getElementById('input-channel');
        const callerCh = document.getElementById('caller-channel');
        const musicCh = document.getElementById('music-channel');
        const sfxCh = document.getElementById('sfx-channel');
        if (inputCh) inputCh.value = settings.input_channel || 1;
        if (callerCh) callerCh.value = settings.caller_channel || 1;
        if (musicCh) musicCh.value = settings.music_channel || 2;
        if (sfxCh) sfxCh.value = settings.sfx_channel || 3;
        // Phone filter setting (module-level flag mirrors the checkbox)
        const phoneFilterEl = document.getElementById('phone-filter');
        if (phoneFilterEl) {
            phoneFilterEl.checked = settings.phone_filter ?? false;
            phoneFilter = phoneFilterEl.checked;
        }
        console.log('Audio devices loaded');
    } catch (err) {
        console.error('loadAudioDevices error:', err);
    }
}
/**
 * Read the audio-routing controls from the settings modal and POST them
 * to the backend. Also syncs the module-level phoneFilter flag.
 */
async function saveAudioDevices() {
    const valueOf = id => document.getElementById(id)?.value;
    const asChannel = (raw, fallback) => (raw ? parseInt(raw) : fallback);
    const inputDevice = valueOf('input-device');
    const outputDevice = valueOf('output-device');
    const filterOn = document.getElementById('phone-filter')?.checked ?? false;
    const payload = {
        input_device: inputDevice ? parseInt(inputDevice) : null,
        input_channel: asChannel(valueOf('input-channel'), 1),
        output_device: outputDevice ? parseInt(outputDevice) : null,
        caller_channel: asChannel(valueOf('caller-channel'), 1),
        music_channel: asChannel(valueOf('music-channel'), 2),
        sfx_channel: asChannel(valueOf('sfx-channel'), 3),
        phone_filter: filterOn
    };
    await fetch('/api/audio/settings', {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify(payload)
    });
    // Keep the local flag in sync with what was just persisted
    phoneFilter = filterOn;
    log('Audio routing saved');
}
// --- Callers ---
/**
 * Load the available AI callers and build one button per caller in the
 * caller grid. Also displays the current session id next to the header.
 */
async function loadCallers() {
    try {
        const res = await fetch('/api/callers');
        const data = await res.json();
        const grid = document.getElementById('callers');
        if (!grid) return;
        grid.innerHTML = '';
        data.callers.forEach(caller => {
            const btn = document.createElement('button');
            btn.className = 'caller-btn';
            btn.textContent = caller.name;
            btn.dataset.key = caller.key; // used later to mark the active caller
            btn.addEventListener('click', () => startCall(caller.key, caller.name));
            grid.appendChild(btn);
        });
        // Show session ID
        const sessionEl = document.getElementById('session-id');
        if (sessionEl && data.session_id) {
            sessionEl.textContent = `(${data.session_id})`;
        }
        console.log('Loaded', data.callers.length, 'callers, session:', data.session_id);
    } catch (err) {
        console.error('loadCallers error:', err);
    }
}
/**
 * Connect to a caller: ask the backend to start the call, then update
 * the UI (status line, hangup button, background blurb, active button).
 * Previously a failed request threw straight out of the click handler
 * while the UI was left claiming the call connected; now the response
 * is checked and failures are reported without touching call state.
 * @param {string} key  Caller identifier used by the backend.
 * @param {string} name Human-readable caller name for the UI.
 */
async function startCall(key, name) {
    if (isProcessing) return;
    try {
        const res = await fetch(`/api/call/${key}`, { method: 'POST' });
        if (!res.ok) {
            log(`Could not connect to ${name} (HTTP ${res.status})`);
            return;
        }
        const data = await res.json();
        currentCaller = { key, name };
        document.getElementById('call-status').textContent = `On call: ${name}`;
        document.getElementById('hangup-btn').disabled = false;
        // Show the caller's generated background blurb, if any
        const bgEl = document.getElementById('caller-background');
        if (bgEl && data.background) {
            bgEl.textContent = data.background;
            bgEl.classList.remove('hidden');
        }
        // Mark only the selected caller button as active
        document.querySelectorAll('.caller-btn').forEach(btn => {
            btn.classList.toggle('active', btn.dataset.key === key);
        });
        log(`Connected to ${name}`);
        clearChat();
    } catch (err) {
        log(`Call failed: ${err.message}`);
    }
}
/**
 * Start a fresh session: hang up any active call, ask the backend to
 * reset, and reload the caller list (which carries the new session id).
 */
async function newSession() {
    // End the current call first so state is clean before resetting
    if (currentCaller) {
        await hangup();
    }
    await fetch('/api/session/reset', { method: 'POST' });
    // Hide the stale caller background from the previous session
    document.getElementById('caller-background')?.classList.add('hidden');
    await loadCallers();
    log('New session started - all callers have fresh backgrounds');
}
/**
 * End the current call: cut off any speech still playing, notify the
 * backend, and reset all call-related UI to its idle state.
 */
async function hangup() {
    if (!currentCaller) return;
    // Stop in-flight TTS before ending the call itself
    await fetch('/api/tts/stop', { method: 'POST' });
    await fetch('/api/hangup', { method: 'POST' });
    log(`Hung up on ${currentCaller.name}`);
    currentCaller = null;
    isProcessing = false;
    hideStatus();
    document.getElementById('call-status').textContent = 'No active call';
    document.getElementById('hangup-btn').disabled = true;
    for (const btn of document.querySelectorAll('.caller-btn')) {
        btn.classList.remove('active');
    }
    document.getElementById('caller-background')?.classList.add('hidden');
}
// --- Server-Side Recording ---
/**
 * Push-to-talk press: tell the server to start capturing the host mic.
 * Ignored when no caller is connected or a previous utterance is still
 * being processed.
 */
async function startRecording() {
    if (!currentCaller || isProcessing) return;
    try {
        const res = await fetch('/api/record/start', { method: 'POST' });
        if (!res.ok) {
            const err = await res.json();
            log('Record error: ' + (err.detail || 'Failed to start'));
            return;
        }
        // Only flip UI state once the server confirms it is recording
        isRecording = true;
        document.getElementById('talk-btn').classList.add('recording');
        document.getElementById('talk-btn').textContent = 'Recording...';
    } catch (err) {
        log('Record error: ' + err.message);
    }
}
/**
 * Push-to-talk release: stop the server-side recording, then run the
 * full round trip - transcription -> LLM chat reply -> server-side TTS.
 * Updates the chat log and status banner at each stage; isProcessing
 * blocks overlapping round trips until this one finishes.
 */
async function stopRecording() {
    if (!isRecording) return;
    document.getElementById('talk-btn').classList.remove('recording');
    document.getElementById('talk-btn').textContent = 'Hold to Talk';
    isRecording = false;
    isProcessing = true;
    showStatus('Processing...');
    try {
        // Stop recording and get transcription
        const res = await fetch('/api/record/stop', { method: 'POST' });
        const data = await res.json();
        if (!data.text) {
            log('(No speech detected)');
            isProcessing = false;
            hideStatus();
            return;
        }
        addMessage('You', data.text);
        // Chat: send the transcription to the active caller's persona
        showStatus(`${currentCaller.name} is thinking...`);
        const chatRes = await fetch('/api/chat', {
            method: 'POST',
            headers: { 'Content-Type': 'application/json' },
            body: JSON.stringify({ text: data.text })
        });
        const chatData = await chatRes.json();
        addMessage(chatData.caller, chatData.text);
        // TTS (plays on server) - only if we have text
        if (chatData.text && chatData.text.trim()) {
            showStatus(`${currentCaller.name} is speaking...`);
            await fetch('/api/tts', {
                method: 'POST',
                headers: { 'Content-Type': 'application/json' },
                body: JSON.stringify({
                    text: chatData.text,
                    voice_id: chatData.voice_id,
                    phone_filter: phoneFilter
                })
            });
        }
    } catch (err) {
        log('Error: ' + err.message);
    }
    isProcessing = false;
    hideStatus();
}
/**
 * Keyboard alternative to push-to-talk: take the text from the Type
 * modal and run the same chat -> TTS round trip as stopRecording().
 */
async function sendTypedMessage() {
    const input = document.getElementById('type-input');
    const text = input.value.trim();
    if (!text || !currentCaller || isProcessing) return;
    // Clear and close the modal immediately so the UI feels responsive
    input.value = '';
    document.getElementById('type-modal').classList.add('hidden');
    isProcessing = true;
    addMessage('You', text);
    try {
        showStatus(`${currentCaller.name} is thinking...`);
        const chatRes = await fetch('/api/chat', {
            method: 'POST',
            headers: { 'Content-Type': 'application/json' },
            body: JSON.stringify({ text })
        });
        const chatData = await chatRes.json();
        addMessage(chatData.caller, chatData.text);
        // TTS (plays on server) - only if we have text
        if (chatData.text && chatData.text.trim()) {
            showStatus(`${currentCaller.name} is speaking...`);
            await fetch('/api/tts', {
                method: 'POST',
                headers: { 'Content-Type': 'application/json' },
                body: JSON.stringify({
                    text: chatData.text,
                    voice_id: chatData.voice_id,
                    phone_filter: phoneFilter
                })
            });
        }
    } catch (err) {
        log('Error: ' + err.message);
    }
    isProcessing = false;
    hideStatus();
}
// --- Music (Server-Side) ---
/**
 * Load the music track list from the backend and populate the
 * #track-select dropdown. Stores the list in module-level `tracks`.
 * (Cleanup: the forEach callback previously declared an index
 * parameter it never used.)
 */
async function loadMusic() {
    try {
        const res = await fetch('/api/music');
        const data = await res.json();
        tracks = data.tracks || [];
        const select = document.getElementById('track-select');
        if (!select) return;
        select.innerHTML = '';
        tracks.forEach(track => {
            const option = document.createElement('option');
            option.value = track.file;
            option.textContent = track.name;
            select.appendChild(option);
        });
        console.log('Loaded', tracks.length, 'tracks');
    } catch (err) {
        console.error('loadMusic error:', err);
    }
}
/**
 * Ask the server to start playing the track currently selected in the
 * dropdown. Does nothing if no track is selected.
 */
async function playMusic() {
    const selected = document.getElementById('track-select')?.value;
    if (!selected) return;
    await fetch('/api/music/play', {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({ track: selected, action: 'play' })
    });
}
/** Ask the server to stop music playback. */
async function stopMusic() {
    await fetch('/api/music/stop', { method: 'POST' });
}

/**
 * Volume-slider handler: map the 0-100 slider value to the 0.0-1.0
 * range the API expects and send it to the music volume endpoint.
 * @param {Event} e input event from the range control.
 */
async function setMusicVolume(e) {
    const level = e.target.value / 100;
    await fetch('/api/music/volume', {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({ track: '', action: 'volume', volume: level })
    });
}
// --- Sound Effects (Server-Side) ---
/**
 * Fetch the soundboard entries and build one trigger button per sound
 * effect. Stores the list in module-level `sounds`.
 */
async function loadSounds() {
    try {
        const res = await fetch('/api/sounds');
        const data = await res.json();
        sounds = data.sounds || [];
        const board = document.getElementById('soundboard');
        if (!board) return;
        board.innerHTML = '';
        for (const sound of sounds) {
            const btn = document.createElement('button');
            btn.className = 'sound-btn';
            btn.textContent = sound.name;
            btn.addEventListener('click', () => playSFX(sound.file));
            board.appendChild(btn);
        }
        console.log('Loaded', sounds.length, 'sounds');
    } catch (err) {
        console.error('loadSounds error:', err);
    }
}
/**
 * Trigger a sound effect on the server's SFX output channel.
 * @param {string} soundFile File name as returned by /api/sounds.
 */
async function playSFX(soundFile) {
    const body = JSON.stringify({ sound: soundFile });
    await fetch('/api/sfx/play', {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body
    });
}
// --- Settings ---
/**
 * Pull current settings from the backend and populate the settings
 * modal: LLM provider + model dropdowns (OpenRouter and Ollama) and the
 * TTS provider. Called at startup and every time the modal opens.
 */
async function loadSettings() {
    try {
        const res = await fetch('/api/settings');
        const data = await res.json();
        const providerEl = document.getElementById('provider');
        if (providerEl) providerEl.value = data.provider || 'openrouter';
        const modelSelect = document.getElementById('openrouter-model');
        if (modelSelect) {
            modelSelect.innerHTML = '';
            (data.available_openrouter_models || []).forEach(model => {
                const option = document.createElement('option');
                option.value = model;
                option.textContent = model;
                if (model === data.openrouter_model) option.selected = true;
                modelSelect.appendChild(option);
            });
        }
        const ollamaModel = document.getElementById('ollama-model');
        const ollamaHost = document.getElementById('ollama-host');
        if (ollamaHost) ollamaHost.value = data.ollama_host || 'http://localhost:11434';
        // Populate Ollama models dropdown
        if (ollamaModel) {
            ollamaModel.innerHTML = '';
            const ollamaModels = data.available_ollama_models || [];
            console.log('Ollama models from API:', ollamaModels.length, ollamaModels);
            if (ollamaModels.length === 0) {
                // Fall back to the configured model so the dropdown is never empty
                const option = document.createElement('option');
                option.value = data.ollama_model || 'llama3.2';
                option.textContent = data.ollama_model || 'llama3.2';
                ollamaModel.appendChild(option);
            } else {
                ollamaModels.forEach(model => {
                    const option = document.createElement('option');
                    option.value = model;
                    option.textContent = model;
                    if (model === data.ollama_model) option.selected = true;
                    ollamaModel.appendChild(option);
                });
            }
            console.log('Ollama dropdown options:', ollamaModel.options.length);
        } else {
            console.log('Ollama model element not found!');
        }
        // TTS provider
        const ttsProvider = document.getElementById('tts-provider');
        if (ttsProvider) ttsProvider.value = data.tts_provider || 'elevenlabs';
        // Reveal only the sub-panel for the active provider
        updateProviderUI();
        console.log('Settings loaded:', data.provider, 'TTS:', data.tts_provider);
    } catch (err) {
        console.error('loadSettings error:', err);
    }
}
/**
 * Show the settings sub-panel for the selected LLM provider and hide
 * the other one.
 */
function updateProviderUI() {
    const provider = document.getElementById('provider')?.value;
    const usingOllama = provider !== 'openrouter';
    document.getElementById('openrouter-settings')?.classList.toggle('hidden', usingOllama);
    document.getElementById('ollama-settings')?.classList.toggle('hidden', !usingOllama);
}
/**
 * Persist everything in the settings modal: audio routing first (its
 * own endpoint), then LLM provider/model and TTS provider. Closes the
 * modal when done.
 */
async function saveSettings() {
    // Save audio devices
    await saveAudioDevices();
    // Save LLM and TTS settings
    await fetch('/api/settings', {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({
            provider: document.getElementById('provider')?.value,
            openrouter_model: document.getElementById('openrouter-model')?.value,
            ollama_model: document.getElementById('ollama-model')?.value,
            ollama_host: document.getElementById('ollama-host')?.value,
            tts_provider: document.getElementById('tts-provider')?.value
        })
    });
    document.getElementById('settings-modal')?.classList.add('hidden');
    log('Settings saved');
}
// --- UI Helpers ---
/**
 * Append a chat bubble to the chat log and keep it scrolled to the
 * bottom. Falls back to console logging when the chat element is
 * missing. Security fix: the message used to be assembled with a
 * template string assigned to innerHTML, so markup in transcribed or
 * LLM-generated text was injected into the page; text is now inserted
 * via DOM text nodes and cannot execute as HTML.
 * @param {string} sender Display name ("You", caller name, "System").
 * @param {string} text   Message body, treated strictly as plain text.
 */
function addMessage(sender, text) {
    const chat = document.getElementById('chat');
    if (!chat) {
        console.log(`[${sender}]: ${text}`);
        return;
    }
    const div = document.createElement('div');
    div.className = `message ${sender === 'You' ? 'host' : 'caller'}`;
    const label = document.createElement('strong');
    label.textContent = `${sender}:`;
    div.appendChild(label);
    div.appendChild(document.createTextNode(` ${text}`));
    chat.appendChild(div);
    chat.scrollTop = chat.scrollHeight;
}
/** Remove all messages from the chat log. */
function clearChat() {
    const chat = document.getElementById('chat');
    if (chat) {
        chat.innerHTML = '';
    }
}

/** Write a system-level note into the chat log. */
function log(text) {
    addMessage('System', text);
}
/**
 * Show the status banner with the given text (e.g. "... is thinking...").
 * @param {string} text
 */
function showStatus(text) {
    const banner = document.getElementById('status');
    if (!banner) return;
    banner.textContent = text;
    banner.classList.remove('hidden');
}

/** Hide the status banner. */
function hideStatus() {
    document.getElementById('status')?.classList.add('hidden');
}
// --- Server Control & Logging ---
/**
 * Begin polling the server log endpoint once per second, with an
 * immediate first fetch so the panel fills right away.
 */
function startLogPolling() {
    fetchLogs();
    logPollInterval = setInterval(fetchLogs, 1000);
}
/**
 * Fetch the latest server log lines and render them into #server-log,
 * colorizing lines by severity/tag markers.
 *
 * Bug fix: the previous implementation only re-rendered when the
 * *number* of lines changed. Once the server buffer reaches the
 * requested cap (200 lines) the count stays constant even as new lines
 * roll in, so the display froze. A signature of count + last line is
 * compared instead (kept in the module-level `lastLogCount`, which is
 * used only by this function).
 */
async function fetchLogs() {
    try {
        const res = await fetch('/api/logs?lines=200');
        const data = await res.json();
        const logEl = document.getElementById('server-log');
        if (!logEl) return;
        // Changes whenever a new line arrives, even at the line cap
        const signature = `${data.logs.length}:${data.logs[data.logs.length - 1] || ''}`;
        if (signature !== lastLogCount) {
            lastLogCount = signature;
            logEl.innerHTML = data.logs.map(line => {
                let className = 'log-line';
                if (line.includes('Error') || line.includes('error') || line.includes('ERROR')) {
                    className += ' error';
                } else if (line.includes('Warning') || line.includes('WARNING')) {
                    className += ' warning';
                } else if (line.includes('[TTS]')) {
                    className += ' tts';
                } else if (line.includes('[Chat]')) {
                    className += ' chat';
                }
                return `<div class="${className}">${escapeHtml(line)}</div>`;
            }).join('');
            if (autoScroll) {
                logEl.scrollTop = logEl.scrollHeight;
            }
        }
    } catch (err) {
        // Server might be down (e.g. during restart); retry on next poll
        console.log('Log fetch failed (server may be restarting)');
    }
}
/**
 * HTML-escape arbitrary text by round-tripping it through a detached
 * DOM node: setting the text content and reading back innerHTML lets
 * the browser perform the escaping.
 * @param {string} text
 * @returns {string} markup-safe version of the input
 */
function escapeHtml(text) {
    const scratch = document.createElement('div');
    scratch.appendChild(document.createTextNode(text));
    return scratch.innerHTML;
}
/**
 * Restart the backend process (after confirmation), then poll
 * /api/server/status once per second until it responds again — giving
 * up after ~30 attempts — and reload settings from the fresh process.
 */
async function restartServer() {
    if (!confirm('Restart the server? This will briefly disconnect you.')) return;
    try {
        await fetch('/api/server/restart', { method: 'POST' });
        log('Server restart requested...');
        // Clear the log and wait for server to come back
        document.getElementById('server-log').innerHTML = '<div class="log-line">Restarting server...</div>';
        // Poll until server is back
        let attempts = 0;
        const checkServer = setInterval(async () => {
            attempts++;
            try {
                const res = await fetch('/api/server/status');
                if (res.ok) {
                    clearInterval(checkServer);
                    log('Server restarted successfully');
                    await loadSettings();
                }
            } catch (e) {
                // fetch throws while the server is down; give up after ~30s
                if (attempts > 30) {
                    clearInterval(checkServer);
                    log('Server did not restart - check terminal');
                }
            }
        }, 1000);
    } catch (err) {
        log('Failed to restart server: ' + err.message);
    }
}
/**
 * Ask the backend process to shut itself down (after confirmation).
 * The server must then be restarted manually from the terminal.
 */
async function stopServer() {
    if (!confirm('Stop the server? You will need to restart it manually.')) return;
    try {
        await fetch('/api/server/stop', { method: 'POST' });
        log('Server stop requested...');
        const logEl = document.getElementById('server-log');
        logEl.innerHTML = '<div class="log-line">Server stopped. Run ./run.sh to restart.</div>';
    } catch (err) {
        log('Failed to stop server: ' + err.message);
    }
}

77
generate_callers.py Normal file
View File

@@ -0,0 +1,77 @@
import os

# Bug fix: Bark reads SUNO_USE_SMALL_MODELS with plain truthiness, and
# environment variables are strings — so the string "False" is truthy and
# would actually *enable* the small models, the opposite of the intent.
# Ensure the variable is absent so Bark falls back to its default
# (full-size models). TODO confirm against the installed bark version.
os.environ.pop("SUNO_USE_SMALL_MODELS", None)

from bark import generate_audio, preload_models
from scipy.io.wavfile import write as write_wav
from scipy.signal import butter, filtfilt
import numpy as np
def phone_filter(audio, sample_rate=24000, low_hz=300.0, high_hz=3400.0):
    """Make audio sound like it came through a telephone line.

    Applies a 4th-order Butterworth band-pass (default 300-3400 Hz, the
    classic telephone voice band) with zero-phase filtering, then a soft
    tanh saturation that keeps peaks below 0.9.

    The cutoff frequencies are now parameters (previously hard-coded) so
    other "narrow band" effects can reuse this; the defaults preserve the
    original behavior exactly.

    Args:
        audio: 1-D array of audio samples.
        sample_rate: Sample rate of ``audio`` in Hz.
        low_hz: Low cutoff of the pass band in Hz.
        high_hz: High cutoff of the pass band in Hz (must be < Nyquist).

    Returns:
        Filtered samples as ``float32``, same shape as the input.
    """
    nyquist = sample_rate / 2
    b, a = butter(4, [low_hz / nyquist, high_hz / nyquist], btype='band')
    # filtfilt runs the filter forward and backward: zero phase distortion
    filtered = filtfilt(b, a, audio)
    # Soft compression for a slightly "hot" phone sound; bounds output to +/-0.9
    filtered = np.tanh(filtered * 1.5) * 0.9
    return filtered.astype(np.float32)
# Define your callers
CALLERS = [
{
"name": "caller1_mike",
"voice": "v2/en_speaker_6",
"text": """Hey, thanks for taking my call!
So I've been thinking about this a lot and...
I know it sounds crazy, but hear me out."""
},
{
"name": "caller2_sarah",
"voice": "v2/en_speaker_9",
"text": """Hi! Oh my gosh, I can't believe I got through.
Okay so... this is kind of a long story,
but basically I had this experience last week that blew my mind."""
},
{
"name": "caller3_dave",
"voice": "v2/en_speaker_1",
"text": """Yeah, hey. First time caller, long time listener.
Look, I gotta be honest with you here,
I think you're missing something important."""
},
{
"name": "caller4_jenny",
"voice": "v2/en_speaker_3",
"text": """Okay okay, so get this...
I was literally just talking about this with my friend yesterday!
And she said, and I quote, well, I can't say that on air."""
},
]
def main():
    """Synthesize a clean and a phone-filtered intro clip per caller."""
    print("Loading models...")
    preload_models()
    os.makedirs("output", exist_ok=True)

    for caller in CALLERS:
        name = caller["name"]
        print(f"\nGenerating: {name}")

        # Raw Bark synthesis using this caller's speaker preset
        audio = generate_audio(caller["text"], history_prompt=caller["voice"])

        clean_path = f"output/{name}_clean.wav"
        write_wav(clean_path, 24000, audio)

        # Telephone-bandwidth version for the on-air "caller" sound
        phone_path = f"output/{name}_phone.wav"
        write_wav(phone_path, 24000, phone_filter(audio))

        print(f"  Saved: {clean_path}")
        print(f"  Saved: {phone_path}")

    print("\nDone! Check the output/ folder.")


if __name__ == "__main__":
    main()

102
generate_sounds.py Normal file
View File

@@ -0,0 +1,102 @@
#!/usr/bin/env python3
"""
Generate sound effects using ElevenLabs Sound Effects API
"""
import os
from pathlib import Path
from dotenv import load_dotenv
load_dotenv()
# Directory where generated effects are written (created on first run).
SOUNDS_DIR = Path(__file__).parent / "sounds"
SOUNDS_DIR.mkdir(exist_ok=True)
# Sound effects to generate with descriptions
# Maps output filename -> text prompt sent to the ElevenLabs sound-effects API.
SOUND_EFFECTS = {
    'airhorn.wav': 'loud air horn blast, sports event',
    'boo.wav': 'crowd booing, disappointed audience',
    'crickets.wav': 'crickets chirping, awkward silence',
    'drumroll.wav': 'drum roll, building suspense',
    'buzzer.wav': 'game show wrong answer buzzer',
    'laugh.wav': 'audience laughing, sitcom laugh track',
    'rimshot.wav': 'ba dum tss, drum rimshot comedy',
    'sad_trombone.wav': 'sad trombone, wah wah wah failure sound',
    'phone_ring.wav': 'old telephone ringing',
    'cheer.wav': 'crowd cheering and applauding',
    'scratch.wav': 'vinyl record scratch',
    'wow.wav': 'crowd saying wow, impressed reaction',
    'fart.wav': 'comedic fart sound effect',
    'victory.wav': 'victory fanfare, triumphant horns',
    'uh_oh.wav': 'uh oh, something went wrong sound',
}
def generate_sound(name, description, duration_seconds=2.0):
    """Generate one sound effect via the ElevenLabs sound-effects API.

    Skips work if the target file already exists. The API returns MP3 data,
    which is converted to 24 kHz mono WAV with ffmpeg.

    Args:
        name: Output filename (e.g. 'airhorn.wav'), written under SOUNDS_DIR.
        description: Text prompt describing the desired sound.
        duration_seconds: Requested clip length in seconds (default 2.0,
            matching the previous hard-coded value).

    Returns:
        True on success (or if the file already existed), False on failure.
    """
    from elevenlabs.client import ElevenLabs

    output_path = SOUNDS_DIR / name
    if output_path.exists():
        print(f"{name} (already exists)")
        return True
    # Defined outside the try so the finally block can always clean it up;
    # previously a failed ffmpeg run left the temp MP3 behind.
    temp_mp3 = SOUNDS_DIR / f"temp_{name}.mp3"
    try:
        print(f"  Generating {name}: '{description}'...")
        client = ElevenLabs(api_key=os.getenv('ELEVENLABS_API_KEY'))
        # Generate sound effect (streaming response -> raw MP3 bytes)
        result = client.text_to_sound_effects.convert(
            text=description,
            duration_seconds=duration_seconds,
        )
        audio_data = b''.join(result)
        # Save as mp3 first, then convert to wav with ffmpeg
        temp_mp3.write_bytes(audio_data)
        import subprocess
        subprocess.run([
            'ffmpeg', '-y', '-i', str(temp_mp3),
            '-ar', '24000', '-ac', '1',
            str(output_path)
        ], capture_output=True, check=True)
        print(f"{name}")
        return True
    except Exception as e:
        print(f"{name} ({e})")
        return False
    finally:
        # Always remove the intermediate MP3, even when conversion fails.
        if temp_mp3.exists():
            temp_mp3.unlink()
def main():
    """Generate all configured sound effects, reporting a success count."""
    print("Generating sound effects with ElevenLabs...")
    print(f"Saving to: {SOUNDS_DIR}")
    print("(This uses your ElevenLabs credits)\n")
    # Check for ffmpeg before spending any API credits
    import subprocess
    try:
        subprocess.run(['ffmpeg', '-version'], capture_output=True, check=True)
    except (FileNotFoundError, subprocess.CalledProcessError):
        # Narrow catch: the previous bare `except:` also swallowed
        # KeyboardInterrupt/SystemExit.
        print("ERROR: ffmpeg required. Install with: brew install ffmpeg")
        return
    success = 0
    for name, description in SOUND_EFFECTS.items():
        if generate_sound(name, description):
            success += 1
    print(f"\nGenerated {success}/{len(SOUND_EFFECTS)} sounds.")


if __name__ == "__main__":
    main()

400
publish_episode.py Executable file
View File

@@ -0,0 +1,400 @@
#!/usr/bin/env python3
"""
Podcast Episode Publisher
Transcribes audio, generates metadata, and publishes to Castopod.
Usage:
python publish_episode.py /path/to/episode.mp3
python publish_episode.py /path/to/episode.mp3 --episode-number 3
python publish_episode.py /path/to/episode.mp3 --dry-run
"""
import argparse
import json
import os
import re
import subprocess
import sys
import base64
from pathlib import Path
import requests
from dotenv import load_dotenv
# Load environment variables
load_dotenv(Path(__file__).parent / ".env")
# Configuration
CASTOPOD_URL = "https://podcast.macneilmediagroup.com"
# SECURITY: credentials below default to their historical hard-coded values
# so existing deployments keep working, but they can now be overridden via
# environment variables / .env instead of living in source control.
CASTOPOD_USERNAME = os.getenv("CASTOPOD_USERNAME", "admin")
CASTOPOD_PASSWORD = os.getenv("CASTOPOD_PASSWORD", "podcast2026api")
PODCAST_ID = 1
PODCAST_HANDLE = "LukeAtTheRoost"
OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")
WHISPER_MODEL = "base"  # Options: tiny, base, small, medium, large
# NAS Configuration for chapters upload
NAS_HOST = "mmgnas-10g"
NAS_USER = "luke"
NAS_SSH_PORT = 8001
DOCKER_PATH = "/share/CACHEDEV1_DATA/.qpkg/container-station/bin/docker"
CASTOPOD_CONTAINER = "castopod-castopod-1"
MARIADB_CONTAINER = "castopod-mariadb-1"
DB_USER = os.getenv("CASTOPOD_DB_USER", "castopod")
DB_PASS = os.getenv("CASTOPOD_DB_PASS", "BYtbFfk3ndeVabb26xb0UyKU")
DB_NAME = "castopod"
def get_auth_header():
    """Build the HTTP Basic Auth header dict for the Castopod REST API."""
    raw = f"{CASTOPOD_USERNAME}:{CASTOPOD_PASSWORD}".encode()
    token = base64.b64encode(raw).decode()
    return {"Authorization": f"Basic {token}"}
def transcribe_audio(audio_path: str) -> dict:
    """Transcribe an audio file with faster-whisper.

    Returns a dict with per-segment timestamps ('segments': list of
    {'start', 'end', 'text'}), the joined transcript ('full_text'), and
    the duration in whole seconds ('duration').

    Exits the process if faster-whisper is not installed.
    """
    print(f"[1/5] Transcribing {audio_path}...")
    try:
        from faster_whisper import WhisperModel
    except ImportError:
        print("Error: faster-whisper not installed. Run: pip install faster-whisper")
        sys.exit(1)

    model = WhisperModel(WHISPER_MODEL, compute_type="int8")
    segments, info = model.transcribe(audio_path, word_timestamps=True)

    seg_records = []
    texts = []
    for seg in segments:
        cleaned = seg.text.strip()
        seg_records.append({
            "start": seg.start,
            "end": seg.end,
            "text": cleaned
        })
        texts.append(cleaned)

    print(f"  Transcribed {info.duration:.1f} seconds of audio")
    return {
        "segments": seg_records,
        "full_text": " ".join(texts),
        "duration": int(info.duration)
    }
def generate_metadata(transcript: dict, episode_number: int) -> dict:
    """Use LLM to generate title, description, and chapters from transcript.

    Args:
        transcript: Dict from transcribe_audio() with 'segments', each
            carrying 'start'/'end'/'text'.
        episode_number: Number to embed in the generated title.

    Returns:
        Dict with 'title' (str), 'description' (str), and 'chapters'
        (list of {'startTime': seconds, 'title': str}).

    Exits the process on a missing API key, an HTTP error, or an
    unparseable LLM response.
    """
    print("[2/5] Generating metadata with LLM...")
    if not OPENROUTER_API_KEY:
        print("Error: OPENROUTER_API_KEY not set in .env")
        sys.exit(1)
    # Prepare transcript with timestamps for chapter detection
    # (each line becomes "[MM:SS] text" so the model can place chapters)
    timestamped_text = ""
    for seg in transcript["segments"]:
        mins = int(seg["start"] // 60)
        secs = int(seg["start"] % 60)
        timestamped_text += f"[{mins:02d}:{secs:02d}] {seg['text']}\n"
    prompt = f"""Analyze this podcast transcript and generate metadata.
TRANSCRIPT:
{timestamped_text}
Generate a JSON response with:
1. "title": A catchy episode title (include "Episode {episode_number}:" prefix)
2. "description": A 2-4 sentence description summarizing the episode's content. Mention callers by name and their topics. End with something engaging.
3. "chapters": An array of chapter objects with "startTime" (in seconds) and "title". Include:
- "Intro" at 0 seconds
- A chapter for each caller/topic (use caller names if mentioned)
- "Outro" near the end
Respond with ONLY valid JSON, no markdown or explanation."""
    response = requests.post(
        "https://openrouter.ai/api/v1/chat/completions",
        headers={
            "Authorization": f"Bearer {OPENROUTER_API_KEY}",
            "Content-Type": "application/json"
        },
        json={
            "model": "anthropic/claude-3-haiku",
            "messages": [{"role": "user", "content": prompt}],
            "temperature": 0.7
        }
    )
    if response.status_code != 200:
        print(f"Error from OpenRouter: {response.text}")
        sys.exit(1)
    result = response.json()
    content = result["choices"][0]["message"]["content"]
    # Parse JSON from response (handle markdown code blocks):
    # strip a ```json ... ``` fence if the model ignored the instruction.
    content = content.strip()
    if content.startswith("```"):
        content = re.sub(r"^```(?:json)?\n?", "", content)
        content = re.sub(r"\n?```$", "", content)
    try:
        metadata = json.loads(content)
    except json.JSONDecodeError as e:
        print(f"Error parsing LLM response: {e}")
        print(f"Response was: {content}")
        sys.exit(1)
    print(f"  Title: {metadata['title']}")
    print(f"  Chapters: {len(metadata['chapters'])}")
    return metadata
def create_episode(audio_path: str, metadata: dict, duration: int) -> dict:
    """Upload the audio file and create a draft episode on Castopod.

    Note: `duration` is accepted for interface stability but is currently
    unused by this function.

    Exits the process if the API does not return 200/201.
    """
    print("[3/5] Creating episode on Castopod...")
    headers = get_auth_header()
    form_fields = {
        "title": metadata["title"],
        "description_markdown": metadata["description"],
        "parental_advisory": "explicit",
        "type": "full",
        "created_by": "1"
    }
    # Stream the audio file as a multipart upload alongside the form fields.
    with open(audio_path, "rb") as audio_file:
        upload = {
            "audio_file": (Path(audio_path).name, audio_file, "audio/mpeg")
        }
        response = requests.post(
            f"{CASTOPOD_URL}/api/rest/v1/podcasts/{PODCAST_ID}/episodes",
            headers=headers,
            files=upload,
            data=form_fields
        )
    if response.status_code not in (200, 201):
        print(f"Error creating episode: {response.text}")
        sys.exit(1)
    episode = response.json()
    print(f"  Created episode ID: {episode['id']}")
    print(f"  Slug: {episode['slug']}")
    return episode
def publish_episode(episode_id: int) -> dict:
    """Publish a previously created episode immediately.

    Exits the process if the API does not return 200.
    """
    print("[4/5] Publishing episode...")
    response = requests.post(
        f"{CASTOPOD_URL}/api/rest/v1/episodes/{episode_id}/publish",
        headers=get_auth_header(),
        data={
            "publication_method": "now",
            "created_by": "1"
        }
    )
    if response.status_code != 200:
        print(f"Error publishing: {response.text}")
        sys.exit(1)
    episode = response.json()
    # published_at may come back as a {'date': ...} object or a plain value;
    # handle both shapes.
    stamp = episode.get("published_at", {})
    if isinstance(stamp, dict):
        print(f"  Published at: {stamp.get('date', 'unknown')}")
    else:
        print(f"  Published at: {stamp}")
    return episode
def save_chapters(metadata: dict, output_path: str):
    """Write metadata['chapters'] to a chapters JSON file (format 1.2.0)."""
    payload = {
        "version": "1.2.0",
        "chapters": metadata["chapters"]
    }
    Path(output_path).write_text(json.dumps(payload, indent=2))
    print(f"  Chapters saved to: {output_path}")
def run_ssh_command(command: str) -> tuple[bool, str]:
    """Execute a command on the NAS over SSH.

    Returns (ok, text): ok is True iff the remote command exited 0; text is
    stripped stdout, falling back to stripped stderr when stdout is empty.
    Timeouts and local SSH failures yield (False, reason).
    """
    ssh_cmd = [
        "ssh", "-p", str(NAS_SSH_PORT),
        f"{NAS_USER}@{NAS_HOST}",
        command
    ]
    try:
        proc = subprocess.run(ssh_cmd, capture_output=True, text=True, timeout=30)
    except subprocess.TimeoutExpired:
        return False, "SSH command timed out"
    except Exception as e:
        return False, str(e)
    return proc.returncode == 0, proc.stdout.strip() or proc.stderr.strip()
def upload_chapters_to_castopod(episode_slug: str, episode_id: int, chapters_path: str) -> bool:
    """Upload chapters file to Castopod via SSH and link in database.

    Steps: copy the JSON into the Castopod container's media directory,
    register it in the cp_media table, point cp_episodes.chapters_id at the
    new row, then clear Castopod's cache. Any failure prints a warning and
    returns False; True only on full success.
    """
    print("[4.5/5] Uploading chapters to Castopod...")
    chapters_filename = f"{episode_slug}-chapters.json"
    remote_path = f"podcasts/{PODCAST_HANDLE}/{chapters_filename}"
    # Read local chapters file
    with open(chapters_path, "r") as f:
        chapters_content = f.read()
    # Base64 encode for safe transfer (avoids shell-quoting issues in the
    # remote command below)
    chapters_b64 = base64.b64encode(chapters_content.encode()).decode()
    # Upload file to container using base64 decode
    upload_cmd = f'echo "{chapters_b64}" | base64 -d | {DOCKER_PATH} exec -i {CASTOPOD_CONTAINER} tee /var/www/castopod/public/media/{remote_path} > /dev/null'
    success, output = run_ssh_command(upload_cmd)
    if not success:
        print(f"  Warning: Failed to upload chapters file: {output}")
        return False
    # Get file size
    # NOTE(review): len() of a str is the character count, not the byte
    # count -- these differ for non-ASCII chapter titles; confirm whether
    # Castopod cares about the exact byte size.
    file_size = len(chapters_content)
    # Insert into media table
    # NOTE(review): SQL is assembled with f-strings. The interpolated values
    # are generated internally (slug-derived path, int sizes/ids), but
    # parameterized queries would be safer if inputs ever become external.
    insert_sql = f"""INSERT INTO cp_media (file_key, file_size, file_mimetype, type, uploaded_by, updated_by, uploaded_at, updated_at)
VALUES ('{remote_path}', {file_size}, 'application/json', 'chapters', 1, 1, NOW(), NOW())"""
    db_cmd = f'{DOCKER_PATH} exec {MARIADB_CONTAINER} mysql -u {DB_USER} -p{DB_PASS} {DB_NAME} -e "{insert_sql}; SELECT LAST_INSERT_ID();"'
    success, output = run_ssh_command(db_cmd)
    if not success:
        print(f"  Warning: Failed to insert chapters in database: {output}")
        return False
    # Parse media ID from output (last line of mysql's tabular output)
    try:
        lines = output.strip().split('\n')
        media_id = int(lines[-1])
    except (ValueError, IndexError):
        print(f"  Warning: Could not parse media ID from: {output}")
        return False
    # Link chapters to episode
    update_sql = f"UPDATE cp_episodes SET chapters_id = {media_id} WHERE id = {episode_id}"
    db_cmd = f'{DOCKER_PATH} exec {MARIADB_CONTAINER} mysql -u {DB_USER} -p{DB_PASS} {DB_NAME} -e "{update_sql}"'
    success, output = run_ssh_command(db_cmd)
    if not success:
        print(f"  Warning: Failed to link chapters to episode: {output}")
        return False
    # Clear Castopod cache (best-effort; result intentionally ignored)
    cache_cmd = f'{DOCKER_PATH} exec {CASTOPOD_CONTAINER} php spark cache:clear'
    run_ssh_command(cache_cmd)
    print(f"  Chapters uploaded and linked (media_id: {media_id})")
    return True
def get_next_episode_number() -> int:
    """Return 1 + the highest existing episode number.

    Falls back to 1 when the API call fails or no episodes exist yet.
    """
    response = requests.get(
        f"{CASTOPOD_URL}/api/rest/v1/podcasts/{PODCAST_ID}/episodes",
        headers=get_auth_header()
    )
    if response.status_code != 200:
        return 1
    episodes = response.json()
    if not episodes:
        return 1
    return max(ep.get("number", 0) for ep in episodes) + 1
def main():
    """CLI entry point: transcribe -> metadata -> create -> publish -> chapters."""
    parser = argparse.ArgumentParser(description="Publish podcast episode to Castopod")
    parser.add_argument("audio_file", help="Path to the audio file (MP3)")
    parser.add_argument("--episode-number", "-n", type=int, help="Episode number (auto-detected if not provided)")
    parser.add_argument("--dry-run", "-d", action="store_true", help="Generate metadata but don't publish")
    parser.add_argument("--title", "-t", help="Override generated title")
    parser.add_argument("--description", help="Override generated description")
    args = parser.parse_args()
    audio_path = Path(args.audio_file).expanduser().resolve()
    if not audio_path.exists():
        print(f"Error: Audio file not found: {audio_path}")
        sys.exit(1)
    # Determine episode number (explicit flag wins; otherwise query Castopod)
    if args.episode_number:
        episode_number = args.episode_number
    else:
        episode_number = get_next_episode_number()
    print(f"Episode number: {episode_number}")
    # Step 1: Transcribe
    transcript = transcribe_audio(str(audio_path))
    # Step 2: Generate metadata
    metadata = generate_metadata(transcript, episode_number)
    # Apply overrides (CLI flags win over LLM output)
    if args.title:
        metadata["title"] = args.title
    if args.description:
        metadata["description"] = args.description
    # Save chapters file next to the audio (episode.mp3 -> episode.chapters.json)
    chapters_path = audio_path.with_suffix(".chapters.json")
    save_chapters(metadata, str(chapters_path))
    if args.dry_run:
        print("\n[DRY RUN] Would publish with:")
        print(f"  Title: {metadata['title']}")
        print(f"  Description: {metadata['description']}")
        print(f"  Chapters: {json.dumps(metadata['chapters'], indent=2)}")
        print("\nChapters file saved. Run without --dry-run to publish.")
        return
    # Step 3: Create episode
    episode = create_episode(str(audio_path), metadata, transcript["duration"])
    # Step 4: Publish
    episode = publish_episode(episode["id"])
    # Step 4.5: Upload chapters via SSH (best-effort; failure is non-fatal)
    chapters_uploaded = upload_chapters_to_castopod(
        episode["slug"],
        episode["id"],
        str(chapters_path)
    )
    # Step 5: Summary
    print("\n[5/5] Done!")
    print("=" * 50)
    print(f"Episode URL: {CASTOPOD_URL}/@{PODCAST_HANDLE}/episodes/{episode['slug']}")
    print(f"RSS Feed: {CASTOPOD_URL}/@{PODCAST_HANDLE}/feed.xml")
    print("=" * 50)
    if not chapters_uploaded:
        print("\nNote: Chapters upload failed. Add manually via Castopod admin UI")
        print(f"  Chapters file: {chapters_path}")


if __name__ == "__main__":
    main()

1553
radio_show.py Normal file

File diff suppressed because it is too large Load Diff

140
radio_simple.py Normal file
View File

@@ -0,0 +1,140 @@
#!/usr/bin/env python3
"""
Simplified Radio Show - for debugging
"""
import os
import sys
from pathlib import Path
import numpy as np
import sounddevice as sd
import soundfile as sf
from faster_whisper import WhisperModel
from scipy.signal import butter, filtfilt
from dotenv import load_dotenv
load_dotenv()
# All audio in this script is handled at 24 kHz.
SAMPLE_RATE = 24000
# Caller personas keyed by the keyboard shortcut used to select them.
# Each value is (display name, ElevenLabs voice_id, LLM system prompt).
CALLERS = {
    "1": ("Big Tony", "IKne3meq5aSn9XLyUdCD", "You are Big Tony, a loud Italian guy from Staten Island. Swear naturally, be opinionated. Keep it to 2 sentences."),
    "2": ("Drunk Diane", "FGY2WhTYpPnrIDTdsKH5", "You are Drunk Diane, tipsy woman at a bar. Ramble a bit, be funny. Keep it to 2 sentences."),
    "3": ("Stoner Phil", "bIHbv24MWmeRgasZH58o", "You are Stoner Phil, super chill stoner dude. Speak slow, be spacey but profound. Keep it to 2 sentences."),
}
def phone_filter(audio):
    """Band-limit audio to the 300-3400 Hz telephone range and soft-clip it.

    Returns a flattened float32 waveform scaled to at most 0.8 full scale.
    """
    nyq = SAMPLE_RATE / 2
    b, a = butter(4, [300 / nyq, 3400 / nyq], btype='band')
    shaped = filtfilt(b, a, audio.flatten())
    return (np.tanh(shaped * 1.5) * 0.8).astype(np.float32)
class SimpleRadio:
    """Minimal interactive radio loop: record -> transcribe -> LLM -> TTS.

    Wires together Whisper (speech-to-text), Ollama (caller persona replies),
    and ElevenLabs (text-to-speech) behind a simple keyboard-driven console.
    """

    def __init__(self):
        print("Loading Whisper...")
        self.whisper = WhisperModel("base", device="cpu", compute_type="int8")
        print("Connecting to ElevenLabs...")
        from elevenlabs.client import ElevenLabs
        self.tts = ElevenLabs(api_key=os.getenv("ELEVENLABS_API_KEY"))
        print("Connecting to Ollama...")
        import ollama
        self.ollama = ollama
        # Active persona tuple (name, voice_id, system prompt); see CALLERS.
        self.caller = CALLERS["1"]
        # Rolling chat history; reset whenever the caller changes.
        self.history = []
        print("\nReady!\n")

    def record(self):
        """Capture microphone audio until the user presses Enter.

        Returns the captured samples stacked into one array, or None if
        nothing was captured.
        """
        print("  [Recording - press Enter to stop]")
        chunks = []
        recording = True

        def callback(indata, frames, time, status):
            if recording:
                chunks.append(indata.copy())

        with sd.InputStream(samplerate=SAMPLE_RATE, channels=1, callback=callback):
            input()  # Block until Enter
            # Stop buffering *before* the stream tears down; previously the
            # flag was cleared after the with-block, so it never took effect
            # and trailing frames could be appended while the stream drained.
            recording = False
        return np.vstack(chunks) if chunks else None

    def transcribe(self, audio):
        """Resample captured audio to 16 kHz and run Whisper on it."""
        import librosa
        audio_16k = librosa.resample(audio.flatten().astype(np.float32),
                                     orig_sr=SAMPLE_RATE, target_sr=16000)
        segments, _ = self.whisper.transcribe(audio_16k)
        return " ".join([s.text for s in segments]).strip()

    def respond(self, text):
        """Record the host's line, get the caller's reply from Ollama, and
        append both to the rolling history (last 6 turns are sent)."""
        self.history.append({"role": "user", "content": text})
        response = self.ollama.chat(
            model="llama3.2:latest",
            messages=[{"role": "system", "content": self.caller[2]}] + self.history[-6:],
            options={"temperature": 0.9}
        )
        reply = response["message"]["content"]
        self.history.append({"role": "assistant", "content": reply})
        return reply

    def speak(self, text):
        """Synthesize text with ElevenLabs, phone-filter it, and play it."""
        print("  [Generating voice...]")
        audio_gen = self.tts.text_to_speech.convert(
            voice_id=self.caller[1],
            text=text,
            model_id="eleven_turbo_v2_5",
            output_format="pcm_24000"
        )
        audio_bytes = b"".join(audio_gen)
        # PCM16 -> float32 in [-1, 1)
        audio = np.frombuffer(audio_bytes, dtype=np.int16).astype(np.float32) / 32768.0
        filtered = phone_filter(audio)
        print("  [Playing...]")
        sd.play(filtered, SAMPLE_RATE)
        sd.wait()

    def run(self):
        """Interactive command loop (blocks until 'q')."""
        print("=" * 50)
        print("  SIMPLE RADIO - Type commands:")
        print("  1/2/3 = switch caller")
        print("  r = record & respond")
        print("  t = type message (skip recording)")
        print("  q = quit")
        print("=" * 50)
        print(f"\nCaller: {self.caller[0]}\n")
        while True:
            cmd = input("> ").strip().lower()
            if cmd == 'q':
                break
            elif cmd in CALLERS:
                # BUGFIX: was `cmd in '123'`, which is substring matching --
                # true for '' (bare Enter) and for '12'/'23'/'123' -- and then
                # crashed with KeyError on CALLERS[cmd]. Dict membership
                # matches exactly the valid keys.
                self.caller = CALLERS[cmd]
                self.history = []
                print(f"\n📞 Switched to: {self.caller[0]}\n")
            elif cmd == 'r':
                audio = self.record()
                if audio is not None:
                    print("  [Transcribing...]")
                    text = self.transcribe(audio)
                    print(f"\n  YOU: {text}\n")
                    if text:
                        print("  [Thinking...]")
                        reply = self.respond(text)
                        print(f"\n  📞 {self.caller[0].upper()}: {reply}\n")
                        self.speak(reply)
            elif cmd == 't':
                text = input("  Type message: ")
                if text:
                    print("  [Thinking...]")
                    reply = self.respond(text)
                    print(f"\n  📞 {self.caller[0].upper()}: {reply}\n")
                    self.speak(reply)
            else:
                print("  Commands: r=record, t=type, 1/2/3=caller, q=quit")


if __name__ == "__main__":
    radio = SimpleRadio()
    radio.run()

16
requirements-web.txt Normal file
View File

@@ -0,0 +1,16 @@
# Web application requirements (in addition to existing radio_show.py deps)
fastapi>=0.109.0
uvicorn[standard]>=0.27.0
python-multipart>=0.0.6
websockets>=12.0
httpx>=0.26.0
pydantic-settings>=2.1.0
# Already installed for CLI (but listed for completeness):
# faster-whisper
# elevenlabs
# numpy
# scipy
# librosa
# soundfile
# python-dotenv

60
run.sh Executable file
View File

@@ -0,0 +1,60 @@
#!/bin/bash
# AI Radio Show - Server Runner with restart support
#
# Supervises the uvicorn server in a loop. Two flag files control it from
# outside (e.g. from the web UI or another shell):
#   touch $RESTART_FLAG  -> restart the server
#   touch $STOP_FLAG     -> stop the server and exit the runner

LOG_FILE="/tmp/ai-radio-show.log"
RESTART_FLAG="/tmp/ai-radio-show.restart"
STOP_FLAG="/tmp/ai-radio-show.stop"

cd "$(dirname "$0")"

# Activate virtual environment
source venv/bin/activate

# Cleanup old flags
rm -f "$RESTART_FLAG" "$STOP_FLAG"

echo "AI Radio Show Server Runner"
echo "Log file: $LOG_FILE"
echo "Press Ctrl+C to stop"
echo ""

while true; do
    echo "[$(date)] Starting server..." | tee -a "$LOG_FILE"

    # BUGFIX: log via process substitution instead of `python ... | tee &`.
    # With a backgrounded pipeline, $! is the PID of the *last* command
    # (tee), so the kill below stopped the logger while leaving uvicorn
    # running and holding port 8000 -- restarts then failed to bind.
    python -m uvicorn backend.main:app --host 0.0.0.0 --port 8000 > >(tee -a "$LOG_FILE") 2>&1 &
    SERVER_PID=$!

    # Wait for server to exit or restart signal
    while kill -0 $SERVER_PID 2>/dev/null; do
        if [ -f "$RESTART_FLAG" ]; then
            echo "[$(date)] Restart requested..." | tee -a "$LOG_FILE"
            rm -f "$RESTART_FLAG"
            kill $SERVER_PID 2>/dev/null
            wait $SERVER_PID 2>/dev/null
            sleep 1
            break
        fi
        if [ -f "$STOP_FLAG" ]; then
            echo "[$(date)] Stop requested..." | tee -a "$LOG_FILE"
            rm -f "$STOP_FLAG"
            kill $SERVER_PID 2>/dev/null
            wait $SERVER_PID 2>/dev/null
            echo "[$(date)] Server stopped." | tee -a "$LOG_FILE"
            exit 0
        fi
        sleep 1
    done

    # Check if we should restart or exit (server may have died on its own
    # right after a stop request)
    if [ -f "$STOP_FLAG" ]; then
        rm -f "$STOP_FLAG"
        echo "[$(date)] Server stopped." | tee -a "$LOG_FILE"
        exit 0
    fi

    echo "[$(date)] Restarting in 2 seconds..." | tee -a "$LOG_FILE"
    sleep 2
done

37
test.html Normal file
View File

@@ -0,0 +1,37 @@
<!DOCTYPE html>
<!-- Standalone smoke-test page: verifies the three frontend bundles load
     and that DOM event wiring works, without starting the full app. -->
<html>
<head>
    <title>Test JavaScript Loading</title>
</head>
<body>
    <h1>JavaScript Test</h1>
    <button id="test-btn">Test Button</button>
    <div id="output"></div>
    <script src="frontend/js/audio.js"></script>
    <script src="frontend/js/websocket.js"></script>
    <script src="frontend/js/app.js"></script>
    <script>
        // Test if the classes loaded
        document.addEventListener('DOMContentLoaded', function() {
            const output = document.getElementById('output');
            // AudioManager should be defined by frontend/js/audio.js
            if (typeof AudioManager !== 'undefined') {
                output.innerHTML += '<p>✓ AudioManager loaded</p>';
            } else {
                output.innerHTML += '<p>✗ AudioManager failed to load</p>';
            }
            // RadioShowApp should be defined by frontend/js/app.js
            if (typeof RadioShowApp !== 'undefined') {
                output.innerHTML += '<p>✓ RadioShowApp loaded</p>';
            } else {
                output.innerHTML += '<p>✗ RadioShowApp failed to load</p>';
            }
            // A working click handler proves event listeners attach correctly
            document.getElementById('test-btn').addEventListener('click', function() {
                output.innerHTML += '<p>Button click works!</p>';
            });
        });
    </script>
</body>
</html>