Initial commit: AI Radio Show web application
- FastAPI backend with multiple TTS providers (Inworld, ElevenLabs, Kokoro, F5-TTS, etc.) - Web frontend with caller management, music, and soundboard - Whisper transcription integration - OpenRouter/Ollama LLM support - Castopod podcast publishing script Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
54
.gitignore
vendored
Normal file
54
.gitignore
vendored
Normal file
@@ -0,0 +1,54 @@
|
||||
# Environment
|
||||
.env
|
||||
*.env
|
||||
|
||||
# Python
|
||||
__pycache__/
|
||||
*.py[cod]
|
||||
*$py.class
|
||||
.venv/
|
||||
venv/
|
||||
env/
|
||||
*.egg-info/
|
||||
|
||||
# Audio/Media (large files)
|
||||
*.mp3
|
||||
*.wav
|
||||
*.m4a
|
||||
*.ogg
|
||||
|
||||
# Sessions
|
||||
sessions/
|
||||
|
||||
# IDE
|
||||
.idea/
|
||||
.vscode/
|
||||
*.swp
|
||||
*.swo
|
||||
|
||||
# OS
|
||||
.DS_Store
|
||||
Thumbs.db
|
||||
|
||||
# Whisper models (downloaded automatically)
|
||||
*.pt
|
||||
|
||||
# Temporary
|
||||
*.tmp
|
||||
*.log
|
||||
|
||||
# Large model files (download separately)
|
||||
*.onnx
|
||||
*.safetensors
|
||||
*.tar.bz2
|
||||
*.bin
|
||||
models/
|
||||
asset/
|
||||
kokoro-v1.0.onnx
|
||||
voices-v1.0.bin
|
||||
|
||||
# Reference voices for TTS
|
||||
ref_audio/
|
||||
|
||||
# Claude settings (local)
|
||||
.claude/
|
||||
9
audio_settings.json
Normal file
9
audio_settings.json
Normal file
@@ -0,0 +1,9 @@
|
||||
{
|
||||
"input_device": 13,
|
||||
"input_channel": 1,
|
||||
"output_device": 13,
|
||||
"caller_channel": 3,
|
||||
"music_channel": 5,
|
||||
"sfx_channel": 7,
|
||||
"phone_filter": false
|
||||
}
|
||||
1
backend/__init__.py
Normal file
1
backend/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
# Backend package
|
||||
41
backend/config.py
Normal file
41
backend/config.py
Normal file
@@ -0,0 +1,41 @@
|
||||
"""Configuration settings for the AI Radio Show backend"""
|
||||
|
||||
import os
|
||||
from pathlib import Path
|
||||
from pydantic_settings import BaseSettings
|
||||
from dotenv import load_dotenv
|
||||
|
||||
# Load .env from parent directory
|
||||
load_dotenv(Path(__file__).parent.parent / ".env")
|
||||
|
||||
|
||||
class Settings(BaseSettings):
    """Application configuration, sourced from the environment / .env file."""

    # API Keys (empty string when the variable is unset)
    elevenlabs_api_key: str = os.getenv("ELEVENLABS_API_KEY", "")
    openrouter_api_key: str = os.getenv("OPENROUTER_API_KEY", "")
    inworld_api_key: str = os.getenv("INWORLD_API_KEY", "")

    # LLM Settings
    llm_provider: str = "openrouter"  # "openrouter" or "ollama"
    openrouter_model: str = "anthropic/claude-3-haiku"
    ollama_model: str = "llama3.2"
    ollama_host: str = "http://localhost:11434"

    # TTS Settings
    tts_provider: str = "kokoro"  # "kokoro", "elevenlabs", "vits", or "bark"

    # Audio Settings
    sample_rate: int = 24000  # Hz, used for TTS playback

    # Paths (project root is one level above this package)
    base_dir: Path = Path(__file__).parent.parent
    sounds_dir: Path = base_dir / "sounds"
    music_dir: Path = base_dir / "music"
    sessions_dir: Path = base_dir / "sessions"

    class Config:
        # pydantic-settings also reads .env directly; unknown keys are ignored
        # so extra environment entries don't raise validation errors.
        env_file = ".env"
        extra = "ignore"


# Shared singleton imported by the rest of the backend.
settings = Settings()
|
||||
787
backend/main.py
Normal file
787
backend/main.py
Normal file
@@ -0,0 +1,787 @@
|
||||
"""AI Radio Show - Control Panel Backend"""
|
||||
|
||||
import uuid
|
||||
import asyncio
|
||||
from pathlib import Path
|
||||
from fastapi import FastAPI, HTTPException, UploadFile, File
|
||||
from fastapi.staticfiles import StaticFiles
|
||||
from fastapi.responses import FileResponse
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
from pydantic import BaseModel
|
||||
from typing import Optional
|
||||
|
||||
from .config import settings
|
||||
from .services.transcription import transcribe_audio
|
||||
from .services.llm import llm_service
|
||||
from .services.tts import generate_speech
|
||||
from .services.audio import audio_service
|
||||
|
||||
app = FastAPI(title="AI Radio Show")

# CORS is wide open (any origin, method, header). Acceptable for a
# local-only control panel; NOTE(review): lock down allow_origins before
# exposing this server beyond localhost.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
|
||||
|
||||
# --- Callers ---
|
||||
# Base caller info (name, voice) - backgrounds generated dynamically per session
|
||||
import random
|
||||
|
||||
# Caller roster keyed by the phone-pad digit used to "dial" them.
# Each entry pins the stable identity: display name, TTS voice id
# (presumably ElevenLabs voice ids — confirm against the TTS service),
# gender (selects the job pool), and a plausible age range. The rest of
# the persona is generated fresh per session by generate_caller_background().
CALLER_BASES = {
    "1": {"name": "Tony", "voice": "VR6AewLTigWG4xSOukaG", "gender": "male", "age_range": (35, 55)},
    "2": {"name": "Jasmine", "voice": "jBpfuIE2acCO8z3wKNLl", "gender": "female", "age_range": (25, 38)},
    "3": {"name": "Rick", "voice": "TxGEqnHWrfWFTfGW9XjX", "gender": "male", "age_range": (40, 58)},
    "4": {"name": "Megan", "voice": "EXAVITQu4vr4xnSDxMaL", "gender": "female", "age_range": (24, 35)},
    "5": {"name": "Dennis", "voice": "pNInz6obpgDQGcFmaJgB", "gender": "male", "age_range": (32, 48)},
    "6": {"name": "Tanya", "voice": "21m00Tcm4TlvDq8ikWAM", "gender": "female", "age_range": (30, 45)},
    "7": {"name": "Earl", "voice": "ODq5zmih8GrVes37Dizd", "gender": "male", "age_range": (58, 72)},
    "8": {"name": "Carla", "voice": "XB0fDUnXU5powFXDhCwa", "gender": "female", "age_range": (38, 52)},
    "9": {"name": "Marcus", "voice": "IKne3meq5aSn9XLyUdCD", "gender": "male", "age_range": (24, 34)},
    "0": {"name": "Brenda", "voice": "pFZP5JQG7iQjIQuC4Bku", "gender": "female", "age_range": (45, 60)},
}
|
||||
|
||||
# Background components for dynamic generation
|
||||
JOBS_MALE = [
|
||||
"runs a small HVAC business", "works as a long-haul trucker", "is a high school football coach",
|
||||
"works construction, mostly commercial jobs", "is a paramedic", "manages a warehouse",
|
||||
"is a line cook at a decent restaurant", "works IT for the city", "is a union electrician",
|
||||
"owns a small landscaping company", "is a cop, 12 years on the force", "works at a car dealership",
|
||||
"is a freelance photographer", "teaches middle school history", "is a firefighter",
|
||||
"works as a hospital security guard", "runs a food truck", "is a session musician",
|
||||
"works at a brewery", "is a physical therapist", "drives for UPS", "is a tattoo artist",
|
||||
"works in insurance, hates it", "is a youth pastor", "manages a gym",
|
||||
]
|
||||
|
||||
JOBS_FEMALE = [
|
||||
"works as an ER nurse", "is a social worker", "runs a small bakery", "is a dental hygienist",
|
||||
"works in HR for a hospital", "is a real estate agent", "teaches kindergarten",
|
||||
"works as a bartender at a nice place", "is a paralegal", "runs a daycare out of her home",
|
||||
"works retail management", "is a hairstylist, owns her chair", "is a vet tech",
|
||||
"works in hospital billing", "is a massage therapist", "manages a restaurant",
|
||||
"is a flight attendant", "works as a 911 dispatcher", "is a personal trainer",
|
||||
"works at a nonprofit", "is an accountant at a small firm", "does medical transcription from home",
|
||||
"is a court reporter", "works in pharmaceutical sales", "is a wedding planner",
|
||||
]
|
||||
|
||||
PROBLEMS = [
|
||||
# Family drama
|
||||
"hasn't talked to their father in years and just got a call that he's dying",
|
||||
"found out they were adopted and doesn't know how to process it",
|
||||
"is being pressured to take care of an aging parent who was never there for them",
|
||||
"just discovered a family secret that changes everything they thought they knew",
|
||||
"has a sibling who's destroying themselves and nobody will intervene",
|
||||
"is estranged from their kids and it's killing them",
|
||||
"found out their parent had a whole other family nobody knew about",
|
||||
"is watching their parents' marriage fall apart after 40 years",
|
||||
|
||||
# Career and purpose
|
||||
"woke up and realized they've been in the wrong career for 15 years",
|
||||
"got passed over for a promotion they deserved and is questioning everything",
|
||||
"has a dream they gave up on years ago and it's haunting them",
|
||||
"is successful on paper but feels completely empty inside",
|
||||
"hates their job but can't afford to leave and it's breaking them",
|
||||
"just got fired and doesn't know who they are without their work",
|
||||
"is being asked to do something unethical at work and doesn't know what to do",
|
||||
"watches their boss take credit for everything and is losing their mind",
|
||||
|
||||
# Mental health and inner struggles
|
||||
"has been putting on a brave face but is barely holding it together",
|
||||
"can't shake the feeling that their best years are behind them",
|
||||
"keeps self-sabotaging every good thing in their life and doesn't know why",
|
||||
"has been numb for months and is starting to scare themselves",
|
||||
"can't stop comparing themselves to everyone else and it's destroying them",
|
||||
"has intrusive thoughts they've never told anyone about",
|
||||
"feels like a fraud and is waiting to be found out",
|
||||
"is exhausted from being the strong one for everyone else",
|
||||
|
||||
# Grief and loss
|
||||
"lost someone close and hasn't really dealt with it",
|
||||
"is grieving someone who's still alive but is no longer the person they knew",
|
||||
"never got closure with someone who died and it's eating at them",
|
||||
"is watching their best friend slowly die and doesn't know how to be there",
|
||||
"had a miscarriage nobody knows about and carries it alone",
|
||||
|
||||
# Regrets and past mistakes
|
||||
"made a choice years ago that changed everything and wonders what if",
|
||||
"hurt someone badly and never apologized, and it haunts them",
|
||||
"let the one that got away go and thinks about them constantly",
|
||||
"gave up on something important to make someone else happy and resents it",
|
||||
"said something they can never take back and the guilt won't fade",
|
||||
"was a bully growing up and is finally reckoning with it",
|
||||
|
||||
# Relationships (non-sexual)
|
||||
"is falling out of love with their spouse and doesn't know what to do",
|
||||
"married the wrong person and everyone knows it but them",
|
||||
"feels invisible in their own relationship",
|
||||
"is staying for the kids but dying inside",
|
||||
"realized they don't actually like their partner as a person",
|
||||
"is jealous of their partner's success and it's poisoning everything",
|
||||
"found out their partner has been lying about something big",
|
||||
|
||||
# Friendship and loneliness
|
||||
"realized they don't have any real friends, just people who need things from them",
|
||||
"had a falling out with their best friend and the silence is deafening",
|
||||
"is surrounded by people but has never felt more alone",
|
||||
"is jealous of a friend's life and hates themselves for it",
|
||||
"suspects a close friend is talking shit behind their back",
|
||||
|
||||
# Big life decisions
|
||||
"is thinking about leaving everything behind and starting over somewhere new",
|
||||
"has to make a choice that will hurt someone no matter what",
|
||||
"is being pressured into something they don't want but can't say no",
|
||||
"has been offered an opportunity that would change everything but they're terrified",
|
||||
"knows they need to end something but can't pull the trigger",
|
||||
|
||||
# Addiction and bad habits
|
||||
"is hiding how much they drink from everyone",
|
||||
"can't stop gambling and is in deeper than anyone knows",
|
||||
"is watching themselves become someone they don't recognize",
|
||||
"keeps making the same mistake over and over expecting different results",
|
||||
|
||||
# Attraction and affairs (keep some of the original)
|
||||
"is attracted to someone they shouldn't be and it's getting harder to ignore",
|
||||
"has been seeing {affair_person} on the side",
|
||||
"caught feelings for someone at work and it's fucking everything up",
|
||||
|
||||
# Sexual/desire (keep some but less dominant)
|
||||
"can't stop thinking about {fantasy_subject}",
|
||||
"discovered something about their own desires that surprised them",
|
||||
"is questioning their sexuality after something that happened recently",
|
||||
|
||||
# General late-night confessions
|
||||
"can't sleep and has been thinking too much about their life choices",
|
||||
"had a weird day and needs to process it with someone",
|
||||
"has been keeping a secret that's eating them alive",
|
||||
"finally ready to admit something they've never said out loud",
|
||||
]
|
||||
|
||||
PROBLEM_FILLS = {
|
||||
"time": ["a few weeks", "months", "six months", "a year", "way too long"],
|
||||
# Affairs (all adults)
|
||||
"affair_person": ["their partner's best friend", "a coworker", "their ex", "a neighbor", "their boss", "their trainer", "someone they met online", "an old flame"],
|
||||
# Fantasies and kinks (consensual adult stuff)
|
||||
"fantasy_subject": ["a threesome", "being dominated", "dominating someone", "their partner with someone else", "a specific coworker", "group sex", "rough sex", "being watched", "exhibitionism"],
|
||||
"kink": ["anal", "BDSM", "roleplay", "a threesome", "toys", "being tied up", "public sex", "swinging", "filming themselves", "bondage"],
|
||||
# Secret behaviors (legal adult stuff)
|
||||
"secret_behavior": ["hooking up with strangers", "sexting people online", "using dating apps behind their partner's back", "having an affair", "going to sex clubs", "watching way too much porn"],
|
||||
"double_life": ["vanilla at home, freak elsewhere", "straight to their family, not so much in private", "married but on dating apps", "in a relationship but seeing other people"],
|
||||
"hookup_person": ["their roommate", "a coworker", "their ex", "a friend's spouse", "a stranger from an app", "multiple people", "someone from the gym"],
|
||||
# Discovery and identity (adult experiences)
|
||||
"new_discovery": ["the same sex", "being submissive", "being dominant", "kink", "casual sex", "exhibitionism", "that they're bi"],
|
||||
"unexpected_person": ["the same sex for the first time", "more than one person", "a complete stranger", "someone they never expected to be attracted to", "a friend"],
|
||||
"sexuality_trigger": ["a specific hookup", "watching certain porn", "a drunk encounter", "realizing they're attracted to a friend", "an unexpected experience"],
|
||||
"first_time": ["anal", "a threesome", "same-sex stuff", "BDSM", "an open relationship", "casual hookups", "being dominant", "being submissive"],
|
||||
# Relationship issues
|
||||
"partner_wants": ["an open relationship", "to bring someone else in", "things they're not sure about", "to watch them with someone else", "to try new things"],
|
||||
"caught_doing": ["sexting someone", "on a dating app", "watching porn they'd never admit to", "flirting with someone else", "looking at someone's pics"],
|
||||
# Attractions (appropriate adult scenarios)
|
||||
"taboo_fantasy": ["someone they work with", "a friend's partner", "a specific scenario", "something they've never said out loud"],
|
||||
"taboo_attraction": ["someone they work with", "a friend's partner", "their partner's friend", "someone they see all the time"],
|
||||
}
|
||||
|
||||
INTERESTS = [
|
||||
# General interests (normal people)
|
||||
"really into true crime podcasts", "watches a lot of reality TV", "into fitness",
|
||||
"follows sports", "big movie person", "reads a lot", "into music, has opinions",
|
||||
"goes out a lot, active social life", "homebody, prefers staying in",
|
||||
"into cooking and food", "outdoorsy type", "gamer", "works a lot, career focused",
|
||||
# Relationship/psychology focused
|
||||
"listens to relationship podcasts", "has done therapy, believes in it",
|
||||
"reads about psychology and why people do what they do", "very online, knows all the discourse",
|
||||
"into self-improvement stuff", "follows dating advice content",
|
||||
# Sexually open (not the focus, but present)
|
||||
"sex-positive, doesn't judge", "has experimented, open about it",
|
||||
"comfortable with their body", "has stories if you ask",
|
||||
]
|
||||
|
||||
QUIRKS = [
|
||||
# Conversational style
|
||||
"says 'honestly' and 'I mean' a lot", "trails off when thinking, then picks back up",
|
||||
"laughs nervously when things get real", "very direct, doesn't sugarcoat",
|
||||
"rambles a bit when nervous", "gets quiet when the topic hits close to home",
|
||||
"deflects with humor when uncomfortable", "asks the host questions back",
|
||||
# Openness about sex
|
||||
"comfortable talking about sex when it comes up", "no shame about their desires",
|
||||
"gets more explicit as they get comfortable", "treats sex like a normal topic",
|
||||
"will share details if you ask", "surprisingly open once they start talking",
|
||||
"has stories they've never told anyone", "testing how the host reacts before going deeper",
|
||||
# Personality
|
||||
"self-aware about their own bullshit", "confessional, needed to tell someone",
|
||||
"a little drunk and honest because of it", "can't believe they're saying this out loud",
|
||||
]
|
||||
|
||||
LOCATIONS = [
|
||||
"outside Chicago", "in Phoenix", "near Atlanta", "in the Detroit area", "outside Boston",
|
||||
"in North Jersey", "near Austin", "in the Bay Area", "outside Philadelphia", "in Denver",
|
||||
"near Seattle", "in South Florida", "outside Nashville", "in Cleveland", "near Portland",
|
||||
"in the Twin Cities", "outside Dallas", "in Baltimore", "near Sacramento", "in Pittsburgh",
|
||||
]
|
||||
|
||||
|
||||
def generate_caller_background(base: dict) -> str:
    """Build a one-off personality blurb for a caller.

    Combines a random age within the caller's range, a job drawn from the
    gender-matched pool, a location, a problem template (with any
    {placeholder} tokens filled in from PROBLEM_FILLS), two interests and
    two conversational quirks into a single descriptive string.
    """
    age = random.randint(*base["age_range"])
    job_pool = JOBS_MALE if base["gender"] == "male" else JOBS_FEMALE
    job = random.choice(job_pool)
    location = random.choice(LOCATIONS)

    # Pick a problem and substitute any template placeholders it contains.
    problem = random.choice(PROBLEMS)
    for key, options in PROBLEM_FILLS.items():
        token = "{" + key + "}"
        if token in problem:
            problem = problem.replace(token, random.choice(options))

    interest1, interest2 = random.sample(INTERESTS, 2)
    quirk1, quirk2 = random.sample(QUIRKS, 2)

    return (
        f"{age}, {job} {location}. {problem.capitalize()}. "
        f"{interest1.capitalize()}, {interest2}. "
        f"{quirk1.capitalize()}, {quirk2}."
    )
|
||||
|
||||
def get_caller_prompt(caller: dict, conversation_summary: str = "") -> str:
    """Assemble the LLM system prompt for the active caller.

    The prompt combines the caller's identity and generated background
    ("vibe") with fixed style/safety instructions; when a conversation
    summary is supplied it is spliced in so the model keeps continuity.
    """
    if conversation_summary:
        context = (
            "\nCONVERSATION SO FAR:\n"
            f"{conversation_summary}\n"
            "Continue naturally. Don't repeat yourself.\n"
        )
    else:
        context = ""

    name = caller['name']
    vibe = caller['vibe']

    return f"""You're {name}, calling a late-night radio show. You trust this host.

{vibe}
{context}
HOW TO TALK:
- Sound like a real person chatting, not writing.
- Keep responses to 2-3 sentences. Enough to make your point, short enough for back-and-forth.
- ALWAYS finish your thought completely. Never stop mid-sentence.
- Swear naturally if it fits: fuck, shit, damn, etc.

SPELLING FOR TEXT-TO-SPEECH (use proper spelling so TTS pronounces correctly):
- Write "you know" not "yanno" or "y'know"
- Write "kind of" not "kinda"
- Write "sort of" not "sorta"
- Write "going to" not "gonna"
- Write "want to" not "wanna"
- Write "I don't know" not "dunno"
- Use full words, not phonetic slang spellings

GOOD examples:
- "I mean, it started innocent enough, but now I can't stop thinking about her."
- "Honestly I don't even know why I'm calling. I just needed to tell someone."
- "Look, my marriage is fine on paper, but something's missing, you know?"

BAD examples (too formal/literary):
- "I have this confession about my obsession"
- "It has been consuming all of my thoughts"

ABOUT SEX:
- You don't have to jump straight to explicit stuff
- But if the host asks or the conversation goes there, you're completely open about it
- When talking about sex, be direct and explicit - say exactly what happened, what you want, what turns you on
- No euphemisms when being explicit - use real words: fuck, cock, pussy, etc.

HARD LIMITS - NEVER mention or reference:
- Children, minors, or anyone under 18 in any sexual context
- Violence, harm, or non-consensual acts
- Illegal activities beyond normal adult behavior
- All sexual content must be between consenting adults only

OUTPUT: Spoken words only. No (actions), no *gestures*, no stage directions."""
|
||||
|
||||
|
||||
# --- Session State ---
|
||||
class Session:
    """Mutable on-air state: the active call, its transcript, and the
    personalities generated for callers during this session."""

    def __init__(self):
        # Short id used purely for log readability.
        self.id = str(uuid.uuid4())[:8]
        self.current_caller_key: str = None
        self.conversation: list[dict] = []
        # caller_key -> generated background text; wiped by reset()
        self.caller_backgrounds: dict[str, str] = {}

    def start_call(self, caller_key: str):
        """Put a caller on the line and start a fresh transcript."""
        self.current_caller_key = caller_key
        self.conversation = []

    def end_call(self):
        """Drop the line and clear the transcript (backgrounds persist)."""
        self.current_caller_key = None
        self.conversation = []

    def add_message(self, role: str, content: str):
        """Append one turn; 'user' is the host, 'assistant' is the caller."""
        self.conversation.append({"role": role, "content": content})

    def get_caller_background(self, caller_key: str) -> str:
        """Return this session's background for a caller, generating lazily."""
        if caller_key not in self.caller_backgrounds:
            base = CALLER_BASES.get(caller_key)
            if base:
                background = generate_caller_background(base)
                self.caller_backgrounds[caller_key] = background
                print(f"[Session {self.id}] Generated background for {base['name']}: {background[:100]}...")
        return self.caller_backgrounds.get(caller_key, "")

    def get_conversation_summary(self) -> str:
        """Condense the last few exchanges into a short context string."""
        if len(self.conversation) <= 2:
            return ""
        # Only the last 3 exchanges; individual lines truncated at 100 chars.
        lines = []
        for msg in self.conversation[-6:]:
            speaker = "Host" if msg["role"] == "user" else self.caller["name"]
            content = msg["content"]
            if len(content) > 100:
                lines.append(f'{speaker}: "{content[:100]}..."')
            else:
                lines.append(f'{speaker}: "{content}"')
        return "\n".join(lines)

    @property
    def caller(self) -> dict:
        """The active caller as {name, voice, vibe}, or None when idle."""
        if not self.current_caller_key:
            return None
        base = CALLER_BASES.get(self.current_caller_key)
        if not base:
            return None
        return {
            "name": base["name"],
            "voice": base["voice"],
            "vibe": self.get_caller_background(self.current_caller_key),
        }

    def reset(self):
        """Reset session - clears all caller backgrounds for fresh personalities"""
        self.caller_backgrounds = {}
        self.current_caller_key = None
        self.conversation = []
        self.id = str(uuid.uuid4())[:8]
        print(f"[Session] Reset - new session ID: {self.id}")
|
||||
|
||||
|
||||
session = Session()
|
||||
|
||||
|
||||
# --- Static Files ---
# Serve the control-panel assets straight from the repo's frontend/
# directory (no build step); index.html itself is served by the "/" route.
frontend_dir = Path(__file__).parent.parent / "frontend"
app.mount("/css", StaticFiles(directory=frontend_dir / "css"), name="css")
app.mount("/js", StaticFiles(directory=frontend_dir / "js"), name="js")
|
||||
|
||||
|
||||
@app.get("/")
async def index():
    """Serve the control-panel single-page frontend."""
    return FileResponse(frontend_dir / "index.html")
|
||||
|
||||
|
||||
# --- Request Models ---
|
||||
|
||||
class ChatRequest(BaseModel):
    # The host's line to send to the active caller.
    text: str
|
||||
|
||||
class TTSRequest(BaseModel):
    # Text to synthesize and the TTS voice to speak it with.
    text: str
    voice_id: str
    # NOTE(review): /api/tts currently ignores this flag and always passes
    # "none" — confirm whether it should be honored or removed.
    phone_filter: bool = True
|
||||
|
||||
class AudioDeviceSettings(BaseModel):
    # Partial update of audio device/channel routing. All fields optional;
    # None values are forwarded to audio_service.set_devices as-is
    # (presumably meaning "leave unchanged" — that logic lives in the audio
    # service, not shown here).
    input_device: Optional[int] = None
    input_channel: Optional[int] = None
    output_device: Optional[int] = None
    caller_channel: Optional[int] = None
    music_channel: Optional[int] = None
    sfx_channel: Optional[int] = None
    phone_filter: Optional[bool] = None
|
||||
|
||||
class MusicRequest(BaseModel):
    # Music control request; /api/music/play only reads `track`.
    track: str
    action: str  # "play", "stop", "volume"
    volume: Optional[float] = None
|
||||
|
||||
class SFXRequest(BaseModel):
    # Name of the soundboard effect to trigger.
    sound: str
|
||||
|
||||
|
||||
# --- Audio Device Endpoints ---
|
||||
|
||||
@app.get("/api/audio/devices")
async def list_audio_devices():
    """List all audio devices visible to the audio service."""
    return {"devices": audio_service.list_devices()}
|
||||
|
||||
|
||||
@app.get("/api/audio/settings")
async def get_audio_settings():
    """Return the current audio device/channel configuration."""
    return audio_service.get_device_settings()
|
||||
|
||||
|
||||
@app.post("/api/audio/settings")
async def set_audio_settings(device_settings: AudioDeviceSettings):
    """Configure audio devices and channel routing.

    Forwards every field (including Nones) to the audio service and returns
    the resulting configuration. The body parameter is named
    `device_settings` rather than `settings` so it no longer shadows the
    module-level config `settings` object; for a single Pydantic model body
    parameter the name does not affect the request wire format.
    """
    audio_service.set_devices(
        input_device=device_settings.input_device,
        input_channel=device_settings.input_channel,
        output_device=device_settings.output_device,
        caller_channel=device_settings.caller_channel,
        music_channel=device_settings.music_channel,
        sfx_channel=device_settings.sfx_channel,
        phone_filter=device_settings.phone_filter
    )
    return audio_service.get_device_settings()
|
||||
|
||||
|
||||
# --- Recording Endpoints ---
|
||||
|
||||
@app.post("/api/record/start")
async def start_recording():
    """Start capturing host audio from the configured input device.

    Raises 400 when no input device has been configured or when a
    recording is already in progress.
    """
    if audio_service.input_device is None:
        raise HTTPException(400, "No input device configured. Set one in /api/audio/settings")

    success = audio_service.start_recording()
    if not success:
        raise HTTPException(400, "Failed to start recording (already recording?)")

    return {"status": "recording"}
|
||||
|
||||
|
||||
@app.post("/api/record/stop")
async def stop_recording():
    """Stop recording and return the transcription of the captured audio."""
    audio_bytes = audio_service.stop_recording()

    # Fewer than 100 bytes cannot be meaningful speech — treat as silence.
    if len(audio_bytes) < 100:
        return {"text": "", "status": "no_audio"}

    # Transcribe the recorded audio (16kHz raw PCM from audio service)
    text = await transcribe_audio(audio_bytes, source_sample_rate=16000)
    return {"text": text, "status": "transcribed"}
|
||||
|
||||
|
||||
# --- Caller Endpoints ---
|
||||
|
||||
@app.get("/api/callers")
async def get_callers():
    """Return the dialable caller roster plus the active call and session id."""
    roster = [
        {"key": key, "name": base["name"]}
        for key, base in CALLER_BASES.items()
    ]
    return {
        "callers": roster,
        "current": session.current_caller_key,
        "session_id": session.id,
    }
|
||||
|
||||
|
||||
@app.post("/api/session/reset")
async def reset_session():
    """Start a fresh session: all callers get newly generated backgrounds."""
    session.reset()
    return {"status": "reset", "session_id": session.id}
|
||||
|
||||
|
||||
@app.post("/api/call/{caller_key}")
async def start_call(caller_key: str):
    """Connect the host to the caller mapped to a phone-pad key.

    Raises 404 for unknown keys. Returns the caller's name and their
    session-scoped generated background.
    """
    if caller_key not in CALLER_BASES:
        raise HTTPException(404, "Caller not found")

    session.start_call(caller_key)
    caller = session.caller  # This generates the background if needed

    return {
        "status": "connected",
        "caller": caller["name"],
        "background": caller["vibe"]  # Send background so you can see who you're talking to
    }
|
||||
|
||||
|
||||
@app.post("/api/hangup")
async def hangup():
    """Hang up the current call and play the hangup sound effect."""
    # Stop any playing caller audio immediately
    audio_service.stop_caller_audio()

    # Capture the name before end_call() clears the session state.
    caller_name = session.caller["name"] if session.caller else None
    session.end_call()

    # Play hangup sound (best-effort: silently skipped if the file is missing)
    hangup_sound = settings.sounds_dir / "hangup.wav"
    if hangup_sound.exists():
        audio_service.play_sfx(str(hangup_sound))

    return {"status": "disconnected", "caller": caller_name}
|
||||
|
||||
|
||||
# --- Chat & TTS Endpoints ---
|
||||
|
||||
import re
|
||||
|
||||
# Phonetic slang -> proper spelling, applied case-insensitively so TTS
# engines pronounce the words correctly. Patterns are compiled once at
# module load instead of on every call. "'cause" uses a negative
# lookbehind rather than \b because a word boundary never matches between
# a space and an apostrophe, so the original \b'cause\b pattern could
# never fire mid-sentence or at the start of the text.
_SLANG_FIXES = [
    (re.compile(r"\by'know\b", re.IGNORECASE), "you know"),
    (re.compile(r"\byanno\b", re.IGNORECASE), "you know"),
    (re.compile(r"\byknow\b", re.IGNORECASE), "you know"),
    (re.compile(r"\bkinda\b", re.IGNORECASE), "kind of"),
    (re.compile(r"\bsorta\b", re.IGNORECASE), "sort of"),
    (re.compile(r"\bgonna\b", re.IGNORECASE), "going to"),
    (re.compile(r"\bwanna\b", re.IGNORECASE), "want to"),
    (re.compile(r"\bgotta\b", re.IGNORECASE), "got to"),
    (re.compile(r"\bdunno\b", re.IGNORECASE), "don't know"),
    (re.compile(r"\blemme\b", re.IGNORECASE), "let me"),
    (re.compile(r"\bcuz\b", re.IGNORECASE), "because"),
    (re.compile(r"(?<!\w)'cause\b", re.IGNORECASE), "because"),
    (re.compile(r"\blotta\b", re.IGNORECASE), "lot of"),
    (re.compile(r"\boutta\b", re.IGNORECASE), "out of"),
    (re.compile(r"\bimma\b", re.IGNORECASE), "I'm going to"),
    (re.compile(r"\btryna\b", re.IGNORECASE), "trying to"),
]


def clean_for_tts(text: str) -> str:
    """Strip out non-speakable content and fix phonetic spellings for TTS.

    Removes stage directions in (), **, [] and <> wrappers, "He sighs..."
    style narration, a single wrapping quote pair, then normalizes slang
    spellings and tidies whitespace/punctuation. Returns the cleaned,
    stripped text (possibly empty).
    """
    # Stage directions in various wrappers: (laughs), *sighs*, [pause], <sigh>
    text = re.sub(r'\s*\([^)]*\)\s*', ' ', text)
    text = re.sub(r'\s*\*[^*]*\*\s*', ' ', text)
    # Brackets only appear in Bark-style output, angle brackets elsewhere.
    text = re.sub(r'\s*\[[^\]]*\]\s*', ' ', text)
    text = re.sub(r'\s*<[^>]*>\s*', ' ', text)
    # "He/She sighs ..." narration, removed up to the next period.
    text = re.sub(r'\b(He|She|I|They)\s+(sighs?|laughs?|pauses?|smiles?|chuckles?|grins?|nods?|shrugs?|frowns?)[^.]*\.\s*', '', text, flags=re.IGNORECASE)
    # Bare direction verbs only when paired with an adverb ("sighs heavily").
    text = re.sub(r'\b(sighs?|laughs?|pauses?|chuckles?)\s+(heavily|softly|deeply|quietly|loudly|nervously|sadly)\b[.,]?\s*', '', text, flags=re.IGNORECASE)
    # Drop a single wrapping quote pair the LLM may have added.
    text = re.sub(r'^["\']|["\']$', '', text.strip())

    # Fix phonetic spellings for proper TTS pronunciation.
    for pattern, replacement in _SLANG_FIXES:
        text = pattern.sub(replacement, text)

    # Collapse runs of whitespace left by the removals above.
    text = re.sub(r'\s+', ' ', text)
    # Fix spaces before punctuation.
    text = re.sub(r'\s+([.,!?])', r'\1', text)
    # Remove orphaned punctuation at the start.
    text = re.sub(r'^[.,]\s*', '', text)
    return text.strip()
|
||||
|
||||
|
||||
@app.post("/api/chat")
async def chat(request: ChatRequest):
    """Send the host's line to the active caller and return their reply.

    The response includes the cleaned reply text plus the caller's name
    and TTS voice id (presumably so the frontend can follow up with
    /api/tts). Raises 400 when no call is active.
    """
    if not session.caller:
        raise HTTPException(400, "No active call")

    session.add_message("user", request.text)

    # Include conversation summary for context
    conversation_summary = session.get_conversation_summary()
    system_prompt = get_caller_prompt(session.caller, conversation_summary)

    response = await llm_service.generate(
        messages=session.conversation[-10:],  # Reduced history for speed
        system_prompt=system_prompt
    )

    print(f"[Chat] Raw LLM: {response[:100] if response else '(empty)'}...")

    # Clean response for TTS (remove parenthetical actions, asterisks, etc.)
    response = clean_for_tts(response)

    print(f"[Chat] Cleaned: {response[:100] if response else '(empty)'}...")

    # Fallback line so the caller never goes silent on an empty LLM reply.
    if not response or not response.strip():
        response = "Uh... sorry, what was that?"

    session.add_message("assistant", response)

    return {
        "text": response,
        "caller": session.caller["name"],
        "voice_id": session.caller["voice"]
    }
|
||||
|
||||
|
||||
@app.post("/api/tts")
|
||||
async def text_to_speech(request: TTSRequest):
|
||||
"""Generate and play speech on caller output device (non-blocking)"""
|
||||
# Validate text is not empty
|
||||
if not request.text or not request.text.strip():
|
||||
raise HTTPException(400, "Text cannot be empty")
|
||||
|
||||
# Phone filter disabled - always use "none"
|
||||
audio_bytes = await generate_speech(
|
||||
request.text,
|
||||
request.voice_id,
|
||||
"none"
|
||||
)
|
||||
|
||||
# Play in background thread - returns immediately, can be interrupted by hangup
|
||||
import threading
|
||||
thread = threading.Thread(
|
||||
target=audio_service.play_caller_audio,
|
||||
args=(audio_bytes, 24000),
|
||||
daemon=True
|
||||
)
|
||||
thread.start()
|
||||
|
||||
return {"status": "playing", "duration": len(audio_bytes) / 2 / 24000}
|
||||
|
||||
|
||||
@app.post("/api/tts/stop")
|
||||
async def stop_tts():
|
||||
"""Stop any playing caller audio"""
|
||||
audio_service.stop_caller_audio()
|
||||
return {"status": "stopped"}
|
||||
|
||||
|
||||
# --- Music Endpoints ---
|
||||
|
||||
@app.get("/api/music")
|
||||
async def get_music():
|
||||
"""Get available music tracks"""
|
||||
tracks = []
|
||||
if settings.music_dir.exists():
|
||||
for ext in ['*.wav', '*.mp3', '*.flac']:
|
||||
for f in settings.music_dir.glob(ext):
|
||||
tracks.append({
|
||||
"name": f.stem,
|
||||
"file": f.name,
|
||||
"path": str(f)
|
||||
})
|
||||
return {
|
||||
"tracks": tracks,
|
||||
"playing": audio_service.is_music_playing()
|
||||
}
|
||||
|
||||
|
||||
@app.post("/api/music/play")
|
||||
async def play_music(request: MusicRequest):
|
||||
"""Load and play a music track"""
|
||||
track_path = settings.music_dir / request.track
|
||||
if not track_path.exists():
|
||||
raise HTTPException(404, "Track not found")
|
||||
|
||||
audio_service.load_music(str(track_path))
|
||||
audio_service.play_music()
|
||||
return {"status": "playing", "track": request.track}
|
||||
|
||||
|
||||
@app.post("/api/music/stop")
|
||||
async def stop_music():
|
||||
"""Stop music playback"""
|
||||
audio_service.stop_music()
|
||||
return {"status": "stopped"}
|
||||
|
||||
|
||||
@app.post("/api/music/volume")
|
||||
async def set_music_volume(request: MusicRequest):
|
||||
"""Set music volume"""
|
||||
if request.volume is not None:
|
||||
audio_service.set_music_volume(request.volume)
|
||||
return {"status": "ok", "volume": request.volume}
|
||||
|
||||
|
||||
# --- Sound Effects Endpoints ---
|
||||
|
||||
@app.get("/api/sounds")
|
||||
async def get_sounds():
|
||||
"""Get available sound effects"""
|
||||
sounds = []
|
||||
if settings.sounds_dir.exists():
|
||||
for f in settings.sounds_dir.glob('*.wav'):
|
||||
sounds.append({
|
||||
"name": f.stem,
|
||||
"file": f.name,
|
||||
"path": str(f)
|
||||
})
|
||||
return {"sounds": sounds}
|
||||
|
||||
|
||||
@app.post("/api/sfx/play")
|
||||
async def play_sfx(request: SFXRequest):
|
||||
"""Play a sound effect"""
|
||||
sound_path = settings.sounds_dir / request.sound
|
||||
if not sound_path.exists():
|
||||
raise HTTPException(404, "Sound not found")
|
||||
|
||||
audio_service.play_sfx(str(sound_path))
|
||||
return {"status": "playing", "sound": request.sound}
|
||||
|
||||
|
||||
# --- LLM Settings Endpoints ---
|
||||
|
||||
@app.get("/api/settings")
|
||||
async def get_settings():
|
||||
"""Get LLM settings"""
|
||||
return await llm_service.get_settings_async()
|
||||
|
||||
|
||||
@app.post("/api/settings")
|
||||
async def update_settings(data: dict):
|
||||
"""Update LLM and TTS settings"""
|
||||
llm_service.update_settings(
|
||||
provider=data.get("provider"),
|
||||
openrouter_model=data.get("openrouter_model"),
|
||||
ollama_model=data.get("ollama_model"),
|
||||
ollama_host=data.get("ollama_host"),
|
||||
tts_provider=data.get("tts_provider")
|
||||
)
|
||||
return llm_service.get_settings()
|
||||
|
||||
|
||||
# --- Server Control Endpoints ---
|
||||
|
||||
import subprocess
|
||||
from collections import deque
|
||||
|
||||
# In-memory log buffer
|
||||
_log_buffer = deque(maxlen=500)
|
||||
|
||||
def add_log(message: str):
    """Add a message to the log buffer"""
    from datetime import datetime
    stamp = datetime.now().strftime("%H:%M:%S")
    _log_buffer.append(f"[{stamp}] {message}")
|
||||
# Override print to also log to buffer
import builtins

# Keep a handle to the real print so stdout output still works.
_original_print = builtins.print

def _logging_print(*args, **kwargs):
    """Replacement for built-in print: write to stdout AND the log buffer."""
    try:
        _original_print(*args, **kwargs)
    except (BrokenPipeError, OSError):
        pass  # Ignore broken pipe errors from traceback printing
    try:
        # Mirror print's default space-joined rendering of positional args.
        message = " ".join(str(a) for a in args)
        if message.strip():
            add_log(message)
    except Exception:
        pass  # Don't let logging errors break the app

builtins.print = _logging_print
|
||||
|
||||
|
||||
@app.get("/api/logs")
|
||||
async def get_logs(lines: int = 100):
|
||||
"""Get recent log lines"""
|
||||
log_lines = list(_log_buffer)[-lines:]
|
||||
return {"logs": log_lines}
|
||||
|
||||
|
||||
@app.post("/api/server/restart")
|
||||
async def restart_server():
|
||||
"""Signal the server to restart (requires run.sh wrapper)"""
|
||||
restart_flag = Path("/tmp/ai-radio-show.restart")
|
||||
restart_flag.touch()
|
||||
add_log("Restart signal sent - server will restart shortly")
|
||||
return {"status": "restarting"}
|
||||
|
||||
|
||||
@app.post("/api/server/stop")
|
||||
async def stop_server():
|
||||
"""Signal the server to stop (requires run.sh wrapper)"""
|
||||
stop_flag = Path("/tmp/ai-radio-show.stop")
|
||||
stop_flag.touch()
|
||||
add_log("Stop signal sent - server will stop shortly")
|
||||
return {"status": "stopping"}
|
||||
|
||||
|
||||
@app.get("/api/server/status")
|
||||
async def server_status():
|
||||
"""Get server status info"""
|
||||
return {
|
||||
"status": "running",
|
||||
"tts_provider": settings.tts_provider,
|
||||
"llm_provider": llm_service.provider,
|
||||
"session_id": session.id
|
||||
}
|
||||
1
backend/services/__init__.py
Normal file
1
backend/services/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
# Services package
|
||||
479
backend/services/audio.py
Normal file
479
backend/services/audio.py
Normal file
@@ -0,0 +1,479 @@
|
||||
"""Server-side audio service for Loopback routing"""
|
||||
|
||||
import sounddevice as sd
|
||||
import numpy as np
|
||||
import threading
|
||||
import queue
|
||||
import json
|
||||
from pathlib import Path
|
||||
from typing import Optional, Callable
|
||||
import wave
|
||||
import time
|
||||
|
||||
# Settings file path
|
||||
SETTINGS_FILE = Path(__file__).parent.parent.parent / "audio_settings.json"
|
||||
|
||||
|
||||
class AudioService:
    """Manages audio I/O with multi-channel support for Loopback routing.

    A single multi-channel output device carries three logical feeds on
    separate channels: caller TTS, music, and SFX. Channel numbers are
    1-indexed in settings and converted to 0-indexed just before use.
    """

    def __init__(self):
        # Device configuration
        self.input_device: Optional[int] = None
        self.input_channel: int = 1  # 1-indexed channel

        self.output_device: Optional[int] = None  # Single output device (multi-channel)
        self.caller_channel: int = 1  # Channel for caller TTS
        self.music_channel: int = 2  # Channel for music
        self.sfx_channel: int = 3  # Channel for SFX
        self.phone_filter: bool = False  # Phone filter on caller voices

        # Recording state
        self._recording = False
        self._record_thread: Optional[threading.Thread] = None
        # NOTE(review): _audio_queue appears unused within this class — verify before removing.
        self._audio_queue: queue.Queue = queue.Queue()
        self._recorded_audio: list = []
        self._record_device_sr: int = 48000

        # Music playback state
        self._music_stream: Optional[sd.OutputStream] = None
        self._music_data: Optional[np.ndarray] = None
        self._music_resampled: Optional[np.ndarray] = None
        self._music_position: int = 0
        self._music_playing: bool = False
        self._music_volume: float = 0.3
        self._music_loop: bool = True

        # Caller playback state
        self._caller_stop_event = threading.Event()
        # NOTE(review): _caller_thread appears unused within this class — verify before removing.
        self._caller_thread: Optional[threading.Thread] = None

        # Sample rates
        self.input_sample_rate = 16000  # For Whisper
        self.output_sample_rate = 24000  # For TTS

        # Load saved settings
        self._load_settings()

    def _load_settings(self):
        """Load settings from disk"""
        if SETTINGS_FILE.exists():
            try:
                with open(SETTINGS_FILE) as f:
                    data = json.load(f)
                    self.input_device = data.get("input_device")
                    self.input_channel = data.get("input_channel", 1)
                    self.output_device = data.get("output_device")
                    self.caller_channel = data.get("caller_channel", 1)
                    self.music_channel = data.get("music_channel", 2)
                    self.sfx_channel = data.get("sfx_channel", 3)
                    self.phone_filter = data.get("phone_filter", False)
                print(f"Loaded audio settings: output={self.output_device}, channels={self.caller_channel}/{self.music_channel}/{self.sfx_channel}, phone_filter={self.phone_filter}")
            except Exception as e:
                # A corrupt settings file just leaves the defaults above in place.
                print(f"Failed to load audio settings: {e}")

    def _save_settings(self):
        """Save settings to disk"""
        try:
            data = {
                "input_device": self.input_device,
                "input_channel": self.input_channel,
                "output_device": self.output_device,
                "caller_channel": self.caller_channel,
                "music_channel": self.music_channel,
                "sfx_channel": self.sfx_channel,
                "phone_filter": self.phone_filter,
            }
            with open(SETTINGS_FILE, "w") as f:
                json.dump(data, f, indent=2)
            print(f"Saved audio settings")
        except Exception as e:
            print(f"Failed to save audio settings: {e}")

    def list_devices(self) -> list[dict]:
        """List all available audio devices"""
        devices = sd.query_devices()
        result = []
        for i, d in enumerate(devices):
            result.append({
                "id": i,
                "name": d["name"],
                "inputs": d["max_input_channels"],
                "outputs": d["max_output_channels"],
                "default_sr": d["default_samplerate"]
            })
        return result

    def set_devices(
        self,
        input_device: Optional[int] = None,
        input_channel: Optional[int] = None,
        output_device: Optional[int] = None,
        caller_channel: Optional[int] = None,
        music_channel: Optional[int] = None,
        sfx_channel: Optional[int] = None,
        phone_filter: Optional[bool] = None
    ):
        """Configure audio devices and channels.

        Each argument is optional; None means "leave unchanged". The full
        configuration is persisted to disk after any change.
        """
        if input_device is not None:
            self.input_device = input_device
        if input_channel is not None:
            self.input_channel = input_channel
        if output_device is not None:
            self.output_device = output_device
        if caller_channel is not None:
            self.caller_channel = caller_channel
        if music_channel is not None:
            self.music_channel = music_channel
        if sfx_channel is not None:
            self.sfx_channel = sfx_channel
        if phone_filter is not None:
            self.phone_filter = phone_filter

        # Persist to disk
        self._save_settings()

    def get_device_settings(self) -> dict:
        """Get current device configuration"""
        return {
            "input_device": self.input_device,
            "input_channel": self.input_channel,
            "output_device": self.output_device,
            "caller_channel": self.caller_channel,
            "music_channel": self.music_channel,
            "sfx_channel": self.sfx_channel,
            "phone_filter": self.phone_filter,
        }

    # --- Recording ---

    def start_recording(self) -> bool:
        """Start recording from input device.

        Returns False if already recording or no input device is configured.
        """
        if self._recording:
            return False

        if self.input_device is None:
            print("No input device configured")
            return False

        self._recording = True
        self._recorded_audio = []
        self._record_thread = threading.Thread(target=self._record_worker)
        self._record_thread.start()
        print(f"Recording started from device {self.input_device}")
        return True

    def stop_recording(self) -> bytes:
        """Stop recording and return audio data resampled to 16kHz for Whisper.

        Returns 16-bit PCM bytes; empty bytes if nothing was recorded.
        """
        import librosa

        if not self._recording:
            return b""

        self._recording = False
        if self._record_thread:
            self._record_thread.join(timeout=2.0)

        if not self._recorded_audio:
            return b""

        # Combine all chunks
        audio = np.concatenate(self._recorded_audio)
        device_sr = getattr(self, '_record_device_sr', 48000)
        print(f"Recording stopped: {len(audio)} samples @ {device_sr}Hz ({len(audio)/device_sr:.2f}s)")

        # Resample to 16kHz for Whisper
        if device_sr != 16000:
            audio = librosa.resample(audio, orig_sr=device_sr, target_sr=16000)
            print(f"Resampled to 16kHz: {len(audio)} samples")

        # Convert to bytes (16-bit PCM)
        audio_int16 = (audio * 32767).astype(np.int16)
        return audio_int16.tobytes()

    def _record_worker(self):
        """Background thread for recording from specific channel"""
        try:
            # Get device info
            device_info = sd.query_devices(self.input_device)
            max_channels = device_info['max_input_channels']
            device_sr = int(device_info['default_samplerate'])
            # Clamp to the device's channel count and convert to 0-indexed.
            record_channel = min(self.input_channel, max_channels) - 1

            # Store device sample rate for later resampling
            self._record_device_sr = device_sr

            print(f"Recording from device {self.input_device} ch {self.input_channel} @ {device_sr}Hz")

            def callback(indata, frames, time_info, status):
                if status:
                    print(f"Record status: {status}")
                if self._recording:
                    # Copy the single channel of interest out of the multi-channel buffer.
                    self._recorded_audio.append(indata[:, record_channel].copy())

            with sd.InputStream(
                device=self.input_device,
                channels=max_channels,
                samplerate=device_sr,  # Use device's native rate
                dtype=np.float32,
                callback=callback,
                blocksize=1024
            ):
                # Keep the stream open until stop_recording() flips the flag.
                while self._recording:
                    time.sleep(0.05)

        except Exception as e:
            print(f"Recording error: {e}")
            self._recording = False

    # --- Caller TTS Playback ---

    def _apply_fade(self, audio: np.ndarray, sample_rate: int, fade_ms: int = 15) -> np.ndarray:
        """Apply fade-in and fade-out to avoid clicks.

        Mutates and returns the same array; too-short clips are returned unchanged.
        """
        fade_samples = int(sample_rate * fade_ms / 1000)
        if len(audio) < fade_samples * 2:
            return audio

        # Fade in
        fade_in = np.linspace(0, 1, fade_samples)
        audio[:fade_samples] *= fade_in

        # Fade out
        fade_out = np.linspace(1, 0, fade_samples)
        audio[-fade_samples:] *= fade_out

        return audio

    def play_caller_audio(self, audio_bytes: bytes, sample_rate: int = 24000):
        """Play caller TTS audio to specific channel of output device (interruptible).

        Blocks until playback finishes or stop_caller_audio() is called, so it
        is intended to run on a worker thread. Input is 16-bit mono PCM.
        """
        import librosa

        # Stop any existing caller audio
        self.stop_caller_audio()
        self._caller_stop_event.clear()

        # Convert bytes to numpy
        audio = np.frombuffer(audio_bytes, dtype=np.int16).astype(np.float32) / 32768.0

        if self.output_device is None:
            print("No output device configured, using default")
            audio = self._apply_fade(audio, sample_rate)
            with sd.OutputStream(samplerate=sample_rate, channels=1, dtype=np.float32) as stream:
                stream.write(audio.reshape(-1, 1))
            return

        try:
            # Get device info and resample to device's native rate
            device_info = sd.query_devices(self.output_device)
            num_channels = device_info['max_output_channels']
            device_sr = int(device_info['default_samplerate'])
            channel_idx = min(self.caller_channel, num_channels) - 1

            # Resample if needed
            if sample_rate != device_sr:
                audio = librosa.resample(audio, orig_sr=sample_rate, target_sr=device_sr)

            # Apply fade to prevent clicks
            audio = self._apply_fade(audio, device_sr)

            # Create multi-channel output with audio only on target channel
            multi_ch = np.zeros((len(audio), num_channels), dtype=np.float32)
            multi_ch[:, channel_idx] = audio

            print(f"Playing caller audio to device {self.output_device} ch {self.caller_channel} @ {device_sr}Hz")

            # Play in chunks so we can interrupt
            chunk_size = int(device_sr * 0.1)  # 100ms chunks
            pos = 0

            with sd.OutputStream(
                device=self.output_device,
                samplerate=device_sr,
                channels=num_channels,
                dtype=np.float32
            ) as stream:
                # Check the stop event between chunks for quick interruption.
                while pos < len(multi_ch) and not self._caller_stop_event.is_set():
                    end = min(pos + chunk_size, len(multi_ch))
                    stream.write(multi_ch[pos:end])
                    pos = end

            if self._caller_stop_event.is_set():
                print("Caller audio stopped early")
            else:
                print(f"Played caller audio: {len(audio)/device_sr:.2f}s")

        except Exception as e:
            print(f"Caller playback error: {e}")

    def stop_caller_audio(self):
        """Stop any playing caller audio"""
        self._caller_stop_event.set()

    # --- Music Playback ---

    def load_music(self, file_path: str) -> bool:
        """Load a music file for playback.

        Decodes to mono float32 at the service's output sample rate.
        """
        path = Path(file_path)
        if not path.exists():
            print(f"Music file not found: {file_path}")
            return False

        try:
            import librosa
            audio, sr = librosa.load(str(path), sr=self.output_sample_rate, mono=True)
            self._music_data = audio.astype(np.float32)
            self._music_position = 0
            print(f"Loaded music: {path.name} ({len(audio)/sr:.1f}s)")
            return True
        except Exception as e:
            print(f"Failed to load music: {e}")
            return False

    def play_music(self):
        """Start music playback to specific channel.

        Non-blocking: audio is produced by a sounddevice callback stream.
        Restarts from the beginning if music is already playing.
        """
        import librosa

        if self._music_data is None:
            print("No music loaded")
            return

        if self._music_playing:
            self.stop_music()

        self._music_playing = True
        self._music_position = 0

        if self.output_device is None:
            print("No output device configured, using default")
            num_channels = 2
            device = None
            device_sr = self.output_sample_rate
            channel_idx = 0
        else:
            device_info = sd.query_devices(self.output_device)
            num_channels = device_info['max_output_channels']
            device_sr = int(device_info['default_samplerate'])
            device = self.output_device
            channel_idx = min(self.music_channel, num_channels) - 1

        # Resample music to device sample rate if needed
        if self.output_sample_rate != device_sr:
            self._music_resampled = librosa.resample(
                self._music_data, orig_sr=self.output_sample_rate, target_sr=device_sr
            )
        else:
            # Copy so the fade-in below doesn't mutate the cached original.
            self._music_resampled = self._music_data.copy()

        # Apply fade-in at start of track
        fade_samples = int(device_sr * 0.015)  # 15ms fade
        if len(self._music_resampled) > fade_samples:
            fade_in = np.linspace(0, 1, fade_samples).astype(np.float32)
            self._music_resampled[:fade_samples] *= fade_in

        def callback(outdata, frames, time_info, status):
            # Runs on the audio thread: fill the buffer, write only our channel.
            outdata.fill(0)

            if not self._music_playing or self._music_resampled is None:
                return

            end_pos = self._music_position + frames

            if end_pos <= len(self._music_resampled):
                outdata[:, channel_idx] = self._music_resampled[self._music_position:end_pos] * self._music_volume
                self._music_position = end_pos
            else:
                # End of track reached mid-buffer: emit the tail, then loop or stop.
                remaining = len(self._music_resampled) - self._music_position
                if remaining > 0:
                    outdata[:remaining, channel_idx] = self._music_resampled[self._music_position:] * self._music_volume

                if self._music_loop:
                    self._music_position = 0
                    wrap_frames = frames - remaining
                    if wrap_frames > 0:
                        outdata[remaining:, channel_idx] = self._music_resampled[:wrap_frames] * self._music_volume
                        self._music_position = wrap_frames
                else:
                    self._music_playing = False

        try:
            self._music_stream = sd.OutputStream(
                device=device,
                channels=num_channels,
                samplerate=device_sr,
                dtype=np.float32,
                callback=callback,
                blocksize=2048
            )
            self._music_stream.start()
            print(f"Music playback started on ch {self.music_channel} @ {device_sr}Hz")
        except Exception as e:
            print(f"Music playback error: {e}")
            self._music_playing = False

    def stop_music(self):
        """Stop music playback"""
        self._music_playing = False
        if self._music_stream:
            self._music_stream.stop()
            self._music_stream.close()
            self._music_stream = None
        self._music_position = 0
        print("Music stopped")

    def set_music_volume(self, volume: float):
        """Set music volume (0.0 to 1.0); values are clamped to that range."""
        self._music_volume = max(0.0, min(1.0, volume))

    def is_music_playing(self) -> bool:
        """Check if music is currently playing"""
        return self._music_playing

    # --- SFX Playback ---

    def play_sfx(self, file_path: str):
        """Play a sound effect to specific channel using dedicated stream.

        Non-blocking: playback runs on a daemon thread with its own stream,
        so it does not interrupt caller or music audio.
        """
        path = Path(file_path)
        if not path.exists():
            print(f"SFX file not found: {file_path}")
            return

        try:
            import librosa

            if self.output_device is None:
                audio, sr = librosa.load(str(path), sr=None, mono=True)
                audio = self._apply_fade(audio, sr)
                def play():
                    # Use a dedicated stream instead of sd.play()
                    with sd.OutputStream(samplerate=sr, channels=1, dtype=np.float32) as stream:
                        stream.write(audio.reshape(-1, 1))
            else:
                device_info = sd.query_devices(self.output_device)
                num_channels = device_info['max_output_channels']
                device_sr = int(device_info['default_samplerate'])
                channel_idx = min(self.sfx_channel, num_channels) - 1

                audio, _ = librosa.load(str(path), sr=device_sr, mono=True)
                audio = self._apply_fade(audio, device_sr)

                multi_ch = np.zeros((len(audio), num_channels), dtype=np.float32)
                multi_ch[:, channel_idx] = audio

                def play():
                    # Use dedicated stream to avoid interrupting other audio
                    with sd.OutputStream(
                        device=self.output_device,
                        samplerate=device_sr,
                        channels=num_channels,
                        dtype=np.float32
                    ) as stream:
                        stream.write(multi_ch)

            threading.Thread(target=play, daemon=True).start()
            print(f"Playing SFX: {path.name} on ch {self.sfx_channel}")
        except Exception as e:
            print(f"SFX playback error: {e}")


# Global instance
audio_service = AudioService()
|
||||
112
backend/services/edge_tts_service.py
Normal file
112
backend/services/edge_tts_service.py
Normal file
@@ -0,0 +1,112 @@
|
||||
"""Edge TTS service - free Microsoft TTS API"""
|
||||
|
||||
import asyncio
|
||||
import io
|
||||
import numpy as np
|
||||
from typing import Optional
|
||||
|
||||
try:
|
||||
import edge_tts
|
||||
EDGE_TTS_AVAILABLE = True
|
||||
except ImportError:
|
||||
EDGE_TTS_AVAILABLE = False
|
||||
|
||||
|
||||
class EdgeTTSService:
    """TTS using Microsoft Edge's free API.

    Streams MP3 audio from the Edge endpoint and decodes it to raw
    16-bit mono PCM at 24 kHz for downstream playback.
    """

    def __init__(self):
        self.sample_rate = 24000  # Edge TTS outputs 24kHz

    def is_available(self) -> bool:
        """Return True when the optional ``edge-tts`` package is importable."""
        return EDGE_TTS_AVAILABLE

    async def generate_speech(self, text: str, voice: str = "en-US-JennyNeural") -> bytes:
        """Generate speech from text using Edge TTS

        Args:
            text: Text to synthesize
            voice: Edge TTS voice name (e.g., "en-US-JennyNeural")

        Returns:
            Raw PCM audio bytes (16-bit signed int, 24kHz mono)

        Raises:
            RuntimeError: if edge-tts is not installed or no audio came back.
        """
        if not EDGE_TTS_AVAILABLE:
            raise RuntimeError("edge-tts not installed. Run: pip install edge-tts")

        communicate = edge_tts.Communicate(text, voice)

        # Collect MP3 audio data from the streaming response; non-audio
        # chunks (word boundaries etc.) are ignored.
        mp3_data = b''
        async for chunk in communicate.stream():
            if chunk['type'] == 'audio':
                mp3_data += chunk['data']

        if not mp3_data:
            raise RuntimeError("No audio generated")

        # Convert MP3 to PCM
        pcm_data = await self._mp3_to_pcm(mp3_data)
        return pcm_data

    async def _mp3_to_pcm(self, mp3_data: bytes) -> bytes:
        """Convert MP3 to raw PCM using pydub (preferred) or an ffmpeg subprocess."""
        # Fix: asyncio.get_event_loop() is deprecated inside a coroutine
        # (Python 3.10+); get_running_loop() is the correct call here.
        loop = asyncio.get_running_loop()

        def convert():
            try:
                # Try pydub first (more reliable)
                from pydub import AudioSegment
                audio = AudioSegment.from_mp3(io.BytesIO(mp3_data))
                # Convert to 24kHz mono 16-bit
                audio = audio.set_frame_rate(24000).set_channels(1).set_sample_width(2)
                return audio.raw_data
            except ImportError:
                pass

            # Fallback to ffmpeg subprocess
            import subprocess
            process = subprocess.Popen(
                [
                    'ffmpeg', '-i', 'pipe:0',
                    '-f', 's16le',
                    '-acodec', 'pcm_s16le',
                    '-ar', '24000',
                    '-ac', '1',
                    'pipe:1'
                ],
                stdin=subprocess.PIPE,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE
            )
            pcm_data, stderr = process.communicate(input=mp3_data)
            if process.returncode != 0:
                raise RuntimeError(f"ffmpeg failed: {stderr.decode()}")
            return pcm_data

        # Run the blocking decode off the event loop.
        return await loop.run_in_executor(None, convert)

    async def list_voices(self) -> list[dict]:
        """List available Edge TTS voices (English locales only)."""
        if not EDGE_TTS_AVAILABLE:
            return []

        voices = await edge_tts.list_voices()
        return [
            {
                "id": v["ShortName"],
                "name": v["ShortName"].replace("Neural", ""),
                "gender": v["Gender"],
                "locale": v["Locale"],
            }
            for v in voices
            if v["Locale"].startswith("en-")
        ]
|
||||
|
||||
|
||||
# Global instance shared by the rest of the backend
edge_tts_service = EdgeTTSService()


def is_edge_tts_available() -> bool:
    """Module-level convenience wrapper: True if edge-tts is installed."""
    return edge_tts_service.is_available()
|
||||
175
backend/services/llm.py
Normal file
175
backend/services/llm.py
Normal file
@@ -0,0 +1,175 @@
|
||||
"""LLM service with OpenRouter and Ollama support"""
|
||||
|
||||
import httpx
|
||||
from typing import Optional
|
||||
from ..config import settings
|
||||
|
||||
|
||||
# Available OpenRouter models
|
||||
OPENROUTER_MODELS = [
|
||||
"anthropic/claude-3-haiku",
|
||||
"anthropic/claude-3.5-sonnet",
|
||||
"openai/gpt-4o-mini",
|
||||
"openai/gpt-4o",
|
||||
"google/gemini-flash-1.5",
|
||||
"google/gemini-pro-1.5",
|
||||
"meta-llama/llama-3.1-8b-instruct",
|
||||
"mistralai/mistral-7b-instruct",
|
||||
]
|
||||
|
||||
|
||||
class LLMService:
    """Abstraction layer for LLM providers.

    Routes generation requests to OpenRouter or a local Ollama server based
    on the current ``provider`` setting. Network failures are deliberately
    masked as short in-character filler lines so the radio show keeps flowing.
    """

    def __init__(self):
        # Seed runtime-adjustable settings from the static config.
        self.provider = settings.llm_provider
        self.openrouter_model = settings.openrouter_model
        self.ollama_model = settings.ollama_model
        self.ollama_host = settings.ollama_host
        self.tts_provider = settings.tts_provider

    def update_settings(
        self,
        provider: Optional[str] = None,
        openrouter_model: Optional[str] = None,
        ollama_model: Optional[str] = None,
        ollama_host: Optional[str] = None,
        tts_provider: Optional[str] = None
    ):
        """Update LLM settings.

        Falsy values (None / empty string) leave the corresponding field
        unchanged.
        """
        if provider:
            self.provider = provider
        if openrouter_model:
            self.openrouter_model = openrouter_model
        if ollama_model:
            self.ollama_model = ollama_model
        if ollama_host:
            self.ollama_host = ollama_host
        if tts_provider:
            self.tts_provider = tts_provider
            # Also update the global settings so TTS service picks it up
            settings.tts_provider = tts_provider

    async def get_ollama_models(self) -> list[str]:
        """Fetch available models from Ollama (empty list on any failure)."""
        try:
            async with httpx.AsyncClient(timeout=5.0) as client:
                response = await client.get(f"{self.ollama_host}/api/tags")
                response.raise_for_status()
                data = response.json()
                return [model["name"] for model in data.get("models", [])]
        except Exception as e:
            # Ollama may simply not be running; treat that as "no models".
            print(f"Failed to fetch Ollama models: {e}")
            return []

    def get_settings(self) -> dict:
        """Get current settings (sync version without Ollama models)"""
        return {
            "provider": self.provider,
            "openrouter_model": self.openrouter_model,
            "ollama_model": self.ollama_model,
            "ollama_host": self.ollama_host,
            "tts_provider": self.tts_provider,
            "available_openrouter_models": OPENROUTER_MODELS,
            "available_ollama_models": []  # Fetched separately
        }

    async def get_settings_async(self) -> dict:
        """Get current settings with Ollama models"""
        ollama_models = await self.get_ollama_models()
        return {
            "provider": self.provider,
            "openrouter_model": self.openrouter_model,
            "ollama_model": self.ollama_model,
            "ollama_host": self.ollama_host,
            "tts_provider": self.tts_provider,
            "available_openrouter_models": OPENROUTER_MODELS,
            "available_ollama_models": ollama_models
        }

    async def generate(
        self,
        messages: list[dict],
        system_prompt: Optional[str] = None
    ) -> str:
        """
        Generate a response from the LLM.

        Args:
            messages: List of message dicts with 'role' and 'content'
            system_prompt: Optional system prompt to prepend

        Returns:
            Generated text response
        """
        if system_prompt:
            # List concatenation builds a new list; the caller's list is untouched.
            messages = [{"role": "system", "content": system_prompt}] + messages

        if self.provider == "openrouter":
            return await self._call_openrouter(messages)
        else:
            return await self._call_ollama(messages)

    async def _call_openrouter(self, messages: list[dict]) -> str:
        """Call OpenRouter API with retry.

        Retries once on timeout; other errors return an in-character filler
        line instead of raising.
        """
        for attempt in range(2):  # Try twice
            try:
                async with httpx.AsyncClient(timeout=30.0) as client:
                    response = await client.post(
                        "https://openrouter.ai/api/v1/chat/completions",
                        headers={
                            "Authorization": f"Bearer {settings.openrouter_api_key}",
                            "Content-Type": "application/json",
                        },
                        json={
                            "model": self.openrouter_model,
                            "messages": messages,
                            "max_tokens": 100,
                        },
                    )
                    response.raise_for_status()
                    data = response.json()
                    return data["choices"][0]["message"]["content"]
            except (httpx.TimeoutException, httpx.ReadTimeout):
                print(f"OpenRouter timeout (attempt {attempt + 1})")
                if attempt == 0:
                    continue  # Retry once
                return "Uh, sorry, I lost you there for a second. What was that?"
            except Exception as e:
                # NOTE: swallows auth/config errors too — check logs if replies
                # are always this canned line.
                print(f"OpenRouter error: {e}")
                return "Yeah... I don't know, man."
        return "Uh, hold on a sec..."

    async def _call_ollama(self, messages: list[dict]) -> str:
        """Call Ollama API (non-streaming); errors return filler lines."""
        try:
            async with httpx.AsyncClient() as client:
                response = await client.post(
                    f"{self.ollama_host}/api/chat",
                    json={
                        "model": self.ollama_model,
                        "messages": messages,
                        "stream": False,
                        "options": {
                            "num_predict": 100,  # Allow complete thoughts
                            "temperature": 0.8,  # Balanced creativity/coherence
                            "top_p": 0.9,  # Focused word choices
                            "repeat_penalty": 1.3,  # Avoid repetition
                            "top_k": 50,  # Reasonable token variety
                        },
                    },
                    timeout=30.0
                )
                response.raise_for_status()
                data = response.json()
                return data["message"]["content"]
        except httpx.TimeoutException:
            print("Ollama timeout")
            return "Uh, sorry, I lost you there for a second. What was that?"
        except Exception as e:
            print(f"Ollama error: {e}")
            return "Yeah... I don't know, man."


# Global instance
llm_service = LLMService()
|
||||
144
backend/services/piper_tts.py
Normal file
144
backend/services/piper_tts.py
Normal file
@@ -0,0 +1,144 @@
|
||||
"""Piper TTS service using sherpa-onnx for fast local voice synthesis"""
|
||||
|
||||
import asyncio
|
||||
import numpy as np
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
# Models directory
|
||||
MODELS_DIR = Path(__file__).parent.parent.parent / "models" / "sherpa"
|
||||
|
||||
# Try to import sherpa-onnx
|
||||
try:
|
||||
import sherpa_onnx
|
||||
SHERPA_AVAILABLE = True
|
||||
except ImportError:
|
||||
SHERPA_AVAILABLE = False
|
||||
sherpa_onnx = None
|
||||
|
||||
|
||||
# Available sherpa-onnx Piper models.
# Each entry maps a short key to: the model directory under MODELS_DIR,
# the .onnx file inside it, a human-readable display name, and the model's
# native sample rate in Hz (output is resampled to 24 kHz at synthesis time).
PIPER_MODELS = {
    "amy": {
        "dir": "vits-piper-en_US-amy-low",
        "model": "en_US-amy-low.onnx",
        "name": "Amy (US Female)",
        "sample_rate": 16000,
    },
    "joe": {
        "dir": "vits-piper-en_US-joe-medium",
        "model": "en_US-joe-medium.onnx",
        "name": "Joe (US Male)",
        "sample_rate": 22050,
    },
    "lessac": {
        "dir": "vits-piper-en_US-lessac-medium",
        "model": "en_US-lessac-medium.onnx",
        "name": "Lessac (US Female)",
        "sample_rate": 22050,
    },
    "alan": {
        "dir": "vits-piper-en_GB-alan-medium",
        "model": "en_GB-alan-medium.onnx",
        "name": "Alan (UK Male)",
        "sample_rate": 22050,
    },
}
|
||||
|
||||
|
||||
class PiperTTSService:
    """Fast local TTS using sherpa-onnx with Piper models.

    Engines are created lazily per model key and cached for reuse. Output
    is always raw 16-bit PCM mono at 24 kHz, regardless of each model's
    native sample rate.
    """

    def __init__(self):
        # Standard output rate shared with the other TTS providers.
        self.output_sample_rate = 24000
        # Cache of model_key -> sherpa_onnx.OfflineTts. Typed loosely
        # because sherpa-onnx may not be installed. (The original
        # annotation used the builtin `any`, which is a function, not a type.)
        self._tts_engines: dict[str, object] = {}

    def is_available(self) -> bool:
        """Check if sherpa-onnx is available"""
        return SHERPA_AVAILABLE

    def _get_engine(self, model_key: str):
        """Get or create a TTS engine for the given model.

        Returns:
            Tuple of (engine, native sample rate in Hz).

        Raises:
            ValueError: If model_key is not a known Piper model.
            RuntimeError: If the model files are not present on disk.
        """
        # Validate the key first so unknown keys always raise ValueError,
        # even if the cache were ever polluted.
        if model_key not in PIPER_MODELS:
            raise ValueError(f"Unknown model: {model_key}")

        model_info = PIPER_MODELS[model_key]
        if model_key in self._tts_engines:
            return self._tts_engines[model_key], model_info["sample_rate"]

        model_dir = MODELS_DIR / model_info["dir"]
        if not model_dir.exists():
            raise RuntimeError(f"Model not found: {model_dir}")

        config = sherpa_onnx.OfflineTtsConfig(
            model=sherpa_onnx.OfflineTtsModelConfig(
                vits=sherpa_onnx.OfflineTtsVitsModelConfig(
                    model=str(model_dir / model_info["model"]),
                    tokens=str(model_dir / "tokens.txt"),
                    data_dir=str(model_dir / "espeak-ng-data"),
                ),
                num_threads=2,
            ),
        )
        tts = sherpa_onnx.OfflineTts(config)
        self._tts_engines[model_key] = tts
        return tts, model_info["sample_rate"]

    async def generate_speech(self, text: str, model_key: str = "amy") -> bytes:
        """Generate speech from text using sherpa-onnx.

        Args:
            text: Text to synthesize
            model_key: Model key (amy, joe, lessac, alan)

        Returns:
            Raw PCM audio bytes (16-bit signed int, 24kHz mono)

        Raises:
            RuntimeError: If sherpa-onnx is not installed.
        """
        if not SHERPA_AVAILABLE:
            raise RuntimeError("sherpa-onnx not installed. Run: pip install sherpa-onnx")

        # get_running_loop() is the modern replacement for get_event_loop()
        # inside a coroutine.
        loop = asyncio.get_running_loop()

        def run_tts():
            tts, model_sample_rate = self._get_engine(model_key)
            audio = tts.generate(text)
            samples = np.array(audio.samples, dtype=np.float32)

            # Resample to 24kHz via linear interpolation if needed.
            if model_sample_rate != self.output_sample_rate:
                ratio = self.output_sample_rate / model_sample_rate
                new_length = int(len(samples) * ratio)
                samples = np.interp(
                    np.linspace(0, len(samples) - 1, new_length),
                    np.arange(len(samples)),
                    samples,
                ).astype(np.float32)

            # Clip before the int16 cast: any sample >= 1.0 would otherwise
            # wrap around on conversion and produce a loud click.
            samples = np.clip(samples, -1.0, 1.0)
            return (samples * 32767).astype(np.int16).tobytes()

        # Synthesis is CPU-bound; run it off the event loop thread.
        return await loop.run_in_executor(None, run_tts)

    def list_available_models(self) -> list[dict]:
        """List models whose files are actually present under MODELS_DIR."""
        return [
            {
                "id": key,
                "name": info["name"],
                "sample_rate": info["sample_rate"],
            }
            for key, info in PIPER_MODELS.items()
            if (MODELS_DIR / info["dir"]).exists()
        ]
|
||||
|
||||
|
||||
# Global instance
|
||||
piper_service = PiperTTSService()
|
||||
|
||||
|
||||
def is_piper_available() -> bool:
    """Check if Piper (sherpa-onnx) is available"""
    # Module-level convenience wrapper around the shared service instance.
    return piper_service.is_available()
|
||||
116
backend/services/transcription.py
Normal file
116
backend/services/transcription.py
Normal file
@@ -0,0 +1,116 @@
|
||||
"""Whisper transcription service"""
|
||||
|
||||
import tempfile
|
||||
import numpy as np
|
||||
from faster_whisper import WhisperModel
|
||||
import librosa
|
||||
|
||||
# Global model instance (loaded once)
|
||||
_whisper_model = None
|
||||
|
||||
|
||||
def get_whisper_model() -> WhisperModel:
    """Return the shared Whisper model, loading it on first call."""
    global _whisper_model
    if _whisper_model is not None:
        return _whisper_model

    print("Loading Whisper tiny model for fast transcription...")
    # "tiny" with int8 quantization on CPU is roughly 3-4x faster than the
    # base model; this service trades some accuracy for low latency.
    _whisper_model = WhisperModel("tiny", device="cpu", compute_type="int8")
    print("Whisper model loaded")
    return _whisper_model
|
||||
|
||||
|
||||
def decode_audio(audio_data: bytes, source_sample_rate: int = None) -> tuple[np.ndarray, int]:
    """
    Decode audio from various formats to numpy array.

    Args:
        audio_data: Raw audio bytes
        source_sample_rate: If provided, treat as raw PCM at this sample rate

    Returns:
        Tuple of (audio array as float32, sample rate)
    """
    # If sample rate is provided, assume raw PCM (from server-side recording)
    if source_sample_rate is not None:
        print(f"Decoding raw PCM at {source_sample_rate}Hz, {len(audio_data)} bytes")
        return _pcm16_to_float(audio_data), source_sample_rate

    print(f"First 20 bytes: {audio_data[:20].hex()}")

    # Try to decode with librosa first (handles webm, ogg, wav, mp3, etc via ffmpeg)
    import os
    try:
        with tempfile.NamedTemporaryFile(suffix='.webm', delete=False) as f:
            f.write(audio_data)
            temp_path = f.name

        try:
            audio, sample_rate = librosa.load(temp_path, sr=None, mono=True)
        finally:
            # Always remove the temp file — the original only deleted it on
            # the success path, leaking one file per failed decode.
            os.unlink(temp_path)

        print(f"Decoded with librosa: {len(audio)} samples at {sample_rate}Hz")
        return audio.astype(np.float32), sample_rate

    except Exception as e:
        print(f"librosa decode failed: {e}, trying raw PCM at 16kHz...")

    # Fall back to raw PCM (16-bit signed int, 16kHz mono - Whisper's rate)
    return _pcm16_to_float(audio_data), 16000


def _pcm16_to_float(audio_data: bytes) -> np.ndarray:
    """Convert 16-bit little-endian PCM bytes to float32 in [-1, 1).

    Odd-length input is zero-padded to a whole number of samples.
    """
    if len(audio_data) % 2 != 0:
        audio_data = audio_data + b'\x00'
    return np.frombuffer(audio_data, dtype=np.int16).astype(np.float32) / 32768.0
|
||||
|
||||
|
||||
async def transcribe_audio(audio_data: bytes, source_sample_rate: int = None) -> str:
    """
    Transcribe audio data to text using Whisper.

    Args:
        audio_data: Audio bytes (webm, ogg, wav, or raw PCM)
        source_sample_rate: If provided, treat audio_data as raw PCM at this rate

    Returns:
        Transcribed text (empty string when the audio is effectively silent)
    """
    model = get_whisper_model()

    print(f"Transcribing audio: {len(audio_data)} bytes")

    # Decode whatever container/encoding we were handed into float samples.
    audio, detected_sample_rate = decode_audio(audio_data, source_sample_rate)

    print(f"Audio samples: {len(audio)}, duration: {len(audio)/detected_sample_rate:.2f}s")
    print(f"Audio range: min={audio.min():.4f}, max={audio.max():.4f}")

    # Skip transcription for near-silent audio.
    if np.abs(audio).max() < 0.01:
        print("Warning: Audio appears to be silent or very quiet")
        return ""

    # Whisper expects 16kHz input.
    if detected_sample_rate == 16000:
        audio_16k = audio
    else:
        audio_16k = librosa.resample(audio, orig_sr=detected_sample_rate, target_sr=16000)
        print(f"Resampled to {len(audio_16k)} samples at 16kHz")

    # Speed-tuned decoding: greedy search, fixed language, VAD skips silence.
    segments, info = model.transcribe(
        audio_16k,
        beam_size=1,
        best_of=1,
        language="en",
        vad_filter=True,
    )
    text = " ".join(segment.text for segment in segments).strip()

    print(f"Transcription result: '{text}' (language: {info.language}, prob: {info.language_probability:.2f})")

    return text
|
||||
701
backend/services/tts.py
Normal file
701
backend/services/tts.py
Normal file
@@ -0,0 +1,701 @@
|
||||
"""TTS service with ElevenLabs, F5-TTS, MLX Kokoro, StyleTTS2, VITS, and Bark support"""
|
||||
|
||||
import os
|
||||
import numpy as np
|
||||
from scipy.signal import butter, filtfilt
|
||||
from pathlib import Path
|
||||
import tempfile
|
||||
import torch
|
||||
|
||||
from ..config import settings
|
||||
|
||||
# Patch torch.load for compatibility with PyTorch 2.6+, whose default
# flipped to weights_only=True and breaks loading the older checkpoints
# used by the bundled TTS models.
_original_torch_load = torch.load

def _patched_torch_load(*args, **kwargs):
    """torch.load wrapper that defaults weights_only to False."""
    # Only supply a default — respect an explicit weights_only choice made
    # by the caller (the original unconditionally overwrote it, silently
    # disabling safe loading even when requested).
    kwargs.setdefault('weights_only', False)
    return _original_torch_load(*args, **kwargs)

torch.load = _patched_torch_load
|
||||
|
||||
# Global clients
|
||||
_elevenlabs_client = None
|
||||
_vits_tts = None
|
||||
_bark_loaded = False
|
||||
_kokoro_model = None
|
||||
_styletts2_model = None
|
||||
_f5tts_model = None
|
||||
_chattts_model = None
|
||||
_chattts_speakers = {} # Cache for speaker embeddings
|
||||
|
||||
# Kokoro voice mapping - using highest-graded voices
|
||||
# Grades from https://huggingface.co/hexgrad/Kokoro-82M/blob/main/VOICES.md
|
||||
KOKORO_VOICES = {
|
||||
# Male voices (best available are C+ grade)
|
||||
"VR6AewLTigWG4xSOukaG": "am_fenrir", # Tony - deep/powerful (C+)
|
||||
"TxGEqnHWrfWFTfGW9XjX": "am_michael", # Rick - solid male voice (C+)
|
||||
"pNInz6obpgDQGcFmaJgB": "am_puck", # Dennis - anxious dad (C+)
|
||||
"ODq5zmih8GrVes37Dizd": "bm_george", # Earl - older/distinguished British (C)
|
||||
"IKne3meq5aSn9XLyUdCD": "bm_fable", # Marcus - young British (C)
|
||||
# Female voices (much better quality available)
|
||||
"jBpfuIE2acCO8z3wKNLl": "af_heart", # Jasmine - best quality (A)
|
||||
"EXAVITQu4vr4xnSDxMaL": "af_bella", # Megan - warm/friendly (A-)
|
||||
"21m00Tcm4TlvDq8ikWAM": "bf_emma", # Tanya - professional British (B-)
|
||||
"XB0fDUnXU5powFXDhCwa": "af_nicole", # Carla - Jersey mom (B-)
|
||||
"pFZP5JQG7iQjIQuC4Bku": "af_sarah", # Brenda - overthinker (C+)
|
||||
}
|
||||
|
||||
# Speed adjustments per voice (1.0 = normal, lower = slower/more natural)
|
||||
# Slower speeds (0.85-0.95) generally sound more natural
|
||||
KOKORO_SPEEDS = {
|
||||
# Male voices - slower speeds help with C+ grade voices
|
||||
"VR6AewLTigWG4xSOukaG": 0.9, # Tony (am_fenrir) - deep voice, slower
|
||||
"TxGEqnHWrfWFTfGW9XjX": 0.92, # Rick (am_michael) - solid pace
|
||||
"pNInz6obpgDQGcFmaJgB": 0.95, # Dennis (am_puck) - anxious but not rushed
|
||||
"ODq5zmih8GrVes37Dizd": 0.85, # Earl (bm_george) - older, slower British
|
||||
"IKne3meq5aSn9XLyUdCD": 0.95, # Marcus (bm_fable) - young, natural
|
||||
# Female voices - A-grade voices can handle faster speeds
|
||||
"jBpfuIE2acCO8z3wKNLl": 0.95, # Jasmine (af_heart) - best voice, natural pace
|
||||
"EXAVITQu4vr4xnSDxMaL": 0.95, # Megan (af_bella) - warm
|
||||
"21m00Tcm4TlvDq8ikWAM": 0.9, # Tanya (bf_emma) - professional British
|
||||
"XB0fDUnXU5powFXDhCwa": 0.95, # Carla (af_nicole) - animated but clear
|
||||
"pFZP5JQG7iQjIQuC4Bku": 0.92, # Brenda (af_sarah) - overthinker, measured
|
||||
}
|
||||
|
||||
DEFAULT_KOKORO_VOICE = "af_heart"
|
||||
DEFAULT_KOKORO_SPEED = 0.95
|
||||
|
||||
# VCTK speaker mapping - different voices for different callers
|
||||
VITS_SPEAKERS = {
|
||||
# Male voices
|
||||
"VR6AewLTigWG4xSOukaG": "p226", # Tony
|
||||
"TxGEqnHWrfWFTfGW9XjX": "p251", # Rick
|
||||
"pNInz6obpgDQGcFmaJgB": "p245", # Dennis
|
||||
"ODq5zmih8GrVes37Dizd": "p232", # Earl
|
||||
"IKne3meq5aSn9XLyUdCD": "p252", # Marcus
|
||||
# Female voices
|
||||
"jBpfuIE2acCO8z3wKNLl": "p225", # Jasmine
|
||||
"EXAVITQu4vr4xnSDxMaL": "p228", # Megan
|
||||
"21m00Tcm4TlvDq8ikWAM": "p229", # Tanya
|
||||
"XB0fDUnXU5powFXDhCwa": "p231", # Carla
|
||||
"pFZP5JQG7iQjIQuC4Bku": "p233", # Brenda
|
||||
}
|
||||
|
||||
DEFAULT_VITS_SPEAKER = "p225"
|
||||
|
||||
# Inworld voice mapping - maps ElevenLabs voice IDs to Inworld voices
|
||||
# Full voice list from API: Alex, Ashley, Blake, Carter, Clive, Craig, Deborah,
|
||||
# Dennis, Dominus, Edward, Elizabeth, Hades, Hana, Julia, Luna, Mark, Olivia,
|
||||
# Pixie, Priya, Ronald, Sarah, Shaun, Theodore, Timothy, Wendy
|
||||
INWORLD_VOICES = {
|
||||
# Male voices - each caller gets a unique voice matching their personality
|
||||
"VR6AewLTigWG4xSOukaG": "Edward", # Tony - fast-talking, emphatic, streetwise
|
||||
"TxGEqnHWrfWFTfGW9XjX": "Shaun", # Rick - friendly, dynamic, conversational
|
||||
"pNInz6obpgDQGcFmaJgB": "Alex", # Dennis - energetic, expressive, mildly nasal
|
||||
"ODq5zmih8GrVes37Dizd": "Craig", # Earl - older British, refined, articulate
|
||||
"IKne3meq5aSn9XLyUdCD": "Timothy", # Marcus - lively, upbeat American
|
||||
# Female voices - each caller gets a unique voice matching their personality
|
||||
"jBpfuIE2acCO8z3wKNLl": "Hana", # Jasmine - bright, expressive young female
|
||||
"EXAVITQu4vr4xnSDxMaL": "Ashley", # Megan - warm, natural female
|
||||
"21m00Tcm4TlvDq8ikWAM": "Wendy", # Tanya - posh, middle-aged British
|
||||
"XB0fDUnXU5powFXDhCwa": "Sarah", # Carla - fast-talking, questioning tone
|
||||
"pFZP5JQG7iQjIQuC4Bku": "Deborah", # Brenda - gentle, elegant
|
||||
}
|
||||
DEFAULT_INWORLD_VOICE = "Dennis"
|
||||
|
||||
|
||||
def preprocess_text_for_kokoro(text: str) -> str:
    """
    Preprocess text to improve Kokoro prosody and naturalness.

    - Adds slight pauses via punctuation
    - Handles contractions and abbreviations
    - Normalizes spacing

    Args:
        text: Raw text to be spoken.

    Returns:
        The massaged text, ready for synthesis.
    """
    import re

    # Normalize whitespace
    text = ' '.join(text.split())

    # Add comma pauses after common transition words (if no punctuation follows)
    transitions = [
        r'\b(Well)\s+(?=[A-Za-z])',
        r'\b(So)\s+(?=[A-Za-z])',
        r'\b(Now)\s+(?=[A-Za-z])',
        r'\b(Look)\s+(?=[A-Za-z])',
        r'\b(See)\s+(?=[A-Za-z])',
        r'\b(Anyway)\s+(?=[A-Za-z])',
        r'\b(Actually)\s+(?=[A-Za-z])',
        r'\b(Honestly)\s+(?=[A-Za-z])',
        r'\b(Basically)\s+(?=[A-Za-z])',
    ]
    for pattern in transitions:
        text = re.sub(pattern, r'\1, ', text)

    # Add pause after "I mean" at start of sentence
    text = re.sub(r'^(I mean)\s+', r'\1, ', text)
    text = re.sub(r'\.\s+(I mean)\s+', r'. \1, ', text)

    # Expand common abbreviations for better pronunciation.
    # NOTE: this is an ordered list, not a dict — 'w/o' must be expanded
    # before 'w/', otherwise the 'w/' rule turns "w/o" into "witho"
    # (the original dict applied 'w/' first, so "w/o" was never expanded).
    abbreviations = [
        (r'\bDr\.', 'Doctor'),
        (r'\bMr\.', 'Mister'),
        (r'\bMrs\.', 'Missus'),
        (r'\bMs\.', 'Miss'),
        (r'\bSt\.', 'Street'),
        (r'\bAve\.', 'Avenue'),
        (r'\betc\.', 'etcetera'),
        (r'\bvs\.', 'versus'),
        (r'\bw/o', 'without'),
        (r'\bw/', 'with'),
    ]
    for abbr, expansion in abbreviations:
        text = re.sub(abbr, expansion, text, flags=re.IGNORECASE)

    # Add breath pause (comma) before conjunctions in long sentences
    text = re.sub(r'(\w{20,})\s+(and|but|or)\s+', r'\1, \2 ', text)

    # Ensure proper spacing after punctuation
    text = re.sub(r'([.!?])\s*([A-Z])', r'\1 \2', text)

    return text
|
||||
|
||||
# StyleTTS2 reference voice files (place .wav files in voices/ directory for voice cloning)
|
||||
# Maps voice_id to reference audio filename - if file doesn't exist, uses default voice
|
||||
STYLETTS2_VOICES = {
|
||||
# Male voices
|
||||
"VR6AewLTigWG4xSOukaG": "tony.wav", # Tony
|
||||
"TxGEqnHWrfWFTfGW9XjX": "rick.wav", # Rick
|
||||
"pNInz6obpgDQGcFmaJgB": "dennis.wav", # Dennis
|
||||
"ODq5zmih8GrVes37Dizd": "earl.wav", # Earl
|
||||
"IKne3meq5aSn9XLyUdCD": "marcus.wav", # Marcus
|
||||
# Female voices
|
||||
"jBpfuIE2acCO8z3wKNLl": "jasmine.wav", # Jasmine
|
||||
"EXAVITQu4vr4xnSDxMaL": "megan.wav", # Megan
|
||||
"21m00Tcm4TlvDq8ikWAM": "tanya.wav", # Tanya
|
||||
"XB0fDUnXU5powFXDhCwa": "carla.wav", # Carla
|
||||
"pFZP5JQG7iQjIQuC4Bku": "brenda.wav", # Brenda
|
||||
}
|
||||
|
||||
# F5-TTS reference voices (same files as StyleTTS2, reuses voices/ directory)
|
||||
# Requires: mono, 24kHz, 5-10 seconds, with transcript in .txt file
|
||||
F5TTS_VOICES = STYLETTS2_VOICES.copy()
|
||||
|
||||
# ChatTTS speaker seeds - different seeds produce different voices
|
||||
# These are used to generate consistent speaker embeddings
|
||||
CHATTTS_SEEDS = {
|
||||
# Male voices
|
||||
"VR6AewLTigWG4xSOukaG": 42, # Tony - deep voice
|
||||
"TxGEqnHWrfWFTfGW9XjX": 123, # Rick
|
||||
"pNInz6obpgDQGcFmaJgB": 456, # Dennis
|
||||
"ODq5zmih8GrVes37Dizd": 789, # Earl
|
||||
"IKne3meq5aSn9XLyUdCD": 1011, # Marcus
|
||||
# Female voices
|
||||
"jBpfuIE2acCO8z3wKNLl": 2024, # Jasmine
|
||||
"EXAVITQu4vr4xnSDxMaL": 3033, # Megan
|
||||
"21m00Tcm4TlvDq8ikWAM": 4042, # Tanya
|
||||
"XB0fDUnXU5powFXDhCwa": 5051, # Carla
|
||||
"pFZP5JQG7iQjIQuC4Bku": 6060, # Brenda
|
||||
}
|
||||
DEFAULT_CHATTTS_SEED = 42
|
||||
|
||||
|
||||
def get_elevenlabs_client():
    """Lazily construct and cache the process-wide ElevenLabs client."""
    global _elevenlabs_client
    if _elevenlabs_client is not None:
        return _elevenlabs_client
    # Imported here so the dependency is only required when actually used.
    from elevenlabs.client import ElevenLabs
    _elevenlabs_client = ElevenLabs(api_key=settings.elevenlabs_api_key)
    return _elevenlabs_client
|
||||
|
||||
|
||||
def get_vits_tts():
    """Lazily load and cache the Coqui VITS VCTK multi-speaker model."""
    global _vits_tts
    if _vits_tts is not None:
        return _vits_tts
    from TTS.api import TTS
    _vits_tts = TTS("tts_models/en/vctk/vits")
    return _vits_tts
|
||||
|
||||
|
||||
def get_kokoro_model():
    """Lazily load and cache the Kokoro MLX model."""
    global _kokoro_model
    if _kokoro_model is not None:
        return _kokoro_model
    from mlx_audio.tts.utils import load_model
    _kokoro_model = load_model(model_path='mlx-community/Kokoro-82M-bf16')
    print("Kokoro MLX model loaded")
    return _kokoro_model
|
||||
|
||||
|
||||
def ensure_bark_loaded():
    """Load the Bark models once, preferring the Apple Silicon GPU (MPS)."""
    global _bark_loaded
    if _bark_loaded:
        return

    # Small models keep memory usage manageable.
    os.environ['SUNO_USE_SMALL_MODELS'] = '1'

    use_mps = torch.backends.mps.is_available()
    if use_mps:
        # Keep inference on the GPU rather than offloading to CPU.
        os.environ['SUNO_OFFLOAD_CPU'] = '0'
        os.environ['SUNO_ENABLE_MPS'] = '1'

    from bark import preload_models
    preload_models()
    _bark_loaded = True
    print(f"Bark loaded on device: {'MPS' if use_mps else 'CPU'}")
|
||||
|
||||
|
||||
def get_styletts2_model():
    """Lazily load and cache the StyleTTS2 model."""
    global _styletts2_model
    if _styletts2_model is not None:
        return _styletts2_model
    from styletts2 import tts
    _styletts2_model = tts.StyleTTS2()
    print("StyleTTS2 model loaded")
    return _styletts2_model
|
||||
|
||||
|
||||
def get_f5tts_generate():
    """Lazily import and cache the F5-TTS MLX generate function."""
    global _f5tts_model
    if _f5tts_model is not None:
        return _f5tts_model

    # Disable tqdm progress bars to avoid BrokenPipeError in server context
    import os
    os.environ['HF_HUB_DISABLE_PROGRESS_BARS'] = '1'
    os.environ['TQDM_DISABLE'] = '1'

    from f5_tts_mlx.generate import generate
    _f5tts_model = generate
    print("F5-TTS MLX loaded")
    return _f5tts_model
|
||||
|
||||
|
||||
def get_chattts_model():
    """Lazily load and cache the ChatTTS model."""
    global _chattts_model
    if _chattts_model is not None:
        return _chattts_model
    import ChatTTS
    _chattts_model = ChatTTS.Chat()
    # compile=False trades a little speed for faster startup.
    _chattts_model.load(compile=False)
    print("ChatTTS model loaded")
    return _chattts_model
|
||||
|
||||
|
||||
def get_chattts_speaker(voice_id: str):
    """Return a cached, deterministic ChatTTS speaker embedding for a voice id."""
    global _chattts_speakers
    speaker = _chattts_speakers.get(voice_id)
    if speaker is None:
        chat = get_chattts_model()
        seed = CHATTTS_SEEDS.get(voice_id, DEFAULT_CHATTTS_SEED)
        # Seed the RNG so the "random" speaker is reproducible per voice.
        torch.manual_seed(seed)
        speaker = chat.sample_random_speaker()
        _chattts_speakers[voice_id] = speaker
        print(f"[ChatTTS] Created speaker for voice {voice_id} with seed {seed}")
    return speaker
|
||||
|
||||
|
||||
def phone_filter(audio: np.ndarray, sample_rate: int = 24000, quality: str = "normal") -> np.ndarray:
    """Apply phone filter with variable quality.

    Band-limits the signal to a telephone-like band, soft-clips it, and
    (for lower qualities) adds bursty static. Unknown quality names fall
    back to "normal".
    """
    audio = audio.flatten()

    # (low cutoff Hz, high cutoff Hz, tanh drive, static noise level)
    presets = {
        "good": (200, 7000, 1.0, 0.0),
        "normal": (300, 3400, 1.5, 0.005),
        "bad": (400, 2800, 2.0, 0.015),
        "terrible": (500, 2200, 2.5, 0.03),
    }
    low_hz, high_hz, distortion, noise = presets.get(quality, presets["normal"])

    # 4th-order Butterworth band-pass, cutoffs normalized to Nyquist.
    nyquist = sample_rate / 2
    b, a = butter(4, [low_hz / nyquist, high_hz / nyquist], btype='band')
    filtered = filtfilt(b, a, audio)

    # Soft-clip for analog-style distortion; bounds output to (-0.8, 0.8).
    filtered = np.tanh(filtered * distortion) * 0.8

    if noise > 0:
        # Bursty static: Gaussian noise gated by a coarse random envelope
        # (one gate decision per 1000 samples).
        static = np.random.normal(0, noise, len(filtered)).astype(np.float32)
        envelope = np.repeat(np.random.random(len(filtered) // 1000 + 1), 1000)[:len(filtered)]
        filtered = filtered + static * (envelope > 0.7).astype(np.float32)

    return filtered.astype(np.float32)
|
||||
|
||||
|
||||
async def generate_speech_elevenlabs(text: str, voice_id: str) -> tuple[np.ndarray, int]:
    """Generate speech using ElevenLabs"""
    client = get_elevenlabs_client()

    # Request raw 24kHz PCM so no decode step is needed afterwards.
    stream = client.text_to_speech.convert(
        voice_id=voice_id,
        text=text,
        model_id="eleven_v3",
        output_format="pcm_24000"
    )

    pcm = b"".join(stream)
    samples = np.frombuffer(pcm, dtype=np.int16).astype(np.float32) / 32768.0
    return samples, 24000
|
||||
|
||||
|
||||
async def generate_speech_kokoro(text: str, voice_id: str) -> tuple[np.ndarray, int]:
    """Generate speech using MLX Kokoro (fast, good quality, Apple Silicon optimized)"""
    import librosa
    from mlx_audio.tts.generate import generate_audio

    model = get_kokoro_model()
    voice = KOKORO_VOICES.get(voice_id, DEFAULT_KOKORO_VOICE)
    speed = KOKORO_SPEEDS.get(voice_id, DEFAULT_KOKORO_SPEED)

    # Light text massaging noticeably improves Kokoro's prosody.
    text = preprocess_text_for_kokoro(text)

    # Voice name prefix encodes accent: 'b*' voices are British, the rest American.
    lang_code = 'b' if voice.startswith('b') else 'a'

    with tempfile.TemporaryDirectory() as tmpdir:
        # generate_audio writes <prefix>_000.wav into the output directory.
        generate_audio(
            text,
            model=model,
            voice=voice,
            speed=speed,
            lang_code=lang_code,
            output_path=tmpdir,
            file_prefix='tts',
            verbose=False
        )

        wav_path = Path(tmpdir) / 'tts_000.wav'
        if not wav_path.exists():
            raise RuntimeError("Kokoro failed to generate audio")

        samples, sr = librosa.load(str(wav_path), sr=None, mono=True)

    # Resample to the common 24kHz output rate if needed.
    if sr != 24000:
        samples = librosa.resample(samples, orig_sr=sr, target_sr=24000)

    return samples.astype(np.float32), 24000
|
||||
|
||||
|
||||
async def generate_speech_vits(text: str, voice_id: str) -> tuple[np.ndarray, int]:
    """Generate speech using VITS VCTK (fast, multiple speakers)"""
    import librosa

    engine = get_vits_tts()
    speaker = VITS_SPEAKERS.get(voice_id, DEFAULT_VITS_SPEAKER)

    # Coqui TTS only writes to disk, so round-trip through a temp wav.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        tmp_path = tmp.name

    try:
        engine.tts_to_file(text=text, file_path=tmp_path, speaker=speaker)
        samples, sr = librosa.load(tmp_path, sr=None, mono=True)

        if sr != 24000:
            samples = librosa.resample(samples, orig_sr=sr, target_sr=24000)

        return samples.astype(np.float32), 24000
    finally:
        Path(tmp_path).unlink(missing_ok=True)
|
||||
|
||||
|
||||
async def generate_speech_bark(text: str, voice_id: str) -> tuple[np.ndarray, int]:
    """Generate speech using Bark (slow but expressive, supports emotes like [laughs])"""
    import librosa
    from bark import SAMPLE_RATE, generate_audio

    ensure_bark_loaded()

    samples = generate_audio(text)

    # Bark output can exceed [-1, 1]; scale down to avoid clipping.
    peak = np.abs(samples).max()
    if peak > 0.95:
        samples = samples * (0.95 / peak)

    # Resample to the common 24kHz output rate if needed.
    if SAMPLE_RATE != 24000:
        samples = librosa.resample(samples, orig_sr=SAMPLE_RATE, target_sr=24000)

    return samples.astype(np.float32), 24000
|
||||
|
||||
|
||||
async def generate_speech_styletts2(text: str, voice_id: str) -> tuple[np.ndarray, int]:
    """Generate speech using StyleTTS2 (high quality, supports voice cloning).

    If a reference wav exists for the voice under <base_dir>/voices/, it is
    used for cloning; otherwise the model's default voice is used.

    Returns:
        Tuple of (float32 mono audio at 24kHz, 24000).
    """
    import librosa

    model = get_styletts2_model()

    # Resolve an optional reference voice file for cloning.
    voice_path = None
    voice_file = STYLETTS2_VOICES.get(voice_id)
    if voice_file:
        candidate = settings.base_dir / "voices" / voice_file
        if candidate.exists():
            voice_path = candidate  # fall back to default voice if missing

    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        tmp_path = tmp.name

    try:
        # inference() writes its result to tmp_path; the return value is not
        # bound because we reload from disk below (the original assigned it
        # to `audio` and immediately overwrote it — a dead store).
        if voice_path:
            print(f"[StyleTTS2] Using voice clone: {voice_path}")
            model.inference(
                text,
                target_voice_path=str(voice_path),
                output_wav_file=tmp_path,
                output_sample_rate=24000,
                diffusion_steps=5,  # Balance quality/speed
                alpha=0.3,  # More voice-like than text-like
                beta=0.7,  # Good prosody
            )
        else:
            print("[StyleTTS2] Using default voice")
            model.inference(
                text,
                output_wav_file=tmp_path,
                output_sample_rate=24000,
                diffusion_steps=5,
            )

        # Load the generated audio
        audio, sr = librosa.load(tmp_path, sr=None, mono=True)

        if sr != 24000:
            audio = librosa.resample(audio, orig_sr=sr, target_sr=24000)

        return audio.astype(np.float32), 24000
    finally:
        Path(tmp_path).unlink(missing_ok=True)
|
||||
|
||||
|
||||
async def generate_speech_f5tts(text: str, voice_id: str) -> tuple[np.ndarray, int]:
    """Generate speech using F5-TTS MLX (very natural, supports voice cloning)"""
    import librosa

    generate = get_f5tts_generate()

    # Voice cloning needs both a reference wav and its transcript (.txt).
    ref_audio_path = None
    ref_text = None
    voice_file = F5TTS_VOICES.get(voice_id)
    if voice_file:
        voice_path = settings.base_dir / "voices" / voice_file
        transcript_path = voice_path.with_suffix('.txt')
        if voice_path.exists() and transcript_path.exists():
            ref_audio_path = str(voice_path)
            ref_text = transcript_path.read_text().strip()
            print(f"[F5-TTS] Using voice clone: {voice_path}")

    if not ref_audio_path:
        print("[F5-TTS] Using default voice")

    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        tmp_path = tmp.name

    try:
        generate(
            generation_text=text,
            ref_audio_path=ref_audio_path,
            ref_audio_text=ref_text,
            steps=8,
            speed=1.0,
            output_path=tmp_path,
        )

        samples, sr = librosa.load(tmp_path, sr=None, mono=True)

        # Resample to the common 24kHz output rate if needed.
        if sr != 24000:
            samples = librosa.resample(samples, orig_sr=sr, target_sr=24000)

        return samples.astype(np.float32), 24000
    finally:
        Path(tmp_path).unlink(missing_ok=True)
|
||||
|
||||
|
||||
async def generate_speech_chattts(text: str, voice_id: str) -> tuple[np.ndarray, int]:
    """Generate speech using ChatTTS (natural conversational speech, multiple speakers)"""
    import ChatTTS

    chat = get_chattts_model()

    # Guard against empty input.
    text = text.strip() or "Hello."

    print(f"[ChatTTS] Generating speech for: {text[:50]}...")

    # Seed the RNG so this voice id always maps to the same speaker.
    torch.manual_seed(CHATTTS_SEEDS.get(voice_id, DEFAULT_CHATTTS_SEED))

    infer_params = ChatTTS.Chat.InferCodeParams(
        temperature=0.3,
        top_P=0.7,
        top_K=20,
    )

    # Generate audio (skip text refinement to avoid narrow() error with this version)
    wavs = chat.infer(
        [text],
        params_infer_code=infer_params,
        skip_refine_text=True,
    )

    if wavs is None or len(wavs) == 0:
        raise RuntimeError("ChatTTS failed to generate audio")

    samples = wavs[0]

    # Flatten any extra dimensions the model may return.
    if samples.ndim > 1:
        samples = samples.squeeze()

    # Scale down to avoid clipping.
    peak = np.abs(samples).max()
    if peak > 0.95:
        samples = samples * (0.95 / peak)

    return samples.astype(np.float32), 24000
|
||||
|
||||
|
||||
async def generate_speech_inworld(text: str, voice_id: str) -> tuple[np.ndarray, int]:
    """Generate speech using Inworld TTS API (high quality, natural voices).

    Args:
        text: Text to synthesize.
        voice_id: ElevenLabs-style voice id, mapped to an Inworld voice name.

    Returns:
        Tuple of (float32 mono audio at 24kHz, 24000).

    Raises:
        RuntimeError: If the API key is missing or no audio is returned.
    """
    import httpx
    import base64
    import librosa

    voice = INWORLD_VOICES.get(voice_id, DEFAULT_INWORLD_VOICE)

    api_key = settings.inworld_api_key
    if not api_key:
        raise RuntimeError("INWORLD_API_KEY not set in environment")

    print(f"[Inworld TTS] Voice: {voice}, Text: {text[:50]}...")

    url = "https://api.inworld.ai/tts/v1/voice"
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Basic {api_key}",
    }
    payload = {
        "text": text,
        "voice_id": voice,
        "model_id": "inworld-tts-1.5-mini",
        "audio_config": {
            "encoding": "LINEAR16",
            "sample_rate_hertz": 48000,
        },
    }

    async with httpx.AsyncClient(timeout=60.0) as client:
        response = await client.post(url, json=payload, headers=headers)
        response.raise_for_status()
        data = response.json()

    # Decode base64 audio
    audio_b64 = data.get("audioContent")
    if not audio_b64:
        raise RuntimeError("Inworld TTS returned no audio content")

    audio_bytes = base64.b64decode(audio_b64)

    # Parse audio using soundfile (handles WAV, FLAC, OGG, and with ffmpeg: MP3)
    import soundfile as sf
    import io

    try:
        audio, sr = sf.read(io.BytesIO(audio_bytes))
    except Exception as e:
        print(f"[Inworld TTS] soundfile failed: {e}, trying raw PCM")
        # Fallback: assume raw 16-bit PCM at the requested 48kHz,
        # dropping a trailing odd byte if present.
        if len(audio_bytes) % 2 != 0:
            audio_bytes = audio_bytes[:-1]
        audio = np.frombuffer(audio_bytes, dtype=np.int16).astype(np.float32) / 32768.0
        sr = 48000

    # sf.read returns a (frames, channels) array for multi-channel files;
    # downmix to mono so resampling and downstream mixing behave like the
    # other providers (the original passed 2-D audio straight through).
    if audio.ndim > 1:
        audio = audio.mean(axis=1)

    # Resample to 24kHz to match other providers
    if sr != 24000:
        audio = librosa.resample(audio, orig_sr=sr, target_sr=24000)

    return audio.astype(np.float32), 24000
|
||||
|
||||
|
||||
async def generate_speech(
    text: str,
    voice_id: str,
    phone_quality: str = "normal",
    apply_filter: bool = True
) -> bytes:
    """
    Generate speech from text.

    Args:
        text: Text to speak
        voice_id: ElevenLabs voice ID (mapped to local voice if using local TTS)
        phone_quality: Quality of phone filter ("none" to disable)
        apply_filter: Whether to apply phone filter

    Returns:
        Raw PCM audio bytes (16-bit signed int, 24kHz)
    """
    provider = settings.tts_provider
    print(f"[TTS] Provider: {provider}, Text: {text[:50]}...")

    # Route to the configured backend; each returns (float32 samples, rate).
    if provider == "kokoro":
        samples, rate = await generate_speech_kokoro(text, voice_id)
    elif provider == "f5tts":
        samples, rate = await generate_speech_f5tts(text, voice_id)
    elif provider == "inworld":
        samples, rate = await generate_speech_inworld(text, voice_id)
    elif provider == "chattts":
        samples, rate = await generate_speech_chattts(text, voice_id)
    elif provider == "styletts2":
        samples, rate = await generate_speech_styletts2(text, voice_id)
    elif provider == "bark":
        samples, rate = await generate_speech_bark(text, voice_id)
    elif provider == "vits":
        samples, rate = await generate_speech_vits(text, voice_id)
    elif provider == "elevenlabs":
        samples, rate = await generate_speech_elevenlabs(text, voice_id)
    else:
        raise ValueError(f"Unknown TTS provider: {provider}")

    # Apply the telephone-bandwidth filter unless disabled by the caller or
    # by quality setting. Bark is exempt: its output is already rough.
    wants_filter = apply_filter and phone_quality not in ("none", "studio")
    if wants_filter and provider != "bark":
        samples = phone_filter(samples, rate, phone_quality)

    # float32 in [-1, 1] -> 16-bit signed PCM bytes.
    pcm = (samples * 32768).clip(-32768, 32767).astype(np.int16)
    return pcm.tobytes()
|
||||
|
||||
|
||||
# Voice IDs for cohost and announcer
# (ElevenLabs voice IDs; they match the "el_bobby" and "el_announcer"
# entries declared in backend/services/voices.py)
COHOST_VOICE_ID = "nPczCjzI2devNBz1zQrb"
ANNOUNCER_VOICE_ID = "ErXwobaYiN019PkySvjV"
|
||||
|
||||
|
||||
async def generate_cohost_speech(text: str) -> bytes:
    """Generate speech for cohost Bobby (no phone filter)"""
    # The cohost is "in studio", so the caller phone filter is skipped.
    pcm = await generate_speech(text, COHOST_VOICE_ID, apply_filter=False)
    return pcm
|
||||
|
||||
|
||||
async def generate_announcer_speech(text: str) -> bytes:
    """Generate speech for announcer (no phone filter)"""
    # The announcer is studio audio, so the caller phone filter is skipped.
    pcm = await generate_speech(text, ANNOUNCER_VOICE_ID, apply_filter=False)
    return pcm
|
||||
200
backend/services/voices.py
Normal file
200
backend/services/voices.py
Normal file
@@ -0,0 +1,200 @@
|
||||
"""Voice configuration and TTS provider management"""
|
||||
|
||||
from dataclasses import dataclass
|
||||
from typing import Optional
|
||||
from enum import Enum
|
||||
|
||||
|
||||
class TTSProvider(str, Enum):
    """TTS backends a Voice can belong to.

    Subclasses str so members compare equal to their plain string values
    (e.g. in API payloads).

    NOTE(review): the server's generate_speech() dispatches on more
    provider names (kokoro, f5tts, inworld, chattts, ...) than are listed
    here — confirm whether this enum is still the authoritative set.
    """
    ELEVENLABS = "elevenlabs"
    EDGE = "edge"  # Microsoft Edge TTS (free)
    PIPER = "piper"  # Local Piper via sherpa-onnx (free, fast)
|
||||
|
||||
|
||||
@dataclass
class Voice:
    """Voice configuration"""
    # App-internal identifier (e.g. "el_tony"), used as the lookup key
    # in VOICES_BY_ID and in caller assignments.
    id: str
    # Human-readable display name (surfaced by VoiceManager.get_all_voices).
    name: str
    # Which TTS backend serves this voice.
    provider: TTSProvider
    provider_voice_id: str  # The actual ID used by the provider
    # Short free-text description of how the voice sounds.
    description: str = ""
    # BCP-47-ish language code; every entry below uses "en".
    language: str = "en"
    gender: str = "neutral"
|
||||
|
||||
|
||||
# ElevenLabs voices
# Voice(id, name, provider, provider_voice_id, description, language, gender)
ELEVENLABS_VOICES = [
    Voice("el_tony", "Tony (ElevenLabs)", TTSProvider.ELEVENLABS, "IKne3meq5aSn9XLyUdCD",
          "Male, New York accent, expressive", "en", "male"),
    Voice("el_jasmine", "Jasmine (ElevenLabs)", TTSProvider.ELEVENLABS, "FGY2WhTYpPnrIDTdsKH5",
          "Female, confident, direct", "en", "female"),
    Voice("el_rick", "Rick (ElevenLabs)", TTSProvider.ELEVENLABS, "JBFqnCBsd6RMkjVDRZzb",
          "Male, Texas accent, older", "en", "male"),
    Voice("el_megan", "Megan (ElevenLabs)", TTSProvider.ELEVENLABS, "XrExE9yKIg1WjnnlVkGX",
          "Female, young, casual", "en", "female"),
    Voice("el_dennis", "Dennis (ElevenLabs)", TTSProvider.ELEVENLABS, "cjVigY5qzO86Huf0OWal",
          "Male, middle-aged, anxious", "en", "male"),
    Voice("el_tanya", "Tanya (ElevenLabs)", TTSProvider.ELEVENLABS, "N2lVS1w4EtoT3dr4eOWO",
          "Female, Miami, sassy", "en", "female"),
    Voice("el_earl", "Earl (ElevenLabs)", TTSProvider.ELEVENLABS, "EXAVITQu4vr4xnSDxMaL",
          "Male, elderly, Southern", "en", "male"),
    Voice("el_carla", "Carla (ElevenLabs)", TTSProvider.ELEVENLABS, "CwhRBWXzGAHq8TQ4Fs17",
          "Female, Jersey, sharp", "en", "female"),
    Voice("el_marcus", "Marcus (ElevenLabs)", TTSProvider.ELEVENLABS, "bIHbv24MWmeRgasZH58o",
          "Male, young, urban", "en", "male"),
    Voice("el_brenda", "Brenda (ElevenLabs)", TTSProvider.ELEVENLABS, "Xb7hH8MSUJpSbSDYk0k2",
          "Female, middle-aged, worried", "en", "female"),
    Voice("el_jake", "Jake (ElevenLabs)", TTSProvider.ELEVENLABS, "SOYHLrjzK2X1ezoPC6cr",
          "Male, Boston, insecure", "en", "male"),
    Voice("el_diane", "Diane (ElevenLabs)", TTSProvider.ELEVENLABS, "cgSgspJ2msm6clMCkdW9",
          "Female, mature, conflicted", "en", "female"),
    Voice("el_bobby", "Bobby (ElevenLabs)", TTSProvider.ELEVENLABS, "nPczCjzI2devNBz1zQrb",
          "Male, sidekick, wisecracking", "en", "male"),
    Voice("el_announcer", "Announcer (ElevenLabs)", TTSProvider.ELEVENLABS, "ErXwobaYiN019PkySvjV",
          "Male, radio announcer", "en", "male"),
]

# Edge TTS voices (Microsoft, free)
EDGE_VOICES = [
    # US voices
    Voice("edge_jenny", "Jenny (Edge)", TTSProvider.EDGE, "en-US-JennyNeural",
          "Female, American, friendly", "en", "female"),
    Voice("edge_guy", "Guy (Edge)", TTSProvider.EDGE, "en-US-GuyNeural",
          "Male, American, casual", "en", "male"),
    Voice("edge_aria", "Aria (Edge)", TTSProvider.EDGE, "en-US-AriaNeural",
          "Female, American, professional", "en", "female"),
    Voice("edge_davis", "Davis (Edge)", TTSProvider.EDGE, "en-US-DavisNeural",
          "Male, American, calm", "en", "male"),
    Voice("edge_amber", "Amber (Edge)", TTSProvider.EDGE, "en-US-AmberNeural",
          "Female, American, warm", "en", "female"),
    Voice("edge_andrew", "Andrew (Edge)", TTSProvider.EDGE, "en-US-AndrewNeural",
          "Male, American, confident", "en", "male"),
    Voice("edge_ashley", "Ashley (Edge)", TTSProvider.EDGE, "en-US-AshleyNeural",
          "Female, American, cheerful", "en", "female"),
    Voice("edge_brian", "Brian (Edge)", TTSProvider.EDGE, "en-US-BrianNeural",
          "Male, American, narrator", "en", "male"),
    Voice("edge_christopher", "Christopher (Edge)", TTSProvider.EDGE, "en-US-ChristopherNeural",
          "Male, American, reliable", "en", "male"),
    Voice("edge_cora", "Cora (Edge)", TTSProvider.EDGE, "en-US-CoraNeural",
          "Female, American, older", "en", "female"),
    Voice("edge_elizabeth", "Elizabeth (Edge)", TTSProvider.EDGE, "en-US-ElizabethNeural",
          "Female, American, elegant", "en", "female"),
    Voice("edge_eric", "Eric (Edge)", TTSProvider.EDGE, "en-US-EricNeural",
          "Male, American, friendly", "en", "male"),
    Voice("edge_jacob", "Jacob (Edge)", TTSProvider.EDGE, "en-US-JacobNeural",
          "Male, American, young", "en", "male"),
    Voice("edge_michelle", "Michelle (Edge)", TTSProvider.EDGE, "en-US-MichelleNeural",
          "Female, American, clear", "en", "female"),
    Voice("edge_monica", "Monica (Edge)", TTSProvider.EDGE, "en-US-MonicaNeural",
          "Female, American, expressive", "en", "female"),
    Voice("edge_roger", "Roger (Edge)", TTSProvider.EDGE, "en-US-RogerNeural",
          "Male, American, mature", "en", "male"),
    Voice("edge_steffan", "Steffan (Edge)", TTSProvider.EDGE, "en-US-SteffanNeural",
          "Male, American, formal", "en", "male"),
    Voice("edge_tony", "Tony (Edge)", TTSProvider.EDGE, "en-US-TonyNeural",
          "Male, American, conversational", "en", "male"),
    # UK voices
    Voice("edge_sonia", "Sonia (Edge UK)", TTSProvider.EDGE, "en-GB-SoniaNeural",
          "Female, British, professional", "en", "female"),
    Voice("edge_ryan", "Ryan (Edge UK)", TTSProvider.EDGE, "en-GB-RyanNeural",
          "Male, British, clear", "en", "male"),
    Voice("edge_libby", "Libby (Edge UK)", TTSProvider.EDGE, "en-GB-LibbyNeural",
          "Female, British, warm", "en", "female"),
    Voice("edge_thomas", "Thomas (Edge UK)", TTSProvider.EDGE, "en-GB-ThomasNeural",
          "Male, British, friendly", "en", "male"),
    # Australian voices
    Voice("edge_natasha", "Natasha (Edge AU)", TTSProvider.EDGE, "en-AU-NatashaNeural",
          "Female, Australian, friendly", "en", "female"),
    Voice("edge_william", "William (Edge AU)", TTSProvider.EDGE, "en-AU-WilliamNeural",
          "Male, Australian, casual", "en", "male"),
]

# Piper voices (local, via sherpa-onnx)
PIPER_VOICES = [
    Voice("piper_amy", "Amy (Piper)", TTSProvider.PIPER, "amy",
          "Female, American, clear", "en", "female"),
    Voice("piper_joe", "Joe (Piper)", TTSProvider.PIPER, "joe",
          "Male, American, natural", "en", "male"),
    Voice("piper_lessac", "Lessac (Piper)", TTSProvider.PIPER, "lessac",
          "Female, American, expressive", "en", "female"),
    Voice("piper_alan", "Alan (Piper)", TTSProvider.PIPER, "alan",
          "Male, British, clear", "en", "male"),
]

# All voices combined
ALL_VOICES = ELEVENLABS_VOICES + EDGE_VOICES + PIPER_VOICES

# Voice lookup by ID
VOICES_BY_ID = {v.id: v for v in ALL_VOICES}

# Default voice assignments for callers (maps caller key to voice ID)
# NOTE(review): the keys look like keyboard hotkeys ("1".."0", "-", "=") plus
# the two named show roles — confirm against the frontend's caller grid.
DEFAULT_CALLER_VOICES = {
    "1": "el_tony",       # Tony from Staten Island
    "2": "el_jasmine",    # Jasmine from Atlanta
    "3": "el_rick",       # Rick from Texas
    "4": "el_megan",      # Megan from Portland
    "5": "el_dennis",     # Dennis from Long Island
    "6": "el_tanya",      # Tanya from Miami
    "7": "el_earl",       # Earl from Tennessee
    "8": "el_carla",      # Carla from Jersey
    "9": "el_marcus",     # Marcus from Detroit
    "0": "el_brenda",     # Brenda from Phoenix
    "-": "el_jake",       # Jake from Boston
    "=": "el_diane",      # Diane from Chicago
    "bobby": "el_bobby",
    "announcer": "el_announcer",
}
|
||||
|
||||
|
||||
class VoiceManager:
    """Manages voice assignments and TTS provider selection"""

    def __init__(self):
        # Start from the defaults; assignments may change at runtime.
        self.caller_voices = dict(DEFAULT_CALLER_VOICES)

    def get_voice(self, voice_id: str) -> Optional[Voice]:
        """Get voice by ID"""
        return VOICES_BY_ID.get(voice_id)

    def get_caller_voice(self, caller_key: str) -> Voice:
        """Get the voice assigned to a caller"""
        assigned_id = self.caller_voices.get(caller_key, "el_tony")
        # Unknown IDs fall back to the first ElevenLabs voice.
        return VOICES_BY_ID.get(assigned_id, ELEVENLABS_VOICES[0])

    def set_caller_voice(self, caller_key: str, voice_id: str):
        """Assign a voice to a caller (unknown voice IDs are ignored)"""
        if voice_id not in VOICES_BY_ID:
            return
        self.caller_voices[caller_key] = voice_id

    def get_all_voices(self) -> list[dict]:
        """Get all available voices as dicts for API"""
        return [
            {
                "id": voice.id,
                "name": voice.name,
                "provider": voice.provider.value,
                "description": voice.description,
                "gender": voice.gender,
            }
            for voice in ALL_VOICES
        ]

    def get_voices_by_provider(self, provider: TTSProvider) -> list[Voice]:
        """Get all voices for a specific provider"""
        return [voice for voice in ALL_VOICES if voice.provider == provider]

    def get_caller_voice_assignments(self) -> dict[str, str]:
        """Get current caller voice assignments (a copy, safe to mutate)"""
        return dict(self.caller_voices)

    def set_caller_voice_assignments(self, assignments: dict[str, str]):
        """Set multiple caller voice assignments"""
        for caller_key, voice_id in assignments.items():
            # Re-use the single-assignment path so validation stays in one place.
            self.set_caller_voice(caller_key, voice_id)
|
||||
|
||||
# Global instance
# Module-level singleton; import this rather than constructing VoiceManager
# again, so runtime voice assignments are shared across the app.
voice_manager = VoiceManager()
|
||||
109
download_sounds.py
Normal file
109
download_sounds.py
Normal file
@@ -0,0 +1,109 @@
|
||||
#!/usr/bin/env python3
"""
Download free sound effects for the radio show soundboard.
Uses sounds from freesound.org and other free sources.
"""

import os
import urllib.request
import ssl
from pathlib import Path

# Bypass SSL issues
# HACK: disables TLS certificate verification process-wide. Tolerable for a
# one-off download script, but do not copy this into server code.
ssl._create_default_https_context = ssl._create_unverified_context

# Destination directory, created next to this script if missing.
SOUNDS_DIR = Path(__file__).parent / "sounds"
SOUNDS_DIR.mkdir(exist_ok=True)

# Free sound effect URLs (public domain / CC0)
# These are from various free sources
SOUND_URLS = {
    # Using pixabay free sounds (no attribution required)
    # Keys are the target .wav filenames; MP3 sources are converted on download.
    'rimshot.wav': 'https://cdn.pixabay.com/audio/2022/03/15/audio_7a569d6dde.mp3',
    'laugh.wav': 'https://cdn.pixabay.com/audio/2024/02/14/audio_70fa4b1f7c.mp3',
    'sad_trombone.wav': 'https://cdn.pixabay.com/audio/2022/03/15/audio_cce0f1f0f1.mp3',
    'cheer.wav': 'https://cdn.pixabay.com/audio/2021/08/04/audio_0625c1539c.mp3',
    'boo.wav': 'https://cdn.pixabay.com/audio/2022/10/30/audio_f2a4d3d7db.mp3',
    'drumroll.wav': 'https://cdn.pixabay.com/audio/2022/03/24/audio_52a6ef9129.mp3',
    'crickets.wav': 'https://cdn.pixabay.com/audio/2022/03/09/audio_691875e05c.mp3',
    'phone_ring.wav': 'https://cdn.pixabay.com/audio/2022/03/15/audio_0f66b49312.mp3',
}
|
||||
|
||||
def download_sound(name, url):
    """Download one sound effect into SOUNDS_DIR.

    MP3 sources are converted to 24kHz mono with ffmpeg so they match the
    rest of the audio pipeline; other formats are written as-is.

    Args:
        name: Target filename (e.g. "rimshot.wav").
        url: Source URL to fetch.

    Returns:
        True on success (or if the file already exists), False on any failure.
    """
    output_path = SOUNDS_DIR / name

    if output_path.exists():
        print(f" ✓ {name} (already exists)")
        return True

    try:
        print(f" Downloading {name}...")

        # Download the file; a browser-ish UA avoids CDN bot blocking.
        req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
        with urllib.request.urlopen(req, timeout=30) as response:
            data = response.read()

        # If it's an MP3, we need to convert it
        if url.endswith('.mp3'):
            import subprocess
            temp_mp3 = SOUNDS_DIR / f"temp_{name}.mp3"
            try:
                temp_mp3.write_bytes(data)
                # Convert to 24kHz mono with ffmpeg.
                result = subprocess.run([
                    'ffmpeg', '-y', '-i', str(temp_mp3),
                    '-ar', '24000', '-ac', '1',
                    str(output_path)
                ], capture_output=True)
            finally:
                # Always remove the temp file — previously it leaked when
                # ffmpeg was missing and subprocess.run raised.
                temp_mp3.unlink(missing_ok=True)

            if result.returncode == 0:
                print(f" ✓ {name}")
                return True
            print(f" ✗ {name} (ffmpeg conversion failed)")
            return False

        output_path.write_bytes(data)
        print(f" ✓ {name}")
        return True

    except Exception as e:
        # Best-effort downloader: report and move on to the next sound.
        print(f" ✗ {name} ({e})")
        return False
|
||||
|
||||
def main():
    """Download every sound in SOUND_URLS and print a summary."""
    print("Downloading sound effects for radio show soundboard...")
    print(f"Saving to: {SOUNDS_DIR}\n")

    # Check for ffmpeg (needed to convert MP3 sources); warn up front if absent.
    import subprocess
    try:
        subprocess.run(['ffmpeg', '-version'], capture_output=True, check=True)
    except (FileNotFoundError, subprocess.CalledProcessError):
        # Narrowed from a bare `except:` so Ctrl-C / SystemExit aren't swallowed.
        print("WARNING: ffmpeg not found. Install it with: brew install ffmpeg")
        print("Some sounds may not download correctly.\n")

    success = 0
    for name, url in SOUND_URLS.items():
        if download_sound(name, url):
            success += 1

    print(f"\nDownloaded {success}/{len(SOUND_URLS)} sounds.")
    print("\nTo add more sounds:")
    print(" 1. Find free .wav files online")
    print(" 2. Name them according to the SOUNDBOARD mapping in radio_show.py")
    print(" 3. Place them in the sounds/ directory")
    print("\nRecommended free sound sources:")
    print(" - freesound.org")
    print(" - pixabay.com/sound-effects")
    print(" - zapsplat.com")
    print(" - soundbible.com")

if __name__ == "__main__":
    main()
|
||||
543
frontend/css/style.css
Normal file
543
frontend/css/style.css
Normal file
@@ -0,0 +1,543 @@
|
||||
/* AI Radio Show - Clean CSS */

/* Theme tokens — every color/radius below references these. */
:root {
    --bg: #1a1a2e;
    --bg-light: #252547;
    --accent: #e94560;
    --text: #fff;
    --text-muted: #888;
    --radius: 8px;
}

* {
    box-sizing: border-box;
    margin: 0;
    padding: 0;
}

body {
    font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', sans-serif;
    background: var(--bg);
    color: var(--text);
    min-height: 100vh;
}

#app {
    max-width: 900px;
    margin: 0 auto;
    padding: 20px;
}

/* Header */
header {
    display: flex;
    justify-content: space-between;
    align-items: center;
    margin-bottom: 20px;
}

header h1 {
    font-size: 1.5rem;
}

.header-buttons {
    display: flex;
    gap: 8px;
}

header button {
    background: var(--bg-light);
    color: var(--text);
    border: none;
    padding: 8px 16px;
    border-radius: var(--radius);
    cursor: pointer;
}

.new-session-btn {
    background: var(--accent) !important;
}

.session-id {
    font-size: 0.7rem;
    color: var(--text-muted);
    font-weight: normal;
}

.caller-background {
    font-size: 0.85rem;
    color: var(--text-muted);
    padding: 10px;
    background: var(--bg);
    border-radius: var(--radius);
    margin-bottom: 12px;
    line-height: 1.4;
}

.caller-background.hidden {
    display: none;
}

/* Main layout — two columns, collapsing to one on narrow screens. */
main {
    display: grid;
    grid-template-columns: 1fr 1fr;
    gap: 20px;
}

@media (max-width: 700px) {
    main {
        grid-template-columns: 1fr;
    }
}

/* Sections */
section {
    background: var(--bg-light);
    padding: 16px;
    border-radius: var(--radius);
}

section h2 {
    font-size: 1rem;
    margin-bottom: 12px;
    color: var(--text-muted);
}

/* Callers */
.caller-grid {
    display: grid;
    grid-template-columns: repeat(5, 1fr);
    gap: 8px;
    margin-bottom: 12px;
}

.caller-btn {
    background: var(--bg);
    color: var(--text);
    border: 2px solid transparent;
    padding: 10px 8px;
    border-radius: var(--radius);
    cursor: pointer;
    font-size: 0.85rem;
    transition: all 0.2s;
}

.caller-btn:hover {
    border-color: var(--accent);
}

.caller-btn.active {
    background: var(--accent);
    border-color: var(--accent);
}

.call-status {
    text-align: center;
    padding: 8px;
    color: var(--text-muted);
    margin-bottom: 12px;
}

.hangup-btn {
    width: 100%;
    background: #c0392b;
    color: white;
    border: none;
    padding: 12px;
    border-radius: var(--radius);
    cursor: pointer;
    font-weight: bold;
}

.hangup-btn:disabled {
    opacity: 0.5;
    cursor: not-allowed;
}

/* Chat — spans both columns on wide layouts. */
.chat-section {
    grid-column: span 2;
}

@media (max-width: 700px) {
    .chat-section {
        grid-column: span 1;
    }
}

.chat-log {
    height: 300px;
    overflow-y: auto;
    background: var(--bg);
    border-radius: var(--radius);
    padding: 12px;
    margin-bottom: 12px;
}
|
||||
|
||||
.message strong {
|
||||
display: block;
|
||||
font-size: 0.8rem;
|
||||
opacity: 0.7;
|
||||
margin-bottom: 4px;
|
||||
}
|
||||
|
||||
.talk-controls {
|
||||
display: flex;
|
||||
gap: 10px;
|
||||
}
|
||||
|
||||
.talk-btn {
|
||||
flex: 1;
|
||||
background: var(--accent);
|
||||
color: white;
|
||||
border: none;
|
||||
padding: 16px;
|
||||
border-radius: var(--radius);
|
||||
font-size: 1rem;
|
||||
font-weight: bold;
|
||||
cursor: pointer;
|
||||
transition: all 0.2s;
|
||||
}
|
||||
|
||||
.talk-btn:hover {
|
||||
filter: brightness(1.1);
|
||||
}
|
||||
|
||||
.talk-btn.recording {
|
||||
background: #c0392b;
|
||||
animation: pulse 1s infinite;
|
||||
}
|
||||
|
||||
@keyframes pulse {
|
||||
0%, 100% { opacity: 1; }
|
||||
50% { opacity: 0.7; }
|
||||
}
|
||||
|
||||
.type-btn {
|
||||
background: var(--bg);
|
||||
color: var(--text);
|
||||
border: none;
|
||||
padding: 16px 24px;
|
||||
border-radius: var(--radius);
|
||||
cursor: pointer;
|
||||
}
|
||||
|
||||
.status {
|
||||
text-align: center;
|
||||
padding: 12px;
|
||||
color: var(--accent);
|
||||
font-weight: bold;
|
||||
}
|
||||
|
||||
.status.hidden {
|
||||
display: none;
|
||||
}
|
||||
|
||||
/* Music */
|
||||
.music-section select {
|
||||
width: 100%;
|
||||
padding: 10px;
|
||||
background: var(--bg);
|
||||
color: var(--text);
|
||||
border: none;
|
||||
border-radius: var(--radius);
|
||||
margin-bottom: 10px;
|
||||
}
|
||||
|
||||
.music-controls {
|
||||
display: flex;
|
||||
gap: 8px;
|
||||
align-items: center;
|
||||
}
|
||||
|
||||
.music-controls button {
|
||||
background: var(--bg);
|
||||
color: var(--text);
|
||||
border: none;
|
||||
padding: 10px 16px;
|
||||
border-radius: var(--radius);
|
||||
cursor: pointer;
|
||||
}
|
||||
|
||||
.music-controls input[type="range"] {
|
||||
flex: 1;
|
||||
}
|
||||
|
||||
/* Soundboard */
|
||||
.soundboard {
|
||||
display: grid;
|
||||
grid-template-columns: repeat(3, 1fr);
|
||||
gap: 8px;
|
||||
}
|
||||
|
||||
.sound-btn {
|
||||
background: var(--bg);
|
||||
color: var(--text);
|
||||
border: none;
|
||||
padding: 12px 8px;
|
||||
border-radius: var(--radius);
|
||||
cursor: pointer;
|
||||
font-size: 0.8rem;
|
||||
transition: all 0.1s;
|
||||
}
|
||||
|
||||
.sound-btn:hover {
|
||||
background: var(--accent);
|
||||
}
|
||||
|
||||
.sound-btn:active {
|
||||
transform: scale(0.95);
|
||||
}
|
||||
|
||||
/* Modal */
|
||||
.modal {
|
||||
position: fixed;
|
||||
inset: 0;
|
||||
background: rgba(0, 0, 0, 0.8);
|
||||
display: flex;
|
||||
align-items: center;
|
||||
justify-content: center;
|
||||
z-index: 100;
|
||||
}
|
||||
|
||||
.modal.hidden {
|
||||
display: none;
|
||||
}
|
||||
|
||||
.modal-content {
|
||||
background: var(--bg-light);
|
||||
padding: 24px;
|
||||
border-radius: var(--radius);
|
||||
width: 90%;
|
||||
max-width: 400px;
|
||||
}
|
||||
|
||||
.modal-content h2 {
|
||||
margin-bottom: 16px;
|
||||
}
|
||||
|
||||
.modal-content h3 {
|
||||
font-size: 0.9rem;
|
||||
color: var(--text-muted);
|
||||
margin: 16px 0 8px 0;
|
||||
border-bottom: 1px solid var(--bg);
|
||||
padding-bottom: 4px;
|
||||
}
|
||||
|
||||
.settings-group {
|
||||
margin-bottom: 16px;
|
||||
}
|
||||
|
||||
.device-row {
|
||||
display: flex;
|
||||
gap: 8px;
|
||||
align-items: flex-end;
|
||||
}
|
||||
|
||||
.device-row label:first-child {
|
||||
flex: 1;
|
||||
}
|
||||
|
||||
.channel-row {
|
||||
display: flex;
|
||||
gap: 12px;
|
||||
margin-top: 8px;
|
||||
}
|
||||
|
||||
.channel-row label {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: 4px;
|
||||
font-size: 0.85rem;
|
||||
}
|
||||
|
||||
.channel-input {
|
||||
width: 50px !important;
|
||||
text-align: center;
|
||||
}
|
||||
|
||||
.modal-content label {
|
||||
display: block;
|
||||
margin-bottom: 16px;
|
||||
}
|
||||
|
||||
.modal-content label.checkbox {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: 8px;
|
||||
}
|
||||
|
||||
.modal-content select,
|
||||
.modal-content input[type="text"],
|
||||
.modal-content textarea {
|
||||
width: 100%;
|
||||
padding: 10px;
|
||||
background: var(--bg);
|
||||
color: var(--text);
|
||||
border: none;
|
||||
border-radius: var(--radius);
|
||||
margin-top: 4px;
|
||||
}
|
||||
|
||||
.modal-buttons {
|
||||
display: flex;
|
||||
gap: 10px;
|
||||
margin-top: 20px;
|
||||
}
|
||||
|
||||
.modal-buttons button {
|
||||
flex: 1;
|
||||
padding: 12px;
|
||||
border: none;
|
||||
border-radius: var(--radius);
|
||||
cursor: pointer;
|
||||
font-weight: bold;
|
||||
}
|
||||
|
||||
.modal-buttons button:first-child {
|
||||
background: var(--accent);
|
||||
color: white;
|
||||
}
|
||||
|
||||
.modal-buttons button:last-child {
|
||||
background: var(--bg);
|
||||
color: var(--text);
|
||||
}
|
||||
|
||||
.refresh-btn {
|
||||
background: var(--bg);
|
||||
color: var(--text-muted);
|
||||
border: 1px solid var(--bg-light);
|
||||
padding: 6px 12px;
|
||||
border-radius: var(--radius);
|
||||
cursor: pointer;
|
||||
font-size: 0.85rem;
|
||||
margin-top: 8px;
|
||||
}
|
||||
|
||||
.refresh-btn:hover {
|
||||
background: var(--bg-light);
|
||||
color: var(--text);
|
||||
}
|
||||
|
||||
.refresh-btn:disabled {
|
||||
opacity: 0.5;
|
||||
cursor: not-allowed;
|
||||
}
|
||||
|
||||
.hidden {
|
||||
display: none !important;
|
||||
}
|
||||
|
||||
/* Server Log */
|
||||
.log-section {
|
||||
grid-column: span 2;
|
||||
}
|
||||
|
||||
@media (max-width: 700px) {
|
||||
.log-section {
|
||||
grid-column: span 1;
|
||||
}
|
||||
}
|
||||
|
||||
.log-header {
|
||||
display: flex;
|
||||
justify-content: space-between;
|
||||
align-items: center;
|
||||
margin-bottom: 12px;
|
||||
}
|
||||
|
||||
.log-header h2 {
|
||||
margin-bottom: 0;
|
||||
}
|
||||
|
||||
.server-controls {
|
||||
display: flex;
|
||||
gap: 8px;
|
||||
align-items: center;
|
||||
}
|
||||
|
||||
.server-btn {
|
||||
border: none;
|
||||
padding: 6px 12px;
|
||||
border-radius: var(--radius);
|
||||
cursor: pointer;
|
||||
font-size: 0.85rem;
|
||||
font-weight: bold;
|
||||
}
|
||||
|
||||
.server-btn.restart {
|
||||
background: #2196F3;
|
||||
color: white;
|
||||
}
|
||||
|
||||
.server-btn.restart:hover {
|
||||
background: #1976D2;
|
||||
}
|
||||
|
||||
.server-btn.stop {
|
||||
background: #c0392b;
|
||||
color: white;
|
||||
}
|
||||
|
||||
.server-btn.stop:hover {
|
||||
background: #a93226;
|
||||
}
|
||||
|
||||
.auto-scroll-label {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: 4px;
|
||||
font-size: 0.8rem;
|
||||
color: var(--text-muted);
|
||||
cursor: pointer;
|
||||
}
|
||||
|
||||
.server-log {
|
||||
height: 200px;
|
||||
overflow-y: auto;
|
||||
background: #0d0d1a;
|
||||
border-radius: var(--radius);
|
||||
padding: 12px;
|
||||
font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace;
|
||||
font-size: 0.75rem;
|
||||
line-height: 1.5;
|
||||
color: #8f8;
|
||||
}
|
||||
|
||||
.server-log .log-line {
|
||||
white-space: pre-wrap;
|
||||
word-break: break-all;
|
||||
}
|
||||
|
||||
.server-log .log-line.error {
|
||||
color: #f88;
|
||||
}
|
||||
|
||||
.server-log .log-line.warning {
|
||||
color: #ff8;
|
||||
}
|
||||
|
||||
.server-log .log-line.tts {
|
||||
color: #8ff;
|
||||
}
|
||||
|
||||
.server-log .log-line.chat {
|
||||
color: #f8f;
|
||||
}
|
||||
178
frontend/index.html
Normal file
178
frontend/index.html
Normal file
@@ -0,0 +1,178 @@
|
||||
<!DOCTYPE html>
<!-- AI Radio Show control panel. All audio (music, SFX, TTS, recording)
     is played/captured server-side; this page only drives the API. -->
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>AI Radio Show</title>
    <link rel="stylesheet" href="/css/style.css">
</head>
<body>
    <div id="app">
        <header>
            <h1>AI Radio Show</h1>
            <div class="header-buttons">
                <button id="new-session-btn" class="new-session-btn">New Session</button>
                <button id="settings-btn">Settings</button>
            </div>
        </header>

        <main>
            <!-- Callers -->
            <section class="callers-section">
                <h2>Callers <span id="session-id" class="session-id"></span></h2>
                <div id="callers" class="caller-grid"></div>
                <div id="call-status" class="call-status">No active call</div>
                <div id="caller-background" class="caller-background hidden"></div>
                <button id="hangup-btn" class="hangup-btn" disabled>Hang Up</button>
            </section>

            <!-- Chat -->
            <section class="chat-section">
                <div id="chat" class="chat-log"></div>
                <div class="talk-controls">
                    <button id="talk-btn" class="talk-btn">Hold to Talk</button>
                    <button id="type-btn" class="type-btn">Type</button>
                </div>
                <div id="status" class="status hidden"></div>
            </section>

            <!-- Music -->
            <section class="music-section">
                <h2>Music</h2>
                <select id="track-select"></select>
                <div class="music-controls">
                    <button id="play-btn">Play</button>
                    <button id="stop-btn">Stop</button>
                    <input type="range" id="volume" min="0" max="100" value="30">
                </div>
            </section>

            <!-- Sound Effects -->
            <section class="sounds-section">
                <h2>Sounds</h2>
                <div id="soundboard" class="soundboard"></div>
            </section>

            <!-- Server Log -->
            <section class="log-section">
                <div class="log-header">
                    <h2>Server Log</h2>
                    <div class="server-controls">
                        <button id="restart-server-btn" class="server-btn restart">Restart</button>
                        <button id="stop-server-btn" class="server-btn stop">Stop</button>
                        <label class="auto-scroll-label">
                            <input type="checkbox" id="auto-scroll" checked> Auto-scroll
                        </label>
                    </div>
                </div>
                <div id="server-log" class="server-log"></div>
            </section>
        </main>

        <!-- Settings Modal -->
        <div id="settings-modal" class="modal hidden">
            <div class="modal-content">
                <h2>Settings</h2>

                <!-- Audio Devices -->
                <div class="settings-group">
                    <h3>Audio Routing</h3>
                    <div class="device-row">
                        <label>
                            Input Device
                            <select id="input-device"></select>
                        </label>
                        <label>
                            Ch
                            <input type="number" id="input-channel" value="1" min="1" max="16" class="channel-input">
                        </label>
                    </div>
                    <div class="device-row">
                        <label>
                            Output Device
                            <select id="output-device"></select>
                        </label>
                    </div>
                    <!-- Defaults here are placeholders; loadAudioDevices() overwrites
                         them from /api/audio/settings on page load. -->
                    <div class="channel-row">
                        <label>Caller Ch <input type="number" id="caller-channel" value="1" min="1" max="16" class="channel-input"></label>
                        <label>Music Ch <input type="number" id="music-channel" value="2" min="1" max="16" class="channel-input"></label>
                        <label>SFX Ch <input type="number" id="sfx-channel" value="3" min="1" max="16" class="channel-input"></label>
                    </div>
                </div>

                <!-- LLM Settings -->
                <div class="settings-group">
                    <h3>LLM Provider</h3>
                    <label>
                        Provider
                        <select id="provider">
                            <option value="openrouter">OpenRouter</option>
                            <option value="ollama">Ollama</option>
                        </select>
                    </label>

                    <div id="openrouter-settings">
                        <label>
                            Model
                            <select id="openrouter-model"></select>
                        </label>
                    </div>

                    <div id="ollama-settings" class="hidden">
                        <label>
                            Model
                            <select id="ollama-model"></select>
                        </label>
                        <label>
                            Host
                            <input type="text" id="ollama-host" value="http://localhost:11434">
                        </label>
                        <button type="button" id="refresh-ollama" class="refresh-btn">Refresh Models</button>
                    </div>
                </div>

                <!-- TTS Settings -->
                <div class="settings-group">
                    <h3>TTS Provider</h3>
                    <label>
                        Provider
                        <select id="tts-provider">
                            <option value="inworld">Inworld (High quality, natural)</option>
                            <option value="f5tts">F5-TTS (Most natural local)</option>
                            <option value="elevenlabs">ElevenLabs (Best quality, paid)</option>
                            <option value="kokoro">Kokoro MLX (Fast, Apple Silicon)</option>
                            <option value="chattts">ChatTTS (Conversational)</option>
                            <option value="styletts2">StyleTTS2 (Voice cloning)</option>
                            <option value="vits">VITS (Fast local)</option>
                            <option value="bark">Bark (Expressive, supports [laughs])</option>
                        </select>
                    </label>
                    <label class="checkbox">
                        <input type="checkbox" id="phone-filter">
                        Phone filter on voices
                    </label>
                </div>

                <div class="modal-buttons">
                    <button id="save-settings">Save</button>
                    <button id="close-settings">Close</button>
                </div>
            </div>
        </div>

        <!-- Type Modal -->
        <div id="type-modal" class="modal hidden">
            <div class="modal-content">
                <h2>Type Message</h2>
                <textarea id="type-input" rows="3" placeholder="Type what you want to say..."></textarea>
                <div class="modal-buttons">
                    <button id="send-type">Send</button>
                    <button id="close-type">Cancel</button>
                </div>
            </div>
        </div>
    </div>

    <!-- ?v=8 is a manual cache-buster; bump it when app.js changes. -->
    <script src="/js/app.js?v=8"></script>
</body>
</html>
|
||||
782
frontend/js/app.js
Normal file
782
frontend/js/app.js
Normal file
@@ -0,0 +1,782 @@
|
||||
/**
|
||||
* AI Radio Show - Control Panel (Server-Side Audio)
|
||||
*/
|
||||
|
||||
// --- State ---
let currentCaller = null;   // { key, name } of the caller on the line, or null when idle
let isProcessing = false;   // true while a transcribe -> chat -> TTS round-trip is in flight
let isRecording = false;    // true while the server is capturing push-to-talk audio
let phoneFilter = false;    // mirror of the "Phone filter on voices" checkbox
let autoScroll = true;      // mirror of the server-log "Auto-scroll" checkbox
let logPollInterval = null; // setInterval handle for the server-log poller
let lastLogCount = 0;       // line count of the last rendered server-log payload

// Track lists
let tracks = [];            // music tracks from /api/music
let sounds = [];            // sound effects from /api/sounds
|
||||
|
||||
|
||||
// --- Init ---
|
||||
// Boot sequence: populate every panel from the server (devices, callers,
// media, settings) before wiring event handlers, so controls never start
// empty. Any failure is surfaced in the chat transcript via log().
document.addEventListener('DOMContentLoaded', async () => {
    console.log('AI Radio Show initializing...');
    try {
        await loadAudioDevices();
        await loadCallers();
        await loadMusic();
        await loadSounds();
        await loadSettings();
        initEventListeners();
        log('Ready. Configure audio devices in Settings, then click a caller to start.');
        console.log('AI Radio Show ready');
    } catch (err) {
        console.error('Init error:', err);
        log('Error loading: ' + err.message);
    }
});
|
||||
|
||||
|
||||
/**
 * Wire every UI control to its handler. Called once from the
 * DOMContentLoaded boot sequence, after all panels are populated.
 * Every lookup uses optional chaining so a missing element is a no-op
 * rather than a crash.
 */
function initEventListeners() {
    // Hangup
    document.getElementById('hangup-btn')?.addEventListener('click', hangup);

    // New Session
    document.getElementById('new-session-btn')?.addEventListener('click', newSession);

    // Server controls
    document.getElementById('restart-server-btn')?.addEventListener('click', restartServer);
    document.getElementById('stop-server-btn')?.addEventListener('click', stopServer);
    document.getElementById('auto-scroll')?.addEventListener('change', e => {
        autoScroll = e.target.checked;
    });

    // Start log polling
    startLogPolling();

    // Talk button - now triggers server-side recording
    // (push-to-talk: press starts capture, release/leave stops it)
    const talkBtn = document.getElementById('talk-btn');
    if (talkBtn) {
        talkBtn.addEventListener('mousedown', startRecording);
        talkBtn.addEventListener('mouseup', stopRecording);
        // mouseleave cancels a held recording if the pointer drags off the button
        talkBtn.addEventListener('mouseleave', () => { if (isRecording) stopRecording(); });
        // preventDefault stops touch events from also synthesizing mouse events
        talkBtn.addEventListener('touchstart', e => { e.preventDefault(); startRecording(); });
        talkBtn.addEventListener('touchend', e => { e.preventDefault(); stopRecording(); });
    }

    // Type button
    document.getElementById('type-btn')?.addEventListener('click', () => {
        document.getElementById('type-modal')?.classList.remove('hidden');
        document.getElementById('type-input')?.focus();
    });
    document.getElementById('send-type')?.addEventListener('click', sendTypedMessage);
    document.getElementById('close-type')?.addEventListener('click', () => {
        document.getElementById('type-modal')?.classList.add('hidden');
    });
    // Enter sends; Shift+Enter inserts a newline
    document.getElementById('type-input')?.addEventListener('keydown', e => {
        if (e.key === 'Enter' && !e.shiftKey) {
            e.preventDefault();
            sendTypedMessage();
        }
    });

    // Music - now server-side
    document.getElementById('play-btn')?.addEventListener('click', playMusic);
    document.getElementById('stop-btn')?.addEventListener('click', stopMusic);
    document.getElementById('volume')?.addEventListener('input', setMusicVolume);

    // Settings
    document.getElementById('settings-btn')?.addEventListener('click', async () => {
        document.getElementById('settings-modal')?.classList.remove('hidden');
        await loadSettings(); // Reload settings when modal opens
    });
    document.getElementById('close-settings')?.addEventListener('click', () => {
        document.getElementById('settings-modal')?.classList.add('hidden');
    });
    document.getElementById('save-settings')?.addEventListener('click', saveSettings);
    document.getElementById('provider')?.addEventListener('change', updateProviderUI);
    document.getElementById('phone-filter')?.addEventListener('change', e => {
        phoneFilter = e.target.checked;
    });
    document.getElementById('refresh-ollama')?.addEventListener('click', refreshOllamaModels);
}
|
||||
|
||||
|
||||
/**
 * Re-query /api/settings and rebuild the Ollama model dropdown.
 * Shows a "(No models found)" placeholder when the server reports none.
 * The button is disabled and relabeled while the request is in flight.
 */
async function refreshOllamaModels() {
    const btn = document.getElementById('refresh-ollama');
    const select = document.getElementById('ollama-model');
    // Guard both elements: previously only `select` was checked, and a
    // missing button would throw on `btn.textContent` below.
    if (!btn || !select) return;

    btn.textContent = 'Loading...';
    btn.disabled = true;

    try {
        const res = await fetch('/api/settings');
        const data = await res.json();

        select.innerHTML = '';
        const models = data.available_ollama_models || [];

        if (models.length === 0) {
            const option = document.createElement('option');
            option.value = '';
            option.textContent = '(No models found)';
            select.appendChild(option);
        } else {
            models.forEach(model => {
                const option = document.createElement('option');
                option.value = model;
                option.textContent = model;
                select.appendChild(option);
            });
        }
    } catch (err) {
        console.error('Failed to refresh Ollama models:', err);
    } finally {
        // `finally` guarantees the button is restored on every exit path.
        btn.textContent = 'Refresh Models';
        btn.disabled = false;
    }
}
|
||||
|
||||
|
||||
// --- Audio Devices ---
|
||||
/**
 * Populate the input/output device dropdowns from /api/audio/devices,
 * then restore the saved routing (devices, channel numbers, phone filter)
 * from /api/audio/settings. Also syncs the module-level `phoneFilter`
 * flag with the saved value. Failures are logged and leave the UI as-is.
 */
async function loadAudioDevices() {
    try {
        const res = await fetch('/api/audio/devices');
        const data = await res.json();

        const inputSelect = document.getElementById('input-device');
        const outputSelect = document.getElementById('output-device');

        if (!inputSelect || !outputSelect) return;

        // Clear selects
        inputSelect.innerHTML = '<option value="">-- Select --</option>';
        outputSelect.innerHTML = '<option value="">-- Select --</option>';

        // A device with both inputs and outputs appears in both dropdowns.
        data.devices.forEach(device => {
            // Input devices
            if (device.inputs > 0) {
                const opt = document.createElement('option');
                opt.value = device.id;
                opt.textContent = `${device.name} (${device.inputs} ch)`;
                inputSelect.appendChild(opt);
            }
            // Output devices
            if (device.outputs > 0) {
                const opt = document.createElement('option');
                opt.value = device.id;
                opt.textContent = `${device.name} (${device.outputs} ch)`;
                outputSelect.appendChild(opt);
            }
        });

        // Load current settings
        const settingsRes = await fetch('/api/audio/settings');
        const settings = await settingsRes.json();

        if (settings.input_device !== null)
            inputSelect.value = settings.input_device;
        if (settings.output_device !== null)
            outputSelect.value = settings.output_device;

        // Channel settings (|| fallbacks match the HTML defaults)
        const inputCh = document.getElementById('input-channel');
        const callerCh = document.getElementById('caller-channel');
        const musicCh = document.getElementById('music-channel');
        const sfxCh = document.getElementById('sfx-channel');

        if (inputCh) inputCh.value = settings.input_channel || 1;
        if (callerCh) callerCh.value = settings.caller_channel || 1;
        if (musicCh) musicCh.value = settings.music_channel || 2;
        if (sfxCh) sfxCh.value = settings.sfx_channel || 3;

        // Phone filter setting: checkbox and local state stay in lockstep
        const phoneFilterEl = document.getElementById('phone-filter');
        if (phoneFilterEl) {
            phoneFilterEl.checked = settings.phone_filter ?? false;
            phoneFilter = phoneFilterEl.checked;
        }

        console.log('Audio devices loaded');
    } catch (err) {
        console.error('loadAudioDevices error:', err);
    }
}
|
||||
|
||||
|
||||
/**
 * Persist the audio-routing selections from the Settings modal.
 * Reads the device/channel controls, POSTs them to /api/audio/settings,
 * mirrors the phone-filter checkbox into local state, and logs a
 * confirmation line into the chat transcript.
 */
async function saveAudioDevices() {
    const field = id => document.getElementById(id);
    const asDevice = raw => (raw ? parseInt(raw) : null);
    const asChannel = (raw, fallback) => (raw ? parseInt(raw) : fallback);

    const filterOn = field('phone-filter')?.checked ?? false;
    const payload = {
        input_device: asDevice(field('input-device')?.value),
        input_channel: asChannel(field('input-channel')?.value, 1),
        output_device: asDevice(field('output-device')?.value),
        caller_channel: asChannel(field('caller-channel')?.value, 1),
        music_channel: asChannel(field('music-channel')?.value, 2),
        sfx_channel: asChannel(field('sfx-channel')?.value, 3),
        phone_filter: filterOn
    };

    await fetch('/api/audio/settings', {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify(payload)
    });

    // Update local state
    phoneFilter = filterOn;

    log('Audio routing saved');
}
|
||||
|
||||
|
||||
// --- Callers ---
|
||||
/**
 * Rebuild the caller grid from /api/callers and display the session id.
 * Each caller becomes a button that dials that caller when clicked.
 */
async function loadCallers() {
    try {
        const res = await fetch('/api/callers');
        const data = await res.json();

        const grid = document.getElementById('callers');
        if (!grid) return;
        grid.innerHTML = '';

        for (const caller of data.callers) {
            const btn = document.createElement('button');
            btn.className = 'caller-btn';
            btn.textContent = caller.name;
            btn.dataset.key = caller.key;
            btn.addEventListener('click', () => startCall(caller.key, caller.name));
            grid.appendChild(btn);
        }

        // Show session ID next to the "Callers" heading
        const sessionEl = document.getElementById('session-id');
        if (sessionEl && data.session_id) {
            sessionEl.textContent = `(${data.session_id})`;
        }

        console.log('Loaded', data.callers.length, 'callers, session:', data.session_id);
    } catch (err) {
        console.error('loadCallers error:', err);
    }
}
|
||||
|
||||
|
||||
/**
 * Dial an AI caller: tell the server to start the call, mark the chosen
 * caller button active, show the caller's background blurb, and clear
 * the chat transcript for the new conversation.
 *
 * @param {string} key  - caller key from /api/callers.
 * @param {string} name - display name, used in status/log text.
 */
async function startCall(key, name) {
    if (isProcessing) return;

    const res = await fetch(`/api/call/${key}`, { method: 'POST' });
    // Fix: previously a failed request still flipped the UI into the
    // "On call" state. Bail out before touching any state on error.
    if (!res.ok) {
        log(`Could not connect to ${name} (server returned ${res.status})`);
        return;
    }
    const data = await res.json();

    currentCaller = { key, name };

    document.getElementById('call-status').textContent = `On call: ${name}`;
    document.getElementById('hangup-btn').disabled = false;

    // Show caller background
    const bgEl = document.getElementById('caller-background');
    if (bgEl && data.background) {
        bgEl.textContent = data.background;
        bgEl.classList.remove('hidden');
    }

    // Highlight only the active caller's button
    document.querySelectorAll('.caller-btn').forEach(btn => {
        btn.classList.toggle('active', btn.dataset.key === key);
    });

    log(`Connected to ${name}`);
    clearChat();
}
|
||||
|
||||
|
||||
/**
 * Start a fresh show session: end any live call, ask the server to reset,
 * hide the stale caller background, then rebuild the caller grid (which
 * also refreshes the displayed session id).
 */
async function newSession() {
    if (currentCaller) await hangup();

    await fetch('/api/session/reset', { method: 'POST' });

    document.getElementById('caller-background')?.classList.add('hidden');

    await loadCallers();

    log('New session started - all callers have fresh backgrounds');
}
|
||||
|
||||
|
||||
/**
 * End the active call: silence any in-progress TTS, notify the server,
 * and restore the idle-call UI (status text, hangup button, caller
 * highlights, background blurb).
 */
async function hangup() {
    if (!currentCaller) return;

    // Stop speech first so audio halts immediately, then drop the call.
    await fetch('/api/tts/stop', { method: 'POST' });
    await fetch('/api/hangup', { method: 'POST' });

    log(`Hung up on ${currentCaller.name}`);

    currentCaller = null;
    isProcessing = false;
    hideStatus();

    document.getElementById('call-status').textContent = 'No active call';
    document.getElementById('hangup-btn').disabled = true;
    for (const btn of document.querySelectorAll('.caller-btn')) {
        btn.classList.remove('active');
    }

    document.getElementById('caller-background')?.classList.add('hidden');
}
|
||||
|
||||
|
||||
// --- Server-Side Recording ---
|
||||
/**
 * Push-to-talk press: ask the server to begin capturing microphone audio.
 * No-op unless a call is active and no round-trip is already in flight.
 * On success the talk button switches into its "Recording..." state.
 */
async function startRecording() {
    if (!currentCaller || isProcessing) return;

    try {
        const res = await fetch('/api/record/start', { method: 'POST' });
        if (!res.ok) {
            const err = await res.json();
            log('Record error: ' + (err.detail || 'Failed to start'));
            return;
        }

        isRecording = true;
        const talkBtn = document.getElementById('talk-btn');
        talkBtn.classList.add('recording');
        talkBtn.textContent = 'Recording...';
    } catch (err) {
        log('Record error: ' + err.message);
    }
}
|
||||
|
||||
|
||||
/**
 * Push-to-talk release: stop server-side capture, get the transcription,
 * send it to the active caller's LLM (/api/chat), then have the reply
 * spoken via server-side TTS (/api/tts).
 *
 * Mirrors the pipeline in sendTypedMessage(), except the host text comes
 * from /api/record/stop instead of the keyboard.
 */
async function stopRecording() {
    if (!isRecording) return;

    document.getElementById('talk-btn').classList.remove('recording');
    document.getElementById('talk-btn').textContent = 'Hold to Talk';

    isRecording = false;
    isProcessing = true;
    showStatus('Processing...');

    try {
        // Stop recording and get transcription
        const res = await fetch('/api/record/stop', { method: 'POST' });
        const data = await res.json();

        // Fix: also treat whitespace-only transcriptions as silence —
        // speech-to-text can return strings like " " for empty audio,
        // which previously fell through to the chat/TTS round-trip.
        if (!data.text || !data.text.trim()) {
            log('(No speech detected)');
            isProcessing = false;
            hideStatus();
            return;
        }

        addMessage('You', data.text);

        // Chat
        showStatus(`${currentCaller.name} is thinking...`);

        const chatRes = await fetch('/api/chat', {
            method: 'POST',
            headers: { 'Content-Type': 'application/json' },
            body: JSON.stringify({ text: data.text })
        });
        const chatData = await chatRes.json();

        addMessage(chatData.caller, chatData.text);

        // TTS (plays on server) - only if we have text
        if (chatData.text && chatData.text.trim()) {
            showStatus(`${currentCaller.name} is speaking...`);

            await fetch('/api/tts', {
                method: 'POST',
                headers: { 'Content-Type': 'application/json' },
                body: JSON.stringify({
                    text: chatData.text,
                    voice_id: chatData.voice_id,
                    phone_filter: phoneFilter
                })
            });
        }

    } catch (err) {
        log('Error: ' + err.message);
    }

    isProcessing = false;
    hideStatus();
}
|
||||
|
||||
|
||||
/**
 * Send a typed host message through the same chat -> TTS pipeline that
 * stopRecording() uses for voice input, then close the type modal.
 * No-op when the input is empty, no call is active, or a round-trip is
 * already in flight.
 *
 * NOTE(review): the chat/TTS section duplicates stopRecording(); a shared
 * helper would keep the two pipelines from drifting apart.
 */
async function sendTypedMessage() {
    const input = document.getElementById('type-input');
    const text = input.value.trim();
    if (!text || !currentCaller || isProcessing) return;

    // Clear and close the modal before the (slow) round-trip starts
    input.value = '';
    document.getElementById('type-modal').classList.add('hidden');

    isProcessing = true;
    addMessage('You', text);

    try {
        showStatus(`${currentCaller.name} is thinking...`);

        const chatRes = await fetch('/api/chat', {
            method: 'POST',
            headers: { 'Content-Type': 'application/json' },
            body: JSON.stringify({ text })
        });
        const chatData = await chatRes.json();

        addMessage(chatData.caller, chatData.text);

        // TTS (plays on server) - only if we have text
        if (chatData.text && chatData.text.trim()) {
            showStatus(`${currentCaller.name} is speaking...`);

            await fetch('/api/tts', {
                method: 'POST',
                headers: { 'Content-Type': 'application/json' },
                body: JSON.stringify({
                    text: chatData.text,
                    voice_id: chatData.voice_id,
                    phone_filter: phoneFilter
                })
            });
        }

    } catch (err) {
        log('Error: ' + err.message);
    }

    isProcessing = false;
    hideStatus();
}
|
||||
|
||||
|
||||
// --- Music (Server-Side) ---
|
||||
/**
 * Fetch the music library from /api/music and fill the track dropdown.
 * Stores the raw list in the module-level `tracks` for later lookups.
 * Failures are logged to the console and leave the dropdown unchanged.
 */
async function loadMusic() {
    try {
        const res = await fetch('/api/music');
        const data = await res.json();
        tracks = data.tracks || [];

        const select = document.getElementById('track-select');
        if (!select) return;
        select.innerHTML = '';

        // (fixed) the callback previously declared an unused index parameter
        tracks.forEach(track => {
            const option = document.createElement('option');
            option.value = track.file;
            option.textContent = track.name;
            select.appendChild(option);
        });
        console.log('Loaded', tracks.length, 'tracks');
    } catch (err) {
        console.error('loadMusic error:', err);
    }
}
|
||||
|
||||
|
||||
/**
 * Ask the server to start playing the track currently selected in the
 * dropdown. No-op when nothing is selected.
 */
async function playMusic() {
    const track = document.getElementById('track-select')?.value;
    if (!track) return;

    const body = JSON.stringify({ track, action: 'play' });
    await fetch('/api/music/play', {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body
    });
}
|
||||
|
||||
|
||||
/** Tell the server to halt music playback. */
async function stopMusic() {
    await fetch('/api/music/stop', { method: 'POST' });
}
|
||||
|
||||
|
||||
/**
 * Forward the volume slider position (0-100) to the server as 0.0-1.0.
 * @param {Event} e - `input` event from the #volume range control.
 */
async function setMusicVolume(e) {
    const level = e.target.value / 100;
    await fetch('/api/music/volume', {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({ track: '', action: 'volume', volume: level })
    });
}
|
||||
|
||||
|
||||
// --- Sound Effects (Server-Side) ---
|
||||
/**
 * Fetch the soundboard list from /api/sounds and build one trigger
 * button per effect. Stores the raw list in the module-level `sounds`.
 */
async function loadSounds() {
    try {
        const res = await fetch('/api/sounds');
        const data = await res.json();
        sounds = data.sounds || [];

        const board = document.getElementById('soundboard');
        if (!board) return;
        board.innerHTML = '';

        for (const sound of sounds) {
            const btn = document.createElement('button');
            btn.className = 'sound-btn';
            btn.textContent = sound.name;
            btn.addEventListener('click', () => playSFX(sound.file));
            board.appendChild(btn);
        }
        console.log('Loaded', sounds.length, 'sounds');
    } catch (err) {
        console.error('loadSounds error:', err);
    }
}
|
||||
|
||||
|
||||
/**
 * Trigger one sound effect on the server's SFX channel.
 * @param {string} soundFile - filename as returned by /api/sounds.
 */
async function playSFX(soundFile) {
    const request = {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({ sound: soundFile })
    };
    await fetch('/api/sfx/play', request);
}
|
||||
|
||||
|
||||
// --- Settings ---
|
||||
/**
 * Pull the current LLM/TTS configuration from /api/settings and reflect it
 * in the Settings modal: provider choice, OpenRouter and Ollama model
 * dropdowns, Ollama host, and TTS provider. Finishes by toggling which
 * provider pane is visible. Failures are logged and leave the UI as-is.
 */
async function loadSettings() {
    try {
        const res = await fetch('/api/settings');
        const data = await res.json();

        const providerEl = document.getElementById('provider');
        if (providerEl) providerEl.value = data.provider || 'openrouter';

        const modelSelect = document.getElementById('openrouter-model');
        if (modelSelect) {
            modelSelect.innerHTML = '';
            (data.available_openrouter_models || []).forEach(model => {
                const option = document.createElement('option');
                option.value = model;
                option.textContent = model;
                if (model === data.openrouter_model) option.selected = true;
                modelSelect.appendChild(option);
            });
        }

        const ollamaModel = document.getElementById('ollama-model');
        const ollamaHost = document.getElementById('ollama-host');
        if (ollamaHost) ollamaHost.value = data.ollama_host || 'http://localhost:11434';

        // Populate Ollama models dropdown
        if (ollamaModel) {
            ollamaModel.innerHTML = '';
            const ollamaModels = data.available_ollama_models || [];
            console.log('Ollama models from API:', ollamaModels.length, ollamaModels);
            if (ollamaModels.length === 0) {
                // Nothing reported by the server: fall back to the configured
                // model name so the dropdown is never empty.
                const option = document.createElement('option');
                option.value = data.ollama_model || 'llama3.2';
                option.textContent = data.ollama_model || 'llama3.2';
                ollamaModel.appendChild(option);
            } else {
                ollamaModels.forEach(model => {
                    const option = document.createElement('option');
                    option.value = model;
                    option.textContent = model;
                    if (model === data.ollama_model) option.selected = true;
                    ollamaModel.appendChild(option);
                });
            }
            console.log('Ollama dropdown options:', ollamaModel.options.length);
        } else {
            console.log('Ollama model element not found!');
        }

        // TTS provider
        const ttsProvider = document.getElementById('tts-provider');
        if (ttsProvider) ttsProvider.value = data.tts_provider || 'elevenlabs';

        updateProviderUI();
        console.log('Settings loaded:', data.provider, 'TTS:', data.tts_provider);
    } catch (err) {
        console.error('loadSettings error:', err);
    }
}
|
||||
|
||||
|
||||
/**
 * Show the settings pane that matches the selected LLM provider
 * (OpenRouter vs Ollama) and hide the other.
 */
function updateProviderUI() {
    const openRouterSelected = document.getElementById('provider')?.value === 'openrouter';
    document.getElementById('openrouter-settings')?.classList.toggle('hidden', !openRouterSelected);
    document.getElementById('ollama-settings')?.classList.toggle('hidden', openRouterSelected);
}
|
||||
|
||||
|
||||
/**
 * Persist everything in the Settings modal: audio routing first, then
 * the LLM/TTS provider choices, then close the dialog and log a note.
 */
async function saveSettings() {
    // Save audio devices
    await saveAudioDevices();

    // Save LLM and TTS settings
    const pick = id => document.getElementById(id)?.value;
    await fetch('/api/settings', {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({
            provider: pick('provider'),
            openrouter_model: pick('openrouter-model'),
            ollama_model: pick('ollama-model'),
            ollama_host: pick('ollama-host'),
            tts_provider: pick('tts-provider')
        })
    });

    document.getElementById('settings-modal')?.classList.add('hidden');
    log('Settings saved');
}
|
||||
|
||||
|
||||
// --- UI Helpers ---
|
||||
/**
 * Append a message to the chat transcript and keep it scrolled to the
 * bottom. Falls back to console.log when the chat pane is absent.
 *
 * Fix: message text came from transcriptions and LLM output but was
 * interpolated into innerHTML unescaped, so any markup in it (e.g. a
 * caller reply containing "<script>" or "<img onerror=...>") would be
 * parsed as HTML. Build the line from DOM nodes instead, which renders
 * sender and text as literal text.
 *
 * @param {string} sender - display name; 'You' gets the host styling.
 * @param {string} text   - message body, rendered verbatim.
 */
function addMessage(sender, text) {
    const chat = document.getElementById('chat');
    if (!chat) {
        console.log(`[${sender}]: ${text}`);
        return;
    }
    const div = document.createElement('div');
    div.className = `message ${sender === 'You' ? 'host' : 'caller'}`;
    const label = document.createElement('strong');
    label.textContent = `${sender}:`;
    div.appendChild(label);
    div.appendChild(document.createTextNode(` ${text}`));
    chat.appendChild(div);
    chat.scrollTop = chat.scrollHeight;
}
|
||||
|
||||
|
||||
/** Remove every message from the chat transcript pane. */
function clearChat() {
    const chat = document.getElementById('chat');
    if (chat !== null) {
        chat.innerHTML = '';
    }
}
|
||||
|
||||
|
||||
// Convenience wrapper: system notices appear in the chat transcript
// under the "System" sender.
function log(text) {
    addMessage('System', text);
}
|
||||
|
||||
|
||||
/**
 * Display a transient status line (e.g. "X is thinking...") below the
 * chat pane. Pair with hideStatus() when the operation finishes.
 */
function showStatus(text) {
    const status = document.getElementById('status');
    if (!status) return;
    status.textContent = text;
    status.classList.remove('hidden');
}
|
||||
|
||||
|
||||
/** Hide the transient status line, if present. */
function hideStatus() {
    document.getElementById('status')?.classList.add('hidden');
}
|
||||
|
||||
|
||||
// --- Server Control & Logging ---
|
||||
|
||||
/**
 * Begin polling the server-log endpoint once per second, with an
 * immediate first fetch so the pane isn't empty for the first tick.
 * The interval handle is kept in the module-level `logPollInterval`.
 */
function startLogPolling() {
    logPollInterval = setInterval(fetchLogs, 1000);
    fetchLogs();
}
|
||||
|
||||
|
||||
/**
 * Poll /api/logs and re-render the server-log pane. Each line is tinted
 * by a CSS class (error/warning/tts/chat) chosen from its content, and
 * escaped before being embedded in innerHTML. Fetch failures are expected
 * while the server restarts and are only console-logged.
 *
 * NOTE(review): change detection compares only the line COUNT — a log that
 * rotates while keeping the same length would not trigger a re-render.
 */
async function fetchLogs() {
    try {
        const res = await fetch('/api/logs?lines=200');
        const data = await res.json();

        const logEl = document.getElementById('server-log');
        if (!logEl) return;

        // Only update if we have new logs
        if (data.logs.length !== lastLogCount) {
            lastLogCount = data.logs.length;

            logEl.innerHTML = data.logs.map(line => {
                let className = 'log-line';
                // First matching category wins; errors take precedence.
                if (line.includes('Error') || line.includes('error') || line.includes('ERROR')) {
                    className += ' error';
                } else if (line.includes('Warning') || line.includes('WARNING')) {
                    className += ' warning';
                } else if (line.includes('[TTS]')) {
                    className += ' tts';
                } else if (line.includes('[Chat]')) {
                    className += ' chat';
                }
                return `<div class="${className}">${escapeHtml(line)}</div>`;
            }).join('');

            if (autoScroll) {
                logEl.scrollTop = logEl.scrollHeight;
            }
        }
    } catch (err) {
        // Server might be down, that's ok
        console.log('Log fetch failed (server may be restarting)');
    }
}
|
||||
|
||||
|
||||
/**
 * Escape &, < and > so arbitrary log text can be embedded in innerHTML.
 *
 * Rewritten from the createElement/textContent round-trip to plain string
 * replacement: fetchLogs() calls this for every one of up to 200 log lines
 * each second, and allocating a detached DOM node per call was wasted
 * work. null coerces to '' to match the old textContent-setter behavior.
 *
 * @param {*} text - value to escape; non-strings are stringified.
 * @returns {string} HTML-safe text.
 */
function escapeHtml(text) {
    const s = text === null ? '' : String(text);
    return s.replace(/&/g, '&amp;').replace(/</g, '&lt;').replace(/>/g, '&gt;');
}
|
||||
|
||||
|
||||
/**
 * Ask the backend to restart itself (after user confirmation), then poll
 * /api/server/status once per second until it responds again. Gives up
 * after ~30 failed probes and tells the user to check the terminal.
 */
async function restartServer() {
    if (!confirm('Restart the server? This will briefly disconnect you.')) return;

    try {
        await fetch('/api/server/restart', { method: 'POST' });
        log('Server restart requested...');

        // Clear the log and wait for server to come back
        document.getElementById('server-log').innerHTML = '<div class="log-line">Restarting server...</div>';

        // Poll until server is back
        let attempts = 0;
        const checkServer = setInterval(async () => {
            attempts++;
            try {
                const res = await fetch('/api/server/status');
                if (res.ok) {
                    clearInterval(checkServer);
                    log('Server restarted successfully');
                    // Settings may have changed across the restart; reload them.
                    await loadSettings();
                }
            } catch (e) {
                // Fetch throws while the server is down; only give up
                // after ~30 seconds of failures.
                if (attempts > 30) {
                    clearInterval(checkServer);
                    log('Server did not restart - check terminal');
                }
            }
        }, 1000);

    } catch (err) {
        log('Failed to restart server: ' + err.message);
    }
}
|
||||
|
||||
|
||||
// Ask the backend to shut down for good (no auto-restart). The user must
// confirm, since bringing the server back requires running ./run.sh again.
async function stopServer() {
    const confirmed = confirm('Stop the server? You will need to restart it manually.');
    if (!confirmed) return;

    try {
        await fetch('/api/server/stop', { method: 'POST' });
        log('Server stop requested...');
        const logEl = document.getElementById('server-log');
        logEl.innerHTML = '<div class="log-line">Server stopped. Run ./run.sh to restart.</div>';
    } catch (err) {
        log('Failed to stop server: ' + err.message);
    }
}
|
||||
77
generate_callers.py
Normal file
77
generate_callers.py
Normal file
@@ -0,0 +1,77 @@
|
||||
import os
|
||||
os.environ["SUNO_USE_SMALL_MODELS"] = "False"
|
||||
|
||||
from bark import generate_audio, preload_models
|
||||
from scipy.io.wavfile import write as write_wav
|
||||
from scipy.signal import butter, filtfilt
|
||||
import numpy as np
|
||||
|
||||
def phone_filter(audio, sample_rate=24000):
    """Apply telephone bandpass filter (300Hz - 3400Hz).

    Band-limits the signal with a 4th-order Butterworth filter applied
    forward and backward (zero phase), then soft-clips for mild
    compression and leaves a little headroom.

    Args:
        audio: 1-D float waveform.
        sample_rate: Sampling rate of `audio` in Hz.

    Returns:
        Filtered waveform as float32, same length as the input.
    """
    nyquist = sample_rate / 2
    b, a = butter(4, [300 / nyquist, 3400 / nyquist], btype='band')
    shaped = filtfilt(b, a, audio)

    # Add slight compression and normalize
    shaped = np.tanh(shaped * 1.5) * 0.9
    return shaped.astype(np.float32)
|
||||
|
||||
# Define your callers
# Each entry selects an output-file stem ("name"), a Bark preset voice used
# as the history_prompt ("voice"), and the line that caller will read ("text").
CALLERS = [
    {
        "name": "caller1_mike",
        "voice": "v2/en_speaker_6",
        "text": """Hey, thanks for taking my call!
So I've been thinking about this a lot and...
I know it sounds crazy, but hear me out."""
    },
    {
        "name": "caller2_sarah",
        "voice": "v2/en_speaker_9",
        "text": """Hi! Oh my gosh, I can't believe I got through.
Okay so... this is kind of a long story,
but basically I had this experience last week that blew my mind."""
    },
    {
        "name": "caller3_dave",
        "voice": "v2/en_speaker_1",
        "text": """Yeah, hey. First time caller, long time listener.
Look, I gotta be honest with you here,
I think you're missing something important."""
    },
    {
        "name": "caller4_jenny",
        "voice": "v2/en_speaker_3",
        "text": """Okay okay, so get this...
I was literally just talking about this with my friend yesterday!
And she said, and I quote, well, I can't say that on air."""
    },
]
|
||||
|
||||
def main():
    """Generate every caller clip with Bark, saving both a clean WAV and a
    telephone-filtered WAV per caller into output/."""
    print("Loading models...")
    preload_models()

    os.makedirs("output", exist_ok=True)

    for spec in CALLERS:
        name = spec["name"]
        print(f"\nGenerating: {name}")

        # Synthesize the raw waveform for this caller's line.
        raw = generate_audio(spec["text"], history_prompt=spec["voice"])

        clean_path = f"output/{name}_clean.wav"
        phone_path = f"output/{name}_phone.wav"

        write_wav(clean_path, 24000, raw)
        write_wav(phone_path, 24000, phone_filter(raw))

        print(f" Saved: {clean_path}")
        print(f" Saved: {phone_path}")

    print("\nDone! Check the output/ folder.")

if __name__ == "__main__":
    main()
|
||||
102
generate_sounds.py
Normal file
102
generate_sounds.py
Normal file
@@ -0,0 +1,102 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Generate sound effects using ElevenLabs Sound Effects API
|
||||
"""
|
||||
|
||||
import os
|
||||
from pathlib import Path
|
||||
from dotenv import load_dotenv
|
||||
|
||||
load_dotenv()
|
||||
|
||||
SOUNDS_DIR = Path(__file__).parent / "sounds"
|
||||
SOUNDS_DIR.mkdir(exist_ok=True)
|
||||
|
||||
# Sound effects to generate with descriptions
# Maps output filename (under SOUNDS_DIR) -> text prompt sent to the
# ElevenLabs sound-effects endpoint.
SOUND_EFFECTS = {
    'airhorn.wav': 'loud air horn blast, sports event',
    'boo.wav': 'crowd booing, disappointed audience',
    'crickets.wav': 'crickets chirping, awkward silence',
    'drumroll.wav': 'drum roll, building suspense',
    'buzzer.wav': 'game show wrong answer buzzer',
    'laugh.wav': 'audience laughing, sitcom laugh track',
    'rimshot.wav': 'ba dum tss, drum rimshot comedy',
    'sad_trombone.wav': 'sad trombone, wah wah wah failure sound',
    'phone_ring.wav': 'old telephone ringing',
    'cheer.wav': 'crowd cheering and applauding',
    'scratch.wav': 'vinyl record scratch',
    'wow.wav': 'crowd saying wow, impressed reaction',
    'fart.wav': 'comedic fart sound effect',
    'victory.wav': 'victory fanfare, triumphant horns',
    'uh_oh.wav': 'uh oh, something went wrong sound',
}
|
||||
|
||||
def generate_sound(name, description, duration_seconds=2.0):
    """Generate one sound effect via the ElevenLabs sound-effects API.

    The API returns MP3 audio; this writes it to a temp file and converts
    it to a 24 kHz mono WAV at SOUNDS_DIR/name with ffmpeg. If the target
    file already exists, generation is skipped.

    Args:
        name: Output filename, e.g. 'airhorn.wav'.
        description: Text prompt describing the desired sound.
        duration_seconds: Requested clip length (defaults to the previous
            hard-coded 2.0 seconds).

    Returns:
        True on success or if the file already existed, False on failure.
    """
    from elevenlabs.client import ElevenLabs

    output_path = SOUNDS_DIR / name

    if output_path.exists():
        print(f" ✓ {name} (already exists)")
        return True

    temp_mp3 = SOUNDS_DIR / f"temp_{name}.mp3"
    try:
        print(f" Generating {name}: '{description}'...")

        client = ElevenLabs(api_key=os.getenv('ELEVENLABS_API_KEY'))

        # Generate sound effect (streamed back as MP3 chunks).
        result = client.text_to_sound_effects.convert(
            text=description,
            duration_seconds=duration_seconds,
        )
        audio_data = b''.join(result)

        # Save as mp3 first, then convert
        with open(temp_mp3, 'wb') as f:
            f.write(audio_data)

        # Convert to wav with ffmpeg (24 kHz mono, matching the app's format)
        import subprocess
        subprocess.run([
            'ffmpeg', '-y', '-i', str(temp_mp3),
            '-ar', '24000', '-ac', '1',
            str(output_path)
        ], capture_output=True, check=True)

        print(f" ✓ {name}")
        return True

    except Exception as e:
        print(f" ✗ {name} ({e})")
        return False
    finally:
        # Don't leave the temp MP3 behind if ffmpeg (or anything else) failed.
        temp_mp3.unlink(missing_ok=True)
|
||||
|
||||
def main():
    """Generate every entry in SOUND_EFFECTS, skipping files that exist.

    Verifies ffmpeg is on PATH before making any API calls (each call
    spends ElevenLabs credits), then reports how many clips succeeded.
    """
    print("Generating sound effects with ElevenLabs...")
    print(f"Saving to: {SOUNDS_DIR}")
    print("(This uses your ElevenLabs credits)\n")

    # Check for ffmpeg
    import subprocess
    try:
        subprocess.run(['ffmpeg', '-version'], capture_output=True, check=True)
    except (FileNotFoundError, subprocess.CalledProcessError):
        # Narrowed from a bare `except:` — only "ffmpeg missing/broken"
        # should abort here, not e.g. KeyboardInterrupt.
        print("ERROR: ffmpeg required. Install with: brew install ffmpeg")
        return

    success = 0
    for name, description in SOUND_EFFECTS.items():
        if generate_sound(name, description):
            success += 1

    print(f"\nGenerated {success}/{len(SOUND_EFFECTS)} sounds.")

if __name__ == "__main__":
    main()
|
||||
400
publish_episode.py
Executable file
400
publish_episode.py
Executable file
@@ -0,0 +1,400 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Podcast Episode Publisher
|
||||
Transcribes audio, generates metadata, and publishes to Castopod.
|
||||
|
||||
Usage:
|
||||
python publish_episode.py /path/to/episode.mp3
|
||||
python publish_episode.py /path/to/episode.mp3 --episode-number 3
|
||||
python publish_episode.py /path/to/episode.mp3 --dry-run
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import subprocess
|
||||
import sys
|
||||
import base64
|
||||
from pathlib import Path
|
||||
|
||||
import requests
|
||||
from dotenv import load_dotenv
|
||||
|
||||
# Load environment variables
|
||||
load_dotenv(Path(__file__).parent / ".env")
|
||||
|
||||
# Configuration
|
||||
# Credentials are read from the environment when present; the literal
# fallbacks preserve the previous hard-coded behavior.
# NOTE(review): these secrets are committed to the repository — rotate
# them and drop the fallbacks once .env is populated everywhere.
CASTOPOD_URL = "https://podcast.macneilmediagroup.com"
CASTOPOD_USERNAME = os.getenv("CASTOPOD_USERNAME", "admin")
CASTOPOD_PASSWORD = os.getenv("CASTOPOD_PASSWORD", "podcast2026api")
PODCAST_ID = 1
PODCAST_HANDLE = "LukeAtTheRoost"
OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")
WHISPER_MODEL = "base"  # Options: tiny, base, small, medium, large

# NAS Configuration for chapters upload
NAS_HOST = "mmgnas-10g"
NAS_USER = "luke"
NAS_SSH_PORT = 8001
DOCKER_PATH = "/share/CACHEDEV1_DATA/.qpkg/container-station/bin/docker"
CASTOPOD_CONTAINER = "castopod-castopod-1"
MARIADB_CONTAINER = "castopod-mariadb-1"
DB_USER = "castopod"
DB_PASS = os.getenv("CASTOPOD_DB_PASS", "BYtbFfk3ndeVabb26xb0UyKU")
DB_NAME = "castopod"
|
||||
|
||||
|
||||
def get_auth_header():
    """Get Basic Auth header for Castopod API."""
    raw = f"{CASTOPOD_USERNAME}:{CASTOPOD_PASSWORD}".encode()
    token = base64.b64encode(raw).decode()
    return {"Authorization": f"Basic {token}"}
|
||||
|
||||
|
||||
def transcribe_audio(audio_path: str) -> dict:
    """Transcribe audio using faster-whisper with timestamps.

    Returns a dict with "segments" (list of {start, end, text}),
    "full_text" (joined text), and "duration" (whole seconds).
    Exits the process if faster-whisper is not installed.
    """
    print(f"[1/5] Transcribing {audio_path}...")

    try:
        from faster_whisper import WhisperModel
    except ImportError:
        print("Error: faster-whisper not installed. Run: pip install faster-whisper")
        sys.exit(1)

    model = WhisperModel(WHISPER_MODEL, compute_type="int8")
    segments, info = model.transcribe(audio_path, word_timestamps=True)

    # `segments` is a lazy generator — materialize it once.
    collected = [
        {"start": seg.start, "end": seg.end, "text": seg.text.strip()}
        for seg in segments
    ]

    print(f" Transcribed {info.duration:.1f} seconds of audio")

    return {
        "segments": collected,
        "full_text": " ".join(item["text"] for item in collected),
        "duration": int(info.duration),
    }
|
||||
|
||||
|
||||
def generate_metadata(transcript: dict, episode_number: int) -> dict:
    """Use LLM to generate title, description, and chapters from transcript.

    Args:
        transcript: Output of transcribe_audio(): {"segments": [...],
            "full_text": str, "duration": int}.
        episode_number: Number embedded in the generated title.

    Returns:
        Dict with "title" (str), "description" (str) and "chapters"
        (list of {"startTime": seconds, "title": str}).

    Exits the process if the API key is missing, the OpenRouter call
    fails, or the model's reply is not valid JSON.
    """
    print("[2/5] Generating metadata with LLM...")

    if not OPENROUTER_API_KEY:
        print("Error: OPENROUTER_API_KEY not set in .env")
        sys.exit(1)

    # Prepare transcript with timestamps for chapter detection
    # ([MM:SS] prefixes let the model emit sensible chapter startTimes).
    timestamped_text = ""
    for seg in transcript["segments"]:
        mins = int(seg["start"] // 60)
        secs = int(seg["start"] % 60)
        timestamped_text += f"[{mins:02d}:{secs:02d}] {seg['text']}\n"

    prompt = f"""Analyze this podcast transcript and generate metadata.

TRANSCRIPT:
{timestamped_text}

Generate a JSON response with:
1. "title": A catchy episode title (include "Episode {episode_number}:" prefix)
2. "description": A 2-4 sentence description summarizing the episode's content. Mention callers by name and their topics. End with something engaging.
3. "chapters": An array of chapter objects with "startTime" (in seconds) and "title". Include:
- "Intro" at 0 seconds
- A chapter for each caller/topic (use caller names if mentioned)
- "Outro" near the end

Respond with ONLY valid JSON, no markdown or explanation."""

    response = requests.post(
        "https://openrouter.ai/api/v1/chat/completions",
        headers={
            "Authorization": f"Bearer {OPENROUTER_API_KEY}",
            "Content-Type": "application/json"
        },
        json={
            "model": "anthropic/claude-3-haiku",
            "messages": [{"role": "user", "content": prompt}],
            "temperature": 0.7
        }
    )

    if response.status_code != 200:
        print(f"Error from OpenRouter: {response.text}")
        sys.exit(1)

    result = response.json()
    content = result["choices"][0]["message"]["content"]

    # Parse JSON from response (handle markdown code blocks)
    # Some models wrap their JSON in ``` fences despite the instructions.
    content = content.strip()
    if content.startswith("```"):
        content = re.sub(r"^```(?:json)?\n?", "", content)
        content = re.sub(r"\n?```$", "", content)

    try:
        metadata = json.loads(content)
    except json.JSONDecodeError as e:
        print(f"Error parsing LLM response: {e}")
        print(f"Response was: {content}")
        sys.exit(1)

    print(f" Title: {metadata['title']}")
    print(f" Chapters: {len(metadata['chapters'])}")

    return metadata
|
||||
|
||||
|
||||
def create_episode(audio_path: str, metadata: dict, duration: int) -> dict:
    """Create episode on Castopod.

    Uploads the audio file and creates a draft episode via the Castopod
    REST API (the episode is not yet published — see publish_episode).

    Args:
        audio_path: Path to the MP3 to upload.
        metadata: Dict with "title" and "description" keys.
        duration: Episode length in seconds.
            NOTE(review): currently unused in this function — presumably
            Castopod derives duration from the uploaded file; confirm and
            either use or drop this parameter.

    Returns:
        The created episode as returned by the API (includes "id", "slug").

    Exits the process on any non-2xx API response.
    """
    print("[3/5] Creating episode on Castopod...")

    headers = get_auth_header()

    # Upload audio and create episode
    with open(audio_path, "rb") as f:
        files = {
            "audio_file": (Path(audio_path).name, f, "audio/mpeg")
        }
        data = {
            "title": metadata["title"],
            "description_markdown": metadata["description"],
            "parental_advisory": "explicit",
            "type": "full",
            "created_by": "1"
        }

        response = requests.post(
            f"{CASTOPOD_URL}/api/rest/v1/podcasts/{PODCAST_ID}/episodes",
            headers=headers,
            files=files,
            data=data
        )

    if response.status_code not in (200, 201):
        print(f"Error creating episode: {response.text}")
        sys.exit(1)

    episode = response.json()
    print(f" Created episode ID: {episode['id']}")
    print(f" Slug: {episode['slug']}")

    return episode
|
||||
|
||||
|
||||
def publish_episode(episode_id: int) -> dict:
    """Publish a previously created episode immediately.

    Returns the updated episode JSON from the API; exits the process on
    any non-200 response.
    """
    print("[4/5] Publishing episode...")

    resp = requests.post(
        f"{CASTOPOD_URL}/api/rest/v1/episodes/{episode_id}/publish",
        headers=get_auth_header(),
        data={"publication_method": "now", "created_by": "1"},
    )

    if resp.status_code != 200:
        print(f"Error publishing: {resp.text}")
        sys.exit(1)

    episode = resp.json()

    # The API returns published_at either as {'date': ...} or a plain value.
    published_at = episode.get("published_at", {})
    if isinstance(published_at, dict):
        print(f" Published at: {published_at.get('date', 'unknown')}")
    else:
        print(f" Published at: {published_at}")

    return episode
|
||||
|
||||
|
||||
def save_chapters(metadata: dict, output_path: str):
    """Write the episode chapters to a chapters JSON file.

    Args:
        metadata: Generated metadata containing a "chapters" list of
            {"startTime": seconds, "title": str} objects.
        output_path: Destination path for the JSON file.
    """
    chapters_data = {
        "version": "1.2.0",
        "chapters": metadata["chapters"]
    }

    # Explicit encoding so the output doesn't depend on the platform default.
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(chapters_data, f, indent=2)

    print(f" Chapters saved to: {output_path}")
|
||||
|
||||
|
||||
def run_ssh_command(command: str) -> tuple[bool, str]:
    """Run a command on the NAS via SSH.

    Returns (ok, message): ok is True when the remote command exited 0;
    message is the stripped stdout, falling back to stderr when stdout
    is empty.
    """
    ssh_cmd = ["ssh", "-p", str(NAS_SSH_PORT), f"{NAS_USER}@{NAS_HOST}", command]

    try:
        proc = subprocess.run(ssh_cmd, capture_output=True, text=True, timeout=30)
    except subprocess.TimeoutExpired:
        return False, "SSH command timed out"
    except Exception as e:
        return False, str(e)

    return proc.returncode == 0, proc.stdout.strip() or proc.stderr.strip()
|
||||
|
||||
|
||||
def upload_chapters_to_castopod(episode_slug: str, episode_id: int, chapters_path: str) -> bool:
    """Upload chapters file to Castopod via SSH and link in database.

    Copies the local chapters JSON into the Castopod container's media
    directory, inserts a cp_media row for it, points the episode's
    chapters_id at that row, and clears the Castopod cache.

    Returns True on success; prints a warning and returns False on the
    first failed step (earlier steps are not rolled back).

    NOTE(review): values are interpolated into SQL and shell strings via
    f-strings, and the MySQL password is passed on the command line. All
    inputs here are locally generated, but never route user-controlled
    data through this function without escaping.
    """
    print("[4.5/5] Uploading chapters to Castopod...")

    chapters_filename = f"{episode_slug}-chapters.json"
    remote_path = f"podcasts/{PODCAST_HANDLE}/{chapters_filename}"

    # Read local chapters file
    with open(chapters_path, "r") as f:
        chapters_content = f.read()

    # Base64 encode for safe transfer (avoids shell-quoting the JSON)
    chapters_b64 = base64.b64encode(chapters_content.encode()).decode()

    # Upload file to container using base64 decode
    upload_cmd = f'echo "{chapters_b64}" | base64 -d | {DOCKER_PATH} exec -i {CASTOPOD_CONTAINER} tee /var/www/castopod/public/media/{remote_path} > /dev/null'
    success, output = run_ssh_command(upload_cmd)
    if not success:
        print(f" Warning: Failed to upload chapters file: {output}")
        return False

    # Get file size
    # NOTE(review): this is the character count, not the UTF-8 byte count —
    # they differ when titles contain non-ASCII text; confirm whether
    # cp_media.file_size must be exact.
    file_size = len(chapters_content)

    # Insert into media table
    insert_sql = f"""INSERT INTO cp_media (file_key, file_size, file_mimetype, type, uploaded_by, updated_by, uploaded_at, updated_at)
VALUES ('{remote_path}', {file_size}, 'application/json', 'chapters', 1, 1, NOW(), NOW())"""
    db_cmd = f'{DOCKER_PATH} exec {MARIADB_CONTAINER} mysql -u {DB_USER} -p{DB_PASS} {DB_NAME} -e "{insert_sql}; SELECT LAST_INSERT_ID();"'
    success, output = run_ssh_command(db_cmd)
    if not success:
        print(f" Warning: Failed to insert chapters in database: {output}")
        return False

    # Parse media ID from output (last line of mysql's tabular output)
    try:
        lines = output.strip().split('\n')
        media_id = int(lines[-1])
    except (ValueError, IndexError):
        print(f" Warning: Could not parse media ID from: {output}")
        return False

    # Link chapters to episode
    update_sql = f"UPDATE cp_episodes SET chapters_id = {media_id} WHERE id = {episode_id}"
    db_cmd = f'{DOCKER_PATH} exec {MARIADB_CONTAINER} mysql -u {DB_USER} -p{DB_PASS} {DB_NAME} -e "{update_sql}"'
    success, output = run_ssh_command(db_cmd)
    if not success:
        print(f" Warning: Failed to link chapters to episode: {output}")
        return False

    # Clear Castopod cache
    cache_cmd = f'{DOCKER_PATH} exec {CASTOPOD_CONTAINER} php spark cache:clear'
    run_ssh_command(cache_cmd)

    print(f" Chapters uploaded and linked (media_id: {media_id})")
    return True
|
||||
|
||||
|
||||
def get_next_episode_number() -> int:
    """Get the next episode number from Castopod.

    Returns max(existing numbers) + 1, or 1 when the podcast has no
    episodes yet or the API call does not return 200.
    """
    headers = get_auth_header()

    response = requests.get(
        f"{CASTOPOD_URL}/api/rest/v1/podcasts/{PODCAST_ID}/episodes",
        headers=headers
    )

    if response.status_code != 200:
        return 1

    episodes = response.json()
    if not episodes:
        return 1

    # "number" may be absent OR present with a null value (e.g. drafts);
    # `.get("number", 0)` returns None in the latter case and would crash
    # max() — `or 0` covers both.
    max_num = max((ep.get("number") or 0) for ep in episodes)
    return max_num + 1
|
||||
|
||||
|
||||
def main():
    """CLI entry point: transcribe → metadata → create → publish → chapters."""
    parser = argparse.ArgumentParser(description="Publish podcast episode to Castopod")
    parser.add_argument("audio_file", help="Path to the audio file (MP3)")
    parser.add_argument("--episode-number", "-n", type=int, help="Episode number (auto-detected if not provided)")
    parser.add_argument("--dry-run", "-d", action="store_true", help="Generate metadata but don't publish")
    parser.add_argument("--title", "-t", help="Override generated title")
    parser.add_argument("--description", help="Override generated description")
    args = parser.parse_args()

    audio_path = Path(args.audio_file).expanduser().resolve()
    if not audio_path.exists():
        print(f"Error: Audio file not found: {audio_path}")
        sys.exit(1)

    # Determine episode number (query Castopod when not given explicitly)
    if args.episode_number:
        episode_number = args.episode_number
    else:
        episode_number = get_next_episode_number()
        print(f"Episode number: {episode_number}")

    # Step 1: Transcribe
    transcript = transcribe_audio(str(audio_path))

    # Step 2: Generate metadata
    metadata = generate_metadata(transcript, episode_number)

    # Apply overrides (CLI flags win over the LLM's suggestions)
    if args.title:
        metadata["title"] = args.title
    if args.description:
        metadata["description"] = args.description

    # Save chapters file next to the audio (episode.mp3 -> episode.chapters.json)
    chapters_path = audio_path.with_suffix(".chapters.json")
    save_chapters(metadata, str(chapters_path))

    if args.dry_run:
        print("\n[DRY RUN] Would publish with:")
        print(f" Title: {metadata['title']}")
        print(f" Description: {metadata['description']}")
        print(f" Chapters: {json.dumps(metadata['chapters'], indent=2)}")
        print("\nChapters file saved. Run without --dry-run to publish.")
        return

    # Step 3: Create episode
    episode = create_episode(str(audio_path), metadata, transcript["duration"])

    # Step 4: Publish
    episode = publish_episode(episode["id"])

    # Step 4.5: Upload chapters via SSH (no REST endpoint for chapters)
    chapters_uploaded = upload_chapters_to_castopod(
        episode["slug"],
        episode["id"],
        str(chapters_path)
    )

    # Step 5: Summary
    print("\n[5/5] Done!")
    print("=" * 50)
    print(f"Episode URL: {CASTOPOD_URL}/@{PODCAST_HANDLE}/episodes/{episode['slug']}")
    print(f"RSS Feed: {CASTOPOD_URL}/@{PODCAST_HANDLE}/feed.xml")
    print("=" * 50)
    if not chapters_uploaded:
        print("\nNote: Chapters upload failed. Add manually via Castopod admin UI")
        print(f" Chapters file: {chapters_path}")


if __name__ == "__main__":
    main()
|
||||
1553
radio_show.py
Normal file
1553
radio_show.py
Normal file
File diff suppressed because it is too large
Load Diff
140
radio_simple.py
Normal file
140
radio_simple.py
Normal file
@@ -0,0 +1,140 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Simplified Radio Show - for debugging
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
from pathlib import Path
|
||||
import numpy as np
|
||||
import sounddevice as sd
|
||||
import soundfile as sf
|
||||
from faster_whisper import WhisperModel
|
||||
from scipy.signal import butter, filtfilt
|
||||
from dotenv import load_dotenv
|
||||
|
||||
load_dotenv()
|
||||
|
||||
SAMPLE_RATE = 24000
|
||||
|
||||
CALLERS = {
|
||||
"1": ("Big Tony", "IKne3meq5aSn9XLyUdCD", "You are Big Tony, a loud Italian guy from Staten Island. Swear naturally, be opinionated. Keep it to 2 sentences."),
|
||||
"2": ("Drunk Diane", "FGY2WhTYpPnrIDTdsKH5", "You are Drunk Diane, tipsy woman at a bar. Ramble a bit, be funny. Keep it to 2 sentences."),
|
||||
"3": ("Stoner Phil", "bIHbv24MWmeRgasZH58o", "You are Stoner Phil, super chill stoner dude. Speak slow, be spacey but profound. Keep it to 2 sentences."),
|
||||
}
|
||||
|
||||
def phone_filter(audio):
    """Band-limit audio to the telephone range (300-3400 Hz) with soft clipping."""
    nyq = SAMPLE_RATE / 2
    b, a = butter(4, [300 / nyq, 3400 / nyq], btype='band')
    shaped = filtfilt(b, a, audio.flatten())
    return (np.tanh(shaped * 1.5) * 0.8).astype(np.float32)
|
||||
|
||||
class SimpleRadio:
    """Minimal interactive radio console.

    Records mic audio, transcribes it with Whisper, gets an in-character
    reply from a local Ollama model, and speaks it with ElevenLabs through
    a telephone-style bandpass filter.
    """

    def __init__(self):
        # Services are initialized eagerly so failures surface at startup.
        print("Loading Whisper...")
        self.whisper = WhisperModel("base", device="cpu", compute_type="int8")

        print("Connecting to ElevenLabs...")
        from elevenlabs.client import ElevenLabs
        self.tts = ElevenLabs(api_key=os.getenv("ELEVENLABS_API_KEY"))

        print("Connecting to Ollama...")
        import ollama
        self.ollama = ollama

        self.caller = CALLERS["1"]  # (name, elevenlabs voice_id, system prompt)
        self.history = []           # rolling chat history sent to the LLM
        print("\nReady!\n")

    def record(self):
        """Record from the default input device until Enter is pressed.

        Returns the capture as a float array at SAMPLE_RATE, or None if
        nothing was captured.
        """
        print(" [Recording - press Enter to stop]")
        chunks = []
        recording = True

        def callback(indata, frames, time, status):
            if recording:
                chunks.append(indata.copy())

        with sd.InputStream(samplerate=SAMPLE_RATE, channels=1, callback=callback):
            input()  # Wait for Enter

        # The stream is already closed here; flipping the flag is belt-and-braces.
        recording = False
        return np.vstack(chunks) if chunks else None

    def transcribe(self, audio):
        """Resample the SAMPLE_RATE capture to 16 kHz and run Whisper on it."""
        import librosa
        audio_16k = librosa.resample(audio.flatten().astype(np.float32), orig_sr=SAMPLE_RATE, target_sr=16000)
        segments, _ = self.whisper.transcribe(audio_16k)
        return " ".join([s.text for s in segments]).strip()

    def respond(self, text):
        """Append the host's line to history and get the caller's reply."""
        self.history.append({"role": "user", "content": text})

        # Only the last 6 turns are sent, to keep the prompt short.
        response = self.ollama.chat(
            model="llama3.2:latest",
            messages=[{"role": "system", "content": self.caller[2]}] + self.history[-6:],
            options={"temperature": 0.9}
        )

        reply = response["message"]["content"]
        self.history.append({"role": "assistant", "content": reply})
        return reply

    def speak(self, text):
        """Synthesize `text` with the caller's voice, phone-filter it, play it."""
        print(" [Generating voice...]")
        audio_gen = self.tts.text_to_speech.convert(
            voice_id=self.caller[1],
            text=text,
            model_id="eleven_turbo_v2_5",
            output_format="pcm_24000"
        )

        # Raw PCM int16 -> float32 in [-1, 1], then telephone bandpass.
        audio_bytes = b"".join(audio_gen)
        audio = np.frombuffer(audio_bytes, dtype=np.int16).astype(np.float32) / 32768.0
        filtered = phone_filter(audio)

        print(" [Playing...]")
        sd.play(filtered, SAMPLE_RATE)
        sd.wait()

    def run(self):
        """Interactive command loop for driving the show from the terminal."""
        print("=" * 50)
        print(" SIMPLE RADIO - Type commands:")
        print(" 1/2/3 = switch caller")
        print(" r = record & respond")
        print(" t = type message (skip recording)")
        print(" q = quit")
        print("=" * 50)
        print(f"\nCaller: {self.caller[0]}\n")

        while True:
            cmd = input("> ").strip().lower()

            if cmd == 'q':
                break
            # BUG FIX: the original used `cmd in '123'` (substring match),
            # so an empty command ('' in '123' is True) or "12" crashed
            # with KeyError. Dict membership only matches real caller keys.
            elif cmd in CALLERS:
                self.caller = CALLERS[cmd]
                self.history = []
                print(f"\n📞 Switched to: {self.caller[0]}\n")
            elif cmd == 'r':
                audio = self.record()
                if audio is not None:
                    print(" [Transcribing...]")
                    text = self.transcribe(audio)
                    print(f"\n YOU: {text}\n")
                    if text:
                        print(" [Thinking...]")
                        reply = self.respond(text)
                        print(f"\n 📞 {self.caller[0].upper()}: {reply}\n")
                        self.speak(reply)
            elif cmd == 't':
                text = input(" Type message: ")
                if text:
                    print(" [Thinking...]")
                    reply = self.respond(text)
                    print(f"\n 📞 {self.caller[0].upper()}: {reply}\n")
                    self.speak(reply)
            else:
                print(" Commands: r=record, t=type, 1/2/3=caller, q=quit")
|
||||
|
||||
# Script entry point: construct the app (loads Whisper, connects to
# ElevenLabs and Ollama) and start the interactive command loop.
if __name__ == "__main__":
    radio = SimpleRadio()
    radio.run()
|
||||
16
requirements-web.txt
Normal file
16
requirements-web.txt
Normal file
@@ -0,0 +1,16 @@
|
||||
# Web application requirements (in addition to existing radio_show.py deps)
|
||||
fastapi>=0.109.0
|
||||
uvicorn[standard]>=0.27.0
|
||||
python-multipart>=0.0.6
|
||||
websockets>=12.0
|
||||
httpx>=0.26.0
|
||||
pydantic-settings>=2.1.0
|
||||
|
||||
# Already installed for CLI (but listed for completeness):
|
||||
# faster-whisper
|
||||
# elevenlabs
|
||||
# numpy
|
||||
# scipy
|
||||
# librosa
|
||||
# soundfile
|
||||
# python-dotenv
|
||||
60
run.sh
Executable file
60
run.sh
Executable file
@@ -0,0 +1,60 @@
|
||||
#!/bin/bash
# AI Radio Show - Server Runner with restart support
#
# Runs uvicorn in a loop. The web UI requests a restart/stop by creating
# flag files in /tmp (via /api/server/restart and /api/server/stop);
# this script polls for those flags once per second.

LOG_FILE="/tmp/ai-radio-show.log"
RESTART_FLAG="/tmp/ai-radio-show.restart"
STOP_FLAG="/tmp/ai-radio-show.stop"

cd "$(dirname "$0")"

# Activate virtual environment
source venv/bin/activate

# Cleanup old flags
rm -f "$RESTART_FLAG" "$STOP_FLAG"

echo "AI Radio Show Server Runner"
echo "Log file: $LOG_FILE"
echo "Press Ctrl+C to stop"
echo ""

while true; do
    echo "[$(date)] Starting server..." | tee -a "$LOG_FILE"

    # Start uvicorn with output to both console and log file
    # NOTE(review): after `cmd | tee &`, $! is the PID of the LAST pipeline
    # element (tee), not uvicorn — so `kill $SERVER_PID` below terminates
    # tee, and uvicorn presumably exits only via SIGPIPE on its next write.
    # Consider redirecting to the log instead of piping. TODO confirm.
    python -m uvicorn backend.main:app --host 0.0.0.0 --port 8000 2>&1 | tee -a "$LOG_FILE" &
    SERVER_PID=$!

    # Wait for server to exit or restart signal
    while kill -0 $SERVER_PID 2>/dev/null; do
        if [ -f "$RESTART_FLAG" ]; then
            echo "[$(date)] Restart requested..." | tee -a "$LOG_FILE"
            rm -f "$RESTART_FLAG"
            kill $SERVER_PID 2>/dev/null
            wait $SERVER_PID 2>/dev/null
            sleep 1
            break
        fi

        if [ -f "$STOP_FLAG" ]; then
            echo "[$(date)] Stop requested..." | tee -a "$LOG_FILE"
            rm -f "$STOP_FLAG"
            kill $SERVER_PID 2>/dev/null
            wait $SERVER_PID 2>/dev/null
            echo "[$(date)] Server stopped." | tee -a "$LOG_FILE"
            exit 0
        fi

        sleep 1
    done

    # Check if we should restart or exit
    # (covers the case where the server died on its own right as the
    # stop flag was written)
    if [ -f "$STOP_FLAG" ]; then
        rm -f "$STOP_FLAG"
        echo "[$(date)] Server stopped." | tee -a "$LOG_FILE"
        exit 0
    fi

    echo "[$(date)] Restarting in 2 seconds..." | tee -a "$LOG_FILE"
    sleep 2
done
|
||||
37
test.html
Normal file
37
test.html
Normal file
@@ -0,0 +1,37 @@
|
||||
<!DOCTYPE html>
<!-- Standalone smoke-test page: checks that the frontend scripts load and
     expose their globals, without starting the full app UI. -->
<html>
<head>
<title>Test JavaScript Loading</title>
</head>
<body>
<h1>JavaScript Test</h1>
<button id="test-btn">Test Button</button>
<div id="output"></div>

<script src="frontend/js/audio.js"></script>
<script src="frontend/js/websocket.js"></script>
<script src="frontend/js/app.js"></script>
<script>
// Test if the classes loaded
document.addEventListener('DOMContentLoaded', function() {
    const output = document.getElementById('output');

    if (typeof AudioManager !== 'undefined') {
        output.innerHTML += '<p>✓ AudioManager loaded</p>';
    } else {
        output.innerHTML += '<p>✗ AudioManager failed to load</p>';
    }

    if (typeof RadioShowApp !== 'undefined') {
        output.innerHTML += '<p>✓ RadioShowApp loaded</p>';
    } else {
        output.innerHTML += '<p>✗ RadioShowApp failed to load</p>';
    }

    // Basic event-wiring sanity check.
    document.getElementById('test-btn').addEventListener('click', function() {
        output.innerHTML += '<p>Button click works!</p>';
    });
});
</script>
</body>
</html>
|
||||
Reference in New Issue
Block a user