Files
ai-podcast/backend/services/tts.py
tcpsyn bd6c8ccbab Landing page: testimonials slider, how-it-works page, 25 TTS voices
- Add testimonial slider with 8 fake caller reviews
- Add how-it-works page with visual architecture diagram
- Expand voice pools: Inworld 25 voices (14M/11F), ElevenLabs 22 (14M/8F)
- Voice pools auto-switch when TTS provider changes
- Add cover art locally, update cache-busted image refs
- Add "More from Luke" footer links (MMG, prints, YouTube)
- Ad channel configurable in settings UI

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-07 01:34:30 -07:00

707 lines
24 KiB
Python

"""TTS service with ElevenLabs, Inworld, F5-TTS, MLX Kokoro, StyleTTS2, ChatTTS, VITS, and Bark support"""
import os
import numpy as np
from scipy.signal import butter, filtfilt
from pathlib import Path
import tempfile
import torch
from ..config import settings
# Patch torch.load for compatibility with PyTorch 2.6+
# Every call made through torch.load after this point runs with
# weights_only=False, even when the caller asked for weights_only=True.
# SECURITY NOTE(review): weights_only=False permits arbitrary unpickling, so
# only checkpoints from trusted sources should be loaded in this process.
_original_torch_load = torch.load


def _patched_torch_load(*args, **kwargs):
    """Forward to the original ``torch.load`` with ``weights_only`` forced to False."""
    forced_kwargs = dict(kwargs, weights_only=False)
    return _original_torch_load(*args, **forced_kwargs)


torch.load = _patched_torch_load
# Global clients
# Module-level singletons. Each starts empty (None / False / {}) and is filled
# in on first use by the matching get_*/ensure_* function in this module.
_elevenlabs_client = None  # ElevenLabs API client, set by get_elevenlabs_client()
_vits_tts = None  # VITS VCTK TTS instance, set by get_vits_tts()
_bark_loaded = False  # flipped to True by ensure_bark_loaded()
_kokoro_model = None  # Kokoro MLX model, set by get_kokoro_model()
_styletts2_model = None  # StyleTTS2 model, set by get_styletts2_model()
_f5tts_model = None  # holds the F5-TTS `generate` function, set by get_f5tts_generate()
_chattts_model = None  # ChatTTS Chat instance, set by get_chattts_model()
_chattts_speakers = {}  # Cache for speaker embeddings, keyed by voice_id
# Kokoro voice mapping - using highest-graded voices
# Grades from https://huggingface.co/hexgrad/Kokoro-82M/blob/main/VOICES.md
# Keys are the caller voice IDs passed to generate_speech(); values are Kokoro
# voice names. The first letter of the value selects the language code in
# generate_speech_kokoro ('a' = American, 'b' = British).
KOKORO_VOICES = {
    # Male voices (best available are C+ grade)
    "VR6AewLTigWG4xSOukaG": "am_fenrir",  # Tony - deep/powerful (C+)
    "TxGEqnHWrfWFTfGW9XjX": "am_michael",  # Rick - solid male voice (C+)
    "pNInz6obpgDQGcFmaJgB": "am_puck",  # Dennis - anxious dad (C+)
    "ODq5zmih8GrVes37Dizd": "bm_george",  # Earl - older/distinguished British (C)
    "IKne3meq5aSn9XLyUdCD": "bm_fable",  # Marcus - young British (C)
    # Female voices (much better quality available)
    "jBpfuIE2acCO8z3wKNLl": "af_heart",  # Jasmine - best quality (A)
    "EXAVITQu4vr4xnSDxMaL": "af_bella",  # Megan - warm/friendly (A-)
    "21m00Tcm4TlvDq8ikWAM": "bf_emma",  # Tanya - professional British (B-)
    "XB0fDUnXU5powFXDhCwa": "af_nicole",  # Carla - Jersey mom (B-)
    "pFZP5JQG7iQjIQuC4Bku": "af_sarah",  # Brenda - overthinker (C+)
}
# Speed adjustments per voice (1.0 = normal, lower = slower/more natural)
# Slower speeds (0.85-0.95) generally sound more natural
# Keyed by the same voice IDs as KOKORO_VOICES; the value is passed as the
# `speed` argument when generating Kokoro audio.
KOKORO_SPEEDS = {
    # Male voices - slower speeds help with C+ grade voices
    "VR6AewLTigWG4xSOukaG": 0.9,  # Tony (am_fenrir) - deep voice, slower
    "TxGEqnHWrfWFTfGW9XjX": 0.92,  # Rick (am_michael) - solid pace
    "pNInz6obpgDQGcFmaJgB": 0.95,  # Dennis (am_puck) - anxious but not rushed
    "ODq5zmih8GrVes37Dizd": 0.85,  # Earl (bm_george) - older, slower British
    "IKne3meq5aSn9XLyUdCD": 0.95,  # Marcus (bm_fable) - young, natural
    # Female voices - A-grade voices can handle faster speeds
    "jBpfuIE2acCO8z3wKNLl": 0.95,  # Jasmine (af_heart) - best voice, natural pace
    "EXAVITQu4vr4xnSDxMaL": 0.95,  # Megan (af_bella) - warm
    "21m00Tcm4TlvDq8ikWAM": 0.9,  # Tanya (bf_emma) - professional British
    "XB0fDUnXU5powFXDhCwa": 0.95,  # Carla (af_nicole) - animated but clear
    "pFZP5JQG7iQjIQuC4Bku": 0.92,  # Brenda (af_sarah) - overthinker, measured
}
# Fallbacks used when a voice ID is missing from the tables above.
DEFAULT_KOKORO_VOICE = "af_heart"
DEFAULT_KOKORO_SPEED = 0.95
# VCTK speaker mapping - different voices for different callers
# Keys are caller voice IDs; values are the speaker names handed to the VITS
# VCTK model in generate_speech_vits.
VITS_SPEAKERS = {
    # Male voices
    "VR6AewLTigWG4xSOukaG": "p226",  # Tony
    "TxGEqnHWrfWFTfGW9XjX": "p251",  # Rick
    "pNInz6obpgDQGcFmaJgB": "p245",  # Dennis
    "ODq5zmih8GrVes37Dizd": "p232",  # Earl
    "IKne3meq5aSn9XLyUdCD": "p252",  # Marcus
    # Female voices
    "jBpfuIE2acCO8z3wKNLl": "p225",  # Jasmine
    "EXAVITQu4vr4xnSDxMaL": "p228",  # Megan
    "21m00Tcm4TlvDq8ikWAM": "p229",  # Tanya
    "XB0fDUnXU5powFXDhCwa": "p231",  # Carla
    "pFZP5JQG7iQjIQuC4Bku": "p233",  # Brenda
}
# Speaker used when the voice ID has no entry in VITS_SPEAKERS.
DEFAULT_VITS_SPEAKER = "p225"
# Inworld voice mapping - maps ElevenLabs voice IDs to Inworld voices
# Full voice list from API: Alex, Ashley, Blake, Carter, Clive, Craig, Deborah,
# Dennis, Dominus, Edward, Elizabeth, Hades, Hana, Julia, Luna, Mark, Olivia,
# Pixie, Priya, Ronald, Sarah, Shaun, Theodore, Timothy, Wendy
# generate_speech_inworld consults this table only as a legacy fallback: a
# voice_id that is not a key here is sent to the API unchanged.
INWORLD_VOICES = {
    # Male voices - each caller gets a unique voice matching their personality
    "VR6AewLTigWG4xSOukaG": "Edward",  # Tony - fast-talking, emphatic, streetwise
    "TxGEqnHWrfWFTfGW9XjX": "Shaun",  # Rick - friendly, dynamic, conversational
    "pNInz6obpgDQGcFmaJgB": "Alex",  # Dennis - energetic, expressive, mildly nasal
    "ODq5zmih8GrVes37Dizd": "Craig",  # Earl - older British, refined, articulate
    "IKne3meq5aSn9XLyUdCD": "Timothy",  # Marcus - lively, upbeat American
    # Female voices - each caller gets a unique voice matching their personality
    "jBpfuIE2acCO8z3wKNLl": "Hana",  # Jasmine - bright, expressive young female
    "EXAVITQu4vr4xnSDxMaL": "Ashley",  # Megan - warm, natural female
    "21m00Tcm4TlvDq8ikWAM": "Wendy",  # Tanya - posh, middle-aged British
    "XB0fDUnXU5powFXDhCwa": "Sarah",  # Carla - fast-talking, questioning tone
    "pFZP5JQG7iQjIQuC4Bku": "Deborah",  # Brenda - gentle, elegant
}
# NOTE(review): not referenced by generate_speech_inworld in the code visible
# here - confirm whether another module still uses it.
DEFAULT_INWORLD_VOICE = "Dennis"
def preprocess_text_for_kokoro(text: str) -> str:
    """
    Preprocess text to improve Kokoro prosody and naturalness.

    Steps, applied in order:
    - Collapse all runs of whitespace to single spaces
    - Insert a comma after common transition words ("Well", "So", ...)
    - Insert a comma after a sentence-initial "I mean"
    - Expand common abbreviations ("Dr." -> "Doctor", "w/o" -> "without", ...)
    - Insert a comma before and/but/or following a very long word
    - Ensure exactly one space between sentence punctuation and a capital letter

    Args:
        text: Raw text to be spoken.

    Returns:
        The rewritten text.
    """
    import re

    # Normalize whitespace
    text = ' '.join(text.split())

    # Add comma pauses after common transition words (if no punctuation follows).
    # Matching is case-sensitive, so only the capitalised forms are affected.
    transition_words = (
        'Well', 'So', 'Now', 'Look', 'See',
        'Anyway', 'Actually', 'Honestly', 'Basically',
    )
    for word in transition_words:
        text = re.sub(rf'\b({word})\s+(?=[A-Za-z])', r'\1, ', text)

    # Add pause after "I mean" at start of sentence
    text = re.sub(r'^(I mean)\s+', r'\1, ', text)
    text = re.sub(r'\.\s+(I mean)\s+', r'. \1, ', text)

    # Expand common abbreviations for better pronunciation.
    # Order matters: "w/o" must be handled before "w/", otherwise the shorter
    # rule rewrites "w/o" to "witho" and the longer rule can never match. The
    # trailing \b keeps "w/out" flowing through the "w/" rule ("with" + "out").
    abbreviations = (
        (r'\bDr\.', 'Doctor'),
        (r'\bMr\.', 'Mister'),
        (r'\bMrs\.', 'Missus'),
        (r'\bMs\.', 'Miss'),
        (r'\bSt\.', 'Street'),
        (r'\bAve\.', 'Avenue'),
        (r'\betc\.', 'etcetera'),
        (r'\bvs\.', 'versus'),
        (r'\bw/o\b', 'without'),
        (r'\bw/', 'with'),
    )
    for abbr, expansion in abbreviations:
        text = re.sub(abbr, expansion, text, flags=re.IGNORECASE)

    # Add breath pause (comma) before conjunctions in long sentences.
    # NOTE(review): \w{20,} only matches a single word of 20+ characters, not
    # a 20+ character clause, so this rarely fires - confirm the intent.
    text = re.sub(r'(\w{20,})\s+(and|but|or)\s+', r'\1, \2 ', text)

    # Ensure proper spacing after punctuation
    text = re.sub(r'([.!?])\s*([A-Z])', r'\1 \2', text)
    return text
# StyleTTS2 reference voice files (place .wav files in voices/ directory for voice cloning)
# Maps voice_id to reference audio filename - if file doesn't exist, uses default voice
# Filenames are resolved against settings.base_dir / "voices" by the generators.
STYLETTS2_VOICES = {
    # Male voices
    "VR6AewLTigWG4xSOukaG": "tony.wav",  # Tony
    "TxGEqnHWrfWFTfGW9XjX": "rick.wav",  # Rick
    "pNInz6obpgDQGcFmaJgB": "dennis.wav",  # Dennis
    "ODq5zmih8GrVes37Dizd": "earl.wav",  # Earl
    "IKne3meq5aSn9XLyUdCD": "marcus.wav",  # Marcus
    # Female voices
    "jBpfuIE2acCO8z3wKNLl": "jasmine.wav",  # Jasmine
    "EXAVITQu4vr4xnSDxMaL": "megan.wav",  # Megan
    "21m00Tcm4TlvDq8ikWAM": "tanya.wav",  # Tanya
    "XB0fDUnXU5powFXDhCwa": "carla.wav",  # Carla
    "pFZP5JQG7iQjIQuC4Bku": "brenda.wav",  # Brenda
}
# F5-TTS reference voices (same files as StyleTTS2, reuses voices/ directory)
# Requires: mono, 24kHz, 5-10 seconds, with transcript in .txt file
# The transcript is looked up next to the .wav with the suffix swapped to .txt.
# This is a separate copy, so later edits to one table do not affect the other.
F5TTS_VOICES = STYLETTS2_VOICES.copy()
# ChatTTS speaker seeds - different seeds produce different voices
# These are used to generate consistent speaker embeddings
# Each value is passed to torch.manual_seed() before sampling a speaker or
# running inference for the corresponding voice ID.
CHATTTS_SEEDS = {
    # Male voices
    "VR6AewLTigWG4xSOukaG": 42,  # Tony - deep voice
    "TxGEqnHWrfWFTfGW9XjX": 123,  # Rick
    "pNInz6obpgDQGcFmaJgB": 456,  # Dennis
    "ODq5zmih8GrVes37Dizd": 789,  # Earl
    "IKne3meq5aSn9XLyUdCD": 1011,  # Marcus
    # Female voices
    "jBpfuIE2acCO8z3wKNLl": 2024,  # Jasmine
    "EXAVITQu4vr4xnSDxMaL": 3033,  # Megan
    "21m00Tcm4TlvDq8ikWAM": 4042,  # Tanya
    "XB0fDUnXU5powFXDhCwa": 5051,  # Carla
    "pFZP5JQG7iQjIQuC4Bku": 6060,  # Brenda
}
# Seed used for voice IDs without an entry above (same value as Tony's).
DEFAULT_CHATTTS_SEED = 42
def get_elevenlabs_client():
    """Return the shared ElevenLabs client, building it on the first call."""
    global _elevenlabs_client
    if _elevenlabs_client is not None:
        return _elevenlabs_client

    # The SDK import happens here, on first use, not at module import time.
    from elevenlabs.client import ElevenLabs

    _elevenlabs_client = ElevenLabs(api_key=settings.elevenlabs_api_key)
    return _elevenlabs_client
def get_vits_tts():
    """Return the shared VITS VCTK TTS instance, loading it on the first call."""
    global _vits_tts
    if _vits_tts is not None:
        return _vits_tts

    # Imported on first use rather than at module import time.
    from TTS.api import TTS

    _vits_tts = TTS("tts_models/en/vctk/vits")
    return _vits_tts
def get_kokoro_model():
    """Return the shared Kokoro MLX model, loading it on the first call."""
    global _kokoro_model
    if _kokoro_model is not None:
        return _kokoro_model

    # Imported on first use rather than at module import time.
    from mlx_audio.tts.utils import load_model

    _kokoro_model = load_model(model_path='mlx-community/Kokoro-82M-bf16')
    print("Kokoro MLX model loaded")
    return _kokoro_model
def ensure_bark_loaded():
    """Preload the Bark models once per process; later calls are no-ops.

    Sets the SUNO_* environment variables before importing ``bark``: small
    models are always requested, and MPS is enabled (with CPU offload turned
    off) when ``torch.backends.mps`` reports it as available.
    """
    global _bark_loaded
    if _bark_loaded:
        return

    os.environ['SUNO_USE_SMALL_MODELS'] = '1'

    # Force Bark to use MPS (Apple Silicon GPU)
    use_mps = torch.backends.mps.is_available()
    if use_mps:
        os.environ.update({'SUNO_OFFLOAD_CPU': '0', 'SUNO_ENABLE_MPS': '1'})

    from bark import preload_models

    preload_models()
    _bark_loaded = True
    device_name = 'MPS' if use_mps else 'CPU'
    print(f"Bark loaded on device: {device_name}")
def get_styletts2_model():
    """Return the shared StyleTTS2 model, constructing it on the first call."""
    global _styletts2_model
    if _styletts2_model is not None:
        return _styletts2_model

    # Imported on first use rather than at module import time.
    from styletts2 import tts

    _styletts2_model = tts.StyleTTS2()
    print("StyleTTS2 model loaded")
    return _styletts2_model
def get_f5tts_generate():
    """Return the F5-TTS MLX ``generate`` function, importing it on the first call."""
    global _f5tts_model
    if _f5tts_model is not None:
        return _f5tts_model

    import os

    # Disable tqdm progress bars to avoid BrokenPipeError in server context.
    # Both variables are set before f5_tts_mlx is imported.
    for env_name in ('HF_HUB_DISABLE_PROGRESS_BARS', 'TQDM_DISABLE'):
        os.environ[env_name] = '1'

    from f5_tts_mlx.generate import generate

    _f5tts_model = generate
    print("F5-TTS MLX loaded")
    return _f5tts_model
def get_chattts_model():
    """Return the shared ChatTTS ``Chat`` instance, loading it on the first call."""
    global _chattts_model
    if _chattts_model is not None:
        return _chattts_model

    # Imported on first use rather than at module import time.
    import ChatTTS

    _chattts_model = ChatTTS.Chat()
    _chattts_model.load(compile=False)
    print("ChatTTS model loaded")
    return _chattts_model
def get_chattts_speaker(voice_id: str):
    """Return the cached ChatTTS speaker for ``voice_id``, sampling it on first request.

    The speaker is sampled right after seeding torch's global RNG with the
    voice's entry in CHATTTS_SEEDS (or DEFAULT_CHATTTS_SEED), then stored in
    ``_chattts_speakers`` so later calls reuse the same object.
    """
    global _chattts_speakers
    try:
        return _chattts_speakers[voice_id]
    except KeyError:
        pass

    chat = get_chattts_model()
    seed = CHATTTS_SEEDS.get(voice_id, DEFAULT_CHATTTS_SEED)
    # Set seed for reproducible speaker (this reseeds torch's global RNG).
    torch.manual_seed(seed)
    speaker = chat.sample_random_speaker()
    _chattts_speakers[voice_id] = speaker
    print(f"[ChatTTS] Created speaker for voice {voice_id} with seed {seed}")
    return _chattts_speakers[voice_id]
def phone_filter(audio: np.ndarray, sample_rate: int = 24000, quality: str = "normal") -> np.ndarray:
    """Apply phone filter with variable quality.

    The signal is flattened, band-pass filtered, soft-clipped with ``tanh``
    and, for the lower-quality presets, mixed with bursts of static.

    Args:
        audio: Input samples; any shape, flattened to 1-D before processing.
        sample_rate: Sample rate of ``audio`` in Hz.
        quality: One of "good", "normal", "bad", "terrible". Unknown values
            fall back to "normal".

    Returns:
        1-D float32 array with the same number of samples as the input.

    Raises:
        ValueError: If ``sample_rate`` is too low for the preset's pass band.
    """
    audio = audio.flatten()
    # Nothing to filter; filtfilt would raise on an empty signal.
    if audio.size == 0:
        return audio.astype(np.float32)

    # (low cutoff Hz, high cutoff Hz, tanh drive, static noise std-dev)
    presets = {
        "good": (200, 7000, 1.0, 0.0),
        "normal": (300, 3400, 1.5, 0.005),
        "bad": (400, 2800, 2.0, 0.015),
        "terrible": (500, 2200, 2.5, 0.03),
    }
    low_hz, high_hz, distortion, noise = presets.get(quality, presets["normal"])

    # butter() needs normalised cutoffs strictly inside (0, 1); clamp the upper
    # edge so presets above Nyquist (e.g. "good" at 8 kHz) still work.
    nyquist = sample_rate / 2
    low = low_hz / nyquist
    high = min(high_hz / nyquist, 0.99)
    if low >= high:
        raise ValueError(
            f"sample_rate {sample_rate} Hz is too low for the '{quality}' phone filter"
        )

    b, a = butter(4, [low, high], btype='band')
    # filtfilt's default padlen is 3 * max(len(a), len(b)) and the input must
    # be longer than it; shrink the padding for very short clips instead of
    # raising. Longer inputs get exactly the default.
    padlen = min(3 * max(len(a), len(b)), len(audio) - 1)
    filtered = filtfilt(b, a, audio, padlen=padlen)

    # Soft clipping; output magnitude is bounded by 0.8 before noise is added.
    filtered = np.tanh(filtered * distortion) * 0.8

    if noise > 0:
        static = np.random.normal(0, noise, len(filtered)).astype(np.float32)
        # Gate the static in 1000-sample chunks, each kept with probability ~0.3.
        static_envelope = np.random.random(len(filtered) // 1000 + 1)
        static_envelope = np.repeat(static_envelope, 1000)[:len(filtered)]
        static *= (static_envelope > 0.7).astype(np.float32)
        filtered = filtered + static

    return filtered.astype(np.float32)
async def generate_speech_elevenlabs(text: str, voice_id: str) -> tuple[np.ndarray, int]:
    """Generate speech using ElevenLabs.

    Requests 24 kHz PCM from the ``eleven_v3`` model, joins the returned
    chunks and converts the 16-bit samples to float32 scaled by 1/32768.

    Returns:
        ``(audio, 24000)`` where ``audio`` is a float32 array.
    """
    # NOTE(review): the SDK call below is synchronous inside an async function.
    pcm_chunks = get_elevenlabs_client().text_to_speech.convert(
        voice_id=voice_id,
        text=text,
        model_id="eleven_v3",
        output_format="pcm_24000",
    )
    pcm_bytes = b"".join(pcm_chunks)
    samples = np.frombuffer(pcm_bytes, dtype=np.int16)
    return samples.astype(np.float32) / 32768.0, 24000
async def generate_speech_kokoro(text: str, voice_id: str) -> tuple[np.ndarray, int]:
    """Generate speech using MLX Kokoro (fast, good quality, Apple Silicon optimized).

    Args:
        text: Text to speak; run through ``preprocess_text_for_kokoro`` first.
        voice_id: Caller voice ID, mapped through KOKORO_VOICES / KOKORO_SPEEDS
            with DEFAULT_KOKORO_VOICE / DEFAULT_KOKORO_SPEED as fallbacks.

    Returns:
        ``(audio, 24000)`` where ``audio`` is a mono float32 array.

    Raises:
        RuntimeError: If no audio file was produced.
    """
    import librosa
    from mlx_audio.tts.generate import generate_audio

    model = get_kokoro_model()
    voice = KOKORO_VOICES.get(voice_id, DEFAULT_KOKORO_VOICE)
    speed = KOKORO_SPEEDS.get(voice_id, DEFAULT_KOKORO_SPEED)

    # Preprocess text for better prosody
    text = preprocess_text_for_kokoro(text)

    # Determine lang_code from voice prefix (a=American, b=British)
    lang_code = 'b' if voice.startswith('b') else 'a'

    with tempfile.TemporaryDirectory() as tmpdir:
        generate_audio(
            text,
            model=model,
            voice=voice,
            speed=speed,
            lang_code=lang_code,
            output_path=tmpdir,
            file_prefix='tts',
            verbose=False
        )

        # Output files are numbered (tts_000.wav, tts_001.wav, ...). Read every
        # segment in order so long text that was split into several files is
        # not truncated to its first segment.
        # NOTE(review): assumes the zero-padded numbering sorts correctly as
        # text - confirm against the installed mlx_audio version.
        segment_files = sorted(Path(tmpdir).glob('tts_*.wav'))
        if not segment_files:
            raise RuntimeError("Kokoro failed to generate audio")

        segments = []
        for segment_file in segment_files:
            segment, sr = librosa.load(str(segment_file), sr=None, mono=True)
            # Resample to 24kHz if needed
            if sr != 24000:
                segment = librosa.resample(segment, orig_sr=sr, target_sr=24000)
            segments.append(segment)

    audio = segments[0] if len(segments) == 1 else np.concatenate(segments)
    return audio.astype(np.float32), 24000
async def generate_speech_vits(text: str, voice_id: str) -> tuple[np.ndarray, int]:
    """Generate speech using VITS VCTK (fast, multiple speakers).

    The model writes a WAV to a temporary file, which is read back as mono,
    resampled to 24 kHz when necessary, and always deleted afterwards.

    Returns:
        ``(audio, 24000)`` where ``audio`` is a float32 array.
    """
    import librosa

    engine = get_vits_tts()
    speaker_name = VITS_SPEAKERS.get(voice_id, DEFAULT_VITS_SPEAKER)

    # Only the path is needed; the handle is closed before the model writes.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as handle:
        wav_path = handle.name

    try:
        engine.tts_to_file(text=text, file_path=wav_path, speaker=speaker_name)
        samples, rate = librosa.load(wav_path, sr=None, mono=True)
        if rate != 24000:
            samples = librosa.resample(samples, orig_sr=rate, target_sr=24000)
        return samples.astype(np.float32), 24000
    finally:
        Path(wav_path).unlink(missing_ok=True)
async def generate_speech_bark(text: str, voice_id: str) -> tuple[np.ndarray, int]:
    """Generate speech using Bark (slow but expressive, supports emotes like [laughs]).

    ``voice_id`` is accepted for signature parity with the other providers but
    is not used here.

    Returns:
        ``(audio, 24000)`` where ``audio`` is a float32 array.
    """
    import librosa
    from bark import SAMPLE_RATE, generate_audio

    ensure_bark_loaded()

    # Generate audio with Bark
    samples = generate_audio(text)

    # Normalize to prevent clipping (Bark can exceed [-1, 1])
    peak = np.abs(samples).max()
    if peak > 0.95:
        samples = samples * (0.95 / peak)

    # Resample to 24kHz if needed
    if SAMPLE_RATE != 24000:
        samples = librosa.resample(samples, orig_sr=SAMPLE_RATE, target_sr=24000)

    return samples.astype(np.float32), 24000
async def generate_speech_styletts2(text: str, voice_id: str) -> tuple[np.ndarray, int]:
    """Generate speech using StyleTTS2 (high quality, supports voice cloning).

    When ``voices/<file>`` for this voice exists under ``settings.base_dir`` it
    is used as the cloning reference; otherwise the model's default voice is
    used. Output is written to a temporary WAV, read back as mono and removed.

    Returns:
        ``(audio, 24000)`` where ``audio`` is a float32 array.
    """
    import librosa

    model = get_styletts2_model()

    # Check for reference voice file
    reference = None
    filename = STYLETTS2_VOICES.get(voice_id)
    if filename:
        candidate = settings.base_dir / "voices" / filename
        if candidate.exists():
            reference = candidate  # otherwise fall through to the default voice

    # Arguments shared by both modes; cloning adds three more below.
    inference_kwargs = {
        "output_sample_rate": 24000,
        "diffusion_steps": 5,  # Balance quality/speed
    }

    # Generate audio
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as handle:
        wav_path = handle.name

    try:
        if reference:
            print(f"[StyleTTS2] Using voice clone: {reference}")
            inference_kwargs.update(
                target_voice_path=str(reference),
                alpha=0.3,  # More voice-like than text-like
                beta=0.7,  # Good prosody
            )
        else:
            print("[StyleTTS2] Using default voice")

        # The return value is ignored; the audio is read back from wav_path.
        model.inference(text, output_wav_file=wav_path, **inference_kwargs)

        # Load the generated audio
        samples, rate = librosa.load(wav_path, sr=None, mono=True)
        if rate != 24000:
            samples = librosa.resample(samples, orig_sr=rate, target_sr=24000)
        return samples.astype(np.float32), 24000
    finally:
        Path(wav_path).unlink(missing_ok=True)
async def generate_speech_f5tts(text: str, voice_id: str) -> tuple[np.ndarray, int]:
    """Generate speech using F5-TTS MLX (very natural, supports voice cloning).

    Cloning is used only when both the reference ``.wav`` and its ``.txt``
    transcript exist under ``settings.base_dir / "voices"``; otherwise both
    reference arguments are passed as ``None``.

    Returns:
        ``(audio, 24000)`` where ``audio`` is a float32 array.
    """
    import librosa

    generate = get_f5tts_generate()

    # Check for reference voice file and transcript
    clone_audio = None
    clone_transcript = None
    filename = F5TTS_VOICES.get(voice_id)
    if filename:
        wav_file = settings.base_dir / "voices" / filename
        transcript_file = wav_file.with_suffix('.txt')
        if wav_file.exists() and transcript_file.exists():
            clone_audio = str(wav_file)
            clone_transcript = transcript_file.read_text().strip()
            print(f"[F5-TTS] Using voice clone: {wav_file}")

    if not clone_audio:
        print("[F5-TTS] Using default voice")

    # Generate audio to temp file
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as handle:
        out_path = handle.name

    try:
        generate(
            generation_text=text,
            ref_audio_path=clone_audio,
            ref_audio_text=clone_transcript,
            steps=8,
            speed=1.0,
            output_path=out_path,
        )

        # Load the generated audio
        samples, rate = librosa.load(out_path, sr=None, mono=True)

        # Resample to 24kHz if needed
        if rate != 24000:
            samples = librosa.resample(samples, orig_sr=rate, target_sr=24000)
        return samples.astype(np.float32), 24000
    finally:
        Path(out_path).unlink(missing_ok=True)
async def generate_speech_chattts(text: str, voice_id: str) -> tuple[np.ndarray, int]:
    """Generate speech using ChatTTS (natural conversational speech, multiple speakers).

    Empty or whitespace-only text is replaced with "Hello.". Torch's global RNG
    is seeded from CHATTTS_SEEDS before inference.

    Returns:
        ``(audio, 24000)`` where ``audio`` is a float32 array.

    Raises:
        RuntimeError: If ChatTTS returns no audio.
    """
    import ChatTTS

    chat = get_chattts_model()

    # Ensure text is not empty and has reasonable content
    text = text.strip() or "Hello."

    print(f"[ChatTTS] Generating speech for: {text[:50]}...")

    # Get consistent speaker for this voice
    torch.manual_seed(CHATTTS_SEEDS.get(voice_id, DEFAULT_CHATTTS_SEED))

    # Configure inference parameters
    infer_params = ChatTTS.Chat.InferCodeParams(temperature=0.3, top_P=0.7, top_K=20)

    # Generate audio (skip text refinement to avoid narrow() error with this version)
    wavs = chat.infer([text], params_infer_code=infer_params, skip_refine_text=True)
    if wavs is None or len(wavs) == 0:
        raise RuntimeError("ChatTTS failed to generate audio")

    # Handle different output shapes
    samples = wavs[0]
    if samples.ndim > 1:
        samples = samples.squeeze()

    # Normalize
    peak = np.abs(samples).max()
    if peak > 0.95:
        samples = samples * (0.95 / peak)

    return samples.astype(np.float32), 24000
async def generate_speech_inworld(text: str, voice_id: str) -> tuple[np.ndarray, int]:
    """Generate speech using Inworld TTS API (high quality, natural voices).

    Args:
        text: Text to speak.
        voice_id: An Inworld voice name (e.g. "Edward"), or a legacy voice ID
            that is a key of INWORLD_VOICES. Anything else is sent unchanged.

    Returns:
        ``(audio, 24000)`` where ``audio`` is a mono float32 array.

    Raises:
        RuntimeError: If the API key is missing or the response has no audio.
        httpx.HTTPStatusError: If the API responds with an error status.
    """
    import base64
    import io

    import httpx
    import librosa
    import soundfile as sf

    # voice_id is now the Inworld voice name directly (e.g. "Edward")
    # Fall back to legacy mapping if it's an ElevenLabs ID
    voice = INWORLD_VOICES.get(voice_id, voice_id)

    api_key = settings.inworld_api_key
    if not api_key:
        raise RuntimeError("INWORLD_API_KEY not set in environment")

    print(f"[Inworld TTS] Voice: {voice}, Text: {text[:50]}...")

    url = "https://api.inworld.ai/tts/v1/voice"
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Basic {api_key}",
    }
    payload = {
        "text": text,
        "voice_id": voice,
        "model_id": "inworld-tts-1.5-mini",
        "audio_config": {
            "encoding": "LINEAR16",
            "sample_rate_hertz": 48000,
        },
    }

    async with httpx.AsyncClient(timeout=25.0) as client:
        response = await client.post(url, json=payload, headers=headers)
        response.raise_for_status()
        data = response.json()

    # Decode base64 audio
    audio_b64 = data.get("audioContent")
    if not audio_b64:
        raise RuntimeError("Inworld TTS returned no audio content")
    audio_bytes = base64.b64decode(audio_b64)

    # First try to parse the payload as a container format soundfile can read.
    # Falling back on any failure is deliberate best-effort.
    try:
        audio, sr = sf.read(io.BytesIO(audio_bytes))
    except Exception as e:
        print(f"[Inworld TTS] soundfile failed: {e}, trying raw PCM")
        # Fallback to raw PCM: 16-bit samples at the requested 48 kHz.
        # Drop a trailing odd byte so the buffer divides into int16 samples.
        if len(audio_bytes) % 2 != 0:
            audio_bytes = audio_bytes[:-1]
        audio = np.frombuffer(audio_bytes, dtype=np.int16).astype(np.float32) / 32768.0
        sr = 48000

    # soundfile returns (frames, channels) for multi-channel audio. Downmix to
    # mono so resampling runs along the time axis and the result is 1-D.
    if audio.ndim > 1:
        audio = audio.mean(axis=1)

    # Resample to 24kHz to match other providers
    if sr != 24000:
        audio = librosa.resample(audio, orig_sr=sr, target_sr=24000)

    return audio.astype(np.float32), 24000
async def generate_speech(
    text: str,
    voice_id: str,
    phone_quality: str = "normal",
    apply_filter: bool = True
) -> bytes:
    """
    Generate speech from text with the provider named in ``settings.tts_provider``.

    Args:
        text: Text to speak
        voice_id: ElevenLabs voice ID (mapped to local voice if using local TTS)
        phone_quality: Phone filter preset; "none" or "studio" skips the filter
        apply_filter: Whether to apply phone filter

    Returns:
        Raw PCM audio bytes (16-bit signed int, 24kHz)

    Raises:
        ValueError: If the configured provider is not one of the known names.
    """
    # Choose TTS provider
    provider = settings.tts_provider
    print(f"[TTS] Provider: {provider}, Text: {text[:50]}...")

    backends = {
        "kokoro": generate_speech_kokoro,
        "f5tts": generate_speech_f5tts,
        "inworld": generate_speech_inworld,
        "chattts": generate_speech_chattts,
        "styletts2": generate_speech_styletts2,
        "bark": generate_speech_bark,
        "vits": generate_speech_vits,
        "elevenlabs": generate_speech_elevenlabs,
    }
    synthesize = backends.get(provider)
    if synthesize is None:
        raise ValueError(f"Unknown TTS provider: {provider}")

    audio, sample_rate = await synthesize(text, voice_id)

    # Apply phone filter if requested
    # Skip filter for Bark - it already has rough audio quality
    skip_filter = (
        not apply_filter
        or phone_quality in ("none", "studio")
        or provider == "bark"
    )
    if not skip_filter:
        audio = phone_filter(audio, sample_rate, phone_quality)

    # Convert to bytes: scale to the int16 range, clamp, then serialise.
    pcm16 = (audio * 32768).clip(-32768, 32767).astype(np.int16)
    return pcm16.tobytes()
# Voice IDs for cohost and announcer
# Passed as `voice_id` to generate_speech() by the two helpers below.
# NOTE(review): neither ID is a key in the per-provider voice tables in this
# file, so providers that look voices up there use their fallback for these.
COHOST_VOICE_ID = "nPczCjzI2devNBz1zQrb"
ANNOUNCER_VOICE_ID = "ErXwobaYiN019PkySvjV"
async def generate_cohost_speech(text: str) -> bytes:
    """Generate speech for cohost Bobby (no phone filter).

    Returns whatever ``generate_speech`` returns for COHOST_VOICE_ID.
    """
    pcm_bytes = await generate_speech(
        text,
        voice_id=COHOST_VOICE_ID,
        apply_filter=False,
    )
    return pcm_bytes
async def generate_announcer_speech(text: str) -> bytes:
    """Generate speech for announcer (no phone filter).

    Returns whatever ``generate_speech`` returns for ANNOUNCER_VOICE_ID.
    """
    pcm_bytes = await generate_speech(
        text,
        voice_id=ANNOUNCER_VOICE_ID,
        apply_filter=False,
    )
    return pcm_bytes