"""TTS service with ElevenLabs, F5-TTS, MLX Kokoro, StyleTTS2, VITS, and Bark support""" import os import numpy as np from scipy.signal import butter, filtfilt from pathlib import Path import tempfile import torch from ..config import settings from .cost_tracker import cost_tracker # Patch torch.load for compatibility with PyTorch 2.6+ _original_torch_load = torch.load def _patched_torch_load(*args, **kwargs): kwargs['weights_only'] = False return _original_torch_load(*args, **kwargs) torch.load = _patched_torch_load # Global clients _elevenlabs_client = None _vits_tts = None _bark_loaded = False _kokoro_model = None _styletts2_model = None _f5tts_model = None _chattts_model = None _chattts_speakers = {} # Cache for speaker embeddings # Kokoro voice mapping - using highest-graded voices # Grades from https://huggingface.co/hexgrad/Kokoro-82M/blob/main/VOICES.md KOKORO_VOICES = { # Male voices (best available are C+ grade) "VR6AewLTigWG4xSOukaG": "am_fenrir", # Tony - deep/powerful (C+) "TxGEqnHWrfWFTfGW9XjX": "am_michael", # Rick - solid male voice (C+) "pNInz6obpgDQGcFmaJgB": "am_puck", # Dennis - anxious dad (C+) "ODq5zmih8GrVes37Dizd": "bm_george", # Earl - older/distinguished British (C) "IKne3meq5aSn9XLyUdCD": "bm_fable", # Marcus - young British (C) # Female voices (much better quality available) "jBpfuIE2acCO8z3wKNLl": "af_heart", # Jasmine - best quality (A) "EXAVITQu4vr4xnSDxMaL": "af_bella", # Megan - warm/friendly (A-) "21m00Tcm4TlvDq8ikWAM": "bf_emma", # Tanya - professional British (B-) "XB0fDUnXU5powFXDhCwa": "af_nicole", # Carla - Jersey mom (B-) "pFZP5JQG7iQjIQuC4Bku": "af_sarah", # Brenda - overthinker (C+) } # Speed adjustments per voice (1.0 = normal, lower = slower/more natural) # Slower speeds (0.85-0.95) generally sound more natural KOKORO_SPEEDS = { # Male voices - slower speeds help with C+ grade voices "VR6AewLTigWG4xSOukaG": 0.9, # Tony (am_fenrir) - deep voice, slower "TxGEqnHWrfWFTfGW9XjX": 0.92, # Rick (am_michael) - solid pace "pNInz6obpgDQGcFmaJgB": 0.95, # Dennis (am_puck) - anxious but not rushed "ODq5zmih8GrVes37Dizd": 0.85, # Earl (bm_george) - older, slower British "IKne3meq5aSn9XLyUdCD": 0.95, # Marcus (bm_fable) - young, natural # Female voices - A-grade voices can handle faster speeds "jBpfuIE2acCO8z3wKNLl": 0.95, # Jasmine (af_heart) - best voice, natural pace "EXAVITQu4vr4xnSDxMaL": 0.95, # Megan (af_bella) - warm "21m00Tcm4TlvDq8ikWAM": 0.9, # Tanya (bf_emma) - professional British "XB0fDUnXU5powFXDhCwa": 0.95, # Carla (af_nicole) - animated but clear "pFZP5JQG7iQjIQuC4Bku": 0.92, # Brenda (af_sarah) - overthinker, measured } DEFAULT_KOKORO_VOICE = "af_heart" DEFAULT_KOKORO_SPEED = 0.95 # VCTK speaker mapping - different voices for different callers VITS_SPEAKERS = { # Male voices "VR6AewLTigWG4xSOukaG": "p226", # Tony "TxGEqnHWrfWFTfGW9XjX": "p251", # Rick "pNInz6obpgDQGcFmaJgB": "p245", # Dennis "ODq5zmih8GrVes37Dizd": "p232", # Earl "IKne3meq5aSn9XLyUdCD": "p252", # Marcus # Female voices "jBpfuIE2acCO8z3wKNLl": "p225", # Jasmine "EXAVITQu4vr4xnSDxMaL": "p228", # Megan "21m00Tcm4TlvDq8ikWAM": "p229", # Tanya "XB0fDUnXU5powFXDhCwa": "p231", # Carla "pFZP5JQG7iQjIQuC4Bku": "p233", # Brenda } DEFAULT_VITS_SPEAKER = "p225" # Inworld voice mapping - maps ElevenLabs voice IDs to Inworld voices # Full voice list from API (English): Abby, Alex, Amina, Anjali, Arjun, Ashley, # Blake, Brian, Callum, Carter, Celeste, Chloe, Claire, Clive, Craig, Darlene, # Deborah, Dennis, Derek, Dominus, Edward, Elizabeth, Elliot, Ethan, Evan, Evelyn, # Gareth, Graham, Grant, Hades, Hamish, Hana, Hank, Jake, James, Jason, Jessica, # Julia, Kayla, Kelsey, Lauren, Liam, Loretta, Luna, Malcolm, Mark, Marlene, # Miranda, Mortimer, Nate, Oliver, Olivia, Pippa, Pixie, Priya, Ronald, Rupert, # Saanvi, Sarah, Sebastian, Serena, Shaun, Simon, Snik, Tessa, Theodore, Timothy, # Tyler, Veronica, Victor, Victoria, Vinny, Wendy INWORLD_VOICES = { # Original voice IDs "VR6AewLTigWG4xSOukaG": "Edward", # Tony - fast-talking, emphatic, streetwise "TxGEqnHWrfWFTfGW9XjX": "Shaun", # Rick - friendly, dynamic, conversational "pNInz6obpgDQGcFmaJgB": "Alex", # Dennis - energetic, expressive, mildly nasal "ODq5zmih8GrVes37Dizd": "Craig", # Earl - older British, refined, articulate "IKne3meq5aSn9XLyUdCD": "Timothy", # Marcus/Jerome - lively, upbeat American "jBpfuIE2acCO8z3wKNLl": "Hana", # Jasmine - bright, expressive young female "EXAVITQu4vr4xnSDxMaL": "Ashley", # Megan - warm, natural female "21m00Tcm4TlvDq8ikWAM": "Wendy", # Tanya - posh, middle-aged British "XB0fDUnXU5powFXDhCwa": "Sarah", # Carla - fast-talking, questioning tone "pFZP5JQG7iQjIQuC4Bku": "Deborah", # Brenda (original) - gentle, elegant # Regular caller voice IDs (backfilled) "onwK4e9ZLuTAKqWW03F9": "Ronald", # Bobby - repo man "FGY2WhTYpPnrIDTdsKH5": "Julia", # Carla (regular) - Jersey mom "CwhRBWXzGAHq8TQ4Fs17": "Mark", # Leon - male caller "SOYHLrjzK2X1ezoPC6cr": "Carter", # Carl - male caller "N2lVS1w4EtoT3dr4eOWO": "Clive", # Reggie - male caller "hpp4J3VqNfWAUOO0d1Us": "Olivia", # Brenda (regular) - ambulance driver "nPczCjzI2devNBz1zQrb": "Theodore", # Keith - male caller "JBFqnCBsd6RMkjVDRZzb": "Blake", # Andre - male caller "TX3LPaxmHKxFdv7VOQHJ": "Dennis", # Rick (regular) - male caller "cgSgspJ2msm6clMCkdW9": "Priya", # Megan (regular) - female caller } DEFAULT_INWORLD_VOICE = "Dennis" # Inworld voices that speak too slowly at default rate — bump them up # Range is 0.5 to 1.5, where 1.0 is the voice's native speed INWORLD_SPEED_OVERRIDES = { "Wendy": 1.15, "Craig": 1.15, "Deborah": 1.15, "Sarah": 1.1, "Hana": 1.1, "Theodore": 1.15, "Blake": 1.1, "Priya": 1.1, } DEFAULT_INWORLD_SPEED = 1.1 # Slight bump for all voices # Voice profiles — perceptual dimensions for each Inworld voice. # Used by style-to-voice matching to pair caller personalities with fitting voices. # weight: vocal depth/richness (light, medium, heavy) # energy: default speaking animation (low, medium, high) # warmth: friendliness/openness in the voice (cool, neutral, warm) # age_feel: perceived speaker age (young, middle, mature) VOICE_PROFILES = { # --- Male voices --- # Known characterizations from INWORLD_VOICES mapping and usage "Alex": {"weight": "light", "energy": "high", "warmth": "warm", "age_feel": "young"}, # energetic, expressive, mildly nasal "Edward": {"weight": "medium", "energy": "high", "warmth": "neutral", "age_feel": "middle"}, # fast-talking, emphatic, streetwise "Shaun": {"weight": "medium", "energy": "high", "warmth": "warm", "age_feel": "middle"}, # friendly, dynamic, conversational "Craig": {"weight": "heavy", "energy": "low", "warmth": "cool", "age_feel": "mature"}, # older British, refined, articulate "Timothy": {"weight": "light", "energy": "high", "warmth": "warm", "age_feel": "young"}, # lively, upbeat American "Dennis": {"weight": "medium", "energy": "high", "warmth": "warm", "age_feel": "middle"}, # energetic, default voice "Ronald": {"weight": "heavy", "energy": "medium", "warmth": "neutral", "age_feel": "mature"}, # gruff, authoritative "Theodore": {"weight": "heavy", "energy": "low", "warmth": "warm", "age_feel": "mature"}, # slow, deliberate "Blake": {"weight": "medium", "energy": "medium", "warmth": "neutral", "age_feel": "middle"}, "Carter": {"weight": "medium", "energy": "medium", "warmth": "neutral", "age_feel": "middle"}, "Clive": {"weight": "heavy", "energy": "low", "warmth": "cool", "age_feel": "mature"}, "Mark": {"weight": "medium", "energy": "medium", "warmth": "neutral", "age_feel": "middle"}, "Sebastian": {"weight": "medium", "energy": "medium", "warmth": "cool", "age_feel": "middle"}, # used by Silas (cult leader) & Chip "Elliot": {"weight": "light", "energy": "medium", "warmth": "warm", "age_feel": "young"}, # used by Otis (comedian) # Remaining male pool voices "Arjun": {"weight": "medium", "energy": "medium", "warmth": "warm", "age_feel": "middle"}, "Brian": {"weight": "medium", "energy": "medium", "warmth": "warm", "age_feel": "middle"}, "Callum": {"weight": "medium", "energy": "medium", "warmth": "neutral", "age_feel": "young"}, "Derek": {"weight": "medium", "energy": "medium", "warmth": "neutral", "age_feel": "middle"}, "Ethan": {"weight": "medium", "energy": "medium", "warmth": "warm", "age_feel": "young"}, "Evan": {"weight": "light", "energy": "medium", "warmth": "neutral", "age_feel": "young"}, "Gareth": {"weight": "medium", "energy": "medium", "warmth": "neutral", "age_feel": "middle"}, "Graham": {"weight": "heavy", "energy": "low", "warmth": "neutral", "age_feel": "mature"}, "Grant": {"weight": "medium", "energy": "medium", "warmth": "neutral", "age_feel": "middle"}, "Hades": {"weight": "heavy", "energy": "low", "warmth": "cool", "age_feel": "mature"}, "Hamish": {"weight": "medium", "energy": "medium", "warmth": "warm", "age_feel": "middle"}, "Hank": {"weight": "heavy", "energy": "medium", "warmth": "warm", "age_feel": "mature"}, "Jake": {"weight": "medium", "energy": "high", "warmth": "warm", "age_feel": "young"}, "James": {"weight": "medium", "energy": "medium", "warmth": "neutral", "age_feel": "middle"}, "Jason": {"weight": "medium", "energy": "medium", "warmth": "neutral", "age_feel": "middle"}, "Liam": {"weight": "medium", "energy": "high", "warmth": "warm", "age_feel": "young"}, "Malcolm": {"weight": "heavy", "energy": "low", "warmth": "cool", "age_feel": "mature"}, "Mortimer": {"weight": "heavy", "energy": "low", "warmth": "cool", "age_feel": "mature"}, "Nate": {"weight": "light", "energy": "high", "warmth": "warm", "age_feel": "young"}, "Oliver": {"weight": "medium", "energy": "medium", "warmth": "warm", "age_feel": "middle"}, "Rupert": {"weight": "medium", "energy": "low", "warmth": "cool", "age_feel": "mature"}, "Simon": {"weight": "medium", "energy": "medium", "warmth": "neutral", "age_feel": "middle"}, "Tyler": {"weight": "light", "energy": "high", "warmth": "neutral", "age_feel": "young"}, "Victor": {"weight": "heavy", "energy": "medium", "warmth": "cool", "age_feel": "mature"}, "Vinny": {"weight": "medium", "energy": "high", "warmth": "warm", "age_feel": "middle"}, # --- Female voices --- # Known characterizations "Hana": {"weight": "light", "energy": "high", "warmth": "warm", "age_feel": "young"}, # bright, expressive young "Ashley": {"weight": "medium", "energy": "medium", "warmth": "warm", "age_feel": "middle"}, # warm, natural "Wendy": {"weight": "medium", "energy": "low", "warmth": "cool", "age_feel": "mature"}, # posh, middle-aged British "Sarah": {"weight": "light", "energy": "high", "warmth": "neutral", "age_feel": "middle"}, # fast-talking, questioning "Deborah": {"weight": "medium", "energy": "low", "warmth": "warm", "age_feel": "mature"}, # gentle, elegant "Olivia": {"weight": "medium", "energy": "medium", "warmth": "warm", "age_feel": "middle"}, "Julia": {"weight": "medium", "energy": "medium", "warmth": "neutral", "age_feel": "middle"}, # used by Angie (deadpan) "Priya": {"weight": "light", "energy": "medium", "warmth": "warm", "age_feel": "young"}, "Amina": {"weight": "medium", "energy": "medium", "warmth": "warm", "age_feel": "middle"}, # used by Charlene (bragger) "Tessa": {"weight": "medium", "energy": "medium", "warmth": "warm", "age_feel": "middle"}, # used by Lucille "Kelsey": {"weight": "light", "energy": "medium", "warmth": "neutral", "age_feel": "young"}, # used by Maxine (quiet/nervous) # Remaining female pool voices "Anjali": {"weight": "light", "energy": "medium", "warmth": "warm", "age_feel": "young"}, "Celeste": {"weight": "light", "energy": "medium", "warmth": "cool", "age_feel": "middle"}, "Chloe": {"weight": "light", "energy": "high", "warmth": "warm", "age_feel": "young"}, "Claire": {"weight": "medium", "energy": "medium", "warmth": "neutral", "age_feel": "middle"}, "Darlene": {"weight": "medium", "energy": "medium", "warmth": "warm", "age_feel": "mature"}, "Elizabeth": {"weight": "medium", "energy": "medium", "warmth": "cool", "age_feel": "mature"}, "Jessica": {"weight": "medium", "energy": "medium", "warmth": "warm", "age_feel": "middle"}, "Kayla": {"weight": "light", "energy": "high", "warmth": "warm", "age_feel": "young"}, "Lauren": {"weight": "medium", "energy": "medium", "warmth": "neutral", "age_feel": "middle"}, "Loretta": {"weight": "medium", "energy": "low", "warmth": "warm", "age_feel": "mature"}, "Luna": {"weight": "light", "energy": "medium", "warmth": "warm", "age_feel": "young"}, "Marlene": {"weight": "medium", "energy": "low", "warmth": "neutral", "age_feel": "mature"}, "Miranda": {"weight": "medium", "energy": "medium", "warmth": "cool", "age_feel": "middle"}, "Pippa": {"weight": "light", "energy": "high", "warmth": "warm", "age_feel": "young"}, "Saanvi": {"weight": "light", "energy": "medium", "warmth": "warm", "age_feel": "young"}, "Serena": {"weight": "medium", "energy": "medium", "warmth": "cool", "age_feel": "middle"}, "Veronica": {"weight": "medium", "energy": "medium", "warmth": "cool", "age_feel": "middle"}, "Victoria": {"weight": "medium", "energy": "low", "warmth": "cool", "age_feel": "mature"}, } def preprocess_text_for_kokoro(text: str) -> str: """ Preprocess text to improve Kokoro prosody and naturalness. - Adds slight pauses via punctuation - Handles contractions and abbreviations - Normalizes spacing """ import re # Normalize whitespace text = ' '.join(text.split()) # Add comma pauses after common transition words (if no punctuation follows) transitions = [ r'\b(Well)\s+(?=[A-Za-z])', r'\b(So)\s+(?=[A-Za-z])', r'\b(Now)\s+(?=[A-Za-z])', r'\b(Look)\s+(?=[A-Za-z])', r'\b(See)\s+(?=[A-Za-z])', r'\b(Anyway)\s+(?=[A-Za-z])', r'\b(Actually)\s+(?=[A-Za-z])', r'\b(Honestly)\s+(?=[A-Za-z])', r'\b(Basically)\s+(?=[A-Za-z])', ] for pattern in transitions: text = re.sub(pattern, r'\1, ', text) # Add pause after "I mean" at start of sentence text = re.sub(r'^(I mean)\s+', r'\1, ', text) text = re.sub(r'\.\s+(I mean)\s+', r'. \1, ', text) # Expand common abbreviations for better pronunciation abbreviations = { r'\bDr\.': 'Doctor', r'\bMr\.': 'Mister', r'\bMrs\.': 'Missus', r'\bMs\.': 'Miss', r'\bSt\.': 'Street', r'\bAve\.': 'Avenue', r'\betc\.': 'etcetera', r'\bvs\.': 'versus', r'\bw/': 'with', r'\bw/o': 'without', } for abbr, expansion in abbreviations.items(): text = re.sub(abbr, expansion, text, flags=re.IGNORECASE) # Add breath pause (comma) before conjunctions in long sentences text = re.sub(r'(\w{20,})\s+(and|but|or)\s+', r'\1, \2 ', text) # Ensure proper spacing after punctuation text = re.sub(r'([.!?])\s*([A-Z])', r'\1 \2', text) return text # StyleTTS2 reference voice files (place .wav files in voices/ directory for voice cloning) # Maps voice_id to reference audio filename - if file doesn't exist, uses default voice STYLETTS2_VOICES = { # Male voices "VR6AewLTigWG4xSOukaG": "tony.wav", # Tony "TxGEqnHWrfWFTfGW9XjX": "rick.wav", # Rick "pNInz6obpgDQGcFmaJgB": "dennis.wav", # Dennis "ODq5zmih8GrVes37Dizd": "earl.wav", # Earl "IKne3meq5aSn9XLyUdCD": "marcus.wav", # Marcus # Female voices "jBpfuIE2acCO8z3wKNLl": "jasmine.wav", # Jasmine "EXAVITQu4vr4xnSDxMaL": "megan.wav", # Megan "21m00Tcm4TlvDq8ikWAM": "tanya.wav", # Tanya "XB0fDUnXU5powFXDhCwa": "carla.wav", # Carla "pFZP5JQG7iQjIQuC4Bku": "brenda.wav", # Brenda } # F5-TTS reference voices (same files as StyleTTS2, reuses voices/ directory) # Requires: mono, 24kHz, 5-10 seconds, with transcript in .txt file F5TTS_VOICES = STYLETTS2_VOICES.copy() # ChatTTS speaker seeds - different seeds produce different voices # These are used to generate consistent speaker embeddings CHATTTS_SEEDS = { # Male voices "VR6AewLTigWG4xSOukaG": 42, # Tony - deep voice "TxGEqnHWrfWFTfGW9XjX": 123, # Rick "pNInz6obpgDQGcFmaJgB": 456, # Dennis "ODq5zmih8GrVes37Dizd": 789, # Earl "IKne3meq5aSn9XLyUdCD": 1011, # Marcus # Female voices "jBpfuIE2acCO8z3wKNLl": 2024, # Jasmine "EXAVITQu4vr4xnSDxMaL": 3033, # Megan "21m00Tcm4TlvDq8ikWAM": 4042, # Tanya "XB0fDUnXU5powFXDhCwa": 5051, # Carla "pFZP5JQG7iQjIQuC4Bku": 6060, # Brenda } DEFAULT_CHATTTS_SEED = 42 def get_elevenlabs_client(): """Get or create ElevenLabs client""" global _elevenlabs_client if _elevenlabs_client is None: from elevenlabs.client import ElevenLabs _elevenlabs_client = ElevenLabs(api_key=settings.elevenlabs_api_key) return _elevenlabs_client def get_vits_tts(): """Get or create VITS VCTK TTS instance""" global _vits_tts if _vits_tts is None: from TTS.api import TTS _vits_tts = TTS("tts_models/en/vctk/vits") return _vits_tts def get_kokoro_model(): """Get or create Kokoro MLX model""" global _kokoro_model if _kokoro_model is None: from mlx_audio.tts.utils import load_model _kokoro_model = load_model(model_path='mlx-community/Kokoro-82M-bf16') print("Kokoro MLX model loaded") return _kokoro_model def ensure_bark_loaded(): """Ensure Bark models are loaded on GPU""" global _bark_loaded if not _bark_loaded: os.environ['SUNO_USE_SMALL_MODELS'] = '1' # Force Bark to use MPS (Apple Silicon GPU) if torch.backends.mps.is_available(): os.environ['SUNO_OFFLOAD_CPU'] = '0' os.environ['SUNO_ENABLE_MPS'] = '1' from bark import preload_models preload_models() _bark_loaded = True print(f"Bark loaded on device: {'MPS' if torch.backends.mps.is_available() else 'CPU'}") def get_styletts2_model(): """Get or create StyleTTS2 model""" global _styletts2_model if _styletts2_model is None: from styletts2 import tts _styletts2_model = tts.StyleTTS2() print("StyleTTS2 model loaded") return _styletts2_model def get_f5tts_generate(): """Get F5-TTS generate function (lazy load)""" global _f5tts_model if _f5tts_model is None: # Disable tqdm progress bars to avoid BrokenPipeError in server context import os os.environ['HF_HUB_DISABLE_PROGRESS_BARS'] = '1' os.environ['TQDM_DISABLE'] = '1' from f5_tts_mlx.generate import generate _f5tts_model = generate print("F5-TTS MLX loaded") return _f5tts_model def get_chattts_model(): """Get or create ChatTTS model""" global _chattts_model if _chattts_model is None: import ChatTTS _chattts_model = ChatTTS.Chat() _chattts_model.load(compile=False) print("ChatTTS model loaded") return _chattts_model def get_chattts_speaker(voice_id: str): """Get or create a consistent speaker embedding for a voice""" global _chattts_speakers if voice_id not in _chattts_speakers: chat = get_chattts_model() seed = CHATTTS_SEEDS.get(voice_id, DEFAULT_CHATTTS_SEED) # Set seed for reproducible speaker torch.manual_seed(seed) _chattts_speakers[voice_id] = chat.sample_random_speaker() print(f"[ChatTTS] Created speaker for voice {voice_id} with seed {seed}") return _chattts_speakers[voice_id] def phone_filter(audio: np.ndarray, sample_rate: int = 24000, quality: str = "normal") -> np.ndarray: """Apply phone filter with variable quality.""" audio = audio.flatten() presets = { "good": (200, 7000, 1.0, 0.0), "normal": (300, 3400, 1.5, 0.005), "bad": (400, 2800, 2.0, 0.015), "terrible": (500, 2200, 2.5, 0.03), } low_hz, high_hz, distortion, noise = presets.get(quality, presets["normal"]) low = low_hz / (sample_rate / 2) high = high_hz / (sample_rate / 2) b, a = butter(4, [low, high], btype='band') filtered = filtfilt(b, a, audio) filtered = np.tanh(filtered * distortion) * 0.8 if noise > 0: static = np.random.normal(0, noise, len(filtered)).astype(np.float32) static_envelope = np.random.random(len(filtered) // 1000 + 1) static_envelope = np.repeat(static_envelope, 1000)[:len(filtered)] static *= (static_envelope > 0.7).astype(np.float32) filtered = filtered + static return filtered.astype(np.float32) async def generate_speech_elevenlabs(text: str, voice_id: str) -> tuple[np.ndarray, int]: """Generate speech using ElevenLabs""" client = get_elevenlabs_client() audio_gen = client.text_to_speech.convert( voice_id=voice_id, text=text, model_id="eleven_v3", output_format="pcm_24000" ) audio_bytes = b"".join(audio_gen) audio = np.frombuffer(audio_bytes, dtype=np.int16).astype(np.float32) / 32768.0 return audio, 24000 async def generate_speech_kokoro(text: str, voice_id: str) -> tuple[np.ndarray, int]: """Generate speech using MLX Kokoro (fast, good quality, Apple Silicon optimized)""" import librosa from mlx_audio.tts.generate import generate_audio model = get_kokoro_model() voice = KOKORO_VOICES.get(voice_id, DEFAULT_KOKORO_VOICE) speed = KOKORO_SPEEDS.get(voice_id, DEFAULT_KOKORO_SPEED) # Preprocess text for better prosody text = preprocess_text_for_kokoro(text) # Determine lang_code from voice prefix (a=American, b=British) lang_code = 'b' if voice.startswith('b') else 'a' with tempfile.TemporaryDirectory() as tmpdir: generate_audio( text, model=model, voice=voice, speed=speed, lang_code=lang_code, output_path=tmpdir, file_prefix='tts', verbose=False ) # Read the generated audio file audio_file = Path(tmpdir) / 'tts_000.wav' if not audio_file.exists(): raise RuntimeError("Kokoro failed to generate audio") audio, sr = librosa.load(str(audio_file), sr=None, mono=True) # Resample to 24kHz if needed if sr != 24000: audio = librosa.resample(audio, orig_sr=sr, target_sr=24000) return audio.astype(np.float32), 24000 async def generate_speech_vits(text: str, voice_id: str) -> tuple[np.ndarray, int]: """Generate speech using VITS VCTK (fast, multiple speakers)""" import librosa tts = get_vits_tts() speaker = VITS_SPEAKERS.get(voice_id, DEFAULT_VITS_SPEAKER) with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp: tmp_path = tmp.name try: tts.tts_to_file(text=text, file_path=tmp_path, speaker=speaker) audio, sr = librosa.load(tmp_path, sr=None, mono=True) if sr != 24000: audio = librosa.resample(audio, orig_sr=sr, target_sr=24000) return audio.astype(np.float32), 24000 finally: Path(tmp_path).unlink(missing_ok=True) async def generate_speech_bark(text: str, voice_id: str) -> tuple[np.ndarray, int]: """Generate speech using Bark (slow but expressive, supports emotes like [laughs])""" import librosa from bark import SAMPLE_RATE, generate_audio ensure_bark_loaded() # Generate audio with Bark audio = generate_audio(text) # Normalize to prevent clipping (Bark can exceed [-1, 1]) max_val = np.abs(audio).max() if max_val > 0.95: audio = audio * (0.95 / max_val) # Resample to 24kHz if needed if SAMPLE_RATE != 24000: audio = librosa.resample(audio, orig_sr=SAMPLE_RATE, target_sr=24000) return audio.astype(np.float32), 24000 async def generate_speech_styletts2(text: str, voice_id: str) -> tuple[np.ndarray, int]: """Generate speech using StyleTTS2 (high quality, supports voice cloning)""" import librosa model = get_styletts2_model() # Check for reference voice file voice_file = STYLETTS2_VOICES.get(voice_id) voice_path = None if voice_file: voice_path = settings.base_dir / "voices" / voice_file if not voice_path.exists(): voice_path = None # Use default voice if file doesn't exist # Generate audio with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp: tmp_path = tmp.name try: if voice_path: print(f"[StyleTTS2] Using voice clone: {voice_path}") audio = model.inference( text, target_voice_path=str(voice_path), output_wav_file=tmp_path, output_sample_rate=24000, diffusion_steps=5, # Balance quality/speed alpha=0.3, # More voice-like than text-like beta=0.7, # Good prosody ) else: print("[StyleTTS2] Using default voice") audio = model.inference( text, output_wav_file=tmp_path, output_sample_rate=24000, diffusion_steps=5, ) # Load the generated audio audio, sr = librosa.load(tmp_path, sr=None, mono=True) if sr != 24000: audio = librosa.resample(audio, orig_sr=sr, target_sr=24000) return audio.astype(np.float32), 24000 finally: Path(tmp_path).unlink(missing_ok=True) async def generate_speech_f5tts(text: str, voice_id: str) -> tuple[np.ndarray, int]: """Generate speech using F5-TTS MLX (very natural, supports voice cloning)""" import librosa generate = get_f5tts_generate() # Check for reference voice file and transcript voice_file = F5TTS_VOICES.get(voice_id) ref_audio_path = None ref_text = None if voice_file: voice_path = settings.base_dir / "voices" / voice_file txt_path = voice_path.with_suffix('.txt') if voice_path.exists() and txt_path.exists(): ref_audio_path = str(voice_path) ref_text = txt_path.read_text().strip() print(f"[F5-TTS] Using voice clone: {voice_path}") if not ref_audio_path: print("[F5-TTS] Using default voice") # Generate audio to temp file with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp: tmp_path = tmp.name try: generate( generation_text=text, ref_audio_path=ref_audio_path, ref_audio_text=ref_text, steps=8, speed=1.0, output_path=tmp_path, ) # Load the generated audio audio, sr = librosa.load(tmp_path, sr=None, mono=True) # Resample to 24kHz if needed if sr != 24000: audio = librosa.resample(audio, orig_sr=sr, target_sr=24000) return audio.astype(np.float32), 24000 finally: Path(tmp_path).unlink(missing_ok=True) async def generate_speech_chattts(text: str, voice_id: str) -> tuple[np.ndarray, int]: """Generate speech using ChatTTS (natural conversational speech, multiple speakers)""" import ChatTTS chat = get_chattts_model() # Ensure text is not empty and has reasonable content text = text.strip() if not text: text = "Hello." print(f"[ChatTTS] Generating speech for: {text[:50]}...") # Get consistent speaker for this voice seed = CHATTTS_SEEDS.get(voice_id, DEFAULT_CHATTTS_SEED) torch.manual_seed(seed) # Configure inference parameters params_infer_code = ChatTTS.Chat.InferCodeParams( temperature=0.3, top_P=0.7, top_K=20, ) # Generate audio (skip text refinement to avoid narrow() error with this version) wavs = chat.infer( [text], params_infer_code=params_infer_code, skip_refine_text=True, ) if wavs is None or len(wavs) == 0: raise RuntimeError("ChatTTS failed to generate audio") audio = wavs[0] # Handle different output shapes if audio.ndim > 1: audio = audio.squeeze() # Normalize max_val = np.abs(audio).max() if max_val > 0.95: audio = audio * (0.95 / max_val) return audio.astype(np.float32), 24000 _EXCITED_KEYWORDS = {"excited", "amazing", "incredible", "can't believe", "so happy", "hell yeah", "fired up", "furious", "pissed", "angry", "what the hell", "are you kidding", "unbelievable", "!!", "oh my god"} _SAD_KEYWORDS = {"sad", "miss them", "passed away", "funeral", "crying", "broke my heart", "can't stop thinking", "lonely", "depressed", "sorry", "regret", "wish I could", "never got to", "lost", "grief"} def _detect_speech_rate(text: str, base_speed: float) -> float: """Adjust speech rate based on emotional content of the text. Returns a speed value clamped to Inworld's 0.5-1.5 range.""" text_lower = text.lower() excited = sum(1 for kw in _EXCITED_KEYWORDS if kw in text_lower) sad = sum(1 for kw in _SAD_KEYWORDS if kw in text_lower) if excited >= 2: return min(1.5, base_speed + 0.15) elif excited >= 1: return min(1.5, base_speed + 0.08) elif sad >= 2: return max(0.5, base_speed - 0.2) elif sad >= 1: return max(0.5, base_speed - 0.1) return base_speed async def generate_speech_inworld(text: str, voice_id: str) -> tuple[np.ndarray, int]: """Generate speech using Inworld TTS API (high quality, natural voices)""" import httpx import base64 import librosa # voice_id is now the Inworld voice name directly (e.g. "Edward") # Fall back to legacy mapping if it's an ElevenLabs ID if voice_id in INWORLD_VOICES: voice = INWORLD_VOICES[voice_id] else: voice = voice_id api_key = settings.inworld_api_key if not api_key: raise RuntimeError("INWORLD_API_KEY not set in environment") base_speed = INWORLD_SPEED_OVERRIDES.get(voice, DEFAULT_INWORLD_SPEED) speed = _detect_speech_rate(text, base_speed) print(f"[Inworld TTS] Voice: {voice}, Speed: {speed:.2f} (base {base_speed}), Text: {text[:50]}...") url = "https://api.inworld.ai/tts/v1/voice" headers = { "Content-Type": "application/json", "Authorization": f"Basic {api_key}", } payload = { "text": text, "voiceId": voice, "modelId": "inworld-tts-1.5-max", "audioConfig": { "audioEncoding": "LINEAR16", "sampleRateHertz": 48000, "speakingRate": speed, }, } async with httpx.AsyncClient(timeout=12.0) as client: response = await client.post(url, json=payload, headers=headers) response.raise_for_status() data = response.json() # Decode base64 audio audio_b64 = data.get("audioContent") if not audio_b64: raise RuntimeError("Inworld TTS returned no audio content") audio_bytes = base64.b64decode(audio_b64) # Parse audio using soundfile (handles WAV, MP3, etc.) import soundfile as sf import io # soundfile can read WAV, FLAC, OGG, and with ffmpeg: MP3 # MP3 files start with ID3 tag or 0xff sync bytes try: audio, sr = sf.read(io.BytesIO(audio_bytes)) except Exception as e: print(f"[Inworld TTS] soundfile failed: {e}, trying raw PCM") # Fallback to raw PCM if len(audio_bytes) % 2 != 0: audio_bytes = audio_bytes[:-1] audio = np.frombuffer(audio_bytes, dtype=np.int16).astype(np.float32) / 32768.0 sr = 48000 # Resample to 24kHz to match other providers if sr != 24000: audio = librosa.resample(audio, orig_sr=sr, target_sr=24000) return audio.astype(np.float32), 24000 def pick_caller_tts_provider() -> str | None: """Randomly assign a TTS provider for a caller. Returns None to use the global default, or a specific provider name. ~70% inworld (default), ~20% kokoro, ~10% other available.""" import random roll = random.random() if roll < 0.70: return None # Use global default (typically inworld) elif roll < 0.90: return "kokoro" else: return random.choice(["kokoro", "f5tts", "chattts"]) _TTS_PROVIDERS = { "kokoro": lambda text, vid: generate_speech_kokoro(text, vid), "f5tts": lambda text, vid: generate_speech_f5tts(text, vid), "inworld": lambda text, vid: generate_speech_inworld(text, vid), "chattts": lambda text, vid: generate_speech_chattts(text, vid), "styletts2": lambda text, vid: generate_speech_styletts2(text, vid), "bark": lambda text, vid: generate_speech_bark(text, vid), "vits": lambda text, vid: generate_speech_vits(text, vid), "elevenlabs": lambda text, vid: generate_speech_elevenlabs(text, vid), } TTS_MAX_RETRIES = 2 TTS_RETRY_DELAYS = [0.5, 1.0] # seconds between retries async def generate_speech( text: str, voice_id: str, phone_quality: str = "normal", apply_filter: bool = True, provider_override: str = None ) -> bytes: """ Generate speech from text with automatic retry on failure. Args: text: Text to speak voice_id: ElevenLabs voice ID (mapped to local voice if using local TTS) phone_quality: Quality of phone filter ("none" to disable) apply_filter: Whether to apply phone filter provider_override: Override the global TTS provider for this call Returns: Raw PCM audio bytes (16-bit signed int, 24kHz) """ import asyncio provider = provider_override or settings.tts_provider print(f"[TTS] Provider: {provider}{' (override)' if provider_override else ''}, Text: {text[:50]}...") gen_fn = _TTS_PROVIDERS.get(provider) if not gen_fn: raise ValueError(f"Unknown TTS provider: {provider}") last_error = None try: async with asyncio.timeout(20): for attempt in range(TTS_MAX_RETRIES): try: audio, sample_rate = await gen_fn(text, voice_id) cost_tracker.record_tts_call(provider, voice_id, len(text)) if attempt > 0: print(f"[TTS] Succeeded on retry {attempt}") break except TimeoutError: raise # Let asyncio.timeout propagate except Exception as e: last_error = e if attempt < TTS_MAX_RETRIES - 1: delay = TTS_RETRY_DELAYS[attempt] print(f"[TTS] {provider} attempt {attempt + 1} failed: {e} — retrying in {delay}s...") await asyncio.sleep(delay) else: print(f"[TTS] {provider} failed after {TTS_MAX_RETRIES} attempts: {e}") raise except TimeoutError: print(f"[TTS] Overall timeout (20s) for {provider}") raise RuntimeError(f"TTS generation timed out after 20s") # Apply phone filter if requested # Skip filter for Bark - it already has rough audio quality if apply_filter and phone_quality not in ("none", "studio") and provider != "bark": audio = phone_filter(audio, sample_rate, phone_quality) # Convert to bytes audio_int16 = (audio * 32768).clip(-32768, 32767).astype(np.int16) return audio_int16.tobytes() # Voice IDs for cohost and announcer COHOST_VOICE_ID = "nPczCjzI2devNBz1zQrb" ANNOUNCER_VOICE_ID = "ErXwobaYiN019PkySvjV" async def generate_cohost_speech(text: str) -> bytes: """Generate speech for cohost Bobby (no phone filter)""" return await generate_speech(text, COHOST_VOICE_ID, apply_filter=False) async def generate_announcer_speech(text: str) -> bytes: """Generate speech for announcer (no phone filter)""" return await generate_speech(text, ANNOUNCER_VOICE_ID, apply_filter=False)