"""TTS service with ElevenLabs, F5-TTS, MLX Kokoro, StyleTTS2, VITS, and Bark support"""

import os
import numpy as np
from scipy.signal import butter, filtfilt
from pathlib import Path
import tempfile
import torch

from ..config import settings
from .cost_tracker import cost_tracker

# Compatibility shim for PyTorch 2.6+: every torch.load call made after this
# module is imported runs with weights_only=False, regardless of what the
# caller passed.
# NOTE(security): weights_only=False allows arbitrary pickled objects to be
# deserialized, so only load checkpoints from trusted sources.
_original_torch_load = torch.load


def _patched_torch_load(*args, **kwargs):
    """Forward to the real torch.load with weights_only forced to False."""
    forced_kwargs = {**kwargs, 'weights_only': False}
    return _original_torch_load(*args, **forced_kwargs)


torch.load = _patched_torch_load

# Global clients
# Module-level singletons, all starting out unset. Each one is filled in on
# first use by the matching get_*/ensure_* function further down this file,
# so importing this module does not load any model by itself.
_elevenlabs_client = None   # set by get_elevenlabs_client()
_vits_tts = None            # set by get_vits_tts()
_bark_loaded = False        # flag flipped by ensure_bark_loaded()
_kokoro_model = None        # set by get_kokoro_model()
_styletts2_model = None     # set by get_styletts2_model()
_f5tts_model = None         # holds the F5-TTS generate function, set by get_f5tts_generate()
_chattts_model = None       # set by get_chattts_model()
_chattts_speakers = {}  # Cache for speaker embeddings, keyed by voice_id (see get_chattts_speaker)

# Kokoro voice mapping - using highest-graded voices
# Grades from https://huggingface.co/hexgrad/Kokoro-82M/blob/main/VOICES.md
KOKORO_VOICES = {
    # Male voices (best available are C+ grade)
    "VR6AewLTigWG4xSOukaG": "am_fenrir",  # Tony - deep/powerful (C+)
    "TxGEqnHWrfWFTfGW9XjX": "am_michael", # Rick - solid male voice (C+)
    "pNInz6obpgDQGcFmaJgB": "am_puck",    # Dennis - anxious dad (C+)
    "ODq5zmih8GrVes37Dizd": "bm_george",  # Earl - older/distinguished British (C)
    "IKne3meq5aSn9XLyUdCD": "bm_fable",   # Marcus - young British (C)
    # Female voices (much better quality available)
    "jBpfuIE2acCO8z3wKNLl": "af_heart",   # Jasmine - best quality (A)
    "EXAVITQu4vr4xnSDxMaL": "af_bella",   # Megan - warm/friendly (A-)
    "21m00Tcm4TlvDq8ikWAM": "bf_emma",    # Tanya - professional British (B-)
    "XB0fDUnXU5powFXDhCwa": "af_nicole",  # Carla - Jersey mom (B-)
    "pFZP5JQG7iQjIQuC4Bku": "af_sarah",   # Brenda - overthinker (C+)
}

# Speed adjustments per voice (1.0 = normal, lower = slower/more natural)
# Slower speeds (0.85-0.95) generally sound more natural
KOKORO_SPEEDS = {
    # Male voices - slower speeds help with C+ grade voices
    "VR6AewLTigWG4xSOukaG": 0.9,   # Tony (am_fenrir) - deep voice, slower
    "TxGEqnHWrfWFTfGW9XjX": 0.92,  # Rick (am_michael) - solid pace
    "pNInz6obpgDQGcFmaJgB": 0.95,  # Dennis (am_puck) - anxious but not rushed
    "ODq5zmih8GrVes37Dizd": 0.85,  # Earl (bm_george) - older, slower British
    "IKne3meq5aSn9XLyUdCD": 0.95,  # Marcus (bm_fable) - young, natural
    # Female voices - A-grade voices can handle faster speeds
    "jBpfuIE2acCO8z3wKNLl": 0.95,  # Jasmine (af_heart) - best voice, natural pace
    "EXAVITQu4vr4xnSDxMaL": 0.95,  # Megan (af_bella) - warm
    "21m00Tcm4TlvDq8ikWAM": 0.9,   # Tanya (bf_emma) - professional British
    "XB0fDUnXU5powFXDhCwa": 0.95,  # Carla (af_nicole) - animated but clear
    "pFZP5JQG7iQjIQuC4Bku": 0.92,  # Brenda (af_sarah) - overthinker, measured
}

DEFAULT_KOKORO_VOICE = "af_heart"
DEFAULT_KOKORO_SPEED = 0.95

# VCTK speaker mapping - different voices for different callers
VITS_SPEAKERS = {
    # Male voices
    "VR6AewLTigWG4xSOukaG": "p226",  # Tony
    "TxGEqnHWrfWFTfGW9XjX": "p251",  # Rick
    "pNInz6obpgDQGcFmaJgB": "p245",  # Dennis
    "ODq5zmih8GrVes37Dizd": "p232",  # Earl
    "IKne3meq5aSn9XLyUdCD": "p252",  # Marcus
    # Female voices
    "jBpfuIE2acCO8z3wKNLl": "p225",  # Jasmine
    "EXAVITQu4vr4xnSDxMaL": "p228",  # Megan
    "21m00Tcm4TlvDq8ikWAM": "p229",  # Tanya
    "XB0fDUnXU5powFXDhCwa": "p231",  # Carla
    "pFZP5JQG7iQjIQuC4Bku": "p233",  # Brenda
}

DEFAULT_VITS_SPEAKER = "p225"

# Inworld voice mapping - maps ElevenLabs voice IDs to Inworld voices
# Full voice list from API (English): Abby, Alex, Amina, Anjali, Arjun, Ashley,
# Blake, Brian, Callum, Carter, Celeste, Chloe, Claire, Clive, Craig, Darlene,
# Deborah, Dennis, Derek, Dominus, Edward, Elizabeth, Elliot, Ethan, Evan, Evelyn,
# Gareth, Graham, Grant, Hades, Hamish, Hana, Hank, Jake, James, Jason, Jessica,
# Julia, Kayla, Kelsey, Lauren, Liam, Loretta, Luna, Malcolm, Mark, Marlene,
# Miranda, Mortimer, Nate, Oliver, Olivia, Pippa, Pixie, Priya, Ronald, Rupert,
# Saanvi, Sarah, Sebastian, Serena, Shaun, Simon, Snik, Tessa, Theodore, Timothy,
# Tyler, Veronica, Victor, Victoria, Vinny, Wendy
INWORLD_VOICES = {
    # Original voice IDs
    "VR6AewLTigWG4xSOukaG": "Edward",    # Tony - fast-talking, emphatic, streetwise
    "TxGEqnHWrfWFTfGW9XjX": "Shaun",     # Rick - friendly, dynamic, conversational
    "pNInz6obpgDQGcFmaJgB": "Alex",      # Dennis - energetic, expressive, mildly nasal
    "ODq5zmih8GrVes37Dizd": "Craig",     # Earl - older British, refined, articulate
    "IKne3meq5aSn9XLyUdCD": "Timothy",   # Marcus/Jerome - lively, upbeat American
    "jBpfuIE2acCO8z3wKNLl": "Hana",      # Jasmine - bright, expressive young female
    "EXAVITQu4vr4xnSDxMaL": "Ashley",    # Megan - warm, natural female
    "21m00Tcm4TlvDq8ikWAM": "Wendy",     # Tanya - posh, middle-aged British
    "XB0fDUnXU5powFXDhCwa": "Sarah",     # Carla - fast-talking, questioning tone
    "pFZP5JQG7iQjIQuC4Bku": "Deborah",   # Brenda (original) - gentle, elegant
    # Regular caller voice IDs (backfilled)
    "onwK4e9ZLuTAKqWW03F9": "Ronald",    # Bobby - repo man
    "FGY2WhTYpPnrIDTdsKH5": "Julia",     # Carla (regular) - Jersey mom
    "CwhRBWXzGAHq8TQ4Fs17": "Mark",      # Leon - male caller
    "SOYHLrjzK2X1ezoPC6cr": "Carter",    # Carl - male caller
    "N2lVS1w4EtoT3dr4eOWO": "Clive",     # Reggie - male caller
    "hpp4J3VqNfWAUOO0d1Us": "Olivia",    # Brenda (regular) - ambulance driver
    "nPczCjzI2devNBz1zQrb": "Theodore",  # Keith - male caller
    "JBFqnCBsd6RMkjVDRZzb": "Blake",     # Andre - male caller
    "TX3LPaxmHKxFdv7VOQHJ": "Dennis",    # Rick (regular) - male caller
    "cgSgspJ2msm6clMCkdW9": "Priya",     # Megan (regular) - female caller
}
DEFAULT_INWORLD_VOICE = "Dennis"

# Inworld voices that speak too slowly at default rate — bump them up
# Range is 0.5 to 1.5, where 1.0 is the voice's native speed
INWORLD_SPEED_OVERRIDES = {
    "Wendy": 1.15,
    "Craig": 1.15,
    "Deborah": 1.15,
    "Sarah": 1.1,
    "Hana": 1.1,
    "Theodore": 1.15,
    "Blake": 1.1,
    "Priya": 1.1,
}
DEFAULT_INWORLD_SPEED = 1.1  # Slight bump for all voices

# Voice profiles — perceptual dimensions for each Inworld voice.
# Used by style-to-voice matching to pair caller personalities with fitting voices.
# weight: vocal depth/richness (light, medium, heavy)
# energy: default speaking animation (low, medium, high)
# warmth: friendliness/openness in the voice (cool, neutral, warm)
# age_feel: perceived speaker age (young, middle, mature)
VOICE_PROFILES = {
    # --- Male voices ---
    # Known characterizations from INWORLD_VOICES mapping and usage
    "Alex":      {"weight": "light",  "energy": "high",   "warmth": "warm",    "age_feel": "young"},    # energetic, expressive, mildly nasal
    "Edward":    {"weight": "medium", "energy": "high",   "warmth": "neutral", "age_feel": "middle"},   # fast-talking, emphatic, streetwise
    "Shaun":     {"weight": "medium", "energy": "high",   "warmth": "warm",    "age_feel": "middle"},   # friendly, dynamic, conversational
    "Craig":     {"weight": "heavy",  "energy": "low",    "warmth": "cool",    "age_feel": "mature"},   # older British, refined, articulate
    "Timothy":   {"weight": "light",  "energy": "high",   "warmth": "warm",    "age_feel": "young"},    # lively, upbeat American
    "Dennis":    {"weight": "medium", "energy": "high",   "warmth": "warm",    "age_feel": "middle"},   # energetic, default voice
    "Ronald":    {"weight": "heavy",  "energy": "medium", "warmth": "neutral", "age_feel": "mature"},   # gruff, authoritative
    "Theodore":  {"weight": "heavy",  "energy": "low",    "warmth": "warm",    "age_feel": "mature"},   # slow, deliberate
    "Blake":     {"weight": "medium", "energy": "medium", "warmth": "neutral", "age_feel": "middle"},
    "Carter":    {"weight": "medium", "energy": "medium", "warmth": "neutral", "age_feel": "middle"},
    "Clive":     {"weight": "heavy",  "energy": "low",    "warmth": "cool",    "age_feel": "mature"},
    "Mark":      {"weight": "medium", "energy": "medium", "warmth": "neutral", "age_feel": "middle"},
    "Sebastian": {"weight": "medium", "energy": "medium", "warmth": "cool",    "age_feel": "middle"},   # used by Silas (cult leader) & Chip
    "Elliot":    {"weight": "light",  "energy": "medium", "warmth": "warm",    "age_feel": "young"},    # used by Otis (comedian)
    # Remaining male pool voices
    "Arjun":     {"weight": "medium", "energy": "medium", "warmth": "warm",    "age_feel": "middle"},
    "Brian":     {"weight": "medium", "energy": "medium", "warmth": "warm",    "age_feel": "middle"},
    "Callum":    {"weight": "medium", "energy": "medium", "warmth": "neutral", "age_feel": "young"},
    "Derek":     {"weight": "medium", "energy": "medium", "warmth": "neutral", "age_feel": "middle"},
    "Ethan":     {"weight": "medium", "energy": "medium", "warmth": "warm",    "age_feel": "young"},
    "Evan":      {"weight": "light",  "energy": "medium", "warmth": "neutral", "age_feel": "young"},
    "Gareth":    {"weight": "medium", "energy": "medium", "warmth": "neutral", "age_feel": "middle"},
    "Graham":    {"weight": "heavy",  "energy": "low",    "warmth": "neutral", "age_feel": "mature"},
    "Grant":     {"weight": "medium", "energy": "medium", "warmth": "neutral", "age_feel": "middle"},
    "Hades":     {"weight": "heavy",  "energy": "low",    "warmth": "cool",    "age_feel": "mature"},
    "Hamish":    {"weight": "medium", "energy": "medium", "warmth": "warm",    "age_feel": "middle"},
    "Hank":      {"weight": "heavy",  "energy": "medium", "warmth": "warm",    "age_feel": "mature"},
    "Jake":      {"weight": "medium", "energy": "high",   "warmth": "warm",    "age_feel": "young"},
    "James":     {"weight": "medium", "energy": "medium", "warmth": "neutral", "age_feel": "middle"},
    "Jason":     {"weight": "medium", "energy": "medium", "warmth": "neutral", "age_feel": "middle"},
    "Liam":      {"weight": "medium", "energy": "high",   "warmth": "warm",    "age_feel": "young"},
    "Malcolm":   {"weight": "heavy",  "energy": "low",    "warmth": "cool",    "age_feel": "mature"},
    "Mortimer":  {"weight": "heavy",  "energy": "low",    "warmth": "cool",    "age_feel": "mature"},
    "Nate":      {"weight": "light",  "energy": "high",   "warmth": "warm",    "age_feel": "young"},
    "Oliver":    {"weight": "medium", "energy": "medium", "warmth": "warm",    "age_feel": "middle"},
    "Rupert":    {"weight": "medium", "energy": "low",    "warmth": "cool",    "age_feel": "mature"},
    "Simon":     {"weight": "medium", "energy": "medium", "warmth": "neutral", "age_feel": "middle"},
    "Tyler":     {"weight": "light",  "energy": "high",   "warmth": "neutral", "age_feel": "young"},
    "Victor":    {"weight": "heavy",  "energy": "medium", "warmth": "cool",    "age_feel": "mature"},
    "Vinny":     {"weight": "medium", "energy": "high",   "warmth": "warm",    "age_feel": "middle"},
    # --- Female voices ---
    # Known characterizations
    "Hana":      {"weight": "light",  "energy": "high",   "warmth": "warm",    "age_feel": "young"},    # bright, expressive young
    "Ashley":    {"weight": "medium", "energy": "medium", "warmth": "warm",    "age_feel": "middle"},   # warm, natural
    "Wendy":     {"weight": "medium", "energy": "low",    "warmth": "cool",    "age_feel": "mature"},   # posh, middle-aged British
    "Sarah":     {"weight": "light",  "energy": "high",   "warmth": "neutral", "age_feel": "middle"},   # fast-talking, questioning
    "Deborah":   {"weight": "medium", "energy": "low",    "warmth": "warm",    "age_feel": "mature"},   # gentle, elegant
    "Olivia":    {"weight": "medium", "energy": "medium", "warmth": "warm",    "age_feel": "middle"},
    "Julia":     {"weight": "medium", "energy": "medium", "warmth": "neutral", "age_feel": "middle"},   # used by Angie (deadpan)
    "Priya":     {"weight": "light",  "energy": "medium", "warmth": "warm",    "age_feel": "young"},
    "Amina":     {"weight": "medium", "energy": "medium", "warmth": "warm",    "age_feel": "middle"},   # used by Charlene (bragger)
    "Tessa":     {"weight": "medium", "energy": "medium", "warmth": "warm",    "age_feel": "middle"},   # used by Lucille
    "Kelsey":    {"weight": "light",  "energy": "medium", "warmth": "neutral", "age_feel": "young"},    # used by Maxine (quiet/nervous)
    # Remaining female pool voices
    "Anjali":    {"weight": "light",  "energy": "medium", "warmth": "warm",    "age_feel": "young"},
    "Celeste":   {"weight": "light",  "energy": "medium", "warmth": "cool",    "age_feel": "middle"},
    "Chloe":     {"weight": "light",  "energy": "high",   "warmth": "warm",    "age_feel": "young"},
    "Claire":    {"weight": "medium", "energy": "medium", "warmth": "neutral", "age_feel": "middle"},
    "Darlene":   {"weight": "medium", "energy": "medium", "warmth": "warm",    "age_feel": "mature"},
    "Elizabeth":  {"weight": "medium", "energy": "medium", "warmth": "cool",    "age_feel": "mature"},
    "Jessica":   {"weight": "medium", "energy": "medium", "warmth": "warm",    "age_feel": "middle"},
    "Kayla":     {"weight": "light",  "energy": "high",   "warmth": "warm",    "age_feel": "young"},
    "Lauren":    {"weight": "medium", "energy": "medium", "warmth": "neutral", "age_feel": "middle"},
    "Loretta":   {"weight": "medium", "energy": "low",    "warmth": "warm",    "age_feel": "mature"},
    "Luna":      {"weight": "light",  "energy": "medium", "warmth": "warm",    "age_feel": "young"},
    "Marlene":   {"weight": "medium", "energy": "low",    "warmth": "neutral", "age_feel": "mature"},
    "Miranda":   {"weight": "medium", "energy": "medium", "warmth": "cool",    "age_feel": "middle"},
    "Pippa":     {"weight": "light",  "energy": "high",   "warmth": "warm",    "age_feel": "young"},
    "Saanvi":    {"weight": "light",  "energy": "medium", "warmth": "warm",    "age_feel": "young"},
    "Serena":    {"weight": "medium", "energy": "medium", "warmth": "cool",    "age_feel": "middle"},
    "Veronica":  {"weight": "medium", "energy": "medium", "warmth": "cool",    "age_feel": "middle"},
    "Victoria":  {"weight": "medium", "energy": "low",    "warmth": "cool",    "age_feel": "mature"},
}


def preprocess_text_for_kokoro(text: str) -> str:
    """
    Preprocess text to improve Kokoro prosody and naturalness.

    Steps, applied in this order:

    1. Collapse all runs of whitespace (including newlines) to single spaces.
    2. Insert a comma after capitalised transition words ("Well", "So", ...)
       when they are directly followed by another word.
    3. Insert a comma after "I mean" at the start of the text or of a sentence.
    4. Expand common abbreviations ("Dr." -> "Doctor", "w/o" -> "without", ...),
       case-insensitively. The abbreviation's trailing period is consumed.
    5. Insert a comma before "and"/"but"/"or" when preceded by a 20+ character word.
    6. Force exactly one space between sentence punctuation and a following
       capital letter.

    Args:
        text: Raw text that is about to be sent to Kokoro.

    Returns:
        The normalised text.
    """
    import re

    # Normalize whitespace
    text = ' '.join(text.split())

    # Add comma pauses after common transition words (if no punctuation follows).
    # Matching is case-sensitive on purpose: only the capitalised forms are hit.
    transition_words = (
        'Well', 'So', 'Now', 'Look', 'See',
        'Anyway', 'Actually', 'Honestly', 'Basically',
    )
    for word in transition_words:
        text = re.sub(rf'\b({word})\s+(?=[A-Za-z])', r'\1, ', text)

    # Add pause after "I mean" at start of sentence
    text = re.sub(r'^(I mean)\s+', r'\1, ', text)
    text = re.sub(r'\.\s+(I mean)\s+', r'. \1, ', text)

    # Expand common abbreviations for better pronunciation.
    # Order matters: "w/o" must be handled before "w/", otherwise the shorter
    # rule rewrites "w/o" to "witho" and the longer rule can never match.
    abbreviations = (
        (r'\bDr\.', 'Doctor'),
        (r'\bMr\.', 'Mister'),
        (r'\bMrs\.', 'Missus'),
        (r'\bMs\.', 'Miss'),
        (r'\bSt\.', 'Street'),
        (r'\bAve\.', 'Avenue'),
        (r'\betc\.', 'etcetera'),
        (r'\bvs\.', 'versus'),
        (r'\bw/o\b', 'without'),
        (r'\bw/', 'with'),
    )
    for abbr, expansion in abbreviations:
        text = re.sub(abbr, expansion, text, flags=re.IGNORECASE)

    # Add breath pause (comma) before conjunctions in long sentences
    # NOTE(review): \w{20,} matches a single 20+ character *word*, not a long
    # sentence, so this rarely fires - confirm whether that is intended.
    text = re.sub(r'(\w{20,})\s+(and|but|or)\s+', r'\1, \2 ', text)

    # Ensure proper spacing after punctuation
    text = re.sub(r'([.!?])\s*([A-Z])', r'\1 \2', text)

    return text

# StyleTTS2 reference voice files (place .wav files in voices/ directory for voice cloning)
# Maps voice_id to reference audio filename - if file doesn't exist, uses default voice
STYLETTS2_VOICES = {
    # Male voices
    "VR6AewLTigWG4xSOukaG": "tony.wav",     # Tony
    "TxGEqnHWrfWFTfGW9XjX": "rick.wav",     # Rick
    "pNInz6obpgDQGcFmaJgB": "dennis.wav",   # Dennis
    "ODq5zmih8GrVes37Dizd": "earl.wav",     # Earl
    "IKne3meq5aSn9XLyUdCD": "marcus.wav",   # Marcus
    # Female voices
    "jBpfuIE2acCO8z3wKNLl": "jasmine.wav",  # Jasmine
    "EXAVITQu4vr4xnSDxMaL": "megan.wav",    # Megan
    "21m00Tcm4TlvDq8ikWAM": "tanya.wav",    # Tanya
    "XB0fDUnXU5powFXDhCwa": "carla.wav",    # Carla
    "pFZP5JQG7iQjIQuC4Bku": "brenda.wav",   # Brenda
}

# F5-TTS reference voices (same files as StyleTTS2, reuses voices/ directory)
# Requires: mono, 24kHz, 5-10 seconds, plus a transcript in a .txt file with the
# same stem next to the audio (e.g. tony.wav -> tony.txt)
F5TTS_VOICES = STYLETTS2_VOICES.copy()

# ChatTTS speaker seeds - different seeds produce different voices
# These are used to generate consistent speaker embeddings
CHATTTS_SEEDS = {
    # Male voices
    "VR6AewLTigWG4xSOukaG": 42,     # Tony - deep voice
    "TxGEqnHWrfWFTfGW9XjX": 123,    # Rick
    "pNInz6obpgDQGcFmaJgB": 456,    # Dennis
    "ODq5zmih8GrVes37Dizd": 789,    # Earl
    "IKne3meq5aSn9XLyUdCD": 1011,   # Marcus
    # Female voices
    "jBpfuIE2acCO8z3wKNLl": 2024,   # Jasmine
    "EXAVITQu4vr4xnSDxMaL": 3033,   # Megan
    "21m00Tcm4TlvDq8ikWAM": 4042,   # Tanya
    "XB0fDUnXU5powFXDhCwa": 5051,   # Carla
    "pFZP5JQG7iQjIQuC4Bku": 6060,   # Brenda
}
DEFAULT_CHATTTS_SEED = 42


def get_elevenlabs_client():
    """Return the shared ElevenLabs client, building it on the first call.

    The SDK import is deferred until a client is actually needed; the client
    is constructed with ``settings.elevenlabs_api_key``.
    """
    global _elevenlabs_client
    if _elevenlabs_client is not None:
        return _elevenlabs_client

    from elevenlabs.client import ElevenLabs

    _elevenlabs_client = ElevenLabs(api_key=settings.elevenlabs_api_key)
    return _elevenlabs_client


def get_vits_tts():
    """Return the shared VITS (VCTK) TTS instance, creating it on first use."""
    global _vits_tts
    if _vits_tts is not None:
        return _vits_tts

    # Deferred import: the TTS package is only needed when VITS is used.
    from TTS.api import TTS

    _vits_tts = TTS("tts_models/en/vctk/vits")
    return _vits_tts


def get_kokoro_model():
    """Return the shared Kokoro MLX model, loading it on the first call."""
    global _kokoro_model
    if _kokoro_model is not None:
        return _kokoro_model

    # Deferred import: mlx_audio is only needed when Kokoro is used.
    from mlx_audio.tts.utils import load_model

    _kokoro_model = load_model(model_path='mlx-community/Kokoro-82M-bf16')
    print("Kokoro MLX model loaded")
    return _kokoro_model


def ensure_bark_loaded():
    """Preload the Bark models once per process.

    Sets the SUNO_* environment variables before importing ``bark``: small
    models are always requested, and MPS is enabled (with CPU offload turned
    off) when ``torch.backends.mps`` reports it as available. Subsequent calls
    are no-ops.
    """
    global _bark_loaded
    if _bark_loaded:
        return

    os.environ['SUNO_USE_SMALL_MODELS'] = '1'

    # Force Bark to use MPS (Apple Silicon GPU) when it is available
    if torch.backends.mps.is_available():
        os.environ.update({'SUNO_OFFLOAD_CPU': '0', 'SUNO_ENABLE_MPS': '1'})

    # Import only after the environment is configured.
    from bark import preload_models

    preload_models()
    _bark_loaded = True

    device_name = 'MPS' if torch.backends.mps.is_available() else 'CPU'
    print(f"Bark loaded on device: {device_name}")


def get_styletts2_model():
    """Return the shared StyleTTS2 model, constructing it on first use."""
    global _styletts2_model
    if _styletts2_model is not None:
        return _styletts2_model

    # Deferred import: styletts2 is only needed when this provider is used.
    from styletts2 import tts

    _styletts2_model = tts.StyleTTS2()
    print("StyleTTS2 model loaded")
    return _styletts2_model


def get_f5tts_generate():
    """Return the F5-TTS MLX ``generate`` function, importing it on first use.

    Despite its name, ``_f5tts_model`` caches the generate callable rather
    than a model object.
    """
    global _f5tts_model
    if _f5tts_model is not None:
        return _f5tts_model

    # Disable tqdm progress bars to avoid BrokenPipeError in server context.
    # These must be set before the f5_tts_mlx import below.
    os.environ.update({
        'HF_HUB_DISABLE_PROGRESS_BARS': '1',
        'TQDM_DISABLE': '1',
    })

    from f5_tts_mlx.generate import generate

    _f5tts_model = generate
    print("F5-TTS MLX loaded")
    return _f5tts_model


def get_chattts_model():
    """Return the shared ChatTTS model, creating and loading it on first use."""
    global _chattts_model
    if _chattts_model is not None:
        return _chattts_model

    # Deferred import: ChatTTS is only needed when this provider is used.
    import ChatTTS

    chat = ChatTTS.Chat()
    # Publish the instance before load(), so a failed load still leaves it cached.
    _chattts_model = chat
    chat.load(compile=False)
    print("ChatTTS model loaded")
    return _chattts_model


def get_chattts_speaker(voice_id: str):
    """Return the cached ChatTTS speaker embedding for ``voice_id``.

    On a cache miss the torch RNG is seeded with the voice's entry in
    CHATTTS_SEEDS (or DEFAULT_CHATTTS_SEED) before a random speaker is
    sampled. Note that this reseeds torch's global RNG as a side effect.
    """
    try:
        return _chattts_speakers[voice_id]
    except KeyError:
        pass

    chat = get_chattts_model()
    seed = CHATTTS_SEEDS.get(voice_id, DEFAULT_CHATTTS_SEED)

    # Set seed for reproducible speaker
    torch.manual_seed(seed)
    speaker = chat.sample_random_speaker()
    _chattts_speakers[voice_id] = speaker

    print(f"[ChatTTS] Created speaker for voice {voice_id} with seed {seed}")
    return speaker


def phone_filter(audio: np.ndarray, sample_rate: int = 24000, quality: str = "normal") -> np.ndarray:
    """Apply phone filter with variable quality.

    The signal is flattened, band-pass filtered (4th-order Butterworth, applied
    forward and backward), soft-clipped with tanh, and optionally mixed with
    bursty static.

    Args:
        audio: Input samples; any shape, flattened to 1-D.
        sample_rate: Sample rate of ``audio`` in Hz.
        quality: One of "good", "normal", "bad", "terrible". Unknown values
            fall back to "normal".

    Returns:
        1-D float32 array with the same number of samples as the input.

    Raises:
        ValueError: If the preset's lower band edge is not below the Nyquist
            frequency for ``sample_rate`` (raised by ``scipy.signal.butter``).
    """
    audio = audio.flatten()

    # Nothing to filter; filtfilt cannot handle an empty signal.
    if audio.size == 0:
        return audio.astype(np.float32)

    # (low cut Hz, high cut Hz, tanh drive, static noise sigma)
    presets = {
        "good": (200, 7000, 1.0, 0.0),
        "normal": (300, 3400, 1.5, 0.005),
        "bad": (400, 2800, 2.0, 0.015),
        "terrible": (500, 2200, 2.5, 0.03),
    }

    low_hz, high_hz, distortion, noise = presets.get(quality, presets["normal"])

    nyquist = sample_rate / 2
    low = low_hz / nyquist
    # butter() requires normalised edges strictly inside (0, 1). Clamp the
    # upper edge so presets still work at low sample rates (e.g. "good" at 8 kHz).
    high = min(high_hz / nyquist, 0.99)
    b, a = butter(4, [low, high], btype='band')

    # filtfilt's default pad is 3 * max(len(a), len(b)) and it rejects signals
    # that are not longer than the pad. Shrink the pad for very short clips;
    # normal-length audio keeps the default and is filtered exactly as before.
    padlen = min(3 * max(len(a), len(b)), len(audio) - 1)
    filtered = filtfilt(b, a, audio, padlen=padlen)

    # Soft clipping; output magnitude is bounded by 0.8 before noise is added.
    filtered = np.tanh(filtered * distortion) * 0.8

    if noise > 0:
        static = np.random.normal(0, noise, len(filtered)).astype(np.float32)
        # Gate the static in 1000-sample blocks so it comes in bursts:
        # each block is audible with probability ~0.3.
        static_envelope = np.random.random(len(filtered) // 1000 + 1)
        static_envelope = np.repeat(static_envelope, 1000)[:len(filtered)]
        static *= (static_envelope > 0.7).astype(np.float32)
        filtered = filtered + static

    return filtered.astype(np.float32)


async def generate_speech_elevenlabs(text: str, voice_id: str) -> tuple[np.ndarray, int]:
    """Generate speech using ElevenLabs.

    Args:
        text: Text to synthesise.
        voice_id: ElevenLabs voice ID, passed through unchanged.

    Returns:
        ``(audio, 24000)`` where ``audio`` is a float32 array scaled from the
        16-bit PCM returned by the API.
    """
    client = get_elevenlabs_client()

    audio_gen = client.text_to_speech.convert(
        voice_id=voice_id,
        text=text,
        model_id="eleven_v3",
        output_format="pcm_24000"
    )

    # convert() yields byte chunks; collect them into one buffer.
    audio_bytes = b"".join(audio_gen)

    # int16 samples are 2 bytes each. Drop a dangling byte (e.g. from a
    # truncated stream) so np.frombuffer does not raise; this mirrors the
    # raw-PCM fallback in generate_speech_inworld.
    if len(audio_bytes) % 2 != 0:
        audio_bytes = audio_bytes[:-1]

    audio = np.frombuffer(audio_bytes, dtype=np.int16).astype(np.float32) / 32768.0

    return audio, 24000


async def generate_speech_kokoro(text: str, voice_id: str) -> tuple[np.ndarray, int]:
    """Generate speech using MLX Kokoro (fast, good quality, Apple Silicon optimized).

    Args:
        text: Text to synthesise; it is run through preprocess_text_for_kokoro first.
        voice_id: ElevenLabs voice ID, mapped via KOKORO_VOICES / KOKORO_SPEEDS
            (unknown IDs use the defaults).

    Returns:
        ``(audio, 24000)`` with ``audio`` as a mono float32 array.

    Raises:
        RuntimeError: If no output file was produced.
    """
    import librosa
    from mlx_audio.tts.generate import generate_audio

    model = get_kokoro_model()
    voice = KOKORO_VOICES.get(voice_id, DEFAULT_KOKORO_VOICE)
    speed = KOKORO_SPEEDS.get(voice_id, DEFAULT_KOKORO_SPEED)

    # Preprocess text for better prosody
    text = preprocess_text_for_kokoro(text)

    # Determine lang_code from voice prefix (a=American, b=British)
    lang_code = 'b' if voice.startswith('b') else 'a'

    with tempfile.TemporaryDirectory() as tmpdir:
        generate_audio(
            text,
            model=model,
            voice=voice,
            speed=speed,
            lang_code=lang_code,
            output_path=tmpdir,
            file_prefix='tts',
            verbose=False
        )

        # Output files are numbered (tts_000.wav, ...). Presumably long text
        # can be split into several segments - TODO confirm against mlx_audio.
        # Read every numbered file in order instead of only the first, so no
        # part of the utterance is silently dropped.
        segment_files = sorted(Path(tmpdir).glob('tts_*.wav'))
        if not segment_files:
            raise RuntimeError("Kokoro failed to generate audio")

        segments = []
        for segment_file in segment_files:
            segment, sr = librosa.load(str(segment_file), sr=None, mono=True)

            # Resample to 24kHz if needed
            if sr != 24000:
                segment = librosa.resample(segment, orig_sr=sr, target_sr=24000)
            segments.append(segment)

    audio = segments[0] if len(segments) == 1 else np.concatenate(segments)
    return audio.astype(np.float32), 24000


async def generate_speech_vits(text: str, voice_id: str) -> tuple[np.ndarray, int]:
    """Generate speech using VITS VCTK (fast, multiple speakers).

    The voice ID is mapped to a VCTK speaker via VITS_SPEAKERS (falling back
    to DEFAULT_VITS_SPEAKER). Audio is rendered to a temporary WAV file, read
    back as mono, resampled to 24 kHz when necessary, and the file is removed.

    Returns:
        ``(audio, 24000)`` with ``audio`` as a float32 array.
    """
    import librosa

    engine = get_vits_tts()
    speaker_name = VITS_SPEAKERS.get(voice_id, DEFAULT_VITS_SPEAKER)

    # Reserve a path only; the handle is closed before the engine writes to it.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as handle:
        wav_path = handle.name

    try:
        engine.tts_to_file(text=text, file_path=wav_path, speaker=speaker_name)
        samples, native_rate = librosa.load(wav_path, sr=None, mono=True)

        needs_resample = native_rate != 24000
        if needs_resample:
            samples = librosa.resample(samples, orig_sr=native_rate, target_sr=24000)

        return samples.astype(np.float32), 24000
    finally:
        # Always clean up, including when synthesis or loading fails.
        Path(wav_path).unlink(missing_ok=True)


async def generate_speech_bark(text: str, voice_id: str) -> tuple[np.ndarray, int]:
    """Generate speech using Bark (slow but expressive, supports emotes like [laughs]).

    ``voice_id`` is accepted for signature parity with the other providers but
    is not used here: Bark is called with the text only.

    Returns:
        ``(audio, 24000)`` with ``audio`` as a float32 array whose peak is
        scaled down to 0.95 if it exceeded that level.
    """
    import librosa
    from bark import SAMPLE_RATE, generate_audio

    ensure_bark_loaded()

    samples = generate_audio(text)

    # Bark can exceed [-1, 1]; scale down only when the peak is above 0.95.
    peak = np.max(np.abs(samples))
    if peak > 0.95:
        samples = samples * (0.95 / peak)

    # Bring the output to the common 24 kHz rate when Bark's rate differs.
    if SAMPLE_RATE != 24000:
        samples = librosa.resample(samples, orig_sr=SAMPLE_RATE, target_sr=24000)

    return samples.astype(np.float32), 24000


async def generate_speech_styletts2(text: str, voice_id: str) -> tuple[np.ndarray, int]:
    """Generate speech using StyleTTS2 (high quality, supports voice cloning).

    If STYLETTS2_VOICES maps ``voice_id`` to a file that exists under
    ``settings.base_dir / "voices"``, that file is used as the cloning
    reference; otherwise the model's default voice is used. The model writes
    a temporary WAV file, which is read back as mono, resampled to 24 kHz if
    needed, and then deleted.

    Returns:
        ``(audio, 24000)`` with ``audio`` as a float32 array.
    """
    import librosa

    model = get_styletts2_model()

    # Resolve the optional reference recording for voice cloning.
    reference = None
    reference_name = STYLETTS2_VOICES.get(voice_id)
    if reference_name:
        candidate = settings.base_dir / "voices" / reference_name
        if candidate.exists():
            reference = candidate  # otherwise fall back to the default voice

    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as handle:
        wav_path = handle.name

    try:
        # Options shared by both modes.
        inference_kwargs = {
            "output_wav_file": wav_path,
            "output_sample_rate": 24000,
            "diffusion_steps": 5,  # Balance quality/speed
        }
        if reference:
            print(f"[StyleTTS2] Using voice clone: {reference}")
            inference_kwargs["target_voice_path"] = str(reference)
            inference_kwargs["alpha"] = 0.3  # More voice-like than text-like
            inference_kwargs["beta"] = 0.7   # Good prosody
        else:
            print("[StyleTTS2] Using default voice")

        # The return value is ignored; the audio is read from the WAV file.
        model.inference(text, **inference_kwargs)

        samples, native_rate = librosa.load(wav_path, sr=None, mono=True)
        if native_rate != 24000:
            samples = librosa.resample(samples, orig_sr=native_rate, target_sr=24000)

        return samples.astype(np.float32), 24000
    finally:
        Path(wav_path).unlink(missing_ok=True)


async def generate_speech_f5tts(text: str, voice_id: str) -> tuple[np.ndarray, int]:
    """Generate speech using F5-TTS MLX (very natural, supports voice cloning).

    Voice cloning is used only when both the reference WAV listed in
    F5TTS_VOICES and a same-named ``.txt`` transcript exist under
    ``settings.base_dir / "voices"``; otherwise both reference arguments are
    passed as None and the default voice is used.

    Returns:
        ``(audio, 24000)`` with ``audio`` as a mono float32 array.
    """
    import librosa

    generate = get_f5tts_generate()

    # Look for a reference recording and its transcript.
    ref_audio_path = None
    ref_text = None
    reference_name = F5TTS_VOICES.get(voice_id)
    if reference_name:
        wav_candidate = settings.base_dir / "voices" / reference_name
        transcript_candidate = wav_candidate.with_suffix('.txt')
        have_both = wav_candidate.exists() and transcript_candidate.exists()
        if have_both:
            ref_audio_path = str(wav_candidate)
            ref_text = transcript_candidate.read_text().strip()
            print(f"[F5-TTS] Using voice clone: {wav_candidate}")

    if not ref_audio_path:
        print("[F5-TTS] Using default voice")

    # Reserve a temp path for the generator to write its WAV output to.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as handle:
        out_path = handle.name

    try:
        generate(
            generation_text=text,
            ref_audio_path=ref_audio_path,
            ref_audio_text=ref_text,
            steps=8,
            speed=1.0,
            output_path=out_path,
        )

        samples, native_rate = librosa.load(out_path, sr=None, mono=True)

        # Normalise to the common 24 kHz rate.
        if native_rate != 24000:
            samples = librosa.resample(samples, orig_sr=native_rate, target_sr=24000)

        return samples.astype(np.float32), 24000
    finally:
        Path(out_path).unlink(missing_ok=True)


async def generate_speech_chattts(text: str, voice_id: str) -> tuple[np.ndarray, int]:
    """Generate speech using ChatTTS (natural conversational speech, multiple speakers).

    Blank input is replaced by "Hello.". The torch RNG is reseeded per call
    with the voice's entry in CHATTTS_SEEDS (or DEFAULT_CHATTTS_SEED).

    Returns:
        ``(audio, 24000)`` with ``audio`` as a float32 array, peak-limited to 0.95.

    Raises:
        RuntimeError: If ChatTTS returns no audio.
    """
    import ChatTTS

    chat = get_chattts_model()

    # Never send an empty prompt to the model.
    text = text.strip() or "Hello."

    print(f"[ChatTTS] Generating speech for: {text[:50]}...")

    # Seed the global torch RNG so the same voice_id uses the same seed each call.
    # NOTE(review): no spk_emb is passed below and get_chattts_speaker() is not
    # used here - confirm whether seeding alone gives a stable voice.
    torch.manual_seed(CHATTTS_SEEDS.get(voice_id, DEFAULT_CHATTTS_SEED))

    sampling = ChatTTS.Chat.InferCodeParams(
        temperature=0.3,
        top_P=0.7,
        top_K=20,
    )

    # Text refinement is skipped to avoid a narrow() error with this version.
    results = chat.infer(
        [text],
        params_infer_code=sampling,
        skip_refine_text=True,
    )

    if results is None or len(results) == 0:
        raise RuntimeError("ChatTTS failed to generate audio")

    samples = results[0]

    # Collapse any singleton dimensions (e.g. a leading channel axis).
    if samples.ndim > 1:
        samples = samples.squeeze()

    # Scale down only when the peak exceeds 0.95.
    peak = np.abs(samples).max()
    if peak > 0.95:
        samples = samples * (0.95 / peak)

    return samples.astype(np.float32), 24000


# Keyword sets used by _detect_speech_rate. Entries are matched as substrings
# of the lower-cased text, so they must be written in lower case.
_EXCITED_KEYWORDS = {"excited", "amazing", "incredible", "can't believe", "so happy",
                     "hell yeah", "fired up", "furious", "pissed", "angry", "what the hell",
                     "are you kidding", "unbelievable", "!!", "oh my god"}
_SAD_KEYWORDS = {"sad", "miss them", "passed away", "funeral", "crying", "broke my heart",
                 "can't stop thinking", "lonely", "depressed", "sorry", "regret",
                 "wish i could", "never got to", "lost", "grief"}


def _detect_speech_rate(text: str, base_speed: float) -> float:
    """Adjust speech rate based on emotional content of the text.

    Counts how many excited / sad keywords occur in the text (case-insensitive
    substring match). Excited matches take priority over sad ones:

    - 2+ excited keywords: +0.15
    - 1 excited keyword:   +0.08
    - 2+ sad keywords:     -0.20
    - 1 sad keyword:       -0.10

    Args:
        text: The text about to be spoken.
        base_speed: The voice's baseline speaking rate.

    Returns:
        The adjusted speed, clamped to Inworld's 0.5-1.5 range when an
        adjustment is applied; ``base_speed`` unchanged when nothing matches.
    """
    text_lower = text.lower()
    # Lower-case the keywords as well, so an entry with stray capitals
    # (previously "wish I could") can no longer be silently unmatchable.
    excited = sum(1 for kw in _EXCITED_KEYWORDS if kw.lower() in text_lower)
    sad = sum(1 for kw in _SAD_KEYWORDS if kw.lower() in text_lower)

    if excited >= 2:
        return min(1.5, base_speed + 0.15)
    elif excited >= 1:
        return min(1.5, base_speed + 0.08)
    elif sad >= 2:
        return max(0.5, base_speed - 0.2)
    elif sad >= 1:
        return max(0.5, base_speed - 0.1)
    return base_speed


async def generate_speech_inworld(text: str, voice_id: str) -> tuple[np.ndarray, int]:
    """Generate speech using Inworld TTS API (high quality, natural voices).

    Args:
        text: Text to synthesise.
        voice_id: Either an Inworld voice name (e.g. "Edward") or a legacy
            ElevenLabs voice ID present in INWORLD_VOICES.

    Returns:
        ``(audio, 24000)`` with ``audio`` as a mono float32 array.

    Raises:
        RuntimeError: If no API key is configured or the response has no audio.
        httpx.HTTPStatusError: If the API responds with an error status.
    """
    import base64
    import io

    import httpx
    import librosa
    import soundfile as sf

    # voice_id is now the Inworld voice name directly (e.g. "Edward")
    # Fall back to legacy mapping if it's an ElevenLabs ID
    voice = INWORLD_VOICES.get(voice_id, voice_id)

    api_key = settings.inworld_api_key
    if not api_key:
        raise RuntimeError("INWORLD_API_KEY not set in environment")

    # Per-voice base rate, then nudged up or down by the text's emotional tone.
    base_speed = INWORLD_SPEED_OVERRIDES.get(voice, DEFAULT_INWORLD_SPEED)
    speed = _detect_speech_rate(text, base_speed)
    print(f"[Inworld TTS] Voice: {voice}, Speed: {speed:.2f} (base {base_speed}), Text: {text[:50]}...")

    url = "https://api.inworld.ai/tts/v1/voice"
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Basic {api_key}",
    }
    payload = {
        "text": text,
        "voiceId": voice,
        "modelId": "inworld-tts-1.5-max",
        "audioConfig": {
            "audioEncoding": "LINEAR16",
            "sampleRateHertz": 48000,
            "speakingRate": speed,
        },
    }

    async with httpx.AsyncClient(timeout=12.0) as client:
        response = await client.post(url, json=payload, headers=headers)
        response.raise_for_status()
        data = response.json()

    # Decode base64 audio
    audio_b64 = data.get("audioContent")
    if not audio_b64:
        raise RuntimeError("Inworld TTS returned no audio content")

    audio_bytes = base64.b64decode(audio_b64)

    # First try to parse the payload as a container format via soundfile.
    # Deliberate best-effort: on any failure, treat it as headerless 16-bit
    # PCM at the 48 kHz rate that was requested above.
    try:
        audio, sr = sf.read(io.BytesIO(audio_bytes))
    except Exception as e:
        print(f"[Inworld TTS] soundfile failed: {e}, trying raw PCM")
        # Fallback to raw PCM
        if len(audio_bytes) % 2 != 0:
            audio_bytes = audio_bytes[:-1]
        audio = np.frombuffer(audio_bytes, dtype=np.int16).astype(np.float32) / 32768.0
        sr = 48000

    # soundfile returns (frames, channels) for multi-channel audio. Down-mix
    # to mono so resampling runs along the time axis and the result matches
    # the other providers, which load with mono=True.
    if audio.ndim > 1:
        audio = audio.mean(axis=1)

    # Resample to 24kHz to match other providers
    if sr != 24000:
        audio = librosa.resample(audio, orig_sr=sr, target_sr=24000)

    return audio.astype(np.float32), 24000


def pick_caller_tts_provider() -> str | None:
    """Randomly assign a TTS provider for a caller.
    Returns None to use the global default, or a specific provider name.
    ~70% inworld (default), ~20% kokoro, ~10% other available."""
    import random
    roll = random.random()
    if roll < 0.70:
        return None  # Use global default (typically inworld)
    elif roll < 0.90:
        return "kokoro"
    else:
        return random.choice(["kokoro", "f5tts", "chattts"])


# Dispatch table: provider name -> callable taking (text, voice_id).
# generate_speech() awaits the returned value and unpacks it as
# (audio, sample_rate). Each lambda resolves its generate_speech_* target
# when it is called, not when this dict is built.
_TTS_PROVIDERS = {
    "kokoro": lambda text, vid: generate_speech_kokoro(text, vid),
    "f5tts": lambda text, vid: generate_speech_f5tts(text, vid),
    "inworld": lambda text, vid: generate_speech_inworld(text, vid),
    "chattts": lambda text, vid: generate_speech_chattts(text, vid),
    "styletts2": lambda text, vid: generate_speech_styletts2(text, vid),
    "bark": lambda text, vid: generate_speech_bark(text, vid),
    "vits": lambda text, vid: generate_speech_vits(text, vid),
    "elevenlabs": lambda text, vid: generate_speech_elevenlabs(text, vid),
}

# Total number of attempts generate_speech() makes per call; the first try
# counts, so 2 means a single retry.
TTS_MAX_RETRIES = 2
# Sleep (in seconds) after failed attempt N, indexed by zero-based attempt.
# No sleep follows the final attempt, so only the first TTS_MAX_RETRIES - 1
# entries are consulted.
TTS_RETRY_DELAYS = [0.5, 1.0]  # seconds between retries


async def generate_speech(
    text: str,
    voice_id: str,
    phone_quality: str = "normal",
    apply_filter: bool = True,
    provider_override: str | None = None
) -> bytes:
    """
    Generate speech from text with automatic retry on failure.

    Args:
        text: Text to speak
        voice_id: ElevenLabs voice ID (mapped to local voice if using local TTS)
        phone_quality: Quality of phone filter ("none" or "studio" skips the filter)
        apply_filter: Whether to apply phone filter (never applied for the
            "bark" provider)
        provider_override: Override the global TTS provider for this call

    Returns:
        Raw PCM audio bytes (16-bit signed int, 24kHz)

    Raises:
        ValueError: If the selected provider has no entry in _TTS_PROVIDERS.
        RuntimeError: If all attempts together exceed the overall timeout.
        Exception: The provider's own error, re-raised once the final
            attempt has failed.
    """
    import asyncio

    # One budget shared by every attempt and the sleeps between them; kept in
    # a single name so the timeout and the messages reporting it cannot drift.
    overall_timeout_s = 20

    provider = provider_override or settings.tts_provider
    print(f"[TTS] Provider: {provider}{' (override)' if provider_override else ''}, Text: {text[:50]}...")

    gen_fn = _TTS_PROVIDERS.get(provider)
    if not gen_fn:
        raise ValueError(f"Unknown TTS provider: {provider}")

    # Always try at least once so `audio` is bound even if the retry constant
    # is misconfigured to zero or a negative value.
    attempts = max(1, TTS_MAX_RETRIES)

    try:
        async with asyncio.timeout(overall_timeout_s):
            for attempt in range(attempts):
                try:
                    audio, sample_rate = await gen_fn(text, voice_id)
                    # NOTE: recorded inside the retry scope, so a failure in the
                    # tracker is handled exactly like a generation failure.
                    cost_tracker.record_tts_call(provider, voice_id, len(text))
                    if attempt > 0:
                        print(f"[TTS] Succeeded on retry {attempt}")
                    break
                except TimeoutError:
                    raise  # Let asyncio.timeout propagate
                except Exception as e:
                    if attempt >= attempts - 1:
                        print(f"[TTS] {provider} failed after {attempts} attempts: {e}")
                        raise
                    # Reuse the last configured delay when there are more
                    # retries than delay entries instead of raising IndexError.
                    if TTS_RETRY_DELAYS:
                        delay = TTS_RETRY_DELAYS[min(attempt, len(TTS_RETRY_DELAYS) - 1)]
                    else:
                        delay = 0.0
                    print(f"[TTS] {provider} attempt {attempt + 1} failed: {e} — retrying in {delay}s...")
                    await asyncio.sleep(delay)
    except TimeoutError as e:
        print(f"[TTS] Overall timeout ({overall_timeout_s}s) for {provider}")
        raise RuntimeError(f"TTS generation timed out after {overall_timeout_s}s") from e

    # Apply phone filter if requested
    # Skip filter for Bark - it already has rough audio quality
    if apply_filter and phone_quality not in ("none", "studio") and provider != "bark":
        audio = phone_filter(audio, sample_rate, phone_quality)

    # Convert float samples to 16-bit PCM; clip before the cast so values at
    # or beyond full scale saturate instead of wrapping around.
    audio_int16 = (audio * 32768).clip(-32768, 32767).astype(np.int16)
    return audio_int16.tobytes()


# Voice IDs for cohost and announcer.
# Passed as the voice_id argument of generate_speech() by the two wrappers
# below; presumably ElevenLabs voice IDs, as that parameter is documented —
# TODO confirm.
COHOST_VOICE_ID = "nPczCjzI2devNBz1zQrb"
ANNOUNCER_VOICE_ID = "ErXwobaYiN019PkySvjV"


async def generate_cohost_speech(text: str) -> bytes:
    """Synthesize a line for cohost Bobby, bypassing the phone filter.

    Args:
        text: Text to speak

    Returns:
        Whatever generate_speech() returns for COHOST_VOICE_ID (raw PCM bytes).
    """
    pcm_bytes = await generate_speech(
        text=text,
        voice_id=COHOST_VOICE_ID,
        apply_filter=False,
    )
    return pcm_bytes


async def generate_announcer_speech(text: str) -> bytes:
    """Synthesize a line for the announcer, bypassing the phone filter.

    Args:
        text: Text to speak

    Returns:
        Whatever generate_speech() returns for ANNOUNCER_VOICE_ID (raw PCM bytes).
    """
    pcm_bytes = await generate_speech(
        text=text,
        voice_id=ANNOUNCER_VOICE_ID,
        apply_filter=False,
    )
    return pcm_bytes