Initial commit: AI Radio Show web application
- FastAPI backend with multiple TTS providers (Inworld, ElevenLabs, Kokoro, F5-TTS, etc.) - Web frontend with caller management, music, and soundboard - Whisper transcription integration - OpenRouter/Ollama LLM support - Castopod podcast publishing script Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
1
backend/services/__init__.py
Normal file
1
backend/services/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
# Services package
|
||||
479
backend/services/audio.py
Normal file
479
backend/services/audio.py
Normal file
@@ -0,0 +1,479 @@
|
||||
"""Server-side audio service for Loopback routing"""
|
||||
|
||||
import sounddevice as sd
|
||||
import numpy as np
|
||||
import threading
|
||||
import queue
|
||||
import json
|
||||
from pathlib import Path
|
||||
from typing import Optional, Callable
|
||||
import wave
|
||||
import time
|
||||
|
||||
# Settings file path
|
||||
SETTINGS_FILE = Path(__file__).parent.parent.parent / "audio_settings.json"
|
||||
|
||||
|
||||
class AudioService:
|
||||
"""Manages audio I/O with multi-channel support for Loopback routing"""
|
||||
|
||||
def __init__(self):
|
||||
# Device configuration
|
||||
self.input_device: Optional[int] = None
|
||||
self.input_channel: int = 1 # 1-indexed channel
|
||||
|
||||
self.output_device: Optional[int] = None # Single output device (multi-channel)
|
||||
self.caller_channel: int = 1 # Channel for caller TTS
|
||||
self.music_channel: int = 2 # Channel for music
|
||||
self.sfx_channel: int = 3 # Channel for SFX
|
||||
self.phone_filter: bool = False # Phone filter on caller voices
|
||||
|
||||
# Recording state
|
||||
self._recording = False
|
||||
self._record_thread: Optional[threading.Thread] = None
|
||||
self._audio_queue: queue.Queue = queue.Queue()
|
||||
self._recorded_audio: list = []
|
||||
self._record_device_sr: int = 48000
|
||||
|
||||
# Music playback state
|
||||
self._music_stream: Optional[sd.OutputStream] = None
|
||||
self._music_data: Optional[np.ndarray] = None
|
||||
self._music_resampled: Optional[np.ndarray] = None
|
||||
self._music_position: int = 0
|
||||
self._music_playing: bool = False
|
||||
self._music_volume: float = 0.3
|
||||
self._music_loop: bool = True
|
||||
|
||||
# Caller playback state
|
||||
self._caller_stop_event = threading.Event()
|
||||
self._caller_thread: Optional[threading.Thread] = None
|
||||
|
||||
# Sample rates
|
||||
self.input_sample_rate = 16000 # For Whisper
|
||||
self.output_sample_rate = 24000 # For TTS
|
||||
|
||||
# Load saved settings
|
||||
self._load_settings()
|
||||
|
||||
def _load_settings(self):
|
||||
"""Load settings from disk"""
|
||||
if SETTINGS_FILE.exists():
|
||||
try:
|
||||
with open(SETTINGS_FILE) as f:
|
||||
data = json.load(f)
|
||||
self.input_device = data.get("input_device")
|
||||
self.input_channel = data.get("input_channel", 1)
|
||||
self.output_device = data.get("output_device")
|
||||
self.caller_channel = data.get("caller_channel", 1)
|
||||
self.music_channel = data.get("music_channel", 2)
|
||||
self.sfx_channel = data.get("sfx_channel", 3)
|
||||
self.phone_filter = data.get("phone_filter", False)
|
||||
print(f"Loaded audio settings: output={self.output_device}, channels={self.caller_channel}/{self.music_channel}/{self.sfx_channel}, phone_filter={self.phone_filter}")
|
||||
except Exception as e:
|
||||
print(f"Failed to load audio settings: {e}")
|
||||
|
||||
def _save_settings(self):
|
||||
"""Save settings to disk"""
|
||||
try:
|
||||
data = {
|
||||
"input_device": self.input_device,
|
||||
"input_channel": self.input_channel,
|
||||
"output_device": self.output_device,
|
||||
"caller_channel": self.caller_channel,
|
||||
"music_channel": self.music_channel,
|
||||
"sfx_channel": self.sfx_channel,
|
||||
"phone_filter": self.phone_filter,
|
||||
}
|
||||
with open(SETTINGS_FILE, "w") as f:
|
||||
json.dump(data, f, indent=2)
|
||||
print(f"Saved audio settings")
|
||||
except Exception as e:
|
||||
print(f"Failed to save audio settings: {e}")
|
||||
|
||||
def list_devices(self) -> list[dict]:
|
||||
"""List all available audio devices"""
|
||||
devices = sd.query_devices()
|
||||
result = []
|
||||
for i, d in enumerate(devices):
|
||||
result.append({
|
||||
"id": i,
|
||||
"name": d["name"],
|
||||
"inputs": d["max_input_channels"],
|
||||
"outputs": d["max_output_channels"],
|
||||
"default_sr": d["default_samplerate"]
|
||||
})
|
||||
return result
|
||||
|
||||
def set_devices(
|
||||
self,
|
||||
input_device: Optional[int] = None,
|
||||
input_channel: Optional[int] = None,
|
||||
output_device: Optional[int] = None,
|
||||
caller_channel: Optional[int] = None,
|
||||
music_channel: Optional[int] = None,
|
||||
sfx_channel: Optional[int] = None,
|
||||
phone_filter: Optional[bool] = None
|
||||
):
|
||||
"""Configure audio devices and channels"""
|
||||
if input_device is not None:
|
||||
self.input_device = input_device
|
||||
if input_channel is not None:
|
||||
self.input_channel = input_channel
|
||||
if output_device is not None:
|
||||
self.output_device = output_device
|
||||
if caller_channel is not None:
|
||||
self.caller_channel = caller_channel
|
||||
if music_channel is not None:
|
||||
self.music_channel = music_channel
|
||||
if sfx_channel is not None:
|
||||
self.sfx_channel = sfx_channel
|
||||
if phone_filter is not None:
|
||||
self.phone_filter = phone_filter
|
||||
|
||||
# Persist to disk
|
||||
self._save_settings()
|
||||
|
||||
def get_device_settings(self) -> dict:
|
||||
"""Get current device configuration"""
|
||||
return {
|
||||
"input_device": self.input_device,
|
||||
"input_channel": self.input_channel,
|
||||
"output_device": self.output_device,
|
||||
"caller_channel": self.caller_channel,
|
||||
"music_channel": self.music_channel,
|
||||
"sfx_channel": self.sfx_channel,
|
||||
"phone_filter": self.phone_filter,
|
||||
}
|
||||
|
||||
# --- Recording ---
|
||||
|
||||
def start_recording(self) -> bool:
|
||||
"""Start recording from input device"""
|
||||
if self._recording:
|
||||
return False
|
||||
|
||||
if self.input_device is None:
|
||||
print("No input device configured")
|
||||
return False
|
||||
|
||||
self._recording = True
|
||||
self._recorded_audio = []
|
||||
self._record_thread = threading.Thread(target=self._record_worker)
|
||||
self._record_thread.start()
|
||||
print(f"Recording started from device {self.input_device}")
|
||||
return True
|
||||
|
||||
def stop_recording(self) -> bytes:
|
||||
"""Stop recording and return audio data resampled to 16kHz for Whisper"""
|
||||
import librosa
|
||||
|
||||
if not self._recording:
|
||||
return b""
|
||||
|
||||
self._recording = False
|
||||
if self._record_thread:
|
||||
self._record_thread.join(timeout=2.0)
|
||||
|
||||
if not self._recorded_audio:
|
||||
return b""
|
||||
|
||||
# Combine all chunks
|
||||
audio = np.concatenate(self._recorded_audio)
|
||||
device_sr = getattr(self, '_record_device_sr', 48000)
|
||||
print(f"Recording stopped: {len(audio)} samples @ {device_sr}Hz ({len(audio)/device_sr:.2f}s)")
|
||||
|
||||
# Resample to 16kHz for Whisper
|
||||
if device_sr != 16000:
|
||||
audio = librosa.resample(audio, orig_sr=device_sr, target_sr=16000)
|
||||
print(f"Resampled to 16kHz: {len(audio)} samples")
|
||||
|
||||
# Convert to bytes (16-bit PCM)
|
||||
audio_int16 = (audio * 32767).astype(np.int16)
|
||||
return audio_int16.tobytes()
|
||||
|
||||
def _record_worker(self):
|
||||
"""Background thread for recording from specific channel"""
|
||||
try:
|
||||
# Get device info
|
||||
device_info = sd.query_devices(self.input_device)
|
||||
max_channels = device_info['max_input_channels']
|
||||
device_sr = int(device_info['default_samplerate'])
|
||||
record_channel = min(self.input_channel, max_channels) - 1
|
||||
|
||||
# Store device sample rate for later resampling
|
||||
self._record_device_sr = device_sr
|
||||
|
||||
print(f"Recording from device {self.input_device} ch {self.input_channel} @ {device_sr}Hz")
|
||||
|
||||
def callback(indata, frames, time_info, status):
|
||||
if status:
|
||||
print(f"Record status: {status}")
|
||||
if self._recording:
|
||||
self._recorded_audio.append(indata[:, record_channel].copy())
|
||||
|
||||
with sd.InputStream(
|
||||
device=self.input_device,
|
||||
channels=max_channels,
|
||||
samplerate=device_sr, # Use device's native rate
|
||||
dtype=np.float32,
|
||||
callback=callback,
|
||||
blocksize=1024
|
||||
):
|
||||
while self._recording:
|
||||
time.sleep(0.05)
|
||||
|
||||
except Exception as e:
|
||||
print(f"Recording error: {e}")
|
||||
self._recording = False
|
||||
|
||||
# --- Caller TTS Playback ---
|
||||
|
||||
def _apply_fade(self, audio: np.ndarray, sample_rate: int, fade_ms: int = 15) -> np.ndarray:
|
||||
"""Apply fade-in and fade-out to avoid clicks"""
|
||||
fade_samples = int(sample_rate * fade_ms / 1000)
|
||||
if len(audio) < fade_samples * 2:
|
||||
return audio
|
||||
|
||||
# Fade in
|
||||
fade_in = np.linspace(0, 1, fade_samples)
|
||||
audio[:fade_samples] *= fade_in
|
||||
|
||||
# Fade out
|
||||
fade_out = np.linspace(1, 0, fade_samples)
|
||||
audio[-fade_samples:] *= fade_out
|
||||
|
||||
return audio
|
||||
|
||||
def play_caller_audio(self, audio_bytes: bytes, sample_rate: int = 24000):
|
||||
"""Play caller TTS audio to specific channel of output device (interruptible)"""
|
||||
import librosa
|
||||
|
||||
# Stop any existing caller audio
|
||||
self.stop_caller_audio()
|
||||
self._caller_stop_event.clear()
|
||||
|
||||
# Convert bytes to numpy
|
||||
audio = np.frombuffer(audio_bytes, dtype=np.int16).astype(np.float32) / 32768.0
|
||||
|
||||
if self.output_device is None:
|
||||
print("No output device configured, using default")
|
||||
audio = self._apply_fade(audio, sample_rate)
|
||||
with sd.OutputStream(samplerate=sample_rate, channels=1, dtype=np.float32) as stream:
|
||||
stream.write(audio.reshape(-1, 1))
|
||||
return
|
||||
|
||||
try:
|
||||
# Get device info and resample to device's native rate
|
||||
device_info = sd.query_devices(self.output_device)
|
||||
num_channels = device_info['max_output_channels']
|
||||
device_sr = int(device_info['default_samplerate'])
|
||||
channel_idx = min(self.caller_channel, num_channels) - 1
|
||||
|
||||
# Resample if needed
|
||||
if sample_rate != device_sr:
|
||||
audio = librosa.resample(audio, orig_sr=sample_rate, target_sr=device_sr)
|
||||
|
||||
# Apply fade to prevent clicks
|
||||
audio = self._apply_fade(audio, device_sr)
|
||||
|
||||
# Create multi-channel output with audio only on target channel
|
||||
multi_ch = np.zeros((len(audio), num_channels), dtype=np.float32)
|
||||
multi_ch[:, channel_idx] = audio
|
||||
|
||||
print(f"Playing caller audio to device {self.output_device} ch {self.caller_channel} @ {device_sr}Hz")
|
||||
|
||||
# Play in chunks so we can interrupt
|
||||
chunk_size = int(device_sr * 0.1) # 100ms chunks
|
||||
pos = 0
|
||||
|
||||
with sd.OutputStream(
|
||||
device=self.output_device,
|
||||
samplerate=device_sr,
|
||||
channels=num_channels,
|
||||
dtype=np.float32
|
||||
) as stream:
|
||||
while pos < len(multi_ch) and not self._caller_stop_event.is_set():
|
||||
end = min(pos + chunk_size, len(multi_ch))
|
||||
stream.write(multi_ch[pos:end])
|
||||
pos = end
|
||||
|
||||
if self._caller_stop_event.is_set():
|
||||
print("Caller audio stopped early")
|
||||
else:
|
||||
print(f"Played caller audio: {len(audio)/device_sr:.2f}s")
|
||||
|
||||
except Exception as e:
|
||||
print(f"Caller playback error: {e}")
|
||||
|
||||
def stop_caller_audio(self):
|
||||
"""Stop any playing caller audio"""
|
||||
self._caller_stop_event.set()
|
||||
|
||||
# --- Music Playback ---
|
||||
|
||||
def load_music(self, file_path: str) -> bool:
|
||||
"""Load a music file for playback"""
|
||||
path = Path(file_path)
|
||||
if not path.exists():
|
||||
print(f"Music file not found: {file_path}")
|
||||
return False
|
||||
|
||||
try:
|
||||
import librosa
|
||||
audio, sr = librosa.load(str(path), sr=self.output_sample_rate, mono=True)
|
||||
self._music_data = audio.astype(np.float32)
|
||||
self._music_position = 0
|
||||
print(f"Loaded music: {path.name} ({len(audio)/sr:.1f}s)")
|
||||
return True
|
||||
except Exception as e:
|
||||
print(f"Failed to load music: {e}")
|
||||
return False
|
||||
|
||||
def play_music(self):
|
||||
"""Start music playback to specific channel"""
|
||||
import librosa
|
||||
|
||||
if self._music_data is None:
|
||||
print("No music loaded")
|
||||
return
|
||||
|
||||
if self._music_playing:
|
||||
self.stop_music()
|
||||
|
||||
self._music_playing = True
|
||||
self._music_position = 0
|
||||
|
||||
if self.output_device is None:
|
||||
print("No output device configured, using default")
|
||||
num_channels = 2
|
||||
device = None
|
||||
device_sr = self.output_sample_rate
|
||||
channel_idx = 0
|
||||
else:
|
||||
device_info = sd.query_devices(self.output_device)
|
||||
num_channels = device_info['max_output_channels']
|
||||
device_sr = int(device_info['default_samplerate'])
|
||||
device = self.output_device
|
||||
channel_idx = min(self.music_channel, num_channels) - 1
|
||||
|
||||
# Resample music to device sample rate if needed
|
||||
if self.output_sample_rate != device_sr:
|
||||
self._music_resampled = librosa.resample(
|
||||
self._music_data, orig_sr=self.output_sample_rate, target_sr=device_sr
|
||||
)
|
||||
else:
|
||||
self._music_resampled = self._music_data.copy()
|
||||
|
||||
# Apply fade-in at start of track
|
||||
fade_samples = int(device_sr * 0.015) # 15ms fade
|
||||
if len(self._music_resampled) > fade_samples:
|
||||
fade_in = np.linspace(0, 1, fade_samples).astype(np.float32)
|
||||
self._music_resampled[:fade_samples] *= fade_in
|
||||
|
||||
def callback(outdata, frames, time_info, status):
|
||||
outdata.fill(0)
|
||||
|
||||
if not self._music_playing or self._music_resampled is None:
|
||||
return
|
||||
|
||||
end_pos = self._music_position + frames
|
||||
|
||||
if end_pos <= len(self._music_resampled):
|
||||
outdata[:, channel_idx] = self._music_resampled[self._music_position:end_pos] * self._music_volume
|
||||
self._music_position = end_pos
|
||||
else:
|
||||
remaining = len(self._music_resampled) - self._music_position
|
||||
if remaining > 0:
|
||||
outdata[:remaining, channel_idx] = self._music_resampled[self._music_position:] * self._music_volume
|
||||
|
||||
if self._music_loop:
|
||||
self._music_position = 0
|
||||
wrap_frames = frames - remaining
|
||||
if wrap_frames > 0:
|
||||
outdata[remaining:, channel_idx] = self._music_resampled[:wrap_frames] * self._music_volume
|
||||
self._music_position = wrap_frames
|
||||
else:
|
||||
self._music_playing = False
|
||||
|
||||
try:
|
||||
self._music_stream = sd.OutputStream(
|
||||
device=device,
|
||||
channels=num_channels,
|
||||
samplerate=device_sr,
|
||||
dtype=np.float32,
|
||||
callback=callback,
|
||||
blocksize=2048
|
||||
)
|
||||
self._music_stream.start()
|
||||
print(f"Music playback started on ch {self.music_channel} @ {device_sr}Hz")
|
||||
except Exception as e:
|
||||
print(f"Music playback error: {e}")
|
||||
self._music_playing = False
|
||||
|
||||
def stop_music(self):
|
||||
"""Stop music playback"""
|
||||
self._music_playing = False
|
||||
if self._music_stream:
|
||||
self._music_stream.stop()
|
||||
self._music_stream.close()
|
||||
self._music_stream = None
|
||||
self._music_position = 0
|
||||
print("Music stopped")
|
||||
|
||||
def set_music_volume(self, volume: float):
|
||||
"""Set music volume (0.0 to 1.0)"""
|
||||
self._music_volume = max(0.0, min(1.0, volume))
|
||||
|
||||
def is_music_playing(self) -> bool:
|
||||
"""Check if music is currently playing"""
|
||||
return self._music_playing
|
||||
|
||||
# --- SFX Playback ---
|
||||
|
||||
def play_sfx(self, file_path: str):
|
||||
"""Play a sound effect to specific channel using dedicated stream"""
|
||||
path = Path(file_path)
|
||||
if not path.exists():
|
||||
print(f"SFX file not found: {file_path}")
|
||||
return
|
||||
|
||||
try:
|
||||
import librosa
|
||||
|
||||
if self.output_device is None:
|
||||
audio, sr = librosa.load(str(path), sr=None, mono=True)
|
||||
audio = self._apply_fade(audio, sr)
|
||||
def play():
|
||||
# Use a dedicated stream instead of sd.play()
|
||||
with sd.OutputStream(samplerate=sr, channels=1, dtype=np.float32) as stream:
|
||||
stream.write(audio.reshape(-1, 1))
|
||||
else:
|
||||
device_info = sd.query_devices(self.output_device)
|
||||
num_channels = device_info['max_output_channels']
|
||||
device_sr = int(device_info['default_samplerate'])
|
||||
channel_idx = min(self.sfx_channel, num_channels) - 1
|
||||
|
||||
audio, _ = librosa.load(str(path), sr=device_sr, mono=True)
|
||||
audio = self._apply_fade(audio, device_sr)
|
||||
|
||||
multi_ch = np.zeros((len(audio), num_channels), dtype=np.float32)
|
||||
multi_ch[:, channel_idx] = audio
|
||||
|
||||
def play():
|
||||
# Use dedicated stream to avoid interrupting other audio
|
||||
with sd.OutputStream(
|
||||
device=self.output_device,
|
||||
samplerate=device_sr,
|
||||
channels=num_channels,
|
||||
dtype=np.float32
|
||||
) as stream:
|
||||
stream.write(multi_ch)
|
||||
|
||||
threading.Thread(target=play, daemon=True).start()
|
||||
print(f"Playing SFX: {path.name} on ch {self.sfx_channel}")
|
||||
except Exception as e:
|
||||
print(f"SFX playback error: {e}")
|
||||
|
||||
|
||||
# Global instance
|
||||
audio_service = AudioService()
|
||||
112
backend/services/edge_tts_service.py
Normal file
112
backend/services/edge_tts_service.py
Normal file
@@ -0,0 +1,112 @@
|
||||
"""Edge TTS service - free Microsoft TTS API"""
|
||||
|
||||
import asyncio
|
||||
import io
|
||||
import numpy as np
|
||||
from typing import Optional
|
||||
|
||||
try:
    import edge_tts
    EDGE_TTS_AVAILABLE = True
except ImportError:
    EDGE_TTS_AVAILABLE = False


class EdgeTTSService:
    """TTS using Microsoft Edge's free API."""

    def __init__(self):
        # Edge TTS renders at 24 kHz; _mp3_to_pcm converts to this rate.
        self.sample_rate = 24000

    def is_available(self) -> bool:
        """True when the optional edge-tts package is importable."""
        return EDGE_TTS_AVAILABLE

    async def generate_speech(self, text: str, voice: str = "en-US-JennyNeural") -> bytes:
        """Generate speech from text using Edge TTS.

        Args:
            text: Text to synthesize
            voice: Edge TTS voice name (e.g., "en-US-JennyNeural")

        Returns:
            Raw PCM audio bytes (16-bit signed int, 24kHz mono)

        Raises:
            RuntimeError: when edge-tts is missing or synthesis yields no audio.
        """
        if not EDGE_TTS_AVAILABLE:
            raise RuntimeError("edge-tts not installed. Run: pip install edge-tts")

        communicate = edge_tts.Communicate(text, voice)

        # The stream interleaves audio and metadata chunks; keep only audio.
        mp3_data = b''
        async for chunk in communicate.stream():
            if chunk['type'] == 'audio':
                mp3_data += chunk['data']

        if not mp3_data:
            raise RuntimeError("No audio generated")

        # Convert MP3 to PCM
        return await self._mp3_to_pcm(mp3_data)

    async def _mp3_to_pcm(self, mp3_data: bytes) -> bytes:
        """Convert MP3 bytes to raw 24 kHz mono 16-bit PCM.

        Tries pydub first, falling back to an ffmpeg subprocess. The
        conversion runs in a thread-pool executor so it does not block
        the event loop.
        """
        # get_running_loop() is the modern way to grab the loop from inside a
        # coroutine; get_event_loop() is deprecated here since Python 3.10.
        loop = asyncio.get_running_loop()

        def convert():
            try:
                # Try pydub first (more reliable)
                from pydub import AudioSegment
                audio = AudioSegment.from_mp3(io.BytesIO(mp3_data))
                # Convert to 24kHz mono 16-bit
                audio = audio.set_frame_rate(24000).set_channels(1).set_sample_width(2)
                return audio.raw_data
            except ImportError:
                pass

            # Fallback to ffmpeg subprocess
            import subprocess
            process = subprocess.Popen(
                [
                    'ffmpeg', '-i', 'pipe:0',
                    '-f', 's16le',
                    '-acodec', 'pcm_s16le',
                    '-ar', '24000',
                    '-ac', '1',
                    'pipe:1'
                ],
                stdin=subprocess.PIPE,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE
            )
            pcm_data, stderr = process.communicate(input=mp3_data)
            if process.returncode != 0:
                raise RuntimeError(f"ffmpeg failed: {stderr.decode()}")
            return pcm_data

        return await loop.run_in_executor(None, convert)

    async def list_voices(self) -> list[dict]:
        """List available English-locale Edge TTS voices (id, name, gender, locale)."""
        if not EDGE_TTS_AVAILABLE:
            return []

        voices = await edge_tts.list_voices()
        return [
            {
                "id": v["ShortName"],
                "name": v["ShortName"].replace("Neural", ""),
                "gender": v["Gender"],
                "locale": v["Locale"],
            }
            for v in voices
            if v["Locale"].startswith("en-")
        ]


# Global instance
edge_tts_service = EdgeTTSService()


def is_edge_tts_available() -> bool:
    """Module-level convenience wrapper around the singleton's availability check."""
    return edge_tts_service.is_available()
|
||||
175
backend/services/llm.py
Normal file
175
backend/services/llm.py
Normal file
@@ -0,0 +1,175 @@
|
||||
"""LLM service with OpenRouter and Ollama support"""
|
||||
|
||||
import httpx
|
||||
from typing import Optional
|
||||
from ..config import settings
|
||||
|
||||
|
||||
# Available OpenRouter models
OPENROUTER_MODELS = [
    "anthropic/claude-3-haiku",
    "anthropic/claude-3.5-sonnet",
    "openai/gpt-4o-mini",
    "openai/gpt-4o",
    "google/gemini-flash-1.5",
    "google/gemini-pro-1.5",
    "meta-llama/llama-3.1-8b-instruct",
    "mistralai/mistral-7b-instruct",
]


class LLMService:
    """Abstraction layer over the supported LLM providers (OpenRouter / Ollama).

    Holds the currently selected provider/models and exposes a uniform async
    generate() entry point that returns canned on-air filler text on failure
    instead of raising.
    """

    def __init__(self):
        # Seed runtime-tunable values from the application settings.
        self.provider = settings.llm_provider
        self.openrouter_model = settings.openrouter_model
        self.ollama_model = settings.ollama_model
        self.ollama_host = settings.ollama_host
        self.tts_provider = settings.tts_provider

    def update_settings(
        self,
        provider: Optional[str] = None,
        openrouter_model: Optional[str] = None,
        ollama_model: Optional[str] = None,
        ollama_host: Optional[str] = None,
        tts_provider: Optional[str] = None
    ):
        """Update any subset of the LLM/TTS settings; falsy values are ignored."""
        if provider:
            self.provider = provider
        if openrouter_model:
            self.openrouter_model = openrouter_model
        if ollama_model:
            self.ollama_model = ollama_model
        if ollama_host:
            self.ollama_host = ollama_host
        if tts_provider:
            self.tts_provider = tts_provider
            # Also update the global settings so the TTS service picks it up.
            settings.tts_provider = tts_provider

    async def get_ollama_models(self) -> list[str]:
        """Fetch installed model names from the Ollama host ([] on any failure)."""
        try:
            async with httpx.AsyncClient(timeout=5.0) as client:
                response = await client.get(f"{self.ollama_host}/api/tags")
                response.raise_for_status()
                data = response.json()
                return [model["name"] for model in data.get("models", [])]
        except Exception as e:
            print(f"Failed to fetch Ollama models: {e}")
            return []

    def get_settings(self) -> dict:
        """Current settings snapshot (Ollama model list left empty — see async variant)."""
        return {
            "provider": self.provider,
            "openrouter_model": self.openrouter_model,
            "ollama_model": self.ollama_model,
            "ollama_host": self.ollama_host,
            "tts_provider": self.tts_provider,
            "available_openrouter_models": OPENROUTER_MODELS,
            "available_ollama_models": []  # Fetched separately
        }

    async def get_settings_async(self) -> dict:
        """Like get_settings(), but with the live Ollama model list filled in."""
        # Reuse the sync snapshot instead of duplicating the dict literal.
        result = self.get_settings()
        result["available_ollama_models"] = await self.get_ollama_models()
        return result

    async def generate(
        self,
        messages: list[dict],
        system_prompt: Optional[str] = None
    ) -> str:
        """
        Generate a response from the LLM.

        Args:
            messages: List of message dicts with 'role' and 'content'
            system_prompt: Optional system prompt to prepend

        Returns:
            Generated text response
        """
        if system_prompt:
            messages = [{"role": "system", "content": system_prompt}] + messages

        if self.provider == "openrouter":
            return await self._call_openrouter(messages)
        return await self._call_ollama(messages)

    async def _call_openrouter(self, messages: list[dict]) -> str:
        """Call OpenRouter, retrying once on timeout; never raises to the caller."""
        for attempt in range(2):  # Try twice
            try:
                async with httpx.AsyncClient(timeout=30.0) as client:
                    response = await client.post(
                        "https://openrouter.ai/api/v1/chat/completions",
                        headers={
                            "Authorization": f"Bearer {settings.openrouter_api_key}",
                            "Content-Type": "application/json",
                        },
                        json={
                            "model": self.openrouter_model,
                            "messages": messages,
                            "max_tokens": 100,
                        },
                    )
                    response.raise_for_status()
                    data = response.json()
                    return data["choices"][0]["message"]["content"]
            except (httpx.TimeoutException, httpx.ReadTimeout):
                print(f"OpenRouter timeout (attempt {attempt + 1})")
                if attempt == 0:
                    continue  # Retry once
                return "Uh, sorry, I lost you there for a second. What was that?"
            except Exception as e:
                print(f"OpenRouter error: {e}")
                return "Yeah... I don't know, man."
        return "Uh, hold on a sec..."

    async def _call_ollama(self, messages: list[dict]) -> str:
        """Call the local Ollama chat API; returns canned filler text on failure."""
        try:
            async with httpx.AsyncClient() as client:
                response = await client.post(
                    f"{self.ollama_host}/api/chat",
                    json={
                        "model": self.ollama_model,
                        "messages": messages,
                        "stream": False,
                        "options": {
                            "num_predict": 100,     # Allow complete thoughts
                            "temperature": 0.8,     # Balanced creativity/coherence
                            "top_p": 0.9,           # Focused word choices
                            "repeat_penalty": 1.3,  # Avoid repetition
                            "top_k": 50,            # Reasonable token variety
                        },
                    },
                    timeout=30.0
                )
                response.raise_for_status()
                data = response.json()
                return data["message"]["content"]
        except httpx.TimeoutException:
            print("Ollama timeout")
            return "Uh, sorry, I lost you there for a second. What was that?"
        except Exception as e:
            print(f"Ollama error: {e}")
            return "Yeah... I don't know, man."
|
||||
# Global instance
|
||||
llm_service = LLMService()
|
||||
144
backend/services/piper_tts.py
Normal file
144
backend/services/piper_tts.py
Normal file
@@ -0,0 +1,144 @@
|
||||
"""Piper TTS service using sherpa-onnx for fast local voice synthesis"""
|
||||
|
||||
import asyncio
|
||||
import numpy as np
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
# Models directory
MODELS_DIR = Path(__file__).parent.parent.parent / "models" / "sherpa"

# Try to import sherpa-onnx
try:
    import sherpa_onnx
    SHERPA_AVAILABLE = True
except ImportError:
    SHERPA_AVAILABLE = False
    sherpa_onnx = None


# Available sherpa-onnx Piper models
PIPER_MODELS = {
    "amy": {
        "dir": "vits-piper-en_US-amy-low",
        "model": "en_US-amy-low.onnx",
        "name": "Amy (US Female)",
        "sample_rate": 16000,
    },
    "joe": {
        "dir": "vits-piper-en_US-joe-medium",
        "model": "en_US-joe-medium.onnx",
        "name": "Joe (US Male)",
        "sample_rate": 22050,
    },
    "lessac": {
        "dir": "vits-piper-en_US-lessac-medium",
        "model": "en_US-lessac-medium.onnx",
        "name": "Lessac (US Female)",
        "sample_rate": 22050,
    },
    "alan": {
        "dir": "vits-piper-en_GB-alan-medium",
        "model": "en_GB-alan-medium.onnx",
        "name": "Alan (UK Male)",
        "sample_rate": 22050,
    },
}


class PiperTTSService:
    """Fast local TTS using sherpa-onnx with Piper VITS models."""

    def __init__(self):
        # Everything is resampled to this rate before returning.
        self.output_sample_rate = 24000
        # Lazily-created engines, keyed by model key ("amy", "joe", ...).
        # (The original annotated this dict[str, any], which names the
        # builtin any() function, not a type.)
        self._tts_engines: dict[str, object] = {}

    def is_available(self) -> bool:
        """Check if sherpa-onnx is available."""
        return SHERPA_AVAILABLE

    def _get_engine(self, model_key: str):
        """Return (engine, native_sample_rate) for model_key, creating and caching it.

        Raises:
            ValueError: unknown model key.
            RuntimeError: the model files are not downloaded.
        """
        if model_key in self._tts_engines:
            return self._tts_engines[model_key], PIPER_MODELS[model_key]["sample_rate"]

        if model_key not in PIPER_MODELS:
            raise ValueError(f"Unknown model: {model_key}")

        model_info = PIPER_MODELS[model_key]
        model_dir = MODELS_DIR / model_info["dir"]

        if not model_dir.exists():
            raise RuntimeError(f"Model not found: {model_dir}")

        config = sherpa_onnx.OfflineTtsConfig(
            model=sherpa_onnx.OfflineTtsModelConfig(
                vits=sherpa_onnx.OfflineTtsVitsModelConfig(
                    model=str(model_dir / model_info["model"]),
                    tokens=str(model_dir / "tokens.txt"),
                    data_dir=str(model_dir / "espeak-ng-data"),
                ),
                num_threads=2,
            ),
        )
        tts = sherpa_onnx.OfflineTts(config)
        self._tts_engines[model_key] = tts
        return tts, model_info["sample_rate"]

    async def generate_speech(self, text: str, model_key: str = "amy") -> bytes:
        """Generate speech from text using sherpa-onnx.

        Args:
            text: Text to synthesize
            model_key: Model key (amy, joe, lessac, alan)

        Returns:
            Raw PCM audio bytes (16-bit signed int, 24kHz mono)

        Raises:
            RuntimeError: sherpa-onnx missing or model files absent.
            ValueError: unknown model key.
        """
        if not SHERPA_AVAILABLE:
            raise RuntimeError("sherpa-onnx not installed. Run: pip install sherpa-onnx")

        # get_running_loop() is the non-deprecated call inside a coroutine.
        loop = asyncio.get_running_loop()

        def run_tts():
            tts, model_sample_rate = self._get_engine(model_key)
            audio = tts.generate(text)
            samples = np.array(audio.samples, dtype=np.float32)

            # Linear-interpolation resample to 24 kHz when the model differs.
            if model_sample_rate != self.output_sample_rate:
                ratio = self.output_sample_rate / model_sample_rate
                new_length = int(len(samples) * ratio)
                samples = np.interp(
                    np.linspace(0, len(samples) - 1, new_length),
                    np.arange(len(samples)),
                    samples
                ).astype(np.float32)

            # Clamp before int16 conversion so out-of-range floats don't wrap.
            samples = np.clip(samples, -1.0, 1.0)
            return (samples * 32767).astype(np.int16).tobytes()

        return await loop.run_in_executor(None, run_tts)

    def list_available_models(self) -> list[dict]:
        """List models whose files exist on disk (id, name, sample_rate)."""
        return [
            {"id": key, "name": info["name"], "sample_rate": info["sample_rate"]}
            for key, info in PIPER_MODELS.items()
            if (MODELS_DIR / info["dir"]).exists()
        ]


# Global instance
piper_service = PiperTTSService()


def is_piper_available() -> bool:
    """Check if Piper (sherpa-onnx) is available"""
    return piper_service.is_available()
|
||||
116
backend/services/transcription.py
Normal file
116
backend/services/transcription.py
Normal file
@@ -0,0 +1,116 @@
|
||||
"""Whisper transcription service"""
|
||||
|
||||
import tempfile
|
||||
import numpy as np
|
||||
from faster_whisper import WhisperModel
|
||||
import librosa
|
||||
|
||||
# Global model instance (loaded once)
|
||||
_whisper_model = None
|
||||
|
||||
|
||||
def get_whisper_model() -> WhisperModel:
    """Get or create Whisper model instance"""
    global _whisper_model
    if _whisper_model is None:
        print("Loading Whisper tiny model for fast transcription...")
        # Use tiny model for speed - about 3-4x faster than base
        # beam_size=1 and best_of=1 for fastest inference
        # int8 quantization keeps CPU inference memory-light.
        # NOTE(review): this lazy init is not thread-safe; two concurrent
        # first calls could both load the model — confirm requests are
        # serialized at startup.
        _whisper_model = WhisperModel("tiny", device="cpu", compute_type="int8")
        print("Whisper model loaded")
    return _whisper_model
|
||||
|
||||
|
||||
def decode_audio(audio_data: bytes, source_sample_rate: int = None) -> tuple[np.ndarray, int]:
    """
    Decode audio from various formats to numpy array.

    Args:
        audio_data: Raw audio bytes
        source_sample_rate: If provided, treat as raw PCM at this sample rate

    Returns:
        Tuple of (audio array as float32, sample rate)
    """
    import os

    # If sample rate is provided, assume raw PCM (from server-side recording)
    if source_sample_rate is not None:
        print(f"Decoding raw PCM at {source_sample_rate}Hz, {len(audio_data)} bytes")
        # Pad odd-length buffers so the int16 view lines up.
        if len(audio_data) % 2 != 0:
            audio_data = audio_data + b'\x00'
        audio = np.frombuffer(audio_data, dtype=np.int16).astype(np.float32) / 32768.0
        return audio, source_sample_rate

    print(f"First 20 bytes: {audio_data[:20].hex()}")

    # Try to decode with librosa first (handles webm, ogg, wav, mp3, etc via ffmpeg)
    try:
        with tempfile.NamedTemporaryFile(suffix='.webm', delete=False) as f:
            f.write(audio_data)
            temp_path = f.name

        try:
            audio, sample_rate = librosa.load(temp_path, sr=None, mono=True)
            print(f"Decoded with librosa: {len(audio)} samples at {sample_rate}Hz")
            return audio.astype(np.float32), sample_rate
        finally:
            # BUGFIX: the temp file was previously only unlinked on the
            # success path, leaking a file per failed decode. Always clean up.
            os.unlink(temp_path)

    except Exception as e:
        print(f"librosa decode failed: {e}, trying raw PCM at 16kHz...")

    # Fall back to raw PCM (16-bit signed int, 16kHz mono - Whisper's rate)
    if len(audio_data) % 2 != 0:
        audio_data = audio_data + b'\x00'

    audio = np.frombuffer(audio_data, dtype=np.int16).astype(np.float32) / 32768.0
    return audio, 16000
|
||||
|
||||
|
||||
async def transcribe_audio(audio_data: bytes, source_sample_rate: int = None) -> str:
    """
    Transcribe audio data to text using Whisper.

    Args:
        audio_data: Audio bytes (webm, ogg, wav, or raw PCM)
        source_sample_rate: If provided, treat audio_data as raw PCM at this rate

    Returns:
        Transcribed text (empty string for silent or empty audio)
    """
    model = get_whisper_model()

    print(f"Transcribing audio: {len(audio_data)} bytes")

    # Decode audio from whatever format
    audio, detected_sample_rate = decode_audio(audio_data, source_sample_rate)

    # BUGFIX: guard empty input — min()/max() below raise ValueError on an
    # empty array (e.g. zero-byte upload).
    if audio.size == 0:
        print("Warning: No audio samples decoded")
        return ""

    print(f"Audio samples: {len(audio)}, duration: {len(audio)/detected_sample_rate:.2f}s")
    print(f"Audio range: min={audio.min():.4f}, max={audio.max():.4f}")

    # Check if audio is too quiet
    if np.abs(audio).max() < 0.01:
        print("Warning: Audio appears to be silent or very quiet")
        return ""

    # Resample to 16kHz for Whisper
    if detected_sample_rate != 16000:
        audio_16k = librosa.resample(audio, orig_sr=detected_sample_rate, target_sr=16000)
        print(f"Resampled to {len(audio_16k)} samples at 16kHz")
    else:
        audio_16k = audio

    # Transcribe with speed optimizations
    segments, info = model.transcribe(
        audio_16k,
        beam_size=1,  # Faster, slightly less accurate
        best_of=1,
        language="en",  # Skip language detection
        vad_filter=True,  # Skip silence
    )
    # segments is a lazy generator; materialize before joining
    segments_list = list(segments)
    text = " ".join([s.text for s in segments_list]).strip()

    print(f"Transcription result: '{text}' (language: {info.language}, prob: {info.language_probability:.2f})")

    return text
|
||||
701
backend/services/tts.py
Normal file
701
backend/services/tts.py
Normal file
@@ -0,0 +1,701 @@
|
||||
"""TTS service with ElevenLabs, F5-TTS, MLX Kokoro, StyleTTS2, VITS, and Bark support"""
|
||||
|
||||
import os
|
||||
import numpy as np
|
||||
from scipy.signal import butter, filtfilt
|
||||
from pathlib import Path
|
||||
import tempfile
|
||||
import torch
|
||||
|
||||
from ..config import settings
|
||||
|
||||
# Patch torch.load for compatibility with PyTorch 2.6+
# PyTorch 2.6 changed the default to weights_only=True, which breaks loading
# the full-object TTS checkpoints used by the providers below. Default back to
# the old behavior — but, unlike a blanket override, honor an explicit
# weights_only=True from a caller who wants the safer mode.
_original_torch_load = torch.load
def _patched_torch_load(*args, **kwargs):
    # setdefault (not assignment) so an explicit weights_only kwarg wins.
    kwargs.setdefault('weights_only', False)
    return _original_torch_load(*args, **kwargs)
torch.load = _patched_torch_load
|
||||
|
||||
# Global clients
|
||||
_elevenlabs_client = None
|
||||
_vits_tts = None
|
||||
_bark_loaded = False
|
||||
_kokoro_model = None
|
||||
_styletts2_model = None
|
||||
_f5tts_model = None
|
||||
_chattts_model = None
|
||||
_chattts_speakers = {} # Cache for speaker embeddings
|
||||
|
||||
# Kokoro voice mapping - using highest-graded voices
|
||||
# Grades from https://huggingface.co/hexgrad/Kokoro-82M/blob/main/VOICES.md
|
||||
KOKORO_VOICES = {
|
||||
# Male voices (best available are C+ grade)
|
||||
"VR6AewLTigWG4xSOukaG": "am_fenrir", # Tony - deep/powerful (C+)
|
||||
"TxGEqnHWrfWFTfGW9XjX": "am_michael", # Rick - solid male voice (C+)
|
||||
"pNInz6obpgDQGcFmaJgB": "am_puck", # Dennis - anxious dad (C+)
|
||||
"ODq5zmih8GrVes37Dizd": "bm_george", # Earl - older/distinguished British (C)
|
||||
"IKne3meq5aSn9XLyUdCD": "bm_fable", # Marcus - young British (C)
|
||||
# Female voices (much better quality available)
|
||||
"jBpfuIE2acCO8z3wKNLl": "af_heart", # Jasmine - best quality (A)
|
||||
"EXAVITQu4vr4xnSDxMaL": "af_bella", # Megan - warm/friendly (A-)
|
||||
"21m00Tcm4TlvDq8ikWAM": "bf_emma", # Tanya - professional British (B-)
|
||||
"XB0fDUnXU5powFXDhCwa": "af_nicole", # Carla - Jersey mom (B-)
|
||||
"pFZP5JQG7iQjIQuC4Bku": "af_sarah", # Brenda - overthinker (C+)
|
||||
}
|
||||
|
||||
# Speed adjustments per voice (1.0 = normal, lower = slower/more natural)
|
||||
# Slower speeds (0.85-0.95) generally sound more natural
|
||||
KOKORO_SPEEDS = {
|
||||
# Male voices - slower speeds help with C+ grade voices
|
||||
"VR6AewLTigWG4xSOukaG": 0.9, # Tony (am_fenrir) - deep voice, slower
|
||||
"TxGEqnHWrfWFTfGW9XjX": 0.92, # Rick (am_michael) - solid pace
|
||||
"pNInz6obpgDQGcFmaJgB": 0.95, # Dennis (am_puck) - anxious but not rushed
|
||||
"ODq5zmih8GrVes37Dizd": 0.85, # Earl (bm_george) - older, slower British
|
||||
"IKne3meq5aSn9XLyUdCD": 0.95, # Marcus (bm_fable) - young, natural
|
||||
# Female voices - A-grade voices can handle faster speeds
|
||||
"jBpfuIE2acCO8z3wKNLl": 0.95, # Jasmine (af_heart) - best voice, natural pace
|
||||
"EXAVITQu4vr4xnSDxMaL": 0.95, # Megan (af_bella) - warm
|
||||
"21m00Tcm4TlvDq8ikWAM": 0.9, # Tanya (bf_emma) - professional British
|
||||
"XB0fDUnXU5powFXDhCwa": 0.95, # Carla (af_nicole) - animated but clear
|
||||
"pFZP5JQG7iQjIQuC4Bku": 0.92, # Brenda (af_sarah) - overthinker, measured
|
||||
}
|
||||
|
||||
DEFAULT_KOKORO_VOICE = "af_heart"
|
||||
DEFAULT_KOKORO_SPEED = 0.95
|
||||
|
||||
# VCTK speaker mapping - different voices for different callers
|
||||
VITS_SPEAKERS = {
|
||||
# Male voices
|
||||
"VR6AewLTigWG4xSOukaG": "p226", # Tony
|
||||
"TxGEqnHWrfWFTfGW9XjX": "p251", # Rick
|
||||
"pNInz6obpgDQGcFmaJgB": "p245", # Dennis
|
||||
"ODq5zmih8GrVes37Dizd": "p232", # Earl
|
||||
"IKne3meq5aSn9XLyUdCD": "p252", # Marcus
|
||||
# Female voices
|
||||
"jBpfuIE2acCO8z3wKNLl": "p225", # Jasmine
|
||||
"EXAVITQu4vr4xnSDxMaL": "p228", # Megan
|
||||
"21m00Tcm4TlvDq8ikWAM": "p229", # Tanya
|
||||
"XB0fDUnXU5powFXDhCwa": "p231", # Carla
|
||||
"pFZP5JQG7iQjIQuC4Bku": "p233", # Brenda
|
||||
}
|
||||
|
||||
DEFAULT_VITS_SPEAKER = "p225"
|
||||
|
||||
# Inworld voice mapping - maps ElevenLabs voice IDs to Inworld voices
|
||||
# Full voice list from API: Alex, Ashley, Blake, Carter, Clive, Craig, Deborah,
|
||||
# Dennis, Dominus, Edward, Elizabeth, Hades, Hana, Julia, Luna, Mark, Olivia,
|
||||
# Pixie, Priya, Ronald, Sarah, Shaun, Theodore, Timothy, Wendy
|
||||
INWORLD_VOICES = {
|
||||
# Male voices - each caller gets a unique voice matching their personality
|
||||
"VR6AewLTigWG4xSOukaG": "Edward", # Tony - fast-talking, emphatic, streetwise
|
||||
"TxGEqnHWrfWFTfGW9XjX": "Shaun", # Rick - friendly, dynamic, conversational
|
||||
"pNInz6obpgDQGcFmaJgB": "Alex", # Dennis - energetic, expressive, mildly nasal
|
||||
"ODq5zmih8GrVes37Dizd": "Craig", # Earl - older British, refined, articulate
|
||||
"IKne3meq5aSn9XLyUdCD": "Timothy", # Marcus - lively, upbeat American
|
||||
# Female voices - each caller gets a unique voice matching their personality
|
||||
"jBpfuIE2acCO8z3wKNLl": "Hana", # Jasmine - bright, expressive young female
|
||||
"EXAVITQu4vr4xnSDxMaL": "Ashley", # Megan - warm, natural female
|
||||
"21m00Tcm4TlvDq8ikWAM": "Wendy", # Tanya - posh, middle-aged British
|
||||
"XB0fDUnXU5powFXDhCwa": "Sarah", # Carla - fast-talking, questioning tone
|
||||
"pFZP5JQG7iQjIQuC4Bku": "Deborah", # Brenda - gentle, elegant
|
||||
}
|
||||
DEFAULT_INWORLD_VOICE = "Dennis"
|
||||
|
||||
|
||||
def preprocess_text_for_kokoro(text: str) -> str:
    """
    Preprocess text to improve Kokoro prosody and naturalness.

    - Adds slight pauses via punctuation
    - Handles contractions and abbreviations
    - Normalizes spacing

    Args:
        text: Raw dialogue text.

    Returns:
        Text with pause commas inserted and abbreviations expanded.
    """
    import re

    # Normalize whitespace
    text = ' '.join(text.split())

    # Add comma pauses after common transition words (if no punctuation follows)
    transitions = [
        r'\b(Well)\s+(?=[A-Za-z])',
        r'\b(So)\s+(?=[A-Za-z])',
        r'\b(Now)\s+(?=[A-Za-z])',
        r'\b(Look)\s+(?=[A-Za-z])',
        r'\b(See)\s+(?=[A-Za-z])',
        r'\b(Anyway)\s+(?=[A-Za-z])',
        r'\b(Actually)\s+(?=[A-Za-z])',
        r'\b(Honestly)\s+(?=[A-Za-z])',
        r'\b(Basically)\s+(?=[A-Za-z])',
    ]
    for pattern in transitions:
        text = re.sub(pattern, r'\1, ', text)

    # Add pause after "I mean" at start of sentence
    text = re.sub(r'^(I mean)\s+', r'\1, ', text)
    text = re.sub(r'\.\s+(I mean)\s+', r'. \1, ', text)

    # Expand common abbreviations for better pronunciation.
    # BUGFIX: 'w/o' must be expanded BEFORE 'w/' — previously the 'w/' rule
    # ran first and rewrote "w/o" to "witho", so "without" was never produced.
    # (Dicts preserve insertion order in Python 3.7+.)
    abbreviations = {
        r'\bw/o': 'without',
        r'\bw/': 'with',
        r'\bDr\.': 'Doctor',
        r'\bMr\.': 'Mister',
        r'\bMrs\.': 'Missus',
        r'\bMs\.': 'Miss',
        r'\bSt\.': 'Street',
        r'\bAve\.': 'Avenue',
        r'\betc\.': 'etcetera',
        r'\bvs\.': 'versus',
    }
    for abbr, expansion in abbreviations.items():
        text = re.sub(abbr, expansion, text, flags=re.IGNORECASE)

    # Add breath pause (comma) before conjunctions in long sentences
    text = re.sub(r'(\w{20,})\s+(and|but|or)\s+', r'\1, \2 ', text)

    # Ensure proper spacing after punctuation
    text = re.sub(r'([.!?])\s*([A-Z])', r'\1 \2', text)

    return text
|
||||
|
||||
# StyleTTS2 reference voice files (place .wav files in voices/ directory for voice cloning)
|
||||
# Maps voice_id to reference audio filename - if file doesn't exist, uses default voice
|
||||
STYLETTS2_VOICES = {
|
||||
# Male voices
|
||||
"VR6AewLTigWG4xSOukaG": "tony.wav", # Tony
|
||||
"TxGEqnHWrfWFTfGW9XjX": "rick.wav", # Rick
|
||||
"pNInz6obpgDQGcFmaJgB": "dennis.wav", # Dennis
|
||||
"ODq5zmih8GrVes37Dizd": "earl.wav", # Earl
|
||||
"IKne3meq5aSn9XLyUdCD": "marcus.wav", # Marcus
|
||||
# Female voices
|
||||
"jBpfuIE2acCO8z3wKNLl": "jasmine.wav", # Jasmine
|
||||
"EXAVITQu4vr4xnSDxMaL": "megan.wav", # Megan
|
||||
"21m00Tcm4TlvDq8ikWAM": "tanya.wav", # Tanya
|
||||
"XB0fDUnXU5powFXDhCwa": "carla.wav", # Carla
|
||||
"pFZP5JQG7iQjIQuC4Bku": "brenda.wav", # Brenda
|
||||
}
|
||||
|
||||
# F5-TTS reference voices (same files as StyleTTS2, reuses voices/ directory)
|
||||
# Requires: mono, 24kHz, 5-10 seconds, with transcript in .txt file
|
||||
F5TTS_VOICES = STYLETTS2_VOICES.copy()
|
||||
|
||||
# ChatTTS speaker seeds - different seeds produce different voices
|
||||
# These are used to generate consistent speaker embeddings
|
||||
CHATTTS_SEEDS = {
|
||||
# Male voices
|
||||
"VR6AewLTigWG4xSOukaG": 42, # Tony - deep voice
|
||||
"TxGEqnHWrfWFTfGW9XjX": 123, # Rick
|
||||
"pNInz6obpgDQGcFmaJgB": 456, # Dennis
|
||||
"ODq5zmih8GrVes37Dizd": 789, # Earl
|
||||
"IKne3meq5aSn9XLyUdCD": 1011, # Marcus
|
||||
# Female voices
|
||||
"jBpfuIE2acCO8z3wKNLl": 2024, # Jasmine
|
||||
"EXAVITQu4vr4xnSDxMaL": 3033, # Megan
|
||||
"21m00Tcm4TlvDq8ikWAM": 4042, # Tanya
|
||||
"XB0fDUnXU5powFXDhCwa": 5051, # Carla
|
||||
"pFZP5JQG7iQjIQuC4Bku": 6060, # Brenda
|
||||
}
|
||||
DEFAULT_CHATTTS_SEED = 42
|
||||
|
||||
|
||||
def get_elevenlabs_client():
    """Lazily construct and cache the shared ElevenLabs client singleton."""
    global _elevenlabs_client
    if _elevenlabs_client is not None:
        return _elevenlabs_client
    # Deferred import: only pay for the SDK when ElevenLabs is actually used.
    from elevenlabs.client import ElevenLabs
    _elevenlabs_client = ElevenLabs(api_key=settings.elevenlabs_api_key)
    return _elevenlabs_client
|
||||
|
||||
|
||||
def get_vits_tts():
    """Get or create VITS VCTK TTS instance"""
    global _vits_tts
    if _vits_tts is None:
        # Deferred import: the Coqui TTS package is heavy, so only load it
        # when the VITS provider is actually selected.
        from TTS.api import TTS
        _vits_tts = TTS("tts_models/en/vctk/vits")
    return _vits_tts
|
||||
|
||||
|
||||
def get_kokoro_model():
    """Lazily load and cache the Kokoro MLX model (mlx-community/Kokoro-82M-bf16)."""
    global _kokoro_model
    if _kokoro_model is not None:
        return _kokoro_model
    # Deferred import keeps mlx_audio off the critical startup path.
    from mlx_audio.tts.utils import load_model
    _kokoro_model = load_model(model_path='mlx-community/Kokoro-82M-bf16')
    print("Kokoro MLX model loaded")
    return _kokoro_model
|
||||
|
||||
|
||||
def ensure_bark_loaded():
    """Ensure Bark models are loaded on GPU"""
    global _bark_loaded
    if not _bark_loaded:
        # NOTE(review): the env vars are deliberately set before the bark
        # import — presumably bark reads them at import/preload time; keep
        # this ordering.
        os.environ['SUNO_USE_SMALL_MODELS'] = '1'

        # Force Bark to use MPS (Apple Silicon GPU)
        if torch.backends.mps.is_available():
            os.environ['SUNO_OFFLOAD_CPU'] = '0'
            os.environ['SUNO_ENABLE_MPS'] = '1'

        from bark import preload_models
        preload_models()
        _bark_loaded = True
        print(f"Bark loaded on device: {'MPS' if torch.backends.mps.is_available() else 'CPU'}")
|
||||
|
||||
|
||||
def get_styletts2_model():
    """Get or create StyleTTS2 model"""
    global _styletts2_model
    if _styletts2_model is None:
        # Deferred import: styletts2 pulls in heavy torch dependencies.
        from styletts2 import tts
        _styletts2_model = tts.StyleTTS2()
        print("StyleTTS2 model loaded")
    return _styletts2_model
|
||||
|
||||
|
||||
def get_f5tts_generate():
    """Get F5-TTS generate function (lazy load)"""
    global _f5tts_model
    if _f5tts_model is None:
        # Disable tqdm progress bars to avoid BrokenPipeError in server context
        # NOTE(review): these env vars are presumably read when f5_tts_mlx /
        # huggingface_hub import — keep them ahead of the import below.
        import os
        os.environ['HF_HUB_DISABLE_PROGRESS_BARS'] = '1'
        os.environ['TQDM_DISABLE'] = '1'

        from f5_tts_mlx.generate import generate
        # Cached value is the generate *function*, not a model object.
        _f5tts_model = generate
        print("F5-TTS MLX loaded")
    return _f5tts_model
|
||||
|
||||
|
||||
def get_chattts_model():
    """Lazily construct, load, and cache the global ChatTTS engine."""
    global _chattts_model
    if _chattts_model is not None:
        return _chattts_model
    import ChatTTS
    engine = ChatTTS.Chat()
    engine.load(compile=False)
    print("ChatTTS model loaded")
    _chattts_model = engine
    return _chattts_model
|
||||
|
||||
|
||||
def get_chattts_speaker(voice_id: str):
    """Get or create a consistent speaker embedding for a voice"""
    global _chattts_speakers
    if voice_id not in _chattts_speakers:
        chat = get_chattts_model()
        seed = CHATTTS_SEEDS.get(voice_id, DEFAULT_CHATTTS_SEED)
        # Set seed for reproducible speaker
        # NOTE: this mutates torch's *global* RNG state as a side effect.
        torch.manual_seed(seed)
        _chattts_speakers[voice_id] = chat.sample_random_speaker()
        print(f"[ChatTTS] Created speaker for voice {voice_id} with seed {seed}")
    return _chattts_speakers[voice_id]
|
||||
|
||||
|
||||
def phone_filter(audio: np.ndarray, sample_rate: int = 24000, quality: str = "normal") -> np.ndarray:
    """Apply phone filter with variable quality.

    Band-passes the audio to a telephone-like range, adds soft-clip
    distortion, and — for the lossier presets — gated bursts of static.

    Args:
        audio: Mono audio samples (flattened to 1-D).
        sample_rate: Sample rate of `audio` in Hz.
        quality: "good", "normal", "bad", or "terrible"; unknown values
            fall back to "normal".

    Returns:
        Filtered float32 audio, same length as the input.
    """
    audio = audio.flatten()

    # Empty input: nothing to filter (filtfilt would raise on it).
    if audio.size == 0:
        return audio.astype(np.float32)

    # (low cutoff Hz, high cutoff Hz, distortion drive, static noise level)
    presets = {
        "good": (200, 7000, 1.0, 0.0),
        "normal": (300, 3400, 1.5, 0.005),
        "bad": (400, 2800, 2.0, 0.015),
        "terrible": (500, 2200, 2.5, 0.03),
    }

    low_hz, high_hz, distortion, noise = presets.get(quality, presets["normal"])

    # Normalize cutoffs to Nyquist for butter()
    low = low_hz / (sample_rate / 2)
    high = high_hz / (sample_rate / 2)
    b, a = butter(4, [low, high], btype='band')
    # BUGFIX: filtfilt's default padlen is 3 * max(len(a), len(b)) (= 27 for
    # this order-4 band filter) and raises ValueError when the clip is that
    # short or shorter. Cap it so short TTS fragments still pass through;
    # for normal-length audio this is identical to the default.
    pad = min(3 * max(len(a), len(b)), audio.size - 1)
    filtered = filtfilt(b, a, audio, padlen=pad)

    # Soft-clip distortion; tanh bounds output to +/-0.8
    filtered = np.tanh(filtered * distortion) * 0.8

    if noise > 0:
        # Gated static: random noise, enabled only in ~30% of 1000-sample
        # windows to mimic intermittent line crackle.
        static = np.random.normal(0, noise, len(filtered)).astype(np.float32)
        static_envelope = np.random.random(len(filtered) // 1000 + 1)
        static_envelope = np.repeat(static_envelope, 1000)[:len(filtered)]
        static *= (static_envelope > 0.7).astype(np.float32)
        filtered = filtered + static

    return filtered.astype(np.float32)
|
||||
|
||||
|
||||
async def generate_speech_elevenlabs(text: str, voice_id: str) -> tuple[np.ndarray, int]:
    """Generate speech using ElevenLabs"""
    client = get_elevenlabs_client()

    # convert() yields chunks of raw PCM (16-bit, 24kHz per output_format)
    audio_gen = client.text_to_speech.convert(
        voice_id=voice_id,
        text=text,
        model_id="eleven_v3",
        output_format="pcm_24000"
    )

    # Drain the chunk generator, then scale int16 -> float32 in [-1, 1)
    audio_bytes = b"".join(audio_gen)
    audio = np.frombuffer(audio_bytes, dtype=np.int16).astype(np.float32) / 32768.0

    return audio, 24000
|
||||
|
||||
|
||||
async def generate_speech_kokoro(text: str, voice_id: str) -> tuple[np.ndarray, int]:
    """Generate speech using MLX Kokoro (fast, good quality, Apple Silicon optimized)"""
    import librosa
    from mlx_audio.tts.generate import generate_audio

    model = get_kokoro_model()
    # Map the ElevenLabs-style voice_id onto a Kokoro voice + per-voice speed
    voice = KOKORO_VOICES.get(voice_id, DEFAULT_KOKORO_VOICE)
    speed = KOKORO_SPEEDS.get(voice_id, DEFAULT_KOKORO_SPEED)

    # Preprocess text for better prosody
    text = preprocess_text_for_kokoro(text)

    # Determine lang_code from voice prefix (a=American, b=British)
    lang_code = 'b' if voice.startswith('b') else 'a'

    # generate_audio writes wav files into output_path rather than returning
    # samples, so render into a throwaway directory and read the result back.
    with tempfile.TemporaryDirectory() as tmpdir:
        generate_audio(
            text,
            model=model,
            voice=voice,
            speed=speed,
            lang_code=lang_code,
            output_path=tmpdir,
            file_prefix='tts',
            verbose=False
        )

        # Read the generated audio file
        # NOTE(review): assumes mlx_audio names the first output
        # '<prefix>_000.wav' — confirm against the installed version.
        audio_file = Path(tmpdir) / 'tts_000.wav'
        if not audio_file.exists():
            raise RuntimeError("Kokoro failed to generate audio")

        audio, sr = librosa.load(str(audio_file), sr=None, mono=True)

        # Resample to 24kHz if needed
        if sr != 24000:
            audio = librosa.resample(audio, orig_sr=sr, target_sr=24000)

        return audio.astype(np.float32), 24000
|
||||
|
||||
|
||||
async def generate_speech_vits(text: str, voice_id: str) -> tuple[np.ndarray, int]:
    """Generate speech using VITS VCTK (fast, multiple speakers)"""
    import librosa

    engine = get_vits_tts()
    speaker = VITS_SPEAKERS.get(voice_id, DEFAULT_VITS_SPEAKER)

    # Coqui writes to a file path, so render into a temp wav and read it back.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as handle:
        wav_path = handle.name

    try:
        engine.tts_to_file(text=text, file_path=wav_path, speaker=speaker)
        samples, sr = librosa.load(wav_path, sr=None, mono=True)

        # Normalize to the project-wide 24kHz rate.
        if sr != 24000:
            samples = librosa.resample(samples, orig_sr=sr, target_sr=24000)

        return samples.astype(np.float32), 24000
    finally:
        Path(wav_path).unlink(missing_ok=True)
|
||||
|
||||
|
||||
async def generate_speech_bark(text: str, voice_id: str) -> tuple[np.ndarray, int]:
    """Generate speech using Bark (slow but expressive, supports emotes like [laughs])"""
    import librosa
    from bark import SAMPLE_RATE, generate_audio

    ensure_bark_loaded()

    # Generate audio with Bark
    # NOTE(review): voice_id is unused — Bark renders with its default
    # speaker here; confirm whether a history_prompt mapping was intended.
    audio = generate_audio(text)

    # Normalize to prevent clipping (Bark can exceed [-1, 1])
    max_val = np.abs(audio).max()
    if max_val > 0.95:
        audio = audio * (0.95 / max_val)

    # Resample to 24kHz if needed
    if SAMPLE_RATE != 24000:
        audio = librosa.resample(audio, orig_sr=SAMPLE_RATE, target_sr=24000)

    return audio.astype(np.float32), 24000
|
||||
|
||||
|
||||
async def generate_speech_styletts2(text: str, voice_id: str) -> tuple[np.ndarray, int]:
    """Generate speech using StyleTTS2 (high quality, supports voice cloning)"""
    import librosa

    model = get_styletts2_model()

    # Check for reference voice file
    voice_file = STYLETTS2_VOICES.get(voice_id)
    voice_path = None
    if voice_file:
        voice_path = settings.base_dir / "voices" / voice_file
        if not voice_path.exists():
            voice_path = None  # Use default voice if file doesn't exist

    # Generate audio
    # (tempfile with delete=False: we only need the path; the file itself is
    # written by model.inference and removed in the finally below)
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        tmp_path = tmp.name

    try:
        if voice_path:
            print(f"[StyleTTS2] Using voice clone: {voice_path}")
            audio = model.inference(
                text,
                target_voice_path=str(voice_path),
                output_wav_file=tmp_path,
                output_sample_rate=24000,
                diffusion_steps=5,  # Balance quality/speed
                alpha=0.3,  # More voice-like than text-like
                beta=0.7,  # Good prosody
            )
        else:
            print("[StyleTTS2] Using default voice")
            audio = model.inference(
                text,
                output_wav_file=tmp_path,
                output_sample_rate=24000,
                diffusion_steps=5,
            )

        # Load the generated audio
        # (the inference() return value above is discarded — we read back the
        # wav it wrote instead)
        audio, sr = librosa.load(tmp_path, sr=None, mono=True)

        if sr != 24000:
            audio = librosa.resample(audio, orig_sr=sr, target_sr=24000)

        return audio.astype(np.float32), 24000
    finally:
        Path(tmp_path).unlink(missing_ok=True)
|
||||
|
||||
|
||||
async def generate_speech_f5tts(text: str, voice_id: str) -> tuple[np.ndarray, int]:
    """Generate speech using F5-TTS MLX (very natural, supports voice cloning)"""
    import librosa

    generate = get_f5tts_generate()

    # Check for reference voice file and transcript
    # (cloning needs both the reference .wav AND its .txt transcript)
    voice_file = F5TTS_VOICES.get(voice_id)
    ref_audio_path = None
    ref_text = None

    if voice_file:
        voice_path = settings.base_dir / "voices" / voice_file
        txt_path = voice_path.with_suffix('.txt')

        if voice_path.exists() and txt_path.exists():
            ref_audio_path = str(voice_path)
            ref_text = txt_path.read_text().strip()
            print(f"[F5-TTS] Using voice clone: {voice_path}")

    if not ref_audio_path:
        print("[F5-TTS] Using default voice")

    # Generate audio to temp file
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        tmp_path = tmp.name

    try:
        generate(
            generation_text=text,
            ref_audio_path=ref_audio_path,
            ref_audio_text=ref_text,
            steps=8,  # low step count trades a little quality for latency
            speed=1.0,
            output_path=tmp_path,
        )

        # Load the generated audio
        audio, sr = librosa.load(tmp_path, sr=None, mono=True)

        # Resample to 24kHz if needed
        if sr != 24000:
            audio = librosa.resample(audio, orig_sr=sr, target_sr=24000)

        return audio.astype(np.float32), 24000
    finally:
        Path(tmp_path).unlink(missing_ok=True)
|
||||
|
||||
|
||||
async def generate_speech_chattts(text: str, voice_id: str) -> tuple[np.ndarray, int]:
    """Generate speech using ChatTTS (natural conversational speech, multiple speakers)"""
    import ChatTTS

    chat = get_chattts_model()

    # Ensure text is not empty and has reasonable content
    text = text.strip()
    if not text:
        text = "Hello."

    print(f"[ChatTTS] Generating speech for: {text[:50]}...")

    # Get consistent speaker for this voice
    # NOTE(review): no spk_emb is passed to infer(); speaker identity relies
    # on seeding torch's global RNG just before inference — confirm this
    # yields stable voices with the installed ChatTTS version.
    seed = CHATTTS_SEEDS.get(voice_id, DEFAULT_CHATTTS_SEED)
    torch.manual_seed(seed)

    # Configure inference parameters
    params_infer_code = ChatTTS.Chat.InferCodeParams(
        temperature=0.3,
        top_P=0.7,
        top_K=20,
    )

    # Generate audio (skip text refinement to avoid narrow() error with this version)
    wavs = chat.infer(
        [text],
        params_infer_code=params_infer_code,
        skip_refine_text=True,
    )

    if wavs is None or len(wavs) == 0:
        raise RuntimeError("ChatTTS failed to generate audio")

    audio = wavs[0]

    # Handle different output shapes
    if audio.ndim > 1:
        audio = audio.squeeze()

    # Normalize
    max_val = np.abs(audio).max()
    if max_val > 0.95:
        audio = audio * (0.95 / max_val)

    # NOTE(review): assumes ChatTTS outputs 24kHz audio — confirm.
    return audio.astype(np.float32), 24000
|
||||
|
||||
|
||||
async def generate_speech_inworld(text: str, voice_id: str) -> tuple[np.ndarray, int]:
    """Generate speech using Inworld TTS API (high quality, natural voices)

    Maps the ElevenLabs-style voice_id to an Inworld voice name, requests
    48kHz LINEAR16 audio over HTTPS, then resamples to the project-wide 24kHz.
    Raises RuntimeError when the API key is missing or no audio is returned.
    """
    import httpx
    import base64
    import librosa

    voice = INWORLD_VOICES.get(voice_id, DEFAULT_INWORLD_VOICE)

    api_key = settings.inworld_api_key
    if not api_key:
        raise RuntimeError("INWORLD_API_KEY not set in environment")

    print(f"[Inworld TTS] Voice: {voice}, Text: {text[:50]}...")

    url = "https://api.inworld.ai/tts/v1/voice"
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Basic {api_key}",
    }
    payload = {
        "text": text,
        "voice_id": voice,
        "model_id": "inworld-tts-1.5-mini",
        "audio_config": {
            "encoding": "LINEAR16",
            "sample_rate_hertz": 48000,
        },
    }

    async with httpx.AsyncClient(timeout=60.0) as client:
        response = await client.post(url, json=payload, headers=headers)
        response.raise_for_status()
        data = response.json()

    # Decode base64 audio
    audio_b64 = data.get("audioContent")
    if not audio_b64:
        raise RuntimeError("Inworld TTS returned no audio content")

    audio_bytes = base64.b64decode(audio_b64)

    # Parse audio using soundfile (handles WAV, MP3, etc.)
    import soundfile as sf
    import io

    # soundfile can read WAV, FLAC, OGG, and with ffmpeg: MP3
    # MP3 files start with ID3 tag or 0xff sync bytes
    try:
        audio, sr = sf.read(io.BytesIO(audio_bytes))
    except Exception as e:
        print(f"[Inworld TTS] soundfile failed: {e}, trying raw PCM")
        # Fallback to raw PCM
        # (drop a trailing odd byte so the int16 view lines up)
        if len(audio_bytes) % 2 != 0:
            audio_bytes = audio_bytes[:-1]
        audio = np.frombuffer(audio_bytes, dtype=np.int16).astype(np.float32) / 32768.0
        sr = 48000  # matches sample_rate_hertz requested above

    # Resample to 24kHz to match other providers
    if sr != 24000:
        audio = librosa.resample(audio, orig_sr=sr, target_sr=24000)

    return audio.astype(np.float32), 24000
|
||||
|
||||
|
||||
async def generate_speech(
    text: str,
    voice_id: str,
    phone_quality: str = "normal",
    apply_filter: bool = True
) -> bytes:
    """
    Generate speech from text.

    Args:
        text: Text to speak
        voice_id: ElevenLabs voice ID (mapped to local voice if using local TTS)
        phone_quality: Quality of phone filter ("none" to disable)
        apply_filter: Whether to apply phone filter

    Returns:
        Raw PCM audio bytes (16-bit signed int, 24kHz)
    """
    # Provider name -> generator coroutine (all return (float32 audio, rate))
    generators = {
        "kokoro": generate_speech_kokoro,
        "f5tts": generate_speech_f5tts,
        "inworld": generate_speech_inworld,
        "chattts": generate_speech_chattts,
        "styletts2": generate_speech_styletts2,
        "bark": generate_speech_bark,
        "vits": generate_speech_vits,
        "elevenlabs": generate_speech_elevenlabs,
    }

    provider = settings.tts_provider
    print(f"[TTS] Provider: {provider}, Text: {text[:50]}...")

    generator = generators.get(provider)
    if generator is None:
        raise ValueError(f"Unknown TTS provider: {provider}")

    audio, sample_rate = await generator(text, voice_id)

    # Apply phone filter if requested
    # Skip filter for Bark - it already has rough audio quality
    if apply_filter and phone_quality not in ("none", "studio") and provider != "bark":
        audio = phone_filter(audio, sample_rate, phone_quality)

    # Convert float [-1, 1] samples to 16-bit PCM bytes
    audio_int16 = (audio * 32768).clip(-32768, 32767).astype(np.int16)
    return audio_int16.tobytes()
|
||||
|
||||
|
||||
# Voice IDs for cohost and announcer
|
||||
COHOST_VOICE_ID = "nPczCjzI2devNBz1zQrb"
|
||||
ANNOUNCER_VOICE_ID = "ErXwobaYiN019PkySvjV"
|
||||
|
||||
|
||||
async def generate_cohost_speech(text: str) -> bytes:
    """Generate speech for cohost Bobby (no phone filter)"""
    # Cohost is "in studio", so the phone-line degradation is skipped.
    return await generate_speech(text, COHOST_VOICE_ID, apply_filter=False)
|
||||
|
||||
|
||||
async def generate_announcer_speech(text: str) -> bytes:
    """Generate speech for announcer (no phone filter)"""
    # Announcer reads are full-bandwidth, so skip the phone filter.
    return await generate_speech(text, ANNOUNCER_VOICE_ID, apply_filter=False)
|
||||
200
backend/services/voices.py
Normal file
200
backend/services/voices.py
Normal file
@@ -0,0 +1,200 @@
|
||||
"""Voice configuration and TTS provider management"""
|
||||
|
||||
from dataclasses import dataclass
|
||||
from typing import Optional
|
||||
from enum import Enum
|
||||
|
||||
|
||||
class TTSProvider(str, Enum):
    """Supported TTS backends (str-valued so the enum serializes cleanly)."""
    ELEVENLABS = "elevenlabs"
    EDGE = "edge"  # Microsoft Edge TTS (free)
    PIPER = "piper"  # Local Piper via sherpa-onnx (free, fast)
|
||||
|
||||
|
||||
@dataclass
class Voice:
    """Voice configuration"""
    id: str  # App-internal voice identifier (e.g. "el_tony")
    name: str  # Human-readable display name
    provider: TTSProvider  # Which TTS backend serves this voice
    provider_voice_id: str  # The actual ID used by the provider
    description: str = ""  # Free-text blurb about the voice/persona
    language: str = "en"  # Language code
    gender: str = "neutral"  # "male" / "female" / "neutral"
|
||||
|
||||
# ElevenLabs voices (paid API).
# The third argument of each entry is the ElevenLabs voice-library ID that
# the provider expects; the app-internal ID is the "el_*" string.
ELEVENLABS_VOICES = [
    Voice("el_tony", "Tony (ElevenLabs)", TTSProvider.ELEVENLABS, "IKne3meq5aSn9XLyUdCD",
          "Male, New York accent, expressive", "en", "male"),
    Voice("el_jasmine", "Jasmine (ElevenLabs)", TTSProvider.ELEVENLABS, "FGY2WhTYpPnrIDTdsKH5",
          "Female, confident, direct", "en", "female"),
    Voice("el_rick", "Rick (ElevenLabs)", TTSProvider.ELEVENLABS, "JBFqnCBsd6RMkjVDRZzb",
          "Male, Texas accent, older", "en", "male"),
    Voice("el_megan", "Megan (ElevenLabs)", TTSProvider.ELEVENLABS, "XrExE9yKIg1WjnnlVkGX",
          "Female, young, casual", "en", "female"),
    Voice("el_dennis", "Dennis (ElevenLabs)", TTSProvider.ELEVENLABS, "cjVigY5qzO86Huf0OWal",
          "Male, middle-aged, anxious", "en", "male"),
    Voice("el_tanya", "Tanya (ElevenLabs)", TTSProvider.ELEVENLABS, "N2lVS1w4EtoT3dr4eOWO",
          "Female, Miami, sassy", "en", "female"),
    Voice("el_earl", "Earl (ElevenLabs)", TTSProvider.ELEVENLABS, "EXAVITQu4vr4xnSDxMaL",
          "Male, elderly, Southern", "en", "male"),
    Voice("el_carla", "Carla (ElevenLabs)", TTSProvider.ELEVENLABS, "CwhRBWXzGAHq8TQ4Fs17",
          "Female, Jersey, sharp", "en", "female"),
    Voice("el_marcus", "Marcus (ElevenLabs)", TTSProvider.ELEVENLABS, "bIHbv24MWmeRgasZH58o",
          "Male, young, urban", "en", "male"),
    Voice("el_brenda", "Brenda (ElevenLabs)", TTSProvider.ELEVENLABS, "Xb7hH8MSUJpSbSDYk0k2",
          "Female, middle-aged, worried", "en", "female"),
    Voice("el_jake", "Jake (ElevenLabs)", TTSProvider.ELEVENLABS, "SOYHLrjzK2X1ezoPC6cr",
          "Male, Boston, insecure", "en", "male"),
    Voice("el_diane", "Diane (ElevenLabs)", TTSProvider.ELEVENLABS, "cgSgspJ2msm6clMCkdW9",
          "Female, mature, conflicted", "en", "female"),
    # Special roles (not regular callers):
    Voice("el_bobby", "Bobby (ElevenLabs)", TTSProvider.ELEVENLABS, "nPczCjzI2devNBz1zQrb",
          "Male, sidekick, wisecracking", "en", "male"),
    Voice("el_announcer", "Announcer (ElevenLabs)", TTSProvider.ELEVENLABS, "ErXwobaYiN019PkySvjV",
          "Male, radio announcer", "en", "male"),
]
|
||||
|
||||
# Edge TTS voices (Microsoft, free).
# provider_voice_id is the Microsoft neural voice short name
# (e.g. "en-US-JennyNeural"), grouped below by locale.
EDGE_VOICES = [
    # US voices
    Voice("edge_jenny", "Jenny (Edge)", TTSProvider.EDGE, "en-US-JennyNeural",
          "Female, American, friendly", "en", "female"),
    Voice("edge_guy", "Guy (Edge)", TTSProvider.EDGE, "en-US-GuyNeural",
          "Male, American, casual", "en", "male"),
    Voice("edge_aria", "Aria (Edge)", TTSProvider.EDGE, "en-US-AriaNeural",
          "Female, American, professional", "en", "female"),
    Voice("edge_davis", "Davis (Edge)", TTSProvider.EDGE, "en-US-DavisNeural",
          "Male, American, calm", "en", "male"),
    Voice("edge_amber", "Amber (Edge)", TTSProvider.EDGE, "en-US-AmberNeural",
          "Female, American, warm", "en", "female"),
    Voice("edge_andrew", "Andrew (Edge)", TTSProvider.EDGE, "en-US-AndrewNeural",
          "Male, American, confident", "en", "male"),
    Voice("edge_ashley", "Ashley (Edge)", TTSProvider.EDGE, "en-US-AshleyNeural",
          "Female, American, cheerful", "en", "female"),
    Voice("edge_brian", "Brian (Edge)", TTSProvider.EDGE, "en-US-BrianNeural",
          "Male, American, narrator", "en", "male"),
    Voice("edge_christopher", "Christopher (Edge)", TTSProvider.EDGE, "en-US-ChristopherNeural",
          "Male, American, reliable", "en", "male"),
    Voice("edge_cora", "Cora (Edge)", TTSProvider.EDGE, "en-US-CoraNeural",
          "Female, American, older", "en", "female"),
    Voice("edge_elizabeth", "Elizabeth (Edge)", TTSProvider.EDGE, "en-US-ElizabethNeural",
          "Female, American, elegant", "en", "female"),
    Voice("edge_eric", "Eric (Edge)", TTSProvider.EDGE, "en-US-EricNeural",
          "Male, American, friendly", "en", "male"),
    Voice("edge_jacob", "Jacob (Edge)", TTSProvider.EDGE, "en-US-JacobNeural",
          "Male, American, young", "en", "male"),
    Voice("edge_michelle", "Michelle (Edge)", TTSProvider.EDGE, "en-US-MichelleNeural",
          "Female, American, clear", "en", "female"),
    Voice("edge_monica", "Monica (Edge)", TTSProvider.EDGE, "en-US-MonicaNeural",
          "Female, American, expressive", "en", "female"),
    Voice("edge_roger", "Roger (Edge)", TTSProvider.EDGE, "en-US-RogerNeural",
          "Male, American, mature", "en", "male"),
    Voice("edge_steffan", "Steffan (Edge)", TTSProvider.EDGE, "en-US-SteffanNeural",
          "Male, American, formal", "en", "male"),
    Voice("edge_tony", "Tony (Edge)", TTSProvider.EDGE, "en-US-TonyNeural",
          "Male, American, conversational", "en", "male"),
    # UK voices
    Voice("edge_sonia", "Sonia (Edge UK)", TTSProvider.EDGE, "en-GB-SoniaNeural",
          "Female, British, professional", "en", "female"),
    Voice("edge_ryan", "Ryan (Edge UK)", TTSProvider.EDGE, "en-GB-RyanNeural",
          "Male, British, clear", "en", "male"),
    Voice("edge_libby", "Libby (Edge UK)", TTSProvider.EDGE, "en-GB-LibbyNeural",
          "Female, British, warm", "en", "female"),
    Voice("edge_thomas", "Thomas (Edge UK)", TTSProvider.EDGE, "en-GB-ThomasNeural",
          "Male, British, friendly", "en", "male"),
    # Australian voices
    Voice("edge_natasha", "Natasha (Edge AU)", TTSProvider.EDGE, "en-AU-NatashaNeural",
          "Female, Australian, friendly", "en", "female"),
    Voice("edge_william", "William (Edge AU)", TTSProvider.EDGE, "en-AU-WilliamNeural",
          "Male, Australian, casual", "en", "male"),
]
|
||||
|
||||
# Piper voices (local, via sherpa-onnx).
# provider_voice_id is the Piper model short name; the corresponding model
# files must be available locally for synthesis to work.
PIPER_VOICES = [
    Voice("piper_amy", "Amy (Piper)", TTSProvider.PIPER, "amy",
          "Female, American, clear", "en", "female"),
    Voice("piper_joe", "Joe (Piper)", TTSProvider.PIPER, "joe",
          "Male, American, natural", "en", "male"),
    Voice("piper_lessac", "Lessac (Piper)", TTSProvider.PIPER, "lessac",
          "Female, American, expressive", "en", "female"),
    Voice("piper_alan", "Alan (Piper)", TTSProvider.PIPER, "alan",
          "Male, British, clear", "en", "male"),
]
|
||||
|
||||
# All voices combined, across every provider, in catalog order.
ALL_VOICES = ELEVENLABS_VOICES + EDGE_VOICES + PIPER_VOICES

# Voice lookup by app-internal ID (Voice.id) — assumes IDs are unique
# across the provider lists above.
VOICES_BY_ID = {v.id: v for v in ALL_VOICES}
|
||||
|
||||
# Default voice assignments for callers (maps caller key to voice ID).
# Keys "1"-"0", "-", "=" are keyboard keys for the twelve regular callers;
# "bobby" and "announcer" are the special show roles.
DEFAULT_CALLER_VOICES = {
    "1": "el_tony",     # Tony from Staten Island
    "2": "el_jasmine",  # Jasmine from Atlanta
    "3": "el_rick",     # Rick from Texas
    "4": "el_megan",    # Megan from Portland
    "5": "el_dennis",   # Dennis from Long Island
    "6": "el_tanya",    # Tanya from Miami
    "7": "el_earl",     # Earl from Tennessee
    "8": "el_carla",    # Carla from Jersey
    "9": "el_marcus",   # Marcus from Detroit
    "0": "el_brenda",   # Brenda from Phoenix
    "-": "el_jake",     # Jake from Boston
    "=": "el_diane",    # Diane from Chicago
    "bobby": "el_bobby",
    "announcer": "el_announcer",
}
|
||||
|
||||
|
||||
class VoiceManager:
    """Manages voice assignments and TTS provider selection.

    Holds the runtime-mutable mapping from caller keys (keyboard keys plus
    the "bobby"/"announcer" roles) to voice IDs, and exposes read helpers
    over the static voice catalog (ALL_VOICES / VOICES_BY_ID).
    """

    def __init__(self) -> None:
        # Current voice assignments (can be modified at runtime).
        # Copy so runtime edits never mutate the module-level defaults.
        self.caller_voices: dict[str, str] = DEFAULT_CALLER_VOICES.copy()

    def get_voice(self, voice_id: str) -> Optional[Voice]:
        """Return the catalog voice with *voice_id*, or None if unknown."""
        return VOICES_BY_ID.get(voice_id)

    def get_caller_voice(self, caller_key: str) -> Voice:
        """Return the voice assigned to a caller.

        Falls back to "el_tony" for unassigned keys, and to the first
        ElevenLabs voice if the stored ID is no longer in the catalog,
        so callers always get a usable Voice.
        """
        voice_id = self.caller_voices.get(caller_key, "el_tony")
        return VOICES_BY_ID.get(voice_id, ELEVENLABS_VOICES[0])

    def set_caller_voice(self, caller_key: str, voice_id: str) -> None:
        """Assign *voice_id* to *caller_key*.

        Unknown voice IDs are silently ignored (best-effort), so a stale
        client cannot corrupt the assignment table.
        """
        if voice_id in VOICES_BY_ID:
            self.caller_voices[caller_key] = voice_id

    def get_all_voices(self) -> list[dict]:
        """Return every available voice as a JSON-friendly dict for the API."""
        return [
            {
                "id": v.id,
                "name": v.name,
                "provider": v.provider.value,
                "description": v.description,
                "gender": v.gender,
            }
            for v in ALL_VOICES
        ]

    def get_voices_by_provider(self, provider: TTSProvider) -> list[Voice]:
        """Return all catalog voices rendered by *provider*."""
        return [v for v in ALL_VOICES if v.provider == provider]

    def get_caller_voice_assignments(self) -> dict[str, str]:
        """Return a copy of the current caller -> voice-ID assignments."""
        return self.caller_voices.copy()

    def set_caller_voice_assignments(self, assignments: dict[str, str]) -> None:
        """Apply multiple caller voice assignments at once.

        Delegates to set_caller_voice so the "skip unknown voice IDs"
        validation lives in exactly one place.
        """
        for caller_key, voice_id in assignments.items():
            self.set_caller_voice(caller_key, voice_id)
|
||||
|
||||
|
||||
# Global instance — module-level singleton; all importers of this module
# share this one VoiceManager and therefore the same caller assignments.
voice_manager = VoiceManager()
|
||||
Reference in New Issue
Block a user