- FastAPI backend with multiple TTS providers (Inworld, ElevenLabs, Kokoro, F5-TTS, etc.) - Web frontend with caller management, music, and soundboard - Whisper transcription integration - OpenRouter/Ollama LLM support - Castopod podcast publishing script Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
113 lines
3.3 KiB
Python
113 lines
3.3 KiB
Python
"""Edge TTS service - free Microsoft TTS API"""
|
|
|
|
import asyncio
|
|
import io
|
|
import numpy as np
|
|
from typing import Optional
|
|
|
|
try:
|
|
import edge_tts
|
|
EDGE_TTS_AVAILABLE = True
|
|
except ImportError:
|
|
EDGE_TTS_AVAILABLE = False
|
|
|
|
|
|
class EdgeTTSService:
|
|
"""TTS using Microsoft Edge's free API"""
|
|
|
|
def __init__(self):
|
|
self.sample_rate = 24000 # Edge TTS outputs 24kHz
|
|
|
|
def is_available(self) -> bool:
|
|
return EDGE_TTS_AVAILABLE
|
|
|
|
async def generate_speech(self, text: str, voice: str = "en-US-JennyNeural") -> bytes:
|
|
"""Generate speech from text using Edge TTS
|
|
|
|
Args:
|
|
text: Text to synthesize
|
|
voice: Edge TTS voice name (e.g., "en-US-JennyNeural")
|
|
|
|
Returns:
|
|
Raw PCM audio bytes (16-bit signed int, 24kHz mono)
|
|
"""
|
|
if not EDGE_TTS_AVAILABLE:
|
|
raise RuntimeError("edge-tts not installed. Run: pip install edge-tts")
|
|
|
|
communicate = edge_tts.Communicate(text, voice)
|
|
|
|
# Collect MP3 audio data
|
|
mp3_data = b''
|
|
async for chunk in communicate.stream():
|
|
if chunk['type'] == 'audio':
|
|
mp3_data += chunk['data']
|
|
|
|
if not mp3_data:
|
|
raise RuntimeError("No audio generated")
|
|
|
|
# Convert MP3 to PCM
|
|
pcm_data = await self._mp3_to_pcm(mp3_data)
|
|
return pcm_data
|
|
|
|
async def _mp3_to_pcm(self, mp3_data: bytes) -> bytes:
|
|
"""Convert MP3 to raw PCM using ffmpeg or pydub"""
|
|
loop = asyncio.get_event_loop()
|
|
|
|
def convert():
|
|
try:
|
|
# Try pydub first (more reliable)
|
|
from pydub import AudioSegment
|
|
audio = AudioSegment.from_mp3(io.BytesIO(mp3_data))
|
|
# Convert to 24kHz mono 16-bit
|
|
audio = audio.set_frame_rate(24000).set_channels(1).set_sample_width(2)
|
|
return audio.raw_data
|
|
except ImportError:
|
|
pass
|
|
|
|
# Fallback to ffmpeg subprocess
|
|
import subprocess
|
|
process = subprocess.Popen(
|
|
[
|
|
'ffmpeg', '-i', 'pipe:0',
|
|
'-f', 's16le',
|
|
'-acodec', 'pcm_s16le',
|
|
'-ar', '24000',
|
|
'-ac', '1',
|
|
'pipe:1'
|
|
],
|
|
stdin=subprocess.PIPE,
|
|
stdout=subprocess.PIPE,
|
|
stderr=subprocess.PIPE
|
|
)
|
|
pcm_data, stderr = process.communicate(input=mp3_data)
|
|
if process.returncode != 0:
|
|
raise RuntimeError(f"ffmpeg failed: {stderr.decode()}")
|
|
return pcm_data
|
|
|
|
return await loop.run_in_executor(None, convert)
|
|
|
|
async def list_voices(self) -> list[dict]:
|
|
"""List available Edge TTS voices"""
|
|
if not EDGE_TTS_AVAILABLE:
|
|
return []
|
|
|
|
voices = await edge_tts.list_voices()
|
|
return [
|
|
{
|
|
"id": v["ShortName"],
|
|
"name": v["ShortName"].replace("Neural", ""),
|
|
"gender": v["Gender"],
|
|
"locale": v["Locale"],
|
|
}
|
|
for v in voices
|
|
if v["Locale"].startswith("en-")
|
|
]
|
|
|
|
|
|
# Global instance
|
|
edge_tts_service = EdgeTTSService()
|
|
|
|
|
|
def is_edge_tts_available() -> bool:
|
|
return edge_tts_service.is_available()
|