Initial commit: AI Radio Show web application

- FastAPI backend with multiple TTS providers (Inworld, ElevenLabs, Kokoro, F5-TTS, etc.) - Web frontend with caller management, music, and soundboard - Whisper transcription integration - OpenRouter/Ollama LLM support - Castopod podcast publishing script Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-04 23:11:20 -07:00
commit 029ce6d689
25 changed files with 6817 additions and 0 deletions
@@ -0,0 +1,112 @@
+"""Edge TTS service - free Microsoft TTS API"""
+
+import asyncio
+import io
+import numpy as np
+from typing import Optional
+
+try:
+    import edge_tts
+    EDGE_TTS_AVAILABLE = True
+except ImportError:
+    EDGE_TTS_AVAILABLE = False
+
+
+class EdgeTTSService:
+    """TTS using Microsoft Edge's free API"""
+
+    def __init__(self):
+        self.sample_rate = 24000  # Edge TTS outputs 24kHz
+
+    def is_available(self) -> bool:
+        return EDGE_TTS_AVAILABLE
+
+    async def generate_speech(self, text: str, voice: str = "en-US-JennyNeural") -> bytes:
+        """Generate speech from text using Edge TTS
+
+        Args:
+            text: Text to synthesize
+            voice: Edge TTS voice name (e.g., "en-US-JennyNeural")
+
+        Returns:
+            Raw PCM audio bytes (16-bit signed int, 24kHz mono)
+        """
+        if not EDGE_TTS_AVAILABLE:
+            raise RuntimeError("edge-tts not installed. Run: pip install edge-tts")
+
+        communicate = edge_tts.Communicate(text, voice)
+
+        # Collect MP3 audio data
+        mp3_data = b''
+        async for chunk in communicate.stream():
+            if chunk['type'] == 'audio':
+                mp3_data += chunk['data']
+
+        if not mp3_data:
+            raise RuntimeError("No audio generated")
+
+        # Convert MP3 to PCM
+        pcm_data = await self._mp3_to_pcm(mp3_data)
+        return pcm_data
+
+    async def _mp3_to_pcm(self, mp3_data: bytes) -> bytes:
+        """Convert MP3 to raw PCM using ffmpeg or pydub"""
+        loop = asyncio.get_event_loop()
+
+        def convert():
+            try:
+                # Try pydub first (more reliable)
+                from pydub import AudioSegment
+                audio = AudioSegment.from_mp3(io.BytesIO(mp3_data))
+                # Convert to 24kHz mono 16-bit
+                audio = audio.set_frame_rate(24000).set_channels(1).set_sample_width(2)
+                return audio.raw_data
+            except ImportError:
+                pass
+
+            # Fallback to ffmpeg subprocess
+            import subprocess
+            process = subprocess.Popen(
+                [
+                    'ffmpeg', '-i', 'pipe:0',
+                    '-f', 's16le',
+                    '-acodec', 'pcm_s16le',
+                    '-ar', '24000',
+                    '-ac', '1',
+                    'pipe:1'
+                ],
+                stdin=subprocess.PIPE,
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE
+            )
+            pcm_data, stderr = process.communicate(input=mp3_data)
+            if process.returncode != 0:
+                raise RuntimeError(f"ffmpeg failed: {stderr.decode()}")
+            return pcm_data
+
+        return await loop.run_in_executor(None, convert)
+
+    async def list_voices(self) -> list[dict]:
+        """List available Edge TTS voices"""
+        if not EDGE_TTS_AVAILABLE:
+            return []
+
+        voices = await edge_tts.list_voices()
+        return [
+            {
+                "id": v["ShortName"],
+                "name": v["ShortName"].replace("Neural", ""),
+                "gender": v["Gender"],
+                "locale": v["Locale"],
+            }
+            for v in voices
+            if v["Locale"].startswith("en-")
+        ]
+
+
+# Global instance
+edge_tts_service = EdgeTTSService()
+
+
+def is_edge_tts_available() -> bool:
+    return edge_tts_service.is_available()