# ai-podcast/backend/services/edge_tts_service.py

"""Edge TTS service - free Microsoft TTS API"""

import asyncio
import io
import numpy as np
from typing import Optional

try:
    import edge_tts
    EDGE_TTS_AVAILABLE = True
except ImportError:
    EDGE_TTS_AVAILABLE = False


class EdgeTTSService:
    """TTS using Microsoft Edge's free API.

    Speech is fetched from Edge TTS as MP3 and converted to raw PCM
    (16-bit signed, mono, ``self.sample_rate`` Hz) using pydub when it is
    importable, otherwise by piping the data through an ``ffmpeg`` subprocess.
    """

    def __init__(self):
        # Edge TTS outputs 24kHz; this is also the rate requested from the
        # MP3 -> PCM conversion so the returned samples match it.
        self.sample_rate = 24000

    def is_available(self) -> bool:
        """Return True when the optional ``edge_tts`` package was imported."""
        return EDGE_TTS_AVAILABLE

    async def generate_speech(self, text: str, voice: str = "en-US-JennyNeural") -> bytes:
        """Generate speech from text using Edge TTS

        Args:
            text: Text to synthesize
            voice: Edge TTS voice name (e.g., "en-US-JennyNeural")

        Returns:
            Raw PCM audio bytes (16-bit signed int, 24kHz mono)

        Raises:
            RuntimeError: If edge-tts is not installed, if the stream yields
                no audio data, or if the ffmpeg fallback conversion fails.
        """
        if not EDGE_TTS_AVAILABLE:
            raise RuntimeError("edge-tts not installed. Run: pip install edge-tts")

        communicate = edge_tts.Communicate(text, voice)

        # Collect the MP3 chunks and join once at the end; repeated
        # ``bytes +=`` would re-copy the whole buffer for every chunk.
        mp3_chunks = [
            chunk['data']
            async for chunk in communicate.stream()
            if chunk['type'] == 'audio'
        ]
        mp3_data = b''.join(mp3_chunks)

        if not mp3_data:
            raise RuntimeError("No audio generated")

        # Convert MP3 to PCM
        return await self._mp3_to_pcm(mp3_data)

    async def _mp3_to_pcm(self, mp3_data: bytes) -> bytes:
        """Convert MP3 to raw PCM using pydub or ffmpeg.

        The conversion is blocking, so it runs in the event loop's default
        executor instead of on the loop itself.

        Args:
            mp3_data: Complete MP3 payload.

        Returns:
            Raw PCM audio bytes (16-bit signed int, mono, ``self.sample_rate`` Hz).

        Raises:
            RuntimeError: If the ffmpeg fallback exits with a non-zero status.
        """
        # We are inside a coroutine, so a loop is guaranteed to be running.
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(None, self._convert_mp3_to_pcm, mp3_data)

    def _convert_mp3_to_pcm(self, mp3_data: bytes) -> bytes:
        """Blocking MP3 -> PCM conversion; pydub first, ffmpeg as fallback."""
        # Only the import is guarded: a missing pydub selects the ffmpeg
        # fallback, while decode errors from pydub itself still propagate.
        try:
            from pydub import AudioSegment
        except ImportError:
            AudioSegment = None

        if AudioSegment is not None:
            audio = AudioSegment.from_mp3(io.BytesIO(mp3_data))
            # Convert to mono 16-bit at the service sample rate
            audio = (
                audio.set_frame_rate(self.sample_rate)
                .set_channels(1)
                .set_sample_width(2)
            )
            return audio.raw_data

        # Fallback to ffmpeg subprocess: MP3 in on stdin, s16le PCM out on stdout
        import subprocess
        result = subprocess.run(
            [
                'ffmpeg', '-i', 'pipe:0',
                '-f', 's16le',
                '-acodec', 'pcm_s16le',
                '-ar', str(self.sample_rate),
                '-ac', '1',
                'pipe:1'
            ],
            input=mp3_data,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE
        )
        if result.returncode != 0:
            # errors="replace" keeps a non-UTF-8 ffmpeg message from raising
            # UnicodeDecodeError and hiding the actual failure.
            raise RuntimeError(f"ffmpeg failed: {result.stderr.decode(errors='replace')}")
        return result.stdout

    async def list_voices(self, locale_prefix: str = "en-") -> list[dict]:
        """List available Edge TTS voices

        Args:
            locale_prefix: Only voices whose ``Locale`` starts with this
                prefix are returned. Defaults to English ("en-"); pass ""
                to return every voice.

        Returns:
            One dict per voice with the keys "id", "name", "gender" and
            "locale". Empty when edge-tts is not installed.
        """
        if not EDGE_TTS_AVAILABLE:
            return []

        voices = await edge_tts.list_voices()
        return [
            {
                "id": v["ShortName"],
                "name": v["ShortName"].replace("Neural", ""),
                "gender": v["Gender"],
                "locale": v["Locale"],
            }
            for v in voices
            if v["Locale"].startswith(locale_prefix)
        ]


# Global instance: module-level EdgeTTSService created at import time and
# used by is_edge_tts_available() below.
edge_tts_service = EdgeTTSService()


def is_edge_tts_available() -> bool:
    """Report whether the module-level Edge TTS service is usable.

    Delegates to ``edge_tts_service.is_available()``.
    """
    available = edge_tts_service.is_available()
    return available