- FastAPI backend with multiple TTS providers (Inworld, ElevenLabs, Kokoro, F5-TTS, etc.)
- Web frontend with caller management, music, and soundboard
- Whisper transcription integration
- OpenRouter/Ollama LLM support
- Castopod podcast publishing script

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
141 lines
4.7 KiB
Python
141 lines
4.7 KiB
Python
#!/usr/bin/env python3
"""
Simplified Radio Show - for debugging
"""
|
|
|
|
import os
|
|
import sys
|
|
from pathlib import Path
|
|
import numpy as np
|
|
import sounddevice as sd
|
|
import soundfile as sf
|
|
from faster_whisper import WhisperModel
|
|
from scipy.signal import butter, filtfilt
|
|
from dotenv import load_dotenv
|
|
|
|
load_dotenv()
|
|
|
|
# Sample rate (Hz) used for microphone capture and for playback. The TTS
# request in SimpleRadio.speak asks for "pcm_24000" output and plays it back
# at this rate, so the two must stay in sync.
SAMPLE_RATE = 24000

# Available callers, keyed by the command the user types to select them.
# Each value is a tuple of (display name, ElevenLabs voice id, system prompt).
CALLERS = {
    "1": (
        "Big Tony",
        "IKne3meq5aSn9XLyUdCD",
        "You are Big Tony, a loud Italian guy from Staten Island. Swear naturally, be opinionated. Keep it to 2 sentences.",
    ),
    "2": (
        "Drunk Diane",
        "FGY2WhTYpPnrIDTdsKH5",
        "You are Drunk Diane, tipsy woman at a bar. Ramble a bit, be funny. Keep it to 2 sentences.",
    ),
    "3": (
        "Stoner Phil",
        "bIHbv24MWmeRgasZH58o",
        "You are Stoner Phil, super chill stoner dude. Speak slow, be spacey but profound. Keep it to 2 sentences.",
    ),
}
|
|
|
|
def phone_filter(audio):
    """Make ``audio`` sound as if it came over a telephone line.

    The signal is band-limited to 300-3400 Hz with a Butterworth band-pass
    designed by ``butter(4, ...)`` and applied forwards and backwards via
    ``filtfilt``. It is then soft-clipped with ``tanh`` (gain 1.5) and
    scaled by 0.8.

    Args:
        audio: Array-like of samples taken at ``SAMPLE_RATE``. It is
            flattened to one dimension before processing.

    Returns:
        A 1-D ``float32`` array with one output sample per input sample.
        Input that is too short for ``filtfilt`` (including empty input)
        skips the band-pass and only receives the soft-clip stage.
    """
    samples = np.asarray(audio).flatten()

    nyquist = SAMPLE_RATE / 2
    b, a = butter(4, [300 / nyquist, 3400 / nyquist], btype='band')

    # filtfilt pads both ends with 3 * max(len(a), len(b)) samples by default
    # and raises ValueError unless the input is longer than that padding.
    default_padlen = 3 * max(len(a), len(b))
    if samples.size > default_padlen:
        samples = filtfilt(b, a, samples)

    return (np.tanh(samples * 1.5) * 0.8).astype(np.float32)
|
|
|
|
class SimpleRadio:
    """Minimal interactive radio-show loop for debugging the voice pipeline.

    Ties together faster-whisper (speech-to-text), Ollama (caller replies)
    and ElevenLabs (text-to-speech), driven by single-letter commands typed
    at the terminal.

    Attributes are set in ``__init__``:
        whisper: the loaded ``WhisperModel``.
        tts: the ElevenLabs client.
        ollama: the ``ollama`` module, used for ``ollama.chat``.
        caller: the active ``CALLERS`` entry (name, voice id, system prompt).
        history: chat messages exchanged with the active caller.
    """

    def __init__(self):
        """Load Whisper, create the service clients and select caller "1"."""
        print("Loading Whisper...")
        self.whisper = WhisperModel("base", device="cpu", compute_type="int8")

        print("Connecting to ElevenLabs...")
        # Function-scope import: the package is only needed once a radio is
        # actually constructed, not when this module is imported.
        from elevenlabs.client import ElevenLabs
        self.tts = ElevenLabs(api_key=os.getenv("ELEVENLABS_API_KEY"))

        print("Connecting to Ollama...")
        import ollama
        self.ollama = ollama

        self.caller = CALLERS["1"]
        self.history = []
        print("\nReady!\n")

    def record(self):
        """Capture microphone audio until the user presses Enter.

        Returns:
            The captured audio blocks stacked into a single array, or
            ``None`` if the stream delivered no blocks.
        """
        print(" [Recording - press Enter to stop]")
        chunks = []

        def callback(indata, frames, time, status):
            # Keep a copy; assumes the stream may reuse ``indata`` after the
            # callback returns (see the sounddevice callback documentation).
            chunks.append(indata.copy())

        # Leaving the ``with`` block closes the stream, which is what ends
        # the capture; no separate "stop" flag is needed.
        with sd.InputStream(samplerate=SAMPLE_RATE, channels=1, callback=callback):
            input()  # Wait for Enter

        return np.vstack(chunks) if chunks else None

    def transcribe(self, audio):
        """Return the Whisper transcript of ``audio``.

        Args:
            audio: Samples captured at ``SAMPLE_RATE``.

        Returns:
            The segment texts joined with spaces and stripped; may be empty.
        """
        import librosa

        mono = audio.flatten().astype(np.float32)
        # The recording is resampled from SAMPLE_RATE to 16 kHz before being
        # handed to the Whisper model.
        audio_16k = librosa.resample(mono, orig_sr=SAMPLE_RATE, target_sr=16000)
        segments, _ = self.whisper.transcribe(audio_16k)
        return " ".join(s.text for s in segments).strip()

    def respond(self, text):
        """Ask the LLM for the active caller's reply to ``text``.

        The user message and the reply are both appended to ``self.history``.
        Only the last six history entries are sent, preceded by the caller's
        system prompt.

        Args:
            text: What the host said.

        Returns:
            The reply text.

        Raises:
            Exception: whatever the chat call raises. In that case the user
                message is removed again so history never ends on an
                unanswered turn.
        """
        self.history.append({"role": "user", "content": text})

        try:
            response = self.ollama.chat(
                model="llama3.2:latest",
                messages=[{"role": "system", "content": self.caller[2]}] + self.history[-6:],
                options={"temperature": 0.9},
            )
            reply = response["message"]["content"]
        except Exception:
            # Roll back the turn we just recorded, then let the error surface.
            self.history.pop()
            raise

        self.history.append({"role": "assistant", "content": reply})
        return reply

    def speak(self, text):
        """Synthesize ``text`` in the active caller's voice and play it.

        The TTS output is requested as raw 16-bit PCM at 24 kHz, converted to
        floats in [-1, 1), passed through ``phone_filter`` and played
        synchronously. If the service returns no samples, nothing is played.
        """
        print(" [Generating voice...]")
        audio_gen = self.tts.text_to_speech.convert(
            voice_id=self.caller[1],
            text=text,
            model_id="eleven_turbo_v2_5",
            output_format="pcm_24000",
        )

        audio_bytes = b"".join(audio_gen)
        audio = np.frombuffer(audio_bytes, dtype=np.int16).astype(np.float32) / 32768.0
        if audio.size == 0:
            # Nothing to filter or play.
            print(" [No audio returned]")
            return
        filtered = phone_filter(audio)

        print(" [Playing...]")
        sd.play(filtered, SAMPLE_RATE)
        sd.wait()

    def _converse(self, text):
        """Get, print and speak the caller's reply to ``text``; no-op if empty."""
        if not text:
            return
        print(" [Thinking...]")
        reply = self.respond(text)
        print(f"\n 📞 {self.caller[0].upper()}: {reply}\n")
        self.speak(reply)

    def run(self):
        """Run the interactive command loop until the user enters ``q``."""
        print("=" * 50)
        print(" SIMPLE RADIO - Type commands:")
        print(" 1/2/3 = switch caller")
        print(" r = record & respond")
        print(" t = type message (skip recording)")
        print(" q = quit")
        print("=" * 50)
        print(f"\nCaller: {self.caller[0]}\n")

        while True:
            cmd = input("> ").strip().lower()

            if cmd == 'q':
                break
            elif cmd in CALLERS:
                # Key lookup, not a substring test against '123': the latter
                # also matched "" and "12"/"23" and then raised KeyError.
                self.caller = CALLERS[cmd]
                self.history = []
                print(f"\n📞 Switched to: {self.caller[0]}\n")
            elif cmd == 'r':
                audio = self.record()
                if audio is not None:
                    print(" [Transcribing...]")
                    text = self.transcribe(audio)
                    print(f"\n YOU: {text}\n")
                    self._converse(text)
            elif cmd == 't':
                text = input(" Type message: ")
                self._converse(text)
            else:
                print(" Commands: r=record, t=type, 1/2/3=caller, q=quit")
|
|
|
|
if __name__ == "__main__":
    # Script entry point: build the radio (loads the Whisper model and
    # creates the service clients) and hand control to the command loop.
    radio = SimpleRadio()
    radio.run()
|