Files
ai-podcast/backend/services/transcription.py
tcpsyn 3164a70e48 Ep13 publish, MLX whisper, voicemail system, hero redesign, massive topic expansion
- Switch whisper transcription from faster-whisper (CPU) to lightning-whisper-mlx (GPU)
- Fix word_timestamps hanging, use ffprobe for accurate duration
- Add Cloudflare Pages Worker for SignalWire voicemail fallback when server offline
- Add voicemail sync on startup, delete tracking, save feature
- Add /feed RSS proxy to _worker.js (was broken by worker taking over routing)
- Redesign website hero section: ghost buttons, compact phone, plain text links
- Rewrite caller prompts so callers get to their point faster and follow the host's lead
- Expand TOPIC_CALLIN from ~250 to 547 entries across 34 categories
- Add new categories: biology, psychology, engineering, math, geology, animals,
  work, money, books, movies, relationships, health, language, true crime,
  drunk/high/unhinged callers
- Remove bad Inworld voices (Pixie, Dominus), reduce repeat caller frequency
- Add audio monitor device routing, uvicorn --reload-dir fix
- Publish episode 13

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-16 01:56:47 -07:00

115 lines
3.8 KiB
Python

"""Whisper transcription service"""
import os
import tempfile

import librosa
import numpy as np
from faster_whisper import WhisperModel
# Global model instance (loaded once)
_whisper_model = None
def get_whisper_model() -> WhisperModel:
"""Get or create Whisper model instance"""
global _whisper_model
if _whisper_model is None:
print("Loading Whisper base model...")
_whisper_model = WhisperModel("base", device="cpu", compute_type="int8")
print("Whisper model loaded")
return _whisper_model
def decode_audio(audio_data: bytes, source_sample_rate: int = None) -> tuple[np.ndarray, int]:
"""
Decode audio from various formats to numpy array.
Args:
audio_data: Raw audio bytes
source_sample_rate: If provided, treat as raw PCM at this sample rate
Returns:
Tuple of (audio array as float32, sample rate)
"""
# If sample rate is provided, assume raw PCM (from server-side recording)
if source_sample_rate is not None:
print(f"Decoding raw PCM at {source_sample_rate}Hz, {len(audio_data)} bytes")
if len(audio_data) % 2 != 0:
audio_data = audio_data + b'\x00'
audio = np.frombuffer(audio_data, dtype=np.int16).astype(np.float32) / 32768.0
return audio, source_sample_rate
print(f"First 20 bytes: {audio_data[:20].hex()}")
# Try to decode with librosa first (handles webm, ogg, wav, mp3, etc via ffmpeg)
try:
with tempfile.NamedTemporaryFile(suffix='.webm', delete=False) as f:
f.write(audio_data)
temp_path = f.name
audio, sample_rate = librosa.load(temp_path, sr=None, mono=True)
print(f"Decoded with librosa: {len(audio)} samples at {sample_rate}Hz")
import os
os.unlink(temp_path)
return audio.astype(np.float32), sample_rate
except Exception as e:
print(f"librosa decode failed: {e}, trying raw PCM at 16kHz...")
# Fall back to raw PCM (16-bit signed int, 16kHz mono - Whisper's rate)
if len(audio_data) % 2 != 0:
audio_data = audio_data + b'\x00'
audio = np.frombuffer(audio_data, dtype=np.int16).astype(np.float32) / 32768.0
return audio, 16000
async def transcribe_audio(audio_data: bytes, source_sample_rate: int = None) -> str:
"""
Transcribe audio data to text using Whisper.
Args:
audio_data: Audio bytes (webm, ogg, wav, or raw PCM)
source_sample_rate: If provided, treat audio_data as raw PCM at this rate
Returns:
Transcribed text
"""
model = get_whisper_model()
print(f"Transcribing audio: {len(audio_data)} bytes")
# Decode audio from whatever format
audio, detected_sample_rate = decode_audio(audio_data, source_sample_rate)
print(f"Audio samples: {len(audio)}, duration: {len(audio)/detected_sample_rate:.2f}s")
print(f"Audio range: min={audio.min():.4f}, max={audio.max():.4f}")
# Check if audio is too quiet
if np.abs(audio).max() < 0.01:
print("Warning: Audio appears to be silent or very quiet")
return ""
# Resample to 16kHz for Whisper
if detected_sample_rate != 16000:
audio_16k = librosa.resample(audio, orig_sr=detected_sample_rate, target_sr=16000)
print(f"Resampled to {len(audio_16k)} samples at 16kHz")
else:
audio_16k = audio
# Transcribe
segments, info = model.transcribe(
audio_16k,
beam_size=3,
language="en",
vad_filter=True,
initial_prompt="Luke at the Roost, a late-night radio talk show. The host Luke talks to callers about life, relationships, sports, politics, and pop culture.",
)
segments_list = list(segments)
text = " ".join([s.text for s in segments_list]).strip()
print(f"Transcription result: '{text}' (language: {info.language}, prob: {info.language_probability:.2f})")
return text