Add Twilio WebSocket media stream handler with real-time transcription

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-05 13:36:04 -07:00
parent 28ff8c2d16
commit 88d7fd3457
2 changed files with 179 additions and 1 deletions
@@ -4,10 +4,14 @@ import uuid
 import asyncio
 from dataclasses import dataclass, field
 from pathlib import Path
-from fastapi import FastAPI, HTTPException, UploadFile, File, Form
+from fastapi import FastAPI, HTTPException, UploadFile, File, Form, WebSocket, WebSocketDisconnect
 from fastapi.staticfiles import StaticFiles
 from fastapi.responses import FileResponse, Response
 from twilio.twiml.voice_response import VoiceResponse
+import json
+import base64
+import audioop
+import time
 from fastapi.middleware.cors import CORSMiddleware
 from pydantic import BaseModel
 from typing import Optional
@@ -849,6 +853,144 @@ async def drop_from_queue(call_sid: str):
    return {"status": "dropped"}


+# --- Twilio WebSocket Media Stream ---
+
+@app.websocket("/api/twilio/stream")
+async def twilio_media_stream(websocket: WebSocket):
+    """Handle Twilio Media Streams WebSocket — bidirectional audio"""
+    await websocket.accept()
+    print("[Twilio WS] Media stream connected")
+
+    call_sid = None
+    stream_sid = None
+    audio_buffer = bytearray()
+    CHUNK_DURATION_S = 3  # Transcribe every 3 seconds of audio
+    MULAW_SAMPLE_RATE = 8000
+    chunk_samples = CHUNK_DURATION_S * MULAW_SAMPLE_RATE
+
+    try:
+        while True:
+            data = await websocket.receive_text()
+            msg = json.loads(data)
+            event = msg.get("event")
+
+            if event == "start":
+                stream_sid = msg["start"]["streamSid"]
+                call_sid = msg["start"]["callSid"]
+                print(f"[Twilio WS] Stream started: {stream_sid} for call {call_sid}")
+
+            elif event == "media":
+                # Decode mulaw audio from base64
+                payload = base64.b64decode(msg["media"]["payload"])
+                # Convert mulaw to 16-bit PCM
+                pcm_data = audioop.ulaw2lin(payload, 2)
+                audio_buffer.extend(pcm_data)
+
+                # Get channel for this caller
+                call_info = twilio_service.active_calls.get(call_sid)
+                if call_info:
+                    channel = call_info["channel"]
+                    # Route PCM to the caller's dedicated Loopback channel
+                    audio_service.route_real_caller_audio(pcm_data, channel, MULAW_SAMPLE_RATE)
+
+                # When we have enough audio, transcribe
+                if len(audio_buffer) >= chunk_samples * 2:  # 2 bytes per sample
+                    pcm_chunk = bytes(audio_buffer[:chunk_samples * 2])
+                    audio_buffer = audio_buffer[chunk_samples * 2:]
+
+                    # Transcribe in background
+                    asyncio.create_task(
+                        _handle_real_caller_transcription(call_sid, pcm_chunk, MULAW_SAMPLE_RATE)
+                    )
+
+            elif event == "stop":
+                print(f"[Twilio WS] Stream stopped: {stream_sid}")
+                break
+
+    except WebSocketDisconnect:
+        print(f"[Twilio WS] Disconnected: {call_sid}")
+    except Exception as e:
+        print(f"[Twilio WS] Error: {e}")
+    finally:
+        # Transcribe any remaining audio
+        if audio_buffer and call_sid:
+            asyncio.create_task(
+                _handle_real_caller_transcription(call_sid, bytes(audio_buffer), MULAW_SAMPLE_RATE)
+            )
+
+
+async def _handle_real_caller_transcription(call_sid: str, pcm_data: bytes, sample_rate: int):
+    """Transcribe a chunk of real caller audio and add to conversation"""
+    call_info = twilio_service.active_calls.get(call_sid)
+    if not call_info:
+        return
+
+    text = await transcribe_audio(pcm_data, source_sample_rate=sample_rate)
+    if not text or not text.strip():
+        return
+
+    caller_name = call_info["name"]
+    print(f"[Real Caller] {caller_name}: {text}")
+
+    # Add to conversation with real_caller role
+    session.add_message(f"real_caller:{caller_name}", text)
+
+    # If AI auto-respond mode is on and an AI caller is active, check if AI should respond
+    if session.ai_respond_mode == "auto" and session.current_caller_key:
+        asyncio.create_task(_check_ai_auto_respond(text, caller_name))
+
+
+async def _check_ai_auto_respond(real_caller_text: str, real_caller_name: str):
+    """Check if AI caller should jump in, and generate response if so"""
+    if not session.caller:
+        return
+
+    # Cooldown check
+    if not hasattr(session, '_last_ai_auto_respond'):
+        session._last_ai_auto_respond = 0
+    if time.time() - session._last_ai_auto_respond < 10:
+        return
+
+    ai_name = session.caller["name"]
+
+    # Quick "should I respond?" check with minimal LLM call
+    should_respond = await llm_service.generate(
+        messages=[{"role": "user", "content": f'Someone just said: "{real_caller_text}". Should {ai_name} jump in? Reply only YES or NO.'}],
+        system_prompt=f"You're deciding if {ai_name} should respond to what was just said on a radio show. Say YES if it's interesting or relevant to them, NO if not.",
+    )
+
+    if "YES" not in should_respond.upper():
+        return
+
+    print(f"[Auto-Respond] {ai_name} is jumping in...")
+    session._last_ai_auto_respond = time.time()
+
+    # Generate full response
+    conversation_summary = session.get_conversation_summary()
+    system_prompt = get_caller_prompt(session.caller, conversation_summary)
+
+    response = await llm_service.generate(
+        messages=session.conversation[-10:],
+        system_prompt=system_prompt,
+    )
+    response = clean_for_tts(response)
+    if not response or not response.strip():
+        return
+
+    session.add_message(f"ai_caller:{ai_name}", response)
+
+    # Generate TTS and play
+    audio_bytes = await generate_speech(response, session.caller["voice"], "none")
+
+    import threading
+    thread = threading.Thread(
+        target=audio_service.play_caller_audio,
+        args=(audio_bytes, 24000),
+        daemon=True,
+    )
+    thread.start()
+
+
 # --- Server Control Endpoints ---

 import subprocess
@@ -309,6 +309,42 @@ class AudioService:
        """Stop any playing caller audio"""
        self._caller_stop_event.set()

+    def route_real_caller_audio(self, pcm_data: bytes, channel: int, sample_rate: int):
+        """Route real caller PCM audio to a specific Loopback channel"""
+        import librosa
+
+        if self.output_device is None:
+            return
+
+        try:
+            # Convert bytes to float32
+            audio = np.frombuffer(pcm_data, dtype=np.int16).astype(np.float32) / 32768.0
+
+            device_info = sd.query_devices(self.output_device)
+            num_channels = device_info['max_output_channels']
+            device_sr = int(device_info['default_samplerate'])
+            channel_idx = min(channel, num_channels) - 1
+
+            # Resample from Twilio's 8kHz to device sample rate
+            if sample_rate != device_sr:
+                audio = librosa.resample(audio, orig_sr=sample_rate, target_sr=device_sr)
+
+            # Create multi-channel output
+            multi_ch = np.zeros((len(audio), num_channels), dtype=np.float32)
+            multi_ch[:, channel_idx] = audio
+
+            # Write to output device
+            with sd.OutputStream(
+                device=self.output_device,
+                samplerate=device_sr,
+                channels=num_channels,
+                dtype=np.float32,
+            ) as stream:
+                stream.write(multi_ch)
+
+        except Exception as e:
+            print(f"Real caller audio routing error: {e}")
+
    # --- Music Playback ---

    def load_music(self, file_path: str) -> bool: