From d4e25ceb88cbab69a18ddd4a2b7f4ab858dc04e2 Mon Sep 17 00:00:00 2001 From: tcpsyn Date: Thu, 5 Feb 2026 16:56:22 -0700 Subject: [PATCH] Stream TTS audio to caller in real-time chunks TTS audio was sent as a single huge WebSocket frame that overflowed the browser's 3s ring buffer. Now streams in 60ms chunks at real-time rate. Also increased browser ring buffer from 3s to 10s as safety net. Co-Authored-By: Claude Opus 4.6 --- backend/main.py | 4 ++-- backend/services/caller_service.py | 35 ++++++++++++++++++++++++++++-- frontend/call-in.html | 2 +- frontend/js/call-in.js | 2 +- 4 files changed, 37 insertions(+), 6 deletions(-) diff --git a/backend/main.py b/backend/main.py index 129251b..5d43c9b 100644 --- a/backend/main.py +++ b/backend/main.py @@ -673,11 +673,11 @@ async def text_to_speech(request: TTSRequest): ) thread.start() - # Also send to active real callers so they hear the AI + # Also stream to active real callers so they hear the AI if session.active_real_caller: caller_id = session.active_real_caller["caller_id"] asyncio.create_task( - caller_service.send_audio_to_caller(caller_id, audio_bytes, 24000) + caller_service.stream_audio_to_caller(caller_id, audio_bytes, 24000) ) return {"status": "playing", "duration": len(audio_bytes) / 2 / 24000} diff --git a/backend/services/caller_service.py b/backend/services/caller_service.py index 75d282e..6cc12ee 100644 --- a/backend/services/caller_service.py +++ b/backend/services/caller_service.py @@ -3,6 +3,7 @@ import asyncio import time import threading +import numpy as np from typing import Optional @@ -111,14 +112,16 @@ class CallerService: self._websockets.pop(caller_id, None) async def send_audio_to_caller(self, caller_id: str, pcm_data: bytes, sample_rate: int): - """Send audio to real caller via WebSocket binary frame""" + """Send small audio chunk to real caller via WebSocket binary frame. + For short chunks (host mic, ≤960 samples), sends immediately. + For large chunks (TTS), use stream_audio_to_caller instead. + """ ws = self._websockets.get(caller_id) if not ws: return try: if sample_rate != 16000: - import numpy as np audio = np.frombuffer(pcm_data, dtype=np.int16).astype(np.float32) / 32768.0 ratio = 16000 / sample_rate out_len = int(len(audio) * ratio) @@ -130,6 +133,34 @@ class CallerService: except Exception as e: print(f"[Caller] Failed to send audio: {e}") + async def stream_audio_to_caller(self, caller_id: str, pcm_data: bytes, sample_rate: int): + """Stream large audio (TTS) to caller in real-time chunks to avoid buffer overflow.""" + ws = self._websockets.get(caller_id) + if not ws: + return + + try: + audio = np.frombuffer(pcm_data, dtype=np.int16).astype(np.float32) / 32768.0 + if sample_rate != 16000: + ratio = 16000 / sample_rate + out_len = int(len(audio) * ratio) + indices = (np.arange(out_len) / ratio).astype(int) + indices = np.clip(indices, 0, len(audio) - 1) + audio = audio[indices] + + # Send in 60ms chunks at real-time rate + chunk_samples = 960 + for i in range(0, len(audio), chunk_samples): + if caller_id not in self._websockets: + break + chunk = audio[i:i + chunk_samples] + pcm_chunk = (chunk * 32767).astype(np.int16).tobytes() + await ws.send_bytes(pcm_chunk) + await asyncio.sleep(0.055) # ~60ms, slightly under to stay ahead + + except Exception as e: + print(f"[Caller] Failed to stream audio: {e}") + async def notify_caller(self, caller_id: str, message: dict): """Send JSON control message to caller""" ws = self._websockets.get(caller_id) diff --git a/frontend/call-in.html b/frontend/call-in.html index 055cbbe..a9dc4f1 100644 --- a/frontend/call-in.html +++ b/frontend/call-in.html @@ -150,6 +150,6 @@ - + diff --git a/frontend/js/call-in.js b/frontend/js/call-in.js index 7293c37..0396b68 100644 --- a/frontend/js/call-in.js +++ b/frontend/js/call-in.js @@ -78,7 +78,7 @@ registerProcessor('caller-processor', CallerProcessor); class PlaybackProcessor extends AudioWorkletProcessor { constructor() { super(); - this.ringSize = 16000 * 3; // 3s ring buffer at 16kHz + this.ringSize = 16000 * 10; // 10s ring buffer at 16kHz this.ring = new Float32Array(this.ringSize); this.writePos = 0; this.readPos = 0;