Stream TTS audio to caller in real-time chunks
TTS audio was sent as a single huge WebSocket frame that overflowed the browser's 3s ring buffer. Now streams in 60ms chunks at real-time rate. Also increased browser ring buffer from 3s to 10s as safety net. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -673,11 +673,11 @@ async def text_to_speech(request: TTSRequest):
|
|||||||
)
|
)
|
||||||
thread.start()
|
thread.start()
|
||||||
|
|
||||||
# Also send to active real callers so they hear the AI
|
# Also stream to active real callers so they hear the AI
|
||||||
if session.active_real_caller:
|
if session.active_real_caller:
|
||||||
caller_id = session.active_real_caller["caller_id"]
|
caller_id = session.active_real_caller["caller_id"]
|
||||||
asyncio.create_task(
|
asyncio.create_task(
|
||||||
caller_service.send_audio_to_caller(caller_id, audio_bytes, 24000)
|
caller_service.stream_audio_to_caller(caller_id, audio_bytes, 24000)
|
||||||
)
|
)
|
||||||
|
|
||||||
return {"status": "playing", "duration": len(audio_bytes) / 2 / 24000}
|
return {"status": "playing", "duration": len(audio_bytes) / 2 / 24000}
|
||||||
|
|||||||
@@ -3,6 +3,7 @@
|
|||||||
import asyncio
|
import asyncio
|
||||||
import time
|
import time
|
||||||
import threading
|
import threading
|
||||||
|
import numpy as np
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
|
|
||||||
@@ -111,14 +112,16 @@ class CallerService:
|
|||||||
self._websockets.pop(caller_id, None)
|
self._websockets.pop(caller_id, None)
|
||||||
|
|
||||||
async def send_audio_to_caller(self, caller_id: str, pcm_data: bytes, sample_rate: int):
|
async def send_audio_to_caller(self, caller_id: str, pcm_data: bytes, sample_rate: int):
|
||||||
"""Send audio to real caller via WebSocket binary frame"""
|
"""Send small audio chunk to real caller via WebSocket binary frame.
|
||||||
|
For short chunks (host mic, ≤960 samples), sends immediately.
|
||||||
|
For large chunks (TTS), use stream_audio_to_caller instead.
|
||||||
|
"""
|
||||||
ws = self._websockets.get(caller_id)
|
ws = self._websockets.get(caller_id)
|
||||||
if not ws:
|
if not ws:
|
||||||
return
|
return
|
||||||
|
|
||||||
try:
|
try:
|
||||||
if sample_rate != 16000:
|
if sample_rate != 16000:
|
||||||
import numpy as np
|
|
||||||
audio = np.frombuffer(pcm_data, dtype=np.int16).astype(np.float32) / 32768.0
|
audio = np.frombuffer(pcm_data, dtype=np.int16).astype(np.float32) / 32768.0
|
||||||
ratio = 16000 / sample_rate
|
ratio = 16000 / sample_rate
|
||||||
out_len = int(len(audio) * ratio)
|
out_len = int(len(audio) * ratio)
|
||||||
@@ -130,6 +133,34 @@ class CallerService:
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"[Caller] Failed to send audio: {e}")
|
print(f"[Caller] Failed to send audio: {e}")
|
||||||
|
|
||||||
|
async def stream_audio_to_caller(self, caller_id: str, pcm_data: bytes, sample_rate: int):
|
||||||
|
"""Stream large audio (TTS) to caller in real-time chunks to avoid buffer overflow."""
|
||||||
|
ws = self._websockets.get(caller_id)
|
||||||
|
if not ws:
|
||||||
|
return
|
||||||
|
|
||||||
|
try:
|
||||||
|
audio = np.frombuffer(pcm_data, dtype=np.int16).astype(np.float32) / 32768.0
|
||||||
|
if sample_rate != 16000:
|
||||||
|
ratio = 16000 / sample_rate
|
||||||
|
out_len = int(len(audio) * ratio)
|
||||||
|
indices = (np.arange(out_len) / ratio).astype(int)
|
||||||
|
indices = np.clip(indices, 0, len(audio) - 1)
|
||||||
|
audio = audio[indices]
|
||||||
|
|
||||||
|
# Send in 60ms chunks at real-time rate
|
||||||
|
chunk_samples = 960
|
||||||
|
for i in range(0, len(audio), chunk_samples):
|
||||||
|
if caller_id not in self._websockets:
|
||||||
|
break
|
||||||
|
chunk = audio[i:i + chunk_samples]
|
||||||
|
pcm_chunk = (chunk * 32767).astype(np.int16).tobytes()
|
||||||
|
await ws.send_bytes(pcm_chunk)
|
||||||
|
await asyncio.sleep(0.055) # ~60ms, slightly under to stay ahead
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
print(f"[Caller] Failed to stream audio: {e}")
|
||||||
|
|
||||||
async def notify_caller(self, caller_id: str, message: dict):
|
async def notify_caller(self, caller_id: str, message: dict):
|
||||||
"""Send JSON control message to caller"""
|
"""Send JSON control message to caller"""
|
||||||
ws = self._websockets.get(caller_id)
|
ws = self._websockets.get(caller_id)
|
||||||
|
|||||||
@@ -150,6 +150,6 @@
|
|||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
<script src="/js/call-in.js?v=4"></script>
|
<script src="/js/call-in.js?v=5"></script>
|
||||||
</body>
|
</body>
|
||||||
</html>
|
</html>
|
||||||
|
|||||||
@@ -78,7 +78,7 @@ registerProcessor('caller-processor', CallerProcessor);
|
|||||||
class PlaybackProcessor extends AudioWorkletProcessor {
|
class PlaybackProcessor extends AudioWorkletProcessor {
|
||||||
constructor() {
|
constructor() {
|
||||||
super();
|
super();
|
||||||
this.ringSize = 16000 * 3; // 3s ring buffer at 16kHz
|
this.ringSize = 16000 * 10; // 10s ring buffer at 16kHz
|
||||||
this.ring = new Float32Array(this.ringSize);
|
this.ring = new Float32Array(this.ringSize);
|
||||||
this.writePos = 0;
|
this.writePos = 0;
|
||||||
this.readPos = 0;
|
this.readPos = 0;
|
||||||
|
|||||||
Reference in New Issue
Block a user