diff --git a/backend/main.py b/backend/main.py
index 129251b..5d43c9b 100644
--- a/backend/main.py
+++ b/backend/main.py
@@ -673,11 +673,11 @@ async def text_to_speech(request: TTSRequest):
)
thread.start()
- # Also send to active real callers so they hear the AI
+ # Also stream to active real callers so they hear the AI
if session.active_real_caller:
caller_id = session.active_real_caller["caller_id"]
asyncio.create_task(
- caller_service.send_audio_to_caller(caller_id, audio_bytes, 24000)
+ caller_service.stream_audio_to_caller(caller_id, audio_bytes, 24000)
)
return {"status": "playing", "duration": len(audio_bytes) / 2 / 24000}
diff --git a/backend/services/caller_service.py b/backend/services/caller_service.py
index 75d282e..6cc12ee 100644
--- a/backend/services/caller_service.py
+++ b/backend/services/caller_service.py
@@ -3,6 +3,7 @@
import asyncio
import time
import threading
+import numpy as np
from typing import Optional
@@ -111,14 +112,16 @@ class CallerService:
self._websockets.pop(caller_id, None)
async def send_audio_to_caller(self, caller_id: str, pcm_data: bytes, sample_rate: int):
- """Send audio to real caller via WebSocket binary frame"""
+ """Send small audio chunk to real caller via WebSocket binary frame.
+ For short chunks (host mic, ≤960 samples), sends immediately.
+ For large chunks (TTS), use stream_audio_to_caller instead.
+ """
ws = self._websockets.get(caller_id)
if not ws:
return
try:
if sample_rate != 16000:
- import numpy as np
audio = np.frombuffer(pcm_data, dtype=np.int16).astype(np.float32) / 32768.0
ratio = 16000 / sample_rate
out_len = int(len(audio) * ratio)
@@ -130,6 +133,34 @@ class CallerService:
except Exception as e:
print(f"[Caller] Failed to send audio: {e}")
+ async def stream_audio_to_caller(self, caller_id: str, pcm_data: bytes, sample_rate: int):
+ """Stream large audio (TTS) to caller in real-time chunks to avoid buffer overflow."""
+ ws = self._websockets.get(caller_id)
+ if not ws:
+ return
+
+ try:
+ audio = np.frombuffer(pcm_data, dtype=np.int16).astype(np.float32) / 32768.0
+ if sample_rate != 16000:
+ ratio = 16000 / sample_rate
+ out_len = int(len(audio) * ratio)
+ indices = (np.arange(out_len) / ratio).astype(int)
+ indices = np.clip(indices, 0, len(audio) - 1)
+ audio = audio[indices]
+
+ # Send in 60ms chunks at real-time rate
+ chunk_samples = 960
+ for i in range(0, len(audio), chunk_samples):
+ if caller_id not in self._websockets:
+ break
+ chunk = audio[i:i + chunk_samples]
+ pcm_chunk = (chunk * 32767).astype(np.int16).tobytes()
+ await ws.send_bytes(pcm_chunk)
+ await asyncio.sleep(0.055) # ~60ms, slightly under to stay ahead
+
+ except Exception as e:
+ print(f"[Caller] Failed to stream audio: {e}")
+
async def notify_caller(self, caller_id: str, message: dict):
"""Send JSON control message to caller"""
ws = self._websockets.get(caller_id)
diff --git a/frontend/call-in.html b/frontend/call-in.html
index 055cbbe..a9dc4f1 100644
--- a/frontend/call-in.html
+++ b/frontend/call-in.html
@@ -150,6 +150,6 @@
-
+