Harden LLM: model fallback chain, reuse client, remove fighting timeouts

- Primary model gets 15s, then auto-falls back through gemini-flash,
  gpt-4o-mini, llama-3.1-8b (10s each)
- Always returns a response — canned in-character line as last resort
- Reuse httpx client instead of creating new one per request
- Remove asyncio.timeout wrappers that were killing requests before
  the LLM service could try fallbacks

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-06 22:07:39 -07:00
parent 73129374f4
commit aa3899b1fc
2 changed files with 113 additions and 109 deletions

View File

@@ -1116,28 +1116,22 @@ async def chat(request: ChatRequest):
session.add_message("user", request.text)
# session._research_task = asyncio.create_task(_background_research(request.text))
try:
async with asyncio.timeout(20):
async with _ai_response_lock:
if _session_epoch != epoch:
raise HTTPException(409, "Call ended while waiting")
async with _ai_response_lock:
if _session_epoch != epoch:
raise HTTPException(409, "Call ended while waiting")
# Stop any playing caller audio so responses don't overlap
audio_service.stop_caller_audio()
# Stop any playing caller audio so responses don't overlap
audio_service.stop_caller_audio()
# Include conversation summary and show history for context
conversation_summary = session.get_conversation_summary()
show_history = session.get_show_history()
system_prompt = get_caller_prompt(session.caller, conversation_summary, show_history)
conversation_summary = session.get_conversation_summary()
show_history = session.get_show_history()
system_prompt = get_caller_prompt(session.caller, conversation_summary, show_history)
messages = _normalize_messages_for_llm(session.conversation[-10:])
response = await llm_service.generate(
messages=messages,
system_prompt=system_prompt
)
except TimeoutError:
caller_name = session.caller["name"] if session.caller else "Caller"
return {"text": "Uh... hold on, I lost my train of thought.", "caller": caller_name, "voice_id": session.caller["voice"] if session.caller else ""}
messages = _normalize_messages_for_llm(session.conversation[-10:])
response = await llm_service.generate(
messages=messages,
system_prompt=system_prompt
)
# Discard if call changed while we were generating
if _session_epoch != epoch:
@@ -1644,30 +1638,24 @@ async def _trigger_ai_auto_respond(accumulated_text: str):
ai_name = session.caller["name"]
try:
async with asyncio.timeout(20):
async with _ai_response_lock:
if _session_epoch != epoch:
return # Call changed while waiting for lock
async with _ai_response_lock:
if _session_epoch != epoch:
return # Call changed while waiting for lock
print(f"[Auto-Respond] {ai_name} is jumping in...")
session._last_ai_auto_respond = time.time()
audio_service.stop_caller_audio()
broadcast_event("ai_status", {"text": f"{ai_name} is thinking..."})
print(f"[Auto-Respond] {ai_name} is jumping in...")
session._last_ai_auto_respond = time.time()
audio_service.stop_caller_audio()
broadcast_event("ai_status", {"text": f"{ai_name} is thinking..."})
conversation_summary = session.get_conversation_summary()
show_history = session.get_show_history()
system_prompt = get_caller_prompt(session.caller, conversation_summary, show_history)
conversation_summary = session.get_conversation_summary()
show_history = session.get_show_history()
system_prompt = get_caller_prompt(session.caller, conversation_summary, show_history)
messages = _normalize_messages_for_llm(session.conversation[-10:])
response = await llm_service.generate(
messages=messages,
system_prompt=system_prompt,
)
except TimeoutError:
print(f"[Auto-Respond] Timed out for {ai_name}")
broadcast_event("ai_done")
return
messages = _normalize_messages_for_llm(session.conversation[-10:])
response = await llm_service.generate(
messages=messages,
system_prompt=system_prompt,
)
# Discard if call changed during generation
if _session_epoch != epoch:
@@ -1725,25 +1713,21 @@ async def ai_respond():
epoch = _session_epoch
try:
async with asyncio.timeout(20):
async with _ai_response_lock:
if _session_epoch != epoch:
raise HTTPException(409, "Call ended while waiting")
async with _ai_response_lock:
if _session_epoch != epoch:
raise HTTPException(409, "Call ended while waiting")
audio_service.stop_caller_audio()
audio_service.stop_caller_audio()
conversation_summary = session.get_conversation_summary()
show_history = session.get_show_history()
system_prompt = get_caller_prompt(session.caller, conversation_summary, show_history)
conversation_summary = session.get_conversation_summary()
show_history = session.get_show_history()
system_prompt = get_caller_prompt(session.caller, conversation_summary, show_history)
messages = _normalize_messages_for_llm(session.conversation[-10:])
response = await llm_service.generate(
messages=messages,
system_prompt=system_prompt
)
except TimeoutError:
return {"text": "Uh... sorry, I spaced out for a second there.", "caller": session.caller["name"], "voice_id": session.caller["voice"]}
messages = _normalize_messages_for_llm(session.conversation[-10:])
response = await llm_service.generate(
messages=messages,
system_prompt=system_prompt
)
if _session_epoch != epoch:
raise HTTPException(409, "Call changed during response")

View File

@@ -17,6 +17,13 @@ OPENROUTER_MODELS = [
"mistralai/mistral-7b-instruct",
]
# Fast models to try as fallbacks (cheap, fast, good enough for conversation)
# Tried in this order after the configured primary model fails; each fallback
# attempt runs with a shorter per-request timeout than the primary.
FALLBACK_MODELS = [
    "google/gemini-flash-1.5",
    "openai/gpt-4o-mini",
    "meta-llama/llama-3.1-8b-instruct",
]
class LLMService:
"""Abstraction layer for LLM providers"""
@@ -27,6 +34,13 @@ class LLMService:
self.ollama_model = settings.ollama_model
self.ollama_host = settings.ollama_host
self.tts_provider = settings.tts_provider
self._client: httpx.AsyncClient | None = None
@property
def client(self) -> httpx.AsyncClient:
    """Shared httpx client, created lazily and rebuilt if it was closed."""
    current = self._client
    if current is None or current.is_closed:
        current = httpx.AsyncClient(timeout=15.0)
        self._client = current
    return current
def update_settings(
self,
@@ -47,7 +61,6 @@ class LLMService:
self.ollama_host = ollama_host
if tts_provider:
self.tts_provider = tts_provider
# Also update the global settings so TTS service picks it up
settings.tts_provider = tts_provider
async def get_ollama_models(self) -> list[str]:
@@ -71,7 +84,7 @@ class LLMService:
"ollama_host": self.ollama_host,
"tts_provider": self.tts_provider,
"available_openrouter_models": OPENROUTER_MODELS,
"available_ollama_models": [] # Fetched separately
"available_ollama_models": []
}
async def get_settings_async(self) -> dict:
@@ -92,57 +105,64 @@ class LLMService:
messages: list[dict],
system_prompt: Optional[str] = None
) -> str:
"""
Generate a response from the LLM.
Args:
messages: List of message dicts with 'role' and 'content'
system_prompt: Optional system prompt to prepend
Returns:
Generated text response
"""
if system_prompt:
messages = [{"role": "system", "content": system_prompt}] + messages
if self.provider == "openrouter":
return await self._call_openrouter(messages)
return await self._call_openrouter_with_fallback(messages)
else:
return await self._call_ollama(messages)
async def _call_openrouter(self, messages: list[dict]) -> str:
    """Call the OpenRouter chat-completions API, retrying once on timeout.

    Returns the model's text, "" for an empty completion, or a canned
    in-character line on repeated timeout / any other error.
    """
    for try_no in range(2):
        try:
            async with httpx.AsyncClient(timeout=25.0) as client:
                response = await client.post(
                    "https://openrouter.ai/api/v1/chat/completions",
                    headers={
                        "Authorization": f"Bearer {settings.openrouter_api_key}",
                        "Content-Type": "application/json",
                    },
                    json={
                        "model": self.openrouter_model,
                        "messages": messages,
                        "max_tokens": 150,
                    },
                )
                response.raise_for_status()
                payload = response.json()
                text = payload["choices"][0]["message"]["content"]
                if text and text.strip():
                    return text
                print("OpenRouter returned empty response")
                return ""
        except (httpx.TimeoutException, httpx.ReadTimeout):
            print(f"OpenRouter timeout (attempt {try_no + 1})")
            if try_no > 0:
                return "Uh, sorry, I lost you there for a second. What was that?"
            # First timeout: fall through and loop for one retry.
        except Exception as e:
            print(f"OpenRouter error: {e}")
            return "Yeah... I don't know, man."
    # Safety net; every loop path above returns, so this is effectively unreachable.
    return "Uh, hold on a sec..."
async def _call_openrouter_with_fallback(self, messages: list[dict]) -> str:
    """Ask the configured model, then each FALLBACK_MODELS entry; never raises.

    The primary model gets the default request budget; each fallback runs
    with a tighter 10s timeout. If every model fails, a canned in-character
    line is returned so the caller always has something to say.
    """
    primary = self.openrouter_model
    reply = await self._call_openrouter_once(messages, primary)
    if reply is None:
        for candidate in FALLBACK_MODELS:
            if candidate == primary:
                continue  # already attempted as the primary model
            print(f"[LLM] Falling back to {candidate}...")
            reply = await self._call_openrouter_once(messages, candidate, timeout=10.0)
            if reply is not None:
                break
    if reply is not None:
        return reply
    print("[LLM] All models failed, using canned response")
    return "Sorry, I totally blanked out for a second. What were you saying?"
async def _call_openrouter_once(self, messages: list[dict], model: str, timeout: float = 15.0) -> str | None:
    """One OpenRouter request against *model*.

    Returns the completion text on success, or None on timeout, HTTP error,
    malformed payload, or an empty completion — the caller decides whether
    to fall back. No user-facing strings are produced here.
    """
    try:
        resp = await self.client.post(
            "https://openrouter.ai/api/v1/chat/completions",
            headers={
                "Authorization": f"Bearer {settings.openrouter_api_key}",
                "Content-Type": "application/json",
            },
            json={
                "model": model,
                "messages": messages,
                "max_tokens": 150,
            },
            timeout=timeout,  # per-request override of the shared client's default
        )
        resp.raise_for_status()
        body = resp.json()
        text = body["choices"][0]["message"]["content"]
    except httpx.TimeoutException:
        print(f"[LLM] {model} timed out ({timeout}s)")
        return None
    except Exception as e:
        print(f"[LLM] {model} error: {e}")
        return None
    if text and text.strip():
        return text
    print(f"[LLM] {model} returned empty response")
    return None
async def _call_ollama(self, messages: list[dict]) -> str:
"""Call Ollama API"""
@@ -155,11 +175,11 @@ class LLMService:
"messages": messages,
"stream": False,
"options": {
"num_predict": 100, # Allow complete thoughts
"temperature": 0.8, # Balanced creativity/coherence
"top_p": 0.9, # Focused word choices
"repeat_penalty": 1.3, # Avoid repetition
"top_k": 50, # Reasonable token variety
"num_predict": 100,
"temperature": 0.8,
"top_p": 0.9,
"repeat_penalty": 1.3,
"top_k": 50,
},
},
timeout=30.0
@@ -169,10 +189,10 @@ class LLMService:
return data["message"]["content"]
except httpx.TimeoutException:
print("Ollama timeout")
return "Uh, sorry, I lost you there for a second. What was that?"
return "Sorry, I totally blanked out for a second. What were you saying?"
except Exception as e:
print(f"Ollama error: {e}")
return "Yeah... I don't know, man."
return "Sorry, I totally blanked out for a second. What were you saying?"
# Global instance