Harden LLM: model fallback chain, reuse client, remove fighting timeouts
- Primary model gets 15s, then auto-falls back through gemini-flash, gpt-4o-mini, llama-3.1-8b (10s each)
- Always returns a response — canned in-character line as last resort
- Reuse httpx client instead of creating new one per request
- Remove asyncio.timeout wrappers that were killing requests before the LLM service could try fallbacks

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1116,28 +1116,22 @@ async def chat(request: ChatRequest):
|
||||
session.add_message("user", request.text)
|
||||
# session._research_task = asyncio.create_task(_background_research(request.text))
|
||||
|
||||
try:
|
||||
async with asyncio.timeout(20):
|
||||
async with _ai_response_lock:
|
||||
if _session_epoch != epoch:
|
||||
raise HTTPException(409, "Call ended while waiting")
|
||||
async with _ai_response_lock:
|
||||
if _session_epoch != epoch:
|
||||
raise HTTPException(409, "Call ended while waiting")
|
||||
|
||||
# Stop any playing caller audio so responses don't overlap
|
||||
audio_service.stop_caller_audio()
|
||||
# Stop any playing caller audio so responses don't overlap
|
||||
audio_service.stop_caller_audio()
|
||||
|
||||
# Include conversation summary and show history for context
|
||||
conversation_summary = session.get_conversation_summary()
|
||||
show_history = session.get_show_history()
|
||||
system_prompt = get_caller_prompt(session.caller, conversation_summary, show_history)
|
||||
conversation_summary = session.get_conversation_summary()
|
||||
show_history = session.get_show_history()
|
||||
system_prompt = get_caller_prompt(session.caller, conversation_summary, show_history)
|
||||
|
||||
messages = _normalize_messages_for_llm(session.conversation[-10:])
|
||||
response = await llm_service.generate(
|
||||
messages=messages,
|
||||
system_prompt=system_prompt
|
||||
)
|
||||
except TimeoutError:
|
||||
caller_name = session.caller["name"] if session.caller else "Caller"
|
||||
return {"text": "Uh... hold on, I lost my train of thought.", "caller": caller_name, "voice_id": session.caller["voice"] if session.caller else ""}
|
||||
messages = _normalize_messages_for_llm(session.conversation[-10:])
|
||||
response = await llm_service.generate(
|
||||
messages=messages,
|
||||
system_prompt=system_prompt
|
||||
)
|
||||
|
||||
# Discard if call changed while we were generating
|
||||
if _session_epoch != epoch:
|
||||
@@ -1644,30 +1638,24 @@ async def _trigger_ai_auto_respond(accumulated_text: str):
|
||||
|
||||
ai_name = session.caller["name"]
|
||||
|
||||
try:
|
||||
async with asyncio.timeout(20):
|
||||
async with _ai_response_lock:
|
||||
if _session_epoch != epoch:
|
||||
return # Call changed while waiting for lock
|
||||
async with _ai_response_lock:
|
||||
if _session_epoch != epoch:
|
||||
return # Call changed while waiting for lock
|
||||
|
||||
print(f"[Auto-Respond] {ai_name} is jumping in...")
|
||||
session._last_ai_auto_respond = time.time()
|
||||
audio_service.stop_caller_audio()
|
||||
broadcast_event("ai_status", {"text": f"{ai_name} is thinking..."})
|
||||
print(f"[Auto-Respond] {ai_name} is jumping in...")
|
||||
session._last_ai_auto_respond = time.time()
|
||||
audio_service.stop_caller_audio()
|
||||
broadcast_event("ai_status", {"text": f"{ai_name} is thinking..."})
|
||||
|
||||
conversation_summary = session.get_conversation_summary()
|
||||
show_history = session.get_show_history()
|
||||
system_prompt = get_caller_prompt(session.caller, conversation_summary, show_history)
|
||||
conversation_summary = session.get_conversation_summary()
|
||||
show_history = session.get_show_history()
|
||||
system_prompt = get_caller_prompt(session.caller, conversation_summary, show_history)
|
||||
|
||||
messages = _normalize_messages_for_llm(session.conversation[-10:])
|
||||
response = await llm_service.generate(
|
||||
messages=messages,
|
||||
system_prompt=system_prompt,
|
||||
)
|
||||
except TimeoutError:
|
||||
print(f"[Auto-Respond] Timed out for {ai_name}")
|
||||
broadcast_event("ai_done")
|
||||
return
|
||||
messages = _normalize_messages_for_llm(session.conversation[-10:])
|
||||
response = await llm_service.generate(
|
||||
messages=messages,
|
||||
system_prompt=system_prompt,
|
||||
)
|
||||
|
||||
# Discard if call changed during generation
|
||||
if _session_epoch != epoch:
|
||||
@@ -1725,25 +1713,21 @@ async def ai_respond():
|
||||
|
||||
epoch = _session_epoch
|
||||
|
||||
try:
|
||||
async with asyncio.timeout(20):
|
||||
async with _ai_response_lock:
|
||||
if _session_epoch != epoch:
|
||||
raise HTTPException(409, "Call ended while waiting")
|
||||
async with _ai_response_lock:
|
||||
if _session_epoch != epoch:
|
||||
raise HTTPException(409, "Call ended while waiting")
|
||||
|
||||
audio_service.stop_caller_audio()
|
||||
audio_service.stop_caller_audio()
|
||||
|
||||
conversation_summary = session.get_conversation_summary()
|
||||
show_history = session.get_show_history()
|
||||
system_prompt = get_caller_prompt(session.caller, conversation_summary, show_history)
|
||||
conversation_summary = session.get_conversation_summary()
|
||||
show_history = session.get_show_history()
|
||||
system_prompt = get_caller_prompt(session.caller, conversation_summary, show_history)
|
||||
|
||||
messages = _normalize_messages_for_llm(session.conversation[-10:])
|
||||
response = await llm_service.generate(
|
||||
messages=messages,
|
||||
system_prompt=system_prompt
|
||||
)
|
||||
except TimeoutError:
|
||||
return {"text": "Uh... sorry, I spaced out for a second there.", "caller": session.caller["name"], "voice_id": session.caller["voice"]}
|
||||
messages = _normalize_messages_for_llm(session.conversation[-10:])
|
||||
response = await llm_service.generate(
|
||||
messages=messages,
|
||||
system_prompt=system_prompt
|
||||
)
|
||||
|
||||
if _session_epoch != epoch:
|
||||
raise HTTPException(409, "Call changed during response")
|
||||
|
||||
@@ -17,6 +17,13 @@ OPENROUTER_MODELS = [
|
||||
"mistralai/mistral-7b-instruct",
|
||||
]
|
||||
|
||||
# Models to try as fallbacks: cheap, fast, and good enough for conversation.
FALLBACK_MODELS = [
    "google/gemini-flash-1.5",
    "openai/gpt-4o-mini",
    "meta-llama/llama-3.1-8b-instruct",
]
|
||||
|
||||
|
||||
class LLMService:
|
||||
"""Abstraction layer for LLM providers"""
|
||||
@@ -27,6 +34,13 @@ class LLMService:
|
||||
self.ollama_model = settings.ollama_model
|
||||
self.ollama_host = settings.ollama_host
|
||||
self.tts_provider = settings.tts_provider
|
||||
self._client: httpx.AsyncClient | None = None
|
||||
|
||||
@property
def client(self) -> httpx.AsyncClient:
    """Return the cached ``httpx.AsyncClient``, building one when needed.

    A new client with a 15-second default timeout is created when no client
    has been stored yet or when the stored one reports ``is_closed``.
    Otherwise the stored instance is handed back unchanged.
    """
    cached = self._client
    if cached is not None and not cached.is_closed:
        return cached
    self._client = httpx.AsyncClient(timeout=15.0)
    return self._client
|
||||
|
||||
def update_settings(
|
||||
self,
|
||||
@@ -47,7 +61,6 @@ class LLMService:
|
||||
self.ollama_host = ollama_host
|
||||
if tts_provider:
|
||||
self.tts_provider = tts_provider
|
||||
# Also update the global settings so TTS service picks it up
|
||||
settings.tts_provider = tts_provider
|
||||
|
||||
async def get_ollama_models(self) -> list[str]:
|
||||
@@ -71,7 +84,7 @@ class LLMService:
|
||||
"ollama_host": self.ollama_host,
|
||||
"tts_provider": self.tts_provider,
|
||||
"available_openrouter_models": OPENROUTER_MODELS,
|
||||
"available_ollama_models": [] # Fetched separately
|
||||
"available_ollama_models": []
|
||||
}
|
||||
|
||||
async def get_settings_async(self) -> dict:
|
||||
@@ -92,57 +105,64 @@ class LLMService:
|
||||
messages: list[dict],
|
||||
system_prompt: Optional[str] = None
|
||||
) -> str:
|
||||
"""
|
||||
Generate a response from the LLM.
|
||||
|
||||
Args:
|
||||
messages: List of message dicts with 'role' and 'content'
|
||||
system_prompt: Optional system prompt to prepend
|
||||
|
||||
Returns:
|
||||
Generated text response
|
||||
"""
|
||||
if system_prompt:
|
||||
messages = [{"role": "system", "content": system_prompt}] + messages
|
||||
|
||||
if self.provider == "openrouter":
|
||||
return await self._call_openrouter(messages)
|
||||
return await self._call_openrouter_with_fallback(messages)
|
||||
else:
|
||||
return await self._call_ollama(messages)
|
||||
|
||||
async def _call_openrouter(self, messages: list[dict]) -> str:
|
||||
"""Call OpenRouter API with retry"""
|
||||
for attempt in range(2): # Try twice
|
||||
try:
|
||||
async with httpx.AsyncClient(timeout=25.0) as client:
|
||||
response = await client.post(
|
||||
"https://openrouter.ai/api/v1/chat/completions",
|
||||
headers={
|
||||
"Authorization": f"Bearer {settings.openrouter_api_key}",
|
||||
"Content-Type": "application/json",
|
||||
},
|
||||
json={
|
||||
"model": self.openrouter_model,
|
||||
"messages": messages,
|
||||
"max_tokens": 150,
|
||||
},
|
||||
)
|
||||
response.raise_for_status()
|
||||
data = response.json()
|
||||
content = data["choices"][0]["message"]["content"]
|
||||
if not content or not content.strip():
|
||||
print(f"OpenRouter returned empty response")
|
||||
return ""
|
||||
return content
|
||||
except (httpx.TimeoutException, httpx.ReadTimeout):
|
||||
print(f"OpenRouter timeout (attempt {attempt + 1})")
|
||||
if attempt == 0:
|
||||
continue # Retry once
|
||||
return "Uh, sorry, I lost you there for a second. What was that?"
|
||||
except Exception as e:
|
||||
print(f"OpenRouter error: {e}")
|
||||
return "Yeah... I don't know, man."
|
||||
return "Uh, hold on a sec..."
|
||||
async def _call_openrouter_with_fallback(self, messages: list[dict]) -> str:
|
||||
"""Try primary model, then fallback models. Always returns a response."""
|
||||
|
||||
# Try primary model first
|
||||
result = await self._call_openrouter_once(messages, self.openrouter_model)
|
||||
if result is not None:
|
||||
return result
|
||||
|
||||
# Try fallback models
|
||||
for model in FALLBACK_MODELS:
|
||||
if model == self.openrouter_model:
|
||||
continue # Already tried
|
||||
print(f"[LLM] Falling back to {model}...")
|
||||
result = await self._call_openrouter_once(messages, model, timeout=10.0)
|
||||
if result is not None:
|
||||
return result
|
||||
|
||||
# Everything failed — return an in-character line so the show continues
|
||||
print("[LLM] All models failed, using canned response")
|
||||
return "Sorry, I totally blanked out for a second. What were you saying?"
|
||||
|
||||
async def _call_openrouter_once(self, messages: list[dict], model: str, timeout: float = 15.0) -> str | None:
    """Make a single OpenRouter chat-completion request for ``model``.

    Args:
        messages: Chat messages sent as the request's ``messages`` field.
        model: OpenRouter model identifier to request.
        timeout: Budget in seconds. It is passed to httpx and is also
            enforced as a wall-clock limit on the whole request.

    Returns:
        The response text, or ``None`` on timeout, on any error, or when
        the reply is empty or whitespace-only. A canned fallback string is
        never returned here, so the caller can move on to the next model.
    """
    import asyncio  # local import: only this method needs it

    try:
        # httpx applies ``timeout`` to each connect/read/write/pool phase
        # separately, so on its own it does not cap the total duration of
        # the request. wait_for turns it into a hard deadline, so a slowly
        # trickling response cannot stall the caller past its budget.
        response = await asyncio.wait_for(
            self.client.post(
                "https://openrouter.ai/api/v1/chat/completions",
                headers={
                    "Authorization": f"Bearer {settings.openrouter_api_key}",
                    "Content-Type": "application/json",
                },
                json={
                    "model": model,
                    "messages": messages,
                    "max_tokens": 150,
                },
                timeout=timeout,
            ),
            timeout=timeout,
        )
        response.raise_for_status()
        data = response.json()
        content = data["choices"][0]["message"]["content"]
        if content and content.strip():
            return content
        print(f"[LLM] {model} returned empty response")
        return None
    except (httpx.TimeoutException, asyncio.TimeoutError):
        # Either an httpx phase timeout or the overall deadline expired.
        print(f"[LLM] {model} timed out ({timeout}s)")
        return None
    except Exception as e:
        # Deliberate best-effort: HTTP errors, malformed payloads, etc. all
        # mean "this model did not answer"; report and let the caller retry.
        print(f"[LLM] {model} error: {e}")
        return None
|
||||
|
||||
async def _call_ollama(self, messages: list[dict]) -> str:
|
||||
"""Call Ollama API"""
|
||||
@@ -155,11 +175,11 @@ class LLMService:
|
||||
"messages": messages,
|
||||
"stream": False,
|
||||
"options": {
|
||||
"num_predict": 100, # Allow complete thoughts
|
||||
"temperature": 0.8, # Balanced creativity/coherence
|
||||
"top_p": 0.9, # Focused word choices
|
||||
"repeat_penalty": 1.3, # Avoid repetition
|
||||
"top_k": 50, # Reasonable token variety
|
||||
"num_predict": 100,
|
||||
"temperature": 0.8,
|
||||
"top_p": 0.9,
|
||||
"repeat_penalty": 1.3,
|
||||
"top_k": 50,
|
||||
},
|
||||
},
|
||||
timeout=30.0
|
||||
@@ -169,10 +189,10 @@ class LLMService:
|
||||
return data["message"]["content"]
|
||||
except httpx.TimeoutException:
|
||||
print("Ollama timeout")
|
||||
return "Uh, sorry, I lost you there for a second. What was that?"
|
||||
return "Sorry, I totally blanked out for a second. What were you saying?"
|
||||
except Exception as e:
|
||||
print(f"Ollama error: {e}")
|
||||
return "Yeah... I don't know, man."
|
||||
return "Sorry, I totally blanked out for a second. What were you saying?"
|
||||
|
||||
|
||||
# Global instance
|
||||
|
||||
Reference in New Issue
Block a user