Harden LLM: model fallback chain, reuse client, remove conflicting timeouts
- Primary model gets 15s, then automatically falls back through gemini-flash, gpt-4o-mini, and llama-3.1-8b (10s each)
- Always returns a response — a canned in-character line as the last resort
- Reuse a single httpx client instead of creating a new one per request
- Remove the asyncio.timeout wrappers that were killing requests before the LLM service could try its fallbacks

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1116,8 +1116,6 @@ async def chat(request: ChatRequest):
|
|||||||
session.add_message("user", request.text)
|
session.add_message("user", request.text)
|
||||||
# session._research_task = asyncio.create_task(_background_research(request.text))
|
# session._research_task = asyncio.create_task(_background_research(request.text))
|
||||||
|
|
||||||
try:
|
|
||||||
async with asyncio.timeout(20):
|
|
||||||
async with _ai_response_lock:
|
async with _ai_response_lock:
|
||||||
if _session_epoch != epoch:
|
if _session_epoch != epoch:
|
||||||
raise HTTPException(409, "Call ended while waiting")
|
raise HTTPException(409, "Call ended while waiting")
|
||||||
@@ -1125,7 +1123,6 @@ async def chat(request: ChatRequest):
|
|||||||
# Stop any playing caller audio so responses don't overlap
|
# Stop any playing caller audio so responses don't overlap
|
||||||
audio_service.stop_caller_audio()
|
audio_service.stop_caller_audio()
|
||||||
|
|
||||||
# Include conversation summary and show history for context
|
|
||||||
conversation_summary = session.get_conversation_summary()
|
conversation_summary = session.get_conversation_summary()
|
||||||
show_history = session.get_show_history()
|
show_history = session.get_show_history()
|
||||||
system_prompt = get_caller_prompt(session.caller, conversation_summary, show_history)
|
system_prompt = get_caller_prompt(session.caller, conversation_summary, show_history)
|
||||||
@@ -1135,9 +1132,6 @@ async def chat(request: ChatRequest):
|
|||||||
messages=messages,
|
messages=messages,
|
||||||
system_prompt=system_prompt
|
system_prompt=system_prompt
|
||||||
)
|
)
|
||||||
except TimeoutError:
|
|
||||||
caller_name = session.caller["name"] if session.caller else "Caller"
|
|
||||||
return {"text": "Uh... hold on, I lost my train of thought.", "caller": caller_name, "voice_id": session.caller["voice"] if session.caller else ""}
|
|
||||||
|
|
||||||
# Discard if call changed while we were generating
|
# Discard if call changed while we were generating
|
||||||
if _session_epoch != epoch:
|
if _session_epoch != epoch:
|
||||||
@@ -1644,8 +1638,6 @@ async def _trigger_ai_auto_respond(accumulated_text: str):
|
|||||||
|
|
||||||
ai_name = session.caller["name"]
|
ai_name = session.caller["name"]
|
||||||
|
|
||||||
try:
|
|
||||||
async with asyncio.timeout(20):
|
|
||||||
async with _ai_response_lock:
|
async with _ai_response_lock:
|
||||||
if _session_epoch != epoch:
|
if _session_epoch != epoch:
|
||||||
return # Call changed while waiting for lock
|
return # Call changed while waiting for lock
|
||||||
@@ -1664,10 +1656,6 @@ async def _trigger_ai_auto_respond(accumulated_text: str):
|
|||||||
messages=messages,
|
messages=messages,
|
||||||
system_prompt=system_prompt,
|
system_prompt=system_prompt,
|
||||||
)
|
)
|
||||||
except TimeoutError:
|
|
||||||
print(f"[Auto-Respond] Timed out for {ai_name}")
|
|
||||||
broadcast_event("ai_done")
|
|
||||||
return
|
|
||||||
|
|
||||||
# Discard if call changed during generation
|
# Discard if call changed during generation
|
||||||
if _session_epoch != epoch:
|
if _session_epoch != epoch:
|
||||||
@@ -1725,8 +1713,6 @@ async def ai_respond():
|
|||||||
|
|
||||||
epoch = _session_epoch
|
epoch = _session_epoch
|
||||||
|
|
||||||
try:
|
|
||||||
async with asyncio.timeout(20):
|
|
||||||
async with _ai_response_lock:
|
async with _ai_response_lock:
|
||||||
if _session_epoch != epoch:
|
if _session_epoch != epoch:
|
||||||
raise HTTPException(409, "Call ended while waiting")
|
raise HTTPException(409, "Call ended while waiting")
|
||||||
@@ -1742,8 +1728,6 @@ async def ai_respond():
|
|||||||
messages=messages,
|
messages=messages,
|
||||||
system_prompt=system_prompt
|
system_prompt=system_prompt
|
||||||
)
|
)
|
||||||
except TimeoutError:
|
|
||||||
return {"text": "Uh... sorry, I spaced out for a second there.", "caller": session.caller["name"], "voice_id": session.caller["voice"]}
|
|
||||||
|
|
||||||
if _session_epoch != epoch:
|
if _session_epoch != epoch:
|
||||||
raise HTTPException(409, "Call changed during response")
|
raise HTTPException(409, "Call changed during response")
|
||||||
|
|||||||
@@ -17,6 +17,13 @@ OPENROUTER_MODELS = [
|
|||||||
"mistralai/mistral-7b-instruct",
|
"mistralai/mistral-7b-instruct",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
# Fast models to try as fallbacks (cheap, fast, good enough for conversation)
|
||||||
|
FALLBACK_MODELS = [
|
||||||
|
"google/gemini-flash-1.5",
|
||||||
|
"openai/gpt-4o-mini",
|
||||||
|
"meta-llama/llama-3.1-8b-instruct",
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
class LLMService:
|
class LLMService:
|
||||||
"""Abstraction layer for LLM providers"""
|
"""Abstraction layer for LLM providers"""
|
||||||
@@ -27,6 +34,13 @@ class LLMService:
|
|||||||
self.ollama_model = settings.ollama_model
|
self.ollama_model = settings.ollama_model
|
||||||
self.ollama_host = settings.ollama_host
|
self.ollama_host = settings.ollama_host
|
||||||
self.tts_provider = settings.tts_provider
|
self.tts_provider = settings.tts_provider
|
||||||
|
self._client: httpx.AsyncClient | None = None
|
||||||
|
|
||||||
|
@property
|
||||||
|
def client(self) -> httpx.AsyncClient:
|
||||||
|
if self._client is None or self._client.is_closed:
|
||||||
|
self._client = httpx.AsyncClient(timeout=15.0)
|
||||||
|
return self._client
|
||||||
|
|
||||||
def update_settings(
|
def update_settings(
|
||||||
self,
|
self,
|
||||||
@@ -47,7 +61,6 @@ class LLMService:
|
|||||||
self.ollama_host = ollama_host
|
self.ollama_host = ollama_host
|
||||||
if tts_provider:
|
if tts_provider:
|
||||||
self.tts_provider = tts_provider
|
self.tts_provider = tts_provider
|
||||||
# Also update the global settings so TTS service picks it up
|
|
||||||
settings.tts_provider = tts_provider
|
settings.tts_provider = tts_provider
|
||||||
|
|
||||||
async def get_ollama_models(self) -> list[str]:
|
async def get_ollama_models(self) -> list[str]:
|
||||||
@@ -71,7 +84,7 @@ class LLMService:
|
|||||||
"ollama_host": self.ollama_host,
|
"ollama_host": self.ollama_host,
|
||||||
"tts_provider": self.tts_provider,
|
"tts_provider": self.tts_provider,
|
||||||
"available_openrouter_models": OPENROUTER_MODELS,
|
"available_openrouter_models": OPENROUTER_MODELS,
|
||||||
"available_ollama_models": [] # Fetched separately
|
"available_ollama_models": []
|
||||||
}
|
}
|
||||||
|
|
||||||
async def get_settings_async(self) -> dict:
|
async def get_settings_async(self) -> dict:
|
||||||
@@ -92,57 +105,64 @@ class LLMService:
|
|||||||
messages: list[dict],
|
messages: list[dict],
|
||||||
system_prompt: Optional[str] = None
|
system_prompt: Optional[str] = None
|
||||||
) -> str:
|
) -> str:
|
||||||
"""
|
|
||||||
Generate a response from the LLM.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
messages: List of message dicts with 'role' and 'content'
|
|
||||||
system_prompt: Optional system prompt to prepend
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Generated text response
|
|
||||||
"""
|
|
||||||
if system_prompt:
|
if system_prompt:
|
||||||
messages = [{"role": "system", "content": system_prompt}] + messages
|
messages = [{"role": "system", "content": system_prompt}] + messages
|
||||||
|
|
||||||
if self.provider == "openrouter":
|
if self.provider == "openrouter":
|
||||||
return await self._call_openrouter(messages)
|
return await self._call_openrouter_with_fallback(messages)
|
||||||
else:
|
else:
|
||||||
return await self._call_ollama(messages)
|
return await self._call_ollama(messages)
|
||||||
|
|
||||||
async def _call_openrouter(self, messages: list[dict]) -> str:
|
async def _call_openrouter_with_fallback(self, messages: list[dict]) -> str:
|
||||||
"""Call OpenRouter API with retry"""
|
"""Try primary model, then fallback models. Always returns a response."""
|
||||||
for attempt in range(2): # Try twice
|
|
||||||
|
# Try primary model first
|
||||||
|
result = await self._call_openrouter_once(messages, self.openrouter_model)
|
||||||
|
if result is not None:
|
||||||
|
return result
|
||||||
|
|
||||||
|
# Try fallback models
|
||||||
|
for model in FALLBACK_MODELS:
|
||||||
|
if model == self.openrouter_model:
|
||||||
|
continue # Already tried
|
||||||
|
print(f"[LLM] Falling back to {model}...")
|
||||||
|
result = await self._call_openrouter_once(messages, model, timeout=10.0)
|
||||||
|
if result is not None:
|
||||||
|
return result
|
||||||
|
|
||||||
|
# Everything failed — return an in-character line so the show continues
|
||||||
|
print("[LLM] All models failed, using canned response")
|
||||||
|
return "Sorry, I totally blanked out for a second. What were you saying?"
|
||||||
|
|
||||||
|
async def _call_openrouter_once(self, messages: list[dict], model: str, timeout: float = 15.0) -> str | None:
|
||||||
|
"""Single attempt to call OpenRouter. Returns None on failure (not a fallback string)."""
|
||||||
try:
|
try:
|
||||||
async with httpx.AsyncClient(timeout=25.0) as client:
|
response = await self.client.post(
|
||||||
response = await client.post(
|
|
||||||
"https://openrouter.ai/api/v1/chat/completions",
|
"https://openrouter.ai/api/v1/chat/completions",
|
||||||
headers={
|
headers={
|
||||||
"Authorization": f"Bearer {settings.openrouter_api_key}",
|
"Authorization": f"Bearer {settings.openrouter_api_key}",
|
||||||
"Content-Type": "application/json",
|
"Content-Type": "application/json",
|
||||||
},
|
},
|
||||||
json={
|
json={
|
||||||
"model": self.openrouter_model,
|
"model": model,
|
||||||
"messages": messages,
|
"messages": messages,
|
||||||
"max_tokens": 150,
|
"max_tokens": 150,
|
||||||
},
|
},
|
||||||
|
timeout=timeout,
|
||||||
)
|
)
|
||||||
response.raise_for_status()
|
response.raise_for_status()
|
||||||
data = response.json()
|
data = response.json()
|
||||||
content = data["choices"][0]["message"]["content"]
|
content = data["choices"][0]["message"]["content"]
|
||||||
if not content or not content.strip():
|
if content and content.strip():
|
||||||
print(f"OpenRouter returned empty response")
|
|
||||||
return ""
|
|
||||||
return content
|
return content
|
||||||
except (httpx.TimeoutException, httpx.ReadTimeout):
|
print(f"[LLM] {model} returned empty response")
|
||||||
print(f"OpenRouter timeout (attempt {attempt + 1})")
|
return None
|
||||||
if attempt == 0:
|
except httpx.TimeoutException:
|
||||||
continue # Retry once
|
print(f"[LLM] {model} timed out ({timeout}s)")
|
||||||
return "Uh, sorry, I lost you there for a second. What was that?"
|
return None
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"OpenRouter error: {e}")
|
print(f"[LLM] {model} error: {e}")
|
||||||
return "Yeah... I don't know, man."
|
return None
|
||||||
return "Uh, hold on a sec..."
|
|
||||||
|
|
||||||
async def _call_ollama(self, messages: list[dict]) -> str:
|
async def _call_ollama(self, messages: list[dict]) -> str:
|
||||||
"""Call Ollama API"""
|
"""Call Ollama API"""
|
||||||
@@ -155,11 +175,11 @@ class LLMService:
|
|||||||
"messages": messages,
|
"messages": messages,
|
||||||
"stream": False,
|
"stream": False,
|
||||||
"options": {
|
"options": {
|
||||||
"num_predict": 100, # Allow complete thoughts
|
"num_predict": 100,
|
||||||
"temperature": 0.8, # Balanced creativity/coherence
|
"temperature": 0.8,
|
||||||
"top_p": 0.9, # Focused word choices
|
"top_p": 0.9,
|
||||||
"repeat_penalty": 1.3, # Avoid repetition
|
"repeat_penalty": 1.3,
|
||||||
"top_k": 50, # Reasonable token variety
|
"top_k": 50,
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
timeout=30.0
|
timeout=30.0
|
||||||
@@ -169,10 +189,10 @@ class LLMService:
|
|||||||
return data["message"]["content"]
|
return data["message"]["content"]
|
||||||
except httpx.TimeoutException:
|
except httpx.TimeoutException:
|
||||||
print("Ollama timeout")
|
print("Ollama timeout")
|
||||||
return "Uh, sorry, I lost you there for a second. What was that?"
|
return "Sorry, I totally blanked out for a second. What were you saying?"
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"Ollama error: {e}")
|
print(f"Ollama error: {e}")
|
||||||
return "Yeah... I don't know, man."
|
return "Sorry, I totally blanked out for a second. What were you saying?"
|
||||||
|
|
||||||
|
|
||||||
# Global instance
|
# Global instance
|
||||||
|
|||||||
Reference in New Issue
Block a user