Harden LLM: model fallback chain, reuse client, remove fighting timeouts

- Primary model gets 15s, then falls back automatically through gemini-flash,
  gpt-4o-mini, llama-3.1-8b (10s each; chain sketched below)
- Always returns a response; a canned in-character line is the last resort
- Reuse a single httpx client instead of creating a new one per request
- Remove the asyncio.timeout(20) wrappers that were killing requests before
  the LLM service could try fallbacks (the full chain can need
  15s + 3×10s = 45s, well over the 20s budget)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-06 22:07:39 -07:00
parent 73129374f4
commit aa3899b1fc
2 changed files with 113 additions and 109 deletions
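
The hunks below only show the endpoint side of the change; the fallback chain itself lives in the LLM service (the second changed file, not shown on this page). A minimal sketch of the behavior the bullets describe, assuming an OpenAI-compatible HTTP API — LLMService, _call_model, FALLBACK_MODELS, and CANNED_LINES are hypothetical names, and the primary model's id is not given in this excerpt:

    import asyncio
    import random

    import httpx

    PRIMARY_MODEL = ("primary-model", 15.0)  # actual primary id not shown here
    FALLBACK_MODELS = [                      # each fallback gets 10s
        ("gemini-flash", 10.0),
        ("gpt-4o-mini", 10.0),
        ("llama-3.1-8b", 10.0),
    ]
    # In-character last-resort lines, matching the ones removed from the endpoints
    CANNED_LINES = [
        "Uh... hold on, I lost my train of thought.",
        "Uh... sorry, I spaced out for a second there.",
    ]

    class LLMService:
        def __init__(self) -> None:
            # One client per process: reuses pooled connections instead of
            # paying a TCP/TLS handshake on every request
            self._client = httpx.AsyncClient(timeout=None)

        async def generate(self, messages: list[dict], system_prompt: str) -> str:
            for model, budget in (PRIMARY_MODEL, *FALLBACK_MODELS):
                try:
                    # Per-model timeout lives here, not in the endpoints
                    async with asyncio.timeout(budget):
                        return await self._call_model(model, messages, system_prompt)
                except (TimeoutError, httpx.HTTPError) as exc:
                    print(f"[LLM] {model} failed ({exc!r}); trying next")
            # Never raise to the caller: fall back to a canned line
            return random.choice(CANNED_LINES)

        async def _call_model(self, model: str, messages: list[dict],
                              system_prompt: str) -> str:
            # Hypothetical OpenAI-compatible endpoint; real provider routing
            # is not visible in this diff
            resp = await self._client.post(
                "http://localhost:8080/v1/chat/completions",
                json={"model": model,
                      "messages": [{"role": "system", "content": system_prompt},
                                   *messages]},
            )
            resp.raise_for_status()
            return resp.json()["choices"][0]["message"]["content"]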


@@ -1116,28 +1116,22 @@ async def chat(request: ChatRequest):
     session.add_message("user", request.text)
     # session._research_task = asyncio.create_task(_background_research(request.text))
-    try:
-        async with asyncio.timeout(20):
-            async with _ai_response_lock:
-                if _session_epoch != epoch:
-                    raise HTTPException(409, "Call ended while waiting")
+    async with _ai_response_lock:
+        if _session_epoch != epoch:
+            raise HTTPException(409, "Call ended while waiting")
-                # Stop any playing caller audio so responses don't overlap
-                audio_service.stop_caller_audio()
+        # Stop any playing caller audio so responses don't overlap
+        audio_service.stop_caller_audio()
-                # Include conversation summary and show history for context
-                conversation_summary = session.get_conversation_summary()
-                show_history = session.get_show_history()
-                system_prompt = get_caller_prompt(session.caller, conversation_summary, show_history)
+        conversation_summary = session.get_conversation_summary()
+        show_history = session.get_show_history()
+        system_prompt = get_caller_prompt(session.caller, conversation_summary, show_history)
-                messages = _normalize_messages_for_llm(session.conversation[-10:])
-                response = await llm_service.generate(
-                    messages=messages,
-                    system_prompt=system_prompt
-                )
-    except TimeoutError:
-        caller_name = session.caller["name"] if session.caller else "Caller"
-        return {"text": "Uh... hold on, I lost my train of thought.", "caller": caller_name, "voice_id": session.caller["voice"] if session.caller else ""}
+        messages = _normalize_messages_for_llm(session.conversation[-10:])
+        response = await llm_service.generate(
+            messages=messages,
+            system_prompt=system_prompt
+        )
     # Discard if call changed while we were generating
     if _session_epoch != epoch:
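
What both sides of this hunk preserve is the stale-response guard: the handler snapshots _session_epoch before waiting, then re-checks it after every await so a reply generated for a call that has since ended is discarded rather than played into the wrong call. Condensed (names from the diff; not self-contained):

    epoch = _session_epoch
    async with _ai_response_lock:
        # Re-check: the call may have changed while we waited for the lock
        if _session_epoch != epoch:
            raise HTTPException(409, "Call ended while waiting")
        response = await llm_service.generate(messages=messages, system_prompt=system_prompt)
    # Re-check again: the call may have changed during generation
    if _session_epoch != epoch:
        raise HTTPException(409, "Call changed during response")
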
@@ -1644,30 +1638,24 @@ async def _trigger_ai_auto_respond(accumulated_text: str):
     ai_name = session.caller["name"]
-    try:
-        async with asyncio.timeout(20):
-            async with _ai_response_lock:
-                if _session_epoch != epoch:
-                    return  # Call changed while waiting for lock
+    async with _ai_response_lock:
+        if _session_epoch != epoch:
+            return  # Call changed while waiting for lock
-                print(f"[Auto-Respond] {ai_name} is jumping in...")
-                session._last_ai_auto_respond = time.time()
-                audio_service.stop_caller_audio()
-                broadcast_event("ai_status", {"text": f"{ai_name} is thinking..."})
+        print(f"[Auto-Respond] {ai_name} is jumping in...")
+        session._last_ai_auto_respond = time.time()
+        audio_service.stop_caller_audio()
+        broadcast_event("ai_status", {"text": f"{ai_name} is thinking..."})
-                conversation_summary = session.get_conversation_summary()
-                show_history = session.get_show_history()
-                system_prompt = get_caller_prompt(session.caller, conversation_summary, show_history)
+        conversation_summary = session.get_conversation_summary()
+        show_history = session.get_show_history()
+        system_prompt = get_caller_prompt(session.caller, conversation_summary, show_history)
-                messages = _normalize_messages_for_llm(session.conversation[-10:])
-                response = await llm_service.generate(
-                    messages=messages,
-                    system_prompt=system_prompt,
-                )
-    except TimeoutError:
-        print(f"[Auto-Respond] Timed out for {ai_name}")
-        broadcast_event("ai_done")
-        return
+        messages = _normalize_messages_for_llm(session.conversation[-10:])
+        response = await llm_service.generate(
+            messages=messages,
+            system_prompt=system_prompt,
+        )
     # Discard if call changed during generation
     if _session_epoch != epoch:
@@ -1725,25 +1713,21 @@ async def ai_respond():
     epoch = _session_epoch
-    try:
-        async with asyncio.timeout(20):
-            async with _ai_response_lock:
-                if _session_epoch != epoch:
-                    raise HTTPException(409, "Call ended while waiting")
+    async with _ai_response_lock:
+        if _session_epoch != epoch:
+            raise HTTPException(409, "Call ended while waiting")
-                audio_service.stop_caller_audio()
+        audio_service.stop_caller_audio()
-                conversation_summary = session.get_conversation_summary()
-                show_history = session.get_show_history()
-                system_prompt = get_caller_prompt(session.caller, conversation_summary, show_history)
+        conversation_summary = session.get_conversation_summary()
+        show_history = session.get_show_history()
+        system_prompt = get_caller_prompt(session.caller, conversation_summary, show_history)
-                messages = _normalize_messages_for_llm(session.conversation[-10:])
-                response = await llm_service.generate(
-                    messages=messages,
-                    system_prompt=system_prompt
-                )
-    except TimeoutError:
-        return {"text": "Uh... sorry, I spaced out for a second there.", "caller": session.caller["name"], "voice_id": session.caller["voice"]}
+        messages = _normalize_messages_for_llm(session.conversation[-10:])
+        response = await llm_service.generate(
+            messages=messages,
+            system_prompt=system_prompt
+        )
     if _session_epoch != epoch:
         raise HTTPException(409, "Call changed during response")
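
One loose end the reused client creates: a per-process httpx.AsyncClient should be closed when the app shuts down. Nothing on this page shows how the project handles that; a conventional FastAPI wiring, sketched with a hypothetical module-level client, would be:

    from contextlib import asynccontextmanager

    import httpx
    from fastapi import FastAPI

    llm_client = httpx.AsyncClient()  # created once, shared by every request

    @asynccontextmanager
    async def lifespan(app: FastAPI):
        yield                          # application serves requests
        await llm_client.aclose()      # drain pooled connections on shutdown

    app = FastAPI(lifespan=lifespan)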