Harden LLM: model fallback chain, reuse client, remove fighting timeouts
- Primary model gets 15s, then automatically falls back through gemini-flash, gpt-4o-mini, and llama-3.1-8b (10s each)
- Always returns a response, with a canned in-character line as the last resort
- Reuse the httpx client instead of creating a new one per request
- Remove the asyncio.timeout wrappers that were killing requests before the LLM service could try its fallbacks

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1116,28 +1116,22 @@ async def chat(request: ChatRequest):
|
||||
session.add_message("user", request.text)
|
||||
# session._research_task = asyncio.create_task(_background_research(request.text))
|
||||
|
||||
try:
|
||||
async with asyncio.timeout(20):
|
||||
async with _ai_response_lock:
|
||||
if _session_epoch != epoch:
|
||||
raise HTTPException(409, "Call ended while waiting")
|
||||
async with _ai_response_lock:
|
||||
if _session_epoch != epoch:
|
||||
raise HTTPException(409, "Call ended while waiting")
|
||||
|
||||
# Stop any playing caller audio so responses don't overlap
|
||||
audio_service.stop_caller_audio()
|
||||
# Stop any playing caller audio so responses don't overlap
|
||||
audio_service.stop_caller_audio()
|
||||
|
||||
# Include conversation summary and show history for context
|
||||
conversation_summary = session.get_conversation_summary()
|
||||
show_history = session.get_show_history()
|
||||
system_prompt = get_caller_prompt(session.caller, conversation_summary, show_history)
|
||||
conversation_summary = session.get_conversation_summary()
|
||||
show_history = session.get_show_history()
|
||||
system_prompt = get_caller_prompt(session.caller, conversation_summary, show_history)
|
||||
|
||||
messages = _normalize_messages_for_llm(session.conversation[-10:])
|
||||
response = await llm_service.generate(
|
||||
messages=messages,
|
||||
system_prompt=system_prompt
|
||||
)
|
||||
except TimeoutError:
|
||||
caller_name = session.caller["name"] if session.caller else "Caller"
|
||||
return {"text": "Uh... hold on, I lost my train of thought.", "caller": caller_name, "voice_id": session.caller["voice"] if session.caller else ""}
|
||||
messages = _normalize_messages_for_llm(session.conversation[-10:])
|
||||
response = await llm_service.generate(
|
||||
messages=messages,
|
||||
system_prompt=system_prompt
|
||||
)
|
||||
|
||||
# Discard if call changed while we were generating
|
||||
if _session_epoch != epoch:
|
||||
@@ -1644,30 +1638,24 @@ async def _trigger_ai_auto_respond(accumulated_text: str):
|
||||
|
||||
ai_name = session.caller["name"]
|
||||
|
||||
try:
|
||||
async with asyncio.timeout(20):
|
||||
async with _ai_response_lock:
|
||||
if _session_epoch != epoch:
|
||||
return # Call changed while waiting for lock
|
||||
async with _ai_response_lock:
|
||||
if _session_epoch != epoch:
|
||||
return # Call changed while waiting for lock
|
||||
|
||||
print(f"[Auto-Respond] {ai_name} is jumping in...")
|
||||
session._last_ai_auto_respond = time.time()
|
||||
audio_service.stop_caller_audio()
|
||||
broadcast_event("ai_status", {"text": f"{ai_name} is thinking..."})
|
||||
print(f"[Auto-Respond] {ai_name} is jumping in...")
|
||||
session._last_ai_auto_respond = time.time()
|
||||
audio_service.stop_caller_audio()
|
||||
broadcast_event("ai_status", {"text": f"{ai_name} is thinking..."})
|
||||
|
||||
conversation_summary = session.get_conversation_summary()
|
||||
show_history = session.get_show_history()
|
||||
system_prompt = get_caller_prompt(session.caller, conversation_summary, show_history)
|
||||
conversation_summary = session.get_conversation_summary()
|
||||
show_history = session.get_show_history()
|
||||
system_prompt = get_caller_prompt(session.caller, conversation_summary, show_history)
|
||||
|
||||
messages = _normalize_messages_for_llm(session.conversation[-10:])
|
||||
response = await llm_service.generate(
|
||||
messages=messages,
|
||||
system_prompt=system_prompt,
|
||||
)
|
||||
except TimeoutError:
|
||||
print(f"[Auto-Respond] Timed out for {ai_name}")
|
||||
broadcast_event("ai_done")
|
||||
return
|
||||
messages = _normalize_messages_for_llm(session.conversation[-10:])
|
||||
response = await llm_service.generate(
|
||||
messages=messages,
|
||||
system_prompt=system_prompt,
|
||||
)
|
||||
|
||||
# Discard if call changed during generation
|
||||
if _session_epoch != epoch:
|
||||
@@ -1725,25 +1713,21 @@ async def ai_respond():
|
||||
|
||||
epoch = _session_epoch
|
||||
|
||||
try:
|
||||
async with asyncio.timeout(20):
|
||||
async with _ai_response_lock:
|
||||
if _session_epoch != epoch:
|
||||
raise HTTPException(409, "Call ended while waiting")
|
||||
async with _ai_response_lock:
|
||||
if _session_epoch != epoch:
|
||||
raise HTTPException(409, "Call ended while waiting")
|
||||
|
||||
audio_service.stop_caller_audio()
|
||||
audio_service.stop_caller_audio()
|
||||
|
||||
conversation_summary = session.get_conversation_summary()
|
||||
show_history = session.get_show_history()
|
||||
system_prompt = get_caller_prompt(session.caller, conversation_summary, show_history)
|
||||
conversation_summary = session.get_conversation_summary()
|
||||
show_history = session.get_show_history()
|
||||
system_prompt = get_caller_prompt(session.caller, conversation_summary, show_history)
|
||||
|
||||
messages = _normalize_messages_for_llm(session.conversation[-10:])
|
||||
response = await llm_service.generate(
|
||||
messages=messages,
|
||||
system_prompt=system_prompt
|
||||
)
|
||||
except TimeoutError:
|
||||
return {"text": "Uh... sorry, I spaced out for a second there.", "caller": session.caller["name"], "voice_id": session.caller["voice"]}
|
||||
messages = _normalize_messages_for_llm(session.conversation[-10:])
|
||||
response = await llm_service.generate(
|
||||
messages=messages,
|
||||
system_prompt=system_prompt
|
||||
)
|
||||
|
||||
if _session_epoch != epoch:
|
||||
raise HTTPException(409, "Call changed during response")
|
||||
|
||||
Reference in New Issue
Block a user