Harden LLM: model fallback chain, reuse client, remove fighting timeouts

- Primary model gets 15s, then falls back automatically through gemini-flash,
  gpt-4o-mini, llama-3.1-8b (10s each; chain sketched below)
- Always returns a response; a canned in-character line is the last resort
- Reuse a single httpx client instead of creating a new one per request
- Remove the asyncio.timeout(20) wrappers that were killing requests before
  the LLM service could try fallbacks (the full chain can need
  15s + 3×10s = 45s, well over the 20s budget)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-06 22:07:39 -07:00
parent 73129374f4
commit aa3899b1fc
2 changed files with 113 additions and 109 deletions
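
The hunks below only show the endpoint side of the change; the fallback chain itself lives in the LLM service (the second changed file, not shown on this page). A minimal sketch of the behavior the bullets describe, assuming an OpenAI-compatible HTTP API — LLMService, _call_model, FALLBACK_MODELS, and CANNED_LINES are hypothetical names, and the primary model's id is not given in this excerpt:

    import asyncio
    import random

    import httpx

    PRIMARY_MODEL = ("primary-model", 15.0)  # actual primary id not shown here
    FALLBACK_MODELS = [                      # each fallback gets 10s
        ("gemini-flash", 10.0),
        ("gpt-4o-mini", 10.0),
        ("llama-3.1-8b", 10.0),
    ]
    # In-character last-resort lines, matching the ones removed from the endpoints
    CANNED_LINES = [
        "Uh... hold on, I lost my train of thought.",
        "Uh... sorry, I spaced out for a second there.",
    ]

    class LLMService:
        def __init__(self) -> None:
            # One client per process: reuses pooled connections instead of
            # paying a TCP/TLS handshake on every request
            self._client = httpx.AsyncClient(timeout=None)

        async def generate(self, messages: list[dict], system_prompt: str) -> str:
            for model, budget in (PRIMARY_MODEL, *FALLBACK_MODELS):
                try:
                    # Per-model timeout lives here, not in the endpoints
                    async with asyncio.timeout(budget):
                        return await self._call_model(model, messages, system_prompt)
                except (TimeoutError, httpx.HTTPError) as exc:
                    print(f"[LLM] {model} failed ({exc!r}); trying next")
            # Never raise to the caller: fall back to a canned line
            return random.choice(CANNED_LINES)

        async def _call_model(self, model: str, messages: list[dict],
                              system_prompt: str) -> str:
            # Hypothetical OpenAI-compatible endpoint; real provider routing
            # is not visible in this diff
            resp = await self._client.post(
                "http://localhost:8080/v1/chat/completions",
                json={"model": model,
                      "messages": [{"role": "system", "content": system_prompt},
                                   *messages]},
            )
            resp.raise_for_status()
            return resp.json()["choices"][0]["message"]["content"]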


@@ -1116,28 +1116,22 @@ async def chat(request: ChatRequest):
     session.add_message("user", request.text)
     # session._research_task = asyncio.create_task(_background_research(request.text))
-    try:
-        async with asyncio.timeout(20):
-            async with _ai_response_lock:
-                if _session_epoch != epoch:
-                    raise HTTPException(409, "Call ended while waiting")
+    async with _ai_response_lock:
+        if _session_epoch != epoch:
+            raise HTTPException(409, "Call ended while waiting")
-                # Stop any playing caller audio so responses don't overlap
-                audio_service.stop_caller_audio()
+        # Stop any playing caller audio so responses don't overlap
+        audio_service.stop_caller_audio()
-                # Include conversation summary and show history for context
-                conversation_summary = session.get_conversation_summary()
-                show_history = session.get_show_history()
-                system_prompt = get_caller_prompt(session.caller, conversation_summary, show_history)
+        conversation_summary = session.get_conversation_summary()
+        show_history = session.get_show_history()
+        system_prompt = get_caller_prompt(session.caller, conversation_summary, show_history)
-                messages = _normalize_messages_for_llm(session.conversation[-10:])
-                response = await llm_service.generate(
-                    messages=messages,
-                    system_prompt=system_prompt
-                )
-    except TimeoutError:
-        caller_name = session.caller["name"] if session.caller else "Caller"
-        return {"text": "Uh... hold on, I lost my train of thought.", "caller": caller_name, "voice_id": session.caller["voice"] if session.caller else ""}
+        messages = _normalize_messages_for_llm(session.conversation[-10:])
+        response = await llm_service.generate(
+            messages=messages,
+            system_prompt=system_prompt
+        )
     # Discard if call changed while we were generating
     if _session_epoch != epoch:
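
What both sides of this hunk preserve is the stale-response guard: the handler snapshots _session_epoch before waiting, then re-checks it after every await so a reply generated for a call that has since ended is discarded rather than played into the wrong call. Condensed (names from the diff; not self-contained):

    epoch = _session_epoch
    async with _ai_response_lock:
        # Re-check: the call may have changed while we waited for the lock
        if _session_epoch != epoch:
            raise HTTPException(409, "Call ended while waiting")
        response = await llm_service.generate(messages=messages, system_prompt=system_prompt)
    # Re-check again: the call may have changed during generation
    if _session_epoch != epoch:
        raise HTTPException(409, "Call changed during response")
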
@@ -1644,30 +1638,24 @@ async def _trigger_ai_auto_respond(accumulated_text: str):
     ai_name = session.caller["name"]
-    try:
-        async with asyncio.timeout(20):
-            async with _ai_response_lock:
-                if _session_epoch != epoch:
-                    return  # Call changed while waiting for lock
+    async with _ai_response_lock:
+        if _session_epoch != epoch:
+            return  # Call changed while waiting for lock
-                print(f"[Auto-Respond] {ai_name} is jumping in...")
-                session._last_ai_auto_respond = time.time()
-                audio_service.stop_caller_audio()
-                broadcast_event("ai_status", {"text": f"{ai_name} is thinking..."})
+        print(f"[Auto-Respond] {ai_name} is jumping in...")
+        session._last_ai_auto_respond = time.time()
+        audio_service.stop_caller_audio()
+        broadcast_event("ai_status", {"text": f"{ai_name} is thinking..."})
-                conversation_summary = session.get_conversation_summary()
-                show_history = session.get_show_history()
-                system_prompt = get_caller_prompt(session.caller, conversation_summary, show_history)
+        conversation_summary = session.get_conversation_summary()
+        show_history = session.get_show_history()
+        system_prompt = get_caller_prompt(session.caller, conversation_summary, show_history)
-                messages = _normalize_messages_for_llm(session.conversation[-10:])
-                response = await llm_service.generate(
-                    messages=messages,
-                    system_prompt=system_prompt,
-                )
-    except TimeoutError:
-        print(f"[Auto-Respond] Timed out for {ai_name}")
-        broadcast_event("ai_done")
-        return
+        messages = _normalize_messages_for_llm(session.conversation[-10:])
+        response = await llm_service.generate(
+            messages=messages,
+            system_prompt=system_prompt,
+        )
     # Discard if call changed during generation
     if _session_epoch != epoch:
@@ -1725,25 +1713,21 @@ async def ai_respond():
     epoch = _session_epoch
-    try:
-        async with asyncio.timeout(20):
-            async with _ai_response_lock:
-                if _session_epoch != epoch:
-                    raise HTTPException(409, "Call ended while waiting")
+    async with _ai_response_lock:
+        if _session_epoch != epoch:
+            raise HTTPException(409, "Call ended while waiting")
-                audio_service.stop_caller_audio()
+        audio_service.stop_caller_audio()
-                conversation_summary = session.get_conversation_summary()
-                show_history = session.get_show_history()
-                system_prompt = get_caller_prompt(session.caller, conversation_summary, show_history)
+        conversation_summary = session.get_conversation_summary()
+        show_history = session.get_show_history()
+        system_prompt = get_caller_prompt(session.caller, conversation_summary, show_history)
-                messages = _normalize_messages_for_llm(session.conversation[-10:])
-                response = await llm_service.generate(
-                    messages=messages,
-                    system_prompt=system_prompt
-                )
-    except TimeoutError:
-        return {"text": "Uh... sorry, I spaced out for a second there.", "caller": session.caller["name"], "voice_id": session.caller["voice"]}
+        messages = _normalize_messages_for_llm(session.conversation[-10:])
+        response = await llm_service.generate(
+            messages=messages,
+            system_prompt=system_prompt
+        )
     if _session_epoch != epoch:
         raise HTTPException(409, "Call changed during response")
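
One loose end the reused client creates: a per-process httpx.AsyncClient should be closed when the app shuts down. Nothing on this page shows how the project handles that; a conventional FastAPI wiring, sketched with a hypothetical module-level client, would be:

    from contextlib import asynccontextmanager

    import httpx
    from fastapi import FastAPI

    llm_client = httpx.AsyncClient()  # created once, shared by every request

    @asynccontextmanager
    async def lifespan(app: FastAPI):
        yield                          # application serves requests
        await llm_client.aclose()      # drain pooled connections on shutdown

    app = FastAPI(lifespan=lifespan)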