Harden LLM: model fallback chain, reuse client, remove fighting timeouts

- Primary model gets 15s, then auto-falls back through gemini-flash,
  gpt-4o-mini, llama-3.1-8b (10s each)
- Always returns a response — canned in-character line as last resort
- Reuse httpx client instead of creating new one per request
- Remove asyncio.timeout wrappers that were killing requests before
  the LLM service could try fallbacks

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-06 22:07:39 -07:00
parent 73129374f4
commit aa3899b1fc
2 changed files with 113 additions and 109 deletions

View File

@@ -1116,28 +1116,22 @@ async def chat(request: ChatRequest):
session.add_message("user", request.text)
# session._research_task = asyncio.create_task(_background_research(request.text))
try:
async with asyncio.timeout(20):
async with _ai_response_lock:
if _session_epoch != epoch:
raise HTTPException(409, "Call ended while waiting")
async with _ai_response_lock:
if _session_epoch != epoch:
raise HTTPException(409, "Call ended while waiting")
# Stop any playing caller audio so responses don't overlap
audio_service.stop_caller_audio()
# Stop any playing caller audio so responses don't overlap
audio_service.stop_caller_audio()
# Include conversation summary and show history for context
conversation_summary = session.get_conversation_summary()
show_history = session.get_show_history()
system_prompt = get_caller_prompt(session.caller, conversation_summary, show_history)
conversation_summary = session.get_conversation_summary()
show_history = session.get_show_history()
system_prompt = get_caller_prompt(session.caller, conversation_summary, show_history)
messages = _normalize_messages_for_llm(session.conversation[-10:])
response = await llm_service.generate(
messages=messages,
system_prompt=system_prompt
)
except TimeoutError:
caller_name = session.caller["name"] if session.caller else "Caller"
return {"text": "Uh... hold on, I lost my train of thought.", "caller": caller_name, "voice_id": session.caller["voice"] if session.caller else ""}
messages = _normalize_messages_for_llm(session.conversation[-10:])
response = await llm_service.generate(
messages=messages,
system_prompt=system_prompt
)
# Discard if call changed while we were generating
if _session_epoch != epoch:
@@ -1644,30 +1638,24 @@ async def _trigger_ai_auto_respond(accumulated_text: str):
ai_name = session.caller["name"]
try:
async with asyncio.timeout(20):
async with _ai_response_lock:
if _session_epoch != epoch:
return # Call changed while waiting for lock
async with _ai_response_lock:
if _session_epoch != epoch:
return # Call changed while waiting for lock
print(f"[Auto-Respond] {ai_name} is jumping in...")
session._last_ai_auto_respond = time.time()
audio_service.stop_caller_audio()
broadcast_event("ai_status", {"text": f"{ai_name} is thinking..."})
print(f"[Auto-Respond] {ai_name} is jumping in...")
session._last_ai_auto_respond = time.time()
audio_service.stop_caller_audio()
broadcast_event("ai_status", {"text": f"{ai_name} is thinking..."})
conversation_summary = session.get_conversation_summary()
show_history = session.get_show_history()
system_prompt = get_caller_prompt(session.caller, conversation_summary, show_history)
conversation_summary = session.get_conversation_summary()
show_history = session.get_show_history()
system_prompt = get_caller_prompt(session.caller, conversation_summary, show_history)
messages = _normalize_messages_for_llm(session.conversation[-10:])
response = await llm_service.generate(
messages=messages,
system_prompt=system_prompt,
)
except TimeoutError:
print(f"[Auto-Respond] Timed out for {ai_name}")
broadcast_event("ai_done")
return
messages = _normalize_messages_for_llm(session.conversation[-10:])
response = await llm_service.generate(
messages=messages,
system_prompt=system_prompt,
)
# Discard if call changed during generation
if _session_epoch != epoch:
@@ -1725,25 +1713,21 @@ async def ai_respond():
epoch = _session_epoch
try:
async with asyncio.timeout(20):
async with _ai_response_lock:
if _session_epoch != epoch:
raise HTTPException(409, "Call ended while waiting")
async with _ai_response_lock:
if _session_epoch != epoch:
raise HTTPException(409, "Call ended while waiting")
audio_service.stop_caller_audio()
audio_service.stop_caller_audio()
conversation_summary = session.get_conversation_summary()
show_history = session.get_show_history()
system_prompt = get_caller_prompt(session.caller, conversation_summary, show_history)
conversation_summary = session.get_conversation_summary()
show_history = session.get_show_history()
system_prompt = get_caller_prompt(session.caller, conversation_summary, show_history)
messages = _normalize_messages_for_llm(session.conversation[-10:])
response = await llm_service.generate(
messages=messages,
system_prompt=system_prompt
)
except TimeoutError:
return {"text": "Uh... sorry, I spaced out for a second there.", "caller": session.caller["name"], "voice_id": session.caller["voice"]}
messages = _normalize_messages_for_llm(session.conversation[-10:])
response = await llm_service.generate(
messages=messages,
system_prompt=system_prompt
)
if _session_epoch != epoch:
raise HTTPException(409, "Call changed during response")

View File

@@ -17,6 +17,13 @@ OPENROUTER_MODELS = [
"mistralai/mistral-7b-instruct",
]
# Fast models to try as fallbacks (cheap, fast, good enough for conversation)
# Tried in this order after the configured primary model fails; each fallback
# attempt runs with a shorter per-request timeout than the primary.
FALLBACK_MODELS = [
    "google/gemini-flash-1.5",
    "openai/gpt-4o-mini",
    "meta-llama/llama-3.1-8b-instruct",
]
class LLMService:
"""Abstraction layer for LLM providers"""
@@ -27,6 +34,13 @@ class LLMService:
self.ollama_model = settings.ollama_model
self.ollama_host = settings.ollama_host
self.tts_provider = settings.tts_provider
self._client: httpx.AsyncClient | None = None
@property
def client(self) -> httpx.AsyncClient:
    """Shared httpx client, created lazily and rebuilt if it was closed."""
    current = self._client
    if current is None or current.is_closed:
        current = httpx.AsyncClient(timeout=15.0)
        self._client = current
    return current
def update_settings(
self,
@@ -47,7 +61,6 @@ class LLMService:
self.ollama_host = ollama_host
if tts_provider:
self.tts_provider = tts_provider
# Also update the global settings so TTS service picks it up
settings.tts_provider = tts_provider
async def get_ollama_models(self) -> list[str]:
@@ -71,7 +84,7 @@ class LLMService:
"ollama_host": self.ollama_host,
"tts_provider": self.tts_provider,
"available_openrouter_models": OPENROUTER_MODELS,
"available_ollama_models": [] # Fetched separately
"available_ollama_models": []
}
async def get_settings_async(self) -> dict:
@@ -92,57 +105,64 @@ class LLMService:
messages: list[dict],
system_prompt: Optional[str] = None
) -> str:
"""
Generate a response from the LLM.
Args:
messages: List of message dicts with 'role' and 'content'
system_prompt: Optional system prompt to prepend
Returns:
Generated text response
"""
if system_prompt:
messages = [{"role": "system", "content": system_prompt}] + messages
if self.provider == "openrouter":
return await self._call_openrouter(messages)
return await self._call_openrouter_with_fallback(messages)
else:
return await self._call_ollama(messages)
async def _call_openrouter(self, messages: list[dict]) -> str:
    """Call the OpenRouter chat-completions API, retrying once on timeout.

    Returns the model's text, "" for an empty completion, or a canned
    in-character line on repeated timeout / any other error.
    """
    for try_no in range(2):
        try:
            async with httpx.AsyncClient(timeout=25.0) as client:
                response = await client.post(
                    "https://openrouter.ai/api/v1/chat/completions",
                    headers={
                        "Authorization": f"Bearer {settings.openrouter_api_key}",
                        "Content-Type": "application/json",
                    },
                    json={
                        "model": self.openrouter_model,
                        "messages": messages,
                        "max_tokens": 150,
                    },
                )
                response.raise_for_status()
                payload = response.json()
                text = payload["choices"][0]["message"]["content"]
                if text and text.strip():
                    return text
                print("OpenRouter returned empty response")
                return ""
        except (httpx.TimeoutException, httpx.ReadTimeout):
            print(f"OpenRouter timeout (attempt {try_no + 1})")
            if try_no > 0:
                return "Uh, sorry, I lost you there for a second. What was that?"
            # First timeout: fall through and loop for one retry.
        except Exception as e:
            print(f"OpenRouter error: {e}")
            return "Yeah... I don't know, man."
    # Safety net; every loop path above returns, so this is effectively unreachable.
    return "Uh, hold on a sec..."
async def _call_openrouter_with_fallback(self, messages: list[dict]) -> str:
    """Ask the configured model, then each FALLBACK_MODELS entry; never raises.

    The primary model gets the default request budget; each fallback runs
    with a tighter 10s timeout. If every model fails, a canned in-character
    line is returned so the caller always has something to say.
    """
    primary = self.openrouter_model
    reply = await self._call_openrouter_once(messages, primary)
    if reply is None:
        for candidate in FALLBACK_MODELS:
            if candidate == primary:
                continue  # already attempted as the primary model
            print(f"[LLM] Falling back to {candidate}...")
            reply = await self._call_openrouter_once(messages, candidate, timeout=10.0)
            if reply is not None:
                break
    if reply is not None:
        return reply
    print("[LLM] All models failed, using canned response")
    return "Sorry, I totally blanked out for a second. What were you saying?"
async def _call_openrouter_once(self, messages: list[dict], model: str, timeout: float = 15.0) -> str | None:
    """One OpenRouter request against *model*.

    Returns the completion text on success, or None on timeout, HTTP error,
    malformed payload, or an empty completion — the caller decides whether
    to fall back. No user-facing strings are produced here.
    """
    try:
        resp = await self.client.post(
            "https://openrouter.ai/api/v1/chat/completions",
            headers={
                "Authorization": f"Bearer {settings.openrouter_api_key}",
                "Content-Type": "application/json",
            },
            json={
                "model": model,
                "messages": messages,
                "max_tokens": 150,
            },
            timeout=timeout,  # per-request override of the shared client's default
        )
        resp.raise_for_status()
        body = resp.json()
        text = body["choices"][0]["message"]["content"]
    except httpx.TimeoutException:
        print(f"[LLM] {model} timed out ({timeout}s)")
        return None
    except Exception as e:
        print(f"[LLM] {model} error: {e}")
        return None
    if text and text.strip():
        return text
    print(f"[LLM] {model} returned empty response")
    return None
async def _call_ollama(self, messages: list[dict]) -> str:
"""Call Ollama API"""
@@ -155,11 +175,11 @@ class LLMService:
"messages": messages,
"stream": False,
"options": {
"num_predict": 100, # Allow complete thoughts
"temperature": 0.8, # Balanced creativity/coherence
"top_p": 0.9, # Focused word choices
"repeat_penalty": 1.3, # Avoid repetition
"top_k": 50, # Reasonable token variety
"num_predict": 100,
"temperature": 0.8,
"top_p": 0.9,
"repeat_penalty": 1.3,
"top_k": 50,
},
},
timeout=30.0
@@ -169,10 +189,10 @@ class LLMService:
return data["message"]["content"]
except httpx.TimeoutException:
print("Ollama timeout")
return "Uh, sorry, I lost you there for a second. What was that?"
return "Sorry, I totally blanked out for a second. What were you saying?"
except Exception as e:
print(f"Ollama error: {e}")
return "Yeah... I don't know, man."
return "Sorry, I totally blanked out for a second. What were you saying?"
# Global instance