Harden LLM: model fallback chain, reuse client, remove fighting timeouts

- Primary model gets 15s, then automatically falls back through gemini-flash, gpt-4o-mini, and llama-3.1-8b (10s each)
- Always returns a response — a canned in-character line as a last resort
- Reuse the httpx client instead of creating a new one per request
- Remove the asyncio.timeout wrappers that were killing requests before the LLM service could try fallbacks

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-06 22:07:39 -07:00
parent 73129374f4
commit aa3899b1fc
2 changed files with 113 additions and 109 deletions
@@ -1116,8 +1116,6 @@ async def chat(request: ChatRequest):
    session.add_message("user", request.text)
    # session._research_task = asyncio.create_task(_background_research(request.text))
    try:
        async with asyncio.timeout(20):
    async with _ai_response_lock:
        if _session_epoch != epoch:
            raise HTTPException(409, "Call ended while waiting")
@@ -1125,7 +1123,6 @@ async def chat(request: ChatRequest):
        # Stop any playing caller audio so responses don't overlap
        audio_service.stop_caller_audio()
                # Include conversation summary and show history for context
        conversation_summary = session.get_conversation_summary()
        show_history = session.get_show_history()
        system_prompt = get_caller_prompt(session.caller, conversation_summary, show_history)
@@ -1135,9 +1132,6 @@ async def chat(request: ChatRequest):
            messages=messages,
            system_prompt=system_prompt
        )
    except TimeoutError:
        caller_name = session.caller["name"] if session.caller else "Caller"
        return {"text": "Uh... hold on, I lost my train of thought.", "caller": caller_name, "voice_id": session.caller["voice"] if session.caller else ""}
    # Discard if call changed while we were generating
    if _session_epoch != epoch:
@@ -1644,8 +1638,6 @@ async def _trigger_ai_auto_respond(accumulated_text: str):
    ai_name = session.caller["name"]
    try:
        async with asyncio.timeout(20):
    async with _ai_response_lock:
        if _session_epoch != epoch:
            return  # Call changed while waiting for lock
@@ -1664,10 +1656,6 @@ async def _trigger_ai_auto_respond(accumulated_text: str):
            messages=messages,
            system_prompt=system_prompt,
        )
    except TimeoutError:
        print(f"[Auto-Respond] Timed out for {ai_name}")
        broadcast_event("ai_done")
        return
    # Discard if call changed during generation
    if _session_epoch != epoch:
@@ -1725,8 +1713,6 @@ async def ai_respond():
    epoch = _session_epoch
    try:
        async with asyncio.timeout(20):
    async with _ai_response_lock:
        if _session_epoch != epoch:
            raise HTTPException(409, "Call ended while waiting")
@@ -1742,8 +1728,6 @@ async def ai_respond():
            messages=messages,
            system_prompt=system_prompt
        )
    except TimeoutError:
        return {"text": "Uh... sorry, I spaced out for a second there.", "caller": session.caller["name"], "voice_id": session.caller["voice"]}
    if _session_epoch != epoch:
        raise HTTPException(409, "Call changed during response")
@@ -17,6 +17,13 @@ OPENROUTER_MODELS = [
    "mistralai/mistral-7b-instruct",
 ]
 # Fast models to try as fallbacks (cheap, fast, good enough for conversation).
 # They are tried in list order once the primary model has failed; an entry
 # equal to the configured primary model is skipped because it was already tried.
 FALLBACK_MODELS = [
    "google/gemini-flash-1.5",
    "openai/gpt-4o-mini",
    "meta-llama/llama-3.1-8b-instruct",
 ]
 class LLMService:
    """Abstraction layer for LLM providers"""
@@ -27,6 +34,13 @@ class LLMService:
        self.ollama_model = settings.ollama_model
        self.ollama_host = settings.ollama_host
        self.tts_provider = settings.tts_provider
        self._client: httpx.AsyncClient | None = None
    @property
    def client(self) -> httpx.AsyncClient:
        """Return the shared HTTP client, building a new one when needed.

        A client is created on first access, and again whenever the stored
        one reports ``is_closed``. New clients are constructed with a
        15-second default timeout.
        """
        cached = self._client
        if cached is not None and not cached.is_closed:
            return cached
        # Nothing usable is stored: create a replacement and remember it.
        self._client = httpx.AsyncClient(timeout=15.0)
        return self._client
    def update_settings(
        self,
@@ -47,7 +61,6 @@ class LLMService:
            self.ollama_host = ollama_host
        if tts_provider:
            self.tts_provider = tts_provider
            # Also update the global settings so TTS service picks it up
            settings.tts_provider = tts_provider
    async def get_ollama_models(self) -> list[str]:
@@ -71,7 +84,7 @@ class LLMService:
            "ollama_host": self.ollama_host,
            "tts_provider": self.tts_provider,
            "available_openrouter_models": OPENROUTER_MODELS,
-            "available_ollama_models": []  # Fetched separately
+            "available_ollama_models": []
        }
    async def get_settings_async(self) -> dict:
@@ -92,57 +105,64 @@ class LLMService:
        messages: list[dict],
        system_prompt: Optional[str] = None
    ) -> str:
        """
        Generate a response from the LLM.
        Args:
            messages: List of message dicts with 'role' and 'content'
            system_prompt: Optional system prompt to prepend
        Returns:
            Generated text response
        """
        if system_prompt:
            messages = [{"role": "system", "content": system_prompt}] + messages
        if self.provider == "openrouter":
-            return await self._call_openrouter(messages)
+            return await self._call_openrouter_with_fallback(messages)
        else:
            return await self._call_ollama(messages)
-    async def _call_openrouter(self, messages: list[dict]) -> str:
+    async def _call_openrouter_with_fallback(self, messages: list[dict]) -> str:
-        """Call OpenRouter API with retry"""
+        """Try primary model, then fallback models. Always returns a response."""
-        for attempt in range(2):  # Try twice
+
        # Try primary model first
        result = await self._call_openrouter_once(messages, self.openrouter_model)
        if result is not None:
            return result
        # Try fallback models
        for model in FALLBACK_MODELS:
            if model == self.openrouter_model:
                continue  # Already tried
            print(f"[LLM] Falling back to {model}...")
            result = await self._call_openrouter_once(messages, model, timeout=10.0)
            if result is not None:
                return result
        # Everything failed — return an in-character line so the show continues
        print("[LLM] All models failed, using canned response")
        return "Sorry, I totally blanked out for a second. What were you saying?"
    async def _call_openrouter_once(self, messages: list[dict], model: str, timeout: float = 15.0) -> str | None:
        """Single attempt to call OpenRouter. Returns None on failure (not a fallback string)."""
        try:
-                async with httpx.AsyncClient(timeout=25.0) as client:
+            response = await self.client.post(
                    response = await client.post(
                "https://openrouter.ai/api/v1/chat/completions",
                headers={
                    "Authorization": f"Bearer {settings.openrouter_api_key}",
                    "Content-Type": "application/json",
                },
                json={
-                            "model": self.openrouter_model,
+                    "model": model,
                    "messages": messages,
                    "max_tokens": 150,
                },
                timeout=timeout,
            )
            response.raise_for_status()
            data = response.json()
            content = data["choices"][0]["message"]["content"]
-                    if not content or not content.strip():
+            if content and content.strip():
                        print(f"OpenRouter returned empty response")
                        return ""
                return content
-            except (httpx.TimeoutException, httpx.ReadTimeout):
+            print(f"[LLM] {model} returned empty response")
-                print(f"OpenRouter timeout (attempt {attempt + 1})")
+            return None
-                if attempt == 0:
+        except httpx.TimeoutException:
-                    continue  # Retry once
+            print(f"[LLM] {model} timed out ({timeout}s)")
-                return "Uh, sorry, I lost you there for a second. What was that?"
+            return None
        except Exception as e:
-                print(f"OpenRouter error: {e}")
+            print(f"[LLM] {model} error: {e}")
-                return "Yeah... I don't know, man."
+            return None
        return "Uh, hold on a sec..."
    async def _call_ollama(self, messages: list[dict]) -> str:
        """Call Ollama API"""
@@ -155,11 +175,11 @@ class LLMService:
                        "messages": messages,
                        "stream": False,
                        "options": {
-                            "num_predict": 100,     # Allow complete thoughts
+                            "num_predict": 100,
-                            "temperature": 0.8,     # Balanced creativity/coherence
+                            "temperature": 0.8,
-                            "top_p": 0.9,           # Focused word choices
+                            "top_p": 0.9,
-                            "repeat_penalty": 1.3,  # Avoid repetition
+                            "repeat_penalty": 1.3,
-                            "top_k": 50,            # Reasonable token variety
+                            "top_k": 50,
                        },
                    },
                    timeout=30.0
@@ -169,10 +189,10 @@ class LLMService:
                return data["message"]["content"]
        except httpx.TimeoutException:
            print("Ollama timeout")
-            return "Uh, sorry, I lost you there for a second. What was that?"
+            return "Sorry, I totally blanked out for a second. What were you saying?"
        except Exception as e:
            print(f"Ollama error: {e}")
-            return "Yeah... I don't know, man."
+            return "Sorry, I totally blanked out for a second. What were you saying?"
 # Global instance