diff --git a/backend/main.py b/backend/main.py
index a6a7a0d..509e595 100644
--- a/backend/main.py
+++ b/backend/main.py
@@ -1116,28 +1116,22 @@ async def chat(request: ChatRequest):
     session.add_message("user", request.text)
     # session._research_task = asyncio.create_task(_background_research(request.text))
 
-    try:
-        async with asyncio.timeout(20):
-            async with _ai_response_lock:
-                if _session_epoch != epoch:
-                    raise HTTPException(409, "Call ended while waiting")
+    async with _ai_response_lock:
+        if _session_epoch != epoch:
+            raise HTTPException(409, "Call ended while waiting")
 
-                # Stop any playing caller audio so responses don't overlap
-                audio_service.stop_caller_audio()
+        # Stop any playing caller audio so responses don't overlap
+        audio_service.stop_caller_audio()
 
-                # Include conversation summary and show history for context
-                conversation_summary = session.get_conversation_summary()
-                show_history = session.get_show_history()
-                system_prompt = get_caller_prompt(session.caller, conversation_summary, show_history)
+        conversation_summary = session.get_conversation_summary()
+        show_history = session.get_show_history()
+        system_prompt = get_caller_prompt(session.caller, conversation_summary, show_history)
 
-                messages = _normalize_messages_for_llm(session.conversation[-10:])
-                response = await llm_service.generate(
-                    messages=messages,
-                    system_prompt=system_prompt
-                )
-    except TimeoutError:
-        caller_name = session.caller["name"] if session.caller else "Caller"
-        return {"text": "Uh... hold on, I lost my train of thought.", "caller": caller_name, "voice_id": session.caller["voice"] if session.caller else ""}
+        messages = _normalize_messages_for_llm(session.conversation[-10:])
+        response = await llm_service.generate(
+            messages=messages,
+            system_prompt=system_prompt
+        )
 
     # Discard if call changed while we were generating
     if _session_epoch != epoch:
@@ -1644,30 +1638,24 @@ async def _trigger_ai_auto_respond(accumulated_text: str):
 
     ai_name = session.caller["name"]
 
-    try:
-        async with asyncio.timeout(20):
-            async with _ai_response_lock:
-                if _session_epoch != epoch:
-                    return  # Call changed while waiting for lock
+    async with _ai_response_lock:
+        if _session_epoch != epoch:
+            return  # Call changed while waiting for lock
 
-                print(f"[Auto-Respond] {ai_name} is jumping in...")
-                session._last_ai_auto_respond = time.time()
-                audio_service.stop_caller_audio()
-                broadcast_event("ai_status", {"text": f"{ai_name} is thinking..."})
+        print(f"[Auto-Respond] {ai_name} is jumping in...")
+        session._last_ai_auto_respond = time.time()
+        audio_service.stop_caller_audio()
+        broadcast_event("ai_status", {"text": f"{ai_name} is thinking..."})
 
-                conversation_summary = session.get_conversation_summary()
-                show_history = session.get_show_history()
-                system_prompt = get_caller_prompt(session.caller, conversation_summary, show_history)
+        conversation_summary = session.get_conversation_summary()
+        show_history = session.get_show_history()
+        system_prompt = get_caller_prompt(session.caller, conversation_summary, show_history)
 
-                messages = _normalize_messages_for_llm(session.conversation[-10:])
-                response = await llm_service.generate(
-                    messages=messages,
-                    system_prompt=system_prompt,
-                )
-    except TimeoutError:
-        print(f"[Auto-Respond] Timed out for {ai_name}")
-        broadcast_event("ai_done")
-        return
+        messages = _normalize_messages_for_llm(session.conversation[-10:])
+        response = await llm_service.generate(
+            messages=messages,
+            system_prompt=system_prompt,
+        )
 
     # Discard if call changed during generation
     if _session_epoch != epoch:
@@ -1725,25 +1713,21 @@ async def ai_respond():
 
     epoch = _session_epoch
 
-    try:
-        async with asyncio.timeout(20):
-            async with _ai_response_lock:
-                if _session_epoch != epoch:
-                    raise HTTPException(409, "Call ended while waiting")
+    async with _ai_response_lock:
+        if _session_epoch != epoch:
+            raise HTTPException(409, "Call ended while waiting")
 
-                audio_service.stop_caller_audio()
+        audio_service.stop_caller_audio()
 
-                conversation_summary = session.get_conversation_summary()
-                show_history = session.get_show_history()
-                system_prompt = get_caller_prompt(session.caller, conversation_summary, show_history)
+        conversation_summary = session.get_conversation_summary()
+        show_history = session.get_show_history()
+        system_prompt = get_caller_prompt(session.caller, conversation_summary, show_history)
 
-                messages = _normalize_messages_for_llm(session.conversation[-10:])
-                response = await llm_service.generate(
-                    messages=messages,
-                    system_prompt=system_prompt
-                )
-    except TimeoutError:
-        return {"text": "Uh... sorry, I spaced out for a second there.", "caller": session.caller["name"], "voice_id": session.caller["voice"]}
+        messages = _normalize_messages_for_llm(session.conversation[-10:])
+        response = await llm_service.generate(
+            messages=messages,
+            system_prompt=system_prompt
+        )
 
     if _session_epoch != epoch:
         raise HTTPException(409, "Call changed during response")
diff --git a/backend/services/llm.py b/backend/services/llm.py
index 2776fd4..b9e5e73 100644
--- a/backend/services/llm.py
+++ b/backend/services/llm.py
@@ -17,6 +17,13 @@ OPENROUTER_MODELS = [
     "mistralai/mistral-7b-instruct",
 ]
 
+# Fallback models, tried in list order when the primary model fails (cheap, fast, good enough for conversation)
+FALLBACK_MODELS = [
+    "google/gemini-flash-1.5",
+    "openai/gpt-4o-mini",
+    "meta-llama/llama-3.1-8b-instruct",
+]
+
 
 class LLMService:
     """Abstraction layer for LLM providers"""
@@ -27,6 +34,13 @@ class LLMService:
         self.ollama_model = settings.ollama_model
         self.ollama_host = settings.ollama_host
         self.tts_provider = settings.tts_provider
+        self._client: httpx.AsyncClient | None = None
+
+    @property
+    def client(self) -> httpx.AsyncClient:  # shared AsyncClient, built on first access
+        if self._client is None or self._client.is_closed:  # not built yet, or closed since the last use
+            self._client = httpx.AsyncClient(timeout=15.0)  # 15 s default; callers may pass a per-request timeout=
+        return self._client  # NOTE(review): no aclose() call is visible in this diff — confirm shutdown closes it
 
     def update_settings(
         self,
@@ -47,7 +61,6 @@ class LLMService:
             self.ollama_host = ollama_host
         if tts_provider:
             self.tts_provider = tts_provider
-            # Also update the global settings so TTS service picks it up
             settings.tts_provider = tts_provider
 
     async def get_ollama_models(self) -> list[str]:
@@ -71,7 +84,7 @@ class LLMService:
             "ollama_host": self.ollama_host,
             "tts_provider": self.tts_provider,
             "available_openrouter_models": OPENROUTER_MODELS,
-            "available_ollama_models": []  # Fetched separately
+            "available_ollama_models": []
         }
 
     async def get_settings_async(self) -> dict:
@@ -92,57 +105,64 @@ class LLMService:
         messages: list[dict],
         system_prompt: Optional[str] = None
     ) -> str:
-        """
-        Generate a response from the LLM.
-
-        Args:
-            messages: List of message dicts with 'role' and 'content'
-            system_prompt: Optional system prompt to prepend
-
-        Returns:
-            Generated text response
-        """
         if system_prompt:
             messages = [{"role": "system", "content": system_prompt}] + messages
 
         if self.provider == "openrouter":
-            return await self._call_openrouter(messages)
+            return await self._call_openrouter_with_fallback(messages)
         else:
             return await self._call_ollama(messages)
 
-    async def _call_openrouter(self, messages: list[dict]) -> str:
-        """Call OpenRouter API with retry"""
-        for attempt in range(2):  # Try twice
-            try:
-                async with httpx.AsyncClient(timeout=25.0) as client:
-                    response = await client.post(
-                        "https://openrouter.ai/api/v1/chat/completions",
-                        headers={
-                            "Authorization": f"Bearer {settings.openrouter_api_key}",
-                            "Content-Type": "application/json",
-                        },
-                        json={
-                            "model": self.openrouter_model,
-                            "messages": messages,
-                            "max_tokens": 150,
-                        },
-                    )
-                    response.raise_for_status()
-                    data = response.json()
-                    content = data["choices"][0]["message"]["content"]
-                    if not content or not content.strip():
-                        print(f"OpenRouter returned empty response")
-                        return ""
-                    return content
-            except (httpx.TimeoutException, httpx.ReadTimeout):
-                print(f"OpenRouter timeout (attempt {attempt + 1})")
-                if attempt == 0:
-                    continue  # Retry once
-                return "Uh, sorry, I lost you there for a second. What was that?"
-            except Exception as e:
-                print(f"OpenRouter error: {e}")
-                return "Yeah... I don't know, man."
-        return "Uh, hold on a sec..."
+    async def _call_openrouter_with_fallback(self, messages: list[dict]) -> str:
+        """Try the configured model, then each FALLBACK_MODELS entry in order; return the first non-empty reply, else a canned in-character line."""
+        # NOTE(review): main.py awaits generate() while holding _ai_response_lock and this change drops its asyncio.timeout(20); attempts here run one after another — confirm the added latency is acceptable
+        # Try primary model first (uses _call_openrouter_once's default 15 s timeout)
+        result = await self._call_openrouter_once(messages, self.openrouter_model)
+        if result is not None:
+            return result
+
+        # Try fallback models one at a time, each with a shorter 10 s timeout
+        for model in FALLBACK_MODELS:
+            if model == self.openrouter_model:
+                continue  # Already tried as the primary model above
+            print(f"[LLM] Falling back to {model}...")
+            result = await self._call_openrouter_once(messages, model, timeout=10.0)
+            if result is not None:
+                return result
+
+        # Everything failed — return an in-character line so the show continues
+        print("[LLM] All models failed, using canned response")
+        return "Sorry, I totally blanked out for a second. What were you saying?"
+
+    async def _call_openrouter_once(self, messages: list[dict], model: str, timeout: float = 15.0) -> str | None:
+        """POST one chat-completion request for the given model; return the reply text, or None on timeout, error, or empty content (never a canned string)."""
+        try:
+            response = await self.client.post(
+                "https://openrouter.ai/api/v1/chat/completions",
+                headers={
+                    "Authorization": f"Bearer {settings.openrouter_api_key}",
+                    "Content-Type": "application/json",
+                },
+                json={
+                    "model": model,
+                    "messages": messages,
+                    "max_tokens": 150,
+                },
+                timeout=timeout,  # per-request value, used instead of the shared client's default
+            )
+            response.raise_for_status()  # HTTP 4xx/5xx raises here and is handled by the generic except below
+            data = response.json()
+            content = data["choices"][0]["message"]["content"]  # a missing key or index also falls to the generic except
+            if content and content.strip():
+                return content  # returned as received, not stripped
+            print(f"[LLM] {model} returned empty response")
+            return None
+        except httpx.TimeoutException:
+            print(f"[LLM] {model} timed out ({timeout}s)")
+            return None
+        except Exception as e:  # any other failure is logged and reported as None so the caller can try another model
+            print(f"[LLM] {model} error: {e}")
+            return None
 
     async def _call_ollama(self, messages: list[dict]) -> str:
         """Call Ollama API"""
@@ -155,11 +175,11 @@ class LLMService:
                         "messages": messages,
                         "stream": False,
                         "options": {
-                            "num_predict": 100,     # Allow complete thoughts
-                            "temperature": 0.8,     # Balanced creativity/coherence
-                            "top_p": 0.9,           # Focused word choices
-                            "repeat_penalty": 1.3,  # Avoid repetition
-                            "top_k": 50,            # Reasonable token variety
+                            "num_predict": 100,
+                            "temperature": 0.8,
+                            "top_p": 0.9,
+                            "repeat_penalty": 1.3,
+                            "top_k": 50,
                         },
                     },
                     timeout=30.0
@@ -169,10 +189,10 @@ class LLMService:
                 return data["message"]["content"]
         except httpx.TimeoutException:
             print("Ollama timeout")
-            return "Uh, sorry, I lost you there for a second. What was that?"
+            return "Sorry, I totally blanked out for a second. What were you saying?"
         except Exception as e:
             print(f"Ollama error: {e}")
-            return "Yeah... I don't know, man."
+            return "Sorry, I totally blanked out for a second. What were you saying?"
 
 
 # Global instance