diff --git a/backend/main.py b/backend/main.py index a6a7a0d..509e595 100644 --- a/backend/main.py +++ b/backend/main.py @@ -1116,28 +1116,22 @@ async def chat(request: ChatRequest): session.add_message("user", request.text) # session._research_task = asyncio.create_task(_background_research(request.text)) - try: - async with asyncio.timeout(20): - async with _ai_response_lock: - if _session_epoch != epoch: - raise HTTPException(409, "Call ended while waiting") + async with _ai_response_lock: + if _session_epoch != epoch: + raise HTTPException(409, "Call ended while waiting") - # Stop any playing caller audio so responses don't overlap - audio_service.stop_caller_audio() + # Stop any playing caller audio so responses don't overlap + audio_service.stop_caller_audio() - # Include conversation summary and show history for context - conversation_summary = session.get_conversation_summary() - show_history = session.get_show_history() - system_prompt = get_caller_prompt(session.caller, conversation_summary, show_history) + conversation_summary = session.get_conversation_summary() + show_history = session.get_show_history() + system_prompt = get_caller_prompt(session.caller, conversation_summary, show_history) - messages = _normalize_messages_for_llm(session.conversation[-10:]) - response = await llm_service.generate( - messages=messages, - system_prompt=system_prompt - ) - except TimeoutError: - caller_name = session.caller["name"] if session.caller else "Caller" - return {"text": "Uh... hold on, I lost my train of thought.", "caller": caller_name, "voice_id": session.caller["voice"] if session.caller else ""} + messages = _normalize_messages_for_llm(session.conversation[-10:]) + response = await llm_service.generate( + messages=messages, + system_prompt=system_prompt + ) # Discard if call changed while we were generating if _session_epoch != epoch: @@ -1644,30 +1638,24 @@ async def _trigger_ai_auto_respond(accumulated_text: str): ai_name = session.caller["name"] - try: - async with asyncio.timeout(20): - async with _ai_response_lock: - if _session_epoch != epoch: - return # Call changed while waiting for lock + async with _ai_response_lock: + if _session_epoch != epoch: + return # Call changed while waiting for lock - print(f"[Auto-Respond] {ai_name} is jumping in...") - session._last_ai_auto_respond = time.time() - audio_service.stop_caller_audio() - broadcast_event("ai_status", {"text": f"{ai_name} is thinking..."}) + print(f"[Auto-Respond] {ai_name} is jumping in...") + session._last_ai_auto_respond = time.time() + audio_service.stop_caller_audio() + broadcast_event("ai_status", {"text": f"{ai_name} is thinking..."}) - conversation_summary = session.get_conversation_summary() - show_history = session.get_show_history() - system_prompt = get_caller_prompt(session.caller, conversation_summary, show_history) + conversation_summary = session.get_conversation_summary() + show_history = session.get_show_history() + system_prompt = get_caller_prompt(session.caller, conversation_summary, show_history) - messages = _normalize_messages_for_llm(session.conversation[-10:]) - response = await llm_service.generate( - messages=messages, - system_prompt=system_prompt, - ) - except TimeoutError: - print(f"[Auto-Respond] Timed out for {ai_name}") - broadcast_event("ai_done") - return + messages = _normalize_messages_for_llm(session.conversation[-10:]) + response = await llm_service.generate( + messages=messages, + system_prompt=system_prompt, + ) # Discard if call changed during generation if _session_epoch != epoch: @@ -1725,25 +1713,21 @@ async def ai_respond(): epoch = _session_epoch - try: - async with asyncio.timeout(20): - async with _ai_response_lock: - if _session_epoch != epoch: - raise HTTPException(409, "Call ended while waiting") + async with _ai_response_lock: + if _session_epoch != epoch: + raise HTTPException(409, "Call ended while waiting") - audio_service.stop_caller_audio() + audio_service.stop_caller_audio() - conversation_summary = session.get_conversation_summary() - show_history = session.get_show_history() - system_prompt = get_caller_prompt(session.caller, conversation_summary, show_history) + conversation_summary = session.get_conversation_summary() + show_history = session.get_show_history() + system_prompt = get_caller_prompt(session.caller, conversation_summary, show_history) - messages = _normalize_messages_for_llm(session.conversation[-10:]) - response = await llm_service.generate( - messages=messages, - system_prompt=system_prompt - ) - except TimeoutError: - return {"text": "Uh... sorry, I spaced out for a second there.", "caller": session.caller["name"], "voice_id": session.caller["voice"]} + messages = _normalize_messages_for_llm(session.conversation[-10:]) + response = await llm_service.generate( + messages=messages, + system_prompt=system_prompt + ) if _session_epoch != epoch: raise HTTPException(409, "Call changed during response") diff --git a/backend/services/llm.py b/backend/services/llm.py index 2776fd4..b9e5e73 100644 --- a/backend/services/llm.py +++ b/backend/services/llm.py @@ -17,6 +17,13 @@ OPENROUTER_MODELS = [ "mistralai/mistral-7b-instruct", ] +# Fast models to try as fallbacks (cheap, fast, good enough for conversation) +FALLBACK_MODELS = [ + "google/gemini-flash-1.5", + "openai/gpt-4o-mini", + "meta-llama/llama-3.1-8b-instruct", +] + class LLMService: """Abstraction layer for LLM providers""" @@ -27,6 +34,13 @@ class LLMService: self.ollama_model = settings.ollama_model self.ollama_host = settings.ollama_host self.tts_provider = settings.tts_provider + self._client: httpx.AsyncClient | None = None + + @property + def client(self) -> httpx.AsyncClient: + if self._client is None or self._client.is_closed: + self._client = httpx.AsyncClient(timeout=15.0) + return self._client def update_settings( self, @@ -47,7 +61,6 @@ class LLMService: self.ollama_host = ollama_host if tts_provider: self.tts_provider = tts_provider - # Also update the global settings so TTS service picks it up settings.tts_provider = tts_provider async def get_ollama_models(self) -> list[str]: @@ -71,7 +84,7 @@ class LLMService: "ollama_host": self.ollama_host, "tts_provider": self.tts_provider, "available_openrouter_models": OPENROUTER_MODELS, - "available_ollama_models": [] # Fetched separately + "available_ollama_models": [] } async def get_settings_async(self) -> dict: @@ -92,57 +105,64 @@ class LLMService: messages: list[dict], system_prompt: Optional[str] = None ) -> str: - """ - Generate a response from the LLM. - - Args: - messages: List of message dicts with 'role' and 'content' - system_prompt: Optional system prompt to prepend - - Returns: - Generated text response - """ if system_prompt: messages = [{"role": "system", "content": system_prompt}] + messages if self.provider == "openrouter": - return await self._call_openrouter(messages) + return await self._call_openrouter_with_fallback(messages) else: return await self._call_ollama(messages) - async def _call_openrouter(self, messages: list[dict]) -> str: - """Call OpenRouter API with retry""" - for attempt in range(2): # Try twice - try: - async with httpx.AsyncClient(timeout=25.0) as client: - response = await client.post( - "https://openrouter.ai/api/v1/chat/completions", - headers={ - "Authorization": f"Bearer {settings.openrouter_api_key}", - "Content-Type": "application/json", - }, - json={ - "model": self.openrouter_model, - "messages": messages, - "max_tokens": 150, - }, - ) - response.raise_for_status() - data = response.json() - content = data["choices"][0]["message"]["content"] - if not content or not content.strip(): - print(f"OpenRouter returned empty response") - return "" - return content - except (httpx.TimeoutException, httpx.ReadTimeout): - print(f"OpenRouter timeout (attempt {attempt + 1})") - if attempt == 0: - continue # Retry once - return "Uh, sorry, I lost you there for a second. What was that?" - except Exception as e: - print(f"OpenRouter error: {e}") - return "Yeah... I don't know, man." - return "Uh, hold on a sec..." + async def _call_openrouter_with_fallback(self, messages: list[dict]) -> str: + """Try primary model, then fallback models. Always returns a response.""" + + # Try primary model first + result = await self._call_openrouter_once(messages, self.openrouter_model) + if result is not None: + return result + + # Try fallback models + for model in FALLBACK_MODELS: + if model == self.openrouter_model: + continue # Already tried + print(f"[LLM] Falling back to {model}...") + result = await self._call_openrouter_once(messages, model, timeout=10.0) + if result is not None: + return result + + # Everything failed — return an in-character line so the show continues + print("[LLM] All models failed, using canned response") + return "Sorry, I totally blanked out for a second. What were you saying?" + + async def _call_openrouter_once(self, messages: list[dict], model: str, timeout: float = 15.0) -> str | None: + """Single attempt to call OpenRouter. Returns None on failure (not a fallback string).""" + try: + response = await self.client.post( + "https://openrouter.ai/api/v1/chat/completions", + headers={ + "Authorization": f"Bearer {settings.openrouter_api_key}", + "Content-Type": "application/json", + }, + json={ + "model": model, + "messages": messages, + "max_tokens": 150, + }, + timeout=timeout, + ) + response.raise_for_status() + data = response.json() + content = data["choices"][0]["message"]["content"] + if content and content.strip(): + return content + print(f"[LLM] {model} returned empty response") + return None + except httpx.TimeoutException: + print(f"[LLM] {model} timed out ({timeout}s)") + return None + except Exception as e: + print(f"[LLM] {model} error: {e}") + return None async def _call_ollama(self, messages: list[dict]) -> str: """Call Ollama API""" @@ -155,11 +175,11 @@ class LLMService: "messages": messages, "stream": False, "options": { - "num_predict": 100, # Allow complete thoughts - "temperature": 0.8, # Balanced creativity/coherence - "top_p": 0.9, # Focused word choices - "repeat_penalty": 1.3, # Avoid repetition - "top_k": 50, # Reasonable token variety + "num_predict": 100, + "temperature": 0.8, + "top_p": 0.9, + "repeat_penalty": 1.3, + "top_k": 50, }, }, timeout=30.0 @@ -169,10 +189,10 @@ class LLMService: return data["message"]["content"] except httpx.TimeoutException: print("Ollama timeout") - return "Uh, sorry, I lost you there for a second. What was that?" + return "Sorry, I totally blanked out for a second. What were you saying?" except Exception as e: print(f"Ollama error: {e}") - return "Yeah... I don't know, man." + return "Sorry, I totally blanked out for a second. What were you saying?" # Global instance