Caller model routing — cycle, style-matched, mid-show override

- Three strategies: single model, cycle through pool, style-matched
- 18 communication styles mapped to 7 models (Grok, Sonnet, Mistral, Qwen, DeepSeek, Gemini, Llama)
- Per-caller model locked for entire call, overridable mid-show
- Model badges on caller buttons and info panel
- Settings UI for strategy, pool, style mapping, fallback
- Fallback to Sonnet on model failure
- 6 new models added to pricing and dropdown
- Checkpoint persistence for all model state

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-21 01:58:03 -06:00
parent e0fb3cac68
commit 314d5f9452
6 changed files with 487 additions and 4 deletions
@@ -45,6 +45,12 @@ OPENROUTER_PRICING = {
    "openai/gpt-4o-mini":              {"prompt": 0.15,  "completion": 0.60},
    "openai/gpt-4o":                   {"prompt": 2.50,  "completion": 10.00},
    "meta-llama/llama-3.1-8b-instruct": {"prompt": 0.06, "completion": 0.06},
+    "deepseek/deepseek-chat-v3-0324":  {"prompt": 0.27,  "completion": 1.10},
+    "moonshotai/kimi-k2":              {"prompt": 0.60,  "completion": 2.00},
+    "mistralai/mistral-medium-3":      {"prompt": 0.40,  "completion": 2.00},
+    "meta-llama/llama-4-maverick":     {"prompt": 0.20,  "completion": 0.60},
+    "qwen/qwen3-235b-a22b":           {"prompt": 0.20,  "completion": 0.60},
+    "google/gemini-2.5-pro":           {"prompt": 1.25,  "completion": 10.00},
 }

 # TTS pricing per character
@@ -23,6 +23,13 @@ OPENROUTER_MODELS = [
    "google/gemini-2.5-flash",
    "openai/gpt-4o-mini",
    "openai/gpt-4o",
+    # New dialog models
+    "deepseek/deepseek-chat-v3-0324",
+    "moonshotai/kimi-k2",
+    "mistralai/mistral-medium-3",
+    "meta-llama/llama-4-maverick",
+    "qwen/qwen3-235b-a22b",
+    "google/gemini-2.5-pro",
    # Legacy
    "anthropic/claude-3-haiku",
    "google/gemini-flash-1.5",
@@ -125,12 +132,13 @@ class LLMService:
        response_format: Optional[dict] = None,
        category: str = "unknown",
        caller_name: str = "",
+        model_override: Optional[str] = None,
    ) -> str:
        if system_prompt:
            messages = [{"role": "system", "content": system_prompt}] + messages

        if self.provider == "openrouter":
-            return await self._call_openrouter_with_fallback(messages, max_tokens=max_tokens, response_format=response_format, category=category, caller_name=caller_name)
+            return await self._call_openrouter_with_fallback(messages, max_tokens=max_tokens, response_format=response_format, category=category, caller_name=caller_name, model_override=model_override)
        else:
            return await self._call_ollama(messages, max_tokens=max_tokens)

@@ -295,11 +303,11 @@ class LLMService:
        """Get the best model for a given category based on config routing."""
        return settings.category_models.get(category, self.openrouter_model)

-    async def _call_openrouter_with_fallback(self, messages: list[dict], max_tokens: Optional[int] = None, response_format: Optional[dict] = None, category: str = "unknown", caller_name: str = "") -> str:
+    async def _call_openrouter_with_fallback(self, messages: list[dict], max_tokens: Optional[int] = None, response_format: Optional[dict] = None, category: str = "unknown", caller_name: str = "", model_override: Optional[str] = None) -> str:
        """Try category-specific model, then fallback models. Always returns a response."""

-        # Use category-specific model if configured, otherwise primary
-        model = self._get_model_for_category(category)
+        # Use explicit override if provided, else category routing, else primary
+        model = model_override or self._get_model_for_category(category)
        result = await self._call_openrouter_once(messages, model, max_tokens=max_tokens, response_format=response_format, category=category, caller_name=caller_name)
        if result is not None:
            return result