Cost monitoring, PTT fix, Devon tuning, WEIRD pool expansion, YT thumbnails, LLM SEO, publish ep37
- Add real-time LLM/TTS cost tracking with live status bar display and post-show reports - Fix PTT bug where Devon suggestion layout shift stopped recording via mouseleave - Devon: facts-only during calls, full personality between calls - Double WEIRD topic pool (109→203), bump weight to 14-25% - Auto-generate YouTube thumbnails with bold hook text in publish pipeline - LLM SEO: llms.txt, robots.txt for LLM crawlers, structured data, BreadcrumbList schemas - Publish episode 37 Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
364
backend/services/cost_tracker.py
Normal file
364
backend/services/cost_tracker.py
Normal file
@@ -0,0 +1,364 @@
|
||||
"""Cost tracking for LLM and TTS API calls during podcast sessions"""
|
||||
|
||||
import json
|
||||
import time
|
||||
from dataclasses import dataclass, field, asdict
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
|
||||
@dataclass
class LLMCallRecord:
    """One completed LLM API call, with token counts and its computed cost."""

    timestamp: float  # time.time() when the call was recorded
    category: str  # logical call site, e.g. "devon_monitor", "caller_dialog"
    model: str  # OpenRouter model slug (key into OPENROUTER_PRICING)
    prompt_tokens: int
    completion_tokens: int
    total_tokens: int  # provider-reported total, or prompt+completion if absent
    cost_usd: float  # from _calc_llm_cost; 0.0 when the model has no known pricing
    caller_name: str  # optional identifier of who triggered the call ("" when unknown)
    max_tokens_requested: int  # max_tokens sent with the request (for headroom analysis)
    latency_ms: float  # wall-clock round-trip time of the HTTP call
|
||||
|
||||
|
||||
@dataclass
class TTSCallRecord:
    """One completed TTS synthesis call, priced per character."""

    timestamp: float  # time.time() when the call was recorded
    provider: str  # key into TTS_PRICING, e.g. "inworld", "elevenlabs"
    voice: str  # provider-specific voice id
    char_count: int  # length of the synthesized text
    cost_usd: float  # char_count * per-character rate; 0.0 for local providers
|
||||
|
||||
|
||||
# OpenRouter pricing per 1M tokens (as of March 2026)
|
||||
OPENROUTER_PRICING = {
|
||||
"anthropic/claude-sonnet-4-5": {"prompt": 3.00, "completion": 15.00},
|
||||
"anthropic/claude-haiku-4.5": {"prompt": 0.80, "completion": 4.00},
|
||||
"anthropic/claude-3-haiku": {"prompt": 0.25, "completion": 1.25},
|
||||
"x-ai/grok-4-fast": {"prompt": 5.00, "completion": 15.00},
|
||||
"minimax/minimax-m2-her": {"prompt": 0.50, "completion": 1.50},
|
||||
"mistralai/mistral-small-creative": {"prompt": 0.20, "completion": 0.60},
|
||||
"deepseek/deepseek-v3.2": {"prompt": 0.14, "completion": 0.28},
|
||||
"google/gemini-2.5-flash": {"prompt": 0.15, "completion": 0.60},
|
||||
"google/gemini-flash-1.5": {"prompt": 0.075, "completion": 0.30},
|
||||
"openai/gpt-4o-mini": {"prompt": 0.15, "completion": 0.60},
|
||||
"openai/gpt-4o": {"prompt": 2.50, "completion": 10.00},
|
||||
"meta-llama/llama-3.1-8b-instruct": {"prompt": 0.06, "completion": 0.06},
|
||||
}
|
||||
|
||||
# TTS pricing per character
|
||||
TTS_PRICING = {
|
||||
"inworld": 0.000015,
|
||||
"elevenlabs": 0.000030,
|
||||
"kokoro": 0.0,
|
||||
"f5tts": 0.0,
|
||||
"chattts": 0.0,
|
||||
"styletts2": 0.0,
|
||||
"vits": 0.0,
|
||||
"bark": 0.0,
|
||||
"piper": 0.0,
|
||||
"edge": 0.0,
|
||||
}
|
||||
|
||||
|
||||
def _calc_llm_cost(model: str, prompt_tokens: int, completion_tokens: int) -> float:
|
||||
pricing = OPENROUTER_PRICING.get(model)
|
||||
if not pricing:
|
||||
return 0.0
|
||||
return (prompt_tokens * pricing["prompt"] + completion_tokens * pricing["completion"]) / 1_000_000
|
||||
|
||||
|
||||
def _calc_tts_cost(provider: str, char_count: int) -> float:
|
||||
rate = TTS_PRICING.get(provider, 0.0)
|
||||
return char_count * rate
|
||||
|
||||
|
||||
class CostTracker:
    """Accumulates LLM and TTS call records for one podcast session.

    Keeps the full per-call records for the post-show report while also
    maintaining running totals, so get_live_summary() is O(1) and safe to
    poll from the live status bar.
    """

    def __init__(self):
        self.llm_records: list[LLMCallRecord] = []
        self.tts_records: list[TTSCallRecord] = []
        # Running totals for fast get_live_summary()
        self._llm_cost: float = 0.0
        self._tts_cost: float = 0.0
        self._llm_calls: int = 0
        self._prompt_tokens: int = 0
        self._completion_tokens: int = 0
        self._total_tokens: int = 0
        # category -> {"cost", "calls", "tokens"}, updated on every LLM call
        self._by_category: dict[str, dict] = {}

    def record_llm_call(
        self,
        category: str,
        model: str,
        usage_data: dict,
        max_tokens: int = 0,
        latency_ms: float = 0.0,
        caller_name: str = "",
    ):
        """Record one LLM call from an OpenRouter-style ``usage`` dict.

        ``usage_data`` is expected to carry ``prompt_tokens``,
        ``completion_tokens`` and ``total_tokens``; missing keys default to 0.
        Updates both the per-call record list and the running totals.
        """
        prompt_tokens = usage_data.get("prompt_tokens", 0)
        completion_tokens = usage_data.get("completion_tokens", 0)
        # Some responses omit total_tokens; fall back to the sum.
        total_tokens = usage_data.get("total_tokens", 0) or (prompt_tokens + completion_tokens)
        cost = _calc_llm_cost(model, prompt_tokens, completion_tokens)

        # Cost stays 0.0 for unpriced models — surface that so totals aren't silently low.
        if not OPENROUTER_PRICING.get(model) and total_tokens > 0:
            print(f"[Costs] Unknown model pricing: {model} ({total_tokens} tokens, cost unknown)")

        record = LLMCallRecord(
            timestamp=time.time(),
            category=category,
            model=model,
            prompt_tokens=prompt_tokens,
            completion_tokens=completion_tokens,
            total_tokens=total_tokens,
            cost_usd=cost,
            caller_name=caller_name,
            max_tokens_requested=max_tokens,
            latency_ms=latency_ms,
        )
        self.llm_records.append(record)

        # Update running totals
        self._llm_cost += cost
        self._llm_calls += 1
        self._prompt_tokens += prompt_tokens
        self._completion_tokens += completion_tokens
        self._total_tokens += total_tokens

        cat = self._by_category.setdefault(category, {"cost": 0.0, "calls": 0, "tokens": 0})
        cat["cost"] += cost
        cat["calls"] += 1
        cat["tokens"] += total_tokens

    def record_tts_call(
        self,
        provider: str,
        voice: str,
        char_count: int,
        caller_name: str = "",
    ):
        """Record one TTS synthesis call and add its cost to the running total.

        NOTE(review): ``caller_name`` is accepted but never stored
        (TTSCallRecord has no such field) — confirm whether that is intended.
        """
        cost = _calc_tts_cost(provider, char_count)
        record = TTSCallRecord(
            timestamp=time.time(),
            provider=provider,
            voice=voice,
            char_count=char_count,
            cost_usd=cost,
        )
        self.tts_records.append(record)
        self._tts_cost += cost

    def get_live_summary(self) -> dict:
        """Return a cheap snapshot of session costs built from the running totals."""
        return {
            "total_cost_usd": round(self._llm_cost + self._tts_cost, 4),
            "llm_cost_usd": round(self._llm_cost, 4),
            "tts_cost_usd": round(self._tts_cost, 4),
            "total_llm_calls": self._llm_calls,
            "total_tokens": self._total_tokens,
            "prompt_tokens": self._prompt_tokens,
            "completion_tokens": self._completion_tokens,
            "by_category": {
                k: {"cost": round(v["cost"], 4), "calls": v["calls"], "tokens": v["tokens"]}
                for k, v in self._by_category.items()
            },
        }

    def generate_report(self) -> dict:
        """Build the full post-show cost report.

        Extends the live summary with per-model, per-caller and per-TTS-provider
        breakdowns, the five most expensive calls, Devon-monitor efficiency
        stats, cost-saving recommendations, and recent session history.
        """
        summary = self.get_live_summary()

        # Per-model breakdown
        by_model: dict[str, dict] = {}
        for r in self.llm_records:
            m = by_model.setdefault(r.model, {"cost": 0.0, "calls": 0, "tokens": 0, "prompt_tokens": 0, "completion_tokens": 0})
            m["cost"] += r.cost_usd
            m["calls"] += 1
            m["tokens"] += r.total_tokens
            m["prompt_tokens"] += r.prompt_tokens
            m["completion_tokens"] += r.completion_tokens

        # Per-caller breakdown
        by_caller: dict[str, dict] = {}
        for r in self.llm_records:
            if not r.caller_name:
                continue
            c = by_caller.setdefault(r.caller_name, {"cost": 0.0, "calls": 0, "tokens": 0})
            c["cost"] += r.cost_usd
            c["calls"] += 1
            c["tokens"] += r.total_tokens

        # Top 5 most expensive calls
        sorted_records = sorted(self.llm_records, key=lambda r: r.cost_usd, reverse=True)
        top_5 = [
            {
                "category": r.category,
                "model": r.model,
                "caller_name": r.caller_name,
                "cost_usd": round(r.cost_usd, 6),
                "total_tokens": r.total_tokens,
                "prompt_tokens": r.prompt_tokens,
                "completion_tokens": r.completion_tokens,
                "latency_ms": round(r.latency_ms, 1),
            }
            for r in sorted_records[:5]
        ]

        # Devon efficiency
        # <20 completion tokens is treated as a "nothing to add" reply —
        # presumably matching the NOTHING_TO_ADD sentinel; confirm threshold.
        devon_total = sum(1 for r in self.llm_records if r.category == "devon_monitor")
        devon_nothing = sum(
            1 for r in self.llm_records
            if r.category == "devon_monitor" and r.completion_tokens < 20
        )
        devon_useful = devon_total - devon_nothing
        devon_cost = sum(r.cost_usd for r in self.llm_records if r.category == "devon_monitor")

        # TTS by provider
        tts_by_provider: dict[str, dict] = {}
        for r in self.tts_records:
            p = tts_by_provider.setdefault(r.provider, {"cost": 0.0, "calls": 0, "chars": 0})
            p["cost"] += r.cost_usd
            p["calls"] += 1
            p["chars"] += r.char_count

        # Avg prompt vs completion ratio
        prompt_ratio = (self._prompt_tokens / self._total_tokens * 100) if self._total_tokens > 0 else 0

        # Recommendations
        recommendations = self._generate_recommendations(
            by_model, devon_total, devon_nothing, devon_cost, prompt_ratio
        )

        # Historical comparison
        history = self._load_history()

        report = {
            **summary,
            "by_model": {k: {kk: round(vv, 4) if isinstance(vv, float) else vv for kk, vv in v.items()} for k, v in by_model.items()},
            "by_caller": {k: {kk: round(vv, 4) if isinstance(vv, float) else vv for kk, vv in v.items()} for k, v in by_caller.items()},
            "top_5_expensive": top_5,
            "devon_efficiency": {
                "total_monitor_calls": devon_total,
                "useful": devon_useful,
                "nothing_to_add": devon_nothing,
                "total_cost": round(devon_cost, 4),
                "waste_pct": round(devon_nothing / devon_total * 100, 1) if devon_total > 0 else 0,
            },
            "tts_by_provider": {k: {kk: round(vv, 4) if isinstance(vv, float) else vv for kk, vv in v.items()} for k, v in tts_by_provider.items()},
            "prompt_token_pct": round(prompt_ratio, 1),
            "recommendations": recommendations,
            "history": history,
        }
        return report

    def _generate_recommendations(
        self,
        by_model: dict,
        devon_total: int,
        devon_nothing: int,
        devon_cost: float,
        prompt_ratio: float,
    ) -> list[str]:
        """Return human-readable cost-saving suggestions for this session.

        Heuristics only: Devon-monitor waste, Sonnet-vs-Haiku savings,
        background generation on an expensive model, prompt-heavy usage,
        and caller-dialog cost dominance. Returns [] for a zero-cost session.
        """
        recs = []
        total = self._llm_cost + self._tts_cost
        if total == 0:
            return recs

        # Devon monitoring waste
        if devon_total > 0:
            waste_pct = devon_nothing / devon_total * 100
            if waste_pct > 60:
                recs.append(
                    f"Devon monitoring: {devon_nothing}/{devon_total} calls returned nothing "
                    f"(${devon_cost:.2f}, {devon_cost/total*100:.0f}% of total). "
                    f"Consider increasing monitor interval from 15s to 25-30s."
                )

        # Model cost comparison
        for model, data in by_model.items():
            if "sonnet" in model and data["calls"] > 5:
                # Re-price the same token volume on Haiku to estimate savings.
                haiku_cost = _calc_llm_cost(
                    "anthropic/claude-haiku-4.5",
                    data["prompt_tokens"],
                    data["completion_tokens"],
                )
                savings = data["cost"] - haiku_cost
                if savings > 0.05:
                    recs.append(
                        f"{model} cost ${data['cost']:.2f} ({data['calls']} calls). "
                        f"Switching to Haiku 4.5 would save ~${savings:.2f} per session."
                    )

        # Background gen on expensive model
        bg = self._by_category.get("background_gen")
        if bg and bg["cost"] > 0.05:
            recs.append(
                f"Background generation: ${bg['cost']:.2f} ({bg['calls']} calls). "
                f"These are JSON outputs — a cheaper model (Gemini Flash, GPT-4o-mini) "
                f"would likely work fine here."
            )

        # Prompt-heavy ratio
        if prompt_ratio > 80:
            recs.append(
                f"Prompt tokens are {prompt_ratio:.0f}% of total usage. "
                f"System prompts and context windows dominate cost. "
                f"Consider trimming system prompt length or reducing context window size."
            )

        # Caller dialog cost dominance
        cd = self._by_category.get("caller_dialog")
        if cd and total > 0 and cd["cost"] / total > 0.6:
            avg_tokens = cd["tokens"] / cd["calls"] if cd["calls"] > 0 else 0
            recs.append(
                f"Caller dialog is {cd['cost']/total*100:.0f}% of costs "
                f"(avg {avg_tokens:.0f} tokens/call). "
                f"Consider using a cheaper model for standard calls and reserving "
                f"the primary model for complex call shapes."
            )

        return recs

    def _load_history(self) -> list[dict]:
        """Load summaries from previous sessions for comparison.

        Reads up to the last five ``session-*.json`` reports from
        data/cost_reports; unreadable files are skipped silently.
        NOTE(review): the glob sort is lexical, so "session-10" sorts before
        "session-2" — fine if session ids are timestamps; confirm.
        """
        history_dir = Path("data/cost_reports")
        if not history_dir.exists():
            return []
        sessions = []
        for f in sorted(history_dir.glob("session-*.json"))[-5:]:
            try:
                data = json.loads(f.read_text())
                sessions.append({
                    "session_id": data.get("session_id", f.stem),
                    "total_cost_usd": data.get("total_cost_usd", 0),
                    "llm_cost_usd": data.get("llm_cost_usd", 0),
                    "tts_cost_usd": data.get("tts_cost_usd", 0),
                    "total_llm_calls": data.get("total_llm_calls", 0),
                    "total_tokens": data.get("total_tokens", 0),
                    "saved_at": data.get("saved_at", 0),
                })
            except Exception:
                continue
        return sessions

    def save(self, filepath: Path):
        """Write the full report (plus raw per-call records) to ``filepath`` as JSON.

        Creates parent directories as needed. The file stem doubles as the
        session id, which _load_history() reads back on later runs.
        """
        filepath.parent.mkdir(parents=True, exist_ok=True)
        report = self.generate_report()
        report["session_id"] = filepath.stem
        report["saved_at"] = time.time()
        report["raw_llm_records"] = [asdict(r) for r in self.llm_records]
        report["raw_tts_records"] = [asdict(r) for r in self.tts_records]
        with open(filepath, "w") as f:
            json.dump(report, f, indent=2)
        print(f"[Costs] Report saved to {filepath}")

    def reset(self):
        """Clear all records and running totals, ready for a new session."""
        self.llm_records.clear()
        self.tts_records.clear()
        self._llm_cost = 0.0
        self._tts_cost = 0.0
        self._llm_calls = 0
        self._prompt_tokens = 0
        self._completion_tokens = 0
        self._total_tokens = 0
        self._by_category.clear()
|
||||
|
||||
|
||||
# Module-level singleton imported by the LLM and TTS services.
cost_tracker = CostTracker()
|
||||
@@ -328,7 +328,7 @@ class InternService:
|
||||
|
||||
# --- Main interface ---
|
||||
|
||||
async def ask(self, question: str, conversation_context: list[dict] | None = None) -> dict:
|
||||
async def ask(self, question: str, conversation_context: list[dict] | None = None, caller_active: bool = False) -> dict:
|
||||
"""Host asks intern a direct question. Returns {text, sources, tool_calls}."""
|
||||
messages = []
|
||||
|
||||
@@ -343,6 +343,13 @@ class InternService:
|
||||
"content": f"CURRENT ON-AIR CONVERSATION:\n{context_text}"
|
||||
})
|
||||
|
||||
# When a caller is on the line, Devon should focus on facts not personal stories
|
||||
if caller_active:
|
||||
messages.append({
|
||||
"role": "system",
|
||||
"content": "A caller is on the line right now. Focus on delivering useful facts, context, and information. Skip personal stories and anecdotes — save those for when it's just you and Luke talking between calls."
|
||||
})
|
||||
|
||||
# Include Devon's own recent conversation history
|
||||
if self._devon_history:
|
||||
messages.extend(self._devon_history[-10:])
|
||||
@@ -357,6 +364,7 @@ class InternService:
|
||||
model=self.model,
|
||||
max_tokens=300,
|
||||
max_tool_rounds=3,
|
||||
category="devon_ask",
|
||||
)
|
||||
|
||||
# Clean up for TTS
|
||||
@@ -388,7 +396,7 @@ class InternService:
|
||||
"tool_calls": tool_calls,
|
||||
}
|
||||
|
||||
async def interject(self, conversation: list[dict]) -> dict | None:
|
||||
async def interject(self, conversation: list[dict], caller_active: bool = False) -> dict | None:
|
||||
"""Intern looks at conversation and decides if there's something worth adding.
|
||||
Returns {text, sources, tool_calls} or None if nothing to add."""
|
||||
if not conversation or len(conversation) < 2:
|
||||
@@ -399,9 +407,16 @@ class InternService:
|
||||
for msg in conversation[-8:]
|
||||
)
|
||||
|
||||
messages = [{
|
||||
"role": "user",
|
||||
"content": (
|
||||
if caller_active:
|
||||
interjection_prompt = (
|
||||
f"You're listening to this conversation on the show:\n\n{context_text}\n\n"
|
||||
"A caller is on the line. Is there a useful fact, context, or piece of information "
|
||||
"you can add to this conversation? Use your tools to look something up if needed. "
|
||||
"Keep it focused — facts and context only, no personal stories or anecdotes right now. "
|
||||
"If you truly have nothing useful to add, say exactly: NOTHING_TO_ADD"
|
||||
)
|
||||
else:
|
||||
interjection_prompt = (
|
||||
f"You're listening to this conversation on the show:\n\n{context_text}\n\n"
|
||||
"You've been listening to this. Is there ANYTHING you want to jump in about? "
|
||||
"Could be a fact you want to look up, a personal story this reminds you of, "
|
||||
@@ -409,7 +424,11 @@ class InternService:
|
||||
"or something you just have to say. You're Devon — you always have something. "
|
||||
"Use your tools if you want to look something up, or just riff. "
|
||||
"If you truly have absolutely nothing, say exactly: NOTHING_TO_ADD"
|
||||
),
|
||||
)
|
||||
|
||||
messages = [{
|
||||
"role": "user",
|
||||
"content": interjection_prompt,
|
||||
}]
|
||||
|
||||
text, tool_calls = await llm_service.generate_with_tools(
|
||||
@@ -420,6 +439,7 @@ class InternService:
|
||||
model=self.model,
|
||||
max_tokens=300,
|
||||
max_tool_rounds=2,
|
||||
category="devon_monitor",
|
||||
)
|
||||
|
||||
text = self._clean_for_tts(text)
|
||||
@@ -443,7 +463,7 @@ class InternService:
|
||||
"tool_calls": tool_calls,
|
||||
}
|
||||
|
||||
async def monitor_conversation(self, get_conversation: callable, on_suggestion: callable):
|
||||
async def monitor_conversation(self, get_conversation: callable, on_suggestion: callable, get_caller_active: callable = None):
|
||||
"""Background task that watches conversation and buffers suggestions.
|
||||
get_conversation() should return the current conversation list.
|
||||
on_suggestion(text, sources) is called when a suggestion is ready."""
|
||||
@@ -465,7 +485,8 @@ class InternService:
|
||||
last_checked_len = len(conversation)
|
||||
|
||||
try:
|
||||
result = await self.interject(conversation)
|
||||
caller_active = get_caller_active() if get_caller_active else False
|
||||
result = await self.interject(conversation, caller_active=caller_active)
|
||||
if result:
|
||||
self.pending_interjection = result["text"]
|
||||
self.pending_sources = result.get("tool_calls", [])
|
||||
@@ -474,12 +495,12 @@ class InternService:
|
||||
except Exception as e:
|
||||
print(f"[Intern] Monitor error: {e}")
|
||||
|
||||
def start_monitoring(self, get_conversation: callable, on_suggestion: callable):
|
||||
def start_monitoring(self, get_conversation: callable, on_suggestion: callable, get_caller_active: callable = None):
|
||||
if self.monitoring:
|
||||
return
|
||||
self.monitoring = True
|
||||
self._monitor_task = asyncio.create_task(
|
||||
self.monitor_conversation(get_conversation, on_suggestion)
|
||||
self.monitor_conversation(get_conversation, on_suggestion, get_caller_active)
|
||||
)
|
||||
print("[Intern] Monitoring started")
|
||||
|
||||
|
||||
@@ -1,9 +1,11 @@
|
||||
"""LLM service with OpenRouter and Ollama support"""
|
||||
|
||||
import json
|
||||
import time
|
||||
import httpx
|
||||
from typing import Optional, Callable, Awaitable
|
||||
from ..config import settings
|
||||
from .cost_tracker import cost_tracker
|
||||
|
||||
|
||||
# Available OpenRouter models
|
||||
@@ -114,13 +116,15 @@ class LLMService:
|
||||
messages: list[dict],
|
||||
system_prompt: Optional[str] = None,
|
||||
max_tokens: Optional[int] = None,
|
||||
response_format: Optional[dict] = None
|
||||
response_format: Optional[dict] = None,
|
||||
category: str = "unknown",
|
||||
caller_name: str = "",
|
||||
) -> str:
|
||||
if system_prompt:
|
||||
messages = [{"role": "system", "content": system_prompt}] + messages
|
||||
|
||||
if self.provider == "openrouter":
|
||||
return await self._call_openrouter_with_fallback(messages, max_tokens=max_tokens, response_format=response_format)
|
||||
return await self._call_openrouter_with_fallback(messages, max_tokens=max_tokens, response_format=response_format, category=category, caller_name=caller_name)
|
||||
else:
|
||||
return await self._call_ollama(messages, max_tokens=max_tokens)
|
||||
|
||||
@@ -133,6 +137,8 @@ class LLMService:
|
||||
model: Optional[str] = None,
|
||||
max_tokens: int = 500,
|
||||
max_tool_rounds: int = 3,
|
||||
category: str = "unknown",
|
||||
caller_name: str = "",
|
||||
) -> tuple[str, list[dict]]:
|
||||
"""Generate a response with OpenRouter function calling.
|
||||
|
||||
@@ -166,6 +172,7 @@ class LLMService:
|
||||
"tool_choice": "auto",
|
||||
}
|
||||
|
||||
start_time = time.time()
|
||||
try:
|
||||
response = await self.client.post(
|
||||
"https://openrouter.ai/api/v1/chat/completions",
|
||||
@@ -185,6 +192,18 @@ class LLMService:
|
||||
print(f"[LLM-Tools] {model} error (round {round_num}): {e}")
|
||||
break
|
||||
|
||||
latency_ms = (time.time() - start_time) * 1000
|
||||
usage = data.get("usage", {})
|
||||
if usage:
|
||||
cost_tracker.record_llm_call(
|
||||
category=category,
|
||||
model=model,
|
||||
usage_data=usage,
|
||||
max_tokens=max_tokens,
|
||||
latency_ms=latency_ms,
|
||||
caller_name=caller_name,
|
||||
)
|
||||
|
||||
choice = data["choices"][0]
|
||||
msg = choice["message"]
|
||||
|
||||
@@ -230,6 +249,7 @@ class LLMService:
|
||||
|
||||
# Exhausted tool rounds or hit an error — do one final call without tools
|
||||
print(f"[LLM-Tools] Finishing after {len(all_tool_calls)} tool calls")
|
||||
start_time = time.time()
|
||||
try:
|
||||
final_payload = {
|
||||
"model": model,
|
||||
@@ -248,17 +268,28 @@ class LLMService:
|
||||
)
|
||||
response.raise_for_status()
|
||||
data = response.json()
|
||||
latency_ms = (time.time() - start_time) * 1000
|
||||
usage = data.get("usage", {})
|
||||
if usage:
|
||||
cost_tracker.record_llm_call(
|
||||
category=category,
|
||||
model=model,
|
||||
usage_data=usage,
|
||||
max_tokens=max_tokens,
|
||||
latency_ms=latency_ms,
|
||||
caller_name=caller_name,
|
||||
)
|
||||
content = data["choices"][0]["message"].get("content", "")
|
||||
return content or "", all_tool_calls
|
||||
except Exception as e:
|
||||
print(f"[LLM-Tools] Final call failed: {e}")
|
||||
return "", all_tool_calls
|
||||
|
||||
async def _call_openrouter_with_fallback(self, messages: list[dict], max_tokens: Optional[int] = None, response_format: Optional[dict] = None) -> str:
|
||||
async def _call_openrouter_with_fallback(self, messages: list[dict], max_tokens: Optional[int] = None, response_format: Optional[dict] = None, category: str = "unknown", caller_name: str = "") -> str:
|
||||
"""Try primary model, then fallback models. Always returns a response."""
|
||||
|
||||
# Try primary model first
|
||||
result = await self._call_openrouter_once(messages, self.openrouter_model, max_tokens=max_tokens, response_format=response_format)
|
||||
result = await self._call_openrouter_once(messages, self.openrouter_model, max_tokens=max_tokens, response_format=response_format, category=category, caller_name=caller_name)
|
||||
if result is not None:
|
||||
return result
|
||||
|
||||
@@ -267,7 +298,7 @@ class LLMService:
|
||||
if model == self.openrouter_model:
|
||||
continue # Already tried
|
||||
print(f"[LLM] Falling back to {model}...")
|
||||
result = await self._call_openrouter_once(messages, model, timeout=8.0, max_tokens=max_tokens)
|
||||
result = await self._call_openrouter_once(messages, model, timeout=8.0, max_tokens=max_tokens, category=category, caller_name=caller_name)
|
||||
if result is not None:
|
||||
return result
|
||||
|
||||
@@ -275,8 +306,9 @@ class LLMService:
|
||||
print("[LLM] All models failed, using canned response")
|
||||
return "Sorry, I totally blanked out for a second. What were you saying?"
|
||||
|
||||
async def _call_openrouter_once(self, messages: list[dict], model: str, timeout: float = 10.0, max_tokens: Optional[int] = None, response_format: Optional[dict] = None) -> str | None:
|
||||
async def _call_openrouter_once(self, messages: list[dict], model: str, timeout: float = 10.0, max_tokens: Optional[int] = None, response_format: Optional[dict] = None, category: str = "unknown", caller_name: str = "") -> str | None:
|
||||
"""Single attempt to call OpenRouter. Returns None on failure (not a fallback string)."""
|
||||
start_time = time.time()
|
||||
try:
|
||||
payload = {
|
||||
"model": model,
|
||||
@@ -300,6 +332,17 @@ class LLMService:
|
||||
)
|
||||
response.raise_for_status()
|
||||
data = response.json()
|
||||
latency_ms = (time.time() - start_time) * 1000
|
||||
usage = data.get("usage", {})
|
||||
if usage:
|
||||
cost_tracker.record_llm_call(
|
||||
category=category,
|
||||
model=model,
|
||||
usage_data=usage,
|
||||
max_tokens=max_tokens or 500,
|
||||
latency_ms=latency_ms,
|
||||
caller_name=caller_name,
|
||||
)
|
||||
content = data["choices"][0]["message"]["content"]
|
||||
if content and content.strip():
|
||||
return content
|
||||
|
||||
@@ -53,7 +53,8 @@ class RegularCallerService:
|
||||
location: str, personality_traits: list[str],
|
||||
first_call_summary: str, voice: str = None,
|
||||
stable_seeds: dict = None,
|
||||
structured_background: dict = None) -> dict:
|
||||
structured_background: dict = None,
|
||||
avatar: str = None) -> dict:
|
||||
"""Promote a first-time caller to regular"""
|
||||
# Retire oldest if at cap
|
||||
if len(self._regulars) >= MAX_REGULARS:
|
||||
@@ -72,6 +73,7 @@ class RegularCallerService:
|
||||
"voice": voice,
|
||||
"stable_seeds": stable_seeds or {},
|
||||
"structured_background": structured_background,
|
||||
"avatar": avatar,
|
||||
"relationships": {},
|
||||
"call_history": [
|
||||
{"summary": first_call_summary, "timestamp": time.time(),
|
||||
|
||||
@@ -8,6 +8,7 @@ import tempfile
|
||||
import torch
|
||||
|
||||
from ..config import settings
|
||||
from .cost_tracker import cost_tracker
|
||||
|
||||
# Patch torch.load for compatibility with PyTorch 2.6+
|
||||
_original_torch_load = torch.load
|
||||
@@ -845,6 +846,7 @@ async def generate_speech(
|
||||
for attempt in range(TTS_MAX_RETRIES):
|
||||
try:
|
||||
audio, sample_rate = await gen_fn(text, voice_id)
|
||||
cost_tracker.record_tts_call(provider, voice_id, len(text))
|
||||
if attempt > 0:
|
||||
print(f"[TTS] Succeeded on retry {attempt}")
|
||||
break
|
||||
|
||||
Reference in New Issue
Block a user