Files
ai-podcast/backend/services/cost_tracker.py
tcpsyn c70f83d04a Cost monitoring, PTT fix, Devon tuning, WEIRD pool expansion, YT thumbnails, LLM SEO, publish ep37
- Add real-time LLM/TTS cost tracking with live status bar display and post-show reports
- Fix PTT bug where Devon suggestion layout shift stopped recording via mouseleave
- Devon: facts-only during calls, full personality between calls
- Double WEIRD topic pool (109→203), bump weight to 14-25%
- Auto-generate YouTube thumbnails with bold hook text in publish pipeline
- LLM SEO: llms.txt, robots.txt for LLM crawlers, structured data, BreadcrumbList schemas
- Publish episode 37

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-15 05:33:27 -06:00

365 lines
13 KiB
Python

"""Cost tracking for LLM and TTS API calls during podcast sessions"""
import json
import time
from dataclasses import dataclass, field, asdict
from pathlib import Path
from typing import Optional
@dataclass
class LLMCallRecord:
    """One recorded LLM API call: when it happened, what it used, what it cost."""

    # Unix timestamp (seconds) taken when the call was recorded.
    timestamp: float
    # Grouping label for reports, e.g. "devon_monitor", "background_gen",
    # "caller_dialog".
    category: str
    # Model identifier; cost lookup matches it against the pricing table keys.
    model: str
    # Token counts as reported in the API usage payload.
    prompt_tokens: int
    completion_tokens: int
    total_tokens: int
    # Cost in USD; 0.0 when the model has no entry in the pricing table.
    cost_usd: float
    # Name the call is attributed to; an empty string means "unattributed"
    # and such records are left out of the per-caller breakdown.
    caller_name: str
    # The max_tokens value the request was made with (0 if not supplied).
    max_tokens_requested: int
    # Latency of the call in milliseconds, as supplied by the caller.
    latency_ms: float
@dataclass
class TTSCallRecord:
    """One recorded text-to-speech call and its character-based cost."""

    # Unix timestamp (seconds) taken when the call was recorded.
    timestamp: float
    # TTS provider key; cost lookup matches it against the TTS pricing table.
    provider: str
    # Voice identifier used for the synthesis request.
    voice: str
    # Number of characters synthesized; cost is charged per character.
    char_count: int
    # Cost in USD; 0.0 for providers that are free or not in the table.
    cost_usd: float
# OpenRouter pricing per 1M tokens (as of March 2026).
# Each row below is (model id, prompt USD, completion USD); the table is
# expanded into {"prompt": ..., "completion": ...} entries keyed by model id.
# NOTE(review): rates are hard-coded snapshots — confirm against the
# provider's current price list before relying on reported totals.
OPENROUTER_PRICING = {
    model_id: {"prompt": prompt_usd, "completion": completion_usd}
    for model_id, prompt_usd, completion_usd in (
        ("anthropic/claude-sonnet-4-5", 3.00, 15.00),
        ("anthropic/claude-haiku-4.5", 0.80, 4.00),
        ("anthropic/claude-3-haiku", 0.25, 1.25),
        ("x-ai/grok-4-fast", 5.00, 15.00),
        ("minimax/minimax-m2-her", 0.50, 1.50),
        ("mistralai/mistral-small-creative", 0.20, 0.60),
        ("deepseek/deepseek-v3.2", 0.14, 0.28),
        ("google/gemini-2.5-flash", 0.15, 0.60),
        ("google/gemini-flash-1.5", 0.075, 0.30),
        ("openai/gpt-4o-mini", 0.15, 0.60),
        ("openai/gpt-4o", 2.50, 10.00),
        ("meta-llama/llama-3.1-8b-instruct", 0.06, 0.06),
    )
}
# TTS pricing per character, in USD.
# Only the first two providers carry a per-character charge; every other
# provider listed is priced at zero.
TTS_PRICING = {
    "inworld": 0.000015,
    "elevenlabs": 0.000030,
    **dict.fromkeys(
        ("kokoro", "f5tts", "chattts", "styletts2", "vits", "bark", "piper", "edge"),
        0.0,
    ),
}
def _calc_llm_cost(model: str, prompt_tokens: int, completion_tokens: int) -> float:
    """Return the USD cost of one LLM call, priced from ``OPENROUTER_PRICING``.

    Args:
        model: Model identifier; must match a pricing-table key exactly.
        prompt_tokens: Number of prompt (input) tokens billed.
        completion_tokens: Number of completion (output) tokens billed.

    Returns:
        The cost in USD, or ``0.0`` when the model has no pricing entry.
    """
    pricing = OPENROUTER_PRICING.get(model)
    if not pricing:
        # Unknown model: report zero rather than guessing a rate.
        return 0.0
    # Table rates are quoted per 1M tokens, hence the final division.
    return (
        prompt_tokens * pricing["prompt"] + completion_tokens * pricing["completion"]
    ) / 1_000_000
def _calc_tts_cost(provider: str, char_count: int) -> float:
    """Return the USD cost of synthesizing ``char_count`` characters.

    Args:
        provider: TTS provider key looked up in ``TTS_PRICING``.
        char_count: Number of characters sent for synthesis.

    Returns:
        ``char_count`` times the provider's per-character rate; providers
        missing from the table are priced at ``0.0``.
    """
    rate = TTS_PRICING.get(provider, 0.0)
    return char_count * rate
class CostTracker:
    """Collect LLM and TTS usage for one session and turn it into reports.

    Every call is stored as a raw record and also folded into running totals,
    so ``get_live_summary()`` never has to rescan the record lists.
    """

    # Directory scanned for previously saved ``session-*.json`` reports.
    HISTORY_DIR = Path("data/cost_reports")
    # Number of most recent saved sessions included in a report's history.
    HISTORY_LIMIT = 5

    def __init__(self):
        self.llm_records: list[LLMCallRecord] = []
        self.tts_records: list[TTSCallRecord] = []
        # Running totals for fast get_live_summary()
        self._llm_cost: float = 0.0
        self._tts_cost: float = 0.0
        self._llm_calls: int = 0
        self._prompt_tokens: int = 0
        self._completion_tokens: int = 0
        self._total_tokens: int = 0
        self._by_category: dict[str, dict] = {}

    def record_llm_call(
        self,
        category: str,
        model: str,
        usage_data: Optional[dict],
        max_tokens: int = 0,
        latency_ms: float = 0.0,
        caller_name: str = "",
    ):
        """Record one LLM call and fold it into the running totals.

        Args:
            category: Grouping label for the per-category breakdown.
            model: Model identifier used for the pricing lookup.
            usage_data: Usage payload with ``prompt_tokens``,
                ``completion_tokens`` and optionally ``total_tokens``. A
                missing payload (``None``) or null/missing fields count as 0.
            max_tokens: The max_tokens value the request was made with.
            latency_ms: Call latency in milliseconds.
            caller_name: Name to attribute the call to; empty for none.
        """
        # Responses without usage info must not break cost tracking, so a
        # missing payload and null fields are both treated as zero tokens.
        usage = usage_data or {}
        prompt_tokens = usage.get("prompt_tokens") or 0
        completion_tokens = usage.get("completion_tokens") or 0
        # Prefer the reported total; fall back to the sum when absent or zero.
        total_tokens = usage.get("total_tokens") or (prompt_tokens + completion_tokens)

        cost = _calc_llm_cost(model, prompt_tokens, completion_tokens)
        if total_tokens > 0 and not OPENROUTER_PRICING.get(model):
            print(f"[Costs] Unknown model pricing: {model} ({total_tokens} tokens, cost unknown)")

        self.llm_records.append(
            LLMCallRecord(
                timestamp=time.time(),
                category=category,
                model=model,
                prompt_tokens=prompt_tokens,
                completion_tokens=completion_tokens,
                total_tokens=total_tokens,
                cost_usd=cost,
                caller_name=caller_name,
                max_tokens_requested=max_tokens,
                latency_ms=latency_ms,
            )
        )

        # Update running totals
        self._llm_cost += cost
        self._llm_calls += 1
        self._prompt_tokens += prompt_tokens
        self._completion_tokens += completion_tokens
        self._total_tokens += total_tokens
        cat = self._by_category.setdefault(category, {"cost": 0.0, "calls": 0, "tokens": 0})
        cat["cost"] += cost
        cat["calls"] += 1
        cat["tokens"] += total_tokens

    def record_tts_call(
        self,
        provider: str,
        voice: str,
        char_count: int,
        caller_name: str = "",
    ):
        """Record one TTS call and add its cost to the running TTS total.

        Args:
            provider: TTS provider key used for the pricing lookup.
            voice: Voice identifier used for the request.
            char_count: Number of characters synthesized.
            caller_name: Accepted for call-site compatibility; it is not
                stored, because ``TTSCallRecord`` has no field for it.
        """
        cost = _calc_tts_cost(provider, char_count)
        self.tts_records.append(
            TTSCallRecord(
                timestamp=time.time(),
                provider=provider,
                voice=voice,
                char_count=char_count,
                cost_usd=cost,
            )
        )
        self._tts_cost += cost

    def get_live_summary(self) -> dict:
        """Return the current totals, built only from the running counters.

        Costs are rounded to 4 decimal places; ``by_category`` maps each
        category to its cost, call count and token count.
        """
        return {
            "total_cost_usd": round(self._llm_cost + self._tts_cost, 4),
            "llm_cost_usd": round(self._llm_cost, 4),
            "tts_cost_usd": round(self._tts_cost, 4),
            "total_llm_calls": self._llm_calls,
            "total_tokens": self._total_tokens,
            "prompt_tokens": self._prompt_tokens,
            "completion_tokens": self._completion_tokens,
            "by_category": {
                k: {"cost": round(v["cost"], 4), "calls": v["calls"], "tokens": v["tokens"]}
                for k, v in self._by_category.items()
            },
        }

    @staticmethod
    def _round_floats(breakdown: dict[str, dict]) -> dict[str, dict]:
        """Return a copy of a two-level breakdown with floats rounded to 4 places."""
        return {
            key: {
                name: round(value, 4) if isinstance(value, float) else value
                for name, value in stats.items()
            }
            for key, stats in breakdown.items()
        }

    def generate_report(self) -> dict:
        """Build the full post-session report.

        Returns:
            The live summary extended with per-model, per-caller and
            per-TTS-provider breakdowns, the five most expensive LLM calls,
            Devon monitor efficiency figures, the prompt-token percentage,
            recommendations, and summaries of previously saved sessions.
        """
        summary = self.get_live_summary()

        # Per-model and per-caller breakdowns, gathered in a single pass.
        by_model: dict[str, dict] = {}
        by_caller: dict[str, dict] = {}
        for r in self.llm_records:
            m = by_model.setdefault(
                r.model,
                {"cost": 0.0, "calls": 0, "tokens": 0, "prompt_tokens": 0, "completion_tokens": 0},
            )
            m["cost"] += r.cost_usd
            m["calls"] += 1
            m["tokens"] += r.total_tokens
            m["prompt_tokens"] += r.prompt_tokens
            m["completion_tokens"] += r.completion_tokens

            # Calls without a caller name are left out of the caller breakdown.
            if r.caller_name:
                c = by_caller.setdefault(r.caller_name, {"cost": 0.0, "calls": 0, "tokens": 0})
                c["cost"] += r.cost_usd
                c["calls"] += 1
                c["tokens"] += r.total_tokens

        # Top 5 most expensive calls
        sorted_records = sorted(self.llm_records, key=lambda r: r.cost_usd, reverse=True)
        top_5 = [
            {
                "category": r.category,
                "model": r.model,
                "caller_name": r.caller_name,
                "cost_usd": round(r.cost_usd, 6),
                "total_tokens": r.total_tokens,
                "prompt_tokens": r.prompt_tokens,
                "completion_tokens": r.completion_tokens,
                "latency_ms": round(r.latency_ms, 1),
            }
            for r in sorted_records[:5]
        ]

        # Devon efficiency: a monitor call with fewer than 20 completion
        # tokens is counted as having had nothing to add.
        devon_calls = [r for r in self.llm_records if r.category == "devon_monitor"]
        devon_total = len(devon_calls)
        devon_nothing = sum(1 for r in devon_calls if r.completion_tokens < 20)
        devon_useful = devon_total - devon_nothing
        devon_cost = sum(r.cost_usd for r in devon_calls)

        # TTS by provider
        tts_by_provider: dict[str, dict] = {}
        for r in self.tts_records:
            p = tts_by_provider.setdefault(r.provider, {"cost": 0.0, "calls": 0, "chars": 0})
            p["cost"] += r.cost_usd
            p["calls"] += 1
            p["chars"] += r.char_count

        # Share of all tokens that were prompt tokens, as a percentage.
        prompt_ratio = (self._prompt_tokens / self._total_tokens * 100) if self._total_tokens > 0 else 0

        recommendations = self._generate_recommendations(
            by_model, devon_total, devon_nothing, devon_cost, prompt_ratio
        )

        # Historical comparison
        history = self._load_history()

        return {
            **summary,
            "by_model": self._round_floats(by_model),
            "by_caller": self._round_floats(by_caller),
            "top_5_expensive": top_5,
            "devon_efficiency": {
                "total_monitor_calls": devon_total,
                "useful": devon_useful,
                "nothing_to_add": devon_nothing,
                "total_cost": round(devon_cost, 4),
                "waste_pct": round(devon_nothing / devon_total * 100, 1) if devon_total > 0 else 0,
            },
            "tts_by_provider": self._round_floats(tts_by_provider),
            "prompt_token_pct": round(prompt_ratio, 1),
            "recommendations": recommendations,
            "history": history,
        }

    def _generate_recommendations(
        self,
        by_model: dict,
        devon_total: int,
        devon_nothing: int,
        devon_cost: float,
        prompt_ratio: float,
    ) -> list[str]:
        """Return human-readable cost-saving suggestions for this session.

        Args:
            by_model: Per-model breakdown with ``cost``, ``calls``,
                ``prompt_tokens`` and ``completion_tokens`` per entry.
            devon_total: Number of Devon monitor calls.
            devon_nothing: Monitor calls counted as having nothing to add.
            devon_cost: Total USD cost of the monitor calls.
            prompt_ratio: Prompt tokens as a percentage of all tokens.

        Returns:
            A list of suggestion strings; empty when the session cost is zero.
        """
        recs = []
        total = self._llm_cost + self._tts_cost
        if total == 0:
            # Nothing was spent, so no cost advice applies (and the
            # percentage-of-total figures below would divide by zero).
            return recs

        # Devon monitoring waste
        if devon_total > 0:
            waste_pct = devon_nothing / devon_total * 100
            if waste_pct > 60:
                recs.append(
                    f"Devon monitoring: {devon_nothing}/{devon_total} calls returned nothing "
                    f"(${devon_cost:.2f}, {devon_cost/total*100:.0f}% of total). "
                    f"Consider increasing monitor interval from 15s to 25-30s."
                )

        # Model cost comparison: re-price Sonnet traffic at the Haiku 4.5 rate.
        for model, data in by_model.items():
            if "sonnet" in model and data["calls"] > 5:
                haiku_cost = _calc_llm_cost(
                    "anthropic/claude-haiku-4.5",
                    data["prompt_tokens"],
                    data["completion_tokens"],
                )
                savings = data["cost"] - haiku_cost
                if savings > 0.05:
                    recs.append(
                        f"{model} cost ${data['cost']:.2f} ({data['calls']} calls). "
                        f"Switching to Haiku 4.5 would save ~${savings:.2f} per session."
                    )

        # Background gen on expensive model
        bg = self._by_category.get("background_gen")
        if bg and bg["cost"] > 0.05:
            recs.append(
                f"Background generation: ${bg['cost']:.2f} ({bg['calls']} calls). "
                f"These are JSON outputs — a cheaper model (Gemini Flash, GPT-4o-mini) "
                f"would likely work fine here."
            )

        # Prompt-heavy ratio
        if prompt_ratio > 80:
            recs.append(
                f"Prompt tokens are {prompt_ratio:.0f}% of total usage. "
                f"System prompts and context windows dominate cost. "
                f"Consider trimming system prompt length or reducing context window size."
            )

        # Caller dialog cost dominance
        cd = self._by_category.get("caller_dialog")
        if cd and total > 0 and cd["cost"] / total > 0.6:
            avg_tokens = cd["tokens"] / cd["calls"] if cd["calls"] > 0 else 0
            recs.append(
                f"Caller dialog is {cd['cost']/total*100:.0f}% of costs "
                f"(avg {avg_tokens:.0f} tokens/call). "
                f"Consider using a cheaper model for standard calls and reserving "
                f"the primary model for complex call shapes."
            )

        return recs

    def _load_history(
        self,
        history_dir: Optional[Path] = None,
        limit: Optional[int] = None,
    ) -> list[dict]:
        """Load summaries from previous sessions for comparison.

        Args:
            history_dir: Directory holding ``session-*.json`` reports;
                defaults to ``HISTORY_DIR``.
            limit: Maximum number of sessions to return, taken from the end
                of the name-sorted file list; defaults to ``HISTORY_LIMIT``.

        Returns:
            One summary dict per readable report. Files that cannot be read,
            are not valid JSON, or do not contain a JSON object are skipped.
        """
        directory = self.HISTORY_DIR if history_dir is None else Path(history_dir)
        keep = self.HISTORY_LIMIT if limit is None else limit
        # A non-positive limit must short-circuit: slicing with [-0:] would
        # return every file instead of none.
        if keep <= 0 or not directory.is_dir():
            return []

        sessions = []
        for path in sorted(directory.glob("session-*.json"))[-keep:]:
            try:
                data = json.loads(path.read_text(encoding="utf-8"))
            except (OSError, ValueError) as exc:
                # ValueError covers both malformed JSON and undecodable bytes.
                # History is best-effort, so report the file and move on.
                print(f"[Costs] Skipping unreadable history file {path.name}: {exc}")
                continue
            if not isinstance(data, dict):
                continue
            sessions.append({
                "session_id": data.get("session_id", path.stem),
                "total_cost_usd": data.get("total_cost_usd", 0),
                "llm_cost_usd": data.get("llm_cost_usd", 0),
                "tts_cost_usd": data.get("tts_cost_usd", 0),
                "total_llm_calls": data.get("total_llm_calls", 0),
                "total_tokens": data.get("total_tokens", 0),
                "saved_at": data.get("saved_at", 0),
            })
        return sessions

    def save(self, filepath: Path):
        """Write the full report plus all raw records to ``filepath`` as JSON.

        Parent directories are created as needed. The file stem is stored as
        the report's ``session_id`` and the current time as ``saved_at``.

        Args:
            filepath: Destination file; a plain string path is accepted too.
        """
        filepath = Path(filepath)
        filepath.parent.mkdir(parents=True, exist_ok=True)

        report = self.generate_report()
        report["session_id"] = filepath.stem
        report["saved_at"] = time.time()
        report["raw_llm_records"] = [asdict(r) for r in self.llm_records]
        report["raw_tts_records"] = [asdict(r) for r in self.tts_records]

        # Explicit encoding keeps the file readable regardless of the
        # platform's default locale.
        with open(filepath, "w", encoding="utf-8") as f:
            json.dump(report, f, indent=2)
        print(f"[Costs] Report saved to {filepath}")

    def reset(self):
        """Drop all records and zero every running total."""
        self.llm_records.clear()
        self.tts_records.clear()
        self._llm_cost = 0.0
        self._tts_cost = 0.0
        self._llm_calls = 0
        self._prompt_tokens = 0
        self._completion_tokens = 0
        self._total_tokens = 0
        self._by_category.clear()
# Module-level tracker instance, created once when this module is imported.
# NOTE(review): presumably the shared singleton other services import to
# record calls — confirm against the call sites.
cost_tracker = CostTracker()