Fix research hanging: add timeouts, fix keyword extraction, cache failures

- Google News RSS returns 302: add follow_redirects and User-Agent header
- Cache failed headline fetches for 5min so they don't retry every call
- Add 8s timeout on background research tasks
- Fix keyword extraction: skip short texts, require 2+ proper nouns (not names), increase min word length to 6, add radio show filler to stop words
- Stops garbage searches like "Megan welcome" and "sounds thats youre"

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-06 21:25:31 -07:00
parent b3fb3b1127
commit 69b7078142
2 changed files with 49 additions and 20 deletions
@@ -661,15 +661,19 @@ async def _background_research(text: str):
    if query.lower() in session.research_notes:
        return
    try:
-        results = await news_service.search_topic(query)
-        if results:
-            session.research_notes[query.lower()] = results
-            print(f"[Research] Found {len(results)} results for '{query}'")
+        async with asyncio.timeout(8):
+            results = await news_service.search_topic(query)
+            if results:
+                session.research_notes[query.lower()] = results
+                print(f"[Research] Found {len(results)} results for '{query}'")
+    except TimeoutError:
+        print(f"[Research] Timed out for '{query}'")
    except Exception as e:
        print(f"[Research] Error: {e}")


 def _build_news_context() -> tuple[str, str]:
+    """Build context from cached news/research only — never does network calls."""
    news_context = ""
    if session.news_headlines:
        news_context = news_service.format_headlines_for_prompt(session.news_headlines[:6])
@@ -34,11 +34,16 @@ class NewsService:

    async def get_headlines(self) -> list[NewsItem]:
        async with self._headlines_lock:
-            if self._headlines_cache and time.time() - self._headlines_ts < 1800:
+            # Cache for 30min on success, 5min on failure (avoid hammering)
+            if time.time() - self._headlines_ts < (1800 if self._headlines_cache else 300):
                return self._headlines_cache

            try:
-                resp = await self.client.get("https://news.google.com/rss")
+                resp = await self.client.get(
+                    "https://news.google.com/rss",
+                    follow_redirects=True,
+                    headers={"User-Agent": "Mozilla/5.0"}
+                )
                resp.raise_for_status()
                items = self._parse_rss(resp.text, max_items=10)
                self._headlines_cache = items
@@ -46,9 +51,8 @@ class NewsService:
                return items
            except Exception as e:
                print(f"[News] Headlines fetch failed: {e}")
-                if self._headlines_cache:
-                    return self._headlines_cache
-                return []
+                self._headlines_ts = time.time()  # Don't retry immediately
+                return self._headlines_cache

    async def search_topic(self, query: str) -> list[NewsItem]:
        cache_key = query.lower()
@@ -67,7 +71,7 @@ class NewsService:
        try:
            encoded = quote_plus(query)
            url = f"https://news.google.com/rss/search?q={encoded}&hl=en-US&gl=US&ceid=US:en"
-            resp = await self.client.get(url)
+            resp = await self.client.get(url, follow_redirects=True, headers={"User-Agent": "Mozilla/5.0"})
            resp.raise_for_status()
            items = self._parse_rss(resp.text, max_items=5)

@@ -143,31 +147,52 @@ STOP_WORDS = {
    "first", "last", "back", "down", "ever", "away", "cant", "dont",
    "didnt", "doesnt", "isnt", "wasnt", "wont", "wouldnt", "couldnt",
    "shouldnt", "aint", "stop", "start", "started", "help",
+    # Radio show filler
+    "welcome", "thanks", "thank", "show", "roost", "luke", "whats",
+    "youre", "thats", "heres", "theyre", "ive", "youve", "weve",
+    "sounds", "sounds", "listen", "hear", "heard", "happen", "happened",
+    "happening", "absolutely", "definitely", "exactly", "totally",
+    "pretty", "little", "whole", "every", "point", "sense", "real",
+    "great", "cool", "awesome", "amazing", "crazy", "weird", "funny",
+    "tough", "hard", "wrong", "true", "trying", "tried", "works",
+    "working", "anymore", "already", "enough", "though", "whatever",
+    "theres", "making", "saying", "keeping", "possible", "instead",
+    "front", "behind", "course", "talks", "happens", "watch",
+    "everybodys", "pants", "husband", "client",
 }


 def extract_keywords(text: str, max_keywords: int = 3) -> list[str]:
    words = text.split()
+    if len(words) < 8:
+        return []  # Too short to extract meaningful topics
+
    keywords = []

-    # Pass 1: capitalized words (proper nouns) not at sentence start
+    # Only look for proper nouns that are likely real topics (not caller names)
+    # Skip first few words (usually greetings) and single proper nouns (usually names)
+    proper_nouns = []
    for i, word in enumerate(words):
        clean = re.sub(r'[^\w]', '', word)
-        if not clean:
+        if not clean or len(clean) < 3:
            continue
        is_sentence_start = i == 0 or (i > 0 and words[i - 1].rstrip()[-1:] in '.!?')
        if clean[0].isupper() and not is_sentence_start and clean.lower() not in STOP_WORDS:
-            if clean not in keywords:
-                keywords.append(clean)
-            if len(keywords) >= max_keywords:
-                return keywords
+            proper_nouns.append(clean)

-    # Pass 2: uncommon words (>4 chars, not in stop words)
+    # Only use proper nouns if we found 2+ (single one is probably a name)
+    if len(proper_nouns) >= 2:
+        for noun in proper_nouns[:max_keywords]:
+            if noun not in keywords:
+                keywords.append(noun)
+        if len(keywords) >= max_keywords:
+            return keywords
+
+    # Pass 2: uncommon words (>5 chars, not in stop words)
    for word in words:
        clean = re.sub(r'[^\w]', '', word).lower()
-        if len(clean) > 4 and clean not in STOP_WORDS:
-            title_clean = clean.capitalize()
-            if title_clean not in keywords and clean not in [k.lower() for k in keywords]:
+        if len(clean) > 5 and clean not in STOP_WORDS:
+            if clean not in [k.lower() for k in keywords]:
                keywords.append(clean)
            if len(keywords) >= max_keywords:
                return keywords