Fix research hanging: add timeouts, fix keyword extraction, cache failures

- Google News RSS returns 302: add follow_redirects and User-Agent header
- Cache failed headline fetches for 5min so they don't retry every call
- Add 8s timeout on background research tasks
- Fix keyword extraction: skip short texts, require 2+ proper nouns (not names),
  increase min word length to 6, add radio show filler to stop words
- Stops garbage searches like "Megan welcome" and "sounds thats youre"

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-06 21:25:31 -07:00
parent b3fb3b1127
commit 69b7078142
2 changed files with 49 additions and 20 deletions
@@ -661,15 +661,19 @@ async def _background_research(text: str):
    if query.lower() in session.research_notes:
        return
    try:
        async with asyncio.timeout(8):
            results = await news_service.search_topic(query)
            if results:
                session.research_notes[query.lower()] = results
                print(f"[Research] Found {len(results)} results for '{query}'")
    except TimeoutError:
        print(f"[Research] Timed out for '{query}'")
    except Exception as e:
        print(f"[Research] Error: {e}")
 def _build_news_context() -> tuple[str, str]:
    """Build context from cached news/research only — never does network calls."""
    news_context = ""
    if session.news_headlines:
        news_context = news_service.format_headlines_for_prompt(session.news_headlines[:6])
@@ -34,11 +34,16 @@ class NewsService:
    async def get_headlines(self) -> list[NewsItem]:
        async with self._headlines_lock:
-            if self._headlines_cache and time.time() - self._headlines_ts < 1800:
+            # Cache for 30min on success, 5min on failure (avoid hammering)
            if time.time() - self._headlines_ts < (1800 if self._headlines_cache else 300):
                return self._headlines_cache
            try:
-                resp = await self.client.get("https://news.google.com/rss")
+                resp = await self.client.get(
                    "https://news.google.com/rss",
                    follow_redirects=True,
                    headers={"User-Agent": "Mozilla/5.0"}
                )
                resp.raise_for_status()
                items = self._parse_rss(resp.text, max_items=10)
                self._headlines_cache = items
@@ -46,9 +51,8 @@ class NewsService:
                return items
            except Exception as e:
                print(f"[News] Headlines fetch failed: {e}")
-                if self._headlines_cache:
+                self._headlines_ts = time.time()  # Don't retry immediately
                return self._headlines_cache
                return []
    async def search_topic(self, query: str) -> list[NewsItem]:
        cache_key = query.lower()
@@ -67,7 +71,7 @@ class NewsService:
        try:
            encoded = quote_plus(query)
            url = f"https://news.google.com/rss/search?q={encoded}&hl=en-US&gl=US&ceid=US:en"
-            resp = await self.client.get(url)
+            resp = await self.client.get(url, follow_redirects=True, headers={"User-Agent": "Mozilla/5.0"})
            resp.raise_for_status()
            items = self._parse_rss(resp.text, max_items=5)
@@ -143,31 +147,52 @@ STOP_WORDS = {
    "first", "last", "back", "down", "ever", "away", "cant", "dont",
    "didnt", "doesnt", "isnt", "wasnt", "wont", "wouldnt", "couldnt",
    "shouldnt", "aint", "stop", "start", "started", "help",
    # Radio show filler
    "welcome", "thanks", "thank", "show", "roost", "luke", "whats",
    "youre", "thats", "heres", "theyre", "ive", "youve", "weve",
    "sounds", "sounds", "listen", "hear", "heard", "happen", "happened",
    "happening", "absolutely", "definitely", "exactly", "totally",
    "pretty", "little", "whole", "every", "point", "sense", "real",
    "great", "cool", "awesome", "amazing", "crazy", "weird", "funny",
    "tough", "hard", "wrong", "true", "trying", "tried", "works",
    "working", "anymore", "already", "enough", "though", "whatever",
    "theres", "making", "saying", "keeping", "possible", "instead",
    "front", "behind", "course", "talks", "happens", "watch",
    "everybodys", "pants", "husband", "client",
 }
 def extract_keywords(text: str, max_keywords: int = 3) -> list[str]:
    words = text.split()
    if len(words) < 8:
        return []  # Too short to extract meaningful topics
    keywords = []
-    # Pass 1: capitalized words (proper nouns) not at sentence start
+    # Only look for proper nouns that are likely real topics (not caller names)
    # Skip first few words (usually greetings) and single proper nouns (usually names)
    proper_nouns = []
    for i, word in enumerate(words):
        clean = re.sub(r'[^\w]', '', word)
-        if not clean:
+        if not clean or len(clean) < 3:
            continue
        is_sentence_start = i == 0 or (i > 0 and words[i - 1].rstrip()[-1:] in '.!?')
        if clean[0].isupper() and not is_sentence_start and clean.lower() not in STOP_WORDS:
-            if clean not in keywords:
+            proper_nouns.append(clean)
-                keywords.append(clean)
+
    # Only use proper nouns if we found 2+ (single one is probably a name)
    if len(proper_nouns) >= 2:
        for noun in proper_nouns[:max_keywords]:
            if noun not in keywords:
                keywords.append(noun)
        if len(keywords) >= max_keywords:
            return keywords
-    # Pass 2: uncommon words (>4 chars, not in stop words)
+    # Pass 2: uncommon words (>5 chars, not in stop words)
    for word in words:
        clean = re.sub(r'[^\w]', '', word).lower()
-        if len(clean) > 4 and clean not in STOP_WORDS:
+        if len(clean) > 5 and clean not in STOP_WORDS:
-            title_clean = clean.capitalize()
+            if clean not in [k.lower() for k in keywords]:
            if title_clean not in keywords and clean not in [k.lower() for k in keywords]:
                keywords.append(clean)
            if len(keywords) >= max_keywords:
                return keywords