Fix research hanging: add timeouts, fix keyword extraction, cache failures
- Google News RSS returns 302: add follow_redirects and a User-Agent header
- Cache failed headline fetches for 5 min so they don't retry on every call
- Add an 8 s timeout on background research tasks
- Fix keyword extraction: skip short texts, require 2+ proper nouns (a single one is usually a caller's name, not a topic), raise the minimum word length to 6, and add radio-show filler words to the stop list
- Stops garbage searches like "Megan welcome" and "sounds thats youre"

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -661,15 +661,19 @@ async def _background_research(text: str):
|
||||
if query.lower() in session.research_notes:
|
||||
return
|
||||
try:
|
||||
results = await news_service.search_topic(query)
|
||||
if results:
|
||||
session.research_notes[query.lower()] = results
|
||||
print(f"[Research] Found {len(results)} results for '{query}'")
|
||||
async with asyncio.timeout(8):
|
||||
results = await news_service.search_topic(query)
|
||||
if results:
|
||||
session.research_notes[query.lower()] = results
|
||||
print(f"[Research] Found {len(results)} results for '{query}'")
|
||||
except TimeoutError:
|
||||
print(f"[Research] Timed out for '{query}'")
|
||||
except Exception as e:
|
||||
print(f"[Research] Error: {e}")
|
||||
|
||||
|
||||
def _build_news_context() -> tuple[str, str]:
|
||||
"""Build context from cached news/research only — never does network calls."""
|
||||
news_context = ""
|
||||
if session.news_headlines:
|
||||
news_context = news_service.format_headlines_for_prompt(session.news_headlines[:6])
|
||||
|
||||
@@ -34,11 +34,16 @@ class NewsService:
|
||||
|
||||
async def get_headlines(self) -> list[NewsItem]:
|
||||
async with self._headlines_lock:
|
||||
if self._headlines_cache and time.time() - self._headlines_ts < 1800:
|
||||
# Cache for 30min on success, 5min on failure (avoid hammering)
|
||||
if time.time() - self._headlines_ts < (1800 if self._headlines_cache else 300):
|
||||
return self._headlines_cache
|
||||
|
||||
try:
|
||||
resp = await self.client.get("https://news.google.com/rss")
|
||||
resp = await self.client.get(
|
||||
"https://news.google.com/rss",
|
||||
follow_redirects=True,
|
||||
headers={"User-Agent": "Mozilla/5.0"}
|
||||
)
|
||||
resp.raise_for_status()
|
||||
items = self._parse_rss(resp.text, max_items=10)
|
||||
self._headlines_cache = items
|
||||
@@ -46,9 +51,8 @@ class NewsService:
|
||||
return items
|
||||
except Exception as e:
|
||||
print(f"[News] Headlines fetch failed: {e}")
|
||||
if self._headlines_cache:
|
||||
return self._headlines_cache
|
||||
return []
|
||||
self._headlines_ts = time.time() # Don't retry immediately
|
||||
return self._headlines_cache
|
||||
|
||||
async def search_topic(self, query: str) -> list[NewsItem]:
|
||||
cache_key = query.lower()
|
||||
@@ -67,7 +71,7 @@ class NewsService:
|
||||
try:
|
||||
encoded = quote_plus(query)
|
||||
url = f"https://news.google.com/rss/search?q={encoded}&hl=en-US&gl=US&ceid=US:en"
|
||||
resp = await self.client.get(url)
|
||||
resp = await self.client.get(url, follow_redirects=True, headers={"User-Agent": "Mozilla/5.0"})
|
||||
resp.raise_for_status()
|
||||
items = self._parse_rss(resp.text, max_items=5)
|
||||
|
||||
@@ -143,31 +147,52 @@ STOP_WORDS = {
|
||||
"first", "last", "back", "down", "ever", "away", "cant", "dont",
|
||||
"didnt", "doesnt", "isnt", "wasnt", "wont", "wouldnt", "couldnt",
|
||||
"shouldnt", "aint", "stop", "start", "started", "help",
|
||||
# Radio show filler
|
||||
"welcome", "thanks", "thank", "show", "roost", "luke", "whats",
|
||||
"youre", "thats", "heres", "theyre", "ive", "youve", "weve",
|
||||
"sounds", "sounds", "listen", "hear", "heard", "happen", "happened",
|
||||
"happening", "absolutely", "definitely", "exactly", "totally",
|
||||
"pretty", "little", "whole", "every", "point", "sense", "real",
|
||||
"great", "cool", "awesome", "amazing", "crazy", "weird", "funny",
|
||||
"tough", "hard", "wrong", "true", "trying", "tried", "works",
|
||||
"working", "anymore", "already", "enough", "though", "whatever",
|
||||
"theres", "making", "saying", "keeping", "possible", "instead",
|
||||
"front", "behind", "course", "talks", "happens", "watch",
|
||||
"everybodys", "pants", "husband", "client",
|
||||
}
|
||||
|
||||
|
||||
def extract_keywords(text: str, max_keywords: int = 3) -> list[str]:
|
||||
words = text.split()
|
||||
if len(words) < 8:
|
||||
return [] # Too short to extract meaningful topics
|
||||
|
||||
keywords = []
|
||||
|
||||
# Pass 1: capitalized words (proper nouns) not at sentence start
|
||||
# Only look for proper nouns that are likely real topics (not caller names)
|
||||
# Skip first few words (usually greetings) and single proper nouns (usually names)
|
||||
proper_nouns = []
|
||||
for i, word in enumerate(words):
|
||||
clean = re.sub(r'[^\w]', '', word)
|
||||
if not clean:
|
||||
if not clean or len(clean) < 3:
|
||||
continue
|
||||
is_sentence_start = i == 0 or (i > 0 and words[i - 1].rstrip()[-1:] in '.!?')
|
||||
if clean[0].isupper() and not is_sentence_start and clean.lower() not in STOP_WORDS:
|
||||
if clean not in keywords:
|
||||
keywords.append(clean)
|
||||
if len(keywords) >= max_keywords:
|
||||
return keywords
|
||||
proper_nouns.append(clean)
|
||||
|
||||
# Pass 2: uncommon words (>4 chars, not in stop words)
|
||||
# Only use proper nouns if we found 2+ (single one is probably a name)
|
||||
if len(proper_nouns) >= 2:
|
||||
for noun in proper_nouns[:max_keywords]:
|
||||
if noun not in keywords:
|
||||
keywords.append(noun)
|
||||
if len(keywords) >= max_keywords:
|
||||
return keywords
|
||||
|
||||
# Pass 2: uncommon words (>5 chars, not in stop words)
|
||||
for word in words:
|
||||
clean = re.sub(r'[^\w]', '', word).lower()
|
||||
if len(clean) > 4 and clean not in STOP_WORDS:
|
||||
title_clean = clean.capitalize()
|
||||
if title_clean not in keywords and clean not in [k.lower() for k in keywords]:
|
||||
if len(clean) > 5 and clean not in STOP_WORDS:
|
||||
if clean not in [k.lower() for k in keywords]:
|
||||
keywords.append(clean)
|
||||
if len(keywords) >= max_keywords:
|
||||
return keywords
|
||||
|
||||
Reference in New Issue
Block a user