Fix research hanging: add timeouts, fix keyword extraction, cache failures
- Google News RSS returns 302: add follow_redirects and a User-Agent header
- Cache failed headline fetches for 5 min so they are not retried on every call
- Add an 8s timeout on background research tasks
- Fix keyword extraction: skip short texts, require 2+ proper nouns (a single proper noun is usually a caller's name, not a topic), increase min word length to 6, add radio show filler to stop words
- Stops garbage searches like "Megan welcome" and "sounds thats youre"

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -661,15 +661,19 @@ async def _background_research(text: str):
|
|||||||
if query.lower() in session.research_notes:
|
if query.lower() in session.research_notes:
|
||||||
return
|
return
|
||||||
try:
|
try:
|
||||||
|
async with asyncio.timeout(8):
|
||||||
results = await news_service.search_topic(query)
|
results = await news_service.search_topic(query)
|
||||||
if results:
|
if results:
|
||||||
session.research_notes[query.lower()] = results
|
session.research_notes[query.lower()] = results
|
||||||
print(f"[Research] Found {len(results)} results for '{query}'")
|
print(f"[Research] Found {len(results)} results for '{query}'")
|
||||||
|
except TimeoutError:
|
||||||
|
print(f"[Research] Timed out for '{query}'")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"[Research] Error: {e}")
|
print(f"[Research] Error: {e}")
|
||||||
|
|
||||||
|
|
||||||
def _build_news_context() -> tuple[str, str]:
|
def _build_news_context() -> tuple[str, str]:
|
||||||
|
"""Build context from cached news/research only — never does network calls."""
|
||||||
news_context = ""
|
news_context = ""
|
||||||
if session.news_headlines:
|
if session.news_headlines:
|
||||||
news_context = news_service.format_headlines_for_prompt(session.news_headlines[:6])
|
news_context = news_service.format_headlines_for_prompt(session.news_headlines[:6])
|
||||||
|
|||||||
@@ -34,11 +34,16 @@ class NewsService:
|
|||||||
|
|
||||||
async def get_headlines(self) -> list[NewsItem]:
|
async def get_headlines(self) -> list[NewsItem]:
|
||||||
async with self._headlines_lock:
|
async with self._headlines_lock:
|
||||||
if self._headlines_cache and time.time() - self._headlines_ts < 1800:
|
# Cache for 30min on success, 5min on failure (avoid hammering)
|
||||||
|
if time.time() - self._headlines_ts < (1800 if self._headlines_cache else 300):
|
||||||
return self._headlines_cache
|
return self._headlines_cache
|
||||||
|
|
||||||
try:
|
try:
|
||||||
resp = await self.client.get("https://news.google.com/rss")
|
resp = await self.client.get(
|
||||||
|
"https://news.google.com/rss",
|
||||||
|
follow_redirects=True,
|
||||||
|
headers={"User-Agent": "Mozilla/5.0"}
|
||||||
|
)
|
||||||
resp.raise_for_status()
|
resp.raise_for_status()
|
||||||
items = self._parse_rss(resp.text, max_items=10)
|
items = self._parse_rss(resp.text, max_items=10)
|
||||||
self._headlines_cache = items
|
self._headlines_cache = items
|
||||||
@@ -46,9 +51,8 @@ class NewsService:
|
|||||||
return items
|
return items
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"[News] Headlines fetch failed: {e}")
|
print(f"[News] Headlines fetch failed: {e}")
|
||||||
if self._headlines_cache:
|
self._headlines_ts = time.time() # Don't retry immediately
|
||||||
return self._headlines_cache
|
return self._headlines_cache
|
||||||
return []
|
|
||||||
|
|
||||||
async def search_topic(self, query: str) -> list[NewsItem]:
|
async def search_topic(self, query: str) -> list[NewsItem]:
|
||||||
cache_key = query.lower()
|
cache_key = query.lower()
|
||||||
@@ -67,7 +71,7 @@ class NewsService:
|
|||||||
try:
|
try:
|
||||||
encoded = quote_plus(query)
|
encoded = quote_plus(query)
|
||||||
url = f"https://news.google.com/rss/search?q={encoded}&hl=en-US&gl=US&ceid=US:en"
|
url = f"https://news.google.com/rss/search?q={encoded}&hl=en-US&gl=US&ceid=US:en"
|
||||||
resp = await self.client.get(url)
|
resp = await self.client.get(url, follow_redirects=True, headers={"User-Agent": "Mozilla/5.0"})
|
||||||
resp.raise_for_status()
|
resp.raise_for_status()
|
||||||
items = self._parse_rss(resp.text, max_items=5)
|
items = self._parse_rss(resp.text, max_items=5)
|
||||||
|
|
||||||
@@ -143,31 +147,52 @@ STOP_WORDS = {
|
|||||||
"first", "last", "back", "down", "ever", "away", "cant", "dont",
|
"first", "last", "back", "down", "ever", "away", "cant", "dont",
|
||||||
"didnt", "doesnt", "isnt", "wasnt", "wont", "wouldnt", "couldnt",
|
"didnt", "doesnt", "isnt", "wasnt", "wont", "wouldnt", "couldnt",
|
||||||
"shouldnt", "aint", "stop", "start", "started", "help",
|
"shouldnt", "aint", "stop", "start", "started", "help",
|
||||||
|
# Radio show filler
|
||||||
|
"welcome", "thanks", "thank", "show", "roost", "luke", "whats",
|
||||||
|
"youre", "thats", "heres", "theyre", "ive", "youve", "weve",
|
||||||
|
"sounds", "sounds", "listen", "hear", "heard", "happen", "happened",
|
||||||
|
"happening", "absolutely", "definitely", "exactly", "totally",
|
||||||
|
"pretty", "little", "whole", "every", "point", "sense", "real",
|
||||||
|
"great", "cool", "awesome", "amazing", "crazy", "weird", "funny",
|
||||||
|
"tough", "hard", "wrong", "true", "trying", "tried", "works",
|
||||||
|
"working", "anymore", "already", "enough", "though", "whatever",
|
||||||
|
"theres", "making", "saying", "keeping", "possible", "instead",
|
||||||
|
"front", "behind", "course", "talks", "happens", "watch",
|
||||||
|
"everybodys", "pants", "husband", "client",
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def extract_keywords(text: str, max_keywords: int = 3) -> list[str]:
|
def extract_keywords(text: str, max_keywords: int = 3) -> list[str]:
|
||||||
words = text.split()
|
words = text.split()
|
||||||
|
if len(words) < 8:
|
||||||
|
return [] # Too short to extract meaningful topics
|
||||||
|
|
||||||
keywords = []
|
keywords = []
|
||||||
|
|
||||||
# Pass 1: capitalized words (proper nouns) not at sentence start
|
# Only look for proper nouns that are likely real topics (not caller names)
|
||||||
|
# Skip first few words (usually greetings) and single proper nouns (usually names)
|
||||||
|
proper_nouns = []
|
||||||
for i, word in enumerate(words):
|
for i, word in enumerate(words):
|
||||||
clean = re.sub(r'[^\w]', '', word)
|
clean = re.sub(r'[^\w]', '', word)
|
||||||
if not clean:
|
if not clean or len(clean) < 3:
|
||||||
continue
|
continue
|
||||||
is_sentence_start = i == 0 or (i > 0 and words[i - 1].rstrip()[-1:] in '.!?')
|
is_sentence_start = i == 0 or (i > 0 and words[i - 1].rstrip()[-1:] in '.!?')
|
||||||
if clean[0].isupper() and not is_sentence_start and clean.lower() not in STOP_WORDS:
|
if clean[0].isupper() and not is_sentence_start and clean.lower() not in STOP_WORDS:
|
||||||
if clean not in keywords:
|
proper_nouns.append(clean)
|
||||||
keywords.append(clean)
|
|
||||||
|
# Only use proper nouns if we found 2+ (single one is probably a name)
|
||||||
|
if len(proper_nouns) >= 2:
|
||||||
|
for noun in proper_nouns[:max_keywords]:
|
||||||
|
if noun not in keywords:
|
||||||
|
keywords.append(noun)
|
||||||
if len(keywords) >= max_keywords:
|
if len(keywords) >= max_keywords:
|
||||||
return keywords
|
return keywords
|
||||||
|
|
||||||
# Pass 2: uncommon words (>4 chars, not in stop words)
|
# Pass 2: uncommon words (>5 chars, not in stop words)
|
||||||
for word in words:
|
for word in words:
|
||||||
clean = re.sub(r'[^\w]', '', word).lower()
|
clean = re.sub(r'[^\w]', '', word).lower()
|
||||||
if len(clean) > 4 and clean not in STOP_WORDS:
|
if len(clean) > 5 and clean not in STOP_WORDS:
|
||||||
title_clean = clean.capitalize()
|
if clean not in [k.lower() for k in keywords]:
|
||||||
if title_clean not in keywords and clean not in [k.lower() for k in keywords]:
|
|
||||||
keywords.append(clean)
|
keywords.append(clean)
|
||||||
if len(keywords) >= max_keywords:
|
if len(keywords) >= max_keywords:
|
||||||
return keywords
|
return keywords
|
||||||
|
|||||||
Reference in New Issue
Block a user