179 lines
7.0 KiB
Python
179 lines
7.0 KiB
Python
"""News service for current events awareness in AI callers"""
|
|
|
|
import asyncio
|
|
import time
|
|
import re
|
|
from dataclasses import dataclass
|
|
from urllib.parse import quote_plus
|
|
from xml.etree import ElementTree
|
|
|
|
import httpx
|
|
|
|
|
|
@dataclass
class NewsItem:
    """A single news article entry parsed from an RSS feed."""

    title: str      # headline text (always non-empty; titleless items are skipped by the parser)
    source: str     # publisher name from the <source> element, "" when the feed omits it
    published: str  # raw pubDate string from the feed, "" when absent
|
|
|
|
|
|
class NewsService:
    """Fetch and cache current-events data from Google News RSS feeds.

    Designed for use from a single asyncio event loop. Two independent
    caches are kept, each guarded by its own ``asyncio.Lock``:

    * top headlines — refreshed at most every 30 minutes,
    * per-topic search results — 10-minute TTL per lowercased query,
      capped at ~50 entries with oldest-first eviction.

    Network failures are best-effort: they are logged and any stale cached
    data is returned instead of raising.
    """

    def __init__(self):
        # Lazily created HTTP client; see the `client` property.
        self._client: httpx.AsyncClient | None = None
        # Headline cache plus wall-clock time of the last successful
        # refresh (0 means "never fetched").
        self._headlines_cache: list[NewsItem] = []
        self._headlines_ts: float = 0
        self._headlines_lock = asyncio.Lock()
        # lowercased query -> (fetch timestamp, parsed items)
        self._search_cache: dict[str, tuple[float, list[NewsItem]]] = {}
        self._search_lock = asyncio.Lock()

    @property
    def client(self) -> httpx.AsyncClient:
        """Shared AsyncClient, re-created if never built or already closed
        (e.g. the service is reused after close())."""
        if self._client is None or self._client.is_closed:
            self._client = httpx.AsyncClient(timeout=10.0)
        return self._client

    async def get_headlines(self) -> list[NewsItem]:
        """Return up to 10 top headlines, cached for 30 minutes.

        The lock is held for the entire refresh so concurrent callers do
        not stampede the feed. On fetch failure, returns the stale cache
        if non-empty, otherwise an empty list; never raises.
        """
        async with self._headlines_lock:
            # Serve from cache while it is younger than 30 minutes.
            if self._headlines_cache and time.time() - self._headlines_ts < 1800:
                return self._headlines_cache

            try:
                resp = await self.client.get("https://news.google.com/rss")
                resp.raise_for_status()
                items = self._parse_rss(resp.text, max_items=10)
                self._headlines_cache = items
                self._headlines_ts = time.time()
                return items
            except Exception as e:
                # Best-effort: log and fall back to stale data if we have any.
                print(f"[News] Headlines fetch failed: {e}")
                if self._headlines_cache:
                    return self._headlines_cache
                return []

    async def search_topic(self, query: str) -> list[NewsItem]:
        """Search Google News for *query*; up to 5 items, cached 10 minutes.

        Unlike get_headlines(), the lock is deliberately RELEASED during
        the network request and re-acquired only for the cache write, so
        one slow search does not serialize all others (asyncio.Lock is
        not reentrant — nesting the acquisitions would deadlock). A failed
        fetch falls back to any cached entry for the query, even an
        expired one; never raises.
        """
        cache_key = query.lower()

        async with self._search_lock:
            if cache_key in self._search_cache:
                ts, items = self._search_cache[cache_key]
                if time.time() - ts < 600:
                    return items

            # Evict oldest when cache too large
            if len(self._search_cache) > 50:
                oldest_key = min(self._search_cache, key=lambda k: self._search_cache[k][0])
                del self._search_cache[oldest_key]

        try:
            encoded = quote_plus(query)
            url = f"https://news.google.com/rss/search?q={encoded}&hl=en-US&gl=US&ceid=US:en"
            resp = await self.client.get(url)
            resp.raise_for_status()
            items = self._parse_rss(resp.text, max_items=5)

            # Re-acquire the lock only for the cache write.
            async with self._search_lock:
                self._search_cache[cache_key] = (time.time(), items)

            return items
        except Exception as e:
            print(f"[News] Search failed for '{query}': {e}")
            # Fall back to a stale cached result (past its TTL) if present.
            async with self._search_lock:
                if cache_key in self._search_cache:
                    return self._search_cache[cache_key][1]
            return []

    def _parse_rss(self, xml_text: str, max_items: int = 10) -> list[NewsItem]:
        """Parse RSS XML into at most *max_items* NewsItem records.

        Items with an empty title are skipped. On a malformed document,
        whatever parsed cleanly before the ParseError is still returned.
        """
        items: list[NewsItem] = []
        try:
            root = ElementTree.fromstring(xml_text)
            for item_el in root.iter("item"):
                if len(items) >= max_items:
                    break
                title = item_el.findtext("title", "").strip()
                source_el = item_el.find("source")
                # <source> may be missing or empty; fall back to "".
                source = source_el.text.strip() if source_el is not None and source_el.text else ""
                published = item_el.findtext("pubDate", "").strip()
                if title:
                    items.append(NewsItem(title=title, source=source, published=published))
        except ElementTree.ParseError as e:
            print(f"[News] RSS parse error: {e}")
        return items

    def format_headlines_for_prompt(self, items: list[NewsItem]) -> str:
        """Render items as a newline-joined bulleted list.

        Each line is "- title (source)", or "- title" when the source
        is empty.
        """
        lines = []
        for item in items:
            if item.source:
                lines.append(f"- {item.title} ({item.source})")
            else:
                lines.append(f"- {item.title}")
        return "\n".join(lines)

    async def close(self):
        """Close the underlying HTTP client if it is open (idempotent)."""
        if self._client and not self._client.is_closed:
            await self._client.aclose()
|
|
|
|
|
|
# Words too common or too filler-like to be useful as news search keywords.
# Consulted by extract_keywords(); all entries are lowercase, so callers
# must lowercase a word before testing membership.
# (Fixed: "been" and "being" previously appeared twice in the literal.)
STOP_WORDS = {
    # Articles, auxiliaries, modals, prepositions, pronouns, determiners
    "the", "a", "an", "is", "are", "was", "were", "be", "been", "being",
    "have", "has", "had", "do", "does", "did", "will", "would", "could",
    "should", "may", "might", "shall", "can", "need", "dare", "ought",
    "used", "to", "of", "in", "for", "on", "with", "at", "by", "from",
    "as", "into", "through", "during", "before", "after", "above", "below",
    "between", "out", "off", "over", "under", "again", "further", "then",
    "once", "here", "there", "when", "where", "why", "how", "all", "both",
    "each", "few", "more", "most", "other", "some", "such", "no", "nor",
    "not", "only", "own", "same", "so", "than", "too", "very", "just",
    "but", "and", "or", "if", "while", "because", "until", "about",
    "that", "this", "these", "those", "what", "which", "who", "whom",
    "it", "its", "he", "him", "his", "she", "her", "they", "them",
    "their", "we", "us", "our", "you", "your", "me", "my", "i",
    # Casual speech fillers
    "yeah", "well", "like", "man", "dude", "okay", "right", "know",
    "think", "mean", "really", "actually", "honestly", "basically",
    "literally", "stuff", "thing", "things", "something", "anything",
    "nothing", "everything", "someone", "anyone", "everyone", "nobody",
    "gonna", "wanna", "gotta", "kinda", "sorta", "dunno",
    "look", "see", "say", "said", "tell", "told", "talk", "talking",
    "feel", "felt", "guess", "sure", "maybe", "probably", "never",
    "always", "still", "even", "much", "many", "also", "got", "get",
    "getting", "going", "come", "came", "make", "made", "take", "took",
    "give", "gave", "want", "keep", "kept", "let", "put", "went",
    "doing", "having", "call", "called", "calling",
    "tonight", "today", "night", "time", "long", "good", "bad",
    "first", "last", "back", "down", "ever", "away", "cant", "dont",
    "didnt", "doesnt", "isnt", "wasnt", "wont", "wouldnt", "couldnt",
    "shouldnt", "aint", "stop", "start", "started", "help",
}
|
|
|
|
|
|
def extract_keywords(text: str, max_keywords: int = 3) -> list[str]:
    """Pull up to *max_keywords* salient keywords out of free-form text.

    Two passes over the whitespace-split words, each word stripped of
    non-word characters first:

      1. Capitalized words not at a sentence start (likely proper nouns),
         kept in their original casing.
      2. Remaining "uncommon" words — longer than 4 characters and not in
         STOP_WORDS — appended lowercased.

    Duplicates are skipped (case-insensitively against pass-1 results).
    Returns an empty list for empty or all-stop-word input.
    """
    # Hoisted: one compiled pattern instead of re.sub() per word.
    non_word = re.compile(r"[^\w]")
    words = text.split()
    keywords: list[str] = []
    # Lowercased forms of collected keywords, maintained incrementally for
    # O(1) dedup in pass 2 (previously rebuilt as a list every iteration).
    seen_lower: set[str] = set()

    # Pass 1: capitalized words (proper nouns) not at sentence start.
    for i, word in enumerate(words):
        clean = non_word.sub("", word)
        if not clean:
            continue
        # Sentence-initial = first word, or previous word ends in .!?
        # (the former "i > 0 and" guard was dead code after "i == 0 or").
        is_sentence_start = i == 0 or words[i - 1].rstrip()[-1:] in ".!?"
        if clean[0].isupper() and not is_sentence_start and clean.lower() not in STOP_WORDS:
            if clean not in keywords:
                keywords.append(clean)
                seen_lower.add(clean.lower())
            if len(keywords) >= max_keywords:
                return keywords

    # Pass 2: uncommon words (>4 chars, not in stop words), lowercased.
    # Note: `clean` is already lowercase here, so the old extra check
    # `clean.capitalize() not in keywords` was implied by the seen test.
    for word in words:
        clean = non_word.sub("", word).lower()
        if len(clean) > 4 and clean not in STOP_WORDS and clean not in seen_lower:
            keywords.append(clean)
            seen_lower.add(clean)
            if len(keywords) >= max_keywords:
                return keywords

    return keywords
|
|
|
|
|
|
# Module-level shared instance; importers use this singleton rather than
# constructing their own NewsService (so the caches are shared).
news_service = NewsService()
|