Initial commit: Ten31 Signal Engine (ingest, scoring brain, corpus seeds)
This commit is contained in:
@@ -0,0 +1,65 @@
|
||||
"""Podcast RSS ingestion (§4.1).
|
||||
|
||||
feedparser + conditional GET (ETag/Last-Modified) for efficient incremental polling, with a
|
||||
composite (feed_url, guid) dedup discipline. Many podcast CDNs send no validators and some feeds
|
||||
truncate to recent episodes — for the §7.1 backtest, older episodes may need the show's full
|
||||
archive feed (some hosts expose `?limit=` / a separate archive URL) or a YouTube back-catalog.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import hashlib
|
||||
import time
|
||||
from typing import Any
|
||||
|
||||
import feedparser
|
||||
|
||||
# Default User-Agent string: fetch_feed hands it to feedparser.parse as `agent`
# whenever the caller does not supply its own `user_agent`.
DEFAULT_UA = "Ten31SignalEngine/0.1 (+https://ten31.xyz)"
|
||||
|
||||
|
||||
def fetch_feed(
    url: str,
    *,
    etag: str | None = None,
    modified: str | None = None,
    user_agent: str = DEFAULT_UA,
) -> feedparser.FeedParserDict:
    """Fetch and parse a feed, sending cache validators when the caller has them.

    Args:
        url: Feed URL handed to ``feedparser.parse``.
        etag: Validator forwarded as feedparser's ``etag`` argument.
        modified: Validator forwarded as feedparser's ``modified`` argument.
        user_agent: Forwarded as feedparser's ``agent`` argument.

    Returns:
        Whatever ``feedparser.parse`` returns. On HTTP 304 the result has
        ``.status == 304`` and ``.entries == []``, so the caller can skip it.
    """
    validators = {"etag": etag, "modified": modified}
    return feedparser.parse(url, agent=user_agent, **validators)
|
||||
|
||||
|
||||
def _published_iso(entry: Any) -> str | None:
    """Return the entry's date as ``YYYY-MM-DD``, or None when it has no parsed date.

    ``published_parsed`` is preferred; ``updated_parsed`` is consulted only when the
    former is missing or falsy.
    """
    parsed_time = entry.get("published_parsed")
    if not parsed_time:
        parsed_time = entry.get("updated_parsed")
    return time.strftime("%Y-%m-%d", parsed_time) if parsed_time else None
|
||||
|
||||
|
||||
def _enclosure_audio_url(entry: Any) -> str | None:
    """Return the first usable audio URL advertised by a feed entry, or None.

    Candidates are examined in this order:

    1. items of ``entry["enclosures"]`` (``href`` preferred, then ``url``);
    2. items of ``entry["links"]`` whose ``rel`` is ``"enclosure"`` (``href``).

    A candidate counts as audio when its ``type`` starts with ``audio``. The match is
    case-insensitive because MIME type names are not case-sensitive.

    An audio-typed candidate that carries no URL is skipped instead of ending the
    search, so a later enclosure or the ``links`` fallback can still supply the URL.
    """
    for enc in entry.get("enclosures", []) or []:
        if not str(enc.get("type", "")).lower().startswith("audio"):
            continue
        url = enc.get("href") or enc.get("url")
        if url:
            return url
    # some feeds put audio only in links rel=enclosure
    for link in entry.get("links", []) or []:
        if link.get("rel") != "enclosure":
            continue
        if not str(link.get("type", "")).lower().startswith("audio"):
            continue
        href = link.get("href")
        if href:
            return href
    return None
|
||||
|
||||
|
||||
def _guid(entry: Any) -> str:
    """Return a stable identifier for a feed entry.

    The entry's ``id`` is used when truthy, otherwise its ``link``. When both are
    missing or falsy, the identifier is ``"sha1:"`` followed by the SHA-1 hex digest
    of ``"<title>|<published>"``.
    """
    for key in ("id", "link"):
        candidate = entry.get(key)
        if candidate:
            return str(candidate)
    # No feed-provided identifier: derive one from the title and published text.
    title = entry.get("title", "")
    published = entry.get("published", "")
    digest = hashlib.sha1(f"{title}|{published}".encode()).hexdigest()
    return "sha1:" + digest
|
||||
|
||||
|
||||
def episode_records(parsed: feedparser.FeedParserDict) -> list[dict]:
    """Turn parsed feed entries into normalized episode records.

    Entries for which ``_enclosure_audio_url`` finds no audio URL are left out.
    Every record holds the keys ``guid``, ``title``, ``audio_url``, ``link`` and
    ``published``, in that order.
    """
    records: list[dict] = []
    for entry in parsed.entries:
        audio_url = _enclosure_audio_url(entry)
        if audio_url:
            record = {
                "guid": _guid(entry),
                "title": entry.get("title"),
                "audio_url": audio_url,
                "link": entry.get("link"),
                "published": _published_iso(entry),
            }
            records.append(record)
    return records
|
||||
Reference in New Issue
Block a user