"""Podcast RSS ingestion (§4.1). feedparser + conditional GET (ETag/Last-Modified) for efficient incremental polling, with a composite (feed_url, guid) dedup discipline. Many podcast CDNs send no validators and some feeds truncate to recent episodes — for the §7.1 backtest, older episodes may need the show's full archive feed (some hosts expose `?limit=` / a separate archive URL) or a YouTube back-catalog. """ from __future__ import annotations import hashlib import time from typing import Any import feedparser DEFAULT_UA = "Ten31SignalEngine/0.1 (+https://ten31.xyz)" def fetch_feed(url: str, *, etag: str | None = None, modified: str | None = None, user_agent: str = DEFAULT_UA) -> feedparser.FeedParserDict: """Conditional GET. On HTTP 304 the result has .status == 304 and .entries == [] → skip.""" return feedparser.parse(url, etag=etag, modified=modified, agent=user_agent) def _published_iso(entry: Any) -> str | None: t = entry.get("published_parsed") or entry.get("updated_parsed") if not t: return None return time.strftime("%Y-%m-%d", t) def _enclosure_audio_url(entry: Any) -> str | None: for enc in entry.get("enclosures", []) or []: if str(enc.get("type", "")).startswith("audio"): return enc.get("href") or enc.get("url") # some feeds put audio only in links rel=enclosure for link in entry.get("links", []) or []: if link.get("rel") == "enclosure" and str(link.get("type", "")).startswith("audio"): return link.get("href") return None def _guid(entry: Any) -> str: g = entry.get("id") or entry.get("link") if g: return str(g) basis = f"{entry.get('title','')}|{entry.get('published','')}" return "sha1:" + hashlib.sha1(basis.encode()).hexdigest() def episode_records(parsed: feedparser.FeedParserDict) -> list[dict]: """Normalize feed entries to episode records. Skips entries with no audio enclosure.""" out: list[dict] = [] for e in parsed.entries: audio = _enclosure_audio_url(e) if not audio: continue out.append({ "guid": _guid(e), "title": e.get("title"), "audio_url": audio, "link": e.get("link"), "published": _published_iso(e), }) return out