Initial commit: Ten31 Signal Engine (ingest, scoring brain, corpus seeds)

2026-06-15 09:24:29 -05:00
commit a6aec77506
77 changed files with 6263 additions and 0 deletions
@@ -0,0 +1,65 @@
+"""Podcast RSS ingestion (§4.1).
+
+feedparser + conditional GET (ETag/Last-Modified) for efficient incremental polling, with a
+composite (feed_url, guid) dedup discipline. Many podcast CDNs send no validators and some feeds
+truncate to recent episodes — for the §7.1 backtest, older episodes may need the show's full
+archive feed (some hosts expose `?limit=` / a separate archive URL) or a YouTube back-catalog.
+"""
+from __future__ import annotations
+
+import hashlib
+import time
+from typing import Any
+
+import feedparser
+
+DEFAULT_UA = "Ten31SignalEngine/0.1 (+https://ten31.xyz)"
+
+
+def fetch_feed(url: str, *, etag: str | None = None, modified: str | None = None,
+               user_agent: str = DEFAULT_UA) -> feedparser.FeedParserDict:
+    """Fetch and parse a feed, passing along conditional-GET validators when given.
+
+    `etag` / `modified` are the validators remembered from a previous poll (or None), and
+    `user_agent` is forwarded as feedparser's `agent`. On HTTP 304 the result has
+    .status == 304 and .entries == [] → nothing new, skip it.
+    """
+    validators = {"etag": etag, "modified": modified}
+    return feedparser.parse(url, agent=user_agent, **validators)
+
+
+def _published_iso(entry: Any) -> str | None:
+    """Return the entry's date formatted as YYYY-MM-DD, or None if it has no parsed timestamp.
+
+    `published_parsed` is preferred; `updated_parsed` is the fallback.
+    """
+    for key in ("published_parsed", "updated_parsed"):
+        stamp = entry.get(key)
+        if stamp:
+            return time.strftime("%Y-%m-%d", stamp)
+    return None
+
+
+def _enclosure_audio_url(entry: Any) -> str | None:
+    """Return the first usable audio URL for an entry, or None when it has none.
+
+    `enclosures` are checked first, then `links` whose rel is "enclosure". A candidate counts
+    only if its MIME type starts with "audio" (compared case-insensitively, since MIME type
+    names are case-insensitive) and it actually carries a URL. An audio item with no URL is
+    skipped rather than ending the search, so a later enclosure or link can still supply one.
+    """
+    def is_audio(item: Any) -> bool:
+        return str(item.get("type", "")).strip().lower().startswith("audio")
+
+    for enc in entry.get("enclosures", []) or []:
+        if is_audio(enc):
+            url = enc.get("href") or enc.get("url")
+            if url:
+                return url
+    # some feeds put audio only in links rel=enclosure
+    for link in entry.get("links", []) or []:
+        if link.get("rel") == "enclosure" and is_audio(link):
+            url = link.get("href")
+            if url:
+                return url
+    return None
+
+
+def _guid(entry: Any) -> str:
+    """Return an identifier for an entry: its `id`, else its `link`, else a derived digest.
+
+    When neither `id` nor `link` is present (or both are empty), the result is "sha1:" plus
+    the SHA-1 hex digest of "<title>|<published>".
+    """
+    for field in ("id", "link"):
+        value = entry.get(field)
+        if value:
+            return str(value)
+    title = entry.get("title", "")
+    published = entry.get("published", "")
+    digest = hashlib.sha1(f"{title}|{published}".encode()).hexdigest()
+    return "sha1:" + digest
+
+
+def episode_records(parsed: feedparser.FeedParserDict) -> list[dict]:
+    """Turn parsed feed entries into plain episode dicts, dropping entries without audio.
+
+    Each record carries the keys guid, title, audio_url, link and published, in feed order.
+    """
+    # Pair every entry with its audio URL lazily so the lookup runs once per entry.
+    with_audio = ((entry, _enclosure_audio_url(entry)) for entry in parsed.entries)
+    return [
+        {
+            "guid": _guid(entry),
+            "title": entry.get("title"),
+            "audio_url": audio_url,
+            "link": entry.get("link"),
+            "published": _published_iso(entry),
+        }
+        for entry, audio_url in with_audio
+        if audio_url
+    ]