Initial commit: Ten31 Signal Engine (ingest, scoring brain, corpus seeds)

2026-06-15 09:24:29 -05:00
commit a6aec77506
77 changed files with 6263 additions and 0 deletions
@@ -0,0 +1,28 @@
+"""Small shared utilities (normalization, dedup keys)."""
+from __future__ import annotations
+
+import re
+
+# Trailing "<separator> ... podcast/show/ep N ..." decoration, e.g. "Foo | The Show" -> "Foo".
+# The leading `\b` stops a keyword from starting mid-word ("Step 2", "slideshow"), and `\.?`
+# accepts "Ep. 42" the same way _EP_PREFIX does.
+_SHOW_SUFFIX = re.compile(r"\s*[|\-–—]\s*[^|\-–—]*\b(podcast|show|ep(isode)?\.?\s*\d+).*$", re.I)
+# Leading episode numbering, e.g. "Ep 42: Foo", "#42 - Foo", "42 – Foo". The separator set
+# includes the em dash so it agrees with the dashes _SHOW_SUFFIX recognizes.
+_EP_PREFIX = re.compile(r"^\s*(ep(isode)?\.?\s*\d+\s*[:\-–—]|#\s*\d+\s*[:\-–—]|\d+\s*[:\-–—])\s*", re.I)
+# Runs of anything other than lowercase ASCII letters/digits; callers lowercase before applying it.
+_NONALNUM = re.compile(r"[^a-z0-9]+")
+
+
+def slugify(s: str, *, maxlen: int = 60) -> str:
+    """Return a lowercase, hyphen-separated ASCII slug of `s`, truncated to `maxlen` characters.
+
+    Every run of characters outside [a-z0-9] collapses to a single '-', and leading/trailing
+    hyphens are removed. Falls back to "x" when nothing usable remains (empty/None input, or
+    input made up only of non-alphanumeric characters)."""
+    slug = _NONALNUM.sub("-", (s or "").lower()).strip("-")[:maxlen]
+    # Truncation can cut right after a separator ("hello-world"[:6] == "hello-"); strip again so
+    # the slug never ends in a dangling hyphen.
+    return slug.strip("-") or "x"
+
+
+def normalize_title(title: str) -> str:
+    """Reduce an episode title to a canonical form for matching the SAME episode across
+    feeds/mirrors.
+
+    Drops a trailing show/podcast/episode suffix, then a leading episode-number prefix, then
+    lowercases and turns every non-alphanumeric run into one space — so 'Ep 42: Foo' and
+    'Foo | The Show' both come out as 'foo'. A falsy title yields ''. Best-effort — a safety
+    net, not the primary key."""
+    # Order matters: the suffix is removed first, then the prefix is stripped from what is left.
+    core = _EP_PREFIX.sub("", _SHOW_SUFFIX.sub("", title or ""))
+    return _NONALNUM.sub(" ", core.lower()).strip()
+
+
+def audio_dedup_key(title: str | None, date: str | None) -> str:
+    """Build the cross-mirror dedup key for audio: '<normalized title>|<date>'.
+
+    A missing title or date contributes an empty string. The key is meant to be computed BEFORE
+    transcription, so a duplicate episode (same content via a different feed/mirror) is skipped
+    without spending GPU. It is deliberately NOT derived from the transcript: ASR is
+    non-deterministic, so a transcript hash would be brittle."""
+    norm_title = normalize_title(title or "")
+    day = date or ""
+    return f"{norm_title}|{day}"