Initial commit: Ten31 Signal Engine (ingest, scoring brain, corpus seeds)

This commit is contained in:
Keysat
2026-06-15 09:24:29 -05:00
commit a6aec77506
77 changed files with 6263 additions and 0 deletions
+28
View File
@@ -0,0 +1,28 @@
"""Small shared utilities (normalization, dedup keys)."""
from __future__ import annotations
import re
# Trailing show branding: a separator (| - – —) followed by text that mentions the whole word
# "podcast"/"show" or an episode marker ("ep 42", "episode42"), through end of string.
# The \b anchors keep ordinary words that merely contain a keyword ("Showdown", "step 2")
# from being mistaken for branding and truncating the real title.
_SHOW_SUFFIX = re.compile(
    r"\s*[|\-–—]\s*[^|\-–—]*\b(podcast|show|ep(isode)?\s*\d+)\b.*$",
    re.I,
)
# Leading episode numbering: "Ep 42:", "Episode. 7 -", "#12:", "42 -". En/em dashes are accepted
# as well as the ASCII hyphen, matching the dash separators _SHOW_SUFFIX recognizes, so a feed
# that typographically rewrites " - " to " – " still normalizes to the same title.
_EP_PREFIX = re.compile(
    r"^\s*(ep(isode)?\.?\s*\d+\s*[:\-–—]|#\s*\d+\s*[:\-–—]|\d+\s*[:\-–—])\s*",
    re.I,
)
# One or more characters outside lowercase ASCII letters and digits. It is case-sensitive, so
# input must be lowercased before substitution.
_NONALNUM = re.compile(r"[^a-z0-9]+")
def slugify(s: str, *, maxlen: int = 60) -> str:
    """Return a lowercase, hyphen-separated ASCII slug of ``s``.

    After lowercasing, every run of characters outside ``[a-z0-9]`` becomes a single hyphen
    and the result is cut to at most ``maxlen`` characters. Hyphens are trimmed both before
    and after the cut, so truncating on a separator cannot leave a dangling trailing hyphen
    (``slugify("hello world", maxlen=6)`` is ``"hello"``, not ``"hello-"``).

    Returns ``"x"`` when nothing usable remains: falsy input (``None``/``""``), input with no
    ASCII letters or digits, or ``maxlen=0``.
    """
    slug = _NONALNUM.sub("-", (s or "").lower()).strip("-")
    # Re-trim after slicing: the cut may land exactly on a separator.
    return slug[:maxlen].rstrip("-") or "x"
def normalize_title(title: str) -> str:
    """Normalize an episode title so the SAME episode matches across feeds/mirrors.

    Cosmetic differences are removed ('Ep 42: Foo' and 'Foo | The Show' both give 'foo'):
    trailing show branding is stripped first, then leading episode numbering, and the rest is
    lowercased with every run of non-alphanumeric characters collapsed to one space.

    Best-effort — a safety net, not the primary key. A falsy ``title`` (``None``/``""``) is
    treated as empty. The result is empty only when the title is empty or whitespace-only:
    if the heuristics (or the ASCII-only filter) would erase everything, a less aggressive
    form is returned instead, so distinct titles do not all collapse into the same empty key.
    """
    raw = title or ""
    stripped = _EP_PREFIX.sub("", _SHOW_SUFFIX.sub("", raw))
    # Prefer the stripped form; if stripping consumed the whole title (e.g. "| The Show"),
    # normalize the untouched title rather than returning "".
    for candidate in (stripped, raw):
        normalized = _NONALNUM.sub(" ", candidate.lower()).strip()
        if normalized:
            return normalized
    # No ASCII letters or digits at all (e.g. a non-Latin title): keep the text itself,
    # casefolded and whitespace-collapsed, so such titles still differ from one another.
    return " ".join(raw.casefold().split())
def audio_dedup_key(title: str | None, date: str | None) -> str:
    """Build the cross-mirror dedup key for an audio item: ``"<normalized title>|<date>"``.

    The key is computed BEFORE transcription, so an episode that arrives again through a
    different feed/mirror can be skipped without spending GPU. It is deliberately NOT derived
    from the transcript: ASR is non-deterministic, which would make a transcript hash brittle.

    A falsy ``title`` or ``date`` (``None``/``""``) contributes an empty component.
    """
    title_part = normalize_title(title if title else "")
    date_part = date if date else ""
    return f"{title_part}|{date_part}"