Initial commit: Ten31 Signal Engine (ingest, scoring brain, corpus seeds)
This commit is contained in:
@@ -0,0 +1,28 @@
|
||||
"""Small shared utilities (normalization, dedup keys)."""
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
|
||||
# Trailing show branding: a separator (|, -, en dash or em dash) followed by text that mentions
# "podcast", "show" or an episode number ("ep 42", "Ep. 42", "episode 42"), through end of string.
# The optional dot after "ep"/"episode" mirrors what _EP_PREFIX accepts at the start of a title.
_SHOW_SUFFIX = re.compile(r"\s*[|\-–—]\s*[^|\-–—]*(podcast|show|ep(isode)?\.?\s*\d+).*$", re.I)
# Leading episode marker: "Ep 42:", "Episode 42 -", "#42:" or a bare "42 -", plus trailing spaces.
# Accepts the same dash variants as _SHOW_SUFFIX (hyphen, en dash, em dash) so "Ep 42 — Foo" is
# stripped just like "Ep 42 - Foo".
_EP_PREFIX = re.compile(r"^\s*(ep(isode)?\.?\s*\d+\s*[:\-–—]|#\s*\d+\s*[:\-–—]|\d+\s*[:\-–—])\s*", re.I)
# One or more characters outside lowercase ASCII letters and digits; apply to lowercased text.
_NONALNUM = re.compile(r"[^a-z0-9]+")
|
||||
|
||||
|
||||
def slugify(s: str, *, maxlen: int = 60) -> str:
    """Return a lowercase, hyphen-separated slug of *s*, at most *maxlen* characters long.

    The text is lowercased and every run of characters other than ``a-z`` / ``0-9`` becomes a
    single ``-``; leading and trailing hyphens are removed. Falls back to ``"x"`` when nothing
    usable remains (empty/None input, or input with no ASCII letters or digits).
    """
    slug = _NONALNUM.sub("-", (s or "").lower()).strip("-")
    # Truncate first, then trim the tail again: the cut can land right after a separator, which
    # would otherwise leave a dangling hyphen at the end of the slug.
    slug = slug[:maxlen].rstrip("-")
    return slug or "x"
|
||||
|
||||
|
||||
def normalize_title(title: str) -> str:
    """Reduce an episode title to a canonical form for cross-feed comparison.

    Intended to make the SAME episode compare equal across feeds/mirrors even when the titles
    differ cosmetically ('Ep 42: Foo' vs 'Foo | The Show'). This is a best-effort safety net,
    not the primary key.

    Steps: remove whatever ``_SHOW_SUFFIX`` matches, then whatever ``_EP_PREFIX`` matches, then
    lowercase and collapse every run of non ``a-z`` / ``0-9`` characters into a single space.
    A falsy *title* yields an empty string.
    """
    cleaned = title or ""
    # Order matters: the trailing show branding is removed before the leading episode marker.
    for boilerplate in (_SHOW_SUFFIX, _EP_PREFIX):
        cleaned = boilerplate.sub("", cleaned)
    lowered = cleaned.lower()
    return _NONALNUM.sub(" ", lowered).strip()
|
||||
|
||||
|
||||
def audio_dedup_key(title: str | None, date: str | None) -> str:
    """Build the cross-mirror dedup key for an audio item: ``"<normalized title>|<date>"``.

    The key is meant to be computed BEFORE transcription, so a duplicate episode (same content
    reached through a different feed/mirror) can be skipped without spending GPU time. It is
    deliberately NOT derived from the transcript: ASR is non-deterministic, so a transcript hash
    would be brittle.

    Missing (falsy) *title* or *date* values contribute an empty string to their half of the key.
    """
    title_part = normalize_title(title or "")
    date_part = date or ""
    return f"{title_part}|{date_part}"
|
||||
Reference in New Issue
Block a user