"""Small shared utilities (normalization, dedup keys).""" from __future__ import annotations import re _SHOW_SUFFIX = re.compile(r"\s*[|\-–—]\s*[^|\-–—]*(podcast|show|ep(isode)?\s*\d+).*$", re.I) _EP_PREFIX = re.compile(r"^\s*(ep(isode)?\.?\s*\d+\s*[:\-–]|#\s*\d+\s*[:\-–]|\d+\s*[:\-–])\s*", re.I) _NONALNUM = re.compile(r"[^a-z0-9]+") def slugify(s: str, *, maxlen: int = 60) -> str: return _NONALNUM.sub("-", (s or "").lower()).strip("-")[:maxlen] or "x" def normalize_title(title: str) -> str: """Normalize an episode title so the SAME episode matches across feeds/mirrors despite cosmetic differences ('Ep 42: Foo' vs 'Foo | The Show'). Best-effort — a safety net, not the primary key.""" t = title or "" t = _SHOW_SUFFIX.sub("", t) t = _EP_PREFIX.sub("", t) return _NONALNUM.sub(" ", t.lower()).strip() def audio_dedup_key(title: str | None, date: str | None) -> str: """Cross-mirror dedup key for audio: normalized title + date. Computed BEFORE transcription so a duplicate episode (same content via a different feed/mirror) is skipped without spending GPU. NOT derived from the transcript (ASR is non-deterministic — a transcript hash would be brittle).""" return f"{normalize_title(title or '')}|{date or ''}"