Files
ten31-signal-engine/signal_engine/util.py
T

29 lines
1.3 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""Small shared utilities (normalization, dedup keys)."""
from __future__ import annotations
import re
# Trailing "show branding" segment: a separator (|, -, en dash or em dash), then any run of
# non-separator characters, then one of "podcast", "show", or "ep"/"episode" + digits, then
# everything up to the end of the string. Case-insensitive.
_SHOW_SUFFIX = re.compile(r"\s*[|\-–—]\s*[^|\-–—]*(podcast|show|ep(isode)?\s*\d+).*$", re.I)
# Leading episode marker: "ep"/"episode" (optional dot) + digits, "#" + digits, or bare digits,
# each followed by ":" or "-" and any trailing whitespace. Case-insensitive.
# NOTE(review): only ":" and ASCII "-" are accepted as the delimiter here, whereas _SHOW_SUFFIX
# also accepts en/em dashes — confirm whether a title like "Ep 42 – Foo" should be stripped too.
_EP_PREFIX = re.compile(r"^\s*(ep(isode)?\.?\s*\d+\s*[:\-]|#\s*\d+\s*[:\-]|\d+\s*[:\-])\s*", re.I)
# One or more characters outside a-z / 0-9. Compiled without IGNORECASE, so uppercase letters
# (and all non-ASCII characters) also match; the uses visible in this file lowercase the text first.
_NONALNUM = re.compile(r"[^a-z0-9]+")
def slugify(s: str, *, maxlen: int = 60) -> str:
    """Return a lowercase, hyphen-separated slug of ``s``.

    The input is lowercased and every run of characters outside ``a-z0-9`` collapses to a
    single ``-``. Leading and trailing hyphens are removed, and the slug is cut to
    ``maxlen`` characters.

    Args:
        s: Text to slugify. A falsy value (``None``, ``""``) is treated as empty.
        maxlen: Maximum length of the slug before the fallback is applied (keyword-only).

    Returns:
        The slug, or ``"x"`` when nothing usable remains (empty or punctuation-only
        input, or everything was cut away), so the result is never empty.
    """
    slug = _NONALNUM.sub("-", (s or "").lower()).strip("-")
    # Trim again AFTER truncating: the cut can land right behind a separator
    # (e.g. "ab cd" with maxlen=3 -> "ab-"), which would leave a dangling hyphen.
    return slug[:maxlen].rstrip("-") or "x"
def normalize_title(title: str) -> str:
    """Reduce an episode title to a canonical form for cross-feed matching.

    Cosmetic variants of the SAME episode (e.g. 'Ep 42: Foo' and 'Foo | The Show') should
    collapse to the same string. The trailing show/podcast suffix is removed first, then the
    leading episode-number prefix; what remains is lowercased and every run of characters
    outside a-z0-9 becomes a single space. A falsy title is treated as the empty string.

    This is a best-effort safety net, not the primary key.
    """
    cleaned = title if title else ""
    # Order matters: the suffix is stripped before the prefix.
    for pattern in (_SHOW_SUFFIX, _EP_PREFIX):
        cleaned = pattern.sub("", cleaned)
    lowered = cleaned.lower()
    return _NONALNUM.sub(" ", lowered).strip()
def audio_dedup_key(title: str | None, date: str | None) -> str:
    """Build the cross-mirror dedup key for an audio item: ``<normalized title>|<date>``.

    The key is meant to be computed BEFORE transcription, so that a duplicate episode (the
    same content reached through a different feed/mirror) can be skipped without spending
    GPU. It is deliberately NOT derived from the transcript: ASR is non-deterministic, so a
    transcript hash would be brittle.

    A missing (falsy) title or date contributes an empty string to its side of the ``|``.
    """
    title_part = normalize_title(title if title else "")
    date_part = date if date else ""
    return f"{title_part}|{date_part}"