Initial commit: Ten31 Signal Engine (ingest, scoring brain, corpus seeds)
This commit is contained in:
@@ -0,0 +1,60 @@
|
||||
"""Cross-chunk speaker stitching + the voiceprint library (§4.1, §4.5).
|
||||
|
||||
diarize-chunk returns a 192-d TitaNet voiceprint per speaker per chunk. Because each chunk is
|
||||
diarized independently, "Speaker 1" in chunk 3 is not the same label as "Speaker 1" in chunk 7 —
|
||||
we re-cluster by cosine similarity (~0.7 distance threshold) so one person gets one identity across
|
||||
the whole episode. The SAME library then matches a guest ACROSS shows by voice (the independence
|
||||
graph's hardest edge, §4.5).
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import numpy as np
|
||||
|
||||
DISTANCE_THRESHOLD = 0.7 # cosine DISTANCE (1 - cosine similarity); §4.1
|
||||
|
||||
|
||||
def _unit(v: np.ndarray) -> np.ndarray:
|
||||
n = np.linalg.norm(v)
|
||||
return v / n if n else v
|
||||
|
||||
|
||||
def cosine_distance(a: np.ndarray, b: np.ndarray) -> float:
|
||||
return float(1.0 - np.dot(_unit(np.asarray(a, dtype=float)), _unit(np.asarray(b, dtype=float))))
|
||||
|
||||
|
||||
def stitch_chunks(chunk_voiceprints: list[np.ndarray], *, threshold: float = DISTANCE_THRESHOLD) -> list[int]:
|
||||
"""Greedy online clustering of per-(chunk,speaker) voiceprints into stable speaker ids.
|
||||
|
||||
Input: a flat list of voiceprint vectors (one per chunk-speaker, in encounter order).
|
||||
Output: a parallel list of cluster ids. A vector joins the nearest existing cluster if its
|
||||
distance to that cluster's centroid < threshold, else it starts a new cluster.
|
||||
"""
|
||||
centroids: list[np.ndarray] = []
|
||||
counts: list[int] = []
|
||||
labels: list[int] = []
|
||||
for vp in chunk_voiceprints:
|
||||
vp = np.asarray(vp, dtype=float)
|
||||
if centroids:
|
||||
dists = [cosine_distance(vp, c) for c in centroids]
|
||||
j = int(np.argmin(dists))
|
||||
if dists[j] < threshold:
|
||||
centroids[j] = (centroids[j] * counts[j] + vp) / (counts[j] + 1)
|
||||
counts[j] += 1
|
||||
labels.append(j)
|
||||
continue
|
||||
centroids.append(vp.copy())
|
||||
counts.append(1)
|
||||
labels.append(len(centroids) - 1)
|
||||
return labels
|
||||
|
||||
|
||||
def match_library(vp: np.ndarray, library: list[tuple[str, np.ndarray]], *,
|
||||
threshold: float = DISTANCE_THRESHOLD) -> str | None:
|
||||
"""Return the voiceprint_id of the closest library entry within threshold, else None
|
||||
(a new speaker → caller mints a new library id)."""
|
||||
best_id, best_d = None, threshold
|
||||
for vid, lib_vec in library:
|
||||
d = cosine_distance(vp, lib_vec)
|
||||
if d < best_d:
|
||||
best_id, best_d = vid, d
|
||||
return best_id
|
||||
Reference in New Issue
Block a user