# ten31-signal-engine/signal_engine/ingest/speaker_stitch.py

"""Cross-chunk speaker stitching + the voiceprint library (§4.1, §4.5).

diarize-chunk returns a 192-d TitaNet voiceprint per speaker per chunk. Because each chunk is
diarized independently, "Speaker 1" in chunk 3 is not the same label as "Speaker 1" in chunk 7 —
we re-cluster by cosine distance (1 - similarity, threshold ~0.7) so one person gets one identity across
the whole episode. The SAME library then matches a guest ACROSS shows by voice (the independence
graph's hardest edge, §4.5).
"""
from __future__ import annotations

import numpy as np

# Default cut-off for treating two voiceprints as the same speaker, expressed as a cosine
# DISTANCE (1 - cosine similarity), not a similarity. Comparisons against it are strict
# (distance < threshold). See §4.1.
DISTANCE_THRESHOLD = 0.7


def _unit(v: np.ndarray) -> np.ndarray:
    """Scale ``v`` so that its ``np.linalg.norm`` is 1.

    A vector whose norm is zero cannot be scaled, so it is handed back unchanged
    (the very same object) instead of dividing by zero.
    """
    magnitude = np.linalg.norm(v)
    if not magnitude:
        return v
    return v / magnitude


def cosine_distance(a: np.ndarray, b: np.ndarray) -> float:
    """Return the cosine distance ``1 - cos(a, b)`` as a plain Python float.

    Both inputs are converted to float arrays and normalised with ``_unit`` before the
    dot product is taken, so the result does not depend on the vectors' magnitudes.
    """
    a_hat = _unit(np.asarray(a, dtype=float))
    b_hat = _unit(np.asarray(b, dtype=float))
    similarity = np.dot(a_hat, b_hat)
    return float(1.0 - similarity)


def stitch_chunks(chunk_voiceprints: list[np.ndarray], *, threshold: float = DISTANCE_THRESHOLD) -> list[int]:
    """Greedy online clustering of per-(chunk,speaker) voiceprints into stable speaker ids.

    Input: a flat list of voiceprint vectors (one per chunk-speaker, in encounter order).
    Output: a parallel list of cluster ids. A vector joins the nearest existing cluster if its
    distance to that cluster's centroid < threshold, else it starts a new cluster. Ties go to
    the earliest-created cluster.

    Cluster ids are assigned in creation order starting at 0. Each centroid is the running
    arithmetic mean of the raw vectors assigned to it.

    A voiceprint containing NaN yields NaN distances; those never satisfy ``< threshold``, so
    such a vector gets a cluster of its own and that cluster never absorbs anything else. It
    does NOT prevent later, healthy voiceprints from joining their proper clusters.
    """
    centroids: list[np.ndarray] = []
    counts: list[int] = []
    labels: list[int] = []
    for vp in chunk_voiceprints:
        vp = np.asarray(vp, dtype=float)

        # Strict "<" scan rather than np.argmin: argmin returns the index of a NaN distance
        # when one is present, which let a single corrupt centroid shadow every valid cluster
        # and force each subsequent voiceprint into a brand-new cluster. A NaN distance
        # compares False here and is simply skipped.
        best_idx: int | None = None
        best_dist = threshold
        for idx, centroid in enumerate(centroids):
            dist = cosine_distance(vp, centroid)
            if dist < best_dist:
                best_idx, best_dist = idx, dist

        if best_idx is None:
            # Nothing close enough (or no clusters yet): open a new cluster seeded with a
            # private copy so later centroid updates never alias the caller's array.
            centroids.append(vp.copy())
            counts.append(1)
            labels.append(len(centroids) - 1)
            continue

        # Incremental mean: fold the new vector into the existing centroid.
        # NOTE(review): this averages un-normalised vectors, so larger-norm voiceprints pull
        # the centroid harder; harmless if inputs are already L2-normalised — confirm upstream.
        seen = counts[best_idx]
        centroids[best_idx] = (centroids[best_idx] * seen + vp) / (seen + 1)
        counts[best_idx] = seen + 1
        labels.append(best_idx)
    return labels


def match_library(vp: np.ndarray, library: list[tuple[str, np.ndarray]], *,
                  threshold: float = DISTANCE_THRESHOLD) -> str | None:
    """Look up ``vp`` in the voiceprint library.

    Every ``(voiceprint_id, vector)`` entry is scored by cosine distance to ``vp``. The id of
    the entry with the smallest distance strictly below ``threshold`` is returned; when several
    entries share that smallest distance the earliest one in ``library`` wins. If no entry
    qualifies the result is None (a new speaker → caller mints a new library id).
    """
    scored = [(cosine_distance(vp, ref_vec), entry_id) for entry_id, ref_vec in library]
    # Strict comparison also discards NaN distances, which never compare below anything.
    eligible = [pair for pair in scored if pair[0] < threshold]
    if not eligible:
        return None
    # min() keeps the first of equally-small keys, i.e. the earliest library entry.
    return min(eligible, key=lambda pair: pair[0])[1]