Files
recap-relay/server/speaker-clustering.js
T

625 lines
25 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
// Cross-chunk speaker reconciliation for Phase 1D of the diarization
// pipeline. Spark Control's /api/audio/diarize-chunk hands back
// per-chunk speaker labels ("Speaker_0", "Speaker_1") that are local
// to one chunk — Sortformer has no memory across calls, so Speaker_0
// in chunk 5 might or might not be the same person as Speaker_0 in
// chunk 6. The per-speaker 192-dim TitaNet voice embedding it also
// returns IS persistent though, so we cluster fingerprints across
// chunks via cosine similarity to recover the global speaker identity.
//
// Algorithm: average-linkage agglomerative clustering. Start with N
// singleton clusters (one per fingerprint), repeatedly merge the
// closest pair until no pair is closer than the operator-configured
// threshold. Average-linkage was the choice over single/complete
// because it's robust to outlier embeddings (one bad embedding from
// a noisy chunk doesn't anchor or repel an entire cluster).
//
// Complexity: O(N³) where N = total fingerprints across all chunks.
// Typical: 2 speakers × 21 chunks = 42 → ~74k ops, sub-millisecond.
// Worst case for a 4-hour all-talk-show video: 6 speakers × 48 chunks
// = ~288 → ~24M ops, still milliseconds in Node.
//
// Threshold convention: configured as INTEGER percentage 50-95
// representing cosine similarity. 70 (= 0.70 sim) is NeMo's
// recommended default for TitaNet embeddings. Internally we work
// in cosine DISTANCE (= 1 - similarity) for the merge condition.
// ─── Cosine similarity ──────────────────────────────────────────────
// Standard dot-product / (||a|| * ||b||). Both inputs must be number
// arrays of the same length. Returns 0 for any zero-magnitude input
// to avoid NaN propagation.
// Cosine similarity of two equal-length numeric vectors.
//
//   a, b    — array-likes of numbers (e.g. 192-dim voice embeddings)
//   returns — dot(a, b) / (|a| * |b|), or 0 when either input is
//             missing, the lengths differ, or either squared norm is
//             falsy (zero or NaN), so NaN never reaches the caller via
//             a degenerate norm.
export function cosineSimilarity(a, b) {
  const comparable = Boolean(a) && Boolean(b) && a.length === b.length;
  if (!comparable) {
    return 0;
  }

  let dotProduct = 0;
  let sumSquaresA = 0;
  let sumSquaresB = 0;
  let idx = 0;
  while (idx < a.length) {
    const x = a[idx];
    const y = b[idx];
    dotProduct += x * y;
    sumSquaresA += x * x;
    sumSquaresB += y * y;
    idx += 1;
  }

  // A falsy squared norm means a zero-magnitude (or NaN-tainted) vector.
  if (!sumSquaresA || !sumSquaresB) {
    return 0;
  }
  return dotProduct / (Math.sqrt(sumSquaresA) * Math.sqrt(sumSquaresB));
}
// ─── Cluster ID naming ──────────────────────────────────────────────
// Speaker_A, Speaker_B, ..., Speaker_Z, Speaker_AA, Speaker_AB...
// Capital letter prefix is intentionally distinct from the chunk-
// local "Speaker_0" / "Speaker_1" naming that Sortformer uses, so
// the source of a label is obvious at a glance.
// Map a zero-based cluster index to its global label using spreadsheet-
// column style letters: 0 → "Speaker_A", 25 → "Speaker_Z",
// 26 → "Speaker_AA", 27 → "Speaker_AB", ...
function globalSpeakerLabel(index) {
  const letters = [];
  let remaining = index;
  for (;;) {
    // 65 is the char code of "A"; the least-significant letter is
    // produced first, so each new letter goes to the front.
    letters.unshift(String.fromCharCode(65 + (remaining % 26)));
    remaining = Math.floor(remaining / 26) - 1;
    if (!(remaining >= 0)) {
      break;
    }
  }
  return `Speaker_${letters.join("")}`;
}
// ─── Agglomerative clustering (average linkage) ─────────────────────
// Input: array of { key, vector } pairs. `key` is opaque to the
// algorithm — we just propagate it into the returned cluster's
// `members` list for the caller to map back to (chunkIdx, localLabel).
//
// Output: array of clusters, each { members: [keys], vectors: [vecs] }.
// Order: clusters are emitted in the order their FIRST member was
// added to the input — keeps "Speaker_A" assigned to whoever spoke
// first across the audio, which is the natural user expectation.
// Average-linkage agglomerative clustering over fingerprint vectors.
//
//   items             — array of { key, vector, firstOrder }
//   distanceThreshold — merging stops once the closest pair of clusters
//                       is farther apart than this cosine distance
//   returns           — array of { members: [keys], vectors: [vecs],
//                       firstOrder }, sorted by ascending firstOrder
//
// Cosine distances between the input vectors are computed ONCE up
// front (N²/2 cosine evaluations). The merge loop then averages table
// lookups instead of re-running the full dot product for every
// candidate pair on every iteration. Entries are summed in the same
// order as the cluster's `vectors` list, so merge decisions are
// identical to recomputing from the raw vectors.
function agglomerativeCluster(items, distanceThreshold) {
  const total = items.length;
  if (total === 0) return [];

  // pairDistance[a][b] = cosine distance between input vectors a and b.
  // cosineSimilarity is symmetric, so only the upper triangle is
  // computed and then mirrored. The diagonal is never read.
  const pairDistance = [];
  for (let a = 0; a < total; a++) {
    pairDistance.push(new Float64Array(total));
  }
  for (let a = 0; a < total; a++) {
    for (let b = a + 1; b < total; b++) {
      const d = 1 - cosineSimilarity(items[a].vector, items[b].vector);
      pairDistance[a][b] = d;
      pairDistance[b][a] = d;
    }
  }

  const clusters = items.map((it) => ({
    members: [it.key],
    vectors: [it.vector],
    firstOrder: it.firstOrder, // earliest input position among members
  }));
  // vectorIds[c] lists the input indices of the vectors held by
  // clusters[c], in the same order as clusters[c].vectors. Kept as a
  // parallel array so the returned cluster objects keep their shape.
  const vectorIds = items.map((_, idx) => [idx]);

  // Mean pairwise distance between two (non-empty) clusters.
  const averageDistance = (idsA, idsB) => {
    let sum = 0;
    for (const a of idsA) {
      const row = pairDistance[a];
      for (const b of idsB) {
        sum += row[b];
      }
    }
    return sum / (idsA.length * idsB.length);
  };

  while (clusters.length > 1) {
    // Closest pair; strict `<` keeps the first pair found on ties.
    let best = { dist: Infinity, i: -1, j: -1 };
    for (let i = 0; i < clusters.length; i++) {
      for (let j = i + 1; j < clusters.length; j++) {
        const d = averageDistance(vectorIds[i], vectorIds[j]);
        if (d < best.dist) {
          best = { dist: d, i, j };
        }
      }
    }
    if (best.dist > distanceThreshold) break;

    // Merge clusters[j] into clusters[i], then remove clusters[j].
    const keep = clusters[best.i];
    const gone = clusters[best.j];
    keep.members.push(...gone.members);
    keep.vectors.push(...gone.vectors);
    vectorIds[best.i].push(...vectorIds[best.j]);
    // Keep the earliest firstOrder so the merged cluster sorts to the
    // position of its earliest-appearing member.
    if (gone.firstOrder < keep.firstOrder) {
      keep.firstOrder = gone.firstOrder;
    }
    clusters.splice(best.j, 1);
    vectorIds.splice(best.j, 1);
  }

  // Sort by first-appearance order so Speaker_A = whoever spoke first.
  clusters.sort((a, b) => a.firstOrder - b.firstOrder);
  return clusters;
}
// Average-linkage distance between two clusters: the mean cosine
// distance (1 - cosine similarity) over every cross-cluster pair of
// vectors. Returns Infinity when there are no pairs to compare.
function avgLinkageDistance(c1, c2) {
  const left = c1.vectors;
  const right = c2.vectors;
  let total = 0;
  let pairs = 0;
  for (let a = 0; a < left.length; a++) {
    for (let b = 0; b < right.length; b++) {
      total += 1 - cosineSimilarity(left[a], right[b]);
      pairs++;
    }
  }
  if (pairs === 0) {
    return Infinity;
  }
  return total / pairs;
}
// ─── Public: cluster per-chunk diarization → global speaker map ─────
//
// Input shape (one entry per chunk; failed/skipped chunks are
// silently filtered):
// {
// ok: true,
// chunkIndex: 5,
// startSeconds: 1080,
// segments: [{ start, end, speaker_local, confidence }],
// speakers_local: ["Speaker_0", "Speaker_1"],
// fingerprints: { "Speaker_0": [192 floats], "Speaker_1": [192 floats] }
// }
//
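// Returns:
//   {
//     globalMap: Map<"chunkIdx:localLabel", "Speaker_A">,
//     uncertaintyMap: Map<"chunkIdx:localLabel", true>,  // members of
//                     clusters that were reassigned to an anchor
//     speakers: {
//       Speaker_A: { turns, total_speaking_seconds, mean_confidence,
//                    chunks_appeared_in, fingerprint_count },
//       ...
//     },
//     clusterCount: 2,            // raw clusters, before suppression
//     thresholdSimilarity: 0.70
//   }
//
// When no usable fingerprints are found (diarization off or all chunks
// failed) returns an empty result: { globalMap: empty,
// uncertaintyMap: empty, speakers: {}, clusterCount: 0, ... }.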
// Post-cluster suppression tunables. After the initial agglomerative
// cluster pass, walk the resulting clusters and re-categorize the
// small ones to fix the "14 speakers detected when really only 2"
// case Grant flagged on a 2h53m podcast. The clustering algorithm
// itself stays strict (no false-positive merges); suppression is a
// second pass that operates on cluster size + cross-cluster
// similarity to catch the noise-induced spurious clusters.
//
// anchor_min_speaking_sec — a cluster needs at least this much
// total speaking time to be considered an "anchor" (= a real
// speaker). Anchors keep their own global ID + colored chip.
//
// small_cluster_max_speaking_sec — clusters with LESS than this
// are suppression candidates. Brief utterances are common false
// positives (background noise, crosstalk fragments, brief
// intros).
//
// uncertain_margin_pct — a small cluster whose best similarity
// to any anchor is within this many percentage points of the
// main threshold gets REASSIGNED to that anchor and marked
// uncertain (chip shows "?"). Far-from-anchor small clusters
// become Speaker_Unknown.
//
// All three are operator-editable via Settings → Operator hardware.
// Defaults are conservative — no false-positive merges into
// anchors, just reassignment of small clusters that are PROBABLY
// the anchor in noisy conditions. A real 30+ second second speaker
// still gets their own chip; only brief flecks of similar voice get
// pulled in.
// Fallback for options.anchorMinSpeakingSec: a cluster whose total
// speaking time is at least this many seconds is treated as an anchor.
const DEFAULT_ANCHOR_MIN_SPEAKING_SEC = 30;
// Fallback for options.smallClusterMaxSpeakingSec: a non-anchor cluster
// that is not close to any anchor and has less than this many seconds
// of speech is folded into Speaker_Unknown.
const DEFAULT_SMALL_CLUSTER_MAX_SPEAKING_SEC = 15;
// Fallback for options.uncertainMarginPct: width, in percentage points
// below the main similarity threshold, of the band in which a
// non-anchor cluster is reassigned to its closest anchor and flagged
// as uncertain.
const DEFAULT_UNCERTAIN_MARGIN_PCT = 10;
// Average cosine similarity between two clusters' vector sets —
// inverse of avgLinkageDistance, expressed as similarity for
// readability in the suppression logic.
// Mean cross-cluster cosine similarity: the complement of
// avgLinkageDistance, so higher means the two clusters sound more
// alike.
function avgLinkageSimilarity(c1, c2) {
  const distance = avgLinkageDistance(c1, c2);
  return 1 - distance;
}
// Clamp an option value to an integer in [lo, hi], falling back to
// `fallback` when the value is missing or non-finite. Used to make
// out-of-range or absent operator settings safe.
// Coerce an operator-supplied setting to an integer within [lo, hi].
//
//   v        — raw setting; a number or a numeric string
//   fallback — returned as-is when `v` is missing or not a finite number
//   lo, hi   — inclusive bounds applied after rounding
//   returns  — Math.round(v) clamped to [lo, hi], or `fallback`
function clampInt(v, fallback, lo, hi) {
  // Only numbers and non-blank strings are candidates. Number() turns
  // null, "", false and [] into 0, which would otherwise be clamped to
  // `lo` instead of being treated as "not set".
  const isCandidate =
    typeof v === "number" || (typeof v === "string" && v.trim() !== "");
  if (!isCandidate) return fallback;

  const n = Number(v);
  if (!Number.isFinite(n)) return fallback;

  const i = Math.round(n);
  if (i < lo) return lo;
  if (i > hi) return hi;
  return i;
}
// Fresh zeroed summary record for one global speaker label.
function newSpeakerStats() {
  return {
    turns: 0,
    total_speaking_seconds: 0,
    mean_confidence: null,
    chunks_appeared_in: 0,
    fingerprint_count: 0,
  };
}

// Timestamp before which a chunk's diar segments are treated as
// duplicates of the prior chunk's tail. 0 (= no dedup) when the chunk
// carries no numeric chunkOverlapBoundarySec.
function overlapBoundaryOf(chunk) {
  return typeof chunk.chunkOverlapBoundarySec === "number"
    ? chunk.chunkOverlapBoundarySec
    : 0;
}

// Non-negative length of a diar segment; missing start/end count as 0.
function segmentSeconds(seg) {
  return Math.max(0, (seg.end || 0) - (seg.start || 0));
}

// Cluster per-chunk speaker fingerprints into global speakers, then
// suppress small spurious clusters.
//
//   chunkDiarization    — iterable of per-chunk results; entries that
//                         are falsy or have ok !== truthy are skipped
//   clusterThresholdPct — cosine-similarity threshold as a percentage,
//                         clamped to 50..95; null / undefined /
//                         non-finite → 70
//   options             — { anchorMinSpeakingSec,
//                           smallClusterMaxSpeakingSec,
//                           uncertainMarginPct }, each clamped via
//                         clampInt; null or undefined → all defaults
//   returns             — { globalMap, uncertaintyMap, speakers,
//                           clusterCount, thresholdSimilarity }
//
// Side effect: logs a one-line suppression summary via console.log.
export function clusterSpeakers(
  chunkDiarization,
  clusterThresholdPct = 70,
  options = {}
) {
  // The `= {}` default only covers undefined; an explicit null would
  // otherwise throw on the property reads below.
  const opts = options || {};

  // Number.isFinite-guarded fallback rather than `|| 70`: 0 is a valid
  // input that must clamp to 50, not be bumped to 70. null is treated
  // as "not set" (Number(null) is 0 and would silently clamp to 50).
  const raw = clusterThresholdPct === null ? NaN : Number(clusterThresholdPct);
  const pct = Math.max(50, Math.min(95, Number.isFinite(raw) ? raw : 70));
  const similarityThreshold = pct / 100;
  const distanceThreshold = 1 - similarityThreshold;

  // Operator-tunable suppression thresholds, clamped so that an
  // out-of-range value still produces sane behavior.
  const anchorMinSec = clampInt(
    opts.anchorMinSpeakingSec,
    DEFAULT_ANCHOR_MIN_SPEAKING_SEC,
    5,
    120
  );
  const smallMaxSec = clampInt(
    opts.smallClusterMaxSpeakingSec,
    DEFAULT_SMALL_CLUSTER_MAX_SPEAKING_SEC,
    1,
    60
  );
  const uncertainMarginPct = clampInt(
    opts.uncertainMarginPct,
    DEFAULT_UNCERTAIN_MARGIN_PCT,
    0,
    30
  );
  const uncertainSimThreshold = Math.max(
    0,
    similarityThreshold - uncertainMarginPct / 100
  );

  // Flatten fingerprints into the clustering input. Preserve insertion
  // order so the first-appearance speaker gets Speaker_A.
  const items = [];
  let order = 0;
  for (const d of chunkDiarization || []) {
    if (!d || !d.ok || !d.fingerprints) continue;
    for (const [localLabel, vector] of Object.entries(d.fingerprints)) {
      if (!Array.isArray(vector) || vector.length === 0) continue;
      items.push({
        key: `${d.chunkIndex}:${localLabel}`,
        vector,
        firstOrder: order++,
      });
    }
  }
  if (items.length === 0) {
    return {
      globalMap: new Map(),
      uncertaintyMap: new Map(),
      speakers: {},
      clusterCount: 0,
      thresholdSimilarity: similarityThreshold,
    };
  }

  const clusters = agglomerativeCluster(items, distanceThreshold);

  // ─── First pass: speaking time per cluster ───────────────────────
  // Needed before building the global map so the suppression pass can
  // identify anchors.
  const clusterIdxByMember = new Map();
  for (let i = 0; i < clusters.length; i++) {
    for (const memberKey of clusters[i].members) {
      clusterIdxByMember.set(memberKey, i);
    }
  }
  const totalSecsByCluster = new Array(clusters.length).fill(0);
  for (const d of chunkDiarization || []) {
    if (!d || !d.ok || !Array.isArray(d.segments)) continue;
    // Apply the same chunk-overlap dedup as the summary pass below, so
    // a cluster cannot be promoted to anchor on speech that is counted
    // twice in an overlap zone.
    const boundary = overlapBoundaryOf(d);
    for (const seg of d.segments) {
      if (!seg || (seg.start || 0) < boundary) continue;
      const ci = clusterIdxByMember.get(`${d.chunkIndex}:${seg.speaker_local}`);
      if (ci === undefined) continue;
      totalSecsByCluster[ci] += segmentSeconds(seg);
    }
  }

  // ─── Second pass: identify anchors + plan suppression ────────────
  const isAnchorIdx = new Array(clusters.length).fill(false);
  const anchorIdxs = [];
  for (let i = 0; i < clusters.length; i++) {
    if (totalSecsByCluster[i] >= anchorMinSec) {
      isAnchorIdx[i] = true;
      anchorIdxs.push(i);
    }
  }

  // reassignTo: non-anchor cluster idx → anchor cluster idx that
  //   absorbs it (its members are flagged uncertain).
  // unknownClusters: cluster idxs whose members map to Speaker_Unknown.
  // Non-anchor clusters in neither collection keep their own label.
  const reassignTo = new Map();
  const unknownClusters = new Set();
  if (anchorIdxs.length >= 1) {
    for (let i = 0; i < clusters.length; i++) {
      if (isAnchorIdx[i]) continue;

      // Best anchor by average cosine similarity.
      let bestAnchorIdx = -1;
      let bestSim = -Infinity;
      for (const ai of anchorIdxs) {
        const sim = avgLinkageSimilarity(clusters[i], clusters[ai]);
        if (sim > bestSim) {
          bestSim = sim;
          bestAnchorIdx = ai;
        }
      }

      if (bestAnchorIdx >= 0 && bestSim >= uncertainSimThreshold) {
        // Close to an anchor (within the uncertain margin of the main
        // threshold) → reassign to that anchor, flagged uncertain.
        reassignTo.set(i, bestAnchorIdx);
      } else if (totalSecsByCluster[i] < smallMaxSec) {
        // Small + far from every anchor → Speaker_Unknown, so the
        // legend doesn't fill with brief unidentified speakers.
        unknownClusters.add(i);
      }
      // else: enough speech + far from every anchor → keep as its own
      // speaker (plausibly a real additional person).
    }
  }

  // ─── Final label ordering ────────────────────────────────────────
  // Clusters that keep their own identity get labels in first-
  // appearance order; reassigned clusters inherit their anchor's label;
  // unknown clusters get none.
  const ordered = [];
  for (let i = 0; i < clusters.length; i++) {
    if (reassignTo.has(i) || unknownClusters.has(i)) continue;
    ordered.push({ idx: i, firstOrder: clusters[i].firstOrder });
  }
  ordered.sort((a, b) => a.firstOrder - b.firstOrder);
  const labelByOrigIdx = new Map();
  for (let j = 0; j < ordered.length; j++) {
    labelByOrigIdx.set(ordered[j].idx, globalSpeakerLabel(j));
  }
  for (const [i, ai] of reassignTo) {
    const anchorLabel = labelByOrigIdx.get(ai);
    if (anchorLabel) labelByOrigIdx.set(i, anchorLabel);
  }

  // ─── globalMap + uncertaintyMap ──────────────────────────────────
  const globalMap = new Map();
  const uncertaintyMap = new Map();
  for (let i = 0; i < clusters.length; i++) {
    if (unknownClusters.has(i)) {
      for (const memberKey of clusters[i].members) {
        globalMap.set(memberKey, "Speaker_Unknown");
      }
      continue;
    }
    const label = labelByOrigIdx.get(i);
    if (!label) continue;
    const isReassigned = reassignTo.has(i);
    for (const memberKey of clusters[i].members) {
      globalMap.set(memberKey, label);
      if (isReassigned) uncertaintyMap.set(memberKey, true);
    }
  }

  // ─── Per-speaker summary ─────────────────────────────────────────
  const speakers = {};
  for (const label of new Set(labelByOrigIdx.values())) {
    speakers[label] = newSpeakerStats();
  }
  // Speaker_Unknown is always available as a bucket while accumulating
  // (suppressed clusters AND unfingerprinted segments land here); it is
  // dropped again below if nothing was attributed to it.
  speakers["Speaker_Unknown"] = newSpeakerStats();

  // Fingerprint counts. Reassigned clusters' fingerprints count toward
  // their anchor's total.
  for (let i = 0; i < clusters.length; i++) {
    const targetLabel = unknownClusters.has(i)
      ? "Speaker_Unknown"
      : labelByOrigIdx.get(i);
    if (!targetLabel || !speakers[targetLabel]) continue;
    speakers[targetLabel].fingerprint_count += clusters[i].members.length;
  }

  // Turns / speaking time / confidence, by walking diar segments
  // through the globalMap.
  //
  //  - Segments whose (chunk, speaker_local) has no fingerprint are
  //    bucketed into Speaker_Unknown rather than dropped, so detected
  //    speech time is still accounted for.
  //  - Segments starting before the chunk's overlap boundary are
  //    duplicates of the prior chunk's tail and are skipped.
  const confidenceSum = new Map();
  const confidenceCount = new Map();
  const chunksByLabel = new Map();
  for (const d of chunkDiarization || []) {
    if (!d || !d.ok || !Array.isArray(d.segments)) continue;
    const boundary = overlapBoundaryOf(d);
    const labelsInThisChunk = new Set();
    for (const seg of d.segments) {
      if (!seg || (seg.start || 0) < boundary) continue;
      let globalId = globalMap.get(`${d.chunkIndex}:${seg.speaker_local}`);
      if (!globalId || !speakers[globalId]) {
        globalId = "Speaker_Unknown";
      }
      const stats = speakers[globalId];
      stats.turns += 1;
      stats.total_speaking_seconds += segmentSeconds(seg);
      if (typeof seg.confidence === "number" && Number.isFinite(seg.confidence)) {
        confidenceSum.set(globalId, (confidenceSum.get(globalId) || 0) + seg.confidence);
        confidenceCount.set(globalId, (confidenceCount.get(globalId) || 0) + 1);
      }
      labelsInThisChunk.add(globalId);
    }
    for (const label of labelsInThisChunk) {
      if (!chunksByLabel.has(label)) chunksByLabel.set(label, new Set());
      chunksByLabel.get(label).add(d.chunkIndex);
    }
  }

  // Drop Speaker_Unknown when it received zero turns so the legend
  // doesn't show an empty "Unknown" entry.
  if (speakers["Speaker_Unknown"].turns === 0) {
    delete speakers["Speaker_Unknown"];
  }
  const hasUnknown = Object.prototype.hasOwnProperty.call(
    speakers,
    "Speaker_Unknown"
  );

  for (const label of Object.keys(speakers)) {
    if (confidenceCount.get(label)) {
      speakers[label].mean_confidence =
        confidenceSum.get(label) / confidenceCount.get(label);
    }
    speakers[label].chunks_appeared_in = (chunksByLabel.get(label) || new Set()).size;
    // Round to one decimal place.
    speakers[label].total_speaking_seconds =
      Math.round(speakers[label].total_speaking_seconds * 10) / 10;
  }

  // Surface the suppression summary so operators can see what happened.
  const reassignedCount = reassignTo.size;
  const unknownClusterCount = unknownClusters.size;
  const finalCount =
    Object.keys(speakers).length - (hasUnknown ? 1 : 0);
  console.log(
    `[clustering] ${clusters.length} raw clusters → ${finalCount} primary + ` +
      `${reassignedCount} reassigned (uncertain) + ${unknownClusterCount} unknown ` +
      `(anchors >= ${anchorMinSec}s, uncertain margin ${uncertainMarginPct}%, ` +
      `unknown < ${smallMaxSec}s)`
  );

  return {
    globalMap,
    uncertaintyMap,
    speakers,
    clusterCount: clusters.length,
    thresholdSimilarity: similarityThreshold,
  };
}
// ─── Public: stamp global speaker labels onto transcript segments ───
//
// Walks the merged transcript segments and assigns each one a
// `speaker` (global ID, e.g. "Speaker_A") + `speaker_confidence`
// based on which diarization segment its midpoint falls inside.
// When no diar segment covers the midpoint, we fall back to nearest-
// midpoint matching with a 5-second window — beyond that, leave the
// speaker null so the frontend can render as "(speaker unknown)" or
// just drop the label.
//
// Mutates the segments in-place (and also returns the array) so
// callers don't have to remember which they got.
// Stamp `speaker`, `speaker_confidence` and `speaker_uncertain` onto
// each transcript segment, in place.
//
//   segments         — transcript segments ({ start, end, ... });
//                      returned unchanged if not a non-empty array
//   chunkDiarization — per-chunk diarization results (entries that are
//                      falsy, not ok, or lack a segments array are
//                      skipped)
//   globalMap        — Map<"chunkIdx:localLabel", globalLabel>; when
//                      missing or empty the segments are left untouched
//   uncertaintyMap   — optional Map<"chunkIdx:localLabel", truthy> for
//                      best-guess attributions
//   returns          — the same `segments` array
//
// Matching: a transcript segment takes the speaker of the diar segment
// containing its midpoint (largest overlap wins when several contain
// it). Otherwise it takes the diar segment with the nearest midpoint,
// if that is within 5 seconds. Otherwise speaker is null.
export function assignSpeakersToSegments(segments, chunkDiarization, globalMap, uncertaintyMap = null) {
  if (!Array.isArray(segments) || segments.length === 0) return segments;
  if (!globalMap || globalMap.size === 0) {
    // Diarization didn't run / produced nothing — leave segments alone.
    return segments;
  }

  // Flatten all per-chunk diar segments into one timeline annotated
  // with the global speaker label and the uncertainty flag.
  const flatDiar = [];
  for (const d of chunkDiarization || []) {
    if (!d || !d.ok || !Array.isArray(d.segments)) continue;
    for (const seg of d.segments) {
      if (!seg) continue;
      const memberKey = `${d.chunkIndex}:${seg.speaker_local}`;
      const globalId = globalMap.get(memberKey);
      if (!globalId) continue;
      flatDiar.push({
        start: seg.start || 0,
        end: seg.end || 0,
        speaker: globalId,
        // Finite numbers only (Number.isFinite does not coerce), the
        // same rule clusterSpeakers applies when averaging confidence.
        confidence: Number.isFinite(seg.confidence) ? seg.confidence : null,
        uncertain: uncertaintyMap ? Boolean(uncertaintyMap.get(memberKey)) : false,
      });
    }
  }
  flatDiar.sort((a, b) => a.start - b.start);

  const NEAREST_FALLBACK_WINDOW_SEC = 5;
  for (const e of segments) {
    if (!e) continue;
    const entryStart = e.start || 0;
    const entryEnd = e.end || 0;
    const mid = (entryStart + entryEnd) / 2;

    // Diar segments that contain the midpoint; the largest overlap
    // with the entry wins. The running best starts at -Infinity, not
    // 0: a zero-duration entry overlaps its containing diar segment by
    // exactly 0, and a `> 0` test would reject that match.
    let containing = null;
    let containingMostOverlap = -Infinity;
    for (const d of flatDiar) {
      if (d.start <= mid && mid <= d.end) {
        const overlap =
          Math.min(d.end, entryEnd) - Math.max(d.start, entryStart);
        if (overlap > containingMostOverlap) {
          containing = d;
          containingMostOverlap = overlap;
        }
      }
    }
    if (containing) {
      e.speaker = containing.speaker;
      e.speaker_confidence = containing.confidence;
      e.speaker_uncertain = Boolean(containing.uncertain);
      continue;
    }

    // Fall back to nearest by midpoint distance (within window).
    let nearest = null;
    let nearestDist = Infinity;
    for (const d of flatDiar) {
      const dist = Math.abs((d.start + d.end) / 2 - mid);
      if (dist < nearestDist) {
        nearestDist = dist;
        nearest = d;
      }
    }
    if (nearest && nearestDist <= NEAREST_FALLBACK_WINDOW_SEC) {
      e.speaker = nearest.speaker;
      e.speaker_confidence = nearest.confidence;
      e.speaker_uncertain = Boolean(nearest.uncertain);
    } else {
      e.speaker = null;
      e.speaker_confidence = null;
      e.speaker_uncertain = false;
    }
  }
  return segments;
}