// Cross-chunk speaker reconciliation for Phase 1D of the diarization // pipeline. Spark Control's /api/audio/diarize-chunk hands back // per-chunk speaker labels ("Speaker_0", "Speaker_1") that are local // to one chunk — Sortformer has no memory across calls, so Speaker_0 // in chunk 5 might or might not be the same person as Speaker_0 in // chunk 6. The per-speaker 192-dim TitaNet voice embedding it also // returns IS persistent though, so we cluster fingerprints across // chunks via cosine similarity to recover the global speaker identity. // // Algorithm: average-linkage agglomerative clustering. Start with N // singleton clusters (one per fingerprint), repeatedly merge the // closest pair until no pair is closer than the operator-configured // threshold. Average-linkage was the choice over single/complete // because it's robust to outlier embeddings (one bad embedding from // a noisy chunk doesn't anchor or repel an entire cluster). // // Complexity: O(N³) where N = total fingerprints across all chunks. // Typical: 2 speakers × 21 chunks = 42 → ~74k ops, sub-millisecond. // Worst case for a 4-hour all-talk-show video: 6 speakers × 48 chunks // = ~288 → ~24M ops, still milliseconds in Node. // // Threshold convention: configured as INTEGER percentage 50-95 // representing cosine similarity. 70 (= 0.70 sim) is NeMo's // recommended default for TitaNet embeddings. Internally we work // in cosine DISTANCE (= 1 - similarity) for the merge condition. // ─── Cosine similarity ────────────────────────────────────────────── // Standard dot-product / (||a|| * ||b||). Both inputs must be number // arrays of the same length. Returns 0 for any zero-magnitude input // to avoid NaN propagation. 
/**
 * Cosine similarity of two equal-length numeric vectors.
 *
 * @param {number[]} a
 * @param {number[]} b
 * @returns {number} dot(a, b) / (||a|| * ||b||), or 0 when either input is
 *   missing, the lengths differ, either magnitude is zero/NaN, or the
 *   quotient is not finite (e.g. an Infinity component). Never returns NaN.
 */
export function cosineSimilarity(a, b) {
  if (!a || !b || a.length !== b.length) return 0;
  let dot = 0;
  let na = 0;
  let nb = 0;
  for (let i = 0; i < a.length; i++) {
    dot += a[i] * b[i];
    na += a[i] * a[i];
    nb += b[i] * b[i];
  }
  // `!na` also catches NaN magnitudes, not just zero.
  if (!na || !nb) return 0;
  const sim = dot / (Math.sqrt(na) * Math.sqrt(nb));
  // Infinity components produce Infinity / Infinity = NaN; keep the
  // "no NaN propagation" contract for those as well.
  return Number.isFinite(sim) ? sim : 0;
}

// ─── Cluster ID naming ──────────────────────────────────────────────
// Speaker_A, Speaker_B, ..., Speaker_Z, Speaker_AA, Speaker_AB...
// Capital letter prefix is intentionally distinct from the chunk-
// local "Speaker_0" / "Speaker_1" naming that Sortformer uses, so
// the source of a label is obvious at a glance.

/**
 * Map a zero-based cluster index to a spreadsheet-column-style label.
 *
 * @param {number} index Zero-based, non-negative integer.
 * @returns {string} "Speaker_A" for 0, "Speaker_Z" for 25, "Speaker_AA" for 26, ...
 */
function globalSpeakerLabel(index) {
  let n = index;
  let s = "";
  do {
    s = String.fromCharCode(65 + (n % 26)) + s;
    // Bijective base-26: the "- 1" is what makes 26 → "AA" rather than "BA".
    n = Math.floor(n / 26) - 1;
  } while (n >= 0);
  return "Speaker_" + s;
}

// ─── Agglomerative clustering (average linkage) ─────────────────────
// Input: array of { key, vector } pairs. `key` is opaque to the
// algorithm — we just propagate it into the returned cluster's
// `members` list for the caller to map back to (chunkIdx, localLabel).
//
// Output: array of clusters, each { members: [keys], vectors: [vecs] }.
// Order: clusters are emitted in the order their FIRST member was
// added to the input — keeps "Speaker_A" assigned to whoever spoke
// first across the audio, which is the natural user expectation.

/**
 * Average-linkage agglomerative clustering over cosine distance.
 *
 * @param {Array<{key: *, vector: number[], firstOrder?: number}>} items
 *   One entry per fingerprint. `firstOrder` is optional; when it is not a
 *   finite number the item's position in `items` is used instead.
 * @param {number} distanceThreshold Merging stops once the closest pair of
 *   clusters is farther apart than this (cosine distance = 1 - similarity).
 * @returns {Array<{members: Array, vectors: number[][], firstOrder: number}>}
 *   Clusters sorted by the smallest `firstOrder` among their members.
 */
function agglomerativeCluster(items, distanceThreshold) {
  if (!items || items.length === 0) return [];

  const clusters = items.map((it, idx) => ({
    members: [it.key],
    vectors: [it.vector],
    // Preserve original input order for the stable sort at the end.
    firstOrder: Number.isFinite(it.firstOrder) ? it.firstOrder : idx,
  }));

  // Cache pairwise distances so they are not recomputed as clusters grow.
  // distSum[i][j] = SUM of cosine distances between every vector of
  // cluster i and every vector of cluster j; dividing by
  // (size_i * size_j) gives the average-linkage distance. Each cosine is
  // evaluated exactly once here; merges only add cached sums together.
  const n = clusters.length;
  const distSum = [];
  for (let i = 0; i < n; i++) distSum.push(new Array(n).fill(0));
  for (let i = 0; i < n; i++) {
    for (let j = i + 1; j < n; j++) {
      const d = 1 - cosineSimilarity(items[i].vector, items[j].vector);
      distSum[i][j] = d;
      distSum[j][i] = d;
    }
  }

  while (clusters.length > 1) {
    let bestDist = Infinity;
    let bestI = -1;
    let bestJ = -1;
    for (let i = 0; i < clusters.length; i++) {
      const sizeI = clusters[i].vectors.length;
      for (let j = i + 1; j < clusters.length; j++) {
        const d = distSum[i][j] / (sizeI * clusters[j].vectors.length);
        if (d < bestDist) {
          bestDist = d;
          bestI = i;
          bestJ = j;
        }
      }
    }
    // bestI stays -1 only when no pair had a finite distance; never index
    // clusters[-1] in that case.
    if (bestI < 0 || bestDist > distanceThreshold) break;

    // Merge clusters[bestJ] into clusters[bestI], then remove clusters[bestJ].
    const into = clusters[bestI];
    const from = clusters[bestJ];
    into.members.push(...from.members);
    into.vectors.push(...from.vectors);
    // Keep the earliest firstOrder so the merged cluster sorts to
    // the position of its earliest-appearing member.
    if (from.firstOrder < into.firstOrder) {
      into.firstOrder = from.firstOrder;
    }

    // The distance sum from the merged cluster to any other cluster k is
    // simply the sum of the two parts' distance sums to k.
    for (let k = 0; k < clusters.length; k++) {
      if (k === bestI || k === bestJ) continue;
      const merged = distSum[bestI][k] + distSum[bestJ][k];
      distSum[bestI][k] = merged;
      distSum[k][bestI] = merged;
    }

    // Drop row and column bestJ so matrix indices keep matching `clusters`.
    clusters.splice(bestJ, 1);
    distSum.splice(bestJ, 1);
    for (const row of distSum) row.splice(bestJ, 1);
  }

  // Sort by first-appearance order so Speaker_A = whoever spoke first.
  clusters.sort((a, b) => a.firstOrder - b.firstOrder);
  return clusters;
}

/**
 * Mean cosine distance over every cross pair of vectors in two clusters.
 *
 * @param {{vectors: number[][]}} c1
 * @param {{vectors: number[][]}} c2
 * @returns {number} Average of (1 - cosineSimilarity) over all pairs, or
 *   Infinity when either cluster has no vectors.
 */
function avgLinkageDistance(c1, c2) {
  let sum = 0;
  let count = 0;
  for (const v1 of c1.vectors) {
    for (const v2 of c2.vectors) {
      sum += 1 - cosineSimilarity(v1, v2);
      count += 1;
    }
  }
  return count > 0 ? sum / count : Infinity;
}

// ─── Public: cluster per-chunk diarization → global speaker map ─────
//
// Input shape (one entry per chunk; failed/skipped chunks are
// silently filtered):
//   {
//     ok: true,
//     chunkIndex: 5,
//     startSeconds: 1080,
//     segments: [{ start, end, speaker_local, confidence }],
//     speakers_local: ["Speaker_0", "Speaker_1"],
//     fingerprints: { "Speaker_0": [192 floats], "Speaker_1": [192 floats] }
//   }
//
// Returns:
//   {
//     globalMap: Map<"chunkIdx:localLabel", "Speaker_A">,
//     uncertaintyMap: Map<"chunkIdx:localLabel", true>,
//     speakers: {
//       Speaker_A: { turns, total_speaking_seconds, mean_confidence,
//                    chunks_appeared_in, fingerprint_count },
//       ...
//     },
//     clusterCount: 2,
//     thresholdSimilarity: 0.70
//   }
//
// When fingerprintCount === 0 (diarization off or all chunks failed)
// returns an empty result: { globalMap: empty, speakers: {}, ... }.

// Post-cluster suppression tunables.
// After the initial agglomerative cluster pass, walk the resulting
// clusters and re-categorize the small ones to fix the "14 speakers
// detected when really only 2" case Grant flagged on a 2h53m podcast.
// The clustering algorithm itself stays strict (no false-positive
// merges); suppression is a second pass that operates on cluster
// size + cross-cluster similarity to catch the noise-induced spurious
// clusters.
//
//   anchor_min_speaking_sec — a cluster needs at least this much
//     total speaking time to be considered an "anchor" (= a real
//     speaker). Anchors keep their own global ID + colored chip.
//
//   small_cluster_max_speaking_sec — clusters with LESS than this
//     are suppression candidates. Brief utterances are common false
//     positives (background noise, crosstalk fragments, brief
//     intros).
//
//   uncertain_margin_pct — a small cluster whose best similarity
//     to any anchor is within this many percentage points of the
//     main threshold gets REASSIGNED to that anchor and marked
//     uncertain (chip shows "?"). Far-from-anchor small clusters
//     become Speaker_Unknown.
//
// All three are operator-editable via Settings → Operator hardware.
// Defaults are conservative — no false-positive merges into
// anchors, just reassignment of small clusters that are PROBABLY
// the anchor in noisy conditions. A real 30+ second second speaker
// still gets their own chip; only brief flecks of similar voice get
// pulled in.
const DEFAULT_ANCHOR_MIN_SPEAKING_SEC = 30;
const DEFAULT_SMALL_CLUSTER_MAX_SPEAKING_SEC = 15;
const DEFAULT_UNCERTAIN_MARGIN_PCT = 10;

// Average cosine similarity between two clusters' vector sets —
// inverse of avgLinkageDistance, expressed as similarity for
// readability in the suppression logic.
function avgLinkageSimilarity(c1, c2) {
  return 1 - avgLinkageDistance(c1, c2);
}

// Interpret a setting as a finite number, or null when it is absent or
// unusable. Only real numbers and non-blank numeric strings count:
// Number(null), Number("") and Number(true) are all finite, so a bare
// Number() coercion would mistake "missing" for 0 or 1.
function finiteNumberOrNull(v) {
  if (typeof v === "number") return Number.isFinite(v) ? v : null;
  if (typeof v === "string" && v.trim() !== "") {
    const n = Number(v);
    return Number.isFinite(n) ? n : null;
  }
  return null;
}

// Clamp an option value to an integer in [lo, hi], falling back to
// `fallback` when the value is missing (undefined, null, blank string,
// non-numeric type) or non-finite. Used to make out-of-range or absent
// operator settings safe.
function clampInt(v, fallback, lo, hi) {
  const n = finiteNumberOrNull(v);
  if (n === null) return fallback;
  const i = Math.round(n);
  if (i < lo) return lo;
  if (i > hi) return hi;
  return i;
}

// Fresh zeroed per-speaker summary record.
function emptySpeakerStats() {
  return {
    turns: 0,
    total_speaking_seconds: 0,
    mean_confidence: null,
    chunks_appeared_in: 0,
    fingerprint_count: 0,
  };
}

// Timestamp before which a chunk's segments are treated as duplicates of
// the previous chunk's tail. 0 (no dedup) when the chunk carries no
// numeric chunkOverlapBoundarySec.
function chunkOverlapBoundaryOf(d) {
  return typeof d.chunkOverlapBoundarySec === "number"
    ? d.chunkOverlapBoundarySec
    : 0;
}

/**
 * Cluster per-chunk speaker fingerprints into global speaker identities and
 * summarize speaking statistics per global speaker.
 *
 * @param {Array<object>} chunkDiarization Per-chunk diarization results
 *   ({ ok, chunkIndex, segments, fingerprints, chunkOverlapBoundarySec? }).
 *   Falsy entries, entries with `ok` falsy, and null segments are skipped.
 * @param {number|string} [clusterThresholdPct=70] Cosine-similarity merge
 *   threshold as a percentage; clamped to [50, 95]. Missing / null /
 *   non-numeric values fall back to 70.
 * @param {object} [options]
 * @param {number} [options.anchorMinSpeakingSec] Clamped to [5, 120].
 * @param {number} [options.smallClusterMaxSpeakingSec] Clamped to [1, 60].
 * @param {number} [options.uncertainMarginPct] Clamped to [0, 30].
 * @returns {{globalMap: Map<string, string>, uncertaintyMap: Map<string, boolean>,
 *   speakers: Object<string, object>, clusterCount: number,
 *   thresholdSimilarity: number}}
 *   `clusterCount` is the number of raw clusters before suppression.
 */
export function clusterSpeakers(
  chunkDiarization,
  clusterThresholdPct = 70,
  options = {}
) {
  // The default parameter only covers `undefined`; an explicit `null`
  // options object must not throw on property access.
  const opts = options || {};
  const chunks = chunkDiarization || [];

  // Guarded fallback rather than the `|| 70` idiom — the latter
  // substitutes 70 for ANY falsy value including 0 (a valid input we
  // want to clamp to 50, not silently bump up). null / "" / non-numeric
  // values count as "not provided" and get the default.
  const rawPct = finiteNumberOrNull(clusterThresholdPct);
  const pct = Math.max(50, Math.min(95, rawPct === null ? 70 : rawPct));
  const similarityThreshold = pct / 100;
  const distanceThreshold = 1 - similarityThreshold;

  // Operator-tunable suppression thresholds — accept from options
  // with guarded fallbacks to the conservative defaults. Clamped to the
  // same ranges the admin.js SETTINGS_RANGES enforces on save, so a
  // hand-edited relay-config.json with an out-of-range value still
  // produces sane behavior.
  const anchorMinSec = clampInt(
    opts.anchorMinSpeakingSec,
    DEFAULT_ANCHOR_MIN_SPEAKING_SEC,
    5,
    120
  );
  const smallMaxSec = clampInt(
    opts.smallClusterMaxSpeakingSec,
    DEFAULT_SMALL_CLUSTER_MAX_SPEAKING_SEC,
    1,
    60
  );
  const uncertainMarginPct = clampInt(
    opts.uncertainMarginPct,
    DEFAULT_UNCERTAIN_MARGIN_PCT,
    0,
    30
  );
  const uncertainSimThreshold = Math.max(
    0,
    similarityThreshold - uncertainMarginPct / 100
  );

  // Flatten fingerprints into the clustering input. Preserve insertion
  // order so the first-appearance speaker gets Speaker_A.
  const items = [];
  let order = 0;
  for (const d of chunks) {
    if (!d || !d.ok || !d.fingerprints) continue;
    for (const [localLabel, vector] of Object.entries(d.fingerprints)) {
      if (!Array.isArray(vector) || vector.length === 0) continue;
      items.push({
        key: `${d.chunkIndex}:${localLabel}`,
        vector,
        firstOrder: order++,
      });
    }
  }

  if (items.length === 0) {
    return {
      globalMap: new Map(),
      uncertaintyMap: new Map(),
      speakers: {},
      clusterCount: 0,
      thresholdSimilarity: similarityThreshold,
    };
  }

  const clusters = agglomerativeCluster(items, distanceThreshold);

  // ─── First pass: compute speaking time per cluster ───────────────
  // We need cluster sizes BEFORE building the global map so the
  // suppression pass can identify anchors. Walk all diar segments,
  // map each (chunkIdx, speaker_local) to its cluster index, and
  // accumulate seg duration.
  const clusterIdxByMember = new Map();
  for (let i = 0; i < clusters.length; i++) {
    for (const memberKey of clusters[i].members) {
      clusterIdxByMember.set(memberKey, i);
    }
  }
  const totalSecsByCluster = new Array(clusters.length).fill(0);
  for (const d of chunks) {
    if (!d || !d.ok || !Array.isArray(d.segments)) continue;
    // Apply the same chunk-overlap dedup as the summary pass below so
    // anchor decisions are made on the speaking totals that get reported,
    // not on totals inflated by speech counted in two overlapping chunks.
    const overlapBoundary = chunkOverlapBoundaryOf(d);
    for (const seg of d.segments) {
      if (!seg) continue;
      if ((seg.start || 0) < overlapBoundary) continue;
      const ci = clusterIdxByMember.get(`${d.chunkIndex}:${seg.speaker_local}`);
      if (ci === undefined) continue;
      totalSecsByCluster[ci] += Math.max(0, (seg.end || 0) - (seg.start || 0));
    }
  }

  // ─── Second pass: identify anchors + plan suppression ────────────
  const isAnchorIdx = new Array(clusters.length).fill(false);
  const anchorIdxs = [];
  for (let i = 0; i < clusters.length; i++) {
    if (totalSecsByCluster[i] >= anchorMinSec) {
      isAnchorIdx[i] = true;
      anchorIdxs.push(i);
    }
  }

  // reassignTo[i] = anchor cluster idx that absorbs i (marked uncertain).
  // unknownClusters: set of cluster idxs whose members map to
  // Speaker_Unknown.
  // Non-anchor clusters in neither collection stay as their own speaker
  // (large + low-sim — plausibly a real third+ speaker even if rare).
  const reassignTo = new Map();
  const unknownClusters = new Set();

  if (anchorIdxs.length >= 1) {
    for (let i = 0; i < clusters.length; i++) {
      if (isAnchorIdx[i]) continue;
      // Find best anchor by average cosine similarity.
      let bestAnchorIdx = -1;
      let bestSim = -Infinity;
      for (const ai of anchorIdxs) {
        const sim = avgLinkageSimilarity(clusters[i], clusters[ai]);
        if (sim > bestSim) {
          bestSim = sim;
          bestAnchorIdx = ai;
        }
      }
      const totalSecs = totalSecsByCluster[i];
      if (bestAnchorIdx >= 0 && bestSim >= uncertainSimThreshold) {
        // Close-to-anchor (within uncertain_margin_pct of
        // main threshold) → reassign to anchor with uncertainty.
        // Chip will show e.g. "MH?" so the user knows attribution
        // is best-guess.
        reassignTo.set(i, bestAnchorIdx);
      } else if (totalSecs < smallMaxSec) {
        // Small + far-from-anchor → Unknown. Brief noise / crosstalk /
        // background voices that don't confidently match either main
        // speaker. Merged into a single Speaker_Unknown pseudo-
        // speaker so the legend doesn't fill with N "unidentified
        // brief speaker" entries.
        unknownClusters.add(i);
      }
      // else: large (>= smallMaxSec) + far-from-anchor → keep as own
      // speaker. Plausibly a real third+ person who's distinct from the
      // main anchors. Rare but possible.
    }
  }

  // ─── Build the final cluster-label ordering ──────────────────────
  // Order by first-appearance: whoever spoke first in the audio
  // gets Speaker_A. Anchors + kept-as-own clusters get labels;
  // reassigned + unknown clusters don't.
  const ordered = [];
  for (let i = 0; i < clusters.length; i++) {
    if (reassignTo.has(i) || unknownClusters.has(i)) continue;
    ordered.push({ idx: i, firstOrder: clusters[i].firstOrder });
  }
  ordered.sort((a, b) => a.firstOrder - b.firstOrder);
  const labelByOrigIdx = new Map();
  for (let j = 0; j < ordered.length; j++) {
    labelByOrigIdx.set(ordered[j].idx, globalSpeakerLabel(j));
  }
  // Reassigned clusters inherit their anchor's label.
  for (const [i, ai] of reassignTo) {
    const anchorLabel = labelByOrigIdx.get(ai);
    if (anchorLabel) labelByOrigIdx.set(i, anchorLabel);
  }

  // ─── Build globalMap + uncertaintyMap ────────────────────────────
  const globalMap = new Map();
  const uncertaintyMap = new Map();
  for (let i = 0; i < clusters.length; i++) {
    if (unknownClusters.has(i)) {
      for (const memberKey of clusters[i].members) {
        globalMap.set(memberKey, "Speaker_Unknown");
      }
      continue;
    }
    const label = labelByOrigIdx.get(i);
    if (!label) continue;
    const isReassigned = reassignTo.has(i);
    for (const memberKey of clusters[i].members) {
      globalMap.set(memberKey, label);
      if (isReassigned) uncertaintyMap.set(memberKey, true);
    }
  }

  // ─── Build the per-speaker summary ───────────────────────────────
  const speakers = {};
  for (const label of new Set(labelByOrigIdx.values())) {
    speakers[label] = emptySpeakerStats();
  }
  // Speaker_Unknown always exists as a destination bucket while we
  // accumulate: it receives suppressed clusters AND any segment whose
  // local speaker has no fingerprint. It is dropped again further down
  // if it ends up with zero turns.
  speakers["Speaker_Unknown"] = emptySpeakerStats();

  // Accumulate fingerprint counts from clusters that contributed to
  // each label. Reassigned clusters' fingerprints count toward
  // their anchor's total.
  for (let i = 0; i < clusters.length; i++) {
    const targetLabel = unknownClusters.has(i)
      ? "Speaker_Unknown"
      : labelByOrigIdx.get(i);
    if (!targetLabel || !speakers[targetLabel]) continue;
    speakers[targetLabel].fingerprint_count += clusters[i].members.length;
  }

  // Accumulate turns / speaking time / confidence by walking
  // diarization segments through the globalMap.
  //
  // Two earlier bugs fixed here:
  //
  // 1. UNFINGERPRINTED SEGMENTS WERE SILENTLY DROPPED. The cluster-
  //    index lookup only contains entries that have fingerprints —
  //    but Sortformer routinely emits diar segments for speakers
  //    whose voice TitaNet didn't aggregate a fingerprint for (very
  //    brief utterances, soft speech, overlapped speech). Those
  //    segments produced globalId === undefined and were dropped,
  //    so the "total speech detected" totals understated reality
  //    dramatically. A 1.5-hour call could show as "34% speech
  //    detected" when in reality 70%+ of the audio was speech that
  //    Sortformer found but TitaNet couldn't fingerprint. Now: an
  //    unmapped segment falls through to Speaker_Unknown so the
  //    time still gets accounted for. (The chip on the per-line
  //    transcript still shows "?" for those segments — they just
  //    aren't claimed by a wrong cluster.)
  //
  // 2. CHUNK-OVERLAP DOUBLE-COUNTING. Transcribe segments are
  //    deduped at the chunk overlap boundary (handled in
  //    hardware.js), but diar segments are not. Until this fix the
  //    same speech in a 30s overlap zone got counted toward TWO
  //    chunks, inflating speaker totals. Dedup here using the
  //    chunk's chunkOverlapBoundarySec when present.
  const confidenceSum = new Map();
  const confidenceCount = new Map();
  const chunksByLabel = new Map();

  for (const d of chunks) {
    if (!d || !d.ok || !Array.isArray(d.segments)) continue;
    // Chunk-overlap dedup: skip any segment whose GLOBAL start time
    // sits in the prior chunk's tail (which this chunk overlapped).
    // The boundary is the global timestamp BEFORE which segments in
    // this chunk are duplicates of the prior chunk's tail. Comes from
    // the chunk planner (audio-meta.js) and is 0 for chunk 0 (no prior
    // chunk → no dedup).
    const overlapBoundary = chunkOverlapBoundaryOf(d);
    const labelsInThisChunk = new Set();
    for (const seg of d.segments) {
      if (!seg) continue;
      if ((seg.start || 0) < overlapBoundary) continue;
      let globalId = globalMap.get(`${d.chunkIndex}:${seg.speaker_local}`);
      // Unmapped (no fingerprint produced for this speaker_local in
      // this chunk) → bucket into Speaker_Unknown rather than drop.
      if (!globalId || !speakers[globalId]) {
        globalId = "Speaker_Unknown";
      }
      const stats = speakers[globalId];
      stats.turns += 1;
      stats.total_speaking_seconds += Math.max(
        0,
        (seg.end || 0) - (seg.start || 0)
      );
      if (typeof seg.confidence === "number" && Number.isFinite(seg.confidence)) {
        confidenceSum.set(globalId, (confidenceSum.get(globalId) || 0) + seg.confidence);
        confidenceCount.set(globalId, (confidenceCount.get(globalId) || 0) + 1);
      }
      labelsInThisChunk.add(globalId);
    }
    for (const label of labelsInThisChunk) {
      if (!chunksByLabel.has(label)) chunksByLabel.set(label, new Set());
      chunksByLabel.get(label).add(d.chunkIndex);
    }
  }

  // If Speaker_Unknown ended up with zero turns (no unmapped + no
  // suppressed clusters contributed), drop it from the legend so we
  // don't show "? Unknown 0:00" by default.
  if (speakers["Speaker_Unknown"].turns === 0) {
    delete speakers["Speaker_Unknown"];
  }
  const hasUnknown = Boolean(speakers["Speaker_Unknown"]);

  for (const label of Object.keys(speakers)) {
    if (confidenceCount.get(label)) {
      speakers[label].mean_confidence =
        confidenceSum.get(label) / confidenceCount.get(label);
    }
    speakers[label].chunks_appeared_in = (chunksByLabel.get(label) || new Set()).size;
    // Report speaking time to one decimal place.
    speakers[label].total_speaking_seconds =
      Math.round(speakers[label].total_speaking_seconds * 10) / 10;
  }

  // Logging: surface the suppression summary so operators can see
  // what happened ("14 clusters → 2 anchors + 12 small/uncertain
  // suppressed").
  const reassignedCount = reassignTo.size;
  const unknownClusterCount = unknownClusters.size;
  const finalCount = Object.keys(speakers).length - (hasUnknown ? 1 : 0);
  console.log(
    `[clustering] ${clusters.length} raw clusters → ${finalCount} primary + ` +
      `${reassignedCount} reassigned (uncertain) + ${unknownClusterCount} unknown ` +
      `(anchors >= ${anchorMinSec}s, uncertain margin ${uncertainMarginPct}%, ` +
      `unknown < ${smallMaxSec}s)`
  );

  return {
    globalMap,
    uncertaintyMap,
    speakers,
    clusterCount: clusters.length,
    thresholdSimilarity: similarityThreshold,
  };
}

// ─── Public: stamp global speaker labels onto transcript segments ───
//
// Walks the merged transcript segments and assigns each one a
// `speaker` (global ID, e.g. "Speaker_A") + `speaker_confidence`
// based on which diarization segment its midpoint falls inside.
// When no diar segment covers the midpoint, we fall back to nearest-
// midpoint matching with a 5-second window — beyond that, leave the
// speaker null so the frontend can render as "(speaker unknown)" or
// just drop the label.
//
// Mutates the segments in-place (and also returns the array) so
// callers don't have to remember which they got.
/**
 * Stamp `speaker`, `speaker_confidence` and `speaker_uncertain` onto each
 * transcript segment, in place.
 *
 * @param {Array<object>} segments Transcript segments with `start` / `end`.
 * @param {Array<object>} chunkDiarization Per-chunk diarization results
 *   ({ ok, chunkIndex, segments: [{ start, end, speaker_local, confidence }] }).
 * @param {Map<string, string>} globalMap "chunkIdx:localLabel" → global label.
 * @param {Map<string, boolean>|null} [uncertaintyMap=null] Member keys whose
 *   attribution is a best guess.
 * @returns {Array<object>} The same `segments` array that was passed in.
 *   Returned untouched when it is empty / not an array or `globalMap` is empty.
 */
export function assignSpeakersToSegments(segments, chunkDiarization, globalMap, uncertaintyMap = null) {
  if (!Array.isArray(segments) || segments.length === 0) return segments;
  if (!globalMap || globalMap.size === 0) {
    // Diarization didn't run / produced nothing — leave segments
    // alone. Caller can detect this state via speakers === {}.
    return segments;
  }

  // Flatten all per-chunk diar segments into one timeline annotated
  // with the global speaker label + the suppression-uncertainty
  // flag (set when a small cluster was reassigned to an anchor —
  // chip will show "?" so the user knows attribution is best-guess).
  const flatDiar = [];
  for (const d of chunkDiarization || []) {
    if (!d || !d.ok || !Array.isArray(d.segments)) continue;
    for (const seg of d.segments) {
      if (!seg) continue;
      const memberKey = `${d.chunkIndex}:${seg.speaker_local}`;
      const globalId = globalMap.get(memberKey);
      // Diar segments with no global label are left out of the timeline.
      if (!globalId) continue;
      const uncertain = uncertaintyMap ? !!uncertaintyMap.get(memberKey) : false;
      flatDiar.push({
        start: seg.start || 0,
        end: seg.end || 0,
        speaker: globalId,
        confidence: typeof seg.confidence === "number" ? seg.confidence : null,
        uncertain,
      });
    }
  }
  flatDiar.sort((a, b) => a.start - b.start);

  const NEAREST_FALLBACK_WINDOW_SEC = 5;

  for (const e of segments) {
    // Nothing to stamp onto a null / primitive entry.
    if (!e || typeof e !== "object") continue;

    const eStart = e.start || 0;
    const eEnd = e.end || 0;
    const mid = (eStart + eEnd) / 2;

    // Find diar segments that contain the midpoint. Score by overlap with
    // the entry to handle the rare case of multiple diar segments
    // straddling one transcript line (chunk overlap zones, choppy speaker
    // turns). The running best starts at -Infinity, not 0, so a
    // zero-duration transcript segment (overlap === 0) is still claimed by
    // the diar segment that contains it.
    let containing = null;
    let containingMostOverlap = -Infinity;
    for (const d of flatDiar) {
      if (d.start <= mid && mid <= d.end) {
        const overlap = Math.min(d.end, eEnd) - Math.max(d.start, eStart);
        if (overlap > containingMostOverlap) {
          containing = d;
          containingMostOverlap = overlap;
        }
      }
    }
    if (containing) {
      e.speaker = containing.speaker;
      e.speaker_confidence = containing.confidence;
      e.speaker_uncertain = !!containing.uncertain;
      continue;
    }

    // Fall back to nearest by midpoint distance (within window).
    let nearest = null;
    let nearestDist = Infinity;
    for (const d of flatDiar) {
      const dMid = (d.start + d.end) / 2;
      const dist = Math.abs(dMid - mid);
      if (dist < nearestDist) {
        nearestDist = dist;
        nearest = d;
      }
    }
    if (nearest && nearestDist <= NEAREST_FALLBACK_WINDOW_SEC) {
      e.speaker = nearest.speaker;
      e.speaker_confidence = nearest.confidence;
      e.speaker_uncertain = !!nearest.uncertain;
    } else {
      e.speaker = null;
      e.speaker_confidence = null;
      e.speaker_uncertain = false;
    }
  }
  return segments;
}