Add internal-meetings pipeline and post-hoc speaker tools

This commit is contained in:
Keysat
2026-06-13 13:35:53 -05:00
parent 9a2dbf69df
commit 705807e286
15 changed files with 7375 additions and 0 deletions
+624
View File
@@ -0,0 +1,624 @@
// Cross-chunk speaker reconciliation for Phase 1D of the diarization
// pipeline. Spark Control's /api/audio/diarize-chunk hands back
// per-chunk speaker labels ("Speaker_0", "Speaker_1") that are local
// to one chunk — Sortformer has no memory across calls, so Speaker_0
// in chunk 5 might or might not be the same person as Speaker_0 in
// chunk 6. The per-speaker 192-dim TitaNet voice embedding it also
// returns IS persistent though, so we cluster fingerprints across
// chunks via cosine similarity to recover the global speaker identity.
//
// Algorithm: average-linkage agglomerative clustering. Start with N
// singleton clusters (one per fingerprint), repeatedly merge the
// closest pair until no pair is closer than the operator-configured
// threshold. Average-linkage was the choice over single/complete
// because it's robust to outlier embeddings (one bad embedding from
// a noisy chunk doesn't anchor or repel an entire cluster).
//
// Complexity: O(N³) where N = total fingerprints across all chunks.
// Typical: 2 speakers × 21 chunks = 42 → ~74k ops, sub-millisecond.
// Worst case for a 4-hour all-talk-show video: 6 speakers × 48 chunks
// = ~288 → ~24M ops, still milliseconds in Node.
//
// Threshold convention: configured as INTEGER percentage 50-95
// representing cosine similarity. 70 (= 0.70 sim) is NeMo's
// recommended default for TitaNet embeddings. Internally we work
// in cosine DISTANCE (= 1 - similarity) for the merge condition.
// ─── Cosine similarity ──────────────────────────────────────────────
// Standard dot-product / (||a|| * ||b||). Both inputs must be number
// arrays of the same length. Returns 0 for any zero-magnitude input
// to avoid NaN propagation.
/**
 * Cosine similarity of two equal-length numeric vectors:
 * dot(a, b) / (||a|| * ||b||).
 *
 * @param {ArrayLike<number>} a First vector.
 * @param {ArrayLike<number>} b Second vector.
 * @returns {number} The similarity, or 0 when either input is falsy,
 *   the lengths differ, or either squared magnitude is falsy (zero
 *   or NaN) — so a degenerate vector never propagates NaN.
 */
export function cosineSimilarity(a, b) {
  const comparable = a && b && a.length === b.length;
  if (!comparable) return 0;

  const dims = a.length;
  let dotProduct = 0;
  let sqNormA = 0;
  let sqNormB = 0;
  let k = 0;
  while (k < dims) {
    const x = a[k];
    const y = b[k];
    dotProduct += x * y;
    sqNormA += x * x;
    sqNormB += y * y;
    k += 1;
  }

  // A zero (or NaN) squared norm would make the division meaningless.
  if (!sqNormA || !sqNormB) return 0;
  return dotProduct / (Math.sqrt(sqNormA) * Math.sqrt(sqNormB));
}
// ─── Cluster ID naming ──────────────────────────────────────────────
// Speaker_A, Speaker_B, ..., Speaker_Z, Speaker_AA, Speaker_AB...
// Capital letter prefix is intentionally distinct from the chunk-
// local "Speaker_0" / "Speaker_1" naming that Sortformer uses, so
// the source of a label is obvious at a glance.
/**
 * Build the global speaker label for a zero-based cluster index using
 * spreadsheet-column style lettering: 0 → "Speaker_A", 25 →
 * "Speaker_Z", 26 → "Speaker_AA", 27 → "Speaker_AB", and so on.
 *
 * @param {number} index Zero-based position of the speaker.
 * @returns {string} "Speaker_" followed by the letter suffix.
 */
function globalSpeakerLabel(index) {
  let remaining = index;
  let suffix = "";
  // At least one letter is always emitted; each pass prepends the
  // next more-significant letter.
  for (;;) {
    const letter = String.fromCharCode(65 + (remaining % 26));
    suffix = letter + suffix;
    // The "- 1" makes this bijective base-26 (there is no zero digit),
    // so "AA" directly follows "Z".
    remaining = Math.floor(remaining / 26) - 1;
    if (!(remaining >= 0)) break;
  }
  return `Speaker_${suffix}`;
}
// ─── Agglomerative clustering (average linkage) ─────────────────────
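// Input: array of { key, vector, firstOrder } items. `key` is opaque
// to the algorithm — we just propagate it into the returned cluster's
// `members` list for the caller to map back to (chunkIdx, localLabel).
// `firstOrder` is the item's position in the caller's input order.
//
// Output: array of clusters, each
// { members: [keys], vectors: [vecs], firstOrder }, where `firstOrder`
// is the smallest value among the cluster's members.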
// Order: clusters are emitted in the order their FIRST member was
// added to the input — keeps "Speaker_A" assigned to whoever spoke
// first across the audio, which is the natural user expectation.
/**
 * Average-linkage agglomerative clustering over voice fingerprints.
 *
 * Starts with one singleton cluster per item and repeatedly merges the
 * closest pair (by average pairwise cosine distance) until the closest
 * remaining pair is farther apart than `distanceThreshold`.
 *
 * @param {Array<{key: *, vector: number[], firstOrder: number}>} items
 *   Fingerprints to cluster. `key` is opaque and is only propagated
 *   into the result; `firstOrder` is the item's input position.
 * @param {number} distanceThreshold Maximum average cosine distance
 *   (1 - similarity) at which two clusters are still merged.
 * @returns {Array<{members: Array, vectors: number[][], firstOrder: number}>}
 *   Clusters sorted by the smallest `firstOrder` among their members.
 */
function agglomerativeCluster(items, distanceThreshold) {
  if (items.length === 0) return [];
  const n = items.length;

  // Cosine distance between every pair of INPUT vectors, computed
  // exactly once. Cluster-to-cluster distances below are averages of
  // these cached values, so no cosine is recomputed as clusters grow.
  // Stored flat: pairDist[a * n + b] for input indices a and b.
  const pairDist = new Float64Array(n * n);
  for (let a = 0; a < n; a++) {
    for (let b = a + 1; b < n; b++) {
      const d = 1 - cosineSimilarity(items[a].vector, items[b].vector);
      pairDist[a * n + b] = d;
      pairDist[b * n + a] = d;
    }
  }

  const clusters = items.map((it) => ({
    members: [it.key],
    vectors: [it.vector],
    firstOrder: it.firstOrder,
  }));
  // memberIdx[c] holds the input indices of clusters[c]'s vectors, in
  // the same order as clusters[c].vectors. Kept as a parallel array so
  // the returned cluster objects keep their original shape.
  const memberIdx = items.map((_, idx) => [idx]);

  // Average pairwise distance between two clusters. Summation order
  // (first cluster outer, second cluster inner) is deliberate: it keeps
  // floating-point results identical to averaging freshly computed
  // cosine distances in that same order.
  const avgDistance = (ci, cj) => {
    const left = memberIdx[ci];
    const right = memberIdx[cj];
    let sum = 0;
    for (const a of left) {
      const row = a * n;
      for (const b of right) {
        sum += pairDist[row + b];
      }
    }
    return sum / (left.length * right.length);
  };

  while (clusters.length > 1) {
    // Find the closest pair; on ties the first pair in scan order wins.
    let best = { dist: Infinity, i: -1, j: -1 };
    for (let i = 0; i < clusters.length; i++) {
      for (let j = i + 1; j < clusters.length; j++) {
        const d = avgDistance(i, j);
        if (d < best.dist) {
          best = { dist: d, i, j };
        }
      }
    }
    if (best.dist > distanceThreshold) break;

    // Merge clusters[j] into clusters[i], then remove clusters[j].
    const into = clusters[best.i];
    const from = clusters[best.j];
    into.members.push(...from.members);
    into.vectors.push(...from.vectors);
    memberIdx[best.i].push(...memberIdx[best.j]);
    // Keep the earliest firstOrder so the merged cluster sorts to
    // the position of its earliest-appearing member.
    if (from.firstOrder < into.firstOrder) {
      into.firstOrder = from.firstOrder;
    }
    clusters.splice(best.j, 1);
    memberIdx.splice(best.j, 1);
  }

  // Sort by first-appearance order so Speaker_A = whoever spoke first.
  clusters.sort((a, b) => a.firstOrder - b.firstOrder);
  return clusters;
}
/**
 * Average-linkage distance between two clusters: the mean cosine
 * distance (1 - cosine similarity) over every cross-cluster pair of
 * vectors.
 *
 * @param {{vectors: number[][]}} c1 First cluster.
 * @param {{vectors: number[][]}} c2 Second cluster.
 * @returns {number} Mean pairwise distance, or Infinity when there are
 *   no pairs to compare (either cluster has no vectors).
 */
function avgLinkageDistance(c1, c2) {
  const left = c1.vectors;
  const right = c2.vectors;
  let total = 0;
  let pairs = 0;
  for (let p = 0; p < left.length; p++) {
    for (let q = 0; q < right.length; q++) {
      total += 1 - cosineSimilarity(left[p], right[q]);
      pairs++;
    }
  }
  if (pairs === 0) return Infinity;
  return total / pairs;
}
// ─── Public: cluster per-chunk diarization → global speaker map ─────
//
// Input shape (one entry per chunk; failed/skipped chunks are
// silently filtered):
// {
// ok: true,
// chunkIndex: 5,
// startSeconds: 1080,
// segments: [{ start, end, speaker_local, confidence }],
// speakers_local: ["Speaker_0", "Speaker_1"],
// fingerprints: { "Speaker_0": [192 floats], "Speaker_1": [192 floats] }
// }
//
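// Returns:
// {
//   globalMap: Map<"chunkIdx:localLabel", "Speaker_A" | "Speaker_Unknown">,
//   uncertaintyMap: Map<"chunkIdx:localLabel", true>, // members of small
//     clusters that were reassigned to an anchor (best-guess attribution)
//   speakers: {
//     Speaker_A: { turns, total_speaking_seconds, mean_confidence,
//                  chunks_appeared_in, fingerprint_count },
//     ...
//   },
//   clusterCount: 2, // raw cluster count, before suppression
//   thresholdSimilarity: 0.70
// }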
//
// When fingerprintCount === 0 (diarization off or all chunks failed)
// returns an empty result: { globalMap: empty, speakers: {}, ... }.
// Post-cluster suppression tunables. After the initial agglomerative
// cluster pass, walk the resulting clusters and re-categorize the
// small ones to fix the "14 speakers detected when really only 2"
// case Grant flagged on a 2h53m podcast. The clustering algorithm
// itself stays strict (no false-positive merges); suppression is a
// second pass that operates on cluster size + cross-cluster
// similarity to catch the noise-induced spurious clusters.
//
// anchor_min_speaking_sec — a cluster needs at least this much
// total speaking time to be considered an "anchor" (= a real
// speaker). Anchors keep their own global ID + colored chip.
//
// small_cluster_max_speaking_sec — clusters with LESS than this
// are suppression candidates. Brief utterances are common false
// positives (background noise, crosstalk fragments, brief
// intros).
//
// uncertain_margin_pct — a small cluster whose best similarity
// to any anchor is within this many percentage points of the
// main threshold gets REASSIGNED to that anchor and marked
// uncertain (chip shows "?"). Far-from-anchor small clusters
// become Speaker_Unknown.
//
// All three are operator-editable via Settings → Operator hardware.
// Defaults are conservative — no false-positive merges into
// anchors, just reassignment of small clusters that are PROBABLY
// the anchor in noisy conditions. A real 30+ second second speaker
// still gets their own chip; only brief flecks of similar voice get
// pulled in.
// Seconds of total speaking time a cluster needs to count as an anchor.
const DEFAULT_ANCHOR_MIN_SPEAKING_SEC = 30;
// Far-from-anchor clusters with fewer seconds of speaking time than
// this are folded into Speaker_Unknown.
const DEFAULT_SMALL_CLUSTER_MAX_SPEAKING_SEC = 15;
// Percentage points subtracted from the clustering similarity
// threshold to get the similarity at or above which a non-anchor
// cluster is reassigned to its best-matching anchor and flagged
// uncertain.
const DEFAULT_UNCERTAIN_MARGIN_PCT = 10;
// Average cosine similarity between two clusters' vector sets —
// inverse of avgLinkageDistance, expressed as similarity for
// readability in the suppression logic.
/**
 * Average cosine similarity between two clusters' vector sets — the
 * complement of avgLinkageDistance.
 *
 * @param {{vectors: number[][]}} c1 First cluster.
 * @param {{vectors: number[][]}} c2 Second cluster.
 * @returns {number} 1 minus the average-linkage distance (so -Infinity
 *   when that distance is Infinity).
 */
function avgLinkageSimilarity(c1, c2) {
  const distance = avgLinkageDistance(c1, c2);
  return 1 - distance;
}
// Clamp an option value to an integer in [lo, hi], falling back to
// `fallback` when the value is missing or non-finite. Used to make
// out-of-range or absent operator settings safe.
/**
 * Coerce an operator setting to an integer within [lo, hi].
 *
 * Only numbers and non-blank strings are treated as supplied values.
 * Everything else (undefined, null, "", booleans, objects) counts as
 * "missing" and yields `fallback`. This matters because Number(null),
 * Number("") and Number(false) are all 0, which would otherwise be
 * silently clamped to `lo` instead of falling back to the default.
 *
 * @param {*} v Raw setting value.
 * @param {number} fallback Value returned when `v` is missing or not
 *   a finite number.
 * @param {number} lo Inclusive lower bound.
 * @param {number} hi Inclusive upper bound.
 * @returns {number} `fallback`, or `v` rounded to the nearest integer
 *   and clamped to [lo, hi].
 */
function clampInt(v, fallback, lo, hi) {
  const isNonBlankString = typeof v === "string" && v.trim() !== "";
  if (typeof v !== "number" && !isNonBlankString) return fallback;
  const n = Number(v);
  if (!Number.isFinite(n)) return fallback;
  const i = Math.round(n);
  if (i < lo) return lo;
  if (i > hi) return hi;
  return i;
}
/**
 * Cluster per-chunk speaker fingerprints into global speaker
 * identities and summarize speaking activity per global speaker.
 *
 * Steps:
 *   1. Flatten every usable chunk's fingerprints and cluster them
 *      (average-linkage agglomerative, cosine distance).
 *   2. Total each cluster's speaking time; clusters at or above the
 *      anchor minimum are "anchors".
 *   3. Suppression (only when at least one anchor exists): a non-anchor
 *      cluster whose best average similarity to an anchor reaches
 *      (threshold - uncertain margin) is reassigned to that anchor and
 *      flagged uncertain; otherwise, if it has less speaking time than
 *      the small-cluster maximum, it becomes Speaker_Unknown; otherwise
 *      it stays its own speaker.
 *   4. Surviving clusters are labelled Speaker_A, Speaker_B, ... in
 *      first-appearance order; reassigned clusters inherit their
 *      anchor's label.
 *   5. Diarization segments are walked to accumulate turns, speaking
 *      time and confidence. Segments whose speaker has no global label
 *      are counted under Speaker_Unknown rather than dropped.
 *
 * Segments that start before a chunk's `chunkOverlapBoundarySec` are
 * skipped in BOTH step 2 and step 5, so anchor decisions are made on
 * the same de-duplicated speaking time that is reported.
 *
 * Side effect: writes one "[clustering] ..." summary line via
 * console.log when there is at least one fingerprint.
 *
 * @param {Array<object>} chunkDiarization Per-chunk diarization
 *   results. Entries that are falsy or have a falsy `ok` are ignored.
 *   Fields read: `chunkIndex`, `fingerprints`, `segments`
 *   (`start`, `end`, `speaker_local`, `confidence`) and the optional
 *   `chunkOverlapBoundarySec`.
 * @param {number} [clusterThresholdPct=70] Cosine-similarity merge
 *   threshold as a percentage; clamped to [50, 95], non-finite → 70.
 * @param {object} [options] Suppression tunables; null is treated as
 *   "no options".
 * @param {number} [options.anchorMinSpeakingSec] Clamped to [5, 120].
 * @param {number} [options.smallClusterMaxSpeakingSec] Clamped to [1, 60].
 * @param {number} [options.uncertainMarginPct] Clamped to [0, 30].
 * @returns {{globalMap: Map<string, string>,
 *            uncertaintyMap: Map<string, boolean>,
 *            speakers: Object<string, object>,
 *            clusterCount: number,
 *            thresholdSimilarity: number}}
 *   `globalMap` / `uncertaintyMap` are keyed by "chunkIndex:localLabel".
 *   `clusterCount` is the raw cluster count before suppression. With no
 *   usable fingerprints the maps and `speakers` are empty.
 */
export function clusterSpeakers(
  chunkDiarization,
  clusterThresholdPct = 70,
  options = {}
) {
  // The default parameter only covers `undefined`; an explicit null
  // would otherwise throw on the first property read below.
  const opts = options || {};
  const chunks = chunkDiarization || [];

  // Use a Number.isFinite-guarded fallback rather than the `|| 70`
  // idiom — the latter substitutes 70 for ANY falsy value including
  // 0 (a valid input we want to clamp to 50, not silently bump up).
  const rawPct = Number(clusterThresholdPct);
  const pct = Math.max(50, Math.min(95, Number.isFinite(rawPct) ? rawPct : 70));
  const similarityThreshold = pct / 100;
  const distanceThreshold = 1 - similarityThreshold;

  // Operator-tunable suppression thresholds, clamped so a hand-edited
  // config with an out-of-range value still produces sane behavior.
  const anchorMinSec = clampInt(
    opts.anchorMinSpeakingSec,
    DEFAULT_ANCHOR_MIN_SPEAKING_SEC,
    5,
    120
  );
  const smallMaxSec = clampInt(
    opts.smallClusterMaxSpeakingSec,
    DEFAULT_SMALL_CLUSTER_MAX_SPEAKING_SEC,
    1,
    60
  );
  const uncertainMarginPct = clampInt(
    opts.uncertainMarginPct,
    DEFAULT_UNCERTAIN_MARGIN_PCT,
    0,
    30
  );
  const uncertainSimThreshold = Math.max(
    0,
    similarityThreshold - uncertainMarginPct / 100
  );

  // Timestamp before which a chunk's segments duplicate the prior
  // chunk's tail. 0 (= no dedup) when the chunk carries no numeric
  // boundary.
  const overlapBoundaryOf = (d) =>
    typeof d.chunkOverlapBoundarySec === "number"
      ? d.chunkOverlapBoundarySec
      : 0;
  const emptySpeakerStats = () => ({
    turns: 0,
    total_speaking_seconds: 0,
    mean_confidence: null,
    chunks_appeared_in: 0,
    fingerprint_count: 0,
  });

  // ─── Flatten fingerprints into the clustering input ──────────────
  // Preserve insertion order so the first-appearance speaker gets
  // Speaker_A.
  const items = [];
  let order = 0;
  for (const d of chunks) {
    if (!d || !d.ok || !d.fingerprints) continue;
    for (const [localLabel, vector] of Object.entries(d.fingerprints)) {
      if (!Array.isArray(vector) || vector.length === 0) continue;
      items.push({
        key: `${d.chunkIndex}:${localLabel}`,
        vector,
        firstOrder: order++,
      });
    }
  }
  if (items.length === 0) {
    return {
      globalMap: new Map(),
      uncertaintyMap: new Map(),
      speakers: {},
      clusterCount: 0,
      thresholdSimilarity: similarityThreshold,
    };
  }

  const clusters = agglomerativeCluster(items, distanceThreshold);

  // ─── First pass: speaking time per cluster ───────────────────────
  // Needed BEFORE building the global map so the suppression pass can
  // identify anchors.
  const clusterIdxByMember = new Map();
  for (let i = 0; i < clusters.length; i++) {
    for (const memberKey of clusters[i].members) {
      clusterIdxByMember.set(memberKey, i);
    }
  }
  const totalSecsByCluster = new Array(clusters.length).fill(0);
  for (const d of chunks) {
    if (!d || !d.ok || !Array.isArray(d.segments)) continue;
    const boundary = overlapBoundaryOf(d);
    for (const seg of d.segments) {
      if (!seg) continue;
      // Same overlap dedup as the reporting pass below: speech in the
      // overlap zone was already counted via the prior chunk, and
      // counting it twice would let a cluster reach anchor size on
      // duplicated audio.
      if ((seg.start || 0) < boundary) continue;
      const ci = clusterIdxByMember.get(`${d.chunkIndex}:${seg.speaker_local}`);
      if (ci === undefined) continue;
      totalSecsByCluster[ci] += Math.max(0, (seg.end || 0) - (seg.start || 0));
    }
  }

  // ─── Second pass: identify anchors + plan suppression ────────────
  const isAnchorIdx = new Array(clusters.length).fill(false);
  const anchorIdxs = [];
  for (let i = 0; i < clusters.length; i++) {
    if (totalSecsByCluster[i] >= anchorMinSec) {
      isAnchorIdx[i] = true;
      anchorIdxs.push(i);
    }
  }
  // reassignTo: cluster idx → anchor cluster idx that absorbs it
  //   (attribution flagged uncertain).
  // unknownClusters: cluster idxs whose members map to Speaker_Unknown.
  // Non-anchor clusters in neither collection stay as their own speaker.
  const reassignTo = new Map();
  const unknownClusters = new Set();
  if (anchorIdxs.length >= 1) {
    for (let i = 0; i < clusters.length; i++) {
      if (isAnchorIdx[i]) continue;
      // Find the best anchor by average cosine similarity.
      let bestAnchorIdx = -1;
      let bestSim = -Infinity;
      for (const ai of anchorIdxs) {
        const sim = avgLinkageSimilarity(clusters[i], clusters[ai]);
        if (sim > bestSim) {
          bestSim = sim;
          bestAnchorIdx = ai;
        }
      }
      if (bestAnchorIdx >= 0 && bestSim >= uncertainSimThreshold) {
        // Close to an anchor (within the uncertain margin of the main
        // threshold) → reassign to that anchor with uncertainty.
        reassignTo.set(i, bestAnchorIdx);
      } else if (totalSecsByCluster[i] < smallMaxSec) {
        // Small + far from every anchor → Unknown. All such clusters
        // share one Speaker_Unknown pseudo-speaker so the legend does
        // not fill with brief unidentified entries.
        unknownClusters.add(i);
      }
      // else: at least smallMaxSec of speech and far from every anchor
      // → keep as its own speaker (plausibly a real additional person).
    }
  }

  // ─── Build the final cluster-label ordering ──────────────────────
  // Anchors + kept-as-own clusters get labels in first-appearance
  // order; reassigned + unknown clusters do not get their own label.
  const ordered = [];
  for (let i = 0; i < clusters.length; i++) {
    if (reassignTo.has(i) || unknownClusters.has(i)) continue;
    ordered.push({ idx: i, firstOrder: clusters[i].firstOrder });
  }
  ordered.sort((a, b) => a.firstOrder - b.firstOrder);
  const labelByOrigIdx = new Map();
  for (let j = 0; j < ordered.length; j++) {
    labelByOrigIdx.set(ordered[j].idx, globalSpeakerLabel(j));
  }
  // Reassigned clusters inherit their anchor's label.
  for (const [i, ai] of reassignTo) {
    const anchorLabel = labelByOrigIdx.get(ai);
    if (anchorLabel) labelByOrigIdx.set(i, anchorLabel);
  }

  // ─── Build globalMap + uncertaintyMap ────────────────────────────
  const globalMap = new Map();
  const uncertaintyMap = new Map();
  for (let i = 0; i < clusters.length; i++) {
    if (unknownClusters.has(i)) {
      for (const memberKey of clusters[i].members) {
        globalMap.set(memberKey, "Speaker_Unknown");
      }
      continue;
    }
    const label = labelByOrigIdx.get(i);
    if (!label) continue;
    const isReassigned = reassignTo.has(i);
    for (const memberKey of clusters[i].members) {
      globalMap.set(memberKey, label);
      if (isReassigned) uncertaintyMap.set(memberKey, true);
    }
  }

  // ─── Build the per-speaker summary ───────────────────────────────
  const speakers = {};
  for (const label of new Set(labelByOrigIdx.values())) {
    speakers[label] = emptySpeakerStats();
  }
  // Speaker_Unknown always exists while accumulating: it receives both
  // suppressed clusters and unmapped segments. It is removed again
  // further down if it ends up with zero turns.
  speakers["Speaker_Unknown"] = emptySpeakerStats();

  // Fingerprint counts: reassigned clusters' fingerprints count toward
  // their anchor's total.
  for (let i = 0; i < clusters.length; i++) {
    const targetLabel = unknownClusters.has(i)
      ? "Speaker_Unknown"
      : labelByOrigIdx.get(i);
    if (!targetLabel || !speakers[targetLabel]) continue;
    speakers[targetLabel].fingerprint_count += clusters[i].members.length;
  }

  // Turns / speaking time / confidence, by walking diarization
  // segments through the globalMap.
  const confidenceSum = new Map();
  const confidenceCount = new Map();
  const chunksByLabel = new Map();
  for (const d of chunks) {
    if (!d || !d.ok || !Array.isArray(d.segments)) continue;
    // Chunk-overlap dedup: skip segments that start before the
    // boundary — they duplicate the prior chunk's tail.
    const boundary = overlapBoundaryOf(d);
    const labelsInThisChunk = new Set();
    for (const seg of d.segments) {
      if (!seg) continue;
      if ((seg.start || 0) < boundary) continue;
      let globalId = globalMap.get(`${d.chunkIndex}:${seg.speaker_local}`);
      // Unmapped (no fingerprint for this speaker_local in this chunk)
      // → bucket into Speaker_Unknown rather than drop, so the speech
      // time is still accounted for.
      if (!globalId || !speakers[globalId]) {
        globalId = "Speaker_Unknown";
      }
      const stats = speakers[globalId];
      stats.turns += 1;
      stats.total_speaking_seconds += Math.max(
        0,
        (seg.end || 0) - (seg.start || 0)
      );
      if (typeof seg.confidence === "number" && Number.isFinite(seg.confidence)) {
        confidenceSum.set(globalId, (confidenceSum.get(globalId) || 0) + seg.confidence);
        confidenceCount.set(globalId, (confidenceCount.get(globalId) || 0) + 1);
      }
      labelsInThisChunk.add(globalId);
    }
    for (const label of labelsInThisChunk) {
      if (!chunksByLabel.has(label)) chunksByLabel.set(label, new Set());
      chunksByLabel.get(label).add(d.chunkIndex);
    }
  }

  // Drop Speaker_Unknown when it received no turns so the legend does
  // not show an empty "Unknown" entry.
  const hasUnknown = speakers["Speaker_Unknown"].turns > 0;
  if (!hasUnknown) {
    delete speakers["Speaker_Unknown"];
  }

  for (const label of Object.keys(speakers)) {
    if (confidenceCount.get(label)) {
      speakers[label].mean_confidence =
        confidenceSum.get(label) / confidenceCount.get(label);
    }
    speakers[label].chunks_appeared_in = (chunksByLabel.get(label) || new Set()).size;
    // Round to one decimal place.
    speakers[label].total_speaking_seconds =
      Math.round(speakers[label].total_speaking_seconds * 10) / 10;
  }

  // Logging: surface the suppression summary so operators can see
  // what happened.
  const reassignedCount = reassignTo.size;
  const unknownClusterCount = unknownClusters.size;
  const finalCount = Object.keys(speakers).length - (hasUnknown ? 1 : 0);
  console.log(
    `[clustering] ${clusters.length} raw clusters → ${finalCount} primary + ` +
      `${reassignedCount} reassigned (uncertain) + ${unknownClusterCount} unknown ` +
      `(anchors >= ${anchorMinSec}s, uncertain margin ${uncertainMarginPct}%, ` +
      `unknown < ${smallMaxSec}s)`
  );

  return {
    globalMap,
    uncertaintyMap,
    speakers,
    clusterCount: clusters.length,
    thresholdSimilarity: similarityThreshold,
  };
}
// ─── Public: stamp global speaker labels onto transcript segments ───
//
// Walks the merged transcript segments and assigns each one a
// `speaker` (global ID, e.g. "Speaker_A") + `speaker_confidence`
// based on which diarization segment its midpoint falls inside.
// When no diar segment covers the midpoint, we fall back to nearest-
// midpoint matching with a 5-second window — beyond that, leave the
// speaker null so the frontend can render as "(speaker unknown)" or
// just drop the label.
//
// Mutates the segments in-place (and also returns the array) so
// callers don't have to remember which they got.
/**
 * Stamp global speaker labels onto transcript segments.
 *
 * For each transcript segment, the diarization segment containing its
 * midpoint is chosen (the one with the largest time overlap when
 * several contain it). If none contains the midpoint, the diarization
 * segment with the nearest midpoint is used when it lies within 5
 * seconds; otherwise the speaker is left null.
 *
 * Sets `speaker`, `speaker_confidence` and `speaker_uncertain` on each
 * segment. Mutates the segments in place and also returns the array.
 *
 * @param {Array<object>} segments Transcript segments with `start` /
 *   `end`. Returned untouched when not a non-empty array.
 * @param {Array<object>} chunkDiarization Per-chunk diarization
 *   results; entries that are falsy, have a falsy `ok`, or have no
 *   `segments` array are ignored.
 * @param {Map<string, string>} globalMap "chunkIndex:localLabel" →
 *   global speaker label. When missing or empty, segments are returned
 *   untouched.
 * @param {Map<string, boolean>|null} [uncertaintyMap=null] Keys whose
 *   attribution is best-guess; matching segments get
 *   `speaker_uncertain: true`.
 * @returns {Array<object>} The same `segments` value that was passed in.
 */
export function assignSpeakersToSegments(segments, chunkDiarization, globalMap, uncertaintyMap = null) {
  if (!Array.isArray(segments) || segments.length === 0) return segments;
  if (!globalMap || globalMap.size === 0) {
    // Diarization didn't run / produced nothing — leave segments alone.
    return segments;
  }

  // Flatten all per-chunk diar segments into one timeline annotated
  // with the global speaker label + the uncertainty flag. Segments
  // whose speaker has no global label are left out.
  const flatDiar = [];
  for (const d of chunkDiarization || []) {
    if (!d || !d.ok || !Array.isArray(d.segments)) continue;
    for (const seg of d.segments) {
      if (!seg) continue;
      const memberKey = `${d.chunkIndex}:${seg.speaker_local}`;
      const globalId = globalMap.get(memberKey);
      if (!globalId) continue;
      flatDiar.push({
        start: seg.start || 0,
        end: seg.end || 0,
        speaker: globalId,
        confidence: typeof seg.confidence === "number" ? seg.confidence : null,
        uncertain: uncertaintyMap ? !!uncertaintyMap.get(memberKey) : false,
      });
    }
  }
  flatDiar.sort((a, b) => a.start - b.start);

  const NEAREST_FALLBACK_WINDOW_SEC = 5;
  for (const e of segments) {
    if (!e) continue;
    const eStart = e.start || 0;
    const eEnd = e.end || 0;
    const mid = (eStart + eEnd) / 2;

    // Prefer a diar segment that contains the midpoint. Scoring starts
    // at -Infinity (not 0) so a zero-duration transcript segment, whose
    // overlap with any diar segment is exactly 0, still matches the
    // segment that contains it instead of falling through to the
    // nearest-midpoint fallback.
    let containing = null;
    let containingMostOverlap = -Infinity;
    for (const d of flatDiar) {
      if (d.start <= mid && mid <= d.end) {
        // Score by overlap with the entry to handle multiple diar
        // segments straddling one transcript line (chunk overlap
        // zones, choppy speaker turns).
        const overlap = Math.min(d.end, eEnd) - Math.max(d.start, eStart);
        if (overlap > containingMostOverlap) {
          containing = d;
          containingMostOverlap = overlap;
        }
      }
    }
    if (containing) {
      e.speaker = containing.speaker;
      e.speaker_confidence = containing.confidence;
      e.speaker_uncertain = !!containing.uncertain;
      continue;
    }

    // Fall back to nearest by midpoint distance (within the window).
    let nearest = null;
    let nearestDist = Infinity;
    for (const d of flatDiar) {
      const dist = Math.abs((d.start + d.end) / 2 - mid);
      if (dist < nearestDist) {
        nearestDist = dist;
        nearest = d;
      }
    }
    if (nearest && nearestDist <= NEAREST_FALLBACK_WINDOW_SEC) {
      e.speaker = nearest.speaker;
      e.speaker_confidence = nearest.confidence;
      e.speaker_uncertain = !!nearest.uncertain;
    } else {
      e.speaker = null;
      e.speaker_confidence = null;
      e.speaker_uncertain = false;
    }
  }
  return segments;
}