Add internal-meetings pipeline and post-hoc speaker tools
This commit is contained in:
@@ -0,0 +1,624 @@
|
||||
// Cross-chunk speaker reconciliation for Phase 1D of the diarization
|
||||
// pipeline. Spark Control's /api/audio/diarize-chunk hands back
|
||||
// per-chunk speaker labels ("Speaker_0", "Speaker_1") that are local
|
||||
// to one chunk — Sortformer has no memory across calls, so Speaker_0
|
||||
// in chunk 5 might or might not be the same person as Speaker_0 in
|
||||
// chunk 6. The per-speaker 192-dim TitaNet voice embedding it also
|
||||
// returns IS persistent though, so we cluster fingerprints across
|
||||
// chunks via cosine similarity to recover the global speaker identity.
|
||||
//
|
||||
// Algorithm: average-linkage agglomerative clustering. Start with N
|
||||
// singleton clusters (one per fingerprint), repeatedly merge the
|
||||
// closest pair until no pair is closer than the operator-configured
|
||||
// threshold. Average-linkage was the choice over single/complete
|
||||
// because it's robust to outlier embeddings (one bad embedding from
|
||||
// a noisy chunk doesn't anchor or repel an entire cluster).
|
||||
//
|
||||
// Complexity: O(N³) where N = total fingerprints across all chunks.
|
||||
// Typical: 2 speakers × 21 chunks = 42 → ~74k ops, sub-millisecond.
|
||||
// Worst case for a 4-hour all-talk-show video: 6 speakers × 48 chunks
|
||||
// = ~288 → ~24M ops, still milliseconds in Node.
|
||||
//
|
||||
// Threshold convention: configured as INTEGER percentage 50-95
|
||||
// representing cosine similarity. 70 (= 0.70 sim) is NeMo's
|
||||
// recommended default for TitaNet embeddings. Internally we work
|
||||
// in cosine DISTANCE (= 1 - similarity) for the merge condition.
|
||||
|
||||
// ─── Cosine similarity ──────────────────────────────────────────────
|
||||
// Standard dot-product / (||a|| * ||b||). Both inputs must be number
|
||||
// arrays of the same length. Returns 0 for any zero-magnitude input
|
||||
// to avoid NaN propagation.
|
||||
export function cosineSimilarity(a, b) {
|
||||
if (!a || !b || a.length !== b.length) return 0;
|
||||
let dot = 0;
|
||||
let na = 0;
|
||||
let nb = 0;
|
||||
for (let i = 0; i < a.length; i++) {
|
||||
dot += a[i] * b[i];
|
||||
na += a[i] * a[i];
|
||||
nb += b[i] * b[i];
|
||||
}
|
||||
if (!na || !nb) return 0;
|
||||
return dot / (Math.sqrt(na) * Math.sqrt(nb));
|
||||
}
|
||||
|
||||
// ─── Cluster ID naming ──────────────────────────────────────────────
|
||||
// Speaker_A, Speaker_B, ..., Speaker_Z, Speaker_AA, Speaker_AB...
|
||||
// Capital letter prefix is intentionally distinct from the chunk-
|
||||
// local "Speaker_0" / "Speaker_1" naming that Sortformer uses, so
|
||||
// the source of a label is obvious at a glance.
|
||||
function globalSpeakerLabel(index) {
|
||||
let n = index;
|
||||
let s = "";
|
||||
do {
|
||||
s = String.fromCharCode(65 + (n % 26)) + s;
|
||||
n = Math.floor(n / 26) - 1;
|
||||
} while (n >= 0);
|
||||
return "Speaker_" + s;
|
||||
}
|
||||
|
||||
// ─── Agglomerative clustering (average linkage) ─────────────────────
|
||||
// Input: array of { key, vector } pairs. `key` is opaque to the
|
||||
// algorithm — we just propagate it into the returned cluster's
|
||||
// `members` list for the caller to map back to (chunkIdx, localLabel).
|
||||
//
|
||||
// Output: array of clusters, each { members: [keys], vectors: [vecs] }.
|
||||
// Order: clusters are emitted in the order their FIRST member was
|
||||
// added to the input — keeps "Speaker_A" assigned to whoever spoke
|
||||
// first across the audio, which is the natural user expectation.
|
||||
function agglomerativeCluster(items, distanceThreshold) {
|
||||
if (items.length === 0) return [];
|
||||
const clusters = items.map((it) => ({
|
||||
members: [it.key],
|
||||
vectors: [it.vector],
|
||||
firstOrder: it.firstOrder, // preserve original input order for stable sort later
|
||||
}));
|
||||
// Cache pairwise singleton distances to avoid recomputing as
|
||||
// clusters grow. distMatrix[i][j] = avg cosine distance between
|
||||
// cluster i's vectors and cluster j's vectors. Recomputed on merge.
|
||||
while (clusters.length > 1) {
|
||||
let best = { dist: Infinity, i: -1, j: -1 };
|
||||
for (let i = 0; i < clusters.length; i++) {
|
||||
for (let j = i + 1; j < clusters.length; j++) {
|
||||
const d = avgLinkageDistance(clusters[i], clusters[j]);
|
||||
if (d < best.dist) {
|
||||
best = { dist: d, i, j };
|
||||
}
|
||||
}
|
||||
}
|
||||
if (best.dist > distanceThreshold) break;
|
||||
// Merge clusters[j] into clusters[i], remove clusters[j]
|
||||
clusters[best.i].members.push(...clusters[best.j].members);
|
||||
clusters[best.i].vectors.push(...clusters[best.j].vectors);
|
||||
// Keep the earliest firstOrder so the merged cluster sorts to
|
||||
// the position of its earliest-appearing member.
|
||||
if (clusters[best.j].firstOrder < clusters[best.i].firstOrder) {
|
||||
clusters[best.i].firstOrder = clusters[best.j].firstOrder;
|
||||
}
|
||||
clusters.splice(best.j, 1);
|
||||
}
|
||||
// Sort by first-appearance order so Speaker_A = whoever spoke first.
|
||||
clusters.sort((a, b) => a.firstOrder - b.firstOrder);
|
||||
return clusters;
|
||||
}
|
||||
|
||||
function avgLinkageDistance(c1, c2) {
|
||||
let sum = 0;
|
||||
let count = 0;
|
||||
for (const v1 of c1.vectors) {
|
||||
for (const v2 of c2.vectors) {
|
||||
sum += 1 - cosineSimilarity(v1, v2);
|
||||
count += 1;
|
||||
}
|
||||
}
|
||||
return count > 0 ? sum / count : Infinity;
|
||||
}
|
||||
|
||||
// ─── Public: cluster per-chunk diarization → global speaker map ─────
|
||||
//
|
||||
// Input shape (one entry per chunk; failed/skipped chunks are
|
||||
// silently filtered):
|
||||
// {
|
||||
// ok: true,
|
||||
// chunkIndex: 5,
|
||||
// startSeconds: 1080,
|
||||
// segments: [{ start, end, speaker_local, confidence }],
|
||||
// speakers_local: ["Speaker_0", "Speaker_1"],
|
||||
// fingerprints: { "Speaker_0": [192 floats], "Speaker_1": [192 floats] }
|
||||
// }
|
||||
//
|
||||
// Returns:
|
||||
// {
|
||||
// globalMap: Map<"chunkIdx:localLabel", "Speaker_A">,
|
||||
// speakers: {
|
||||
// Speaker_A: { turns, total_speaking_seconds, mean_confidence,
|
||||
// chunks_appeared_in, fingerprint_count },
|
||||
// ...
|
||||
// },
|
||||
// clusterCount: 2,
|
||||
// thresholdSimilarity: 0.70
|
||||
// }
|
||||
//
|
||||
// When fingerprintCount === 0 (diarization off or all chunks failed)
|
||||
// returns an empty result: { globalMap: empty, speakers: {}, ... }.
|
||||
// Post-cluster suppression tunables. After the initial agglomerative
|
||||
// cluster pass, walk the resulting clusters and re-categorize the
|
||||
// small ones to fix the "14 speakers detected when really only 2"
|
||||
// case Grant flagged on a 2h53m podcast. The clustering algorithm
|
||||
// itself stays strict (no false-positive merges); suppression is a
|
||||
// second pass that operates on cluster size + cross-cluster
|
||||
// similarity to catch the noise-induced spurious clusters.
|
||||
//
|
||||
// anchor_min_speaking_sec — a cluster needs at least this much
|
||||
// total speaking time to be considered an "anchor" (= a real
|
||||
// speaker). Anchors keep their own global ID + colored chip.
|
||||
//
|
||||
// small_cluster_max_speaking_sec — clusters with LESS than this
|
||||
// are suppression candidates. Brief utterances are common false
|
||||
// positives (background noise, crosstalk fragments, brief
|
||||
// intros).
|
||||
//
|
||||
// uncertain_margin_pct — a small cluster whose best similarity
|
||||
// to any anchor is within this many percentage points of the
|
||||
// main threshold gets REASSIGNED to that anchor and marked
|
||||
// uncertain (chip shows "?"). Far-from-anchor small clusters
|
||||
// become Speaker_Unknown.
|
||||
//
|
||||
// All three are operator-editable via Settings → Operator hardware.
|
||||
// Defaults are conservative — no false-positive merges into
|
||||
// anchors, just reassignment of small clusters that are PROBABLY
|
||||
// the anchor in noisy conditions. A real 30+ second second speaker
|
||||
// still gets their own chip; only brief flecks of similar voice get
|
||||
// pulled in.
|
||||
const DEFAULT_ANCHOR_MIN_SPEAKING_SEC = 30;
|
||||
const DEFAULT_SMALL_CLUSTER_MAX_SPEAKING_SEC = 15;
|
||||
const DEFAULT_UNCERTAIN_MARGIN_PCT = 10;
|
||||
|
||||
// Average cosine similarity between two clusters' vector sets —
|
||||
// inverse of avgLinkageDistance, expressed as similarity for
|
||||
// readability in the suppression logic.
|
||||
function avgLinkageSimilarity(c1, c2) {
|
||||
return 1 - avgLinkageDistance(c1, c2);
|
||||
}
|
||||
|
||||
// Clamp an option value to an integer in [lo, hi], falling back to
|
||||
// `fallback` when the value is missing or non-finite. Used to make
|
||||
// out-of-range or absent operator settings safe.
|
||||
function clampInt(v, fallback, lo, hi) {
|
||||
const n = Number(v);
|
||||
if (!Number.isFinite(n)) return fallback;
|
||||
const i = Math.round(n);
|
||||
if (i < lo) return lo;
|
||||
if (i > hi) return hi;
|
||||
return i;
|
||||
}
|
||||
|
||||
export function clusterSpeakers(
|
||||
chunkDiarization,
|
||||
clusterThresholdPct = 70,
|
||||
options = {}
|
||||
) {
|
||||
// Use Number.isFinite-guarded fallback rather than the `|| 70`
|
||||
// idiom — the latter substitutes 70 for ANY falsy value including
|
||||
// 0 (a valid input we want to clamp to 50, not silently bump up).
|
||||
const raw = Number(clusterThresholdPct);
|
||||
const pct = Math.max(50, Math.min(95, Number.isFinite(raw) ? raw : 70));
|
||||
const similarityThreshold = pct / 100;
|
||||
const distanceThreshold = 1 - similarityThreshold;
|
||||
|
||||
// Operator-tunable suppression thresholds — accept from options
|
||||
// with Number.isFinite-guarded fallbacks to the conservative
|
||||
// defaults. Clamped to the same ranges the admin.js SETTINGS_RANGES
|
||||
// enforces on save, so a hand-edited relay-config.json with an
|
||||
// out-of-range value still produces sane behavior.
|
||||
const anchorMinSec = clampInt(
|
||||
options.anchorMinSpeakingSec,
|
||||
DEFAULT_ANCHOR_MIN_SPEAKING_SEC,
|
||||
5,
|
||||
120
|
||||
);
|
||||
const smallMaxSec = clampInt(
|
||||
options.smallClusterMaxSpeakingSec,
|
||||
DEFAULT_SMALL_CLUSTER_MAX_SPEAKING_SEC,
|
||||
1,
|
||||
60
|
||||
);
|
||||
const uncertainMarginPct = clampInt(
|
||||
options.uncertainMarginPct,
|
||||
DEFAULT_UNCERTAIN_MARGIN_PCT,
|
||||
0,
|
||||
30
|
||||
);
|
||||
const uncertainSimThreshold = Math.max(
|
||||
0,
|
||||
similarityThreshold - uncertainMarginPct / 100
|
||||
);
|
||||
|
||||
// Flatten fingerprints into the clustering input. Preserve insertion
|
||||
// order so the first-appearance speaker gets Speaker_A.
|
||||
const items = [];
|
||||
let order = 0;
|
||||
for (const d of chunkDiarization || []) {
|
||||
if (!d || !d.ok || !d.fingerprints) continue;
|
||||
for (const [localLabel, vector] of Object.entries(d.fingerprints)) {
|
||||
if (!Array.isArray(vector) || vector.length === 0) continue;
|
||||
items.push({
|
||||
key: `${d.chunkIndex}:${localLabel}`,
|
||||
vector,
|
||||
firstOrder: order++,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
if (items.length === 0) {
|
||||
return {
|
||||
globalMap: new Map(),
|
||||
uncertaintyMap: new Map(),
|
||||
speakers: {},
|
||||
clusterCount: 0,
|
||||
thresholdSimilarity: similarityThreshold,
|
||||
};
|
||||
}
|
||||
|
||||
const clusters = agglomerativeCluster(items, distanceThreshold);
|
||||
|
||||
// ─── First pass: compute speaking time per cluster ───────────────
|
||||
// We need cluster sizes BEFORE building the global map so the
|
||||
// suppression pass can identify anchors. Walk all diar segments,
|
||||
// map each (chunkIdx, speaker_local) to its cluster index, and
|
||||
// accumulate seg duration.
|
||||
const clusterIdxByMember = new Map();
|
||||
for (let i = 0; i < clusters.length; i++) {
|
||||
for (const memberKey of clusters[i].members) {
|
||||
clusterIdxByMember.set(memberKey, i);
|
||||
}
|
||||
}
|
||||
const totalSecsByCluster = new Array(clusters.length).fill(0);
|
||||
for (const d of chunkDiarization || []) {
|
||||
if (!d || !d.ok || !Array.isArray(d.segments)) continue;
|
||||
for (const seg of d.segments) {
|
||||
const key = `${d.chunkIndex}:${seg.speaker_local}`;
|
||||
const ci = clusterIdxByMember.get(key);
|
||||
if (ci === undefined) continue;
|
||||
const dur = Math.max(0, (seg.end || 0) - (seg.start || 0));
|
||||
totalSecsByCluster[ci] += dur;
|
||||
}
|
||||
}
|
||||
|
||||
// ─── Second pass: identify anchors + plan suppression ────────────
|
||||
const isAnchorIdx = new Array(clusters.length).fill(false);
|
||||
const anchorIdxs = [];
|
||||
for (let i = 0; i < clusters.length; i++) {
|
||||
if (totalSecsByCluster[i] >= anchorMinSec) {
|
||||
isAnchorIdx[i] = true;
|
||||
anchorIdxs.push(i);
|
||||
}
|
||||
}
|
||||
|
||||
// reassignTo[i] = anchor cluster idx that absorbs i; uncertain
|
||||
// unknownClusters: set of cluster idxs whose members map to
|
||||
// Speaker_Unknown. keptAsOwn: non-anchor cluster idxs that stay
|
||||
// as their own speaker (large + low-sim — plausibly a real
|
||||
// third+ speaker even if rare).
|
||||
const reassignTo = new Map();
|
||||
const unknownClusters = new Set();
|
||||
if (anchorIdxs.length >= 1) {
|
||||
for (let i = 0; i < clusters.length; i++) {
|
||||
if (isAnchorIdx[i]) continue;
|
||||
// Find best anchor by average cosine similarity
|
||||
let bestAnchorIdx = -1;
|
||||
let bestSim = -Infinity;
|
||||
for (const ai of anchorIdxs) {
|
||||
const sim = avgLinkageSimilarity(clusters[i], clusters[ai]);
|
||||
if (sim > bestSim) {
|
||||
bestSim = sim;
|
||||
bestAnchorIdx = ai;
|
||||
}
|
||||
}
|
||||
const totalSecs = totalSecsByCluster[i];
|
||||
if (bestAnchorIdx >= 0 && bestSim >= uncertainSimThreshold) {
|
||||
// Close-to-anchor (within uncertain_margin_pct of
|
||||
// main threshold) → reassign to anchor with uncertainty.
|
||||
// Chip will show e.g. "MH?" so the user knows attribution
|
||||
// is best-guess.
|
||||
reassignTo.set(i, bestAnchorIdx);
|
||||
} else if (totalSecs < smallMaxSec) {
|
||||
// Small + far-from-anchor → Unknown. Brief noise / crosstalk /
|
||||
// background voices that don't confidently match either main
|
||||
// speaker. Merged into a single Speaker_Unknown pseudo-
|
||||
// speaker so the legend doesn't fill with N "unidentified
|
||||
// brief speaker" entries.
|
||||
unknownClusters.add(i);
|
||||
}
|
||||
// else: large (>= 15s) + far-from-anchor → keep as own speaker.
|
||||
// Plausibly a real third+ person who's distinct from the main
|
||||
// anchors. Rare but possible.
|
||||
}
|
||||
}
|
||||
|
||||
// ─── Build the final cluster-label ordering ──────────────────────
|
||||
// Order by first-appearance: whoever spoke first in the audio
|
||||
// gets Speaker_A. Anchors + kept-as-own clusters get labels;
|
||||
// reassigned + unknown clusters don't.
|
||||
const ordered = [];
|
||||
for (let i = 0; i < clusters.length; i++) {
|
||||
if (reassignTo.has(i) || unknownClusters.has(i)) continue;
|
||||
ordered.push({ idx: i, firstOrder: clusters[i].firstOrder });
|
||||
}
|
||||
ordered.sort((a, b) => a.firstOrder - b.firstOrder);
|
||||
const labelByOrigIdx = new Map();
|
||||
for (let j = 0; j < ordered.length; j++) {
|
||||
labelByOrigIdx.set(ordered[j].idx, globalSpeakerLabel(j));
|
||||
}
|
||||
// Reassigned clusters inherit their anchor's label
|
||||
for (const [i, ai] of reassignTo) {
|
||||
const anchorLabel = labelByOrigIdx.get(ai);
|
||||
if (anchorLabel) labelByOrigIdx.set(i, anchorLabel);
|
||||
}
|
||||
|
||||
// ─── Build globalMap + uncertaintyMap ────────────────────────────
|
||||
const globalMap = new Map();
|
||||
const uncertaintyMap = new Map();
|
||||
let hasUnknown = false;
|
||||
for (let i = 0; i < clusters.length; i++) {
|
||||
if (unknownClusters.has(i)) {
|
||||
for (const memberKey of clusters[i].members) {
|
||||
globalMap.set(memberKey, "Speaker_Unknown");
|
||||
hasUnknown = true;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
const label = labelByOrigIdx.get(i);
|
||||
if (!label) continue;
|
||||
const isReassigned = reassignTo.has(i);
|
||||
for (const memberKey of clusters[i].members) {
|
||||
globalMap.set(memberKey, label);
|
||||
if (isReassigned) uncertaintyMap.set(memberKey, true);
|
||||
}
|
||||
}
|
||||
|
||||
// ─── Build the per-speaker summary ───────────────────────────────
|
||||
const speakers = {};
|
||||
const seenLabels = new Set([...labelByOrigIdx.values()]);
|
||||
for (const label of seenLabels) {
|
||||
speakers[label] = {
|
||||
turns: 0,
|
||||
total_speaking_seconds: 0,
|
||||
mean_confidence: null,
|
||||
chunks_appeared_in: 0,
|
||||
fingerprint_count: 0,
|
||||
};
|
||||
}
|
||||
if (hasUnknown) {
|
||||
speakers["Speaker_Unknown"] = {
|
||||
turns: 0,
|
||||
total_speaking_seconds: 0,
|
||||
mean_confidence: null,
|
||||
chunks_appeared_in: 0,
|
||||
fingerprint_count: 0,
|
||||
};
|
||||
}
|
||||
// Accumulate fingerprint counts from clusters that contributed to
|
||||
// each label. Reassigned clusters' fingerprints count toward
|
||||
// their anchor's total.
|
||||
for (let i = 0; i < clusters.length; i++) {
|
||||
let targetLabel;
|
||||
if (unknownClusters.has(i)) targetLabel = "Speaker_Unknown";
|
||||
else targetLabel = labelByOrigIdx.get(i);
|
||||
if (!targetLabel || !speakers[targetLabel]) continue;
|
||||
speakers[targetLabel].fingerprint_count += clusters[i].members.length;
|
||||
}
|
||||
|
||||
// Accumulate turns / speaking time / confidence by walking
|
||||
// diarization segments through the globalMap.
|
||||
//
|
||||
// Two earlier bugs fixed here:
|
||||
//
|
||||
// 1. UNFINGERPRINTED SEGMENTS WERE SILENTLY DROPPED. The cluster-
|
||||
// index lookup only contains entries that have fingerprints —
|
||||
// but Sortformer routinely emits diar segments for speakers
|
||||
// whose voice TitaNet didn't aggregate a fingerprint for (very
|
||||
// brief utterances, soft speech, overlapped speech). Those
|
||||
// segments produced globalId === undefined and were dropped,
|
||||
// so the "total speech detected" totals understated reality
|
||||
// dramatically. A 1.5-hour call could show as "34% speech
|
||||
// detected" when in reality 70%+ of the audio was speech that
|
||||
// Sortformer found but TitaNet couldn't fingerprint. Now: an
|
||||
// unmapped segment falls through to Speaker_Unknown so the
|
||||
// time still gets accounted for. (The chip on the per-line
|
||||
// transcript still shows "?" for those segments — they just
|
||||
// aren't claimed by a wrong cluster.)
|
||||
//
|
||||
// 2. CHUNK-OVERLAP DOUBLE-COUNTING. Transcribe segments are
|
||||
// deduped at the chunk overlap boundary (handled in
|
||||
// hardware.js), but diar segments are not. Until this fix the
|
||||
// same speech in a 30s overlap zone got counted toward TWO
|
||||
// chunks, inflating speaker totals. Dedup here using the
|
||||
// chunk's overlapBoundarySec when present.
|
||||
const confidenceSum = new Map();
|
||||
const confidenceCount = new Map();
|
||||
const chunksByLabel = new Map();
|
||||
|
||||
// Ensure Speaker_Unknown exists in speakers map before we attribute
|
||||
// any unmapped time to it — clusterSpeakers may have created it
|
||||
// already (via the unknownClusters path) or not (when no clusters
|
||||
// were suppressed). Either way, we want it as a destination bucket.
|
||||
if (!speakers["Speaker_Unknown"]) {
|
||||
speakers["Speaker_Unknown"] = {
|
||||
turns: 0,
|
||||
total_speaking_seconds: 0,
|
||||
mean_confidence: null,
|
||||
chunks_appeared_in: 0,
|
||||
fingerprint_count: 0,
|
||||
};
|
||||
hasUnknown = true;
|
||||
}
|
||||
|
||||
for (const d of chunkDiarization || []) {
|
||||
if (!d || !d.ok || !Array.isArray(d.segments)) continue;
|
||||
// Chunk-overlap dedup: skip any segment whose GLOBAL start time
|
||||
// sits in the prior chunk's tail (which this chunk overlapped).
|
||||
// chunkOverlapBoundary is the global timestamp BEFORE which
|
||||
// segments in this chunk are duplicates of the prior chunk's
|
||||
// tail. Comes from the chunk planner (audio-meta.js) and is
|
||||
// 0 for chunk 0 (no prior chunk → no dedup).
|
||||
const chunkOverlapBoundary =
|
||||
typeof d.chunkOverlapBoundarySec === "number"
|
||||
? d.chunkOverlapBoundarySec
|
||||
: 0;
|
||||
const labelsInThisChunk = new Set();
|
||||
for (const seg of d.segments) {
|
||||
if ((seg.start || 0) < chunkOverlapBoundary) continue;
|
||||
let globalId = globalMap.get(`${d.chunkIndex}:${seg.speaker_local}`);
|
||||
// Unmapped (no fingerprint produced for this speaker_local in
|
||||
// this chunk) → bucket into Speaker_Unknown rather than drop.
|
||||
if (!globalId || !speakers[globalId]) {
|
||||
globalId = "Speaker_Unknown";
|
||||
}
|
||||
speakers[globalId].turns += 1;
|
||||
const segDuration = Math.max(0, (seg.end || 0) - (seg.start || 0));
|
||||
speakers[globalId].total_speaking_seconds += segDuration;
|
||||
if (typeof seg.confidence === "number" && Number.isFinite(seg.confidence)) {
|
||||
confidenceSum.set(globalId, (confidenceSum.get(globalId) || 0) + seg.confidence);
|
||||
confidenceCount.set(globalId, (confidenceCount.get(globalId) || 0) + 1);
|
||||
}
|
||||
labelsInThisChunk.add(globalId);
|
||||
}
|
||||
for (const label of labelsInThisChunk) {
|
||||
if (!chunksByLabel.has(label)) chunksByLabel.set(label, new Set());
|
||||
chunksByLabel.get(label).add(d.chunkIndex);
|
||||
}
|
||||
}
|
||||
|
||||
// If Speaker_Unknown ended up with zero turns (no unmapped + no
|
||||
// suppressed clusters contributed), drop it from the legend so we
|
||||
// don't show "? Unknown 0:00" by default.
|
||||
if (speakers["Speaker_Unknown"] && speakers["Speaker_Unknown"].turns === 0) {
|
||||
delete speakers["Speaker_Unknown"];
|
||||
hasUnknown = false;
|
||||
}
|
||||
for (const label of Object.keys(speakers)) {
|
||||
if (confidenceCount.get(label)) {
|
||||
speakers[label].mean_confidence =
|
||||
confidenceSum.get(label) / confidenceCount.get(label);
|
||||
}
|
||||
speakers[label].chunks_appeared_in = (chunksByLabel.get(label) || new Set()).size;
|
||||
speakers[label].total_speaking_seconds =
|
||||
Math.round(speakers[label].total_speaking_seconds * 10) / 10;
|
||||
}
|
||||
|
||||
// Logging: surface the suppression summary so operators can see
|
||||
// what happened ("14 clusters → 2 anchors + 12 small/uncertain
|
||||
// suppressed").
|
||||
const reassignedCount = reassignTo.size;
|
||||
const unknownClusterCount = unknownClusters.size;
|
||||
const finalCount =
|
||||
Object.keys(speakers).length - (hasUnknown ? 1 : 0);
|
||||
console.log(
|
||||
`[clustering] ${clusters.length} raw clusters → ${finalCount} primary + ` +
|
||||
`${reassignedCount} reassigned (uncertain) + ${unknownClusterCount} unknown ` +
|
||||
`(anchors >= ${anchorMinSec}s, uncertain margin ${uncertainMarginPct}%, ` +
|
||||
`unknown < ${smallMaxSec}s)`
|
||||
);
|
||||
|
||||
return {
|
||||
globalMap,
|
||||
uncertaintyMap,
|
||||
speakers,
|
||||
clusterCount: clusters.length,
|
||||
thresholdSimilarity: similarityThreshold,
|
||||
};
|
||||
}
|
||||
|
||||
// ─── Public: stamp global speaker labels onto transcript segments ───
|
||||
//
|
||||
// Walks the merged transcript segments and assigns each one a
|
||||
// `speaker` (global ID, e.g. "Speaker_A") + `speaker_confidence`
|
||||
// based on which diarization segment its midpoint falls inside.
|
||||
// When no diar segment covers the midpoint, we fall back to nearest-
|
||||
// midpoint matching with a 5-second window — beyond that, leave the
|
||||
// speaker null so the frontend can render as "(speaker unknown)" or
|
||||
// just drop the label.
|
||||
//
|
||||
// Mutates the segments in-place (and also returns the array) so
|
||||
// callers don't have to remember which they got.
|
||||
export function assignSpeakersToSegments(segments, chunkDiarization, globalMap, uncertaintyMap = null) {
|
||||
if (!Array.isArray(segments) || segments.length === 0) return segments;
|
||||
if (!globalMap || globalMap.size === 0) {
|
||||
// Diarization didn't run / produced nothing — leave segments
|
||||
// alone. Caller can detect this state via speakers === {}.
|
||||
return segments;
|
||||
}
|
||||
// Flatten all per-chunk diar segments into one timeline annotated
|
||||
// with the global speaker label + the suppression-uncertainty
|
||||
// flag (set when a small cluster was reassigned to an anchor —
|
||||
// chip will show "?" so the user knows attribution is best-guess).
|
||||
const flatDiar = [];
|
||||
for (const d of chunkDiarization || []) {
|
||||
if (!d || !d.ok || !Array.isArray(d.segments)) continue;
|
||||
for (const seg of d.segments) {
|
||||
const memberKey = `${d.chunkIndex}:${seg.speaker_local}`;
|
||||
const globalId = globalMap.get(memberKey);
|
||||
if (!globalId) continue;
|
||||
const uncertain = uncertaintyMap ? !!uncertaintyMap.get(memberKey) : false;
|
||||
flatDiar.push({
|
||||
start: seg.start || 0,
|
||||
end: seg.end || 0,
|
||||
speaker: globalId,
|
||||
confidence: typeof seg.confidence === "number" ? seg.confidence : null,
|
||||
uncertain,
|
||||
});
|
||||
}
|
||||
}
|
||||
flatDiar.sort((a, b) => a.start - b.start);
|
||||
|
||||
const NEAREST_FALLBACK_WINDOW_SEC = 5;
|
||||
|
||||
for (const e of segments) {
|
||||
const mid = ((e.start || 0) + (e.end || 0)) / 2;
|
||||
// Find segments that contain the midpoint
|
||||
let containing = null;
|
||||
let containingMostOverlap = 0;
|
||||
for (const d of flatDiar) {
|
||||
if (d.start <= mid && mid <= d.end) {
|
||||
// Score by overlap with the entry to handle the rare case of
|
||||
// multiple diar segments straddling one transcript line
|
||||
// (chunk overlap zones, choppy speaker turns).
|
||||
const overlap =
|
||||
Math.min(d.end, e.end || 0) - Math.max(d.start, e.start || 0);
|
||||
if (overlap > containingMostOverlap) {
|
||||
containing = d;
|
||||
containingMostOverlap = overlap;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (containing) {
|
||||
e.speaker = containing.speaker;
|
||||
e.speaker_confidence = containing.confidence;
|
||||
e.speaker_uncertain = !!containing.uncertain;
|
||||
continue;
|
||||
}
|
||||
// Fall back to nearest by midpoint distance (within window)
|
||||
let nearest = null;
|
||||
let nearestDist = Infinity;
|
||||
for (const d of flatDiar) {
|
||||
const dMid = (d.start + d.end) / 2;
|
||||
const dist = Math.abs(dMid - mid);
|
||||
if (dist < nearestDist) {
|
||||
nearestDist = dist;
|
||||
nearest = d;
|
||||
}
|
||||
}
|
||||
if (nearest && nearestDist <= NEAREST_FALLBACK_WINDOW_SEC) {
|
||||
e.speaker = nearest.speaker;
|
||||
e.speaker_confidence = nearest.confidence;
|
||||
e.speaker_uncertain = !!nearest.uncertain;
|
||||
} else {
|
||||
e.speaker = null;
|
||||
e.speaker_confidence = null;
|
||||
e.speaker_uncertain = false;
|
||||
}
|
||||
}
|
||||
|
||||
return segments;
|
||||
}
|
||||
Reference in New Issue
Block a user