Add internal-meetings pipeline and post-hoc speaker tools

2026-06-13 13:35:53 -05:00
parent 9a2dbf69df
commit 705807e286
15 changed files with 7375 additions and 0 deletions
@@ -0,0 +1,624 @@
+// Cross-chunk speaker reconciliation for Phase 1D of the diarization
+// pipeline. Spark Control's /api/audio/diarize-chunk hands back
+// per-chunk speaker labels ("Speaker_0", "Speaker_1") that are local
+// to one chunk — Sortformer has no memory across calls, so Speaker_0
+// in chunk 5 might or might not be the same person as Speaker_0 in
+// chunk 6. The per-speaker 192-dim TitaNet voice embedding it also
+// returns IS persistent though, so we cluster fingerprints across
+// chunks via cosine similarity to recover the global speaker identity.
+//
+// Algorithm: average-linkage agglomerative clustering. Start with N
+// singleton clusters (one per fingerprint), repeatedly merge the
+// closest pair until no pair is closer than the operator-configured
+// threshold. Average-linkage was the choice over single/complete
+// because it's robust to outlier embeddings (one bad embedding from
+// a noisy chunk doesn't anchor or repel an entire cluster).
+//
+// Complexity: O(N³) where N = total fingerprints across all chunks.
+// Typical: 2 speakers × 21 chunks = 42 → ~74k ops, sub-millisecond.
+// Worst case for a 4-hour all-talk-show video: 6 speakers × 48 chunks
+// = ~288 → ~24M ops, still milliseconds in Node.
+//
+// Threshold convention: configured as INTEGER percentage 50-95
+// representing cosine similarity. 70 (= 0.70 sim) is NeMo's
+// recommended default for TitaNet embeddings. Internally we work
+// in cosine DISTANCE (= 1 - similarity) for the merge condition.
+
+// ─── Cosine similarity ──────────────────────────────────────────────
+// Standard dot-product / (||a|| * ||b||). Both inputs must be number
+// arrays of the same length. Returns 0 for any zero-magnitude input
+// to avoid NaN propagation.
+export function cosineSimilarity(a, b) {
+  if (!a || !b || a.length !== b.length) return 0;
+  let dot = 0;
+  let na = 0;
+  let nb = 0;
+  for (let i = 0; i < a.length; i++) {
+    dot += a[i] * b[i];
+    na += a[i] * a[i];
+    nb += b[i] * b[i];
+  }
+  if (!na || !nb) return 0;
+  return dot / (Math.sqrt(na) * Math.sqrt(nb));
+}
+
+// ─── Cluster ID naming ──────────────────────────────────────────────
+// Speaker_A, Speaker_B, ..., Speaker_Z, Speaker_AA, Speaker_AB...
+// Capital letter prefix is intentionally distinct from the chunk-
+// local "Speaker_0" / "Speaker_1" naming that Sortformer uses, so
+// the source of a label is obvious at a glance.
+function globalSpeakerLabel(index) {
+  let n = index;
+  let s = "";
+  do {
+    s = String.fromCharCode(65 + (n % 26)) + s;
+    n = Math.floor(n / 26) - 1;
+  } while (n >= 0);
+  return "Speaker_" + s;
+}
+
+// ─── Agglomerative clustering (average linkage) ─────────────────────
+// Input: array of { key, vector } pairs. `key` is opaque to the
+// algorithm — we just propagate it into the returned cluster's
+// `members` list for the caller to map back to (chunkIdx, localLabel).
+//
+// Output: array of clusters, each { members: [keys], vectors: [vecs] }.
+// Order: clusters are emitted in the order their FIRST member was
+// added to the input — keeps "Speaker_A" assigned to whoever spoke
+// first across the audio, which is the natural user expectation.
+function agglomerativeCluster(items, distanceThreshold) {
+  if (items.length === 0) return [];
+  const clusters = items.map((it) => ({
+    members: [it.key],
+    vectors: [it.vector],
+    firstOrder: it.firstOrder, // preserve original input order for stable sort later
+  }));
+  // Cache pairwise singleton distances to avoid recomputing as
+  // clusters grow. distMatrix[i][j] = avg cosine distance between
+  // cluster i's vectors and cluster j's vectors. Recomputed on merge.
+  while (clusters.length > 1) {
+    let best = { dist: Infinity, i: -1, j: -1 };
+    for (let i = 0; i < clusters.length; i++) {
+      for (let j = i + 1; j < clusters.length; j++) {
+        const d = avgLinkageDistance(clusters[i], clusters[j]);
+        if (d < best.dist) {
+          best = { dist: d, i, j };
+        }
+      }
+    }
+    if (best.dist > distanceThreshold) break;
+    // Merge clusters[j] into clusters[i], remove clusters[j]
+    clusters[best.i].members.push(...clusters[best.j].members);
+    clusters[best.i].vectors.push(...clusters[best.j].vectors);
+    // Keep the earliest firstOrder so the merged cluster sorts to
+    // the position of its earliest-appearing member.
+    if (clusters[best.j].firstOrder < clusters[best.i].firstOrder) {
+      clusters[best.i].firstOrder = clusters[best.j].firstOrder;
+    }
+    clusters.splice(best.j, 1);
+  }
+  // Sort by first-appearance order so Speaker_A = whoever spoke first.
+  clusters.sort((a, b) => a.firstOrder - b.firstOrder);
+  return clusters;
+}
+
+function avgLinkageDistance(c1, c2) {
+  let sum = 0;
+  let count = 0;
+  for (const v1 of c1.vectors) {
+    for (const v2 of c2.vectors) {
+      sum += 1 - cosineSimilarity(v1, v2);
+      count += 1;
+    }
+  }
+  return count > 0 ? sum / count : Infinity;
+}
+
+// ─── Public: cluster per-chunk diarization → global speaker map ─────
+//
+// Input shape (one entry per chunk; failed/skipped chunks are
+// silently filtered):
+//   {
+//     ok: true,
+//     chunkIndex: 5,
+//     startSeconds: 1080,
+//     segments: [{ start, end, speaker_local, confidence }],
+//     speakers_local: ["Speaker_0", "Speaker_1"],
+//     fingerprints: { "Speaker_0": [192 floats], "Speaker_1": [192 floats] }
+//   }
+//
+// Returns:
+//   {
+//     globalMap: Map<"chunkIdx:localLabel", "Speaker_A">,
+//     speakers: {
+//       Speaker_A: { turns, total_speaking_seconds, mean_confidence,
+//                    chunks_appeared_in, fingerprint_count },
+//       ...
+//     },
+//     clusterCount: 2,
+//     thresholdSimilarity: 0.70
+//   }
+//
+// When fingerprintCount === 0 (diarization off or all chunks failed)
+// returns an empty result: { globalMap: empty, speakers: {}, ... }.
+// Post-cluster suppression tunables. After the initial agglomerative
+// cluster pass, walk the resulting clusters and re-categorize the
+// small ones to fix the "14 speakers detected when really only 2"
+// case Grant flagged on a 2h53m podcast. The clustering algorithm
+// itself stays strict (no false-positive merges); suppression is a
+// second pass that operates on cluster size + cross-cluster
+// similarity to catch the noise-induced spurious clusters.
+//
+//   anchor_min_speaking_sec — a cluster needs at least this much
+//   total speaking time to be considered an "anchor" (= a real
+//   speaker). Anchors keep their own global ID + colored chip.
+//
+//   small_cluster_max_speaking_sec — clusters with LESS than this
+//   are suppression candidates. Brief utterances are common false
+//   positives (background noise, crosstalk fragments, brief
+//   intros).
+//
+//   uncertain_margin_pct — a small cluster whose best similarity
+//   to any anchor is within this many percentage points of the
+//   main threshold gets REASSIGNED to that anchor and marked
+//   uncertain (chip shows "?"). Far-from-anchor small clusters
+//   become Speaker_Unknown.
+//
+// All three are operator-editable via Settings → Operator hardware.
+// Defaults are conservative — no false-positive merges into
+// anchors, just reassignment of small clusters that are PROBABLY
+// the anchor in noisy conditions. A real 30+ second second speaker
+// still gets their own chip; only brief flecks of similar voice get
+// pulled in.
+const DEFAULT_ANCHOR_MIN_SPEAKING_SEC = 30;
+const DEFAULT_SMALL_CLUSTER_MAX_SPEAKING_SEC = 15;
+const DEFAULT_UNCERTAIN_MARGIN_PCT = 10;
+
+// Average cosine similarity between two clusters' vector sets —
+// inverse of avgLinkageDistance, expressed as similarity for
+// readability in the suppression logic.
+function avgLinkageSimilarity(c1, c2) {
+  return 1 - avgLinkageDistance(c1, c2);
+}
+
+// Clamp an option value to an integer in [lo, hi], falling back to
+// `fallback` when the value is missing or non-finite. Used to make
+// out-of-range or absent operator settings safe.
+function clampInt(v, fallback, lo, hi) {
+  const n = Number(v);
+  if (!Number.isFinite(n)) return fallback;
+  const i = Math.round(n);
+  if (i < lo) return lo;
+  if (i > hi) return hi;
+  return i;
+}
+
+export function clusterSpeakers(
+  chunkDiarization,
+  clusterThresholdPct = 70,
+  options = {}
+) {
+  // Use Number.isFinite-guarded fallback rather than the `|| 70`
+  // idiom — the latter substitutes 70 for ANY falsy value including
+  // 0 (a valid input we want to clamp to 50, not silently bump up).
+  const raw = Number(clusterThresholdPct);
+  const pct = Math.max(50, Math.min(95, Number.isFinite(raw) ? raw : 70));
+  const similarityThreshold = pct / 100;
+  const distanceThreshold = 1 - similarityThreshold;
+
+  // Operator-tunable suppression thresholds — accept from options
+  // with Number.isFinite-guarded fallbacks to the conservative
+  // defaults. Clamped to the same ranges the admin.js SETTINGS_RANGES
+  // enforces on save, so a hand-edited relay-config.json with an
+  // out-of-range value still produces sane behavior.
+  const anchorMinSec = clampInt(
+    options.anchorMinSpeakingSec,
+    DEFAULT_ANCHOR_MIN_SPEAKING_SEC,
+    5,
+    120
+  );
+  const smallMaxSec = clampInt(
+    options.smallClusterMaxSpeakingSec,
+    DEFAULT_SMALL_CLUSTER_MAX_SPEAKING_SEC,
+    1,
+    60
+  );
+  const uncertainMarginPct = clampInt(
+    options.uncertainMarginPct,
+    DEFAULT_UNCERTAIN_MARGIN_PCT,
+    0,
+    30
+  );
+  const uncertainSimThreshold = Math.max(
+    0,
+    similarityThreshold - uncertainMarginPct / 100
+  );
+
+  // Flatten fingerprints into the clustering input. Preserve insertion
+  // order so the first-appearance speaker gets Speaker_A.
+  const items = [];
+  let order = 0;
+  for (const d of chunkDiarization || []) {
+    if (!d || !d.ok || !d.fingerprints) continue;
+    for (const [localLabel, vector] of Object.entries(d.fingerprints)) {
+      if (!Array.isArray(vector) || vector.length === 0) continue;
+      items.push({
+        key: `${d.chunkIndex}:${localLabel}`,
+        vector,
+        firstOrder: order++,
+      });
+    }
+  }
+
+  if (items.length === 0) {
+    return {
+      globalMap: new Map(),
+      uncertaintyMap: new Map(),
+      speakers: {},
+      clusterCount: 0,
+      thresholdSimilarity: similarityThreshold,
+    };
+  }
+
+  const clusters = agglomerativeCluster(items, distanceThreshold);
+
+  // ─── First pass: compute speaking time per cluster ───────────────
+  // We need cluster sizes BEFORE building the global map so the
+  // suppression pass can identify anchors. Walk all diar segments,
+  // map each (chunkIdx, speaker_local) to its cluster index, and
+  // accumulate seg duration.
+  const clusterIdxByMember = new Map();
+  for (let i = 0; i < clusters.length; i++) {
+    for (const memberKey of clusters[i].members) {
+      clusterIdxByMember.set(memberKey, i);
+    }
+  }
+  const totalSecsByCluster = new Array(clusters.length).fill(0);
+  for (const d of chunkDiarization || []) {
+    if (!d || !d.ok || !Array.isArray(d.segments)) continue;
+    for (const seg of d.segments) {
+      const key = `${d.chunkIndex}:${seg.speaker_local}`;
+      const ci = clusterIdxByMember.get(key);
+      if (ci === undefined) continue;
+      const dur = Math.max(0, (seg.end || 0) - (seg.start || 0));
+      totalSecsByCluster[ci] += dur;
+    }
+  }
+
+  // ─── Second pass: identify anchors + plan suppression ────────────
+  const isAnchorIdx = new Array(clusters.length).fill(false);
+  const anchorIdxs = [];
+  for (let i = 0; i < clusters.length; i++) {
+    if (totalSecsByCluster[i] >= anchorMinSec) {
+      isAnchorIdx[i] = true;
+      anchorIdxs.push(i);
+    }
+  }
+
+  // reassignTo[i] = anchor cluster idx that absorbs i; uncertain
+  // unknownClusters: set of cluster idxs whose members map to
+  // Speaker_Unknown. keptAsOwn: non-anchor cluster idxs that stay
+  // as their own speaker (large + low-sim — plausibly a real
+  // third+ speaker even if rare).
+  const reassignTo = new Map();
+  const unknownClusters = new Set();
+  if (anchorIdxs.length >= 1) {
+    for (let i = 0; i < clusters.length; i++) {
+      if (isAnchorIdx[i]) continue;
+      // Find best anchor by average cosine similarity
+      let bestAnchorIdx = -1;
+      let bestSim = -Infinity;
+      for (const ai of anchorIdxs) {
+        const sim = avgLinkageSimilarity(clusters[i], clusters[ai]);
+        if (sim > bestSim) {
+          bestSim = sim;
+          bestAnchorIdx = ai;
+        }
+      }
+      const totalSecs = totalSecsByCluster[i];
+      if (bestAnchorIdx >= 0 && bestSim >= uncertainSimThreshold) {
+        // Close-to-anchor (within uncertain_margin_pct of
+        // main threshold) → reassign to anchor with uncertainty.
+        // Chip will show e.g. "MH?" so the user knows attribution
+        // is best-guess.
+        reassignTo.set(i, bestAnchorIdx);
+      } else if (totalSecs < smallMaxSec) {
+        // Small + far-from-anchor → Unknown. Brief noise / crosstalk /
+        // background voices that don't confidently match either main
+        // speaker. Merged into a single Speaker_Unknown pseudo-
+        // speaker so the legend doesn't fill with N "unidentified
+        // brief speaker" entries.
+        unknownClusters.add(i);
+      }
+      // else: large (>= 15s) + far-from-anchor → keep as own speaker.
+      // Plausibly a real third+ person who's distinct from the main
+      // anchors. Rare but possible.
+    }
+  }
+
+  // ─── Build the final cluster-label ordering ──────────────────────
+  // Order by first-appearance: whoever spoke first in the audio
+  // gets Speaker_A. Anchors + kept-as-own clusters get labels;
+  // reassigned + unknown clusters don't.
+  const ordered = [];
+  for (let i = 0; i < clusters.length; i++) {
+    if (reassignTo.has(i) || unknownClusters.has(i)) continue;
+    ordered.push({ idx: i, firstOrder: clusters[i].firstOrder });
+  }
+  ordered.sort((a, b) => a.firstOrder - b.firstOrder);
+  const labelByOrigIdx = new Map();
+  for (let j = 0; j < ordered.length; j++) {
+    labelByOrigIdx.set(ordered[j].idx, globalSpeakerLabel(j));
+  }
+  // Reassigned clusters inherit their anchor's label
+  for (const [i, ai] of reassignTo) {
+    const anchorLabel = labelByOrigIdx.get(ai);
+    if (anchorLabel) labelByOrigIdx.set(i, anchorLabel);
+  }
+
+  // ─── Build globalMap + uncertaintyMap ────────────────────────────
+  const globalMap = new Map();
+  const uncertaintyMap = new Map();
+  let hasUnknown = false;
+  for (let i = 0; i < clusters.length; i++) {
+    if (unknownClusters.has(i)) {
+      for (const memberKey of clusters[i].members) {
+        globalMap.set(memberKey, "Speaker_Unknown");
+        hasUnknown = true;
+      }
+      continue;
+    }
+    const label = labelByOrigIdx.get(i);
+    if (!label) continue;
+    const isReassigned = reassignTo.has(i);
+    for (const memberKey of clusters[i].members) {
+      globalMap.set(memberKey, label);
+      if (isReassigned) uncertaintyMap.set(memberKey, true);
+    }
+  }
+
+  // ─── Build the per-speaker summary ───────────────────────────────
+  const speakers = {};
+  const seenLabels = new Set([...labelByOrigIdx.values()]);
+  for (const label of seenLabels) {
+    speakers[label] = {
+      turns: 0,
+      total_speaking_seconds: 0,
+      mean_confidence: null,
+      chunks_appeared_in: 0,
+      fingerprint_count: 0,
+    };
+  }
+  if (hasUnknown) {
+    speakers["Speaker_Unknown"] = {
+      turns: 0,
+      total_speaking_seconds: 0,
+      mean_confidence: null,
+      chunks_appeared_in: 0,
+      fingerprint_count: 0,
+    };
+  }
+  // Accumulate fingerprint counts from clusters that contributed to
+  // each label. Reassigned clusters' fingerprints count toward
+  // their anchor's total.
+  for (let i = 0; i < clusters.length; i++) {
+    let targetLabel;
+    if (unknownClusters.has(i)) targetLabel = "Speaker_Unknown";
+    else targetLabel = labelByOrigIdx.get(i);
+    if (!targetLabel || !speakers[targetLabel]) continue;
+    speakers[targetLabel].fingerprint_count += clusters[i].members.length;
+  }
+
+  // Accumulate turns / speaking time / confidence by walking
+  // diarization segments through the globalMap.
+  //
+  // Two earlier bugs fixed here:
+  //
+  // 1. UNFINGERPRINTED SEGMENTS WERE SILENTLY DROPPED. The cluster-
+  //    index lookup only contains entries that have fingerprints —
+  //    but Sortformer routinely emits diar segments for speakers
+  //    whose voice TitaNet didn't aggregate a fingerprint for (very
+  //    brief utterances, soft speech, overlapped speech). Those
+  //    segments produced globalId === undefined and were dropped,
+  //    so the "total speech detected" totals understated reality
+  //    dramatically. A 1.5-hour call could show as "34% speech
+  //    detected" when in reality 70%+ of the audio was speech that
+  //    Sortformer found but TitaNet couldn't fingerprint. Now: an
+  //    unmapped segment falls through to Speaker_Unknown so the
+  //    time still gets accounted for. (The chip on the per-line
+  //    transcript still shows "?" for those segments — they just
+  //    aren't claimed by a wrong cluster.)
+  //
+  // 2. CHUNK-OVERLAP DOUBLE-COUNTING. Transcribe segments are
+  //    deduped at the chunk overlap boundary (handled in
+  //    hardware.js), but diar segments are not. Until this fix the
+  //    same speech in a 30s overlap zone got counted toward TWO
+  //    chunks, inflating speaker totals. Dedup here using the
+  //    chunk's overlapBoundarySec when present.
+  const confidenceSum = new Map();
+  const confidenceCount = new Map();
+  const chunksByLabel = new Map();
+
+  // Ensure Speaker_Unknown exists in speakers map before we attribute
+  // any unmapped time to it — clusterSpeakers may have created it
+  // already (via the unknownClusters path) or not (when no clusters
+  // were suppressed). Either way, we want it as a destination bucket.
+  if (!speakers["Speaker_Unknown"]) {
+    speakers["Speaker_Unknown"] = {
+      turns: 0,
+      total_speaking_seconds: 0,
+      mean_confidence: null,
+      chunks_appeared_in: 0,
+      fingerprint_count: 0,
+    };
+    hasUnknown = true;
+  }
+
+  for (const d of chunkDiarization || []) {
+    if (!d || !d.ok || !Array.isArray(d.segments)) continue;
+    // Chunk-overlap dedup: skip any segment whose GLOBAL start time
+    // sits in the prior chunk's tail (which this chunk overlapped).
+    // chunkOverlapBoundary is the global timestamp BEFORE which
+    // segments in this chunk are duplicates of the prior chunk's
+    // tail. Comes from the chunk planner (audio-meta.js) and is
+    // 0 for chunk 0 (no prior chunk → no dedup).
+    const chunkOverlapBoundary =
+      typeof d.chunkOverlapBoundarySec === "number"
+        ? d.chunkOverlapBoundarySec
+        : 0;
+    const labelsInThisChunk = new Set();
+    for (const seg of d.segments) {
+      if ((seg.start || 0) < chunkOverlapBoundary) continue;
+      let globalId = globalMap.get(`${d.chunkIndex}:${seg.speaker_local}`);
+      // Unmapped (no fingerprint produced for this speaker_local in
+      // this chunk) → bucket into Speaker_Unknown rather than drop.
+      if (!globalId || !speakers[globalId]) {
+        globalId = "Speaker_Unknown";
+      }
+      speakers[globalId].turns += 1;
+      const segDuration = Math.max(0, (seg.end || 0) - (seg.start || 0));
+      speakers[globalId].total_speaking_seconds += segDuration;
+      if (typeof seg.confidence === "number" && Number.isFinite(seg.confidence)) {
+        confidenceSum.set(globalId, (confidenceSum.get(globalId) || 0) + seg.confidence);
+        confidenceCount.set(globalId, (confidenceCount.get(globalId) || 0) + 1);
+      }
+      labelsInThisChunk.add(globalId);
+    }
+    for (const label of labelsInThisChunk) {
+      if (!chunksByLabel.has(label)) chunksByLabel.set(label, new Set());
+      chunksByLabel.get(label).add(d.chunkIndex);
+    }
+  }
+
+  // If Speaker_Unknown ended up with zero turns (no unmapped + no
+  // suppressed clusters contributed), drop it from the legend so we
+  // don't show "? Unknown 0:00" by default.
+  if (speakers["Speaker_Unknown"] && speakers["Speaker_Unknown"].turns === 0) {
+    delete speakers["Speaker_Unknown"];
+    hasUnknown = false;
+  }
+  for (const label of Object.keys(speakers)) {
+    if (confidenceCount.get(label)) {
+      speakers[label].mean_confidence =
+        confidenceSum.get(label) / confidenceCount.get(label);
+    }
+    speakers[label].chunks_appeared_in = (chunksByLabel.get(label) || new Set()).size;
+    speakers[label].total_speaking_seconds =
+      Math.round(speakers[label].total_speaking_seconds * 10) / 10;
+  }
+
+  // Logging: surface the suppression summary so operators can see
+  // what happened ("14 clusters → 2 anchors + 12 small/uncertain
+  // suppressed").
+  const reassignedCount = reassignTo.size;
+  const unknownClusterCount = unknownClusters.size;
+  const finalCount =
+    Object.keys(speakers).length - (hasUnknown ? 1 : 0);
+  console.log(
+    `[clustering] ${clusters.length} raw clusters → ${finalCount} primary + ` +
+      `${reassignedCount} reassigned (uncertain) + ${unknownClusterCount} unknown ` +
+      `(anchors >= ${anchorMinSec}s, uncertain margin ${uncertainMarginPct}%, ` +
+      `unknown < ${smallMaxSec}s)`
+  );
+
+  return {
+    globalMap,
+    uncertaintyMap,
+    speakers,
+    clusterCount: clusters.length,
+    thresholdSimilarity: similarityThreshold,
+  };
+}
+
+// ─── Public: stamp global speaker labels onto transcript segments ───
+//
+// Walks the merged transcript segments and assigns each one a
+// `speaker` (global ID, e.g. "Speaker_A") + `speaker_confidence`
+// based on which diarization segment its midpoint falls inside.
+// When no diar segment covers the midpoint, we fall back to nearest-
+// midpoint matching with a 5-second window — beyond that, leave the
+// speaker null so the frontend can render as "(speaker unknown)" or
+// just drop the label.
+//
+// Mutates the segments in-place (and also returns the array) so
+// callers don't have to remember which they got.
+export function assignSpeakersToSegments(segments, chunkDiarization, globalMap, uncertaintyMap = null) {
+  if (!Array.isArray(segments) || segments.length === 0) return segments;
+  if (!globalMap || globalMap.size === 0) {
+    // Diarization didn't run / produced nothing — leave segments
+    // alone. Caller can detect this state via speakers === {}.
+    return segments;
+  }
+  // Flatten all per-chunk diar segments into one timeline annotated
+  // with the global speaker label + the suppression-uncertainty
+  // flag (set when a small cluster was reassigned to an anchor —
+  // chip will show "?" so the user knows attribution is best-guess).
+  const flatDiar = [];
+  for (const d of chunkDiarization || []) {
+    if (!d || !d.ok || !Array.isArray(d.segments)) continue;
+    for (const seg of d.segments) {
+      const memberKey = `${d.chunkIndex}:${seg.speaker_local}`;
+      const globalId = globalMap.get(memberKey);
+      if (!globalId) continue;
+      const uncertain = uncertaintyMap ? !!uncertaintyMap.get(memberKey) : false;
+      flatDiar.push({
+        start: seg.start || 0,
+        end: seg.end || 0,
+        speaker: globalId,
+        confidence: typeof seg.confidence === "number" ? seg.confidence : null,
+        uncertain,
+      });
+    }
+  }
+  flatDiar.sort((a, b) => a.start - b.start);
+
+  const NEAREST_FALLBACK_WINDOW_SEC = 5;
+
+  for (const e of segments) {
+    const mid = ((e.start || 0) + (e.end || 0)) / 2;
+    // Find segments that contain the midpoint
+    let containing = null;
+    let containingMostOverlap = 0;
+    for (const d of flatDiar) {
+      if (d.start <= mid && mid <= d.end) {
+        // Score by overlap with the entry to handle the rare case of
+        // multiple diar segments straddling one transcript line
+        // (chunk overlap zones, choppy speaker turns).
+        const overlap =
+          Math.min(d.end, e.end || 0) - Math.max(d.start, e.start || 0);
+        if (overlap > containingMostOverlap) {
+          containing = d;
+          containingMostOverlap = overlap;
+        }
+      }
+    }
+    if (containing) {
+      e.speaker = containing.speaker;
+      e.speaker_confidence = containing.confidence;
+      e.speaker_uncertain = !!containing.uncertain;
+      continue;
+    }
+    // Fall back to nearest by midpoint distance (within window)
+    let nearest = null;
+    let nearestDist = Infinity;
+    for (const d of flatDiar) {
+      const dMid = (d.start + d.end) / 2;
+      const dist = Math.abs(dMid - mid);
+      if (dist < nearestDist) {
+        nearestDist = dist;
+        nearest = d;
+      }
+    }
+    if (nearest && nearestDist <= NEAREST_FALLBACK_WINDOW_SEC) {
+      e.speaker = nearest.speaker;
+      e.speaker_confidence = nearest.confidence;
+      e.speaker_uncertain = !!nearest.uncertain;
+    } else {
+      e.speaker = null;
+      e.speaker_confidence = null;
+      e.speaker_uncertain = false;
+    }
+  }
+
+  return segments;
+}