// recap-relay/server/speaker-clustering.js

// Cross-chunk speaker reconciliation for Phase 1D of the diarization
// pipeline. Spark Control's /api/audio/diarize-chunk hands back
// per-chunk speaker labels ("Speaker_0", "Speaker_1") that are local
// to one chunk — Sortformer has no memory across calls, so Speaker_0
// in chunk 5 might or might not be the same person as Speaker_0 in
// chunk 6. The per-speaker 192-dim TitaNet voice embedding it also
// returns IS persistent though, so we cluster fingerprints across
// chunks via cosine similarity to recover the global speaker identity.
//
// Algorithm: average-linkage agglomerative clustering. Start with N
// singleton clusters (one per fingerprint), repeatedly merge the
// closest pair until no pair is closer than the operator-configured
// threshold. Average-linkage was the choice over single/complete
// because it's robust to outlier embeddings (one bad embedding from
// a noisy chunk doesn't anchor or repel an entire cluster).
//
// Complexity: O(N³) where N = total fingerprints across all chunks.
// Typical: 2 speakers × 21 chunks = 42 → ~74k ops, sub-millisecond.
// Worst case for a 4-hour all-talk-show video: 6 speakers × 48 chunks
// = ~288 → ~24M ops, still milliseconds in Node.
//
// Threshold convention: configured as INTEGER percentage 50-95
// representing cosine similarity. 70 (= 0.70 sim) is NeMo's
// recommended default for TitaNet embeddings. Internally we work
// in cosine DISTANCE (= 1 - similarity) for the merge condition.

// ─── Cosine similarity ──────────────────────────────────────────────
// Standard dot-product / (||a|| * ||b||). Both inputs must be number
// arrays of the same length. Returns 0 for any zero-magnitude input
// to avoid NaN propagation.
export function cosineSimilarity(a, b) {
  // Missing inputs or mismatched dimensions cannot be compared; treat
  // them as "no similarity" rather than throwing.
  const comparable = a && b && a.length === b.length;
  if (!comparable) return 0;

  const dims = a.length;
  let dotProduct = 0;
  let sumSqA = 0;
  let sumSqB = 0;
  let k = 0;
  while (k < dims) {
    const x = a[k];
    const y = b[k];
    dotProduct += x * y;
    sumSqA += x * x;
    sumSqB += y * y;
    k += 1;
  }

  // A zero (or NaN) squared magnitude would produce NaN below, so
  // only divide when both magnitudes are usable.
  if (sumSqA && sumSqB) {
    return dotProduct / (Math.sqrt(sumSqA) * Math.sqrt(sumSqB));
  }
  return 0;
}

// ─── Cluster ID naming ──────────────────────────────────────────────
// Speaker_A, Speaker_B, ..., Speaker_Z, Speaker_AA, Speaker_AB...
// Capital letter prefix is intentionally distinct from the chunk-
// local "Speaker_0" / "Speaker_1" naming that Sortformer uses, so
// the source of a label is obvious at a glance.
function globalSpeakerLabel(index) {
  const ALPHABET_SIZE = 26;
  const CODE_POINT_A = 65;
  // Letters are produced least-significant first, so each new letter
  // is placed at the front. The "- 1" makes this a bijective base-26
  // sequence (…, Z, AA, AB, …) instead of plain base-26.
  const letters = [];
  let remaining = index;
  do {
    letters.unshift(
      String.fromCharCode(CODE_POINT_A + (remaining % ALPHABET_SIZE))
    );
    remaining = Math.floor(remaining / ALPHABET_SIZE) - 1;
  } while (remaining >= 0);
  return `Speaker_${letters.join("")}`;
}

// ─── Agglomerative clustering (average linkage) ─────────────────────
// Input: array of { key, vector, firstOrder } items. `key` is opaque
// to the algorithm — we just propagate it into the returned cluster's
// `members` list for the caller to map back to (chunkIdx, localLabel).
// `firstOrder` is the item's position in the original input.
//
// Output: array of clusters, each { members: [keys], vectors: [vecs],
// firstOrder }. Order: clusters are sorted by the smallest firstOrder
// among their members — keeps "Speaker_A" assigned to whoever spoke
// first across the audio, which is the natural user expectation.
function agglomerativeCluster(items, distanceThreshold) {
  // items:             [{ key, vector, firstOrder }]
  // distanceThreshold: merging stops once the closest remaining pair
  //                    of clusters is farther apart than this cosine
  //                    distance.
  // Returns [{ members, vectors, firstOrder }] sorted by firstOrder.
  const n = items.length;
  if (n === 0) return [];

  // Pairwise cosine distances between the INPUT vectors, computed
  // once. The merge loop below only ever sums these cached scalars,
  // so the per-vector dot products are not redone on every round.
  // The diagonal is never read (a cluster is never compared with
  // itself).
  const pairDistance = new Float64Array(n * n);
  for (let a = 0; a < n; a++) {
    for (let b = a + 1; b < n; b++) {
      const d = 1 - cosineSimilarity(items[a].vector, items[b].vector);
      pairDistance[a * n + b] = d;
      pairDistance[b * n + a] = d;
    }
  }

  const clusters = items.map((it, idx) => ({
    members: [it.key],
    vectors: [it.vector],
    // Fall back to the input position when the caller supplied no
    // usable firstOrder, so the final sort comparator never sees NaN.
    firstOrder: Number.isFinite(it.firstOrder) ? it.firstOrder : idx,
  }));
  // memberIdxs[c] = input indices of the items inside clusters[c].
  // Kept parallel to `clusters` (same pushes, same splices) and in
  // the same order as clusters[c].vectors.
  const memberIdxs = items.map((_, idx) => [idx]);

  // Average-linkage distance between two clusters, read from the
  // cache. Terms are summed in the same nested order as a direct
  // vector-by-vector computation would use.
  const averageDistance = (idxsA, idxsB) => {
    let sum = 0;
    for (const p of idxsA) {
      const row = p * n;
      for (const q of idxsB) {
        sum += pairDistance[row + q];
      }
    }
    return sum / (idxsA.length * idxsB.length);
  };

  while (clusters.length > 1) {
    let bestDist = Infinity;
    let bestI = -1;
    let bestJ = -1;
    for (let i = 0; i < clusters.length; i++) {
      for (let j = i + 1; j < clusters.length; j++) {
        const d = averageDistance(memberIdxs[i], memberIdxs[j]);
        if (d < bestDist) {
          bestDist = d;
          bestI = i;
          bestJ = j;
        }
      }
    }
    // bestI stays -1 when no pair produced a comparable distance
    // (e.g. every distance was NaN); there is nothing to merge then.
    if (bestI < 0 || bestDist > distanceThreshold) break;

    // Merge cluster bestJ into cluster bestI, then drop bestJ.
    const into = clusters[bestI];
    const from = clusters[bestJ];
    into.members.push(...from.members);
    into.vectors.push(...from.vectors);
    memberIdxs[bestI].push(...memberIdxs[bestJ]);
    // Keep the earliest firstOrder so the merged cluster sorts to
    // the position of its earliest-appearing member.
    if (from.firstOrder < into.firstOrder) {
      into.firstOrder = from.firstOrder;
    }
    clusters.splice(bestJ, 1);
    memberIdxs.splice(bestJ, 1);
  }

  // Sort by first-appearance order so Speaker_A = whoever spoke first.
  clusters.sort((a, b) => a.firstOrder - b.firstOrder);
  return clusters;
}

// Mean cosine distance (1 - cosine similarity) over every pair made of
// one vector from c1 and one vector from c2. Returns Infinity when
// there are no pairs to average.
function avgLinkageDistance(c1, c2) {
  let total = 0;
  let pairs = 0;
  for (const left of c1.vectors) {
    for (const right of c2.vectors) {
      const similarity = cosineSimilarity(left, right);
      total += 1 - similarity;
      pairs++;
    }
  }
  if (pairs > 0) {
    return total / pairs;
  }
  return Infinity;
}

// ─── Public: cluster per-chunk diarization → global speaker map ─────
//
// Input shape (one entry per chunk; failed/skipped chunks are
// silently filtered):
//   {
//     ok: true,
//     chunkIndex: 5,
//     startSeconds: 1080,
//     segments: [{ start, end, speaker_local, confidence }],
//     speakers_local: ["Speaker_0", "Speaker_1"],
//     fingerprints: { "Speaker_0": [192 floats], "Speaker_1": [192 floats] }
//   }
//
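// Returns:
//   {
//     globalMap: Map<"chunkIdx:localLabel", "Speaker_A">,
//     uncertaintyMap: Map<"chunkIdx:localLabel", true>,
//                    // only members of clusters that were reassigned
//                    // to an anchor (best-guess attribution)
//     speakers: {
//       Speaker_A: { turns, total_speaking_seconds, mean_confidence,
//                    chunks_appeared_in, fingerprint_count },
//       ...
//       // plus Speaker_Unknown when any speech was attributed to it
//     },
//     clusterCount: 2,            // raw clusters, before suppression
//     thresholdSimilarity: 0.70
//   }
//
// When no usable fingerprints are present (diarization off or all
// chunks failed) returns an empty result: { globalMap: empty,
// uncertaintyMap: empty, speakers: {}, clusterCount: 0, ... }.

// Post-cluster suppression tunables. After the initial agglomerative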
// cluster pass, walk the resulting clusters and re-categorize the
// small ones to fix the "14 speakers detected when really only 2"
// case Grant flagged on a 2h53m podcast. The clustering algorithm
// itself stays strict (no false-positive merges); suppression is a
// second pass that operates on cluster size + cross-cluster
// similarity to catch the noise-induced spurious clusters.
//
//   anchor_min_speaking_sec — a cluster needs at least this much
//   total speaking time to be considered an "anchor" (= a real
//   speaker). Anchors keep their own global ID + colored chip.
//
//   small_cluster_max_speaking_sec — clusters with LESS than this
//   are suppression candidates. Brief utterances are common false
//   positives (background noise, crosstalk fragments, brief
//   intros).
//
//   uncertain_margin_pct — a small cluster whose best similarity
//   to any anchor is within this many percentage points of the
//   main threshold gets REASSIGNED to that anchor and marked
//   uncertain (chip shows "?"). Far-from-anchor small clusters
//   become Speaker_Unknown.
//
// All three are operator-editable via Settings → Operator hardware.
// Defaults are conservative — no false-positive merges into
// anchors, just reassignment of small clusters that are PROBABLY
// the anchor in noisy conditions. A real 30+ second second speaker
// still gets their own chip; only brief flecks of similar voice get
// pulled in.
// Seconds of total speaking time a cluster needs to count as an anchor.
const DEFAULT_ANCHOR_MIN_SPEAKING_SEC = 30;
// Seconds of total speaking time below which a non-anchor cluster that
// is not close to any anchor is folded into Speaker_Unknown.
const DEFAULT_SMALL_CLUSTER_MAX_SPEAKING_SEC = 15;
// Percentage points subtracted from the main similarity threshold to
// get the looser threshold used when reassigning a non-anchor cluster
// to its closest anchor.
const DEFAULT_UNCERTAIN_MARGIN_PCT = 10;

// Average cosine similarity between two clusters' vector sets —
// inverse of avgLinkageDistance, expressed as similarity for
// readability in the suppression logic.
function avgLinkageSimilarity(c1, c2) {
  // Similarity is simply the complement of the average-linkage
  // cosine distance between the two clusters.
  const distance = avgLinkageDistance(c1, c2);
  return 1 - distance;
}

// Clamp an option value to an integer in [lo, hi], falling back to
// `fallback` when the value is missing or non-finite. Used to make
// out-of-range or absent operator settings safe.
function clampInt(v, fallback, lo, hi) {
  // Only numbers and non-blank strings are treated as "a value was
  // supplied". Number() coerces null, "", booleans and empty arrays
  // to finite numbers (0 or 1), which would otherwise be clamped to
  // `lo` instead of falling back to the default.
  if (typeof v === "string") {
    if (v.trim() === "") return fallback;
  } else if (typeof v !== "number") {
    return fallback;
  }
  const n = Number(v);
  // NaN, ±Infinity and unparseable strings all land here.
  if (!Number.isFinite(n)) return fallback;
  const i = Math.round(n);
  if (i < lo) return lo;
  if (i > hi) return hi;
  return i;
}

// Fresh zeroed stats record for one entry of the `speakers` summary.
function emptySpeakerStats() {
  return {
    turns: 0,
    total_speaking_seconds: 0,
    mean_confidence: null,
    chunks_appeared_in: 0,
    fingerprint_count: 0,
  };
}

// Non-negative duration of one diarization segment. A missing start
// or end is treated as 0.
function segmentDurationSec(seg) {
  return Math.max(0, (seg.end || 0) - (seg.start || 0));
}

// Calls visit(chunk, seg) for every diarization segment that should
// count toward speaking-time totals: chunks must be ok with a
// segments array, and a segment whose start lies before the chunk's
// numeric chunkOverlapBoundarySec is skipped so the same speech is
// not counted once per overlapping chunk. A chunk with no numeric
// boundary uses 0.
function forEachCountedSegment(chunkDiarization, visit) {
  for (const d of chunkDiarization || []) {
    if (!d || !d.ok || !Array.isArray(d.segments)) continue;
    const overlapBoundary =
      typeof d.chunkOverlapBoundarySec === "number"
        ? d.chunkOverlapBoundarySec
        : 0;
    for (const seg of d.segments) {
      if (!seg) continue;
      if ((seg.start || 0) < overlapBoundary) continue;
      visit(d, seg);
    }
  }
}

// Clusters per-chunk speaker fingerprints into global speakers, then
// suppresses small spurious clusters.
//
//   chunkDiarization     array of per-chunk results; entries that are
//                        falsy, not ok, or lack fingerprints are
//                        ignored for clustering.
//   clusterThresholdPct  cosine-similarity threshold as a percentage,
//                        clamped to [50, 95]; missing / null / blank /
//                        non-finite values use 70.
//   options              { anchorMinSpeakingSec,
//                          smallClusterMaxSpeakingSec,
//                          uncertainMarginPct } — each clamped by
//                        clampInt with its default.
//
// Returns { globalMap, uncertaintyMap, speakers, clusterCount,
// thresholdSimilarity }. Logs a one-line suppression summary.
export function clusterSpeakers(
  chunkDiarization,
  clusterThresholdPct = 70,
  options = {}
) {
  // Number(null) and Number("") are both 0, which would be clamped
  // to 50 rather than falling back to the default — treat them as
  // "not configured". A real 0 is still a valid input and clamps
  // to 50.
  const rawPct =
    clusterThresholdPct === null || clusterThresholdPct === ""
      ? NaN
      : Number(clusterThresholdPct);
  const pct = Math.max(50, Math.min(95, Number.isFinite(rawPct) ? rawPct : 70));
  const similarityThreshold = pct / 100;
  const distanceThreshold = 1 - similarityThreshold;

  // An explicit `null` bypasses the `= {}` parameter default.
  const opts = options || {};

  // Operator-tunable suppression thresholds, clamped so an
  // out-of-range configured value still produces sane behavior.
  const anchorMinSec = clampInt(
    opts.anchorMinSpeakingSec,
    DEFAULT_ANCHOR_MIN_SPEAKING_SEC,
    5,
    120
  );
  const smallMaxSec = clampInt(
    opts.smallClusterMaxSpeakingSec,
    DEFAULT_SMALL_CLUSTER_MAX_SPEAKING_SEC,
    1,
    60
  );
  const uncertainMarginPct = clampInt(
    opts.uncertainMarginPct,
    DEFAULT_UNCERTAIN_MARGIN_PCT,
    0,
    30
  );
  const uncertainSimThreshold = Math.max(
    0,
    similarityThreshold - uncertainMarginPct / 100
  );

  // Flatten fingerprints into the clustering input. Preserve insertion
  // order so the first-appearance speaker gets Speaker_A.
  const items = [];
  let order = 0;
  for (const d of chunkDiarization || []) {
    if (!d || !d.ok || !d.fingerprints) continue;
    for (const [localLabel, vector] of Object.entries(d.fingerprints)) {
      if (!Array.isArray(vector) || vector.length === 0) continue;
      items.push({
        key: `${d.chunkIndex}:${localLabel}`,
        vector,
        firstOrder: order++,
      });
    }
  }

  if (items.length === 0) {
    return {
      globalMap: new Map(),
      uncertaintyMap: new Map(),
      speakers: {},
      clusterCount: 0,
      thresholdSimilarity: similarityThreshold,
    };
  }

  const clusters = agglomerativeCluster(items, distanceThreshold);

  // ─── First pass: speaking time per cluster ───────────────────────
  // Needed before labels are assigned so the suppression pass can
  // tell anchors from small clusters. Uses the same overlap-deduped
  // segment walk as the final summary, so the durations that decide
  // anchor status match the durations reported to the user.
  const clusterIdxByMember = new Map();
  for (let i = 0; i < clusters.length; i++) {
    for (const memberKey of clusters[i].members) {
      clusterIdxByMember.set(memberKey, i);
    }
  }
  const totalSecsByCluster = new Array(clusters.length).fill(0);
  forEachCountedSegment(chunkDiarization, (d, seg) => {
    const ci = clusterIdxByMember.get(`${d.chunkIndex}:${seg.speaker_local}`);
    if (ci === undefined) return;
    totalSecsByCluster[ci] += segmentDurationSec(seg);
  });

  // ─── Second pass: identify anchors + plan suppression ────────────
  const isAnchorIdx = new Array(clusters.length).fill(false);
  const anchorIdxs = [];
  for (let i = 0; i < clusters.length; i++) {
    if (totalSecsByCluster[i] >= anchorMinSec) {
      isAnchorIdx[i] = true;
      anchorIdxs.push(i);
    }
  }

  // reassignTo:      non-anchor cluster idx → anchor cluster idx that
  //                  absorbs it (its members are flagged uncertain).
  // unknownClusters: cluster idxs whose members map to Speaker_Unknown.
  // Non-anchor clusters in neither collection keep their own label.
  // With no anchors at all, nothing is suppressed.
  const reassignTo = new Map();
  const unknownClusters = new Set();
  if (anchorIdxs.length >= 1) {
    for (let i = 0; i < clusters.length; i++) {
      if (isAnchorIdx[i]) continue;
      // Find the closest anchor by average cosine similarity.
      let bestAnchorIdx = -1;
      let bestSim = -Infinity;
      for (const ai of anchorIdxs) {
        const sim = avgLinkageSimilarity(clusters[i], clusters[ai]);
        if (sim > bestSim) {
          bestSim = sim;
          bestAnchorIdx = ai;
        }
      }
      if (bestAnchorIdx >= 0 && bestSim >= uncertainSimThreshold) {
        // Close to an anchor (within the uncertain margin of the main
        // threshold) → reassign to that anchor, flagged uncertain.
        reassignTo.set(i, bestAnchorIdx);
      } else if (totalSecsByCluster[i] < smallMaxSec) {
        // Small + far from every anchor → Unknown. All such clusters
        // share the single Speaker_Unknown pseudo-speaker so the
        // legend doesn't fill with one-off entries.
        unknownClusters.add(i);
      }
      // else: at least smallMaxSec of speech + far from every anchor
      // → keep as its own speaker.
    }
  }

  // ─── Final label ordering ────────────────────────────────────────
  // Anchors and kept-as-own clusters get labels in first-appearance
  // order; reassigned clusters then inherit their anchor's label and
  // unknown clusters get none.
  const ordered = [];
  for (let i = 0; i < clusters.length; i++) {
    if (reassignTo.has(i) || unknownClusters.has(i)) continue;
    ordered.push({ idx: i, firstOrder: clusters[i].firstOrder });
  }
  ordered.sort((a, b) => a.firstOrder - b.firstOrder);
  const labelByOrigIdx = new Map();
  for (let j = 0; j < ordered.length; j++) {
    labelByOrigIdx.set(ordered[j].idx, globalSpeakerLabel(j));
  }
  for (const [i, ai] of reassignTo) {
    const anchorLabel = labelByOrigIdx.get(ai);
    if (anchorLabel) labelByOrigIdx.set(i, anchorLabel);
  }

  // ─── Build globalMap + uncertaintyMap ────────────────────────────
  const globalMap = new Map();
  const uncertaintyMap = new Map();
  for (let i = 0; i < clusters.length; i++) {
    if (unknownClusters.has(i)) {
      for (const memberKey of clusters[i].members) {
        globalMap.set(memberKey, "Speaker_Unknown");
      }
      continue;
    }
    const label = labelByOrigIdx.get(i);
    if (!label) continue;
    const isReassigned = reassignTo.has(i);
    for (const memberKey of clusters[i].members) {
      globalMap.set(memberKey, label);
      if (isReassigned) uncertaintyMap.set(memberKey, true);
    }
  }

  // ─── Build the per-speaker summary ───────────────────────────────
  const speakers = {};
  for (const label of new Set(labelByOrigIdx.values())) {
    speakers[label] = emptySpeakerStats();
  }
  // Speaker_Unknown always exists as a destination bucket while
  // totals are accumulated; it is removed again below if it ends up
  // with no turns.
  speakers["Speaker_Unknown"] = emptySpeakerStats();

  // Fingerprint counts per label. Reassigned clusters' fingerprints
  // count toward their anchor's total.
  for (let i = 0; i < clusters.length; i++) {
    const targetLabel = unknownClusters.has(i)
      ? "Speaker_Unknown"
      : labelByOrigIdx.get(i);
    if (!targetLabel || !speakers[targetLabel]) continue;
    speakers[targetLabel].fingerprint_count += clusters[i].members.length;
  }

  // Turns / speaking time / confidence / chunk coverage. A segment
  // whose (chunk, local speaker) has no fingerprint — and therefore
  // no globalMap entry — is attributed to Speaker_Unknown instead of
  // being dropped, so its time is still accounted for.
  const confidenceSum = new Map();
  const confidenceCount = new Map();
  const chunksByLabel = new Map();
  forEachCountedSegment(chunkDiarization, (d, seg) => {
    let globalId = globalMap.get(`${d.chunkIndex}:${seg.speaker_local}`);
    if (!globalId || !speakers[globalId]) {
      globalId = "Speaker_Unknown";
    }
    const stats = speakers[globalId];
    stats.turns += 1;
    stats.total_speaking_seconds += segmentDurationSec(seg);
    if (typeof seg.confidence === "number" && Number.isFinite(seg.confidence)) {
      confidenceSum.set(globalId, (confidenceSum.get(globalId) || 0) + seg.confidence);
      confidenceCount.set(globalId, (confidenceCount.get(globalId) || 0) + 1);
    }
    if (!chunksByLabel.has(globalId)) chunksByLabel.set(globalId, new Set());
    chunksByLabel.get(globalId).add(d.chunkIndex);
  });

  // Drop Speaker_Unknown when nothing was attributed to it, so the
  // legend doesn't show an empty Unknown entry.
  let hasUnknown = true;
  if (speakers["Speaker_Unknown"].turns === 0) {
    delete speakers["Speaker_Unknown"];
    hasUnknown = false;
  }
  for (const label of Object.keys(speakers)) {
    if (confidenceCount.get(label)) {
      speakers[label].mean_confidence =
        confidenceSum.get(label) / confidenceCount.get(label);
    }
    speakers[label].chunks_appeared_in = (chunksByLabel.get(label) || new Set()).size;
    // Report speaking time rounded to one decimal place.
    speakers[label].total_speaking_seconds =
      Math.round(speakers[label].total_speaking_seconds * 10) / 10;
  }

  // Logging: surface the suppression summary so operators can see
  // what happened.
  const reassignedCount = reassignTo.size;
  const unknownClusterCount = unknownClusters.size;
  const finalCount =
    Object.keys(speakers).length - (hasUnknown ? 1 : 0);
  console.log(
    `[clustering] ${clusters.length} raw clusters → ${finalCount} primary + ` +
      `${reassignedCount} reassigned (uncertain) + ${unknownClusterCount} unknown ` +
      `(anchors >= ${anchorMinSec}s, uncertain margin ${uncertainMarginPct}%, ` +
      `unknown < ${smallMaxSec}s)`
  );

  return {
    globalMap,
    uncertaintyMap,
    speakers,
    clusterCount: clusters.length,
    thresholdSimilarity: similarityThreshold,
  };
}

// ─── Public: stamp global speaker labels onto transcript segments ───
//
// Walks the merged transcript segments and assigns each one a
// `speaker` (global ID, e.g. "Speaker_A") + `speaker_confidence`
// based on which diarization segment its midpoint falls inside.
// When no diar segment covers the midpoint, we fall back to nearest-
// midpoint matching with a 5-second window — beyond that, leave the
// speaker null so the frontend can render as "(speaker unknown)" or
// just drop the label.
//
// Mutates the segments in-place (and also returns the array) so
// callers don't have to remember which they got.
export function assignSpeakersToSegments(segments, chunkDiarization, globalMap, uncertaintyMap = null) {
  // segments:         transcript segments ({ start, end, ... }),
  //                   mutated in place.
  // chunkDiarization: per-chunk diarization results; only `ok` chunks
  //                   with a segments array are used.
  // globalMap:        Map "chunkIdx:localLabel" → global speaker label.
  // uncertaintyMap:   optional Map of the same keys → truthy when the
  //                   attribution is best-guess.
  // Returns the same `segments` array. Each processed entry gets
  // `speaker`, `speaker_confidence` and `speaker_uncertain`.
  if (!Array.isArray(segments) || segments.length === 0) return segments;
  if (!globalMap || globalMap.size === 0) {
    // Diarization didn't run / produced nothing — leave segments
    // alone.
    return segments;
  }

  // Flatten all per-chunk diar segments into one timeline annotated
  // with the global speaker label + the uncertainty flag. Segments
  // whose (chunk, local speaker) has no globalMap entry are left out.
  const flatDiar = [];
  for (const d of chunkDiarization || []) {
    if (!d || !d.ok || !Array.isArray(d.segments)) continue;
    for (const seg of d.segments) {
      if (!seg) continue;
      const memberKey = `${d.chunkIndex}:${seg.speaker_local}`;
      const globalId = globalMap.get(memberKey);
      if (!globalId) continue;
      const uncertain = uncertaintyMap ? !!uncertaintyMap.get(memberKey) : false;
      flatDiar.push({
        start: seg.start || 0,
        end: seg.end || 0,
        speaker: globalId,
        confidence: typeof seg.confidence === "number" ? seg.confidence : null,
        uncertain,
      });
    }
  }
  flatDiar.sort((a, b) => a.start - b.start);

  const NEAREST_FALLBACK_WINDOW_SEC = 5;

  for (const e of segments) {
    // Nothing to stamp on a missing / non-object entry.
    if (!e || typeof e !== "object") continue;
    const entryStart = e.start || 0;
    const entryEnd = e.end || 0;
    const mid = (entryStart + entryEnd) / 2;

    // Among diar segments containing the midpoint, prefer the one
    // overlapping the entry the most. The running best starts at
    // -Infinity (not 0) so a zero-length entry — whose overlap with
    // its containing diar segment is exactly 0 — still matches
    // instead of falling through to the nearest-midpoint fallback.
    // Ties keep the earliest-starting diar segment.
    let containing = null;
    let containingMostOverlap = -Infinity;
    for (const d of flatDiar) {
      if (d.start <= mid && mid <= d.end) {
        const overlap =
          Math.min(d.end, entryEnd) - Math.max(d.start, entryStart);
        if (overlap > containingMostOverlap) {
          containing = d;
          containingMostOverlap = overlap;
        }
      }
    }
    if (containing) {
      e.speaker = containing.speaker;
      e.speaker_confidence = containing.confidence;
      e.speaker_uncertain = !!containing.uncertain;
      continue;
    }

    // Fall back to the diar segment with the nearest midpoint, but
    // only when it lies within the fallback window.
    let nearest = null;
    let nearestDist = Infinity;
    for (const d of flatDiar) {
      const dMid = (d.start + d.end) / 2;
      const dist = Math.abs(dMid - mid);
      if (dist < nearestDist) {
        nearestDist = dist;
        nearest = d;
      }
    }
    if (nearest && nearestDist <= NEAREST_FALLBACK_WINDOW_SEC) {
      e.speaker = nearest.speaker;
      e.speaker_confidence = nearest.confidence;
      e.speaker_uncertain = !!nearest.uncertain;
    } else {
      e.speaker = null;
      e.speaker_confidence = null;
      e.speaker_uncertain = false;
    }
  }

  return segments;
}