diff --git a/image/parakeet_patches/diarizer.py b/image/parakeet_patches/diarizer.py index 60c4e46..f59335a 100644 --- a/image/parakeet_patches/diarizer.py +++ b/image/parakeet_patches/diarizer.py @@ -159,7 +159,10 @@ class SortformerDiarizer: Returns: { "duration": float, - "segments": [{"start_s", "end_s", "speaker"}, ...], + "segments": [ + {"start_s", "end_s", "speaker", "confidence": float|None}, + ... + ], "speakers_detected": ["Speaker_0", ...], "fingerprints": { "Speaker_0": [192 floats], @@ -168,6 +171,14 @@ class SortformerDiarizer: }, "models": {"diarization": ..., "embedding": ...}, } + + `confidence` per segment is the mean probability the assigned speaker + was active during that segment's frames (Sortformer's raw per-frame + per-speaker sigmoid outputs, ~12.6 fps). Range [0, 1], higher = more + confident. Typical values for clean speech: >0.5 for confident + assignments, 0.2-0.5 for ambiguous, <0.2 for very weak. Recap Relay + can use a threshold to mark uncertain segments as "Speaker_0?" in + the UI rather than confidently mislabel them. """ if not self._loaded: self.load_model() @@ -180,10 +191,17 @@ class SortformerDiarizer: duration = len(data) / sr logger.info(f"diarize_chunk: {duration:.1f}s audio, running Sortformer...") - # 1. Diarize + # 1. Diarize WITH the per-frame per-speaker tensor outputs so we + # can derive per-segment confidence. 
with torch.no_grad(): - raw = self.model.diarize(audio=[wav_path], batch_size=1, verbose=False) + raw, tensor_outputs = self.model.diarize( + audio=[wav_path], + batch_size=1, + include_tensor_outputs=True, + verbose=False, + ) segments = _parse_sortformer_segments(raw) + self._attach_confidence(segments, tensor_outputs, duration) speakers = sorted({s["speaker"] for s in segments}) logger.info(f" detected {len(speakers)} local speakers, {len(segments)} turns") @@ -208,6 +226,54 @@ class SortformerDiarizer: try: os.unlink(wav_path) except OSError: pass + def _attach_confidence( + self, + segments: list[dict], + tensor_outputs: Optional[list], + duration_s: float, + ) -> None: + """Add `confidence` (mean probability for the assigned speaker across + the segment's frames) to each segment in-place. None on any failure.""" + try: + if not tensor_outputs: + for seg in segments: + seg["confidence"] = None + return + scores = tensor_outputs[0] + if hasattr(scores, "dim") and scores.dim() == 3: + scores = scores.squeeze(0) # [n_frames, n_speakers] + if not hasattr(scores, "shape") or len(scores.shape) != 2: + for seg in segments: + seg["confidence"] = None + return + n_frames, n_speakers = scores.shape[0], scores.shape[1] + if n_frames == 0 or duration_s <= 0: + for seg in segments: + seg["confidence"] = None + return + fps = n_frames / duration_s # frames per second + for seg in segments: + spk_label = seg.get("speaker", "") + try: + spk_idx = int(spk_label.rsplit("_", 1)[1]) + except (ValueError, IndexError): + seg["confidence"] = None + continue + if spk_idx < 0 or spk_idx >= n_speakers: + seg["confidence"] = None + continue + f_start = max(0, int(seg["start_s"] * fps)) + f_end = min(n_frames, int(seg["end_s"] * fps) + 1) + if f_end <= f_start: + seg["confidence"] = None + continue + window = scores[f_start:f_end, spk_idx] + seg["confidence"] = round(float(window.mean()), 4) + except Exception as e: + logger.warning(f"failed to attach confidence: {e}") + for seg in 
segments: + seg.setdefault("confidence", None) + def _extract_fingerprints_internal( self, audio: np.ndarray, sr: int, segments: list[dict] ) -> dict[str, list[float]]: diff --git a/image/parakeet_patches/main.py b/image/parakeet_patches/main.py index 17aed4f..2d8ea4b 100644 --- a/image/parakeet_patches/main.py +++ b/image/parakeet_patches/main.py @@ -175,7 +175,10 @@ async def diarize_chunk( Response shape: { "duration": 300.0, - "segments": [{"start_s": 1.2, "end_s": 4.8, "speaker": "Speaker_0"}, ...], + "segments": [ + {"start_s": 1.2, "end_s": 4.8, "speaker": "Speaker_0", "confidence": 0.78}, + ... + ], "speakers_detected": ["Speaker_0", "Speaker_1", "Speaker_2"], "fingerprints": { "Speaker_0": [0.123, -0.045, ..., 0.211], # 192-dim TitaNet embedding @@ -188,6 +191,13 @@ async def diarize_chunk( } } + confidence per segment: mean probability that the assigned speaker was + active across the segment's frames (Sortformer's raw per-frame + per-speaker sigmoid outputs). Range [0, 1], higher = more confident. + Clean speech typically >0.5; ambiguous regions (overlap, weak signal) + fall lower. None on derivation failure. Recap Relay can threshold + this to render uncertain segments as "Speaker_0?" in the UI. + Speaker labels are LOCAL to this chunk. Run cosine-similarity clustering across the fingerprints from all chunks to merge `chunkA.Speaker_0` with `chunkB.Speaker_2` when they're the same voice. Recommended threshold: diff --git a/package/startos/versions/v0_1_0.ts b/package/startos/versions/v0_1_0.ts index 43bc877..f93136b 100644 --- a/package/startos/versions/v0_1_0.ts +++ b/package/startos/versions/v0_1_0.ts @@ -1,10 +1,10 @@ import { VersionInfo, IMPOSSIBLE } from '@start9labs/start-sdk' export const v0_1_0 = VersionInfo.of({ - version: '0.13.0:1', + version: '0.13.0:2', releaseNotes: { en_US: - 'v0.13.0:1 — per-chunk diarization worker with voice fingerprints. 
Adds POST /api/audio/diarize-chunk to Spark Control: given one audio chunk, returns Sortformer diarization segments (with LOCAL speaker labels) PLUS a 192-dim TitaNet voice fingerprint per detected speaker. Designed for Recap Relay to call per-chunk and then cluster fingerprints across chunks via cosine similarity for globally consistent speaker IDs. Parakeet container also gets a new /v1/audio/diarize-chunk endpoint and loads NVIDIA TitaNet (nvidia/speakerverification_en_titanet_large, ~25 MB, NeMo-native, no torchaudio drama). Click Reapply patches on the Speech Models card after install to pick up the diarizer.py + main.py updates. Sortformer + Parakeet + Magpie unchanged.', + 'v0.13.0:2 — per-segment confidence in diarize-chunk. Sortformer outputs per-frame per-speaker sigmoid probabilities (~12.6 fps) that we previously discarded. Now: for each diarization segment, compute mean probability of the assigned speaker across the segment\'s frames → confidence in [0, 1]. Recap Relay (and other consumers) can threshold this to render uncertain segments as "Speaker_0?" with a question mark, or to skip them entirely. Endpoint shape is otherwise unchanged — segments[].confidence is a new field, value may be None on derivation failure. Click Reapply patches on the Speech Models card after install to pick up the updated diarizer.py + main.py.', }, migrations: { up: async ({ effects }) => {},