diff --git a/image/parakeet_patches/diarizer.py b/image/parakeet_patches/diarizer.py
index 60c4e46..f59335a 100644
--- a/image/parakeet_patches/diarizer.py
+++ b/image/parakeet_patches/diarizer.py
@@ -159,7 +159,10 @@ class SortformerDiarizer:
         Returns:
             {
               "duration": float,
-              "segments": [{"start_s", "end_s", "speaker"}, ...],
+              "segments": [
+                {"start_s", "end_s", "speaker", "confidence": float|None},
+                ...
+              ],
               "speakers_detected": ["Speaker_0", ...],
               "fingerprints": {
                 "Speaker_0": [192 floats],
@@ -168,6 +171,14 @@ class SortformerDiarizer:
               },
               "models": {"diarization": ..., "embedding": ...},
             }
+
+        `confidence` per segment is the mean probability the assigned speaker
+        was active during that segment's frames (Sortformer's raw per-frame
+        per-speaker sigmoid outputs, ~12.6 fps). Range [0, 1], higher = more
+        confident. Typical values for clean speech: >0.5 for confident
+        assignments, 0.2-0.5 for ambiguous, <0.2 for very weak. Recap Relay
+        can use a threshold to mark uncertain segments as "Speaker_0?" in
+        the UI rather than confidently mislabel.
         """
         if not self._loaded:
             self.load_model()
@@ -180,10 +191,17 @@ class SortformerDiarizer:
             duration = len(data) / sr
             logger.info(f"diarize_chunk: {duration:.1f}s audio, running Sortformer...")
 
-            # 1. Diarize
+            # 1. Diarize WITH the per-frame per-speaker tensor outputs so we
+            #    can derive per-segment confidence.
             with torch.no_grad():
-                raw = self.model.diarize(audio=[wav_path], batch_size=1, verbose=False)
+                raw, tensor_outputs = self.model.diarize(
+                    audio=[wav_path],
+                    batch_size=1,
+                    include_tensor_outputs=True,
+                    verbose=False,
+                )
             segments = _parse_sortformer_segments(raw)
+            self._attach_confidence(segments, tensor_outputs, duration)
             speakers = sorted({s["speaker"] for s in segments})
             logger.info(f"  detected {len(speakers)} local speakers, {len(segments)} turns")
 
@@ -208,6 +226,54 @@ class SortformerDiarizer:
                 try: os.unlink(wav_path)
                 except OSError: pass
 
+    def _attach_confidence(
+        self,
+        segments: list[dict],
+        tensor_outputs: Optional[list],
+        duration_s: float,
+    ) -> None:
+        """Set `confidence` on every segment in-place (None when underivable).
+
+        The value is the mean score of the segment's assigned speaker over the
+        segment's frames, rounded to 4 places. Frame indices come from
+        n_frames / duration_s, so no fixed frame rate is assumed. Assumes
+        tensor_outputs[0] is [n_frames, n_speakers] (optionally with a leading
+        batch dim of 1) and labels end in "_<index>" -- TODO confirm vs NeMo.
+        """
+        import math  # local: the module's import block is outside this hunk
+        # Default first: every early return and the except path then leave the
+        # key present without repeating the reset loop.
+        for seg in segments:
+            seg["confidence"] = None
+        try:
+            if not tensor_outputs or duration_s <= 0:
+                return
+            scores = tensor_outputs[0]
+            shape = tuple(getattr(scores, "shape", ()))
+            if len(shape) == 3 and shape[0] == 1:
+                scores, shape = scores[0], shape[1:]  # drop batch dim
+            if len(shape) != 2 or shape[0] == 0:
+                return
+            n_frames, n_speakers = shape
+            fps = n_frames / duration_s  # frames per second
+            for seg in segments:
+                try:
+                    spk_idx = int(seg.get("speaker", "").rsplit("_", 1)[1])
+                except (ValueError, IndexError):
+                    continue
+                if not 0 <= spk_idx < n_speakers:
+                    continue
+                f_start = max(0, int(seg["start_s"] * fps))
+                f_end = min(n_frames, int(seg["end_s"] * fps) + 1)
+                if f_end <= f_start:
+                    continue
+                conf = float(scores[f_start:f_end, spk_idx].mean())
+                # NaN/inf would break the [0, 1] contract and strict JSON.
+                if math.isfinite(conf):
+                    seg["confidence"] = round(conf, 4)
+        except Exception as e:
+            logger.warning(f"failed to attach confidence: {e}")
+
     def _extract_fingerprints_internal(
         self, audio: np.ndarray, sr: int, segments: list[dict]
     ) -> dict[str, list[float]]:
diff --git a/image/parakeet_patches/main.py b/image/parakeet_patches/main.py
index 17aed4f..2d8ea4b 100644
--- a/image/parakeet_patches/main.py
+++ b/image/parakeet_patches/main.py
@@ -175,7 +175,10 @@ async def diarize_chunk(
     Response shape:
         {
           "duration": 300.0,
-          "segments": [{"start_s": 1.2, "end_s": 4.8, "speaker": "Speaker_0"}, ...],
+          "segments": [
+            {"start_s": 1.2, "end_s": 4.8, "speaker": "Speaker_0", "confidence": 0.78},
+            ...
+          ],
           "speakers_detected": ["Speaker_0", "Speaker_1", "Speaker_2"],
           "fingerprints": {
             "Speaker_0": [0.123, -0.045, ..., 0.211],   # 192-dim TitaNet embedding
@@ -188,6 +191,13 @@ async def diarize_chunk(
           }
         }
 
+    confidence per segment: mean probability that the assigned speaker was
+    active across the segment's frames (Sortformer's raw per-frame per-
+    speaker sigmoid outputs). Range [0, 1], higher = more confident.
+    Clean speech typically >0.5; ambiguous regions (overlap, weak signal)
+    fall lower. JSON null when it cannot be derived. Recap Relay can
+    threshold this to render uncertain segments as "Speaker_0?" in the UI.
+
     Speaker labels are LOCAL to this chunk. Run cosine-similarity clustering
     across the fingerprints from all chunks to merge `chunkA.Speaker_0` with
     `chunkB.Speaker_2` when they're the same voice. Recommended threshold:
diff --git a/package/startos/versions/v0_1_0.ts b/package/startos/versions/v0_1_0.ts
index 43bc877..f93136b 100644
--- a/package/startos/versions/v0_1_0.ts
+++ b/package/startos/versions/v0_1_0.ts
@@ -1,10 +1,10 @@
 import { VersionInfo, IMPOSSIBLE } from '@start9labs/start-sdk'
 
 export const v0_1_0 = VersionInfo.of({
-  version: '0.13.0:1',
+  version: '0.13.0:2',
   releaseNotes: {
     en_US:
-      'v0.13.0:1 — per-chunk diarization worker with voice fingerprints. Adds POST /api/audio/diarize-chunk to Spark Control: given one audio chunk, returns Sortformer diarization segments (with LOCAL speaker labels) PLUS a 192-dim TitaNet voice fingerprint per detected speaker. Designed for Recap Relay to call per-chunk and then cluster fingerprints across chunks via cosine similarity for globally consistent speaker IDs. Parakeet container also gets a new /v1/audio/diarize-chunk endpoint and loads NVIDIA TitaNet (nvidia/speakerverification_en_titanet_large, ~25 MB, NeMo-native, no torchaudio drama). Click Reapply patches on the Speech Models card after install to pick up the diarizer.py + main.py updates. Sortformer + Parakeet + Magpie unchanged.',
+      'v0.13.0:2 — per-segment confidence in diarize-chunk. Sortformer outputs per-frame per-speaker sigmoid probabilities (~12.6 fps) that we previously discarded. Now: for each diarization segment, compute mean probability of the assigned speaker across the segment\'s frames → confidence in [0, 1]. Recap Relay (and other consumers) can threshold this to render uncertain segments as "Speaker_0?" with a question mark, or to skip them entirely. Endpoint shape is otherwise unchanged — segments[].confidence is a new field, value may be None on derivation failure. Click Reapply patches on the Speech Models card after install to pick up the updated diarizer.py + main.py.',
   },
   migrations: {
     up: async ({ effects }) => {},