v0.13.0:4 - redaction gateway, embeddings proxy, expanded audio API

- Add redaction gateway (redaction_gateway.py, redaction/ scrub + tests)
- Add embeddings proxy and spark_embed service (Dockerfile + main.py)
- Expand audio_proxy with speaker-aware handling; deep_health/health/server updates
- Package: configureSparks action + sparkConfig model updates, manifest/main wiring
- Docs: AUDIO_API, EMBEDDINGS, REDACTION_GATEWAY; HANDOFF and runbook/known-issues refresh
2026-06-11 17:45:21 -05:00
parent 4a75274db3
commit 8d839e3714
37 changed files with 3763 additions and 197 deletions
@@ -1,10 +1,12 @@
 """OpenAI-compatible audio proxy: lets any OpenAI-shaped client (Open WebUI,
-Home Assistant, etc.) talk to Parakeet (STT) and Magpie (TTS) through one URL.
+Home Assistant, etc.) talk to Parakeet (STT) and Kokoro (TTS) through one URL.

 Endpoints exposed on spark-control's port (same as the dashboard):
-  GET  /v1/models                 — lists STT model + Magpie voices in OpenAI shape
-  POST /v1/audio/speech           — OpenAI TTS → Magpie /v1/audio/synthesize
-  POST /v1/audio/transcriptions   — forward to Parakeet (already OpenAI-compatible)
+  GET  /v1/models                       — lists STT model + Kokoro voices in OpenAI shape
+  POST /v1/audio/speech                 — OpenAI TTS → Kokoro /v1/audio/speech
+  POST /v1/audio/transcriptions         — forward to Parakeet (already OpenAI-compatible)
+  POST /api/audio/diarize-chunk         — per-chunk diarization (Parakeet container, Sortformer+TitaNet)
+  POST /api/audio/transcribe-with-speakers — ASR + diarization merged

 Both downstream services already speak HTTP on the LAN; this module just adapts
 request/response shapes so OpenAI clients don't need a custom integration.
@@ -13,10 +15,20 @@ When Parakeet returns a 500 (commonly the recurring CUDA wedge), the proxy
 returns a clearer 503 with Retry-After=60, and fires the deep-health probe in
 the background — which detects the wedge and triggers a rate-limited container
 restart inside seconds. The client's next attempt ~60s later then succeeds.
+
+TTS is intentionally simple: send the request to Kokoro and return its fully
+buffered response. Kokoro-82M is reliable enough (24/24 successful renders across
+the same input lengths that broke Magpie 13/24 times) that no retry, chunking,
+or duration-validation layer is needed. This used to be a ~150-line tangle
+under v0.13.0:6's Magpie-with-chunking workaround; it's now a single forward.
 """
 from __future__ import annotations
 import asyncio
+import io
+import json
 import logging
+import wave
+from array import array
 from typing import Any, Optional

 import httpx
@@ -28,38 +40,33 @@ from .config import Settings

 logger = logging.getLogger("spark-control.audio")

-# Magpie voice name encodes its language. Example:
-#   Magpie-Multilingual.EN-US.Mia        -> en-US
-#   Magpie-Multilingual.ES-US.Diego      -> es-US
-#   Magpie-Multilingual.FR-FR.Pascal     -> fr-FR
-def _lang_from_voice(voice: str) -> str:
-    try:
-        parts = voice.split(".")
-        # parts = ["Magpie-Multilingual", "EN-US", "Mia"] (or with emotion suffix)
-        if len(parts) >= 2 and "-" in parts[1]:
-            lang_part = parts[1]  # "EN-US"
-            primary, region = lang_part.split("-", 1)
-            return f"{primary.lower()}-{region.upper()}"
-    except Exception:
-        pass
-    return "en-US"

+# Kokoro default voice. The four curated voices below were Alice-tested for
+# narration/recap-style content; bm_george is the default. Clients can pass
+# any of Kokoro's 67 voices in the `voice` field — see /v1/models.
+DEFAULT_VOICE = "bm_george"

-# Default voice: configurable, falls back to a sensible English voice if unset.
-DEFAULT_VOICE = "Magpie-Multilingual.EN-US.Mia"
+# Curated quick-pick voices surfaced at the top of /v1/models. The full list
+# of 67 voices is fetched live from Kokoro and appended after these.
+CURATED_VOICES: list[dict] = [
+    {"id": "bm_george", "name": "George (British male, narrator-style)",      "language": "en-GB"},
+    {"id": "bf_emma",   "name": "Emma (British female, audiobook-style)",     "language": "en-GB"},
+    {"id": "am_michael","name": "Michael (American male, warm narrator)",     "language": "en-US"},
+    {"id": "af_heart",  "name": "Heart (American female, warm and balanced)", "language": "en-US"},
+]


 class SpeechRequest(BaseModel):
-    """OpenAI /v1/audio/speech request body."""
-    model: Optional[str] = None              # ignored — Magpie has one model
-    input: str                                # the text to speak
-    voice: Optional[str] = None              # e.g. "Magpie-Multilingual.EN-US.Mia"
-    response_format: Optional[str] = "wav"   # only "wav" supported today
-    speed: Optional[float] = 1.0             # ignored by Magpie
-    # Magpie-specific extensions (clients may pass these through)
-    language: Optional[str] = None
-    sample_rate_hz: Optional[int] = 22050
-    encoding: Optional[str] = "LINEAR_PCM"
+    """OpenAI /v1/audio/speech request body. Forwarded to Kokoro mostly-verbatim.
+
+    Kokoro accepts the OpenAI shape natively, so we only need to substitute the
+    default voice when the client doesn't specify one.
+    """
+    model: Optional[str] = None              # Kokoro tolerates any model id
+    input: str                               # the text to speak
+    voice: Optional[str] = None              # e.g. "bm_george"; default: DEFAULT_VOICE
+    response_format: Optional[str] = "wav"   # Kokoro supports wav, mp3, opus, flac
+    speed: Optional[float] = 1.0


 def build_router(settings: Settings, deep_health: Any = None) -> APIRouter:
@@ -74,15 +81,17 @@ def build_router(settings: Settings, deep_health: Any = None) -> APIRouter:
    def _parakeet_base() -> str:
        return f"http://{settings.parakeet_host}:{settings.parakeet_port}"

-    def _magpie_base() -> str:
-        return f"http://{settings.magpie_host}:{settings.magpie_port}"
+    def _kokoro_base() -> str:
+        return f"http://{settings.kokoro_host}:{settings.kokoro_port}"

    # ---- /v1/models ----
    @router.get("/v1/models")
    async def list_models() -> dict:
-        """Advertise the STT model + a small voice menu so clients can
-        populate their voice-picker UIs. Falls back gracefully if Magpie
-        is offline (returns just the STT entry)."""
+        """Advertise the STT model + Kokoro voices in OpenAI list shape.
+
+        Curated voices appear first; the rest of Kokoro's catalog follows.
+        Falls back to just the STT entry + curated voices if Kokoro is offline.
+        """
        data: list[dict] = [
            {
                "id": "parakeet-tdt-0.6b-v3",
@@ -91,66 +100,82 @@ def build_router(settings: Settings, deep_health: Any = None) -> APIRouter:
                "kind": "stt",
            },
        ]
-        # Try to enumerate voices from Magpie; if unreachable, just skip.
+        # Curated first — these are the four Alice chose for narration/recap.
+        seen = set()
+        for v in CURATED_VOICES:
+            data.append({
+                "id": v["id"],
+                "object": "model",
+                "owned_by": "kokoro",
+                "kind": "tts",
+                "display_name": v.get("name"),
+                "language": v.get("language"),
+                "curated": True,
+            })
+            seen.add(v["id"])
+
+        # Append everything else Kokoro advertises (~63 more voices across many
+        # languages). Best-effort — if Kokoro is unreachable, the curated list
+        # alone is still usable.
        try:
            async with httpx.AsyncClient(timeout=5.0) as client:
-                r = await client.get(f"{_magpie_base()}/v1/audio/list_voices")
+                r = await client.get(f"{_kokoro_base()}/v1/audio/voices")
            if r.status_code == 200:
-                voices_by_locales = r.json()
-                seen = set()
-                for _locales, payload in voices_by_locales.items():
-                    for v in payload.get("voices", []):
-                        # Collapse emotion variants — expose only the base voice name.
-                        # "Magpie-Multilingual.EN-US.Mia.Angry" -> "Magpie-Multilingual.EN-US.Mia"
-                        parts = v.split(".")
-                        base = ".".join(parts[:3]) if len(parts) >= 3 else v
-                        if base not in seen:
-                            seen.add(base)
-                            data.append({
-                                "id": base,
-                                "object": "model",
-                                "owned_by": "nvidia",
-                                "kind": "tts",
-                            })
+                body = r.json()
+                for v in body.get("voices", []):
+                    vid = v.get("id") if isinstance(v, dict) else v
+                    if not vid or vid in seen:
+                        continue
+                    data.append({
+                        "id": vid,
+                        "object": "model",
+                        "owned_by": "kokoro",
+                        "kind": "tts",
+                    })
+                    seen.add(vid)
        except Exception as e:
-            logger.warning("magpie voice list unavailable: %s", e)
+            logger.warning("kokoro voice list unavailable: %s", e)
        return {"object": "list", "data": data}

    # ---- /v1/audio/speech (TTS) ----
    @router.post("/v1/audio/speech")
    async def speech(body: SpeechRequest) -> Response:
-        """OpenAI-style TTS. Translates to Magpie's multipart synth call.
+        """OpenAI-style TTS. Forwards to Kokoro and returns the audio bytes.

-        Returns raw WAV bytes (Content-Type: audio/wav) — browsers and most
-        clients play these directly.
+        Kokoro accepts the OpenAI shape natively. We only fill in the defaults the
+        client omitted (voice, model, response_format). Response is whatever format
+        Kokoro produces (WAV by default, mp3/opus/flac if the client asked for one).
+
+        No retry layer needed — Kokoro is reliable at any input length.
        """
        text = (body.input or "").strip()
        if not text:
            raise HTTPException(400, "input text is required")

        voice = body.voice or DEFAULT_VOICE
-        language = body.language or _lang_from_voice(voice)
-        sample_rate = int(body.sample_rate_hz or 22050)
-        encoding = body.encoding or "LINEAR_PCM"
-
-        form = {
-            "text": text,
-            "language": language,
+        response_format = body.response_format or "wav"
+        payload = {
+            "model": body.model or "kokoro",
+            "input": text,
            "voice": voice,
-            "sample_rate_hz": str(sample_rate),
-            "encoding": encoding,
+            "response_format": response_format,
        }
+        if body.speed is not None:
+            payload["speed"] = body.speed
+
        try:
            async with httpx.AsyncClient(timeout=120.0) as client:
-                r = await client.post(f"{_magpie_base()}/v1/audio/synthesize", data=form)
+                r = await client.post(
+                    f"{_kokoro_base()}/v1/audio/speech", json=payload
+                )
        except httpx.HTTPError as e:
-            raise HTTPException(502, f"magpie unreachable: {e}")
+            raise HTTPException(502, f"kokoro unreachable: {e}")

        if r.status_code != 200:
-            # Surface Magpie's error message verbatim so clients can debug voice/lang typos.
+            # Surface Kokoro's error verbatim (bad voice, bad format, etc.).
            raise HTTPException(r.status_code, r.text[:500])

-        # Magpie returns WAV bytes already (Content-Type: audio/wav). Pass through.
+        # Forward Kokoro's content-type so the client knows the format.
        media_type = r.headers.get("content-type", "audio/wav")
        return Response(content=r.content, media_type=media_type)

@@ -209,11 +234,11 @@ def build_router(settings: Settings, deep_health: Any = None) -> APIRouter:
            raise HTTPException(r.status_code, r.text[:500])
        return Response(content=r.content, media_type=r.headers.get("content-type", "application/json"))

-    # ---- /api/audio/diarize-chunk (per-chunk worker for Recap Relay) ----
+    # ---- /api/audio/diarize-chunk (per-chunk worker for chunked workflows) ----
    @router.post("/api/audio/diarize-chunk")
    async def diarize_chunk(file: UploadFile = File(...)) -> dict:
-        """Per-chunk worker designed for orchestrators (Recap Relay) that
-        handle chunking + cross-chunk speaker clustering themselves.
+        """Per-chunk worker designed for orchestrators that handle chunking +
+        cross-chunk speaker clustering themselves.

        Given ONE audio chunk, returns diarization segments (with LOCAL
        speaker labels — Speaker_0/1/... reset per chunk) AND a 192-dim
@@ -271,7 +296,7 @@ def build_router(settings: Settings, deep_health: Any = None) -> APIRouter:
        """Diarized transcription: run Parakeet ASR and Sortformer diarization on
        the same audio in parallel, then merge by timestamp.

-        Response shape (designed for downstream UIs like recap-relay):
+        Response shape (designed for downstream UIs):

            {
              "duration": 90.5,
@@ -299,8 +324,6 @@ def build_router(settings: Settings, deep_health: Any = None) -> APIRouter:
        filename = file.filename or "audio.wav"
        content_type = file.content_type or "application/octet-stream"

-        # Parakeet ASR + Sortformer diarizer in parallel. (A WhisperX detour
-        # lived here briefly — reverted in v0.13.0:0; see release notes.)
        async def _call_transcribe(client: httpx.AsyncClient) -> dict:
            files = {"file": (filename, body, content_type)}
            data = {"response_format": "verbose_json"}
@@ -359,9 +382,353 @@ def build_router(settings: Settings, deep_health: Any = None) -> APIRouter:
            },
        }

+    # ---- /api/audio/label-merge (diarize + name clusters from a visual timeline) ----
+    async def _diar(client, b, fn):
+        r = await client.post(f"{_parakeet_base()}/v1/audio/diarize-chunk",
+                              files={"file": (fn, b, "audio/wav")})
+        r.raise_for_status()
+        return r.json()
+
+    async def _txn(client, b, fn):
+        r = await client.post(f"{_parakeet_base()}/v1/audio/transcriptions",
+                              files={"file": (fn, b, "audio/wav")},
+                              data={"response_format": "verbose_json"})
+        r.raise_for_status()
+        return r.json()
+
+    @router.post("/api/audio/label-merge")
+    async def label_merge(
+        file: Optional[UploadFile] = File(default=None),
+        mic_file: Optional[UploadFile] = File(default=None),
+        system_file: Optional[UploadFile] = File(default=None),
+        timeline: str = Form(...),
+        self_name: str = Form(default="Me"),
+        self_vad: Optional[str] = Form(default=None),
+        known_voiceprints: Optional[str] = Form(default=None),
+        transcribe: bool = Form(default=False),
+        min_overlap: float = Form(default=0.0),
+        voiceprint_threshold: float = Form(default=0.5),
+    ) -> dict:
+        """Diarize audio and NAME each anonymous cluster from a caller-supplied visual
+        timeline (who-was-on-screen-when) by majority temporal overlap, with a voice-
+        fingerprint fallback. Stateless + portable — the caller owns the timeline and
+        voiceprint library; nothing is persisted here.
+
+        TWO MODES:
+
+        * MONO (legacy): send `file` (mixed mono). Diarizes the mix, names clusters.
+
+        * DUAL-CHANNEL: send `mic_file` (the local user's mic) + `system_file`
+          (everyone else, from screen capture), sample-aligned to a shared t0. This
+          uses the channels to SPLIT the problem instead of forcing the diarizer to
+          re-disentangle a mono mix:
+            - mic track  -> the local user's words, gated to windows where the mic is
+              actually the user speaking (mic louder than system — a self-VAD computed
+              server-side from the two channels, or supplied via `self_vad`). The mic
+              picks up the remote audio as quiet bleed, so this gate is LOAD-BEARING:
+              without it the bleed would be transcribed as the user.
+            - system track -> diarized (only has to separate the *remote* people, a
+              strictly easier problem) and named via the visual timeline + voiceprints.
+            - the user's clean voiceprint is enrolled from the mic track and injected
+              into the voiceprint library, so a system-track cluster that's actually the
+              user dialed in from a second device (dual-login) resolves to the user, not
+              a stranger.
+          Self-attribution becomes near-perfect (dedicated channel), remote diarization
+          gets cleaner, overlapping speech is trivially separated, and the user no longer
+          consumes one of Sortformer's 4 speaker slots.
+
+        Form fields (multipart):
+          file | (mic_file + system_file)   audio — mono mix OR the two channels
+          timeline              JSON [{"start","end","name","confidence?"}, ...] (visual hints for remote folks)
+          self_name             name for the local user (mic channel). Default "Me".
+          self_vad              optional JSON [{"start","end"}] mic-active-and-louder windows;
+                                if omitted, computed server-side by per-window RMS.
+          known_voiceprints     optional JSON {name: [192 floats]} from past calls (include the user's)
+          transcribe            "true" to attach per-segment text (always on in dual-channel)
+          min_overlap           min fraction of a cluster's time overlapping the winning name (default 0)
+          voiceprint_threshold  cosine similarity to accept a voiceprint match (default 0.5)
+        """
+        try:
+            tl = json.loads(timeline)
+            assert isinstance(tl, list)
+        except Exception:
+            raise HTTPException(400, "timeline must be a JSON array of {start,end,name}")
+        known_vp: dict[str, list[float]] = {}
+        if known_voiceprints:
+            try:
+                known_vp = json.loads(known_voiceprints)
+                assert isinstance(known_vp, dict)
+            except Exception:
+                raise HTTPException(400, "known_voiceprints must be a JSON object {name: [floats]}")
+
+        dual = mic_file is not None and system_file is not None
+        if not dual and file is None:
+            raise HTTPException(400, "provide either 'file' (mono) or both 'mic_file' and 'system_file'")
+
+        try:
+            async with httpx.AsyncClient(timeout=600.0) as client:
+                if dual:
+                    return await _label_merge_dual(
+                        client, _diar, _txn, await mic_file.read(), await system_file.read(),
+                        tl, self_name, self_vad, known_vp, min_overlap, voiceprint_threshold)
+                body = await file.read()
+                if not body:
+                    raise HTTPException(400, "Empty file")
+                fn = file.filename or "audio.wav"
+                if transcribe:
+                    diar, stt = await asyncio.gather(_diar(client, body, fn), _txn(client, body, fn))
+                else:
+                    diar, stt = await _diar(client, body, fn), None
+        except HTTPException:
+            raise
+        except httpx.HTTPStatusError as e:
+            if e.response.status_code == 500 and deep_health is not None:
+                try:
+                    asyncio.create_task(deep_health.run_one("parakeet"))
+                except Exception:
+                    pass
+                raise HTTPException(503, "Parakeet transient error (likely CUDA wedge). Retry in ~60s.",
+                                    headers={"Retry-After": "60"})
+            raise HTTPException(e.response.status_code, e.response.text[:500])
+        except httpx.HTTPError as e:
+            raise HTTPException(502, f"parakeet unreachable: {e}")
+
+        # ---- MONO path ----
+        diar_segments = diar.get("segments", [])
+        fingerprints = diar.get("fingerprints", {}) or {}
+        clusters = diar.get("speakers_detected", [])
+        assignment = _name_clusters(diar_segments, fingerprints, clusters, tl, known_vp,
+                                    min_overlap, voiceprint_threshold)
+        relabeled_turns = [
+            {"start_s": s.get("start_s"), "end_s": s.get("end_s"),
+             "speaker": assignment[s.get("speaker")]["name"]}
+            for s in diar_segments if s.get("speaker") in assignment
+        ]
+        if transcribe and stt is not None:
+            out_segments = _merge_words_with_speakers(stt.get("words", []), relabeled_turns)
+        else:
+            out_segments = [{
+                "start_s": s.get("start_s"), "end_s": s.get("end_s"),
+                "speaker": assignment.get(s.get("speaker"), {}).get("name", s.get("speaker")),
+                "confidence": s.get("confidence"),
+            } for s in diar_segments]
+        speakers, named_fingerprints = _speaker_list(clusters, assignment, fingerprints)
+        return {
+            "mode": "mono",
+            "duration": diar.get("duration", 0.0),
+            "speakers": speakers,
+            "segments": out_segments,
+            "fingerprints": named_fingerprints,
+            "models": diar.get("models", {}),
+        }
+
    return router


+# ---- Label-merge helpers ----
+
+def _overlap_seconds(a0: float, a1: float, b0: float, b1: float) -> float:
+    return max(0.0, min(a1, b1) - max(a0, b0))
+
+
+def _cosine(a: Optional[list], b: Optional[list]) -> float:
+    if not a or not b or len(a) != len(b):
+        return 0.0
+    dot = sum(x * y for x, y in zip(a, b))
+    na = sum(x * x for x in a) ** 0.5
+    nb = sum(x * x for x in b) ** 0.5
+    if na == 0 or nb == 0:
+        return 0.0
+    return dot / (na * nb)
+
+
+def _name_clusters(diar_segments, fingerprints, clusters, tl, known_vp,
+                   min_overlap, voiceprint_threshold):
+    """Assign a name to each anonymous diarization cluster: visual-timeline overlap
+    winner -> closest known-voiceprint match -> Unknown_N. Shared by mono + dual."""
+    cluster_dur: dict[str, float] = {}
+    cluster_name_overlap: dict[str, dict[str, float]] = {}
+    for seg in diar_segments:
+        spk = seg.get("speaker")
+        s0, s1 = float(seg.get("start_s", 0)), float(seg.get("end_s", 0))
+        cluster_dur[spk] = cluster_dur.get(spk, 0.0) + max(0.0, s1 - s0)
+        for entry in tl:
+            name = (entry.get("name") or "").strip()
+            if not name:
+                continue
+            ov = _overlap_seconds(s0, s1, float(entry.get("start", 0)), float(entry.get("end", 0)))
+            if ov > 0:
+                cluster_name_overlap.setdefault(spk, {})
+                cluster_name_overlap[spk][name] = cluster_name_overlap[spk].get(name, 0.0) + ov
+    assignment: dict[str, dict] = {}
+    used_unknown = 0
+    for cluster in clusters:
+        names = cluster_name_overlap.get(cluster, {})
+        total = cluster_dur.get(cluster, 0.0) or 1.0
+        if names:
+            winner = max(names.items(), key=lambda kv: kv[1])
+            conf = winner[1] / total
+            if conf >= min_overlap:
+                assignment[cluster] = {"name": winner[0], "source": "visual",
+                                       "overlap_confidence": round(conf, 4)}
+                continue
+        fp = fingerprints.get(cluster)
+        best_name, best_sim = None, 0.0
+        if fp and known_vp:
+            for nm, vec in known_vp.items():
+                sim = _cosine(fp, vec)
+                if sim > best_sim:
+                    best_name, best_sim = nm, sim
+        if best_name and best_sim >= voiceprint_threshold:
+            assignment[cluster] = {"name": best_name, "source": "voiceprint",
+                                   "match_similarity": round(best_sim, 4)}
+        else:
+            assignment[cluster] = {"name": f"Unknown_{used_unknown}", "source": "unmatched"}
+            used_unknown += 1
+    return assignment
+
+
+def _speaker_list(clusters, assignment, fingerprints):
+    """Build the response `speakers` list + name->fingerprint map from an assignment."""
+    speakers, named = [], {}
+    for cluster in clusters:
+        a = assignment[cluster]
+        entry = {"cluster": cluster, "name": a["name"], "source": a["source"],
+                 "fingerprint": fingerprints.get(cluster)}
+        if "overlap_confidence" in a:
+            entry["overlap_confidence"] = a["overlap_confidence"]
+        if "match_similarity" in a:
+            entry["match_similarity"] = a["match_similarity"]
+        speakers.append(entry)
+        if fingerprints.get(cluster) is not None:
+            named[a["name"]] = fingerprints.get(cluster)
+    return speakers, named
+
+
+def _wav_pcm(b: bytes):
+    """Decode a 16-bit mono/stereo WAV to (int16 array, sample_rate). Returns
+    (None, 0) if it can't decode (caller then requires a client-supplied self_vad)."""
+    try:
+        with wave.open(io.BytesIO(b), "rb") as w:
+            sr, n, ch, sw = w.getframerate(), w.getnframes(), w.getnchannels(), w.getsampwidth()
+            raw = w.readframes(n)
+        if sw != 2:
+            return None, 0
+        a = array("h")
+        a.frombytes(raw)
+        if ch > 1:
+            a = a[0::ch]  # take channel 0
+        return a, sr
+    except Exception:
+        return None, 0
+
+
+def _win_rms(pcm_sr, s: float, e: float) -> float:
+    """Normalized RMS (0..1) of the [s,e]-second window of a decoded PCM array."""
+    a, sr = pcm_sr
+    if a is None or sr <= 0:
+        return 0.0
+    i, j = max(0, int(s * sr)), min(len(a), int(e * sr))
+    if j <= i:
+        return 0.0
+    ss = 0
+    for x in a[i:j]:
+        ss += x * x
+    return (ss / (j - i)) ** 0.5 / 32768.0
+
+
+async def _label_merge_dual(client, diar_fn, txn_fn, mic_b, sys_b, tl, self_name,
+                            self_vad_json, known_vp, min_overlap, voiceprint_threshold):
+    """Dual-channel label-merge: mic track = the local user (gated to mic-dominant
+    windows so remote bleed isn't transcribed as the user); system track = diarized +
+    named remote speakers. See label_merge docstring for the full rationale."""
+    if not mic_b or not sys_b:
+        raise HTTPException(400, "empty mic_file or system_file")
+
+    # System: diarize + transcribe (parallel). Mic: transcribe + diarize (parallel) —
+    # the mic diarization yields the user's clean enrollment voiceprint.
+    sys_diar, sys_stt, mic_stt, mic_diar = await asyncio.gather(
+        diar_fn(client, sys_b, "system.wav"), txn_fn(client, sys_b, "system.wav"),
+        txn_fn(client, mic_b, "mic.wav"), diar_fn(client, mic_b, "mic.wav"))
+
+    # Enroll the user's voiceprint = fingerprint of the dominant cluster on the mic track.
+    self_vp = None
+    mic_fps = mic_diar.get("fingerprints", {}) or {}
+    if mic_fps:
+        durs: dict[str, float] = {}
+        for s in mic_diar.get("segments", []):
+            durs[s["speaker"]] = durs.get(s["speaker"], 0.0) + (s["end_s"] - s["start_s"])
+        top = max(durs, key=durs.get) if durs else next(iter(mic_fps))
+        self_vp = mic_fps.get(top)
+    # Inject self voiceprint so a dual-login (phone) system cluster resolves to the user.
+    vp_lib = dict(known_vp)
+    if self_vp is not None:
+        vp_lib.setdefault(self_name, self_vp)
+
+    # Name the SYSTEM clusters (remote people, possibly incl. phone-self via voiceprint).
+    sys_segments = sys_diar.get("segments", [])
+    sys_fps = sys_diar.get("fingerprints", {}) or {}
+    sys_clusters = sys_diar.get("speakers_detected", [])
+    sys_assign = _name_clusters(sys_segments, sys_fps, sys_clusters, tl, vp_lib,
+                                min_overlap, voiceprint_threshold)
+    sys_turns = [{"start_s": s["start_s"], "end_s": s["end_s"],
+                  "speaker": sys_assign[s["speaker"]]["name"]}
+                 for s in sys_segments if s["speaker"] in sys_assign]
+    remote_blocks = _merge_words_with_speakers(sys_stt.get("words", []), sys_turns)
+
+    # Self-VAD: keep only mic words where the mic is genuinely the local user (mic
+    # louder than system), excluding the remote bleed the mic also picks up.
+    vad_windows = None
+    if self_vad_json:
+        try:
+            vad_windows = json.loads(self_vad_json)
+            assert isinstance(vad_windows, list)
+        except Exception:
+            vad_windows = None
+    mic_pcm = _wav_pcm(mic_b)
+    sys_pcm = _wav_pcm(sys_b)
+    if vad_windows is None and mic_pcm[0] is None:
+        raise HTTPException(400, "could not decode WAV for self-VAD; send 16-bit mono WAV or a self_vad array")
+
+    # Margin so the mic must be CLEARLY louder than system to count as local — guards
+    # against brief remote bleed near utterance boundaries (real local speech runs many
+    # times louder than the bleed; real remote runs many times quieter).
+    _LOCAL_MARGIN = 1.2
+
+    def _is_local(s: float, e: float) -> bool:
+        if vad_windows is not None:
+            return any(_overlap_seconds(s, e, float(w.get("start", 0)), float(w.get("end", 0))) > 0
+                       for w in vad_windows)
+        return _win_rms(mic_pcm, s, e) > _win_rms(sys_pcm, s, e) * _LOCAL_MARGIN
+
+    # Keep mic words where the mic is clearly the dominant channel (margin excludes the
+    # remote bleed the mic also picks up), THEN group the surviving local words into
+    # blocks. Filtering before grouping means a block never mixes local speech with loud
+    # bleed (which would average to system-dominant and drop the whole utterance).
+    local_words = [w for w in mic_stt.get("words", [])
+                   if _is_local(float(w.get("start", 0)), float(w.get("end", 0)))]
+    local_blocks = (_merge_words_with_speakers(
+        local_words, [{"start_s": 0.0, "end_s": 1e12, "speaker": self_name}])
+        if local_words else [])
+
+    segments = sorted(remote_blocks + local_blocks, key=lambda b: b.get("start_ms", 0))
+
+    speakers, named = _speaker_list(sys_clusters, sys_assign, sys_fps)
+    speakers.append({"cluster": "mic", "name": self_name, "source": "mic_channel",
+                     "fingerprint": self_vp})
+    if self_vp is not None:
+        named[self_name] = self_vp
+
+    return {
+        "mode": "dual_channel",
+        "duration": max(sys_diar.get("duration", 0.0), mic_stt.get("duration", 0.0)),
+        "speakers": speakers,
+        "segments": segments,
+        "fingerprints": named,
+        "models": sys_diar.get("models", {}),
+    }
+
+
 # ---- Merge helper: assign speaker to each word, then group into blocks ----

 def _assign_speaker_to_word(word_start_s: float, word_end_s: float, diar_turns: list[dict]) -> str: