"""OpenAI-compatible audio proxy: lets any OpenAI-shaped client (Open WebUI, Home Assistant, etc.) talk to Parakeet (STT) and Kokoro (TTS) through one URL. Endpoints exposed on spark-control's port (same as the dashboard): GET /v1/models — lists STT model + Kokoro voices in OpenAI shape POST /v1/audio/speech — OpenAI TTS → Kokoro /v1/audio/speech POST /v1/audio/transcriptions — forward to Parakeet (already OpenAI-compatible) POST /api/audio/diarize-chunk — per-chunk diarization (Parakeet container, Sortformer+TitaNet) POST /api/audio/transcribe-with-speakers — ASR + diarization merged Both downstream services already speak HTTP on the LAN; this module just adapts request/response shapes so OpenAI clients don't need a custom integration. When Parakeet returns a 500 (commonly the recurring CUDA wedge), the proxy returns a clearer 503 with Retry-After=60, and fires the deep-health probe in the background — which detects the wedge and triggers a rate-limited container restart inside seconds. The client's next attempt ~60s later then succeeds. TTS is intentionally simple: forward the request body to Kokoro and stream the response back. Kokoro-82M is reliable enough (24/24 successful renders across the same input lengths that broke Magpie 13/24 times) that no retry, chunking, or duration-validation layer is needed. This used to be a ~150-line tangle under v0.13.0:6's Magpie-with-chunking workaround; it's now a single forward. """ from __future__ import annotations import asyncio import io import json import logging import wave from array import array from typing import Any, Optional import httpx from fastapi import APIRouter, Form, HTTPException, Request, UploadFile, File from fastapi.responses import Response, StreamingResponse from pydantic import BaseModel from .config import Settings logger = logging.getLogger("spark-control.audio") # Kokoro default voice. The four curated voices below were Alice-tested for # narration/recap-style content; bm_george is the default. Clients can pass # any of Kokoro's 67 voices in the `voice` field — see /v1/models. DEFAULT_VOICE = "bm_george" # Curated quick-pick voices surfaced at the top of /v1/models. The full list # of 67 voices is fetched live from Kokoro and appended after these. CURATED_VOICES: list[dict] = [ {"id": "bm_george", "name": "George (British male, narrator-style)", "language": "en-GB"}, {"id": "bf_emma", "name": "Emma (British female, audiobook-style)", "language": "en-GB"}, {"id": "am_michael","name": "Michael (American male, warm narrator)", "language": "en-US"}, {"id": "af_heart", "name": "Heart (American female, warm and balanced)", "language": "en-US"}, ] class SpeechRequest(BaseModel): """OpenAI /v1/audio/speech request body. Forwarded to Kokoro mostly-verbatim. Kokoro accepts the OpenAI shape natively, so we only need to substitute the default voice when the client doesn't specify one. """ model: Optional[str] = None # Kokoro tolerates any model id input: str # the text to speak voice: Optional[str] = None # e.g. "bm_george"; default: DEFAULT_VOICE response_format: Optional[str] = "wav" # Kokoro supports wav, mp3, opus, flac speed: Optional[float] = 1.0 def build_router(settings: Settings, deep_health: Any = None) -> APIRouter: """Build the audio proxy router. If `deep_health` is provided, 500s from Parakeet trigger an immediate background probe (which contains the same wedge-detect → auto-restart logic as the 5-minute periodic loop, but fires now instead of waiting). """ router = APIRouter() def _parakeet_base() -> str: return f"http://{settings.parakeet_host}:{settings.parakeet_port}" def _kokoro_base() -> str: return f"http://{settings.kokoro_host}:{settings.kokoro_port}" # ---- /v1/models ---- @router.get("/v1/models") async def list_models() -> dict: """Advertise the STT model + Kokoro voices in OpenAI list shape. Curated voices appear first; the rest of Kokoro's catalog follows. Falls back to just the STT entry + curated voices if Kokoro is offline. """ data: list[dict] = [ { "id": "parakeet-tdt-0.6b-v3", "object": "model", "owned_by": "nvidia", "kind": "stt", }, ] # Curated first — these are the four Alice chose for narration/recap. seen = set() for v in CURATED_VOICES: data.append({ "id": v["id"], "object": "model", "owned_by": "kokoro", "kind": "tts", "display_name": v.get("name"), "language": v.get("language"), "curated": True, }) seen.add(v["id"]) # Append everything else Kokoro advertises (~63 more voices across many # languages). Best-effort — if Kokoro is unreachable, the curated list # alone is still usable. try: async with httpx.AsyncClient(timeout=5.0) as client: r = await client.get(f"{_kokoro_base()}/v1/audio/voices") if r.status_code == 200: body = r.json() for v in body.get("voices", []): vid = v.get("id") if isinstance(v, dict) else v if not vid or vid in seen: continue data.append({ "id": vid, "object": "model", "owned_by": "kokoro", "kind": "tts", }) seen.add(vid) except Exception as e: logger.warning("kokoro voice list unavailable: %s", e) return {"object": "list", "data": data} # ---- /v1/audio/speech (TTS) ---- @router.post("/v1/audio/speech") async def speech(body: SpeechRequest) -> Response: """OpenAI-style TTS. Forwards to Kokoro and returns the audio bytes. Kokoro accepts the OpenAI shape natively. We only substitute the default voice when not specified. Response is whatever format Kokoro produces (WAV by default, mp3/opus/flac if the client asked for one). No retry layer needed — Kokoro is reliable at any input length. """ text = (body.input or "").strip() if not text: raise HTTPException(400, "input text is required") voice = body.voice or DEFAULT_VOICE response_format = body.response_format or "wav" payload = { "model": body.model or "kokoro", "input": text, "voice": voice, "response_format": response_format, } if body.speed is not None: payload["speed"] = body.speed try: async with httpx.AsyncClient(timeout=120.0) as client: r = await client.post( f"{_kokoro_base()}/v1/audio/speech", json=payload ) except httpx.HTTPError as e: raise HTTPException(502, f"kokoro unreachable: {e}") if r.status_code != 200: # Surface Kokoro's error verbatim (bad voice, bad format, etc.). raise HTTPException(r.status_code, r.text[:500]) # Forward Kokoro's content-type so the client knows the format. media_type = r.headers.get("content-type", "audio/wav") return Response(content=r.content, media_type=media_type) # ---- /v1/audio/transcriptions (STT) ---- @router.post("/v1/audio/transcriptions") async def transcriptions( file: UploadFile = File(...), model: Optional[str] = Form(default=None), language: Optional[str] = Form(default=None), prompt: Optional[str] = Form(default=None), response_format: Optional[str] = Form(default="json"), temperature: Optional[float] = Form(default=None), ) -> Response: """Forward to Parakeet's already-OpenAI-compatible endpoint. We relay rather than redirect so clients only need to know one URL (spark-control's) — and so any future client-side rewrites of the request shape (e.g. translating Whisper-format params) happen here. """ body = await file.read() files = {"file": (file.filename or "audio.wav", body, file.content_type or "application/octet-stream")} data: dict[str, str] = {} if model: data["model"] = model if language: data["language"] = language if prompt: data["prompt"] = prompt if response_format: data["response_format"] = response_format if temperature is not None: data["temperature"] = str(temperature) try: async with httpx.AsyncClient(timeout=300.0) as client: r = await client.post( f"{_parakeet_base()}/v1/audio/transcriptions", files=files, data=data, ) except httpx.HTTPError as e: raise HTTPException(502, f"parakeet unreachable: {e}") if r.status_code == 500: # Parakeet 500s are almost always the CUDA wedge (CUBLAS_*_ERROR # mid-attention). Kick deep-health to detect+restart in the # background, and return a clean retry signal to the client. err_snippet = r.text[:400] logger.warning("parakeet 500 — firing deep-health probe in background. detail=%s", err_snippet) if deep_health is not None: try: asyncio.create_task(deep_health.run_one("parakeet")) except Exception as e: logger.error("failed to schedule deep-health probe: %s", e) raise HTTPException( status_code=503, detail="Parakeet returned a transient error (likely CUDA wedge). Auto-restart triggered; retry in ~60s.", headers={"Retry-After": "60"}, ) if r.status_code != 200: raise HTTPException(r.status_code, r.text[:500]) return Response(content=r.content, media_type=r.headers.get("content-type", "application/json")) # ---- /api/audio/diarize-chunk (per-chunk worker for chunked workflows) ---- @router.post("/api/audio/diarize-chunk") async def diarize_chunk(file: UploadFile = File(...)) -> dict: """Per-chunk worker designed for orchestrators that handle chunking + cross-chunk speaker clustering themselves. Given ONE audio chunk, returns diarization segments (with LOCAL speaker labels — Speaker_0/1/... reset per chunk) AND a 192-dim TitaNet voice fingerprint per detected speaker. The caller is expected to: 1. Collect fingerprints from every chunk 2. Run cosine-similarity clustering across all of them (e.g., sklearn AgglomerativeClustering, distance_threshold=0.7) 3. Re-label segments using the resulting global cluster IDs Pair with a SEPARATE call to /v1/audio/transcriptions on the same chunk to get the text. (Kept separate because the caller may want to cache transcription independently of diarization, or run them on different parts of the pipeline.) Response shape: { "duration": 300.0, "segments": [{"start_s", "end_s", "speaker"}, ...], "speakers_detected": ["Speaker_0", "Speaker_1", ...], "fingerprints": {"Speaker_0": [192 floats], "Speaker_1": [...]}, "models": {"diarization": "...", "embedding": "..."} } """ body = await file.read() if not body: raise HTTPException(400, "Empty file") files = {"file": (file.filename or "audio.wav", body, file.content_type or "application/octet-stream")} try: async with httpx.AsyncClient(timeout=600.0) as client: r = await client.post(f"{_parakeet_base()}/v1/audio/diarize-chunk", files=files) except httpx.HTTPError as e: raise HTTPException(502, f"parakeet unreachable: {e}") if r.status_code == 500 and deep_health is not None: # Same CUDA-wedge recovery as the other endpoints try: asyncio.create_task(deep_health.run_one("parakeet")) except Exception: pass raise HTTPException( status_code=503, detail="Parakeet returned a transient error (likely CUDA wedge). Auto-restart triggered; retry in ~60s.", headers={"Retry-After": "60"}, ) if r.status_code != 200: raise HTTPException(r.status_code, r.text[:500]) return r.json() # ---- /api/audio/transcribe-with-speakers (STT + diarization, merged) ---- @router.post("/api/audio/transcribe-with-speakers") async def transcribe_with_speakers( file: UploadFile = File(...), ) -> dict: """Diarized transcription: run Parakeet ASR and Sortformer diarization on the same audio in parallel, then merge by timestamp. Response shape (designed for downstream UIs): { "duration": 90.5, "language": "en", "speakers_detected": ["Speaker_0", "Speaker_1"], "segments": [ {"start_ms": 39308, "end_ms": 51000, "speaker": "Speaker_0", "text": "good morning i think..."}, ... ], "models": { "transcription": "parakeet-tdt-0.6b-v3", "diarization": "nvidia/diar_sortformer_4spk-v1" } } Each segment is a block of consecutive words by the same speaker. Speaker labels are anonymous (Speaker_0, Speaker_1, ...) — name resolution is the caller's responsibility (LLM analysis with optional participant hints, or manual mapping UI). """ body = await file.read() if not body: raise HTTPException(400, "Empty file") filename = file.filename or "audio.wav" content_type = file.content_type or "application/octet-stream" async def _call_transcribe(client: httpx.AsyncClient) -> dict: files = {"file": (filename, body, content_type)} data = {"response_format": "verbose_json"} r = await client.post( f"{_parakeet_base()}/v1/audio/transcriptions", files=files, data=data, ) r.raise_for_status() return r.json() async def _call_diarize(client: httpx.AsyncClient) -> dict: files = {"file": (filename, body, content_type)} r = await client.post( f"{_parakeet_base()}/v1/audio/diarize", files=files, ) r.raise_for_status() return r.json() # Run both in parallel against the same Parakeet container — Sortformer # and Parakeet ASR are independent forward passes that share the GPU. try: async with httpx.AsyncClient(timeout=600.0) as client: stt, diar = await asyncio.gather( _call_transcribe(client), _call_diarize(client), ) except httpx.HTTPStatusError as e: # Surface upstream errors. If transcribe wedged, kick deep-health. if e.response.status_code == 500 and deep_health is not None: try: asyncio.create_task(deep_health.run_one("parakeet")) except Exception: pass raise HTTPException( status_code=503, detail="Parakeet transient error (likely CUDA wedge). Auto-restart triggered; retry in ~60s.", headers={"Retry-After": "60"}, ) raise HTTPException(e.response.status_code, e.response.text[:500]) except httpx.HTTPError as e: raise HTTPException(502, f"parakeet unreachable: {e}") merged = _merge_words_with_speakers( words=stt.get("words", []), diar_turns=diar.get("segments", []), ) return { "duration": stt.get("duration") or diar.get("duration") or 0.0, "language": stt.get("language", "en"), "speakers_detected": diar.get("speakers_detected", []), "segments": merged, "models": { "transcription": stt.get("model") if isinstance(stt.get("model"), str) else "parakeet", "diarization": diar.get("model", "sortformer"), }, } # ---- /api/audio/label-merge (diarize + name clusters from a visual timeline) ---- async def _diar(client, b, fn): r = await client.post(f"{_parakeet_base()}/v1/audio/diarize-chunk", files={"file": (fn, b, "audio/wav")}) r.raise_for_status() return r.json() async def _txn(client, b, fn): r = await client.post(f"{_parakeet_base()}/v1/audio/transcriptions", files={"file": (fn, b, "audio/wav")}, data={"response_format": "verbose_json"}) r.raise_for_status() return r.json() @router.post("/api/audio/label-merge") async def label_merge( file: Optional[UploadFile] = File(default=None), mic_file: Optional[UploadFile] = File(default=None), system_file: Optional[UploadFile] = File(default=None), timeline: str = Form(...), self_name: str = Form(default="Me"), self_vad: Optional[str] = Form(default=None), known_voiceprints: Optional[str] = Form(default=None), transcribe: bool = Form(default=False), min_overlap: float = Form(default=0.0), voiceprint_threshold: float = Form(default=0.5), ) -> dict: """Diarize audio and NAME each anonymous cluster from a caller-supplied visual timeline (who-was-on-screen-when) by majority temporal overlap, with a voice- fingerprint fallback. Stateless + portable — the caller owns the timeline and voiceprint library; nothing is persisted here. TWO MODES: * MONO (legacy): send `file` (mixed mono). Diarizes the mix, names clusters. * DUAL-CHANNEL: send `mic_file` (the local user's mic) + `system_file` (everyone else, from screen capture), sample-aligned to a shared t0. This uses the channels to SPLIT the problem instead of forcing the diarizer to re-disentangle a mono mix: - mic track -> the local user's words, gated to windows where the mic is actually the user speaking (mic louder than system — a self-VAD computed server-side from the two channels, or supplied via `self_vad`). The mic picks up the remote audio as quiet bleed, so this gate is LOAD-BEARING: without it the bleed would be transcribed as the user. - system track -> diarized (only has to separate the *remote* people, a strictly easier problem) and named via the visual timeline + voiceprints. - the user's clean voiceprint is enrolled from the mic track and injected into the voiceprint library, so a system-track cluster that's actually the user dialed in from a second device (dual-login) resolves to the user, not a stranger. Self-attribution becomes near-perfect (dedicated channel), remote diarization gets cleaner, overlapping speech is trivially separated, and the user no longer consumes one of Sortformer's 4 speaker slots. Form fields (multipart): file | (mic_file + system_file) audio — mono mix OR the two channels timeline JSON [{"start","end","name","confidence?"}, ...] (visual hints for remote folks) self_name name for the local user (mic channel). Default "Me". self_vad optional JSON [{"start","end"}] mic-active-and-louder windows; if omitted, computed server-side by per-window RMS. known_voiceprints optional JSON {name: [192 floats]} from past calls (include the user's) transcribe "true" to attach per-segment text (always on in dual-channel) min_overlap min fraction of a cluster's time overlapping the winning name (default 0) voiceprint_threshold cosine similarity to accept a voiceprint match (default 0.5) """ try: tl = json.loads(timeline) assert isinstance(tl, list) except Exception: raise HTTPException(400, "timeline must be a JSON array of {start,end,name}") known_vp: dict[str, list[float]] = {} if known_voiceprints: try: known_vp = json.loads(known_voiceprints) assert isinstance(known_vp, dict) except Exception: raise HTTPException(400, "known_voiceprints must be a JSON object {name: [floats]}") dual = mic_file is not None and system_file is not None if not dual and file is None: raise HTTPException(400, "provide either 'file' (mono) or both 'mic_file' and 'system_file'") try: async with httpx.AsyncClient(timeout=600.0) as client: if dual: return await _label_merge_dual( client, _diar, _txn, await mic_file.read(), await system_file.read(), tl, self_name, self_vad, known_vp, min_overlap, voiceprint_threshold) body = await file.read() if not body: raise HTTPException(400, "Empty file") fn = file.filename or "audio.wav" if transcribe: diar, stt = await asyncio.gather(_diar(client, body, fn), _txn(client, body, fn)) else: diar, stt = await _diar(client, body, fn), None except HTTPException: raise except httpx.HTTPStatusError as e: if e.response.status_code == 500 and deep_health is not None: try: asyncio.create_task(deep_health.run_one("parakeet")) except Exception: pass raise HTTPException(503, "Parakeet transient error (likely CUDA wedge). Retry in ~60s.", headers={"Retry-After": "60"}) raise HTTPException(e.response.status_code, e.response.text[:500]) except httpx.HTTPError as e: raise HTTPException(502, f"parakeet unreachable: {e}") # ---- MONO path ---- diar_segments = diar.get("segments", []) fingerprints = diar.get("fingerprints", {}) or {} clusters = diar.get("speakers_detected", []) assignment = _name_clusters(diar_segments, fingerprints, clusters, tl, known_vp, min_overlap, voiceprint_threshold) relabeled_turns = [ {"start_s": s.get("start_s"), "end_s": s.get("end_s"), "speaker": assignment[s.get("speaker")]["name"]} for s in diar_segments if s.get("speaker") in assignment ] if transcribe and stt is not None: out_segments = _merge_words_with_speakers(stt.get("words", []), relabeled_turns) else: out_segments = [{ "start_s": s.get("start_s"), "end_s": s.get("end_s"), "speaker": assignment.get(s.get("speaker"), {}).get("name", s.get("speaker")), "confidence": s.get("confidence"), } for s in diar_segments] speakers, named_fingerprints = _speaker_list(clusters, assignment, fingerprints) return { "mode": "mono", "duration": diar.get("duration", 0.0), "speakers": speakers, "segments": out_segments, "fingerprints": named_fingerprints, "models": diar.get("models", {}), } return router # ---- Label-merge helpers ---- def _overlap_seconds(a0: float, a1: float, b0: float, b1: float) -> float: return max(0.0, min(a1, b1) - max(a0, b0)) def _cosine(a: Optional[list], b: Optional[list]) -> float: if not a or not b or len(a) != len(b): return 0.0 dot = sum(x * y for x, y in zip(a, b)) na = sum(x * x for x in a) ** 0.5 nb = sum(x * x for x in b) ** 0.5 if na == 0 or nb == 0: return 0.0 return dot / (na * nb) def _name_clusters(diar_segments, fingerprints, clusters, tl, known_vp, min_overlap, voiceprint_threshold): """Assign a name to each anonymous diarization cluster: visual-timeline overlap winner -> closest known-voiceprint match -> Unknown_N. Shared by mono + dual.""" cluster_dur: dict[str, float] = {} cluster_name_overlap: dict[str, dict[str, float]] = {} for seg in diar_segments: spk = seg.get("speaker") s0, s1 = float(seg.get("start_s", 0)), float(seg.get("end_s", 0)) cluster_dur[spk] = cluster_dur.get(spk, 0.0) + max(0.0, s1 - s0) for entry in tl: name = (entry.get("name") or "").strip() if not name: continue ov = _overlap_seconds(s0, s1, float(entry.get("start", 0)), float(entry.get("end", 0))) if ov > 0: cluster_name_overlap.setdefault(spk, {}) cluster_name_overlap[spk][name] = cluster_name_overlap[spk].get(name, 0.0) + ov assignment: dict[str, dict] = {} used_unknown = 0 for cluster in clusters: names = cluster_name_overlap.get(cluster, {}) total = cluster_dur.get(cluster, 0.0) or 1.0 if names: winner = max(names.items(), key=lambda kv: kv[1]) conf = winner[1] / total if conf >= min_overlap: assignment[cluster] = {"name": winner[0], "source": "visual", "overlap_confidence": round(conf, 4)} continue fp = fingerprints.get(cluster) best_name, best_sim = None, 0.0 if fp and known_vp: for nm, vec in known_vp.items(): sim = _cosine(fp, vec) if sim > best_sim: best_name, best_sim = nm, sim if best_name and best_sim >= voiceprint_threshold: assignment[cluster] = {"name": best_name, "source": "voiceprint", "match_similarity": round(best_sim, 4)} else: assignment[cluster] = {"name": f"Unknown_{used_unknown}", "source": "unmatched"} used_unknown += 1 return assignment def _speaker_list(clusters, assignment, fingerprints): """Build the response `speakers` list + name->fingerprint map from an assignment.""" speakers, named = [], {} for cluster in clusters: a = assignment[cluster] entry = {"cluster": cluster, "name": a["name"], "source": a["source"], "fingerprint": fingerprints.get(cluster)} if "overlap_confidence" in a: entry["overlap_confidence"] = a["overlap_confidence"] if "match_similarity" in a: entry["match_similarity"] = a["match_similarity"] speakers.append(entry) if fingerprints.get(cluster) is not None: named[a["name"]] = fingerprints.get(cluster) return speakers, named def _wav_pcm(b: bytes): """Decode a 16-bit mono/stereo WAV to (int16 array, sample_rate). Returns (None, 0) if it can't decode (caller then requires a client-supplied self_vad).""" try: with wave.open(io.BytesIO(b), "rb") as w: sr, n, ch, sw = w.getframerate(), w.getnframes(), w.getnchannels(), w.getsampwidth() raw = w.readframes(n) if sw != 2: return None, 0 a = array("h") a.frombytes(raw) if ch > 1: a = a[0::ch] # take channel 0 return a, sr except Exception: return None, 0 def _win_rms(pcm_sr, s: float, e: float) -> float: """Normalized RMS (0..1) of the [s,e]-second window of a decoded PCM array.""" a, sr = pcm_sr if a is None or sr <= 0: return 0.0 i, j = max(0, int(s * sr)), min(len(a), int(e * sr)) if j <= i: return 0.0 ss = 0 for x in a[i:j]: ss += x * x return (ss / (j - i)) ** 0.5 / 32768.0 async def _label_merge_dual(client, diar_fn, txn_fn, mic_b, sys_b, tl, self_name, self_vad_json, known_vp, min_overlap, voiceprint_threshold): """Dual-channel label-merge: mic track = the local user (gated to mic-dominant windows so remote bleed isn't transcribed as the user); system track = diarized + named remote speakers. See label_merge docstring for the full rationale.""" if not mic_b or not sys_b: raise HTTPException(400, "empty mic_file or system_file") # System: diarize + transcribe (parallel). Mic: transcribe + diarize (parallel) — # the mic diarization yields the user's clean enrollment voiceprint. sys_diar, sys_stt, mic_stt, mic_diar = await asyncio.gather( diar_fn(client, sys_b, "system.wav"), txn_fn(client, sys_b, "system.wav"), txn_fn(client, mic_b, "mic.wav"), diar_fn(client, mic_b, "mic.wav")) # Enroll the user's voiceprint = fingerprint of the dominant cluster on the mic track. self_vp = None mic_fps = mic_diar.get("fingerprints", {}) or {} if mic_fps: durs: dict[str, float] = {} for s in mic_diar.get("segments", []): durs[s["speaker"]] = durs.get(s["speaker"], 0.0) + (s["end_s"] - s["start_s"]) top = max(durs, key=durs.get) if durs else next(iter(mic_fps)) self_vp = mic_fps.get(top) # Inject self voiceprint so a dual-login (phone) system cluster resolves to the user. vp_lib = dict(known_vp) if self_vp is not None: vp_lib.setdefault(self_name, self_vp) # Name the SYSTEM clusters (remote people, possibly incl. phone-self via voiceprint). sys_segments = sys_diar.get("segments", []) sys_fps = sys_diar.get("fingerprints", {}) or {} sys_clusters = sys_diar.get("speakers_detected", []) sys_assign = _name_clusters(sys_segments, sys_fps, sys_clusters, tl, vp_lib, min_overlap, voiceprint_threshold) sys_turns = [{"start_s": s["start_s"], "end_s": s["end_s"], "speaker": sys_assign[s["speaker"]]["name"]} for s in sys_segments if s["speaker"] in sys_assign] remote_blocks = _merge_words_with_speakers(sys_stt.get("words", []), sys_turns) # Self-VAD: keep only mic words where the mic is genuinely the local user (mic # louder than system), excluding the remote bleed the mic also picks up. vad_windows = None if self_vad_json: try: vad_windows = json.loads(self_vad_json) assert isinstance(vad_windows, list) except Exception: vad_windows = None mic_pcm = _wav_pcm(mic_b) sys_pcm = _wav_pcm(sys_b) if vad_windows is None and mic_pcm[0] is None: raise HTTPException(400, "could not decode WAV for self-VAD; send 16-bit mono WAV or a self_vad array") # Margin so the mic must be CLEARLY louder than system to count as local — guards # against brief remote bleed near utterance boundaries (real local speech runs many # times louder than the bleed; real remote runs many times quieter). _LOCAL_MARGIN = 1.2 def _is_local(s: float, e: float) -> bool: if vad_windows is not None: return any(_overlap_seconds(s, e, float(w.get("start", 0)), float(w.get("end", 0))) > 0 for w in vad_windows) return _win_rms(mic_pcm, s, e) > _win_rms(sys_pcm, s, e) * _LOCAL_MARGIN # Keep mic words where the mic is clearly the dominant channel (margin excludes the # remote bleed the mic also picks up), THEN group the surviving local words into # blocks. Filtering before grouping means a block never mixes local speech with loud # bleed (which would average to system-dominant and drop the whole utterance). local_words = [w for w in mic_stt.get("words", []) if _is_local(float(w.get("start", 0)), float(w.get("end", 0)))] local_blocks = (_merge_words_with_speakers( local_words, [{"start_s": 0.0, "end_s": 1e12, "speaker": self_name}]) if local_words else []) segments = sorted(remote_blocks + local_blocks, key=lambda b: b.get("start_ms", 0)) speakers, named = _speaker_list(sys_clusters, sys_assign, sys_fps) speakers.append({"cluster": "mic", "name": self_name, "source": "mic_channel", "fingerprint": self_vp}) if self_vp is not None: named[self_name] = self_vp return { "mode": "dual_channel", "duration": max(sys_diar.get("duration", 0.0), mic_stt.get("duration", 0.0)), "speakers": speakers, "segments": segments, "fingerprints": named, "models": sys_diar.get("models", {}), } # ---- Merge helper: assign speaker to each word, then group into blocks ---- def _assign_speaker_to_word(word_start_s: float, word_end_s: float, diar_turns: list[dict]) -> str: """Find the diarization turn that contains this word, or has the most overlap with it. Returns the speaker label, or 'Speaker_unknown' if no turn overlaps at all.""" word_mid = (word_start_s + word_end_s) / 2.0 # Fast path: find the turn containing the midpoint for t in diar_turns: if t["start_s"] <= word_mid <= t["end_s"]: return t["speaker"] # Slow path: pick the turn with max overlap with the word's span best_speaker = "Speaker_unknown" best_overlap = 0.0 for t in diar_turns: overlap = max(0.0, min(word_end_s, t["end_s"]) - max(word_start_s, t["start_s"])) if overlap > best_overlap: best_overlap = overlap best_speaker = t["speaker"] return best_speaker def _merge_words_with_speakers(words: list[dict], diar_turns: list[dict]) -> list[dict]: """Group consecutive same-speaker words into blocks. Each input word: {"start": float_s, "end": float_s, "text": str} (Parakeet verbose_json format; values are seconds). Each input turn: {"start_s": float, "end_s": float, "speaker": str}. Output: [{"start_ms": int, "end_ms": int, "speaker": str, "text": str}, ...] Also breaks a block on a long silence gap (>1.5 s) even within the same speaker — keeps blocks readable in UI rendering. """ if not words: return [] SILENCE_BREAK_S = 1.5 def _join_words(parts: list[str]) -> str: """Join word tokens with proper spacing. Different STT outputs vary — some include leading spaces in the word text (' morning'), some don't ('morning'). Normalize by stripping each token then joining with one space; collapse multiple spaces. Keeps punctuation tight (no space before period/comma/etc.).""" cleaned = [p.strip() for p in parts if p and p.strip()] if not cleaned: return "" out = cleaned[0] for token in cleaned[1:]: # No leading space before pure-punctuation tokens if token and token[0] in ".,;:!?)]}'\"": out += token else: out += " " + token return out blocks: list[dict] = [] cur_words: list[str] = [] cur_speaker: Optional[str] = None cur_start_s: Optional[float] = None cur_end_s: Optional[float] = None for w in words: ws = float(w.get("start", 0.0)) we = float(w.get("end", ws)) wt = str(w.get("text", "")) spk = _assign_speaker_to_word(ws, we, diar_turns) is_new_block = ( cur_speaker is None or spk != cur_speaker or (cur_end_s is not None and ws - cur_end_s > SILENCE_BREAK_S) ) if is_new_block: if cur_speaker is not None: blocks.append({ "start_ms": int(cur_start_s * 1000), "end_ms": int(cur_end_s * 1000), "speaker": cur_speaker, "text": _join_words(cur_words), }) cur_words = [wt] cur_speaker = spk cur_start_s = ws cur_end_s = we else: cur_words.append(wt) cur_end_s = we if cur_speaker is not None and cur_words: blocks.append({ "start_ms": int(cur_start_s * 1000), "end_ms": int(cur_end_s * 1000), "speaker": cur_speaker, "text": _join_words(cur_words), }) return blocks