v0.13.0:4 - redaction gateway, embeddings proxy, expanded audio API

- Add redaction gateway (redaction_gateway.py, redaction/ scrub + tests)
- Add embeddings proxy and spark_embed service (Dockerfile + main.py)
- Expand audio_proxy with speaker-aware handling; deep_health/health/server updates
- Package: configureSparks action + sparkConfig model updates, manifest/main wiring
- Docs: AUDIO_API, EMBEDDINGS, REDACTION_GATEWAY; HANDOFF and runbook/known-issues refresh
This commit is contained in:
Keysat
2026-06-11 17:45:21 -05:00
parent 4a75274db3
commit 8d839e3714
37 changed files with 3763 additions and 197 deletions
+444 -77
View File
@@ -1,10 +1,12 @@
"""OpenAI-compatible audio proxy: lets any OpenAI-shaped client (Open WebUI,
Home Assistant, etc.) talk to Parakeet (STT) and Magpie (TTS) through one URL.
Home Assistant, etc.) talk to Parakeet (STT) and Kokoro (TTS) through one URL.
Endpoints exposed on spark-control's port (same as the dashboard):
GET /v1/models — lists STT model + Magpie voices in OpenAI shape
POST /v1/audio/speech — OpenAI TTS → Magpie /v1/audio/synthesize
POST /v1/audio/transcriptions — forward to Parakeet (already OpenAI-compatible)
GET /v1/models — lists STT model + Kokoro voices in OpenAI shape
POST /v1/audio/speech — OpenAI TTS → Kokoro /v1/audio/speech
POST /v1/audio/transcriptions — forward to Parakeet (already OpenAI-compatible)
POST /api/audio/diarize-chunk — per-chunk diarization (Parakeet container, Sortformer+TitaNet)
POST /api/audio/transcribe-with-speakers — ASR + diarization merged
Both downstream services already speak HTTP on the LAN; this module just adapts
request/response shapes so OpenAI clients don't need a custom integration.
@@ -13,10 +15,20 @@ When Parakeet returns a 500 (commonly the recurring CUDA wedge), the proxy
returns a clearer 503 with Retry-After=60, and fires the deep-health probe in
the background — which detects the wedge and triggers a rate-limited container
restart inside seconds. The client's next attempt ~60s later then succeeds.
TTS is intentionally simple: forward the request body to Kokoro and stream the
response back. Kokoro-82M is reliable enough (24/24 successful renders across
the same input lengths that broke Magpie 13/24 times) that no retry, chunking,
or duration-validation layer is needed. This used to be a ~150-line tangle
under v0.13.0:6's Magpie-with-chunking workaround; it's now a single forward.
"""
from __future__ import annotations
import asyncio
import io
import json
import logging
import wave
from array import array
from typing import Any, Optional
import httpx
@@ -28,38 +40,33 @@ from .config import Settings
logger = logging.getLogger("spark-control.audio")
# Magpie voice name encodes its language. Example:
# Magpie-Multilingual.EN-US.Mia -> en-US
# Magpie-Multilingual.ES-US.Diego -> es-US
# Magpie-Multilingual.FR-FR.Pascal -> fr-FR
def _lang_from_voice(voice: str) -> str:
try:
parts = voice.split(".")
# parts = ["Magpie-Multilingual", "EN-US", "Mia"] (or with emotion suffix)
if len(parts) >= 2 and "-" in parts[1]:
lang_part = parts[1] # "EN-US"
primary, region = lang_part.split("-", 1)
return f"{primary.lower()}-{region.upper()}"
except Exception:
pass
return "en-US"
# Kokoro default voice. The four curated voices below were Alice-tested for
# narration/recap-style content; bm_george is the default. Clients can pass
# any of Kokoro's 67 voices in the `voice` field — see /v1/models.
DEFAULT_VOICE = "bm_george"
# Default voice: configurable, falls back to a sensible English voice if unset.
DEFAULT_VOICE = "Magpie-Multilingual.EN-US.Mia"
# Curated quick-pick voices surfaced at the top of /v1/models. The full list
# of 67 voices is fetched live from Kokoro and appended after these.
CURATED_VOICES: list[dict] = [
{"id": "bm_george", "name": "George (British male, narrator-style)", "language": "en-GB"},
{"id": "bf_emma", "name": "Emma (British female, audiobook-style)", "language": "en-GB"},
{"id": "am_michael","name": "Michael (American male, warm narrator)", "language": "en-US"},
{"id": "af_heart", "name": "Heart (American female, warm and balanced)", "language": "en-US"},
]
class SpeechRequest(BaseModel):
"""OpenAI /v1/audio/speech request body."""
model: Optional[str] = None # ignored — Magpie has one model
input: str # the text to speak
voice: Optional[str] = None # e.g. "Magpie-Multilingual.EN-US.Mia"
response_format: Optional[str] = "wav" # only "wav" supported today
speed: Optional[float] = 1.0 # ignored by Magpie
# Magpie-specific extensions (clients may pass these through)
language: Optional[str] = None
sample_rate_hz: Optional[int] = 22050
encoding: Optional[str] = "LINEAR_PCM"
"""OpenAI /v1/audio/speech request body. Forwarded to Kokoro mostly-verbatim.
Kokoro accepts the OpenAI shape natively, so we only need to substitute the
default voice when the client doesn't specify one.
"""
model: Optional[str] = None # Kokoro tolerates any model id
input: str # the text to speak
voice: Optional[str] = None # e.g. "bm_george"; default: DEFAULT_VOICE
response_format: Optional[str] = "wav" # Kokoro supports wav, mp3, opus, flac
speed: Optional[float] = 1.0
def build_router(settings: Settings, deep_health: Any = None) -> APIRouter:
@@ -74,15 +81,17 @@ def build_router(settings: Settings, deep_health: Any = None) -> APIRouter:
def _parakeet_base() -> str:
return f"http://{settings.parakeet_host}:{settings.parakeet_port}"
def _magpie_base() -> str:
return f"http://{settings.magpie_host}:{settings.magpie_port}"
def _kokoro_base() -> str:
return f"http://{settings.kokoro_host}:{settings.kokoro_port}"
# ---- /v1/models ----
@router.get("/v1/models")
async def list_models() -> dict:
"""Advertise the STT model + a small voice menu so clients can
populate their voice-picker UIs. Falls back gracefully if Magpie
is offline (returns just the STT entry)."""
"""Advertise the STT model + Kokoro voices in OpenAI list shape.
Curated voices appear first; the rest of Kokoro's catalog follows.
Falls back to just the STT entry + curated voices if Kokoro is offline.
"""
data: list[dict] = [
{
"id": "parakeet-tdt-0.6b-v3",
@@ -91,66 +100,82 @@ def build_router(settings: Settings, deep_health: Any = None) -> APIRouter:
"kind": "stt",
},
]
# Try to enumerate voices from Magpie; if unreachable, just skip.
# Curated first — these are the four Alice chose for narration/recap.
seen = set()
for v in CURATED_VOICES:
data.append({
"id": v["id"],
"object": "model",
"owned_by": "kokoro",
"kind": "tts",
"display_name": v.get("name"),
"language": v.get("language"),
"curated": True,
})
seen.add(v["id"])
# Append everything else Kokoro advertises (~63 more voices across many
# languages). Best-effort — if Kokoro is unreachable, the curated list
# alone is still usable.
try:
async with httpx.AsyncClient(timeout=5.0) as client:
r = await client.get(f"{_magpie_base()}/v1/audio/list_voices")
r = await client.get(f"{_kokoro_base()}/v1/audio/voices")
if r.status_code == 200:
voices_by_locales = r.json()
seen = set()
for _locales, payload in voices_by_locales.items():
for v in payload.get("voices", []):
# Collapse emotion variants — expose only the base voice name.
# "Magpie-Multilingual.EN-US.Mia.Angry" -> "Magpie-Multilingual.EN-US.Mia"
parts = v.split(".")
base = ".".join(parts[:3]) if len(parts) >= 3 else v
if base not in seen:
seen.add(base)
data.append({
"id": base,
"object": "model",
"owned_by": "nvidia",
"kind": "tts",
})
body = r.json()
for v in body.get("voices", []):
vid = v.get("id") if isinstance(v, dict) else v
if not vid or vid in seen:
continue
data.append({
"id": vid,
"object": "model",
"owned_by": "kokoro",
"kind": "tts",
})
seen.add(vid)
except Exception as e:
logger.warning("magpie voice list unavailable: %s", e)
logger.warning("kokoro voice list unavailable: %s", e)
return {"object": "list", "data": data}
# ---- /v1/audio/speech (TTS) ----
@router.post("/v1/audio/speech")
async def speech(body: SpeechRequest) -> Response:
"""OpenAI-style TTS. Translates to Magpie's multipart synth call.
"""OpenAI-style TTS. Forwards to Kokoro and returns the audio bytes.
Returns raw WAV bytes (Content-Type: audio/wav) — browsers and most
clients play these directly.
Kokoro accepts the OpenAI shape natively. We only substitute the
default voice when not specified. Response is whatever format Kokoro
produces (WAV by default, mp3/opus/flac if the client asked for one).
No retry layer needed — Kokoro is reliable at any input length.
"""
text = (body.input or "").strip()
if not text:
raise HTTPException(400, "input text is required")
voice = body.voice or DEFAULT_VOICE
language = body.language or _lang_from_voice(voice)
sample_rate = int(body.sample_rate_hz or 22050)
encoding = body.encoding or "LINEAR_PCM"
form = {
"text": text,
"language": language,
response_format = body.response_format or "wav"
payload = {
"model": body.model or "kokoro",
"input": text,
"voice": voice,
"sample_rate_hz": str(sample_rate),
"encoding": encoding,
"response_format": response_format,
}
if body.speed is not None:
payload["speed"] = body.speed
try:
async with httpx.AsyncClient(timeout=120.0) as client:
r = await client.post(f"{_magpie_base()}/v1/audio/synthesize", data=form)
r = await client.post(
f"{_kokoro_base()}/v1/audio/speech", json=payload
)
except httpx.HTTPError as e:
raise HTTPException(502, f"magpie unreachable: {e}")
raise HTTPException(502, f"kokoro unreachable: {e}")
if r.status_code != 200:
# Surface Magpie's error message verbatim so clients can debug voice/lang typos.
# Surface Kokoro's error verbatim (bad voice, bad format, etc.).
raise HTTPException(r.status_code, r.text[:500])
# Magpie returns WAV bytes already (Content-Type: audio/wav). Pass through.
# Forward Kokoro's content-type so the client knows the format.
media_type = r.headers.get("content-type", "audio/wav")
return Response(content=r.content, media_type=media_type)
@@ -209,11 +234,11 @@ def build_router(settings: Settings, deep_health: Any = None) -> APIRouter:
raise HTTPException(r.status_code, r.text[:500])
return Response(content=r.content, media_type=r.headers.get("content-type", "application/json"))
# ---- /api/audio/diarize-chunk (per-chunk worker for Recap Relay) ----
# ---- /api/audio/diarize-chunk (per-chunk worker for chunked workflows) ----
@router.post("/api/audio/diarize-chunk")
async def diarize_chunk(file: UploadFile = File(...)) -> dict:
"""Per-chunk worker designed for orchestrators (Recap Relay) that
handle chunking + cross-chunk speaker clustering themselves.
"""Per-chunk worker designed for orchestrators that handle chunking +
cross-chunk speaker clustering themselves.
Given ONE audio chunk, returns diarization segments (with LOCAL
speaker labels — Speaker_0/1/... reset per chunk) AND a 192-dim
@@ -271,7 +296,7 @@ def build_router(settings: Settings, deep_health: Any = None) -> APIRouter:
"""Diarized transcription: run Parakeet ASR and Sortformer diarization on
the same audio in parallel, then merge by timestamp.
Response shape (designed for downstream UIs like recap-relay):
Response shape (designed for downstream UIs):
{
"duration": 90.5,
@@ -299,8 +324,6 @@ def build_router(settings: Settings, deep_health: Any = None) -> APIRouter:
filename = file.filename or "audio.wav"
content_type = file.content_type or "application/octet-stream"
# Parakeet ASR + Sortformer diarizer in parallel. (A WhisperX detour
# lived here briefly — reverted in v0.13.0:0; see release notes.)
async def _call_transcribe(client: httpx.AsyncClient) -> dict:
files = {"file": (filename, body, content_type)}
data = {"response_format": "verbose_json"}
@@ -359,9 +382,353 @@ def build_router(settings: Settings, deep_health: Any = None) -> APIRouter:
},
}
# ---- /api/audio/label-merge (diarize + name clusters from a visual timeline) ----
async def _diar(client, b, fn):
    """Send one audio payload to Parakeet's diarize-chunk endpoint; return its JSON.

    ``client`` is the caller's HTTP client, ``b`` the raw audio bytes and ``fn``
    the filename to report in the multipart upload. The part is always declared
    as ``audio/wav``. An error status is raised via ``raise_for_status()`` so the
    caller decides how to map it.
    """
    endpoint = f"{_parakeet_base()}/v1/audio/diarize-chunk"
    upload = {"file": (fn, b, "audio/wav")}
    response = await client.post(endpoint, files=upload)
    response.raise_for_status()
    return response.json()
async def _txn(client, b, fn):
    """Send one audio payload to Parakeet's transcription endpoint; return its JSON.

    Requests ``verbose_json`` output. ``client`` is the caller's HTTP client,
    ``b`` the raw audio bytes and ``fn`` the filename to report in the multipart
    upload (always declared as ``audio/wav``). An error status is raised via
    ``raise_for_status()``.
    """
    endpoint = f"{_parakeet_base()}/v1/audio/transcriptions"
    response = await client.post(
        endpoint,
        files={"file": (fn, b, "audio/wav")},
        data={"response_format": "verbose_json"},
    )
    response.raise_for_status()
    return response.json()
@router.post("/api/audio/label-merge")
async def label_merge(
    file: Optional[UploadFile] = File(default=None),
    mic_file: Optional[UploadFile] = File(default=None),
    system_file: Optional[UploadFile] = File(default=None),
    timeline: str = Form(...),
    self_name: str = Form(default="Me"),
    self_vad: Optional[str] = Form(default=None),
    known_voiceprints: Optional[str] = Form(default=None),
    transcribe: bool = Form(default=False),
    min_overlap: float = Form(default=0.0),
    voiceprint_threshold: float = Form(default=0.5),
) -> dict:
    """Diarize audio and NAME each anonymous cluster from a caller-supplied visual
    timeline (who-was-on-screen-when) by majority temporal overlap, with a voice-
    fingerprint fallback. Stateless + portable — the caller owns the timeline and
    voiceprint library; nothing is persisted here.

    TWO MODES:
      * MONO (legacy): send `file` (mixed mono). Diarizes the mix, names clusters.
      * DUAL-CHANNEL: send `mic_file` (the local user's mic) + `system_file`
        (everyone else, from screen capture), sample-aligned to a shared t0. This
        uses the channels to SPLIT the problem instead of forcing the diarizer to
        re-disentangle a mono mix:
          - mic track -> the local user's words, gated to windows where the mic is
            actually the user speaking (mic louder than system — a self-VAD computed
            server-side from the two channels, or supplied via `self_vad`). The mic
            picks up the remote audio as quiet bleed, so this gate is LOAD-BEARING:
            without it the bleed would be transcribed as the user.
          - system track -> diarized (only has to separate the *remote* people, a
            strictly easier problem) and named via the visual timeline + voiceprints.
          - the user's clean voiceprint is enrolled from the mic track and injected
            into the voiceprint library, so a system-track cluster that's actually the
            user dialed in from a second device (dual-login) resolves to the user, not
            a stranger.
        Self-attribution becomes near-perfect (dedicated channel), remote diarization
        gets cleaner, overlapping speech is trivially separated, and the user no longer
        consumes one of Sortformer's 4 speaker slots.

    Form fields (multipart):
      file | (mic_file + system_file)   audio — mono mix OR the two channels
      timeline             JSON [{"start","end","name","confidence?"}, ...] (visual hints for remote folks)
      self_name            name for the local user (mic channel). Default "Me".
      self_vad             optional JSON [{"start","end"}] mic-active-and-louder windows;
                           if omitted, computed server-side by per-window RMS.
      known_voiceprints    optional JSON {name: [192 floats]} from past calls (include the user's)
      transcribe           "true" to attach per-segment text (always on in dual-channel)
      min_overlap          min fraction of a cluster's time overlapping the winning name (default 0)
      voiceprint_threshold cosine similarity to accept a voiceprint match (default 0.5)

    Returns:
      Mono: {"mode": "mono", "duration", "speakers", "segments", "fingerprints",
      "models"}. Dual-channel: whatever `_label_merge_dual` returns.

    Raises:
      HTTPException 400 — malformed `timeline` / `known_voiceprints`, missing or
                          empty audio.
      HTTPException 503 — Parakeet answered 500 (Retry-After: 60); a deep-health
                          probe is scheduled in the background when available.
      HTTPException 502 — Parakeet unreachable.
      HTTPException <upstream status> — any other Parakeet error, body truncated.
    """
    # Validate with explicit type checks rather than `assert`: asserts are removed
    # under `python -O`, which would let a non-list timeline (or a non-dict
    # voiceprint map) through and crash further down with an opaque 500.
    try:
        tl = json.loads(timeline)
    except (TypeError, ValueError, RecursionError):
        tl = None
    if not isinstance(tl, list):
        raise HTTPException(400, "timeline must be a JSON array of {start,end,name}")

    known_vp: dict[str, list[float]] = {}
    if known_voiceprints:
        try:
            parsed_vp = json.loads(known_voiceprints)
        except (TypeError, ValueError, RecursionError):
            parsed_vp = None
        if not isinstance(parsed_vp, dict):
            raise HTTPException(400, "known_voiceprints must be a JSON object {name: [floats]}")
        known_vp = parsed_vp

    # Dual-channel mode needs BOTH tracks; anything else falls back to mono `file`.
    dual = mic_file is not None and system_file is not None
    if not dual and file is None:
        raise HTTPException(400, "provide either 'file' (mono) or both 'mic_file' and 'system_file'")

    try:
        async with httpx.AsyncClient(timeout=600.0) as client:
            if dual:
                return await _label_merge_dual(
                    client, _diar, _txn, await mic_file.read(), await system_file.read(),
                    tl, self_name, self_vad, known_vp, min_overlap, voiceprint_threshold)
            body = await file.read()
            if not body:
                raise HTTPException(400, "Empty file")
            fn = file.filename or "audio.wav"
            if transcribe:
                # Diarization and ASR are independent; run them concurrently.
                diar, stt = await asyncio.gather(_diar(client, body, fn), _txn(client, body, fn))
            else:
                diar, stt = await _diar(client, body, fn), None
    except HTTPException:
        # Our own 4xx responses must not be re-wrapped by the handlers below.
        raise
    except httpx.HTTPStatusError as e:
        if e.response.status_code == 500 and deep_health is not None:
            # Best-effort: kick the deep-health probe so a wedged Parakeet gets
            # noticed; failing to schedule it must never mask the 503 below.
            try:
                asyncio.create_task(deep_health.run_one("parakeet"))
            except Exception:
                pass
            raise HTTPException(503, "Parakeet transient error (likely CUDA wedge). Retry in ~60s.",
                                headers={"Retry-After": "60"})
        raise HTTPException(e.response.status_code, e.response.text[:500])
    except httpx.HTTPError as e:
        raise HTTPException(502, f"parakeet unreachable: {e}")

    # ---- MONO path ----
    diar_segments = diar.get("segments", [])
    fingerprints = diar.get("fingerprints", {}) or {}
    clusters = diar.get("speakers_detected", [])

    assignment = _name_clusters(diar_segments, fingerprints, clusters, tl, known_vp,
                                min_overlap, voiceprint_threshold)
    # Turns whose cluster received no assignment are dropped from the relabeled list.
    relabeled_turns = [
        {"start_s": s.get("start_s"), "end_s": s.get("end_s"),
         "speaker": assignment[s.get("speaker")]["name"]}
        for s in diar_segments if s.get("speaker") in assignment
    ]
    if transcribe and stt is not None:
        out_segments = _merge_words_with_speakers(stt.get("words", []), relabeled_turns)
    else:
        # No transcript requested: return the diarization turns themselves,
        # keeping the raw cluster label when a cluster has no assigned name.
        out_segments = [{
            "start_s": s.get("start_s"), "end_s": s.get("end_s"),
            "speaker": assignment.get(s.get("speaker"), {}).get("name", s.get("speaker")),
            "confidence": s.get("confidence"),
        } for s in diar_segments]

    speakers, named_fingerprints = _speaker_list(clusters, assignment, fingerprints)
    return {
        "mode": "mono",
        "duration": diar.get("duration", 0.0),
        "speakers": speakers,
        "segments": out_segments,
        "fingerprints": named_fingerprints,
        "models": diar.get("models", {}),
    }
return router
# ---- Label-merge helpers ----
def _overlap_seconds(a0: float, a1: float, b0: float, b1: float) -> float:
return max(0.0, min(a1, b1) - max(a0, b0))
def _cosine(a: Optional[list], b: Optional[list]) -> float:
if not a or not b or len(a) != len(b):
return 0.0
dot = sum(x * y for x, y in zip(a, b))
na = sum(x * x for x in a) ** 0.5
nb = sum(x * x for x in b) ** 0.5
if na == 0 or nb == 0:
return 0.0
return dot / (na * nb)
def _name_clusters(diar_segments, fingerprints, clusters, tl, known_vp,
min_overlap, voiceprint_threshold):
"""Assign a name to each anonymous diarization cluster: visual-timeline overlap
winner -> closest known-voiceprint match -> Unknown_N. Shared by mono + dual."""
cluster_dur: dict[str, float] = {}
cluster_name_overlap: dict[str, dict[str, float]] = {}
for seg in diar_segments:
spk = seg.get("speaker")
s0, s1 = float(seg.get("start_s", 0)), float(seg.get("end_s", 0))
cluster_dur[spk] = cluster_dur.get(spk, 0.0) + max(0.0, s1 - s0)
for entry in tl:
name = (entry.get("name") or "").strip()
if not name:
continue
ov = _overlap_seconds(s0, s1, float(entry.get("start", 0)), float(entry.get("end", 0)))
if ov > 0:
cluster_name_overlap.setdefault(spk, {})
cluster_name_overlap[spk][name] = cluster_name_overlap[spk].get(name, 0.0) + ov
assignment: dict[str, dict] = {}
used_unknown = 0
for cluster in clusters:
names = cluster_name_overlap.get(cluster, {})
total = cluster_dur.get(cluster, 0.0) or 1.0
if names:
winner = max(names.items(), key=lambda kv: kv[1])
conf = winner[1] / total
if conf >= min_overlap:
assignment[cluster] = {"name": winner[0], "source": "visual",
"overlap_confidence": round(conf, 4)}
continue
fp = fingerprints.get(cluster)
best_name, best_sim = None, 0.0
if fp and known_vp:
for nm, vec in known_vp.items():
sim = _cosine(fp, vec)
if sim > best_sim:
best_name, best_sim = nm, sim
if best_name and best_sim >= voiceprint_threshold:
assignment[cluster] = {"name": best_name, "source": "voiceprint",
"match_similarity": round(best_sim, 4)}
else:
assignment[cluster] = {"name": f"Unknown_{used_unknown}", "source": "unmatched"}
used_unknown += 1
return assignment
def _speaker_list(clusters, assignment, fingerprints):
"""Build the response `speakers` list + name->fingerprint map from an assignment."""
speakers, named = [], {}
for cluster in clusters:
a = assignment[cluster]
entry = {"cluster": cluster, "name": a["name"], "source": a["source"],
"fingerprint": fingerprints.get(cluster)}
if "overlap_confidence" in a:
entry["overlap_confidence"] = a["overlap_confidence"]
if "match_similarity" in a:
entry["match_similarity"] = a["match_similarity"]
speakers.append(entry)
if fingerprints.get(cluster) is not None:
named[a["name"]] = fingerprints.get(cluster)
return speakers, named
def _wav_pcm(b: bytes):
"""Decode a 16-bit mono/stereo WAV to (int16 array, sample_rate). Returns
(None, 0) if it can't decode (caller then requires a client-supplied self_vad)."""
try:
with wave.open(io.BytesIO(b), "rb") as w:
sr, n, ch, sw = w.getframerate(), w.getnframes(), w.getnchannels(), w.getsampwidth()
raw = w.readframes(n)
if sw != 2:
return None, 0
a = array("h")
a.frombytes(raw)
if ch > 1:
a = a[0::ch] # take channel 0
return a, sr
except Exception:
return None, 0
def _win_rms(pcm_sr, s: float, e: float) -> float:
"""Normalized RMS (0..1) of the [s,e]-second window of a decoded PCM array."""
a, sr = pcm_sr
if a is None or sr <= 0:
return 0.0
i, j = max(0, int(s * sr)), min(len(a), int(e * sr))
if j <= i:
return 0.0
ss = 0
for x in a[i:j]:
ss += x * x
return (ss / (j - i)) ** 0.5 / 32768.0
async def _label_merge_dual(client, diar_fn, txn_fn, mic_b, sys_b, tl, self_name,
                            self_vad_json, known_vp, min_overlap, voiceprint_threshold):
    """Dual-channel label-merge: mic track = the local user (gated to mic-dominant
    windows so remote bleed isn't transcribed as the user); system track = diarized +
    named remote speakers. See label_merge docstring for the full rationale.

    Args:
        client: HTTP client handed through to ``diar_fn`` / ``txn_fn``.
        diar_fn, txn_fn: async callables ``(client, audio_bytes, filename) -> dict``
            performing diarization and transcription respectively.
        mic_b, sys_b: raw audio bytes of the mic and system tracks.
        tl: visual timeline hints used to name system-track clusters.
        self_name: name attributed to everything kept from the mic track.
        self_vad_json: optional JSON array of {"start","end"} windows marking
            when the local user is speaking. If missing or not a JSON array, the
            gate is computed from per-word RMS of the two tracks instead.
        known_vp: name -> voiceprint library (not mutated; a copy is extended).
        min_overlap, voiceprint_threshold: forwarded to ``_name_clusters``.

    Returns:
        {"mode": "dual_channel", "duration", "speakers", "segments",
        "fingerprints", "models"}.

    Raises:
        HTTPException 400: a track is empty, or no usable ``self_vad`` was given
            and either track cannot be decoded as 16-bit WAV for the RMS gate.
    """
    if not mic_b or not sys_b:
        raise HTTPException(400, "empty mic_file or system_file")

    # ---- Self-VAD inputs, validated BEFORE the expensive Parakeet calls so a bad
    # request fails fast instead of burning four inference passes first. ----
    vad_windows = None
    if self_vad_json:
        try:
            parsed_vad = json.loads(self_vad_json)
        except (TypeError, ValueError, RecursionError):
            parsed_vad = None
        # Explicit check (not `assert`, which vanishes under `python -O`). An
        # unusable self_vad falls back to the server-side RMS gate.
        if isinstance(parsed_vad, list):
            vad_windows = parsed_vad

    mic_pcm = sys_pcm = (None, 0)
    if vad_windows is None:
        # The RMS gate compares the two channels, so BOTH must decode: an
        # undecodable system track reads as RMS 0.0 and would let every
        # non-silent mic word through, bleed included.
        mic_pcm = _wav_pcm(mic_b)
        sys_pcm = _wav_pcm(sys_b)
        if mic_pcm[0] is None or sys_pcm[0] is None:
            raise HTTPException(400, "could not decode WAV for self-VAD; send 16-bit mono WAV or a self_vad array")

    # System: diarize + transcribe (parallel). Mic: transcribe + diarize (parallel) —
    # the mic diarization yields the user's clean enrollment voiceprint.
    sys_diar, sys_stt, mic_stt, mic_diar = await asyncio.gather(
        diar_fn(client, sys_b, "system.wav"), txn_fn(client, sys_b, "system.wav"),
        txn_fn(client, mic_b, "mic.wav"), diar_fn(client, mic_b, "mic.wav"))

    # Enroll the user's voiceprint = fingerprint of the dominant cluster on the mic track.
    self_vp = None
    mic_fps = mic_diar.get("fingerprints", {}) or {}
    if mic_fps:
        durs: dict[str, float] = {}
        for s in mic_diar.get("segments", []):
            durs[s["speaker"]] = durs.get(s["speaker"], 0.0) + (s["end_s"] - s["start_s"])
        # No segments at all: fall back to the first fingerprint reported.
        top = max(durs, key=durs.get) if durs else next(iter(mic_fps))
        self_vp = mic_fps.get(top)

    # Inject self voiceprint so a dual-login (phone) system cluster resolves to the user.
    # setdefault: a caller-supplied voiceprint for self_name takes precedence.
    vp_lib = dict(known_vp)
    if self_vp is not None:
        vp_lib.setdefault(self_name, self_vp)

    # Name the SYSTEM clusters (remote people, possibly incl. phone-self via voiceprint).
    sys_segments = sys_diar.get("segments", [])
    sys_fps = sys_diar.get("fingerprints", {}) or {}
    sys_clusters = sys_diar.get("speakers_detected", [])
    sys_assign = _name_clusters(sys_segments, sys_fps, sys_clusters, tl, vp_lib,
                                min_overlap, voiceprint_threshold)
    sys_turns = [{"start_s": s["start_s"], "end_s": s["end_s"],
                  "speaker": sys_assign[s["speaker"]]["name"]}
                 for s in sys_segments if s["speaker"] in sys_assign]
    remote_blocks = _merge_words_with_speakers(sys_stt.get("words", []), sys_turns)

    # Margin so the mic must be CLEARLY louder than system to count as local — guards
    # against brief remote bleed near utterance boundaries (real local speech runs many
    # times louder than the bleed; real remote runs many times quieter).
    _LOCAL_MARGIN = 1.2

    def _is_local(s: float, e: float) -> bool:
        # Client-supplied windows win; otherwise compare per-window channel RMS.
        if vad_windows is not None:
            return any(_overlap_seconds(s, e, float(w.get("start", 0)), float(w.get("end", 0))) > 0
                       for w in vad_windows)
        return _win_rms(mic_pcm, s, e) > _win_rms(sys_pcm, s, e) * _LOCAL_MARGIN

    # Keep mic words where the mic is clearly the dominant channel (margin excludes the
    # remote bleed the mic also picks up), THEN group the surviving local words into
    # blocks. Filtering before grouping means a block never mixes local speech with loud
    # bleed (which would average to system-dominant and drop the whole utterance).
    local_words = [w for w in mic_stt.get("words", [])
                   if _is_local(float(w.get("start", 0)), float(w.get("end", 0)))]
    # A single all-covering turn attributes every surviving mic word to self_name.
    local_blocks = (_merge_words_with_speakers(
        local_words, [{"start_s": 0.0, "end_s": 1e12, "speaker": self_name}])
        if local_words else [])

    segments = sorted(remote_blocks + local_blocks, key=lambda b: b.get("start_ms", 0))

    speakers, named = _speaker_list(sys_clusters, sys_assign, sys_fps)
    speakers.append({"cluster": "mic", "name": self_name, "source": "mic_channel",
                     "fingerprint": self_vp})
    if self_vp is not None:
        # The clean mic enrollment overrides any system-track fingerprint for self_name.
        named[self_name] = self_vp

    return {
        "mode": "dual_channel",
        "duration": max(sys_diar.get("duration", 0.0), mic_stt.get("duration", 0.0)),
        "speakers": speakers,
        "segments": segments,
        "fingerprints": named,
        "models": sys_diar.get("models", {}),
    }
# ---- Merge helper: assign speaker to each word, then group into blocks ----
def _assign_speaker_to_word(word_start_s: float, word_end_s: float, diar_turns: list[dict]) -> str: