v0.13.0:4 - redaction gateway, embeddings proxy, expanded audio API
- Add redaction gateway (redaction_gateway.py, redaction/ scrub + tests) - Add embeddings proxy and spark_embed service (Dockerfile + main.py) - Expand audio_proxy with speaker-aware handling; deep_health/health/server updates - Package: configureSparks action + sparkConfig model updates, manifest/main wiring - Docs: AUDIO_API, EMBEDDINGS, REDACTION_GATEWAY; HANDOFF and runbook/known-issues refresh
This commit is contained in:
+444
-77
@@ -1,10 +1,12 @@
|
||||
"""OpenAI-compatible audio proxy: lets any OpenAI-shaped client (Open WebUI,
|
||||
Home Assistant, etc.) talk to Parakeet (STT) and Magpie (TTS) through one URL.
|
||||
Home Assistant, etc.) talk to Parakeet (STT) and Kokoro (TTS) through one URL.
|
||||
|
||||
Endpoints exposed on spark-control's port (same as the dashboard):
|
||||
GET /v1/models — lists STT model + Magpie voices in OpenAI shape
|
||||
POST /v1/audio/speech — OpenAI TTS → Magpie /v1/audio/synthesize
|
||||
POST /v1/audio/transcriptions — forward to Parakeet (already OpenAI-compatible)
|
||||
GET /v1/models — lists STT model + Kokoro voices in OpenAI shape
|
||||
POST /v1/audio/speech — OpenAI TTS → Kokoro /v1/audio/speech
|
||||
POST /v1/audio/transcriptions — forward to Parakeet (already OpenAI-compatible)
|
||||
POST /api/audio/diarize-chunk — per-chunk diarization (Parakeet container, Sortformer+TitaNet)
|
||||
POST /api/audio/transcribe-with-speakers — ASR + diarization merged
|
||||
|
||||
Both downstream services already speak HTTP on the LAN; this module just adapts
|
||||
request/response shapes so OpenAI clients don't need a custom integration.
|
||||
@@ -13,10 +15,20 @@ When Parakeet returns a 500 (commonly the recurring CUDA wedge), the proxy
|
||||
returns a clearer 503 with Retry-After=60, and fires the deep-health probe in
|
||||
the background — which detects the wedge and triggers a rate-limited container
|
||||
restart inside seconds. The client's next attempt ~60s later then succeeds.
|
||||
|
||||
TTS is intentionally simple: forward the request body to Kokoro and stream the
|
||||
response back. Kokoro-82M is reliable enough (24/24 successful renders across
|
||||
the same input lengths that broke Magpie 13/24 times) that no retry, chunking,
|
||||
or duration-validation layer is needed. This used to be a ~150-line tangle
|
||||
under v0.13.0:6's Magpie-with-chunking workaround; it's now a single forward.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
import asyncio
|
||||
import io
|
||||
import json
|
||||
import logging
|
||||
import wave
|
||||
from array import array
|
||||
from typing import Any, Optional
|
||||
|
||||
import httpx
|
||||
@@ -28,38 +40,33 @@ from .config import Settings
|
||||
|
||||
logger = logging.getLogger("spark-control.audio")
|
||||
|
||||
# Magpie voice name encodes its language. Example:
|
||||
# Magpie-Multilingual.EN-US.Mia -> en-US
|
||||
# Magpie-Multilingual.ES-US.Diego -> es-US
|
||||
# Magpie-Multilingual.FR-FR.Pascal -> fr-FR
|
||||
def _lang_from_voice(voice: str) -> str:
|
||||
try:
|
||||
parts = voice.split(".")
|
||||
# parts = ["Magpie-Multilingual", "EN-US", "Mia"] (or with emotion suffix)
|
||||
if len(parts) >= 2 and "-" in parts[1]:
|
||||
lang_part = parts[1] # "EN-US"
|
||||
primary, region = lang_part.split("-", 1)
|
||||
return f"{primary.lower()}-{region.upper()}"
|
||||
except Exception:
|
||||
pass
|
||||
return "en-US"
|
||||
|
||||
# Kokoro default voice. The four curated voices below were Alice-tested for
|
||||
# narration/recap-style content; bm_george is the default. Clients can pass
|
||||
# any of Kokoro's 67 voices in the `voice` field — see /v1/models.
|
||||
DEFAULT_VOICE = "bm_george"
|
||||
|
||||
# Default voice: configurable, falls back to a sensible English voice if unset.
|
||||
DEFAULT_VOICE = "Magpie-Multilingual.EN-US.Mia"
|
||||
# Curated quick-pick voices surfaced at the top of /v1/models. The full list
|
||||
# of 67 voices is fetched live from Kokoro and appended after these.
|
||||
CURATED_VOICES: list[dict] = [
|
||||
{"id": "bm_george", "name": "George (British male, narrator-style)", "language": "en-GB"},
|
||||
{"id": "bf_emma", "name": "Emma (British female, audiobook-style)", "language": "en-GB"},
|
||||
{"id": "am_michael","name": "Michael (American male, warm narrator)", "language": "en-US"},
|
||||
{"id": "af_heart", "name": "Heart (American female, warm and balanced)", "language": "en-US"},
|
||||
]
|
||||
|
||||
|
||||
class SpeechRequest(BaseModel):
|
||||
"""OpenAI /v1/audio/speech request body."""
|
||||
model: Optional[str] = None # ignored — Magpie has one model
|
||||
input: str # the text to speak
|
||||
voice: Optional[str] = None # e.g. "Magpie-Multilingual.EN-US.Mia"
|
||||
response_format: Optional[str] = "wav" # only "wav" supported today
|
||||
speed: Optional[float] = 1.0 # ignored by Magpie
|
||||
# Magpie-specific extensions (clients may pass these through)
|
||||
language: Optional[str] = None
|
||||
sample_rate_hz: Optional[int] = 22050
|
||||
encoding: Optional[str] = "LINEAR_PCM"
|
||||
"""OpenAI /v1/audio/speech request body. Forwarded to Kokoro mostly-verbatim.
|
||||
|
||||
Kokoro accepts the OpenAI shape natively, so we only need to substitute the
|
||||
default voice when the client doesn't specify one.
|
||||
"""
|
||||
model: Optional[str] = None # Kokoro tolerates any model id
|
||||
input: str # the text to speak
|
||||
voice: Optional[str] = None # e.g. "bm_george"; default: DEFAULT_VOICE
|
||||
response_format: Optional[str] = "wav" # Kokoro supports wav, mp3, opus, flac
|
||||
speed: Optional[float] = 1.0
|
||||
|
||||
|
||||
def build_router(settings: Settings, deep_health: Any = None) -> APIRouter:
|
||||
@@ -74,15 +81,17 @@ def build_router(settings: Settings, deep_health: Any = None) -> APIRouter:
|
||||
def _parakeet_base() -> str:
|
||||
return f"http://{settings.parakeet_host}:{settings.parakeet_port}"
|
||||
|
||||
def _magpie_base() -> str:
|
||||
return f"http://{settings.magpie_host}:{settings.magpie_port}"
|
||||
def _kokoro_base() -> str:
|
||||
return f"http://{settings.kokoro_host}:{settings.kokoro_port}"
|
||||
|
||||
# ---- /v1/models ----
|
||||
@router.get("/v1/models")
|
||||
async def list_models() -> dict:
|
||||
"""Advertise the STT model + a small voice menu so clients can
|
||||
populate their voice-picker UIs. Falls back gracefully if Magpie
|
||||
is offline (returns just the STT entry)."""
|
||||
"""Advertise the STT model + Kokoro voices in OpenAI list shape.
|
||||
|
||||
Curated voices appear first; the rest of Kokoro's catalog follows.
|
||||
Falls back to just the STT entry + curated voices if Kokoro is offline.
|
||||
"""
|
||||
data: list[dict] = [
|
||||
{
|
||||
"id": "parakeet-tdt-0.6b-v3",
|
||||
@@ -91,66 +100,82 @@ def build_router(settings: Settings, deep_health: Any = None) -> APIRouter:
|
||||
"kind": "stt",
|
||||
},
|
||||
]
|
||||
# Try to enumerate voices from Magpie; if unreachable, just skip.
|
||||
# Curated first — these are the four Alice chose for narration/recap.
|
||||
seen = set()
|
||||
for v in CURATED_VOICES:
|
||||
data.append({
|
||||
"id": v["id"],
|
||||
"object": "model",
|
||||
"owned_by": "kokoro",
|
||||
"kind": "tts",
|
||||
"display_name": v.get("name"),
|
||||
"language": v.get("language"),
|
||||
"curated": True,
|
||||
})
|
||||
seen.add(v["id"])
|
||||
|
||||
# Append everything else Kokoro advertises (~63 more voices across many
|
||||
# languages). Best-effort — if Kokoro is unreachable, the curated list
|
||||
# alone is still usable.
|
||||
try:
|
||||
async with httpx.AsyncClient(timeout=5.0) as client:
|
||||
r = await client.get(f"{_magpie_base()}/v1/audio/list_voices")
|
||||
r = await client.get(f"{_kokoro_base()}/v1/audio/voices")
|
||||
if r.status_code == 200:
|
||||
voices_by_locales = r.json()
|
||||
seen = set()
|
||||
for _locales, payload in voices_by_locales.items():
|
||||
for v in payload.get("voices", []):
|
||||
# Collapse emotion variants — expose only the base voice name.
|
||||
# "Magpie-Multilingual.EN-US.Mia.Angry" -> "Magpie-Multilingual.EN-US.Mia"
|
||||
parts = v.split(".")
|
||||
base = ".".join(parts[:3]) if len(parts) >= 3 else v
|
||||
if base not in seen:
|
||||
seen.add(base)
|
||||
data.append({
|
||||
"id": base,
|
||||
"object": "model",
|
||||
"owned_by": "nvidia",
|
||||
"kind": "tts",
|
||||
})
|
||||
body = r.json()
|
||||
for v in body.get("voices", []):
|
||||
vid = v.get("id") if isinstance(v, dict) else v
|
||||
if not vid or vid in seen:
|
||||
continue
|
||||
data.append({
|
||||
"id": vid,
|
||||
"object": "model",
|
||||
"owned_by": "kokoro",
|
||||
"kind": "tts",
|
||||
})
|
||||
seen.add(vid)
|
||||
except Exception as e:
|
||||
logger.warning("magpie voice list unavailable: %s", e)
|
||||
logger.warning("kokoro voice list unavailable: %s", e)
|
||||
return {"object": "list", "data": data}
|
||||
|
||||
# ---- /v1/audio/speech (TTS) ----
|
||||
@router.post("/v1/audio/speech")
|
||||
async def speech(body: SpeechRequest) -> Response:
|
||||
"""OpenAI-style TTS. Translates to Magpie's multipart synth call.
|
||||
"""OpenAI-style TTS. Forwards to Kokoro and returns the audio bytes.
|
||||
|
||||
Returns raw WAV bytes (Content-Type: audio/wav) — browsers and most
|
||||
clients play these directly.
|
||||
Kokoro accepts the OpenAI shape natively. We only substitute the
|
||||
default voice when not specified. Response is whatever format Kokoro
|
||||
produces (WAV by default, mp3/opus/flac if the client asked for one).
|
||||
|
||||
No retry layer needed — Kokoro is reliable at any input length.
|
||||
"""
|
||||
text = (body.input or "").strip()
|
||||
if not text:
|
||||
raise HTTPException(400, "input text is required")
|
||||
|
||||
voice = body.voice or DEFAULT_VOICE
|
||||
language = body.language or _lang_from_voice(voice)
|
||||
sample_rate = int(body.sample_rate_hz or 22050)
|
||||
encoding = body.encoding or "LINEAR_PCM"
|
||||
|
||||
form = {
|
||||
"text": text,
|
||||
"language": language,
|
||||
response_format = body.response_format or "wav"
|
||||
payload = {
|
||||
"model": body.model or "kokoro",
|
||||
"input": text,
|
||||
"voice": voice,
|
||||
"sample_rate_hz": str(sample_rate),
|
||||
"encoding": encoding,
|
||||
"response_format": response_format,
|
||||
}
|
||||
if body.speed is not None:
|
||||
payload["speed"] = body.speed
|
||||
|
||||
try:
|
||||
async with httpx.AsyncClient(timeout=120.0) as client:
|
||||
r = await client.post(f"{_magpie_base()}/v1/audio/synthesize", data=form)
|
||||
r = await client.post(
|
||||
f"{_kokoro_base()}/v1/audio/speech", json=payload
|
||||
)
|
||||
except httpx.HTTPError as e:
|
||||
raise HTTPException(502, f"magpie unreachable: {e}")
|
||||
raise HTTPException(502, f"kokoro unreachable: {e}")
|
||||
|
||||
if r.status_code != 200:
|
||||
# Surface Magpie's error message verbatim so clients can debug voice/lang typos.
|
||||
# Surface Kokoro's error verbatim (bad voice, bad format, etc.).
|
||||
raise HTTPException(r.status_code, r.text[:500])
|
||||
|
||||
# Magpie returns WAV bytes already (Content-Type: audio/wav). Pass through.
|
||||
# Forward Kokoro's content-type so the client knows the format.
|
||||
media_type = r.headers.get("content-type", "audio/wav")
|
||||
return Response(content=r.content, media_type=media_type)
|
||||
|
||||
@@ -209,11 +234,11 @@ def build_router(settings: Settings, deep_health: Any = None) -> APIRouter:
|
||||
raise HTTPException(r.status_code, r.text[:500])
|
||||
return Response(content=r.content, media_type=r.headers.get("content-type", "application/json"))
|
||||
|
||||
# ---- /api/audio/diarize-chunk (per-chunk worker for Recap Relay) ----
|
||||
# ---- /api/audio/diarize-chunk (per-chunk worker for chunked workflows) ----
|
||||
@router.post("/api/audio/diarize-chunk")
|
||||
async def diarize_chunk(file: UploadFile = File(...)) -> dict:
|
||||
"""Per-chunk worker designed for orchestrators (Recap Relay) that
|
||||
handle chunking + cross-chunk speaker clustering themselves.
|
||||
"""Per-chunk worker designed for orchestrators that handle chunking +
|
||||
cross-chunk speaker clustering themselves.
|
||||
|
||||
Given ONE audio chunk, returns diarization segments (with LOCAL
|
||||
speaker labels — Speaker_0/1/... reset per chunk) AND a 192-dim
|
||||
@@ -271,7 +296,7 @@ def build_router(settings: Settings, deep_health: Any = None) -> APIRouter:
|
||||
"""Diarized transcription: run Parakeet ASR and Sortformer diarization on
|
||||
the same audio in parallel, then merge by timestamp.
|
||||
|
||||
Response shape (designed for downstream UIs like recap-relay):
|
||||
Response shape (designed for downstream UIs):
|
||||
|
||||
{
|
||||
"duration": 90.5,
|
||||
@@ -299,8 +324,6 @@ def build_router(settings: Settings, deep_health: Any = None) -> APIRouter:
|
||||
filename = file.filename or "audio.wav"
|
||||
content_type = file.content_type or "application/octet-stream"
|
||||
|
||||
# Parakeet ASR + Sortformer diarizer in parallel. (A WhisperX detour
|
||||
# lived here briefly — reverted in v0.13.0:0; see release notes.)
|
||||
async def _call_transcribe(client: httpx.AsyncClient) -> dict:
|
||||
files = {"file": (filename, body, content_type)}
|
||||
data = {"response_format": "verbose_json"}
|
||||
@@ -359,9 +382,353 @@ def build_router(settings: Settings, deep_health: Any = None) -> APIRouter:
|
||||
},
|
||||
}
|
||||
|
||||
# ---- /api/audio/label-merge (diarize + name clusters from a visual timeline) ----
|
||||
async def _diar(client, b, fn):
    """POST one audio payload to Parakeet's diarize-chunk endpoint.

    `client` is the async HTTP client to send with, `b` the raw audio bytes and
    `fn` the filename to advertise in the multipart upload. The part is always
    sent with content type "audio/wav".

    Returns the decoded JSON body. A non-success status propagates as whatever
    `raise_for_status()` raises.
    """
    url = f"{_parakeet_base()}/v1/audio/diarize-chunk"
    upload = {"file": (fn, b, "audio/wav")}
    resp = await client.post(url, files=upload)
    resp.raise_for_status()
    return resp.json()
|
||||
|
||||
async def _txn(client, b, fn):
    """POST one audio payload to Parakeet's transcription endpoint.

    `client` is the async HTTP client to send with, `b` the raw audio bytes and
    `fn` the filename to advertise in the multipart upload. The request always
    asks for `response_format=verbose_json`.

    Returns the decoded JSON body. A non-success status propagates as whatever
    `raise_for_status()` raises.
    """
    url = f"{_parakeet_base()}/v1/audio/transcriptions"
    upload = {"file": (fn, b, "audio/wav")}
    fields = {"response_format": "verbose_json"}
    resp = await client.post(url, files=upload, data=fields)
    resp.raise_for_status()
    return resp.json()
|
||||
|
||||
@router.post("/api/audio/label-merge")
async def label_merge(
    file: Optional[UploadFile] = File(default=None),
    mic_file: Optional[UploadFile] = File(default=None),
    system_file: Optional[UploadFile] = File(default=None),
    timeline: str = Form(...),
    self_name: str = Form(default="Me"),
    self_vad: Optional[str] = Form(default=None),
    known_voiceprints: Optional[str] = Form(default=None),
    transcribe: bool = Form(default=False),
    min_overlap: float = Form(default=0.0),
    voiceprint_threshold: float = Form(default=0.5),
) -> dict:
    """Diarize audio and NAME each anonymous cluster from a caller-supplied visual
    timeline (who-was-on-screen-when) by majority temporal overlap, with a voice-
    fingerprint fallback. Stateless + portable — the caller owns the timeline and
    voiceprint library; nothing is persisted here.

    TWO MODES:

      * MONO (legacy): send `file` (mixed mono). Diarizes the mix, names clusters.

      * DUAL-CHANNEL: send `mic_file` (the local user's mic) + `system_file`
        (everyone else, from screen capture), sample-aligned to a shared t0. This
        uses the channels to SPLIT the problem instead of forcing the diarizer to
        re-disentangle a mono mix:
          - mic track -> the local user's words, gated to windows where the mic is
            actually the user speaking (mic louder than system — a self-VAD computed
            server-side from the two channels, or supplied via `self_vad`). The mic
            picks up the remote audio as quiet bleed, so this gate is LOAD-BEARING:
            without it the bleed would be transcribed as the user.
          - system track -> diarized (only has to separate the *remote* people, a
            strictly easier problem) and named via the visual timeline + voiceprints.
          - the user's clean voiceprint is enrolled from the mic track and injected
            into the voiceprint library, so a system-track cluster that's actually the
            user dialed in from a second device (dual-login) resolves to the user, not
            a stranger.
        Self-attribution becomes near-perfect (dedicated channel), remote diarization
        gets cleaner, overlapping speech is trivially separated, and the user no longer
        consumes one of Sortformer's 4 speaker slots.

    Form fields (multipart):
      file | (mic_file + system_file)   audio — mono mix OR the two channels
      timeline            JSON [{"start","end","name","confidence?"}, ...] (visual hints for remote folks)
      self_name           name for the local user (mic channel). Default "Me".
      self_vad            optional JSON [{"start","end"}] mic-active-and-louder windows;
                          if omitted, computed server-side by per-window RMS.
      known_voiceprints   optional JSON {name: [192 floats]} from past calls (include the user's)
      transcribe          "true" to attach per-segment text (always on in dual-channel)
      min_overlap         min fraction of a cluster's time overlapping the winning name (default 0)
      voiceprint_threshold  cosine similarity to accept a voiceprint match (default 0.5)

    Raises:
      HTTPException 400  malformed `timeline` / `known_voiceprints`, no audio
                         supplied, or an empty mono file.
      HTTPException 503  Parakeet answered 500 (Retry-After: 60); a deep-health
                         probe is kicked off in the background when available.
      HTTPException 502  Parakeet could not be reached.
      HTTPException <n>  any other Parakeet error status, passed through.
    """
    # Validate with explicit type checks rather than `assert`: assertions are
    # stripped under `python -O`, which would let a non-array timeline (or a
    # non-object voiceprint library) through and fail later as a 500.
    try:
        tl = json.loads(timeline)
    except (ValueError, RecursionError):
        tl = None
    if not isinstance(tl, list):
        raise HTTPException(400, "timeline must be a JSON array of {start,end,name}")

    known_vp: dict[str, list[float]] = {}
    if known_voiceprints:
        try:
            parsed_vp = json.loads(known_voiceprints)
        except (ValueError, RecursionError):
            parsed_vp = None
        if not isinstance(parsed_vp, dict):
            raise HTTPException(400, "known_voiceprints must be a JSON object {name: [floats]}")
        known_vp = parsed_vp

    # Dual-channel mode needs BOTH channels; otherwise fall back to the mono file.
    dual = mic_file is not None and system_file is not None
    if not dual and file is None:
        raise HTTPException(400, "provide either 'file' (mono) or both 'mic_file' and 'system_file'")

    try:
        async with httpx.AsyncClient(timeout=600.0) as client:
            if dual:
                return await _label_merge_dual(
                    client, _diar, _txn, await mic_file.read(), await system_file.read(),
                    tl, self_name, self_vad, known_vp, min_overlap, voiceprint_threshold)
            body = await file.read()
            if not body:
                raise HTTPException(400, "Empty file")
            fn = file.filename or "audio.wav"
            if transcribe:
                # Diarization and ASR are independent, so run them concurrently.
                diar, stt = await asyncio.gather(_diar(client, body, fn), _txn(client, body, fn))
            else:
                diar, stt = await _diar(client, body, fn), None
    except HTTPException:
        # Our own 4xx responses must not be rewritten by the handlers below.
        raise
    except httpx.HTTPStatusError as e:
        if e.response.status_code == 500 and deep_health is not None:
            # Best-effort: a failure to schedule the probe must never mask the 503.
            try:
                asyncio.create_task(deep_health.run_one("parakeet"))
            except Exception:
                pass
            raise HTTPException(503, "Parakeet transient error (likely CUDA wedge). Retry in ~60s.",
                                headers={"Retry-After": "60"})
        raise HTTPException(e.response.status_code, e.response.text[:500])
    except httpx.HTTPError as e:
        raise HTTPException(502, f"parakeet unreachable: {e}")

    # ---- MONO path ----
    diar_segments = diar.get("segments", [])
    fingerprints = diar.get("fingerprints", {}) or {}
    clusters = diar.get("speakers_detected", [])
    assignment = _name_clusters(diar_segments, fingerprints, clusters, tl, known_vp,
                                min_overlap, voiceprint_threshold)
    # Turns re-labelled with resolved names; segments whose cluster received no
    # assignment are dropped here.
    relabeled_turns = [
        {"start_s": s.get("start_s"), "end_s": s.get("end_s"),
         "speaker": assignment[s.get("speaker")]["name"]}
        for s in diar_segments if s.get("speaker") in assignment
    ]
    if transcribe and stt is not None:
        out_segments = _merge_words_with_speakers(stt.get("words", []), relabeled_turns)
    else:
        # No transcript requested: return the diarization segments themselves,
        # keeping the raw cluster label when no name was assigned.
        out_segments = [{
            "start_s": s.get("start_s"), "end_s": s.get("end_s"),
            "speaker": assignment.get(s.get("speaker"), {}).get("name", s.get("speaker")),
            "confidence": s.get("confidence"),
        } for s in diar_segments]
    speakers, named_fingerprints = _speaker_list(clusters, assignment, fingerprints)
    return {
        "mode": "mono",
        "duration": diar.get("duration", 0.0),
        "speakers": speakers,
        "segments": out_segments,
        "fingerprints": named_fingerprints,
        "models": diar.get("models", {}),
    }
|
||||
|
||||
return router
|
||||
|
||||
|
||||
# ---- Label-merge helpers ----
|
||||
|
||||
def _overlap_seconds(a0: float, a1: float, b0: float, b1: float) -> float:
|
||||
return max(0.0, min(a1, b1) - max(a0, b0))
|
||||
|
||||
|
||||
def _cosine(a: Optional[list], b: Optional[list]) -> float:
|
||||
if not a or not b or len(a) != len(b):
|
||||
return 0.0
|
||||
dot = sum(x * y for x, y in zip(a, b))
|
||||
na = sum(x * x for x in a) ** 0.5
|
||||
nb = sum(x * x for x in b) ** 0.5
|
||||
if na == 0 or nb == 0:
|
||||
return 0.0
|
||||
return dot / (na * nb)
|
||||
|
||||
|
||||
def _name_clusters(diar_segments, fingerprints, clusters, tl, known_vp,
|
||||
min_overlap, voiceprint_threshold):
|
||||
"""Assign a name to each anonymous diarization cluster: visual-timeline overlap
|
||||
winner -> closest known-voiceprint match -> Unknown_N. Shared by mono + dual."""
|
||||
cluster_dur: dict[str, float] = {}
|
||||
cluster_name_overlap: dict[str, dict[str, float]] = {}
|
||||
for seg in diar_segments:
|
||||
spk = seg.get("speaker")
|
||||
s0, s1 = float(seg.get("start_s", 0)), float(seg.get("end_s", 0))
|
||||
cluster_dur[spk] = cluster_dur.get(spk, 0.0) + max(0.0, s1 - s0)
|
||||
for entry in tl:
|
||||
name = (entry.get("name") or "").strip()
|
||||
if not name:
|
||||
continue
|
||||
ov = _overlap_seconds(s0, s1, float(entry.get("start", 0)), float(entry.get("end", 0)))
|
||||
if ov > 0:
|
||||
cluster_name_overlap.setdefault(spk, {})
|
||||
cluster_name_overlap[spk][name] = cluster_name_overlap[spk].get(name, 0.0) + ov
|
||||
assignment: dict[str, dict] = {}
|
||||
used_unknown = 0
|
||||
for cluster in clusters:
|
||||
names = cluster_name_overlap.get(cluster, {})
|
||||
total = cluster_dur.get(cluster, 0.0) or 1.0
|
||||
if names:
|
||||
winner = max(names.items(), key=lambda kv: kv[1])
|
||||
conf = winner[1] / total
|
||||
if conf >= min_overlap:
|
||||
assignment[cluster] = {"name": winner[0], "source": "visual",
|
||||
"overlap_confidence": round(conf, 4)}
|
||||
continue
|
||||
fp = fingerprints.get(cluster)
|
||||
best_name, best_sim = None, 0.0
|
||||
if fp and known_vp:
|
||||
for nm, vec in known_vp.items():
|
||||
sim = _cosine(fp, vec)
|
||||
if sim > best_sim:
|
||||
best_name, best_sim = nm, sim
|
||||
if best_name and best_sim >= voiceprint_threshold:
|
||||
assignment[cluster] = {"name": best_name, "source": "voiceprint",
|
||||
"match_similarity": round(best_sim, 4)}
|
||||
else:
|
||||
assignment[cluster] = {"name": f"Unknown_{used_unknown}", "source": "unmatched"}
|
||||
used_unknown += 1
|
||||
return assignment
|
||||
|
||||
|
||||
def _speaker_list(clusters, assignment, fingerprints):
|
||||
"""Build the response `speakers` list + name->fingerprint map from an assignment."""
|
||||
speakers, named = [], {}
|
||||
for cluster in clusters:
|
||||
a = assignment[cluster]
|
||||
entry = {"cluster": cluster, "name": a["name"], "source": a["source"],
|
||||
"fingerprint": fingerprints.get(cluster)}
|
||||
if "overlap_confidence" in a:
|
||||
entry["overlap_confidence"] = a["overlap_confidence"]
|
||||
if "match_similarity" in a:
|
||||
entry["match_similarity"] = a["match_similarity"]
|
||||
speakers.append(entry)
|
||||
if fingerprints.get(cluster) is not None:
|
||||
named[a["name"]] = fingerprints.get(cluster)
|
||||
return speakers, named
|
||||
|
||||
|
||||
def _wav_pcm(b: bytes):
|
||||
"""Decode a 16-bit mono/stereo WAV to (int16 array, sample_rate). Returns
|
||||
(None, 0) if it can't decode (caller then requires a client-supplied self_vad)."""
|
||||
try:
|
||||
with wave.open(io.BytesIO(b), "rb") as w:
|
||||
sr, n, ch, sw = w.getframerate(), w.getnframes(), w.getnchannels(), w.getsampwidth()
|
||||
raw = w.readframes(n)
|
||||
if sw != 2:
|
||||
return None, 0
|
||||
a = array("h")
|
||||
a.frombytes(raw)
|
||||
if ch > 1:
|
||||
a = a[0::ch] # take channel 0
|
||||
return a, sr
|
||||
except Exception:
|
||||
return None, 0
|
||||
|
||||
|
||||
def _win_rms(pcm_sr, s: float, e: float) -> float:
|
||||
"""Normalized RMS (0..1) of the [s,e]-second window of a decoded PCM array."""
|
||||
a, sr = pcm_sr
|
||||
if a is None or sr <= 0:
|
||||
return 0.0
|
||||
i, j = max(0, int(s * sr)), min(len(a), int(e * sr))
|
||||
if j <= i:
|
||||
return 0.0
|
||||
ss = 0
|
||||
for x in a[i:j]:
|
||||
ss += x * x
|
||||
return (ss / (j - i)) ** 0.5 / 32768.0
|
||||
|
||||
|
||||
async def _label_merge_dual(client, diar_fn, txn_fn, mic_b, sys_b, tl, self_name,
                            self_vad_json, known_vp, min_overlap, voiceprint_threshold):
    """Dual-channel label-merge: mic track = the local user (gated to mic-dominant
    windows so remote bleed isn't transcribed as the user); system track = diarized +
    named remote speakers. See label_merge docstring for the full rationale.

    Args:
      client: async HTTP client handed through to `diar_fn` / `txn_fn`.
      diar_fn, txn_fn: coroutines `(client, audio_bytes, filename) -> dict` that
        diarize / transcribe one track.
      mic_b, sys_b: raw bytes of the mic and system tracks.
      tl: parsed visual timeline (list of entries) used to name system clusters.
      self_name: name attributed to the mic channel.
      self_vad_json: optional JSON array of {"start","end"} windows marking when
        the local user is speaking. If absent or unparseable, the gate is
        computed from per-word RMS of the two decoded tracks.
      known_vp: mapping of name -> voiceprint; not mutated.
      min_overlap, voiceprint_threshold: forwarded to `_name_clusters`.

    Returns:
      dict with `mode` ("dual_channel"), `duration`, `speakers`, `segments`,
      `fingerprints` and `models`.

    Raises:
      HTTPException 400: a track is empty, or no usable `self_vad` was supplied
        and either track cannot be decoded as 16-bit WAV.
    """
    if not mic_b or not sys_b:
        raise HTTPException(400, "empty mic_file or system_file")

    # Resolve the self-VAD source BEFORE any upstream call so a bad request fails
    # fast instead of after four Parakeet round-trips. Validation uses explicit
    # type checks, not `assert`, because assertions are stripped under `python -O`.
    vad_windows = None
    if self_vad_json:
        try:
            parsed_vad = json.loads(self_vad_json)
        except (ValueError, RecursionError):
            parsed_vad = None
        if isinstance(parsed_vad, list):
            vad_windows = []
            for w in parsed_vad:
                if not isinstance(w, dict):
                    continue  # ignore malformed window entries
                try:
                    vad_windows.append((float(w.get("start", 0)), float(w.get("end", 0))))
                except (TypeError, ValueError):
                    continue
        # else: unparseable / non-array self_vad falls back to server-side RMS.

    mic_pcm = sys_pcm = (None, 0)
    if vad_windows is None:
        # The RMS gate compares the two channels, so BOTH must decode. With an
        # undecodable system track its RMS would always be 0.0 and every
        # non-silent mic word, remote bleed included, would pass as local.
        mic_pcm = _wav_pcm(mic_b)
        sys_pcm = _wav_pcm(sys_b)
        if mic_pcm[0] is None or sys_pcm[0] is None:
            raise HTTPException(400, "could not decode WAV for self-VAD; send 16-bit mono WAV or a self_vad array")

    # System: diarize + transcribe (parallel). Mic: transcribe + diarize (parallel) —
    # the mic diarization yields the user's clean enrollment voiceprint.
    sys_diar, sys_stt, mic_stt, mic_diar = await asyncio.gather(
        diar_fn(client, sys_b, "system.wav"), txn_fn(client, sys_b, "system.wav"),
        txn_fn(client, mic_b, "mic.wav"), diar_fn(client, mic_b, "mic.wav"))

    # Enroll the user's voiceprint = fingerprint of the dominant cluster on the mic track.
    self_vp = None
    mic_fps = mic_diar.get("fingerprints", {}) or {}
    if mic_fps:
        durs: dict[str, float] = {}
        for s in mic_diar.get("segments", []):
            durs[s["speaker"]] = durs.get(s["speaker"], 0.0) + (s["end_s"] - s["start_s"])
        # No segments at all: fall back to the first fingerprint reported.
        top = max(durs, key=durs.get) if durs else next(iter(mic_fps))
        self_vp = mic_fps.get(top)
    # Inject self voiceprint so a dual-login (phone) system cluster resolves to the user.
    # Copy first so the caller's library is untouched; setdefault keeps a
    # caller-supplied voiceprint for self_name if one exists.
    vp_lib = dict(known_vp)
    if self_vp is not None:
        vp_lib.setdefault(self_name, self_vp)

    # Name the SYSTEM clusters (remote people, possibly incl. phone-self via voiceprint).
    sys_segments = sys_diar.get("segments", [])
    sys_fps = sys_diar.get("fingerprints", {}) or {}
    sys_clusters = sys_diar.get("speakers_detected", [])
    sys_assign = _name_clusters(sys_segments, sys_fps, sys_clusters, tl, vp_lib,
                                min_overlap, voiceprint_threshold)
    sys_turns = [{"start_s": s["start_s"], "end_s": s["end_s"],
                  "speaker": sys_assign[s["speaker"]]["name"]}
                 for s in sys_segments if s["speaker"] in sys_assign]
    remote_blocks = _merge_words_with_speakers(sys_stt.get("words", []), sys_turns)

    # Margin so the mic must be CLEARLY louder than system to count as local — guards
    # against brief remote bleed near utterance boundaries (real local speech runs many
    # times louder than the bleed; real remote runs many times quieter).
    _LOCAL_MARGIN = 1.2

    def _is_local(s: float, e: float) -> bool:
        """True when the [s, e] window counts as the local user speaking."""
        if vad_windows is not None:
            return any(_overlap_seconds(s, e, w0, w1) > 0 for w0, w1 in vad_windows)
        return _win_rms(mic_pcm, s, e) > _win_rms(sys_pcm, s, e) * _LOCAL_MARGIN

    # Keep mic words where the mic is clearly the dominant channel (margin excludes the
    # remote bleed the mic also picks up), THEN group the surviving local words into
    # blocks. Filtering before grouping means a block never mixes local speech with loud
    # bleed (which would average to system-dominant and drop the whole utterance).
    local_words = [w for w in mic_stt.get("words", [])
                   if _is_local(float(w.get("start", 0)), float(w.get("end", 0)))]
    # A single all-covering turn attributes every surviving mic word to self_name.
    local_blocks = (_merge_words_with_speakers(
        local_words, [{"start_s": 0.0, "end_s": 1e12, "speaker": self_name}])
        if local_words else [])

    segments = sorted(remote_blocks + local_blocks, key=lambda blk: blk.get("start_ms", 0))

    speakers, named = _speaker_list(sys_clusters, sys_assign, sys_fps)
    speakers.append({"cluster": "mic", "name": self_name, "source": "mic_channel",
                     "fingerprint": self_vp})
    if self_vp is not None:
        named[self_name] = self_vp

    return {
        "mode": "dual_channel",
        "duration": max(sys_diar.get("duration", 0.0), mic_stt.get("duration", 0.0)),
        "speakers": speakers,
        "segments": segments,
        "fingerprints": named,
        "models": sys_diar.get("models", {}),
    }
|
||||
|
||||
|
||||
# ---- Merge helper: assign speaker to each word, then group into blocks ----
|
||||
|
||||
def _assign_speaker_to_word(word_start_s: float, word_end_s: float, diar_turns: list[dict]) -> str:
|
||||
|
||||
Reference in New Issue
Block a user