v0.10.0:0 - speaker diarization via Sortformer + merged transcribe-with-speakers
Adds a new pipeline for diarized transcription that any client (recap-relay,
ad-hoc curl, future Mac-side tools) can call. Pure data pipeline, no LLM
or UI included — name resolution / analysis happen downstream where prompts
and rendering are configurable.
Architecture:
Spark 2 / parakeet-asr container:
+ /opt/parakeet/app/diarizer.py (new: SortformerDiarizer class)
+ /opt/parakeet/app/main.py (patched: loads diarizer, adds
/v1/audio/diarize endpoint)
Model: nvidia/diar_sortformer_4spk-v1 (~150 MB, ungated, NeMo native)
Spark Control:
+ POST /api/audio/transcribe-with-speakers
Body: multipart file
Returns: {
duration, language, speakers_detected,
segments: [{start_ms, end_ms, speaker, text}, ...],
models: {transcription, diarization}
}
Runs Parakeet ASR + Sortformer in parallel, merges words to speaker
turns by timestamp, groups into speaker-change blocks (breaks also
on >1.5s silence gaps).
+ If either Parakeet call (ASR or diarization) returns HTTP 500 mid-pipeline and a
deep-health checker is configured, schedules a deep-health probe and returns
503 with Retry-After: 60 — same wedge-recovery pattern as v0.9.0:2.
Apply Sortformer patches to the running Parakeet container with:
bash image/parakeet_patches/apply.sh <spark2-host> <ssh-user>
Patches are reversible — apply.sh backs up the original main.py inside the
container at main.py.pre-sortformer before overwriting. Restore by copying
that file back and removing diarizer.py, then docker restart.
v0.11 follow-up: dashboard "Speech Models" panel to swap/update model
versions from the UI instead of needing to re-run apply.sh.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -209,4 +209,180 @@ def build_router(settings: Settings, deep_health: Any = None) -> APIRouter:
|
||||
raise HTTPException(r.status_code, r.text[:500])
|
||||
return Response(content=r.content, media_type=r.headers.get("content-type", "application/json"))
|
||||
|
||||
# ---- /api/audio/transcribe-with-speakers (STT + diarization, merged) ----
# Strong references to in-flight deep-health probes. The event loop keeps only
# weak references to tasks, so a fire-and-forget task with no other referent
# may be garbage-collected before it completes.
_deep_health_tasks: set = set()

@router.post("/api/audio/transcribe-with-speakers")
async def transcribe_with_speakers(
    file: UploadFile = File(...),
) -> dict:
    """Diarized transcription: run Parakeet ASR and Sortformer diarization on
    the same audio in parallel, then merge by timestamp.

    Response shape (designed for downstream UIs like recap-relay):

      {
        "duration": 90.5,
        "language": "en",
        "speakers_detected": ["Speaker_0", "Speaker_1"],
        "segments": [
          {"start_ms": 39308, "end_ms": 51000,
           "speaker": "Speaker_0", "text": "good morning i think..."},
          ...
        ],
        "models": {
          "transcription": "parakeet-tdt-0.6b-v3",
          "diarization": "nvidia/diar_sortformer_4spk-v1"
        }
      }

    Each segment is a block of consecutive words by the same speaker. Speaker
    labels are anonymous (Speaker_0, Speaker_1, ...) — name resolution is the
    caller's responsibility (LLM analysis with optional participant hints,
    or manual mapping UI).

    Raises:
        HTTPException(400): the uploaded file is empty.
        HTTPException(503): an upstream call answered 500 and a deep-health
            checker is configured; a probe is scheduled and the response
            carries ``Retry-After: 60``.
        HTTPException(<upstream status>): any other upstream HTTP error,
            with the first 500 characters of the upstream body as detail.
        HTTPException(502): the upstream could not be reached, or it
            returned a body that is not valid JSON.
    """
    body = await file.read()
    if not body:
        raise HTTPException(400, "Empty file")
    filename = file.filename or "audio.wav"
    content_type = file.content_type or "application/octet-stream"

    async def _call_transcribe(client: httpx.AsyncClient) -> dict:
        # verbose_json is requested so the response carries the word-level
        # entries ("words") that the merge step consumes.
        files = {"file": (filename, body, content_type)}
        data = {"response_format": "verbose_json"}
        r = await client.post(
            f"{_parakeet_base()}/v1/audio/transcriptions",
            files=files, data=data,
        )
        r.raise_for_status()
        return r.json()

    async def _call_diarize(client: httpx.AsyncClient) -> dict:
        files = {"file": (filename, body, content_type)}
        r = await client.post(
            f"{_parakeet_base()}/v1/audio/diarize",
            files=files,
        )
        r.raise_for_status()
        return r.json()

    # Run both in parallel against the same Parakeet container — Sortformer
    # and Parakeet ASR are independent forward passes that share the GPU.
    try:
        async with httpx.AsyncClient(timeout=600.0) as client:
            stt, diar = await asyncio.gather(
                _call_transcribe(client),
                _call_diarize(client),
            )
    except httpx.HTTPStatusError as e:
        # Surface upstream errors. If transcribe wedged, kick deep-health.
        if e.response.status_code == 500 and deep_health is not None:
            try:
                probe = asyncio.create_task(deep_health.run_one("parakeet"))
                # Hold a reference until the probe finishes, then drop it.
                _deep_health_tasks.add(probe)
                probe.add_done_callback(_deep_health_tasks.discard)
            except Exception:
                # Best effort: failing to schedule the probe must not mask
                # the 503 we are about to return to the caller.
                pass
            raise HTTPException(
                status_code=503,
                detail="Parakeet transient error (likely CUDA wedge). Auto-restart triggered; retry in ~60s.",
                headers={"Retry-After": "60"},
            ) from e
        raise HTTPException(e.response.status_code, e.response.text[:500]) from e
    except httpx.HTTPError as e:
        raise HTTPException(502, f"parakeet unreachable: {e}") from e
    except ValueError as e:
        # r.json() raises a ValueError subclass when the body is not JSON.
        raise HTTPException(502, "parakeet returned a non-JSON response") from e

    # `or []` also covers an explicit JSON null, which .get(key, []) would
    # pass through as None.
    merged = _merge_words_with_speakers(
        words=stt.get("words") or [],
        diar_turns=diar.get("segments") or [],
    )
    return {
        "duration": stt.get("duration") or diar.get("duration") or 0.0,
        "language": stt.get("language", "en"),
        "speakers_detected": diar.get("speakers_detected", []),
        "segments": merged,
        "models": {
            "transcription": stt.get("model") if isinstance(stt.get("model"), str) else "parakeet",
            "diarization": diar.get("model", "sortformer"),
        },
    }
|
||||
|
||||
return router
|
||||
|
||||
|
||||
# ---- Merge helper: assign speaker to each word, then group into blocks ----
|
||||
|
||||
def _assign_speaker_to_word(word_start_s: float, word_end_s: float, diar_turns: list[dict]) -> str:
    """Pick the speaker label for one word from a list of diarization turns.

    The first turn (in list order) whose [start_s, end_s] interval contains
    the word's midpoint wins. If no turn contains the midpoint, the turn that
    shares the longest stretch of time with the word's span is used; ties
    keep the earliest such turn. When nothing overlaps at all the result is
    'Speaker_unknown'.

    Args:
        word_start_s: Word start time, in seconds.
        word_end_s: Word end time, in seconds.
        diar_turns: Dicts with "start_s", "end_s" and "speaker" keys.

    Returns:
        The chosen turn's "speaker" value, or 'Speaker_unknown'.
    """
    midpoint = (word_start_s + word_end_s) / 2.0

    # Preferred match: a turn that brackets the middle of the word.
    containing = next(
        (turn for turn in diar_turns if turn["start_s"] <= midpoint <= turn["end_s"]),
        None,
    )
    if containing is not None:
        return containing["speaker"]

    # Fallback: largest strictly-positive overlap with the word's span.
    # A non-overlapping turn yields a value <= 0 and so can never beat the
    # initial 0.0, which is why no explicit clamp is needed.
    winner = "Speaker_unknown"
    widest = 0.0
    for turn in diar_turns:
        shared = min(word_end_s, turn["end_s"]) - max(word_start_s, turn["start_s"])
        if shared > widest:
            widest = shared
            winner = turn["speaker"]
    return winner
|
||||
|
||||
|
||||
def _finalize_speaker_block(speaker: str, start_s: float, end_s: float, parts: list[str]) -> dict:
    """Build one output segment dict from an accumulated run of words."""
    return {
        # round(), not int(): binary floats make e.g. 1.001 * 1000 evaluate to
        # 1000.9999999999999, which truncation would report as 1000 ms.
        "start_ms": round(start_s * 1000),
        "end_ms": round(end_s * 1000),
        "speaker": speaker,
        "text": " ".join(parts),
    }


def _merge_words_with_speakers(
    words: list[dict],
    diar_turns: list[dict],
    silence_break_s: float = 1.5,
) -> list[dict]:
    """Group consecutive same-speaker words into blocks.

    Each input word: {"start": float_s, "end": float_s, "text": str} (Parakeet
    verbose_json format; values are seconds). A missing or null "start" is
    treated as 0.0, a missing or null "end" falls back to the word's start,
    and a missing or null "text" falls back to the "word" key, then to "".
    Each input turn: {"start_s": float, "end_s": float, "speaker": str}.

    Output: [{"start_ms": int, "end_ms": int, "speaker": str, "text": str}, ...]

    A new block starts whenever the assigned speaker changes, and also on a
    long silence gap (> ``silence_break_s`` seconds between the previous
    word's end and this word's start) even within the same speaker — keeps
    blocks readable in UI rendering.

    Word texts are stripped and joined with single spaces, so a block reads
    correctly whether or not the upstream word strings carry their own
    leading/trailing whitespace. Millisecond values are rounded to the
    nearest integer.

    Args:
        words: Word-level ASR entries, in playback order.
        diar_turns: Diarization turns used to label each word.
        silence_break_s: Gap, in seconds, above which a block is split.

    Returns:
        Speaker blocks in input order; an empty list when ``words`` is empty.
    """
    if not words:
        return []

    blocks: list[dict] = []
    cur_speaker = None
    cur_parts: list[str] = []
    cur_start_s = 0.0
    cur_end_s = 0.0

    for w in words:
        raw_start = w.get("start")
        ws = float(raw_start) if raw_start is not None else 0.0
        raw_end = w.get("end")
        we = float(raw_end) if raw_end is not None else ws
        raw_text = w.get("text")
        if raw_text is None:
            raw_text = w.get("word")
        wt = "" if raw_text is None else str(raw_text).strip()

        spk = _assign_speaker_to_word(ws, we, diar_turns)

        is_new_block = (
            cur_speaker is None
            or spk != cur_speaker
            or ws - cur_end_s > silence_break_s
        )
        if is_new_block:
            if cur_speaker is not None:
                blocks.append(
                    _finalize_speaker_block(cur_speaker, cur_start_s, cur_end_s, cur_parts)
                )
            cur_speaker = spk
            cur_parts = []
            cur_start_s = ws
        if wt:
            cur_parts.append(wt)
        cur_end_s = we

    # Emit the trailing block (it exists whenever at least one word was seen).
    if cur_speaker is not None:
        blocks.append(
            _finalize_speaker_block(cur_speaker, cur_start_s, cur_end_s, cur_parts)
        )

    return blocks
|
||||
|
||||
Reference in New Issue
Block a user