v0.12.0:0 - WhisperX as a one-click dashboard install + managed service

Replaces the manual rsync+build+run with a proper spark-control feature.
First in the audio path that doesn't require shell access on Spark 2.

What's in the box
─────────────────
* image/whisperx_container/   - the build context (Dockerfile, requirements,
  app/main.py FastAPI wrapper). Mainline pipeline: faster-whisper for STT +
  pyannote 3.1 for diarization + wav2vec2 forced alignment. Single endpoint
  /v1/audio/transcribe-with-speakers returns the exact same shape spark-
  control's existing endpoint does, so the recap-relay PR spec needs no
  changes when we cut over.

* image/app/whisperx_install.py - install manager. Ships the build context to
  Spark 2 over SSH, runs `docker build`, then runs `docker run` with a 40 GB
  memory cap (Sortformer ran unbounded and thrashed Spark 2 on a 90-min
  file), and polls /health until both Whisper + pyannote report loaded.

* Audio proxy: /api/audio/transcribe-with-speakers now prefers WhisperX
  when its /health reports diarizer_loaded=true, falls back to the legacy
  Parakeet + Sortformer path otherwise. Same response shape either way.
  Clean cutover, easy rollback (`docker rm whisperx-asr`).

* Dashboard (Audio / Speech tab):
  - "Add WhisperX" banner appears when not installed, with a primary
    "Install WhisperX" button. One click triggers the install.
  - Build progress dialog with phase + elapsed timer + live build log via
    SSE (`/api/whisperx/install/{job_id}/stream`).
  - After install, WhisperX auto-registers as a managed service alongside
    Parakeet and Magpie (Start/Restart/Stop, deep-check, auto-restart).
  - Banner self-hides once /api/whisperx/status reports healthy.

New endpoints
─────────────
  GET  /api/whisperx/status
  POST /api/whisperx/install
  GET  /api/whisperx/install/{job_id}
  GET  /api/whisperx/install/{job_id}/stream  (SSE phase + log)

Config additions (env)
──────────────────────
  WHISPERX_HOST       (defaults to spark2_host)
  WHISPERX_USER       (defaults to spark2_user)
  WHISPERX_CONTAINER  (default: whisperx-asr)
  WHISPERX_PORT       (default: 8002)
  WHISPERX_MODEL      (default: medium; tiny/base/small/medium/large-v3)

Dockerfile
──────────
Added COPY whisperx_container /app/whisperx_container so the runtime
install manager can read the build context from inside the spark-control
image and ship it over SSH.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Keysat
2026-05-18 21:02:26 -05:00
parent cfc1c408d4
commit 5a0bfba6a3
14 changed files with 1033 additions and 3 deletions
+28
View File
@@ -209,6 +209,17 @@ def build_router(settings: Settings, deep_health: Any = None) -> APIRouter:
raise HTTPException(r.status_code, r.text[:500])
return Response(content=r.content, media_type=r.headers.get("content-type", "application/json"))
def _whisperx_base() -> str:
    """Return the base URL (scheme, host and port) of the WhisperX service."""
    host = settings.whisperx_host
    port = settings.whisperx_port
    return f"http://{host}:{port}"
async def _whisperx_healthy() -> bool:
    """Return True only when WhisperX answers /health with the diarizer loaded.

    Any failure (connection error, timeout, non-JSON body, ...) is treated as
    "not healthy" so the caller can fall back to the legacy pipeline.
    """
    try:
        async with httpx.AsyncClient(timeout=2.0) as client:
            resp = await client.get(f"{_whisperx_base()}/health")
            if resp.status_code != 200:
                return False
            return bool(resp.json().get("diarizer_loaded"))
    except Exception:
        # Best-effort probe: an unreachable service simply means "use fallback".
        return False
# ---- /api/audio/transcribe-with-speakers (STT + diarization, merged) ----
@router.post("/api/audio/transcribe-with-speakers")
async def transcribe_with_speakers(
@@ -245,6 +256,23 @@ def build_router(settings: Settings, deep_health: Any = None) -> APIRouter:
filename = file.filename or "audio.wav"
content_type = file.content_type or "application/octet-stream"
# Prefer WhisperX (single-pipeline, handles long audio properly) when it's
# installed and healthy. Fall back to Parakeet + Sortformer otherwise.
if await _whisperx_healthy():
files = {"file": (filename, body, content_type)}
try:
async with httpx.AsyncClient(timeout=1800.0) as client:
r = await client.post(
f"{_whisperx_base()}/v1/audio/transcribe-with-speakers",
files=files,
)
except httpx.HTTPError as e:
raise HTTPException(502, f"whisperx unreachable: {e}")
if r.status_code != 200:
raise HTTPException(r.status_code, r.text[:500])
return r.json()
# ── Legacy fallback: Parakeet ASR + Sortformer diarizer in parallel ──
async def _call_transcribe(client: httpx.AsyncClient) -> dict:
files = {"file": (filename, body, content_type)}
data = {"response_format": "verbose_json"}
+11 -1
View File
@@ -35,6 +35,11 @@ class Settings:
magpie_host: str
magpie_user: str
magpie_container: str
whisperx_host: str
whisperx_user: str
whisperx_container: str
whisperx_port: int
whisperx_model: str
ssh_key_path: str
ssh_known_hosts: str
models_yaml: str
@@ -49,7 +54,7 @@ class Settings:
def from_env(cls) -> "Settings":
spark2_host = _env("SPARK2_HOST")
spark2_user = _env("SPARK2_USER")
# Parakeet and Magpie default to Spark 2 unless explicitly overridden.
# Parakeet, Magpie, and WhisperX all default to Spark 2 unless overridden.
return cls(
spark1_host=_env("SPARK1_HOST"),
spark1_user=_env("SPARK1_USER"),
@@ -61,6 +66,11 @@ class Settings:
magpie_host=_env("MAGPIE_HOST") or spark2_host,
magpie_user=_env("MAGPIE_USER") or spark2_user,
magpie_container=_env("MAGPIE_CONTAINER") or "magpie-tts",
whisperx_host=_env("WHISPERX_HOST") or spark2_host,
whisperx_user=_env("WHISPERX_USER") or spark2_user,
whisperx_container=_env("WHISPERX_CONTAINER") or "whisperx-asr",
whisperx_port=int(_env("WHISPERX_PORT", "8002")),
whisperx_model=_env("WHISPERX_MODEL", "medium"),
ssh_key_path=_env("SSH_KEY_PATH"),
ssh_known_hosts=_env("SSH_KNOWN_HOSTS"),
models_yaml=_resolve_models_yaml(),
+66
View File
@@ -24,6 +24,7 @@ from .overrides import add_custom, delete_custom, extract_knobs_from_args, load_
from .services import docker_state, run_action, services_from_settings
from .speech_models import SpeechModelsManager
from .ssh import ssh_run
from .whisperx_install import WhisperXInstaller
from .swap import SwapManager
from .updates import UpdateManager, get_update_status
from .validate import validate_launch
@@ -39,6 +40,7 @@ hardware_probe = HardwareProbe(settings)
nim_manager = NimManager(settings)
deep_health = DeepHealth(settings)
speech_models = SpeechModelsManager(settings)
whisperx_installer = WhisperXInstaller(settings)
app = FastAPI(title="spark-control", version="0.1.0")
@@ -535,6 +537,70 @@ async def post_speech_models_restart() -> dict:
return result
# ---- WhisperX install (Phase 2 of the WhisperX migration) ----
@app.get("/api/whisperx/status")
async def get_whisperx_status() -> dict:
    """Is WhisperX installed + healthy on Spark 2 right now?"""
    current = await whisperx_installer.status()
    return current
@app.post("/api/whisperx/install")
async def post_whisperx_install() -> dict:
    """Kick off a WhisperX install job and return its id.

    The installer ships the build context to the WhisperX host, builds and
    starts the container, then polls /health. Progress is available from
    GET /api/whisperx/install/{job_id} (snapshot) and
    GET /api/whisperx/install/{job_id}/stream (SSE).

    Raises:
        HTTPException: 409 when the installer refuses to start (it raises
            RuntimeError, e.g. because an install is already in progress).
    """
    try:
        job = await whisperx_installer.trigger()
    except RuntimeError as exc:
        raise HTTPException(409, str(exc))
    else:
        return {"job_id": job.id, "started_at": job.started_at}
@app.get("/api/whisperx/install/{job_id}")
async def get_whisperx_install(job_id: str) -> dict:
    """Return a point-in-time snapshot of an install job.

    Raises:
        HTTPException: 404 when no job with ``job_id`` is known.
    """
    job = whisperx_installer.get(job_id)
    if job is None:
        raise HTTPException(404, "unknown job")
    exposed = (
        "id",
        "state",
        "phase",
        "lines",
        "started_at",
        "finished_at",
        "returncode",
    )
    return {name: getattr(job, name) for name in exposed}
@app.get("/api/whisperx/install/{job_id}/stream")
async def stream_whisperx_install(job_id: str) -> StreamingResponse:
    """Stream an install job's log lines and phase changes as Server-Sent Events.

    Emits default ``message`` events carrying ``{"line": ...}``, ``phase``
    events whenever the job's phase or state changes, and one final ``done``
    event once the job has a ``finished_at`` timestamp.

    Raises:
        HTTPException: 404 when no job with ``job_id`` is known.
    """
    job = whisperx_installer.get(job_id)
    if job is None:
        raise HTTPException(404, "unknown job")

    async def event_stream():
        # NOTE(review): `cursor` is a plain index into job.lines, but the job
        # trims that list from the front once it exceeds its cap, so lines
        # appended after the cap is reached are not streamed. Confirm whether
        # the job should expose a running total instead.
        cursor = 0
        announced = ("", "")  # (phase, state) most recently sent to the client
        while True:
            pending = job.lines[cursor:]
            cursor = len(job.lines)
            for text in pending:
                yield f"data: {json.dumps({'line': text})}\n\n"

            current = (job.phase, job.state)
            if current != announced:
                yield f"event: phase\ndata: {json.dumps({'phase': job.phase, 'state': job.state})}\n\n"
                announced = current

            if job.finished_at:
                yield f"event: done\ndata: {json.dumps({'state': job.state, 'returncode': job.returncode})}\n\n"
                return
            await asyncio.sleep(0.6)

    return StreamingResponse(event_stream(), media_type="text/event-stream")
@app.get("/api/endpoints")
async def get_endpoints() -> dict:
"""Service-discovery summary. Stable shape; other apps on the LAN can poll this
+8
View File
@@ -65,6 +65,14 @@ def services_from_settings(s: Settings) -> dict[str, ServiceDef]:
container=s.magpie_container,
port=s.magpie_port,
),
"whisperx": ServiceDef(
name="whisperx",
kind="stt+diarize",
host=s.whisperx_host,
user=s.whisperx_user,
container=s.whisperx_container,
port=s.whisperx_port,
),
}
for entry in load_custom_services():
key = entry.get("key")
+119
View File
@@ -664,6 +664,117 @@ async function onSpeechModelsRestart() {
}
}
// ===================== WhisperX install (v0.12) =====================
const wxState = {
job_id: null,
eventsource: null,
timer_handle: null,
started_at: null,
};
// Show the "Add WhisperX" card only when WhisperX is configured but not yet
// installed-and-healthy; hide it in every other case, including when the
// status request fails.
async function renderWhisperXBanner() {
  const card = el('#whisperx-install-card');
  if (!card) return;

  let status;
  try {
    status = await fetchJSON('/api/whisperx/status');
  } catch {
    card.classList.add('hidden');
    return;
  }

  const alreadyRunning = status.installed && status.healthy;
  const offerInstall = !alreadyRunning && status.configured;
  card.classList.toggle('hidden', !offerInstall);
}
// Click handler for the "Install WhisperX" button. If an install started from
// this page is still tracked, just re-open its progress dialog; otherwise ask
// for confirmation, start a new install job and attach the dialog to it.
async function onWhisperXInstall() {
  if (wxState.job_id) {
    // Just re-attach to the running job
    showWhisperXDialog();
    return;
  }

  // The duration reads "~10–15 min"; the range separator had been lost,
  // leaving the nonsensical "~1015 min".
  const question =
    'Install WhisperX on Spark 2? This builds a new Docker image (~10–15 min first time, mostly downloading pyannote + whisper weights). Parakeet/Magpie stay untouched.';
  if (!confirm(question)) return;

  try {
    const r = await fetchJSON('/api/whisperx/install', { method: 'POST' });
    attachToWhisperXInstall(r.job_id);
  } catch (e) {
    alert('Failed to start WhisperX install: ' + e.message);
  }
}
// Open the install-progress <dialog> as a modal.
function showWhisperXDialog() {
  const dialog = el('#whisperx-progress-dialog');
  dialog.showModal();
}
// Bind the progress dialog to install job `jobId`: reset the dialog, start the
// elapsed-time ticker, load a snapshot of the job, and (unless the job has
// already finished) follow its SSE stream until the `done` event.
function attachToWhisperXInstall(jobId) {
  wxState.job_id = jobId;
  el('#wx-prog-title').textContent = 'Installing WhisperX…';
  el('#wx-prog-phase').textContent = 'Starting…';
  el('#wx-prog-log').textContent = '';
  showWhisperXDialog();

  // Close a stream left over from an earlier attach so two EventSources never
  // append to the same log element.
  if (wxState.eventsource) {
    wxState.eventsource.close();
    wxState.eventsource = null;
  }

  // Tick a timer
  wxState.started_at = Date.now();
  if (wxState.timer_handle) clearInterval(wxState.timer_handle);
  wxState.timer_handle = setInterval(() => {
    const sec = Math.max(0, Math.floor((Date.now() - wxState.started_at) / 1000));
    const m = Math.floor(sec / 60);
    el('#wx-prog-elapsed').textContent = `${m}:${(sec % 60).toString().padStart(2, '0')}`;
  }, 500);

  fetchJSON(`/api/whisperx/install/${jobId}`).then((snap) => {
    const log = el('#wx-prog-log');
    el('#wx-prog-phase').textContent = snap.phase || 'Working…';

    if (snap.finished_at) {
      // Finished jobs are rendered from the snapshot alone; no stream needed.
      log.textContent = (snap.lines || []).join('\n');
      log.scrollTop = log.scrollHeight;
      handleWhisperXDone(snap);
      return;
    }

    // The stream endpoint replays the job's log from its first retained line,
    // so start from an empty log; backfilling from the snapshot as well would
    // show every line twice.
    log.textContent = '';

    const es = new EventSource(`/api/whisperx/install/${jobId}/stream`);
    wxState.eventsource = es;
    es.onmessage = (ev) => {
      try {
        log.textContent += JSON.parse(ev.data).line + '\n';
        log.scrollTop = log.scrollHeight;
      } catch {}
    };
    es.addEventListener('phase', (ev) => {
      try { el('#wx-prog-phase').textContent = JSON.parse(ev.data).phase; } catch {}
    });
    es.addEventListener('done', (ev) => {
      try { handleWhisperXDone(JSON.parse(ev.data)); } catch {}
      es.close();
      wxState.eventsource = null;
    });
    es.onerror = () => { es.close(); wxState.eventsource = null; };
  }).catch((err) => {
    // A failed snapshot used to be swallowed, leaving the dialog on
    // "Starting…" with the timer running forever. Surface it instead and
    // release the job so the install button works again.
    if (wxState.timer_handle) {
      clearInterval(wxState.timer_handle);
      wxState.timer_handle = null;
    }
    wxState.job_id = null;
    el('#wx-prog-phase').textContent = 'Could not load install progress — close this dialog and try again';
    console.error('WhisperX install progress unavailable', err);
  });
}
// Finalise the progress dialog once the install job has ended. `d` is either a
// job snapshot or the payload of the SSE `done` event; only `state` and
// `returncode` are read.
function handleWhisperXDone(d) {
  if (wxState.timer_handle) {
    clearInterval(wxState.timer_handle);
    wxState.timer_handle = null;
  }
  wxState.job_id = null;

  const rc = d.returncode;
  const nonZeroExit = rc != null && rc !== 0;
  const failed = d.state === 'failed' || nonZeroExit;

  if (failed) {
    el('#wx-prog-title').textContent = `WhisperX install failed (rc=${rc})`;
    el('#wx-prog-phase').textContent = 'Failed — check the build log below';
    return;
  }

  el('#wx-prog-title').textContent = 'WhisperX installed';
  el('#wx-prog-phase').textContent = 'Ready ✓ — appears in Always-on services below';
  // Refresh services + banner state
  setTimeout(() => {
    renderServices();
    renderWhisperXBanner();
  }, 1000);
}
async function onServiceAction(key) {
if (state.service_action_in_flight) return;
const [name, action] = key.split(':');
@@ -1860,6 +1971,11 @@ async function init() {
} catch {}
setupDashboardTabs();
setupEndpointCollapse();
// WhisperX install button
const wxBtn = el('#wx-install');
if (wxBtn) wxBtn.addEventListener('click', onWhisperXInstall);
const wxCloseBtn = el('#wx-prog-close');
if (wxCloseBtn) wxCloseBtn.addEventListener('click', () => el('#whisperx-progress-dialog').close());
await loadModels();
await pollStatus();
await renderServices();
@@ -1869,11 +1985,14 @@ async function init() {
loadDiskStatus();
// Speech-model patches panel — slow over SSH, runs after first paint.
renderSpeechModels();
// WhisperX install banner — show only when not yet installed/healthy.
renderWhisperXBanner();
setInterval(pollStatus, 5000);
setInterval(pollHardware, 8000); // every 8s
setInterval(pollUpdates, 300000); // every 5 min
setInterval(loadDiskStatus, 60000); // every 60s — disk state changes rarely
setInterval(renderSpeechModels, 120000); // every 2 min — patches change rarely
setInterval(renderWhisperXBanner, 60000); // every 60s — auto-hides banner after install
}
init();
+40
View File
@@ -103,6 +103,46 @@
<div class="tab-content" id="tab-audio" role="tabpanel" aria-labelledby="tab-audio-trigger">
<section id="whisperx-install-card" class="whisperx-install hidden">
<div class="wx-install-body">
<div class="wx-install-title">
<strong>Add WhisperX</strong>
<span class="tag ok">recommended</span>
</div>
<p class="muted small">
WhisperX is a single-container speech pipeline (faster-whisper for transcription + pyannote 3.1 for diarization)
designed to handle long audio cleanly. Replaces the Parakeet + Sortformer combo we patched together,
which crashed on a 90-min meeting. Pulled and built directly on Spark 2 (~10–15 min first time;
you only do this once).
</p>
<p class="muted small">
Requires a Hugging Face token at <code>~/.cache/huggingface/token</code> on Spark 2 (already set up).
</p>
<div class="wx-install-actions">
<button id="wx-install" class="btn primary">Install WhisperX</button>
</div>
</div>
</section>
<dialog id="whisperx-progress-dialog" class="modal">
<form method="dialog" class="modal-form">
<h3 id="wx-prog-title">Installing WhisperX…</h3>
<div class="phase-row">
<span class="spinner"></span>
<div class="phase" id="wx-prog-phase">Starting…</div>
<span class="spacer"></span>
<span class="timer" id="wx-prog-elapsed">0:00</span>
</div>
<details open>
<summary class="muted small">Build log</summary>
<pre id="wx-prog-log" class="log"></pre>
</details>
<div class="modal-actions">
<button type="button" id="wx-prog-close" class="btn">Close</button>
</div>
</form>
</dialog>
<section id="services-panel" class="services hidden">
<div class="section-header">
<h2 class="section-title">Always-on services</h2>
+13
View File
@@ -906,3 +906,16 @@ main {
}
.tab-content { display: none; }
.tab-content.active { display: block; }
/* ===== WhisperX install banner (v0.12) ===== */
.whisperx-install {
background: var(--surface);
border: 1px solid var(--info);
border-radius: var(--radius);
padding: 16px 18px;
margin-bottom: 20px;
}
.wx-install-body { display: flex; flex-direction: column; gap: 10px; }
.wx-install-title { display: flex; align-items: center; gap: 10px; }
.wx-install-title strong { font-size: 15px; color: var(--text); }
.wx-install-actions { display: flex; gap: 10px; margin-top: 4px; }
+255
View File
@@ -0,0 +1,255 @@
"""WhisperX install action — ships the build context from inside spark-control
to Spark 2 over SSH, then runs `docker build` + `docker run` on Spark 2 and
streams progress back as SSE.
Pattern mirrors NimManager (see nim.py) but for a locally-built container
rather than an `nvcr.io` pull. Build context lives at
/app/whisperx_container/ inside the spark-control Docker image (set up by
the Dockerfile COPY directive).
Endpoints:
POST /api/whisperx/install — kick off
GET /api/whisperx/install/{job_id} — snapshot
GET /api/whisperx/install/{job_id}/stream — SSE phase + log lines
GET /api/whisperx/status — installed + healthy?
"""
from __future__ import annotations
import asyncio
import shlex
import uuid
from dataclasses import dataclass, field
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional
import httpx
from .config import Settings
from .ssh import _base_args, ssh_run, ssh_stream, StreamHandle
# Build context shipped inside the spark-control image (Dockerfile COPYs it).
BUILD_CONTEXT_DIR = Path(__file__).resolve().parent.parent / "whisperx_container"
# Files we ship to Spark 2's build dir. Mapped local-name → remote-relative-path.
BUILD_FILES = {
"Dockerfile": "Dockerfile",
"requirements.txt": "requirements.txt",
"README.md": "README.md",
"app/main.py": "app/main.py",
}
@dataclass
class WhisperXInstallJob:
    """Mutable record of one WhisperX install run.

    The installer updates ``state``, ``phase`` and ``lines`` while it works and
    sets ``finished_at`` (plus ``returncode``) when the run ends.
    """

    # Maximum number of log lines retained; the oldest lines are dropped first.
    # Deliberately unannotated so the dataclass does not treat it as a field.
    MAX_LINES = 1500

    id: str
    started_at: str
    state: str = "starting"  # starting | sending | building | running | done | failed
    phase: str = "Starting…"
    lines: list[str] = field(default_factory=list)
    returncode: Optional[int] = None
    finished_at: Optional[str] = None

    def append(self, line: str) -> None:
        """Add ``line`` to the log, trimming the front to ``MAX_LINES`` entries."""
        self.lines.append(line)
        overflow = len(self.lines) - self.MAX_LINES
        if overflow > 0:
            # Trim in place so existing references to the list stay valid.
            del self.lines[:overflow]
class WhisperXInstaller:
    """Runs WhisperX installs on the configured host and tracks their jobs.

    Only one install runs at a time. Each run is recorded as a
    ``WhisperXInstallJob`` in ``jobs`` so the HTTP layer can poll or stream it.
    """

    def __init__(self, settings: Settings) -> None:
        self.settings = settings
        self.lock = asyncio.Lock()
        self.jobs: dict[str, WhisperXInstallJob] = {}
        self.current_job_id: Optional[str] = None
        # Strong references to in-flight install tasks. The event loop only
        # keeps weak references, so an unreferenced task may be garbage
        # collected before it finishes.
        self._tasks: set[asyncio.Task] = set()

    def get(self, job_id: str) -> WhisperXInstallJob | None:
        """Return the job with ``job_id``, or None when it is unknown."""
        return self.jobs.get(job_id)

    @staticmethod
    def _remote_shell_path(path: str) -> str:
        """Quote ``path`` for a remote shell while keeping a leading ``~`` usable.

        ``shlex.quote`` wraps ``~/...`` in single quotes, which stops the shell
        from expanding the tilde. A leading ``~`` is therefore rewritten to
        ``"$HOME"`` and only the remainder is quoted.
        """
        if path == "~":
            return '"$HOME"'
        if path.startswith("~/"):
            return '"$HOME"/' + shlex.quote(path[2:])
        return shlex.quote(path)

    async def status(self) -> dict:
        """Probe whether WhisperX is installed + healthy on its configured host.

        Returns a dict with ``configured``, ``installed`` and ``healthy`` flags.
        When /health answers 200 the dict also carries ``model``, ``device``
        and ``diarizer_loaded``; otherwise it carries ``current_job_id``.
        """
        s = self.settings
        if not (s.whisperx_host and s.whisperx_user):
            return {"configured": False, "installed": False, "healthy": False}

        # Probe HTTP health
        url = f"http://{s.whisperx_host}:{s.whisperx_port}/health"
        try:
            async with httpx.AsyncClient(timeout=3.0) as client:
                r = await client.get(url)
                if r.status_code == 200:
                    body = r.json()
                    return {
                        "configured": True,
                        "installed": True,
                        "healthy": True,
                        "model": body.get("model"),
                        "device": body.get("device"),
                        "diarizer_loaded": body.get("diarizer_loaded", False),
                    }
        except Exception:
            # Best-effort probe: an unreachable or malformed /health just means
            # "not healthy"; fall through to the container check.
            pass

        # No HTTP — check if the container exists at all
        container_present = await self._container_exists()
        return {
            "configured": True,
            "installed": container_present,
            "healthy": False,
            "current_job_id": self.current_job_id,
        }

    async def _container_exists(self) -> bool:
        """Return True when a container with the configured name exists on the host."""
        s = self.settings
        # Quote the anchored filter so `^` and `$` reach docker untouched.
        name_filter = shlex.quote(f"name=^{s.whisperx_container}$")
        cmd = f"docker ps -a --filter {name_filter} --format '{{{{.Names}}}}'"
        rc, out, _ = await ssh_run(s.whisperx_host, s.whisperx_user, cmd, s, timeout=10)
        return rc == 0 and s.whisperx_container in out.split()

    async def trigger(self) -> WhisperXInstallJob:
        """Start an install in the background and return its job record.

        Raises:
            RuntimeError: an install is already in progress, the host/user is
                not configured, or a build-context file is missing.
        """
        # The lock is only taken once the background task starts running, so
        # also consult current_job_id (set synchronously below) to reject a
        # second trigger that arrives before that happens.
        if self.lock.locked() or self.current_job_id is not None:
            raise RuntimeError("a WhisperX install is already in progress")
        s = self.settings
        if not s.whisperx_host or not s.whisperx_user:
            raise RuntimeError("whisperx host/user not configured")
        for local_name in BUILD_FILES:
            if not (BUILD_CONTEXT_DIR / local_name).exists():
                raise RuntimeError(f"build context file missing inside spark-control image: {local_name}")

        job = WhisperXInstallJob(
            id=uuid.uuid4().hex[:8],
            started_at=datetime.now(timezone.utc).isoformat(),
        )
        self.jobs[job.id] = job
        self.current_job_id = job.id

        task = asyncio.create_task(self._run(job))
        self._tasks.add(task)
        task.add_done_callback(lambda finished: self._task_finished(job, finished))
        return job

    def _task_finished(self, job: WhisperXInstallJob, task: asyncio.Task) -> None:
        """Release bookkeeping for a finished install task."""
        self._tasks.discard(task)
        if self.current_job_id == job.id:
            self.current_job_id = None
        if job.finished_at is None:
            # The task ended without reaching _run's cleanup (for example it
            # was cancelled before it started); close the job so pollers and
            # streams terminate.
            job.state = "failed"
            if job.returncode is None:
                job.returncode = 1
            job.finished_at = datetime.now(timezone.utc).isoformat()

    async def _run(self, job: WhisperXInstallJob) -> None:
        """Execute the install under the lock and record its final outcome on ``job``."""
        async with self.lock:
            try:
                await self._do(job)
                if job.state != "failed":
                    job.state = "done"
                    job.returncode = 0
                    job.phase = f"Done — WhisperX is running on port {self.settings.whisperx_port}"
            except asyncio.CancelledError:
                # Not an Exception subclass: without this branch a cancelled
                # run would finish with a non-failed state and no returncode.
                job.append("[error] install task was cancelled")
                job.state = "failed"
                if job.returncode is None:
                    job.returncode = 1
                raise
            except Exception as e:
                job.append(f"[error] {type(e).__name__}: {e}")
                job.state = "failed"
                if job.returncode is None:
                    job.returncode = 1
            finally:
                job.finished_at = datetime.now(timezone.utc).isoformat()
                if self.current_job_id == job.id:
                    self.current_job_id = None

    async def _ssh_pipe(self, host: str, user: str, remote_cmd: str,
                        payload: bytes, timeout: float = 60.0) -> tuple[bool, str, str]:
        """ssh user@host <remote_cmd> with payload piped to stdin.

        Returns ``(ok, stdout, stderr)``. ``ok`` is True when the remote command
        exited 0. On timeout the process is killed and ``stderr`` carries a
        timeout message.
        """
        args = _base_args(self.settings) + [f"{user}@{host}", remote_cmd]
        proc = await asyncio.create_subprocess_exec(
            *args,
            stdin=asyncio.subprocess.PIPE,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
        )
        try:
            stdout_b, stderr_b = await asyncio.wait_for(
                proc.communicate(input=payload), timeout=timeout
            )
        except asyncio.TimeoutError:
            proc.kill()
            await proc.wait()
            return False, "", f"timeout after {timeout}s"
        return proc.returncode == 0, stdout_b.decode(errors="replace"), stderr_b.decode(errors="replace")

    async def _do(self, job: WhisperXInstallJob) -> None:
        """Run the four install phases, logging progress onto ``job``.

        Phases: stage the build context on the host, ``docker build``,
        ``docker run``, then poll /health until it reports ready.

        Raises:
            RuntimeError: when any phase fails.
        """
        s = self.settings
        host = s.whisperx_host
        user = s.whisperx_user
        build_dir = "~/whisperx-build"
        remote = self._remote_shell_path

        # ── Phase 1: stage build context on Spark 2 ──
        job.state = "sending"
        job.phase = "Sending build context to Spark 2…"
        job.append(f"$ ssh {user}@{host} 'mkdir -p {build_dir}/app'")
        stale = " ".join(remote(f"{build_dir}/{rel}") for rel in BUILD_FILES.values())
        prepare_cmd = f"mkdir -p {remote(build_dir + '/app')} && rm -f {stale}"
        rc, out, err = await ssh_run(host, user, prepare_cmd, s, timeout=10)
        if rc != 0:
            job.append(f"[mkdir failed] {err.strip()}")
            raise RuntimeError("failed to create build directory")

        for local_name, remote_rel in BUILD_FILES.items():
            body = (BUILD_CONTEXT_DIR / local_name).read_bytes()
            remote_path = f"{build_dir}/{remote_rel}"
            # Plain shlex.quote would single-quote the "~" and break the
            # redirect target; _remote_shell_path keeps it under $HOME.
            cmd = f"cat > {remote(remote_path)}"
            ok, out, err = await self._ssh_pipe(host, user, cmd, body, timeout=30)
            if not ok:
                job.append(f"[scp {local_name} failed] {err.strip()[:200]}")
                raise RuntimeError(f"failed to ship {local_name}")
            job.append(f"{remote_path} ({len(body)} bytes)")

        # ── Phase 2: docker build ──
        job.state = "building"
        job.phase = "Building Docker image on Spark 2 (this is the slow part — 5–15 min if base layers aren't cached)…"
        build_cmd = (
            f"set -e; "
            f"cd {remote(build_dir)}; "
            f"echo '=== docker build -t {s.whisperx_container}:latest . ==='; "
            f"docker build -t {s.whisperx_container}:latest ."
        )
        job.append(f"$ {build_cmd}")
        handle = StreamHandle()
        async for line in ssh_stream(host, user, build_cmd, s, handle=handle):
            job.append(line)
            if "Step " in line and "/" in line:
                # docker build progress: "Step 5/10 : RUN pip install ..."
                job.phase = f"Building: {line.strip()[:120]}"
            elif "Successfully built" in line or "naming to" in line:
                job.phase = "Image built — preparing to start container…"
        # NOTE(review): a returncode of None is treated as success here;
        # confirm ssh_stream always sets handle.returncode when the stream ends.
        if (handle.returncode or 0) != 0:
            job.returncode = handle.returncode
            raise RuntimeError(f"docker build failed (rc={handle.returncode})")

        # ── Phase 3: docker run ──
        job.state = "running"
        job.phase = "Starting container…"
        run_cmd = (
            f"set -e; "
            f"echo '=== removing any prior {s.whisperx_container} container ==='; "
            f"docker rm -f {s.whisperx_container} 2>/dev/null || true; "
            f"echo '=== docker run -d --restart unless-stopped --name {s.whisperx_container} ==='; "
            f"HF_TOKEN=$(cat ~/.cache/huggingface/token 2>/dev/null || true); "
            f"if [ -z \"$HF_TOKEN\" ]; then echo 'WARN: no HF_TOKEN found at ~/.cache/huggingface/token — diarization will be disabled until you set one'; fi; "
            f"docker run -d --restart unless-stopped "
            f"--name {s.whisperx_container} "
            f"--gpus all --memory=40g "
            f"-p {s.whisperx_port}:{s.whisperx_port} "
            f"-v whisperx-models:/root/.cache/huggingface "
            f"-e HF_TOKEN=\"$HF_TOKEN\" "
            f"-e WHISPER_MODEL={shlex.quote(str(s.whisperx_model))} "
            f"{s.whisperx_container}:latest"
        )
        # The token itself is never logged: the command only names $HF_TOKEN.
        job.append(f"$ {run_cmd}")
        rc, out, err = await ssh_run(host, user, run_cmd, s, timeout=60)
        if rc != 0:
            job.append(f"[docker run failed rc={rc}] {(err or out).strip()[:300]}")
            raise RuntimeError("docker run failed")
        job.append(out.strip())

        # ── Phase 4: wait for /health to report ready ──
        job.phase = "Container is starting; loading whisper + alignment + pyannote models (~60–120 s on first boot)…"
        url = f"http://{s.whisperx_host}:{s.whisperx_port}/health"
        ready = False
        async with httpx.AsyncClient(timeout=4.0) as client:
            for _ in range(60):  # up to ~180 s
                await asyncio.sleep(3)
                try:
                    r = await client.get(url)
                    if r.status_code == 200:
                        body = r.json()
                        if body.get("status") == "ready":
                            ready = True
                            job.append(f"[ready] {body}")
                            break
                        job.phase = f"Loading models (transcribe={body.get('transcribe_loaded')}, align={body.get('align_loaded')}, diarize={body.get('diarizer_loaded')})…"
                except Exception:
                    # The service may not be listening yet; keep polling.
                    continue
        if not ready:
            raise RuntimeError(
                f"container started but /health did not report ready within ~180 s — check `docker logs {s.whisperx_container}` on Spark 2"
            )
        job.phase = f"Done — WhisperX is healthy and reachable on port {s.whisperx_port}"