v0.13.0:0 - revert WhisperX migration; back to Parakeet + Sortformer
After five hotfix iterations on the WhisperX install (v0.12.0:0 through
v0.12.0:4), we never got a working Docker build. The fundamental constraint
isn't patchable from outside NVIDIA: NGC PyTorch on ARM64 (the only base that
runs on Spark 2's GB10 Blackwell) ships a custom-versioned torch
2.10.0a0+b558c98 that no pre-built torchaudio release matches.
WhisperX → pyannote → torchaudio is a hard dependency chain that we couldn't
satisfy without rebuilding torchaudio against torch 2.10's alpha API.
Walking away cleanly is better than another night of chasing build failures.
Removed from the codebase:
- image/whisperx_container/* (Dockerfile + requirements + app/main.py)
- image/app/whisperx_install.py (install manager + SSH ship-context logic)
- image/Dockerfile COPY whisperx_container
- WHISPERX_* config keys in config.py
- whisperx service entry in services.py
- WhisperX-preferred branch in audio_proxy.py
- /api/whisperx/* endpoints in server.py
- install banner + progress dialog in index.html
- WhisperX banner render + install/progress handlers in app.js
- .whisperx-install styles in style.css
Spark 2 was cleaned up in tandem (user-authorized): the WhisperX container
and ~/whisperx-build/ were removed, and 5.4 GB of dangling image layers plus
1.3 GB of builder cache were reclaimed. parakeet-asr and magpie-tts were
unaffected and stayed healthy throughout.
The audio path is back to exactly what shipped in v0.11.0:3:
POST /api/audio/transcribe-with-speakers
→ Parakeet (transcription) + Sortformer (diarization) in parallel
→ merged by timestamp into speaker-labeled blocks
v0.13.0:1 and later will add the fixes actually needed for the problems the
WhisperX detour was meant to solve:
1. a memory cap on the parakeet-asr container, so a long-audio crash
   can't swap-thrash Spark 2 again
2. a chunking proxy in /api/audio/transcribe-with-speakers that
   splits inputs longer than 10 minutes before sending them to Sortformer
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -18,12 +18,6 @@ COPY models.yaml /app/models.yaml
|
|||||||
# time — survives docker rm + redeploy of the parakeet container.
|
# time — survives docker rm + redeploy of the parakeet container.
|
||||||
COPY parakeet_patches /app/parakeet_patches
|
COPY parakeet_patches /app/parakeet_patches
|
||||||
|
|
||||||
# WhisperX container build context (Dockerfile + requirements.txt + app/).
|
|
||||||
# The "Install WhisperX" action in spark-control ships these files to Spark 2
|
|
||||||
# over SSH, then runs `docker build` + `docker run` there. The container
|
|
||||||
# becomes a managed always-on service alongside parakeet-asr and magpie-tts.
|
|
||||||
COPY whisperx_container /app/whisperx_container
|
|
||||||
|
|
||||||
RUN pip install --no-cache-dir -e .
|
RUN pip install --no-cache-dir -e .
|
||||||
|
|
||||||
ENV BIND_PORT=9999
|
ENV BIND_PORT=9999
|
||||||
|
|||||||
@@ -209,17 +209,6 @@ def build_router(settings: Settings, deep_health: Any = None) -> APIRouter:
|
|||||||
raise HTTPException(r.status_code, r.text[:500])
|
raise HTTPException(r.status_code, r.text[:500])
|
||||||
return Response(content=r.content, media_type=r.headers.get("content-type", "application/json"))
|
return Response(content=r.content, media_type=r.headers.get("content-type", "application/json"))
|
||||||
|
|
||||||
def _whisperx_base() -> str:
|
|
||||||
return f"http://{settings.whisperx_host}:{settings.whisperx_port}"
|
|
||||||
|
|
||||||
async def _whisperx_healthy() -> bool:
|
|
||||||
try:
|
|
||||||
async with httpx.AsyncClient(timeout=2.0) as client:
|
|
||||||
r = await client.get(f"{_whisperx_base()}/health")
|
|
||||||
return r.status_code == 200 and bool(r.json().get("diarizer_loaded"))
|
|
||||||
except Exception:
|
|
||||||
return False
|
|
||||||
|
|
||||||
# ---- /api/audio/transcribe-with-speakers (STT + diarization, merged) ----
|
# ---- /api/audio/transcribe-with-speakers (STT + diarization, merged) ----
|
||||||
@router.post("/api/audio/transcribe-with-speakers")
|
@router.post("/api/audio/transcribe-with-speakers")
|
||||||
async def transcribe_with_speakers(
|
async def transcribe_with_speakers(
|
||||||
@@ -256,23 +245,8 @@ def build_router(settings: Settings, deep_health: Any = None) -> APIRouter:
|
|||||||
filename = file.filename or "audio.wav"
|
filename = file.filename or "audio.wav"
|
||||||
content_type = file.content_type or "application/octet-stream"
|
content_type = file.content_type or "application/octet-stream"
|
||||||
|
|
||||||
# Prefer WhisperX (single-pipeline, handles long audio properly) when it's
|
# Parakeet ASR + Sortformer diarizer in parallel. (A WhisperX detour
|
||||||
# installed and healthy. Fall back to Parakeet + Sortformer otherwise.
|
# lived here briefly — reverted in v0.13.0:0; see release notes.)
|
||||||
if await _whisperx_healthy():
|
|
||||||
files = {"file": (filename, body, content_type)}
|
|
||||||
try:
|
|
||||||
async with httpx.AsyncClient(timeout=1800.0) as client:
|
|
||||||
r = await client.post(
|
|
||||||
f"{_whisperx_base()}/v1/audio/transcribe-with-speakers",
|
|
||||||
files=files,
|
|
||||||
)
|
|
||||||
except httpx.HTTPError as e:
|
|
||||||
raise HTTPException(502, f"whisperx unreachable: {e}")
|
|
||||||
if r.status_code != 200:
|
|
||||||
raise HTTPException(r.status_code, r.text[:500])
|
|
||||||
return r.json()
|
|
||||||
|
|
||||||
# ── Legacy fallback: Parakeet ASR + Sortformer diarizer in parallel ──
|
|
||||||
async def _call_transcribe(client: httpx.AsyncClient) -> dict:
|
async def _call_transcribe(client: httpx.AsyncClient) -> dict:
|
||||||
files = {"file": (filename, body, content_type)}
|
files = {"file": (filename, body, content_type)}
|
||||||
data = {"response_format": "verbose_json"}
|
data = {"response_format": "verbose_json"}
|
||||||
|
|||||||
+1
-11
@@ -35,11 +35,6 @@ class Settings:
|
|||||||
magpie_host: str
|
magpie_host: str
|
||||||
magpie_user: str
|
magpie_user: str
|
||||||
magpie_container: str
|
magpie_container: str
|
||||||
whisperx_host: str
|
|
||||||
whisperx_user: str
|
|
||||||
whisperx_container: str
|
|
||||||
whisperx_port: int
|
|
||||||
whisperx_model: str
|
|
||||||
ssh_key_path: str
|
ssh_key_path: str
|
||||||
ssh_known_hosts: str
|
ssh_known_hosts: str
|
||||||
models_yaml: str
|
models_yaml: str
|
||||||
@@ -54,7 +49,7 @@ class Settings:
|
|||||||
def from_env(cls) -> "Settings":
|
def from_env(cls) -> "Settings":
|
||||||
spark2_host = _env("SPARK2_HOST")
|
spark2_host = _env("SPARK2_HOST")
|
||||||
spark2_user = _env("SPARK2_USER")
|
spark2_user = _env("SPARK2_USER")
|
||||||
# Parakeet, Magpie, and WhisperX all default to Spark 2 unless overridden.
|
# Parakeet and Magpie default to Spark 2 unless explicitly overridden.
|
||||||
return cls(
|
return cls(
|
||||||
spark1_host=_env("SPARK1_HOST"),
|
spark1_host=_env("SPARK1_HOST"),
|
||||||
spark1_user=_env("SPARK1_USER"),
|
spark1_user=_env("SPARK1_USER"),
|
||||||
@@ -66,11 +61,6 @@ class Settings:
|
|||||||
magpie_host=_env("MAGPIE_HOST") or spark2_host,
|
magpie_host=_env("MAGPIE_HOST") or spark2_host,
|
||||||
magpie_user=_env("MAGPIE_USER") or spark2_user,
|
magpie_user=_env("MAGPIE_USER") or spark2_user,
|
||||||
magpie_container=_env("MAGPIE_CONTAINER") or "magpie-tts",
|
magpie_container=_env("MAGPIE_CONTAINER") or "magpie-tts",
|
||||||
whisperx_host=_env("WHISPERX_HOST") or spark2_host,
|
|
||||||
whisperx_user=_env("WHISPERX_USER") or spark2_user,
|
|
||||||
whisperx_container=_env("WHISPERX_CONTAINER") or "whisperx-asr",
|
|
||||||
whisperx_port=int(_env("WHISPERX_PORT", "8002")),
|
|
||||||
whisperx_model=_env("WHISPERX_MODEL", "medium"),
|
|
||||||
ssh_key_path=_env("SSH_KEY_PATH"),
|
ssh_key_path=_env("SSH_KEY_PATH"),
|
||||||
ssh_known_hosts=_env("SSH_KNOWN_HOSTS"),
|
ssh_known_hosts=_env("SSH_KNOWN_HOSTS"),
|
||||||
models_yaml=_resolve_models_yaml(),
|
models_yaml=_resolve_models_yaml(),
|
||||||
|
|||||||
+4
-64
@@ -24,7 +24,6 @@ from .overrides import add_custom, delete_custom, extract_knobs_from_args, load_
|
|||||||
from .services import docker_state, run_action, services_from_settings
|
from .services import docker_state, run_action, services_from_settings
|
||||||
from .speech_models import SpeechModelsManager
|
from .speech_models import SpeechModelsManager
|
||||||
from .ssh import ssh_run
|
from .ssh import ssh_run
|
||||||
from .whisperx_install import WhisperXInstaller
|
|
||||||
from .swap import SwapManager
|
from .swap import SwapManager
|
||||||
from .updates import UpdateManager, get_update_status
|
from .updates import UpdateManager, get_update_status
|
||||||
from .validate import validate_launch
|
from .validate import validate_launch
|
||||||
@@ -40,7 +39,6 @@ hardware_probe = HardwareProbe(settings)
|
|||||||
nim_manager = NimManager(settings)
|
nim_manager = NimManager(settings)
|
||||||
deep_health = DeepHealth(settings)
|
deep_health = DeepHealth(settings)
|
||||||
speech_models = SpeechModelsManager(settings)
|
speech_models = SpeechModelsManager(settings)
|
||||||
whisperx_installer = WhisperXInstaller(settings)
|
|
||||||
|
|
||||||
app = FastAPI(title="spark-control", version="0.1.0")
|
app = FastAPI(title="spark-control", version="0.1.0")
|
||||||
|
|
||||||
@@ -537,68 +535,10 @@ async def post_speech_models_restart() -> dict:
|
|||||||
return result
|
return result
|
||||||
|
|
||||||
|
|
||||||
# ---- WhisperX install (Phase 2 of the WhisperX migration) ----
|
# NOTE: a WhisperX-on-Spark-2 install action lived here briefly in v0.12.0:0–4
|
||||||
|
# but was reverted in v0.13.0:0. NGC's custom-versioned torch on ARM64 made
|
||||||
@app.get("/api/whisperx/status")
|
# building torchaudio (which WhisperX needs via pyannote) unworkable. The
|
||||||
async def get_whisperx_status() -> dict:
|
# existing Parakeet + Sortformer pipeline stays as the audio path.
|
||||||
"""Is WhisperX installed + healthy on Spark 2 right now?"""
|
|
||||||
return await whisperx_installer.status()
|
|
||||||
|
|
||||||
|
|
||||||
@app.post("/api/whisperx/install")
|
|
||||||
async def post_whisperx_install() -> dict:
|
|
||||||
"""One-click install: ships the WhisperX build context from inside
|
|
||||||
spark-control to Spark 2, runs `docker build` + `docker run`, polls
|
|
||||||
/health until both models are loaded. Streams progress via the matching
|
|
||||||
GET /api/whisperx/install/{job_id}/stream SSE endpoint."""
|
|
||||||
try:
|
|
||||||
job = await whisperx_installer.trigger()
|
|
||||||
except RuntimeError as e:
|
|
||||||
raise HTTPException(409, str(e))
|
|
||||||
return {"job_id": job.id, "started_at": job.started_at}
|
|
||||||
|
|
||||||
|
|
||||||
@app.get("/api/whisperx/install/{job_id}")
|
|
||||||
async def get_whisperx_install(job_id: str) -> dict:
|
|
||||||
job = whisperx_installer.get(job_id)
|
|
||||||
if not job:
|
|
||||||
raise HTTPException(404, "unknown job")
|
|
||||||
return {
|
|
||||||
"id": job.id,
|
|
||||||
"state": job.state,
|
|
||||||
"phase": job.phase,
|
|
||||||
"lines": job.lines,
|
|
||||||
"started_at": job.started_at,
|
|
||||||
"finished_at": job.finished_at,
|
|
||||||
"returncode": job.returncode,
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@app.get("/api/whisperx/install/{job_id}/stream")
|
|
||||||
async def stream_whisperx_install(job_id: str) -> StreamingResponse:
|
|
||||||
job = whisperx_installer.get(job_id)
|
|
||||||
if not job:
|
|
||||||
raise HTTPException(404, "unknown job")
|
|
||||||
|
|
||||||
async def event_stream():
|
|
||||||
last_idx = 0
|
|
||||||
last_phase = ""
|
|
||||||
last_state = ""
|
|
||||||
while True:
|
|
||||||
new_lines = job.lines[last_idx:]
|
|
||||||
last_idx = len(job.lines)
|
|
||||||
for line in new_lines:
|
|
||||||
yield f"data: {json.dumps({'line': line})}\n\n"
|
|
||||||
if job.phase != last_phase or job.state != last_state:
|
|
||||||
yield f"event: phase\ndata: {json.dumps({'phase': job.phase, 'state': job.state})}\n\n"
|
|
||||||
last_phase = job.phase
|
|
||||||
last_state = job.state
|
|
||||||
if job.finished_at:
|
|
||||||
yield f"event: done\ndata: {json.dumps({'state': job.state, 'returncode': job.returncode})}\n\n"
|
|
||||||
return
|
|
||||||
await asyncio.sleep(0.6)
|
|
||||||
|
|
||||||
return StreamingResponse(event_stream(), media_type="text/event-stream")
|
|
||||||
|
|
||||||
|
|
||||||
@app.get("/api/endpoints")
|
@app.get("/api/endpoints")
|
||||||
|
|||||||
@@ -65,14 +65,6 @@ def services_from_settings(s: Settings) -> dict[str, ServiceDef]:
|
|||||||
container=s.magpie_container,
|
container=s.magpie_container,
|
||||||
port=s.magpie_port,
|
port=s.magpie_port,
|
||||||
),
|
),
|
||||||
"whisperx": ServiceDef(
|
|
||||||
name="whisperx",
|
|
||||||
kind="stt+diarize",
|
|
||||||
host=s.whisperx_host,
|
|
||||||
user=s.whisperx_user,
|
|
||||||
container=s.whisperx_container,
|
|
||||||
port=s.whisperx_port,
|
|
||||||
),
|
|
||||||
}
|
}
|
||||||
for entry in load_custom_services():
|
for entry in load_custom_services():
|
||||||
key = entry.get("key")
|
key = entry.get("key")
|
||||||
|
|||||||
+4
-118
@@ -664,116 +664,10 @@ async function onSpeechModelsRestart() {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// ===================== WhisperX install (v0.12) =====================
|
// NOTE: a WhisperX install action lived here briefly in v0.12 but was
|
||||||
|
// reverted in v0.13.0:0 — the NGC PyTorch container on ARM64 doesn't ship
|
||||||
const wxState = {
|
// torchaudio and we couldn't reliably build it from source. The existing
|
||||||
job_id: null,
|
// Parakeet + Sortformer pipeline stays as the audio path. See release notes.
|
||||||
eventsource: null,
|
|
||||||
timer_handle: null,
|
|
||||||
started_at: null,
|
|
||||||
};
|
|
||||||
|
|
||||||
async function renderWhisperXBanner() {
|
|
||||||
const card = el('#whisperx-install-card');
|
|
||||||
if (!card) return;
|
|
||||||
let status;
|
|
||||||
try {
|
|
||||||
status = await fetchJSON('/api/whisperx/status');
|
|
||||||
} catch {
|
|
||||||
card.classList.add('hidden');
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
if (status.installed && status.healthy) {
|
|
||||||
card.classList.add('hidden');
|
|
||||||
} else if (status.configured) {
|
|
||||||
card.classList.remove('hidden');
|
|
||||||
} else {
|
|
||||||
card.classList.add('hidden');
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
async function onWhisperXInstall() {
|
|
||||||
if (wxState.job_id) {
|
|
||||||
// Just re-attach to the running job
|
|
||||||
showWhisperXDialog();
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
if (!confirm('Install WhisperX on Spark 2? This builds a new Docker image (~10–15 min first time, mostly downloading pyannote + whisper weights). Parakeet/Magpie stay untouched.')) return;
|
|
||||||
try {
|
|
||||||
const r = await fetchJSON('/api/whisperx/install', { method: 'POST' });
|
|
||||||
attachToWhisperXInstall(r.job_id);
|
|
||||||
} catch (e) {
|
|
||||||
alert('Failed to start WhisperX install: ' + e.message);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
function showWhisperXDialog() {
|
|
||||||
el('#whisperx-progress-dialog').showModal();
|
|
||||||
}
|
|
||||||
|
|
||||||
function attachToWhisperXInstall(jobId) {
|
|
||||||
wxState.job_id = jobId;
|
|
||||||
el('#wx-prog-title').textContent = 'Installing WhisperX…';
|
|
||||||
el('#wx-prog-phase').textContent = 'Starting…';
|
|
||||||
el('#wx-prog-log').textContent = '';
|
|
||||||
showWhisperXDialog();
|
|
||||||
|
|
||||||
// Tick a timer
|
|
||||||
wxState.started_at = Date.now();
|
|
||||||
if (wxState.timer_handle) clearInterval(wxState.timer_handle);
|
|
||||||
wxState.timer_handle = setInterval(() => {
|
|
||||||
const sec = Math.max(0, Math.floor((Date.now() - wxState.started_at) / 1000));
|
|
||||||
const m = Math.floor(sec / 60);
|
|
||||||
el('#wx-prog-elapsed').textContent = `${m}:${(sec % 60).toString().padStart(2, '0')}`;
|
|
||||||
}, 500);
|
|
||||||
|
|
||||||
// Backfill snapshot then connect SSE
|
|
||||||
fetchJSON(`/api/whisperx/install/${jobId}`).then((snap) => {
|
|
||||||
el('#wx-prog-phase').textContent = snap.phase || 'Working…';
|
|
||||||
el('#wx-prog-log').textContent = (snap.lines || []).join('\n');
|
|
||||||
el('#wx-prog-log').scrollTop = el('#wx-prog-log').scrollHeight;
|
|
||||||
if (snap.finished_at) {
|
|
||||||
handleWhisperXDone(snap);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
const es = new EventSource(`/api/whisperx/install/${jobId}/stream`);
|
|
||||||
wxState.eventsource = es;
|
|
||||||
es.onmessage = (ev) => {
|
|
||||||
try {
|
|
||||||
const log = el('#wx-prog-log');
|
|
||||||
log.textContent += JSON.parse(ev.data).line + '\n';
|
|
||||||
log.scrollTop = log.scrollHeight;
|
|
||||||
} catch {}
|
|
||||||
};
|
|
||||||
es.addEventListener('phase', (ev) => {
|
|
||||||
try { el('#wx-prog-phase').textContent = JSON.parse(ev.data).phase; } catch {}
|
|
||||||
});
|
|
||||||
es.addEventListener('done', (ev) => {
|
|
||||||
try { handleWhisperXDone(JSON.parse(ev.data)); } catch {}
|
|
||||||
es.close();
|
|
||||||
wxState.eventsource = null;
|
|
||||||
});
|
|
||||||
es.onerror = () => { es.close(); wxState.eventsource = null; };
|
|
||||||
}).catch(() => {});
|
|
||||||
}
|
|
||||||
|
|
||||||
function handleWhisperXDone(d) {
|
|
||||||
if (wxState.timer_handle) { clearInterval(wxState.timer_handle); wxState.timer_handle = null; }
|
|
||||||
wxState.job_id = null;
|
|
||||||
const rc = d.returncode;
|
|
||||||
if (d.state === 'failed' || (rc !== 0 && rc != null)) {
|
|
||||||
el('#wx-prog-title').textContent = `WhisperX install failed (rc=${rc})`;
|
|
||||||
el('#wx-prog-phase').textContent = 'Failed — check the build log below';
|
|
||||||
} else {
|
|
||||||
el('#wx-prog-title').textContent = 'WhisperX installed';
|
|
||||||
el('#wx-prog-phase').textContent = 'Ready ✓ — appears in Always-on services below';
|
|
||||||
// Refresh services + banner state
|
|
||||||
setTimeout(() => {
|
|
||||||
renderServices();
|
|
||||||
renderWhisperXBanner();
|
|
||||||
}, 1000);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
async function onServiceAction(key) {
|
async function onServiceAction(key) {
|
||||||
if (state.service_action_in_flight) return;
|
if (state.service_action_in_flight) return;
|
||||||
@@ -1971,11 +1865,6 @@ async function init() {
|
|||||||
} catch {}
|
} catch {}
|
||||||
setupDashboardTabs();
|
setupDashboardTabs();
|
||||||
setupEndpointCollapse();
|
setupEndpointCollapse();
|
||||||
// WhisperX install button
|
|
||||||
const wxBtn = el('#wx-install');
|
|
||||||
if (wxBtn) wxBtn.addEventListener('click', onWhisperXInstall);
|
|
||||||
const wxCloseBtn = el('#wx-prog-close');
|
|
||||||
if (wxCloseBtn) wxCloseBtn.addEventListener('click', () => el('#whisperx-progress-dialog').close());
|
|
||||||
await loadModels();
|
await loadModels();
|
||||||
await pollStatus();
|
await pollStatus();
|
||||||
await renderServices();
|
await renderServices();
|
||||||
@@ -1985,14 +1874,11 @@ async function init() {
|
|||||||
loadDiskStatus();
|
loadDiskStatus();
|
||||||
// Speech-model patches panel — slow over SSH, runs after first paint.
|
// Speech-model patches panel — slow over SSH, runs after first paint.
|
||||||
renderSpeechModels();
|
renderSpeechModels();
|
||||||
// WhisperX install banner — show only when not yet installed/healthy.
|
|
||||||
renderWhisperXBanner();
|
|
||||||
setInterval(pollStatus, 5000);
|
setInterval(pollStatus, 5000);
|
||||||
setInterval(pollHardware, 8000); // every 8s
|
setInterval(pollHardware, 8000); // every 8s
|
||||||
setInterval(pollUpdates, 300000); // every 5 min
|
setInterval(pollUpdates, 300000); // every 5 min
|
||||||
setInterval(loadDiskStatus, 60000); // every 60s — disk state changes rarely
|
setInterval(loadDiskStatus, 60000); // every 60s — disk state changes rarely
|
||||||
setInterval(renderSpeechModels, 120000); // every 2 min — patches change rarely
|
setInterval(renderSpeechModels, 120000); // every 2 min — patches change rarely
|
||||||
setInterval(renderWhisperXBanner, 60000); // every 60s — auto-hides banner after install
|
|
||||||
}
|
}
|
||||||
|
|
||||||
init();
|
init();
|
||||||
|
|||||||
@@ -103,46 +103,6 @@
|
|||||||
|
|
||||||
<div class="tab-content" id="tab-audio" role="tabpanel" aria-labelledby="tab-audio-trigger">
|
<div class="tab-content" id="tab-audio" role="tabpanel" aria-labelledby="tab-audio-trigger">
|
||||||
|
|
||||||
<section id="whisperx-install-card" class="whisperx-install hidden">
|
|
||||||
<div class="wx-install-body">
|
|
||||||
<div class="wx-install-title">
|
|
||||||
<strong>Add WhisperX</strong>
|
|
||||||
<span class="tag ok">recommended</span>
|
|
||||||
</div>
|
|
||||||
<p class="muted small">
|
|
||||||
WhisperX is a single-container speech pipeline (faster-whisper for transcription + pyannote 3.1 for diarization)
|
|
||||||
designed to handle long audio cleanly. Replaces the Parakeet + Sortformer combo we patched together,
|
|
||||||
which crashed on a 90-min meeting. Pulled and built directly on Spark 2 (~10–15 min first time;
|
|
||||||
you only do this once).
|
|
||||||
</p>
|
|
||||||
<p class="muted small">
|
|
||||||
Requires a Hugging Face token at <code>~/.cache/huggingface/token</code> on Spark 2 (already set up).
|
|
||||||
</p>
|
|
||||||
<div class="wx-install-actions">
|
|
||||||
<button id="wx-install" class="btn primary">Install WhisperX</button>
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
</section>
|
|
||||||
|
|
||||||
<dialog id="whisperx-progress-dialog" class="modal">
|
|
||||||
<form method="dialog" class="modal-form">
|
|
||||||
<h3 id="wx-prog-title">Installing WhisperX…</h3>
|
|
||||||
<div class="phase-row">
|
|
||||||
<span class="spinner"></span>
|
|
||||||
<div class="phase" id="wx-prog-phase">Starting…</div>
|
|
||||||
<span class="spacer"></span>
|
|
||||||
<span class="timer" id="wx-prog-elapsed">0:00</span>
|
|
||||||
</div>
|
|
||||||
<details open>
|
|
||||||
<summary class="muted small">Build log</summary>
|
|
||||||
<pre id="wx-prog-log" class="log"></pre>
|
|
||||||
</details>
|
|
||||||
<div class="modal-actions">
|
|
||||||
<button type="button" id="wx-prog-close" class="btn">Close</button>
|
|
||||||
</div>
|
|
||||||
</form>
|
|
||||||
</dialog>
|
|
||||||
|
|
||||||
<section id="services-panel" class="services hidden">
|
<section id="services-panel" class="services hidden">
|
||||||
<div class="section-header">
|
<div class="section-header">
|
||||||
<h2 class="section-title">Always-on services</h2>
|
<h2 class="section-title">Always-on services</h2>
|
||||||
|
|||||||
@@ -907,15 +907,4 @@ main {
|
|||||||
.tab-content { display: none; }
|
.tab-content { display: none; }
|
||||||
.tab-content.active { display: block; }
|
.tab-content.active { display: block; }
|
||||||
|
|
||||||
/* ===== WhisperX install banner (v0.12) ===== */
|
/* (WhisperX install banner styles removed in v0.13.0:0 — see release notes) */
|
||||||
.whisperx-install {
|
|
||||||
background: var(--surface);
|
|
||||||
border: 1px solid var(--info);
|
|
||||||
border-radius: var(--radius);
|
|
||||||
padding: 16px 18px;
|
|
||||||
margin-bottom: 20px;
|
|
||||||
}
|
|
||||||
.wx-install-body { display: flex; flex-direction: column; gap: 10px; }
|
|
||||||
.wx-install-title { display: flex; align-items: center; gap: 10px; }
|
|
||||||
.wx-install-title strong { font-size: 15px; color: var(--text); }
|
|
||||||
.wx-install-actions { display: flex; gap: 10px; margin-top: 4px; }
|
|
||||||
|
|||||||
@@ -1,267 +0,0 @@
|
|||||||
"""WhisperX install action — ships the build context from inside spark-control
|
|
||||||
to Spark 2 over SSH, then runs `docker build` + `docker run` on Spark 2 and
|
|
||||||
streams progress back as SSE.
|
|
||||||
|
|
||||||
Pattern mirrors NimManager (see nim.py) but for a locally-built container
|
|
||||||
rather than an `nvcr.io` pull. Build context lives at
|
|
||||||
/app/whisperx_container/ inside the spark-control Docker image (set up by
|
|
||||||
the Dockerfile COPY directive).
|
|
||||||
|
|
||||||
Endpoints:
|
|
||||||
POST /api/whisperx/install — kick off
|
|
||||||
GET /api/whisperx/install/{job_id} — snapshot
|
|
||||||
GET /api/whisperx/install/{job_id}/stream — SSE phase + log lines
|
|
||||||
GET /api/whisperx/status — installed + healthy?
|
|
||||||
"""
|
|
||||||
from __future__ import annotations
|
|
||||||
import asyncio
|
|
||||||
import shlex
|
|
||||||
import uuid
|
|
||||||
from dataclasses import dataclass, field
|
|
||||||
from datetime import datetime, timezone
|
|
||||||
from pathlib import Path
|
|
||||||
from typing import Optional
|
|
||||||
|
|
||||||
import httpx
|
|
||||||
|
|
||||||
from .config import Settings
|
|
||||||
from .ssh import _base_args, ssh_run, ssh_stream, StreamHandle
|
|
||||||
|
|
||||||
|
|
||||||
# Build context shipped inside the spark-control image (Dockerfile COPYs it).
|
|
||||||
BUILD_CONTEXT_DIR = Path(__file__).resolve().parent.parent / "whisperx_container"
|
|
||||||
|
|
||||||
# Files we ship to Spark 2's build dir. Mapped local-name → remote-relative-path.
|
|
||||||
BUILD_FILES = {
|
|
||||||
"Dockerfile": "Dockerfile",
|
|
||||||
"requirements.txt": "requirements.txt",
|
|
||||||
"README.md": "README.md",
|
|
||||||
"app/main.py": "app/main.py",
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
|
||||||
class WhisperXInstallJob:
|
|
||||||
id: str
|
|
||||||
started_at: str
|
|
||||||
state: str = "starting" # starting | sending | building | running | done | failed
|
|
||||||
phase: str = "Starting…"
|
|
||||||
lines: list[str] = field(default_factory=list)
|
|
||||||
returncode: Optional[int] = None
|
|
||||||
finished_at: Optional[str] = None
|
|
||||||
|
|
||||||
def append(self, line: str) -> None:
|
|
||||||
self.lines.append(line)
|
|
||||||
if len(self.lines) > 1500:
|
|
||||||
del self.lines[: len(self.lines) - 1500]
|
|
||||||
|
|
||||||
|
|
||||||
class WhisperXInstaller:
|
|
||||||
def __init__(self, settings: Settings) -> None:
|
|
||||||
self.settings = settings
|
|
||||||
self.lock = asyncio.Lock()
|
|
||||||
self.jobs: dict[str, WhisperXInstallJob] = {}
|
|
||||||
self.current_job_id: Optional[str] = None
|
|
||||||
|
|
||||||
def get(self, job_id: str) -> WhisperXInstallJob | None:
|
|
||||||
return self.jobs.get(job_id)
|
|
||||||
|
|
||||||
async def status(self) -> dict:
|
|
||||||
"""Probe whether WhisperX is installed + healthy on its configured host."""
|
|
||||||
s = self.settings
|
|
||||||
host_present = bool(s.whisperx_host and s.whisperx_user)
|
|
||||||
if not host_present:
|
|
||||||
return {"configured": False, "installed": False, "healthy": False}
|
|
||||||
# Probe HTTP health
|
|
||||||
url = f"http://{s.whisperx_host}:{s.whisperx_port}/health"
|
|
||||||
try:
|
|
||||||
async with httpx.AsyncClient(timeout=3.0) as client:
|
|
||||||
r = await client.get(url)
|
|
||||||
if r.status_code == 200:
|
|
||||||
body = r.json()
|
|
||||||
return {
|
|
||||||
"configured": True,
|
|
||||||
"installed": True,
|
|
||||||
"healthy": True,
|
|
||||||
"model": body.get("model"),
|
|
||||||
"device": body.get("device"),
|
|
||||||
"diarizer_loaded": body.get("diarizer_loaded", False),
|
|
||||||
}
|
|
||||||
except Exception:
|
|
||||||
pass
|
|
||||||
# No HTTP — check if the container exists at all
|
|
||||||
container_present = await self._container_exists()
|
|
||||||
return {
|
|
||||||
"configured": True,
|
|
||||||
"installed": container_present,
|
|
||||||
"healthy": False,
|
|
||||||
"current_job_id": self.current_job_id,
|
|
||||||
}
|
|
||||||
|
|
||||||
async def _container_exists(self) -> bool:
|
|
||||||
s = self.settings
|
|
||||||
cmd = f"docker ps -a --filter name=^{s.whisperx_container}$ --format '{{{{.Names}}}}'"
|
|
||||||
rc, out, _ = await ssh_run(s.whisperx_host, s.whisperx_user, cmd, s, timeout=10)
|
|
||||||
return rc == 0 and s.whisperx_container in out
|
|
||||||
|
|
||||||
async def trigger(self) -> WhisperXInstallJob:
|
|
||||||
if self.lock.locked():
|
|
||||||
raise RuntimeError("a WhisperX install is already in progress")
|
|
||||||
s = self.settings
|
|
||||||
if not s.whisperx_host or not s.whisperx_user:
|
|
||||||
raise RuntimeError("whisperx host/user not configured")
|
|
||||||
for local_name in BUILD_FILES:
|
|
||||||
if not (BUILD_CONTEXT_DIR / local_name).exists():
|
|
||||||
raise RuntimeError(f"build context file missing inside spark-control image: {local_name}")
|
|
||||||
job = WhisperXInstallJob(
|
|
||||||
id=uuid.uuid4().hex[:8],
|
|
||||||
started_at=datetime.now(timezone.utc).isoformat(),
|
|
||||||
)
|
|
||||||
self.jobs[job.id] = job
|
|
||||||
self.current_job_id = job.id
|
|
||||||
asyncio.create_task(self._run(job))
|
|
||||||
return job
|
|
||||||
|
|
||||||
async def _run(self, job: WhisperXInstallJob) -> None:
|
|
||||||
async with self.lock:
|
|
||||||
try:
|
|
||||||
await self._do(job)
|
|
||||||
if job.state != "failed":
|
|
||||||
job.state = "done"
|
|
||||||
job.returncode = 0
|
|
||||||
job.phase = "Done — WhisperX is running on port 8002"
|
|
||||||
except Exception as e:
|
|
||||||
job.append(f"[error] {type(e).__name__}: {e}")
|
|
||||||
job.state = "failed"
|
|
||||||
if job.returncode is None:
|
|
||||||
job.returncode = 1
|
|
||||||
finally:
|
|
||||||
job.finished_at = datetime.now(timezone.utc).isoformat()
|
|
||||||
if self.current_job_id == job.id:
|
|
||||||
self.current_job_id = None
|
|
||||||
|
|
||||||
async def _ssh_pipe(self, host: str, user: str, remote_cmd: str,
|
|
||||||
payload: bytes, timeout: float = 60.0) -> tuple[bool, str, str]:
|
|
||||||
"""ssh user@host <remote_cmd> with payload piped to stdin."""
|
|
||||||
args = _base_args(self.settings) + [f"{user}@{host}", remote_cmd]
|
|
||||||
proc = await asyncio.create_subprocess_exec(
|
|
||||||
*args,
|
|
||||||
stdin=asyncio.subprocess.PIPE,
|
|
||||||
stdout=asyncio.subprocess.PIPE,
|
|
||||||
stderr=asyncio.subprocess.PIPE,
|
|
||||||
)
|
|
||||||
try:
|
|
||||||
stdout_b, stderr_b = await asyncio.wait_for(
|
|
||||||
proc.communicate(input=payload), timeout=timeout
|
|
||||||
)
|
|
||||||
except asyncio.TimeoutError:
|
|
||||||
proc.kill(); await proc.wait()
|
|
||||||
return False, "", f"timeout after {timeout}s"
|
|
||||||
return proc.returncode == 0, stdout_b.decode(errors="replace"), stderr_b.decode(errors="replace")
|
|
||||||
|
|
||||||
async def _do(self, job: WhisperXInstallJob) -> None:
|
|
||||||
s = self.settings
|
|
||||||
host = s.whisperx_host
|
|
||||||
user = s.whisperx_user
|
|
||||||
# NOTE: `~` does not expand inside shlex.quote() single-quotes (bit us
|
|
||||||
# in v0.12.0:0). Use a $HOME-relative path that the REMOTE shell
|
|
||||||
# expands; all path components are hardcoded so injection is moot.
|
|
||||||
build_dir_remote = "\"$HOME\"/whisperx-build"
|
|
||||||
build_dir_display = "~/whisperx-build"
|
|
||||||
|
|
||||||
# ── Phase 1: stage build context on Spark 2 ──
|
|
||||||
job.state = "sending"
|
|
||||||
job.phase = "Sending build context to Spark 2…"
|
|
||||||
job.append(f"$ ssh {user}@{host} 'mkdir -p {build_dir_display}/app'")
|
|
||||||
rc, out, err = await ssh_run(
|
|
||||||
host, user,
|
|
||||||
f"mkdir -p {build_dir_remote}/app && "
|
|
||||||
f"rm -f {build_dir_remote}/Dockerfile {build_dir_remote}/requirements.txt "
|
|
||||||
f"{build_dir_remote}/README.md {build_dir_remote}/app/main.py",
|
|
||||||
s, timeout=10,
|
|
||||||
)
|
|
||||||
if rc != 0:
|
|
||||||
job.append(f"[mkdir failed] {err.strip()}")
|
|
||||||
raise RuntimeError("failed to create build directory")
|
|
||||||
for local_name, remote_rel in BUILD_FILES.items():
|
|
||||||
local_path = BUILD_CONTEXT_DIR / local_name
|
|
||||||
body = local_path.read_bytes()
|
|
||||||
remote_path_for_shell = f"{build_dir_remote}/{remote_rel}"
|
|
||||||
# remote_rel is hardcoded ("Dockerfile" / "app/main.py" etc.) — safe
|
|
||||||
# to embed unquoted inside the double-quoted $HOME path.
|
|
||||||
cmd = f"cat > {remote_path_for_shell}"
|
|
||||||
ok, out, err = await self._ssh_pipe(host, user, cmd, body, timeout=30)
|
|
||||||
if not ok:
|
|
||||||
job.append(f"[scp {local_name} failed] {err.strip()[:200]}")
|
|
||||||
raise RuntimeError(f"failed to ship {local_name}")
|
|
||||||
job.append(f" → {build_dir_display}/{remote_rel} ({len(body)} bytes)")
|
|
||||||
|
|
||||||
# ── Phase 2: docker build ──
|
|
||||||
job.state = "building"
|
|
||||||
job.phase = "Building Docker image on Spark 2 (this is the slow part — 5–15 min if base layers aren't cached)…"
|
|
||||||
build_cmd = (
|
|
||||||
f"set -e; "
|
|
||||||
f"cd {build_dir_remote}; "
|
|
||||||
f"echo '=== docker build -t {s.whisperx_container}:latest . ==='; "
|
|
||||||
f"docker build -t {s.whisperx_container}:latest ."
|
|
||||||
)
|
|
||||||
job.append(f"$ {build_cmd}")
|
|
||||||
handle = StreamHandle()
|
|
||||||
async for line in ssh_stream(host, user, build_cmd, s, handle=handle):
|
|
||||||
job.append(line)
|
|
||||||
if "Step " in line and "/" in line:
|
|
||||||
# docker build progress: "Step 5/10 : RUN pip install ..."
|
|
||||||
job.phase = f"Building: {line.strip()[:120]}"
|
|
||||||
elif "Successfully built" in line or "naming to" in line:
|
|
||||||
job.phase = "Image built — preparing to start container…"
|
|
||||||
if (handle.returncode or 0) != 0:
|
|
||||||
job.returncode = handle.returncode
|
|
||||||
raise RuntimeError(f"docker build failed (rc={handle.returncode})")
|
|
||||||
|
|
||||||
# ── Phase 3: docker run ──
|
|
||||||
job.state = "running"
|
|
||||||
job.phase = "Starting container…"
|
|
||||||
run_cmd = (
|
|
||||||
f"set -e; "
|
|
||||||
f"echo '=== removing any prior {s.whisperx_container} container ==='; "
|
|
||||||
f"docker rm -f {s.whisperx_container} 2>/dev/null || true; "
|
|
||||||
f"echo '=== docker run -d --restart unless-stopped --name {s.whisperx_container} ==='; "
|
|
||||||
f"HF_TOKEN=$(cat ~/.cache/huggingface/token 2>/dev/null || true); "
|
|
||||||
f"if [ -z \"$HF_TOKEN\" ]; then echo 'WARN: no HF_TOKEN found at ~/.cache/huggingface/token — diarization will be disabled until you set one'; fi; "
|
|
||||||
f"docker run -d --restart unless-stopped "
|
|
||||||
f"--name {s.whisperx_container} "
|
|
||||||
f"--gpus all --memory=40g "
|
|
||||||
f"-p {s.whisperx_port}:{s.whisperx_port} "
|
|
||||||
f"-v whisperx-models:/root/.cache/huggingface "
|
|
||||||
f"-e HF_TOKEN=\"$HF_TOKEN\" "
|
|
||||||
f"-e WHISPER_MODEL={s.whisperx_model} "
|
|
||||||
f"{s.whisperx_container}:latest"
|
|
||||||
)
|
|
||||||
job.append(f"$ {run_cmd}")
|
|
||||||
rc, out, err = await ssh_run(host, user, run_cmd, s, timeout=60)
|
|
||||||
if rc != 0:
|
|
||||||
job.append(f"[docker run failed rc={rc}] {(err or out).strip()[:300]}")
|
|
||||||
raise RuntimeError("docker run failed")
|
|
||||||
job.append(out.strip())
|
|
||||||
|
|
||||||
# ── Phase 4: wait for /health to report ready ──
|
|
||||||
job.phase = "Container is starting; loading whisper + alignment + pyannote models (~60–120 s on first boot)…"
|
|
||||||
url = f"http://{s.whisperx_host}:{s.whisperx_port}/health"
|
|
||||||
ready = False
|
|
||||||
for i in range(60): # up to ~180 s
|
|
||||||
await asyncio.sleep(3)
|
|
||||||
try:
|
|
||||||
async with httpx.AsyncClient(timeout=4.0) as client:
|
|
||||||
r = await client.get(url)
|
|
||||||
if r.status_code == 200:
|
|
||||||
body = r.json()
|
|
||||||
if body.get("status") == "ready":
|
|
||||||
ready = True
|
|
||||||
job.append(f"[ready] {body}")
|
|
||||||
break
|
|
||||||
job.phase = f"Loading models (transcribe={body.get('transcribe_loaded')}, align={body.get('align_loaded')}, diarize={body.get('diarizer_loaded')})…"
|
|
||||||
except Exception:
|
|
||||||
pass
|
|
||||||
if not ready:
|
|
||||||
raise RuntimeError("container started but /health did not report ready within ~180 s — check `docker logs whisperx-asr` on Spark 2")
|
|
||||||
job.phase = "Done — WhisperX is healthy and reachable on port 8002"
|
|
||||||
@@ -1,96 +0,0 @@
|
|||||||
# WhisperX ASR + diarization container for Spark 2 (Blackwell GB10, sm_120).
|
|
||||||
#
|
|
||||||
# Replaces the custom Parakeet wrapper + Sortformer overlay with a single
|
|
||||||
# mainline pipeline: faster-whisper for transcription + pyannote.audio 3.1
|
|
||||||
# for diarization + wav2vec2 forced alignment for word-level timestamps.
|
|
||||||
#
|
|
||||||
# Build (on Spark 2, where Blackwell + nvcr.io credentials are available):
|
|
||||||
# docker build -t whisperx-asr:latest .
|
|
||||||
#
|
|
||||||
# Run:
|
|
||||||
# docker run -d --restart unless-stopped --name whisperx-asr \
|
|
||||||
# --gpus all --memory=40g \
|
|
||||||
# -p 8002:8002 \
|
|
||||||
# -v whisperx-models:/root/.cache/huggingface \
|
|
||||||
# -e HF_TOKEN="$(cat ~/.cache/huggingface/token)" \
|
|
||||||
# -e WHISPER_MODEL=medium \
|
|
||||||
# whisperx-asr:latest
|
|
||||||
#
|
|
||||||
# The memory cap is intentional: even if WhisperX hits a pathological input,
|
|
||||||
# it gets OOM-killed cleanly instead of swap-thrashing the whole Spark.
|
|
||||||
|
|
||||||
FROM nvcr.io/nvidia/pytorch:25.11-py3
|
|
||||||
|
|
||||||
# WhisperX runs ffmpeg under the hood for audio decoding.
|
|
||||||
# git + cmake + build-essential are needed to build torchaudio from source
|
|
||||||
# (see below). NOTE: no later step in this file actually removes them, so they
# remain in the final image; use a multi-stage build if image size matters.
|
|
||||||
RUN apt-get update \
|
|
||||||
&& apt-get install -y --no-install-recommends \
|
|
||||||
ffmpeg git cmake build-essential ninja-build \
|
|
||||||
&& rm -rf /var/lib/apt/lists/*
|
|
||||||
|
|
||||||
# Pin torch + torchvision to whatever NGC actually shipped so pip can't swap
|
|
||||||
# them out when it satisfies whisperx/pyannote deps. (NGC's torch is a custom
|
|
||||||
# build with a non-standard local version like "2.10.0a0+b558c986e8.nv25.11"
|
|
||||||
# — stock pip wheels would clobber it and break the ABI.)
|
|
||||||
RUN python3 -c "import torch, torchvision; \
|
|
||||||
import sys; \
|
|
||||||
sys.stdout.write(f'torch=={torch.__version__}\ntorchvision=={torchvision.__version__}\n')" \
|
|
||||||
> /tmp/torch-constraints.txt \
|
|
||||||
&& echo '── pinned torch versions ──' && cat /tmp/torch-constraints.txt
|
|
||||||
|
|
||||||
# NGC PyTorch images don't include torchaudio (NVIDIA optimizes for
|
|
||||||
# vision/text workloads). Stock torchaudio wheels are ABI-incompatible with
|
|
||||||
# NGC's custom torch 2.10a, so the only working option is building from
|
|
||||||
# source against the NGC torch already in the image.
|
|
||||||
#
|
|
||||||
# Build env knobs:
|
|
||||||
# USE_CUDA=1 — build CUDA kernels (we have a GPU)
|
|
||||||
# BUILD_SOX=0 — skip libsox (we only use audio decoding)
|
|
||||||
# TORCH_CUDA_ARCH_LIST=... — build kernels for Hopper + Blackwell datacenter
|
|
||||||
# + Blackwell consumer (sm_120 = GB10)
|
|
||||||
# --no-build-isolation — CRITICAL: PEP 517 build isolation creates a
|
|
||||||
# fresh env with no torch in it. torchaudio's
|
|
||||||
# setup.py imports torch to discover the build
|
|
||||||
# flags, so it crashes without this flag.
|
|
||||||
# With it, the build uses NGC's torch directly.
|
|
||||||
ENV USE_CUDA=1 BUILD_SOX=0 TORCH_CUDA_ARCH_LIST="9.0;10.0;12.0"
|
|
||||||
# Pre-install torchaudio's build-time deps (PEP 517 would normally install
|
|
||||||
# these in the isolated build env, but we just turned isolation off).
|
|
||||||
RUN pip install --break-system-packages --no-cache-dir \
|
|
||||||
"setuptools>=61" wheel ninja "pybind11>=2.10"
|
|
||||||
RUN pip install --break-system-packages --no-cache-dir --no-build-isolation \
|
|
||||||
git+https://github.com/pytorch/audio.git@v2.5.1 \
|
|
||||||
&& python3 -c "import torchaudio; print('torchaudio built:', torchaudio.__version__)"
|
|
||||||
|
|
||||||
# Append torchaudio to constraints so pip can't replace it later.
|
|
||||||
RUN python3 -c "import torchaudio; print(f'torchaudio=={torchaudio.__version__}')" \
|
|
||||||
>> /tmp/torch-constraints.txt \
|
|
||||||
&& echo '── final pinned versions ──' && cat /tmp/torch-constraints.txt
|
|
||||||
|
|
||||||
# Install whisperx + the FastAPI wrapper deps under the torch+torchaudio
|
|
||||||
# constraint. pip will satisfy whisperx/pyannote without swapping any of the
|
|
||||||
# pytorch-family packages.
|
|
||||||
COPY requirements.txt /tmp/requirements.txt
|
|
||||||
RUN pip install --break-system-packages --no-cache-dir \
|
|
||||||
-c /tmp/torch-constraints.txt -r /tmp/requirements.txt
|
|
||||||
|
|
||||||
# Pre-warm the default Whisper + alignment models at build time so first-call
|
|
||||||
# latency on a fresh container is small. (~3 GB cached into the image; if you
|
|
||||||
# want a smaller image, comment this out and accept the first-call download.)
|
|
||||||
ARG WHISPER_MODEL=medium
|
|
||||||
ENV WHISPER_MODEL=${WHISPER_MODEL}
|
|
||||||
RUN python3 -c "import whisperx; whisperx.load_model('${WHISPER_MODEL}', 'cpu', compute_type='int8')" \
|
|
||||||
&& python3 -c "import whisperx; whisperx.load_align_model(language_code='en', device='cpu')"
|
|
||||||
|
|
||||||
WORKDIR /opt/whisperx
|
|
||||||
COPY app /opt/whisperx/app
|
|
||||||
|
|
||||||
# Expose for spark-control's proxy on Spark 2
|
|
||||||
EXPOSE 8002
|
|
||||||
|
|
||||||
HEALTHCHECK --interval=30s --timeout=10s --start-period=180s \
|
|
||||||
CMD python3 -c "import urllib.request; urllib.request.urlopen('http://localhost:8002/health')" || exit 1
|
|
||||||
|
|
||||||
CMD ["python3", "-m", "uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8002", "--workers", "1"]
|
|
||||||
@@ -1,74 +0,0 @@
|
|||||||
# WhisperX container for Spark 2
|
|
||||||
|
|
||||||
Replaces the custom Parakeet wrapper + Sortformer overlay (v0.10/v0.11) with a
|
|
||||||
single mainline pipeline:
|
|
||||||
|
|
||||||
- **faster-whisper** (CTranslate2-optimized) for STT
|
|
||||||
- **pyannote.audio 3.1** for speaker diarization (sliding-window — handles
|
|
||||||
long files in bounded memory, fixes the Sortformer OOM on 90-min audio)
|
|
||||||
- **wav2vec2 forced alignment** for word-level timestamps
|
|
||||||
|
|
||||||
Exposes the same API surface spark-control already proxies to, so the cutover
|
|
||||||
is a one-URL change in the audio proxy:
|
|
||||||
|
|
||||||
- `GET /health` — readiness probe
|
|
||||||
- `GET /v1/models` — model list
|
|
||||||
- `POST /v1/audio/transcriptions` — OpenAI-shaped STT
|
|
||||||
- `POST /v1/audio/transcribe-with-speakers` — merged diarized transcript
|
|
||||||
(matches spark-control's response shape exactly)
|
|
||||||
|
|
||||||
## Deploy to Spark 2
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# 1. Copy this directory to Spark 2
|
|
||||||
rsync -av --delete image/whisperx_container/ <spark-user>@<spark-2-ip>:~/whisperx-build/
|
|
||||||
|
|
||||||
# 2. SSH in and build
|
|
||||||
ssh <spark-user>@<spark-2-ip>
|
|
||||||
cd ~/whisperx-build
|
|
||||||
docker build -t whisperx-asr:latest .
|
|
||||||
|
|
||||||
# 3. Run alongside the existing parakeet-asr (which stays on 8000 for now)
|
|
||||||
docker run -d --restart unless-stopped --name whisperx-asr \
|
|
||||||
--gpus all --memory=40g \
|
|
||||||
-p 8002:8002 \
|
|
||||||
-v whisperx-models:/root/.cache/huggingface \
|
|
||||||
-e HF_TOKEN="$(cat ~/.cache/huggingface/token)" \
|
|
||||||
-e WHISPER_MODEL=medium \
|
|
||||||
whisperx-asr:latest
|
|
||||||
|
|
||||||
# 4. Watch first-start logs (model load + first health check)
|
|
||||||
docker logs -f whisperx-asr
|
|
||||||
```
|
|
||||||
|
|
||||||
## Model size knobs
|
|
||||||
|
|
||||||
`WHISPER_MODEL` env var. Defaults to `medium`. Options:
|
|
||||||
|
|
||||||
| Model | Size | Speed (GB10) | Quality |
|
|
||||||
|---|---|---|---|
|
|
||||||
| `tiny` | ~39M | ~120x rt | low |
|
|
||||||
| `base` | ~74M | ~80x rt | ok |
|
|
||||||
| `small` | ~244M | ~50x rt | good |
|
|
||||||
| `medium`| ~769M | ~30x rt | excellent (**default**) |
|
|
||||||
| `large-v3`| ~1.5B | ~15x rt | best |
|
|
||||||
|
|
||||||
For a 90-min file, medium takes ~3 min STT + ~9 min diarize ≈ ~12 min total.
|
|
||||||
|
|
||||||
## Memory budget
|
|
||||||
|
|
||||||
The `--memory=40g` cap is intentional. Spark 2 has 122 GB unified, of which
|
|
||||||
~35 GB is consumed by parakeet-asr + magpie-tts. The 40 GB cap leaves
|
|
||||||
comfortable headroom for both the model weights (~5 GB) and pyannote's
|
|
||||||
in-memory features (~5–15 GB for a 90-min audio). If WhisperX hits a
|
|
||||||
pathological input it gets OOM-killed cleanly instead of swap-thrashing the
|
|
||||||
whole Spark — the symptom we hit with the unbounded Sortformer container.
|
|
||||||
|
|
||||||
## Rollback to Parakeet+Sortformer
|
|
||||||
|
|
||||||
```bash
|
|
||||||
docker stop whisperx-asr && docker rm whisperx-asr
|
|
||||||
```
|
|
||||||
|
|
||||||
The parakeet-asr container stays running throughout — spark-control's proxy
|
|
||||||
URL switch is reversible via config or version downgrade.
|
|
||||||
@@ -1,355 +0,0 @@
|
|||||||
"""WhisperX FastAPI wrapper — STT + speaker diarization in a single endpoint.
|
|
||||||
|
|
||||||
Endpoints (designed to be drop-in compatible with the existing spark-control
|
|
||||||
audio API surface, so the proxy just changes its upstream URL):
|
|
||||||
|
|
||||||
GET / — service info
|
|
||||||
GET /health — readiness probe
|
|
||||||
GET /v1/models — list loaded models
|
|
||||||
POST /v1/audio/transcriptions — OpenAI-shaped STT (no speakers)
|
|
||||||
POST /v1/audio/transcribe-with-speakers — merged diarized transcript
|
|
||||||
|
|
||||||
The /transcribe-with-speakers response shape EXACTLY matches what
|
|
||||||
spark-control's /api/audio/transcribe-with-speakers returns today (the one
|
|
||||||
that recap-relay's PR spec was written against), so swapping the upstream
|
|
||||||
from Parakeet+Sortformer to WhisperX is a one-URL change in the proxy.
|
|
||||||
"""
|
|
||||||
from __future__ import annotations
|
|
||||||
import os
|
|
||||||
import time
|
|
||||||
import tempfile
|
|
||||||
import logging
|
|
||||||
from contextlib import asynccontextmanager
|
|
||||||
from typing import Optional
|
|
||||||
|
|
||||||
import torch
|
|
||||||
import whisperx
|
|
||||||
from fastapi import FastAPI, File, Form, UploadFile, HTTPException
|
|
||||||
from fastapi.responses import JSONResponse
|
|
||||||
from fastapi.middleware.cors import CORSMiddleware
|
|
||||||
|
|
||||||
# Root logging: timestamped, level-tagged lines for every logger in the process.
logging.basicConfig(
    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger("whisperx-api")

# Runtime configuration, resolved once at import time.
# Use the GPU when torch can see one, otherwise fall back to CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# COMPUTE_TYPE can be overridden; the default depends on the chosen device.
COMPUTE_TYPE = os.environ.get(
    "COMPUTE_TYPE",
    "float16" if DEVICE == "cuda" else "int8",
)
WHISPER_MODEL = os.environ.get("WHISPER_MODEL", "medium")
DEFAULT_LANG = os.environ.get("DEFAULT_LANGUAGE", "en")
BATCH_SIZE = int(os.environ.get("BATCH_SIZE", "16"))
# An empty HF_TOKEN is treated the same as an unset one.
HF_TOKEN = os.environ.get("HF_TOKEN") or None
|
|
||||||
|
|
||||||
|
|
||||||
class WhisperXEngine:
    """Holds the WhisperX models and runs transcription and diarization.

    Models are loaded lazily by :meth:`load`; until then every model
    attribute is ``None`` and ``_loaded`` is ``False``.
    """

    def __init__(self) -> None:
        self.transcribe_model = None
        self.align_model = None
        self.align_metadata = None
        self.diarize_model = None
        self._loaded = False

    def load(self) -> None:
        """Load the transcription, alignment and (optionally) diarization models.

        Idempotent: returns immediately once a previous call has completed.
        The diarization pipeline is only attempted when ``HF_TOKEN`` is set,
        and a failure to load it is logged rather than raised, leaving
        ``diarize_model`` as ``None``.
        """
        if self._loaded:
            return
        logger.info(f"Loading whisper-{WHISPER_MODEL} on {DEVICE} ({COMPUTE_TYPE})")
        self.transcribe_model = whisperx.load_model(
            WHISPER_MODEL, DEVICE, compute_type=COMPUTE_TYPE
        )
        logger.info(f"Loading alignment model for {DEFAULT_LANG}")
        self.align_model, self.align_metadata = whisperx.load_align_model(
            language_code=DEFAULT_LANG, device=DEVICE
        )
        if HF_TOKEN:
            logger.info("Loading pyannote diarization pipeline (3.1)")
            try:
                self.diarize_model = whisperx.DiarizationPipeline(
                    use_auth_token=HF_TOKEN, device=DEVICE
                )
            except Exception as e:
                # Best effort: plain transcription must keep working even if
                # the diarizer cannot be loaded.
                logger.exception(f"Diarization pipeline failed to load: {e}")
                self.diarize_model = None
        else:
            logger.warning(
                "HF_TOKEN not set — diarization disabled. /transcribe-with-speakers "
                "will return 503. /transcriptions still works."
            )
        self._loaded = True
        logger.info("WhisperX engine ready")

    def transcribe(self, audio_bytes: bytes, filename: str, want_timestamps: bool = True) -> dict:
        """Transcribe raw audio bytes, optionally running forced alignment.

        Args:
            audio_bytes: Encoded audio, written to a temp file for decoding.
            filename: Original upload name. Currently unused; the temp file
                always gets a ``.wav`` suffix.
            want_timestamps: When true, run ``whisperx.align`` and return the
                aligned segments; otherwise return the raw segments.

        Returns:
            A dict with ``duration``, ``language``, ``text``, ``segments``,
            ``audio_path`` and ``audio``. On success the caller owns
            ``audio_path`` and must unlink it; ``audio`` is the decoded
            waveform so diarization can reuse it without re-decoding.

        Raises:
            Whatever the underlying decode/transcribe/align calls raise. In
            that case the temp file is removed here before re-raising, since
            the caller never receives its path.
        """
        if not self._loaded:
            self.load()
        tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
        tmp_path = tmp.name
        try:
            with tmp:
                tmp.write(audio_bytes)
            audio = whisperx.load_audio(tmp_path)
            # assumes load_audio yields 16 kHz samples — TODO confirm
            duration = float(audio.shape[0]) / 16000.0
            result = self.transcribe_model.transcribe(
                audio, batch_size=BATCH_SIZE, language=DEFAULT_LANG
            )
            language = result.get("language") or DEFAULT_LANG
            if want_timestamps:
                aligned = whisperx.align(
                    result["segments"],
                    self.align_model,
                    self.align_metadata,
                    audio,
                    DEVICE,
                    return_char_alignments=False,
                )
                segments = aligned.get("segments", [])
            else:
                segments = result.get("segments", [])
            full_text = " ".join(s.get("text", "").strip() for s in segments).strip()
            return {
                "duration": duration,
                "language": language,
                "text": full_text,
                "segments": segments,
                "audio_path": tmp_path,
                "audio": audio,  # caller can reuse for diarization without re-loading
            }
        except BaseException:
            # Ownership of the temp file only passes to the caller on a
            # successful return; on any failure it would otherwise leak.
            try:
                os.unlink(tmp_path)
            except OSError:
                pass
            raise

    def diarize(self, audio) -> dict:
        """Run the diarization pipeline on already-decoded audio.

        Raises:
            RuntimeError: If no diarization pipeline is loaded.
        """
        if self.diarize_model is None:
            raise RuntimeError(
                "Diarization pipeline not loaded (HF_TOKEN missing or load failed)"
            )
        diar = self.diarize_model(audio)
        return diar
|
|
||||||
|
|
||||||
|
|
||||||
engine = WhisperXEngine()
|
|
||||||
|
|
||||||
|
|
||||||
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Application lifespan hook: load the engine's models before serving.

    The load runs synchronously before control is yielded to the app; there
    is no teardown step after the yield.
    """
    engine.load()
    yield
|
|
||||||
|
|
||||||
|
|
||||||
# The ASGI application; models are loaded through the lifespan hook.
app = FastAPI(
    title="WhisperX ASR + Diarization",
    version="1.0.0",
    lifespan=lifespan,
)
# CORS: any origin may call the API. Credentials are disabled because a
# wildcard origin must not be combined with allow_credentials=True, and no
# endpoint in this module relies on cookies or browser-held credentials.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)
|
|
||||||
|
|
||||||
|
|
||||||
@app.get("/")
async def root() -> dict:
    """Return a static description of the service, its models and endpoints.

    The diarization entry is ``None`` when no diarization model is loaded.
    """
    diarization_name = (
        "pyannote-speaker-diarization-3.1" if engine.diarize_model else None
    )
    model_info = {
        "transcription": f"whisper-{WHISPER_MODEL}",
        "alignment": f"wav2vec2-{DEFAULT_LANG}",
        "diarization": diarization_name,
    }
    endpoint_info = {
        "transcriptions": "/v1/audio/transcriptions",
        "transcribe_with_speakers": "/v1/audio/transcribe-with-speakers",
        "models": "/v1/models",
        "health": "/health",
    }
    return {
        "service": "whisperx",
        "device": DEVICE,
        "models": model_info,
        "endpoints": endpoint_info,
    }
|
|
||||||
|
|
||||||
|
|
||||||
@app.get("/health")
async def health() -> dict:
    """Readiness probe: report overall status and which models are loaded.

    ``status`` is ``"ready"`` once the engine has finished loading and
    ``"loading"`` otherwise; the ``*_loaded`` flags reflect each model
    attribute individually.
    """
    status = "ready" if engine._loaded else "loading"
    has_transcriber = engine.transcribe_model is not None
    has_aligner = engine.align_model is not None
    has_diarizer = engine.diarize_model is not None
    return {
        "status": status,
        "transcribe_loaded": has_transcriber,
        "align_loaded": has_aligner,
        "diarizer_loaded": has_diarizer,
        "model": f"whisper-{WHISPER_MODEL}",
        "device": DEVICE,
    }
|
|
||||||
|
|
||||||
|
|
||||||
@app.get("/v1/models")
async def list_models() -> dict:
    """List available models in an OpenAI-style ``{"object": "list"}`` envelope.

    The speech-to-text model is always listed; the diarization model is
    listed only when a diarization pipeline is loaded.
    """
    stt_entry = {
        "id": f"whisper-{WHISPER_MODEL}",
        "object": "model",
        "owned_by": "openai",
        "kind": "stt",
    }
    if engine.diarize_model is None:
        return {"object": "list", "data": [stt_entry]}
    diarization_entry = {
        "id": "pyannote-speaker-diarization-3.1",
        "object": "model",
        "owned_by": "pyannote",
        "kind": "diarization",
    }
    return {"object": "list", "data": [stt_entry, diarization_entry]}
|
|
||||||
|
|
||||||
|
|
||||||
def _normalize_speaker(label: str) -> str:
|
|
||||||
"""WhisperX/pyannote uses 'SPEAKER_00' / 'SPEAKER_01' / ... — normalize to
|
|
||||||
the same 'Speaker_0' shape spark-control's existing endpoint returns."""
|
|
||||||
if not label:
|
|
||||||
return "Speaker_unknown"
|
|
||||||
if label.upper().startswith("SPEAKER_"):
|
|
||||||
idx = label.split("_", 1)[1].lstrip("0") or "0"
|
|
||||||
return f"Speaker_{idx}"
|
|
||||||
return label
|
|
||||||
|
|
||||||
|
|
||||||
def _segments_to_blocks(segments: list[dict]) -> list[dict]:
|
|
||||||
"""Convert WhisperX's per-utterance segments into the
|
|
||||||
[{start_ms, end_ms, speaker, text}, ...] block shape spark-control returns
|
|
||||||
today. Groups consecutive same-speaker segments into one block."""
|
|
||||||
blocks: list[dict] = []
|
|
||||||
cur = None
|
|
||||||
for s in segments:
|
|
||||||
spk_raw = s.get("speaker") or "Speaker_unknown"
|
|
||||||
spk = _normalize_speaker(spk_raw)
|
|
||||||
text = (s.get("text") or "").strip()
|
|
||||||
start_ms = int(float(s.get("start", 0)) * 1000)
|
|
||||||
end_ms = int(float(s.get("end", 0)) * 1000)
|
|
||||||
if not text:
|
|
||||||
continue
|
|
||||||
if cur is None or cur["speaker"] != spk or start_ms - cur["end_ms"] > 1500:
|
|
||||||
if cur is not None:
|
|
||||||
blocks.append(cur)
|
|
||||||
cur = {"start_ms": start_ms, "end_ms": end_ms, "speaker": spk, "text": text}
|
|
||||||
else:
|
|
||||||
cur["text"] = (cur["text"] + " " + text).strip()
|
|
||||||
cur["end_ms"] = end_ms
|
|
||||||
if cur is not None:
|
|
||||||
blocks.append(cur)
|
|
||||||
return blocks
|
|
||||||
|
|
||||||
|
|
||||||
@app.post("/v1/audio/transcriptions")
async def transcribe(
    file: UploadFile = File(...),
    model: Optional[str] = Form(default=None),
    language: Optional[str] = Form(default=None),
    response_format: Optional[str] = Form(default="json"),
    temperature: Optional[float] = Form(default=None),
    prompt: Optional[str] = Form(default=None),
):
    """OpenAI-shaped speech-to-text endpoint (no speaker labels).

    ``model``, ``language``, ``temperature`` and ``prompt`` are accepted as
    form fields but not used by this handler.

    Response by ``response_format``:
      * ``"text"``         — the transcript as a plain-text body.
      * ``"verbose_json"`` — task, language, duration, text, segments and a
        flat list of aligned words.
      * anything else      — ``{"text": ...}``.

    Raises:
        HTTPException: 503 while the engine is loading, 400 for an empty
            upload, 500 when transcription fails.
    """
    # Imported locally so this handler does not depend on a module-level
    # import being added elsewhere.
    from fastapi.responses import PlainTextResponse

    if not engine._loaded:
        raise HTTPException(status_code=503, detail="Engine loading")
    audio_bytes = await file.read()
    if not audio_bytes:
        raise HTTPException(status_code=400, detail="Empty file")

    start_t = time.time()
    audio_path = None
    try:
        # Alignment is only needed when word timestamps will be returned.
        result = engine.transcribe(
            audio_bytes,
            file.filename or "audio.wav",
            want_timestamps=(response_format == "verbose_json"),
        )
        audio_path = result.pop("audio_path", None)
        result.pop("audio", None)
    except Exception as e:
        logger.exception("Transcription failed")
        raise HTTPException(status_code=500, detail=f"Failed: {e}") from e
    finally:
        # The engine hands back the temp file path on success; remove it here.
        if audio_path:
            try:
                os.unlink(audio_path)
            except OSError:
                pass

    elapsed = time.time() - start_t
    duration = result.get("duration", 0.0)
    # Guard the ratio: a zero elapsed reading must not turn a successful
    # transcription into an unhandled ZeroDivisionError.
    speed = duration / elapsed if elapsed > 0 else 0.0
    logger.info(f"Transcribed {duration:.1f}s in {elapsed:.1f}s ({speed:.0f}x rt)")

    if response_format == "text":
        # PlainTextResponse sends the raw transcript; JSONResponse would
        # JSON-encode it (adding quotes/escapes) under a text/plain label.
        return PlainTextResponse(content=result["text"])
    if response_format == "verbose_json":
        words = [
            {
                "word": w.get("word"),
                "start": w.get("start"),
                "end": w.get("end"),
                "score": w.get("score"),
            }
            for s in result.get("segments", [])
            for w in s.get("words", []) or []
        ]
        return {
            "task": "transcribe",
            "language": result.get("language", "en"),
            "duration": duration,
            "text": result["text"],
            "segments": [
                {"start": s.get("start"), "end": s.get("end"), "text": s.get("text", "").strip()}
                for s in result.get("segments", [])
            ],
            "words": words,
        }
    return {"text": result["text"]}
|
|
||||||
|
|
||||||
|
|
||||||
@app.post("/v1/audio/transcribe-with-speakers")
async def transcribe_with_speakers(file: UploadFile = File(...)) -> dict:
    """Transcribe an upload and attach speaker labels in one call.

    Runs aligned transcription, diarizes the same decoded audio, assigns
    speakers to the aligned segments and merges them into speaker blocks.

    Returns:
        A dict with ``duration``, ``language``, ``speakers_detected`` (sorted,
        excluding ``Speaker_unknown``), ``segments`` (the merged blocks) and
        ``models``.

    Raises:
        HTTPException: 503 while the engine is loading or when no diarizer is
            available, 400 for an empty upload, 500 when processing fails.
    """
    if not engine._loaded:
        raise HTTPException(status_code=503, detail="Engine loading")
    if engine.diarize_model is None:
        raise HTTPException(
            status_code=503,
            detail="Diarization unavailable — HF_TOKEN not set or pyannote failed to load",
        )
    payload = await file.read()
    if not payload:
        raise HTTPException(status_code=400, detail="Empty file")

    started = time.time()
    audio_path = None
    try:
        upload_name = file.filename or "audio.wav"
        result = engine.transcribe(payload, upload_name, want_timestamps=True)
        audio_path = result.pop("audio_path", None)
        waveform = result.pop("audio")
        # Diarize the already-decoded audio rather than decoding again.
        logger.info("Running pyannote diarization…")
        diarization = engine.diarize(waveform)
        # assign_word_speakers receives the aligned segments and returns them
        # with speaker labels attached.
        labelled = whisperx.assign_word_speakers(
            diarization, {"segments": result["segments"]}
        )
        blocks = _segments_to_blocks(labelled.get("segments", []))
        known_speakers = {
            blk["speaker"] for blk in blocks if blk["speaker"] != "Speaker_unknown"
        }
        speakers = sorted(known_speakers)
    except Exception as e:
        logger.exception("Diarized transcription failed")
        raise HTTPException(status_code=500, detail=f"Failed: {e}")
    finally:
        # The engine hands back the temp file path on success; remove it here.
        if audio_path:
            try:
                os.unlink(audio_path)
            except OSError:
                pass

    elapsed = time.time() - started
    duration = result.get("duration", 0.0)
    logger.info(
        f"Transcribed+diarized {duration:.1f}s in {elapsed:.1f}s "
        f"({duration/elapsed:.0f}x rt), {len(speakers)} speakers, {len(blocks)} blocks"
    )
    model_names = {
        "transcription": f"whisper-{WHISPER_MODEL}",
        "diarization": "pyannote-speaker-diarization-3.1",
    }
    return {
        "duration": duration,
        "language": result.get("language", "en"),
        "speakers_detected": speakers,
        "segments": blocks,
        "models": model_names,
    }
|
|
||||||
@@ -1,5 +0,0 @@
|
|||||||
whisperx==3.4.3
|
|
||||||
fastapi>=0.115
|
|
||||||
uvicorn[standard]>=0.32
|
|
||||||
python-multipart>=0.0.9
|
|
||||||
soundfile>=0.12
|
|
||||||
@@ -1,10 +1,10 @@
|
|||||||
import { VersionInfo, IMPOSSIBLE } from '@start9labs/start-sdk'
|
import { VersionInfo, IMPOSSIBLE } from '@start9labs/start-sdk'
|
||||||
|
|
||||||
export const v0_1_0 = VersionInfo.of({
|
export const v0_1_0 = VersionInfo.of({
|
||||||
version: '0.12.0:4',
|
version: '0.13.0:0',
|
||||||
releaseNotes: {
|
releaseNotes: {
|
||||||
en_US:
|
en_US:
|
||||||
'v0.12.0:4 — hotfix: torchaudio build was failing with "ModuleNotFoundError: No module named torch" during its setup.py. Root cause: pip\'s PEP 517 build isolation creates a fresh Python env for the build that doesn\'t see NGC\'s torch (which is what we need for ABI compat). Fix: add --no-build-isolation to the pip install so the build uses the existing torch, plus pre-install setuptools/wheel/ninja/pybind11 since pip won\'t auto-pull them when build isolation is off. Should now finally compile torchaudio v2.5.1 against NGC\'s torch 2.10 and proceed to the whisperx install.',
|
'v0.13.0 — WhisperX migration reverted. Five hotfixes deep with no working build; the fundamental problem (NGC PyTorch on ARM64 ships a custom-versioned torch with no matching torchaudio anywhere) was always going to bite. All WhisperX install plumbing has been removed from spark-control: the install banner + progress dialog, the install endpoints, the audio-proxy WhisperX-preferred branch, the whisperx service registration, the WHISPERX_* env vars, and the build-context files. Spark 2 has been cleaned (container removed, build dir removed, ~6.8 GB of dangling layers + builder cache reclaimed). The dashboard now looks as it did before the migration attempt: Parakeet + Sortformer is the only audio path, unchanged. v0.13.0:1+ will add the actually-needed fixes: a memory cap on the parakeet container (so the 90-min audio crash can\'t take down Spark 2 again — worst case is a clean OOM-kill of the container), and a chunking proxy that splits long audio before sending to Sortformer.',
|
||||||
},
|
},
|
||||||
migrations: {
|
migrations: {
|
||||||
up: async ({ effects }) => {},
|
up: async ({ effects }) => {},
|
||||||
|
|||||||
Reference in New Issue
Block a user