95524f4983
After five hotfix iterations on the WhisperX install (v0.12.0:0–:4) we
never got a working docker build. The fundamental constraint isn't
patchable from outside NVIDIA: NGC PyTorch on ARM64 (the only base that
runs on Spark 2's GB10 Blackwell) ships a custom-versioned torch
2.10.0a0+b558c98 that has no pre-built torchaudio match anywhere.
WhisperX → pyannote → torchaudio is a hard dependency chain we couldn't
satisfy without rebuilding torchaudio against torch 2.10's alpha API.
Walking away cleanly is better than another night of chasing.
Removed from the codebase:
- image/whisperx_container/* (Dockerfile + requirements + app/main.py)
- image/app/whisperx_install.py (install manager + SSH ship-context logic)
- image/Dockerfile COPY whisperx_container
- WHISPERX_* config keys in config.py
- whisperx service entry in services.py
- WhisperX-preferred branch in audio_proxy.py
- /api/whisperx/* endpoints in server.py
- install banner + progress dialog in index.html
- render + handlers in app.js
- .whisperx-install styles in style.css
Spark 2 cleaned in tandem (user-authorized): container removed,
~/whisperx-build/ removed, 5.4 GB of dangling image layers + 1.3 GB of
builder cache reclaimed. parakeet-asr and magpie-tts unaffected and
healthy throughout.
The audio path is back to exactly what shipped in v0.11.0:3:
POST /api/audio/transcribe-with-speakers
→ Parakeet (transcription) + Sortformer (diarization) in parallel
→ merged by timestamp into speaker-labeled blocks
v0.13.0:1+ will add the actually-needed fixes that the WhisperX detour
was meant to address:
1. memory cap on the parakeet-asr container so a long-audio crash
can't swap-thrash Spark 2 again
2. a chunking proxy in /api/audio/transcribe-with-speakers that
splits inputs >10 min before Sortformer
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
129 lines
4.5 KiB
Python
129 lines
4.5 KiB
Python
"""Lifecycle controls for support-service containers (Parakeet, Magpie, etc.).

These are independent always-on containers that don't go through the LLM-swap
machinery. We just run `docker start|stop|restart <container>` via SSH on the
appropriate host.
"""
|
|
from __future__ import annotations
|
|
import time
|
|
from dataclasses import dataclass
|
|
from typing import Literal, Optional
|
|
|
|
from .config import Settings
|
|
from .ssh import ssh_run
|
|
|
|
|
|
# Negative cache of SSH targets that recently timed out, keyed by (host, user).
# While an entry is younger than _UNREACHABLE_TTL seconds, a repeated
# docker_state call short-circuits instead of re-paying the 6 s SSH connect
# timeout each time.
_UNREACHABLE_TTL: float = 25.0  # seconds an "unreachable" verdict stays valid
_unreachable_cache: dict[tuple[str, str], float] = {}  # (host, user) -> time.monotonic() stamp
|
|
|
|
|
|
def _is_recently_unreachable(host: str, user: str) -> bool:
    """Return True if (host, user) was marked unreachable within the TTL.

    Looks up the ``time.monotonic()`` stamp stored in ``_unreachable_cache``
    and compares its age against ``_UNREACHABLE_TTL``. Expired entries are
    not evicted here; they simply stop matching.
    """
    marked_at = _unreachable_cache.get((host, user))
    # Explicit None check: a truthiness test would treat a stored stamp of
    # 0.0 as "no entry".
    if marked_at is None:
        return False
    return time.monotonic() - marked_at < _UNREACHABLE_TTL
|
|
|
|
|
|
def _mark_unreachable(host: str, user: str) -> None:
    """Record the current monotonic time as the moment (host, user) timed out."""
    cache_key = (host, user)
    stamp = time.monotonic()
    _unreachable_cache[cache_key] = stamp
|
|
|
|
|
|
def _clear_unreachable(host: str, user: str) -> None:
    """Drop any cached "unreachable" verdict for (host, user); no-op if absent."""
    cache_key = (host, user)
    if cache_key in _unreachable_cache:
        del _unreachable_cache[cache_key]
|
|
|
|
|
|
# Keys of the two built-in services registered by services_from_settings.
ServiceName = Literal[
    "parakeet",
    "magpie",
]
# Docker sub-commands that run_action is typed to accept.
ServiceAction = Literal[
    "start",
    "stop",
    "restart",
]
|
|
|
|
|
|
@dataclass(frozen=True)
class ServiceDef:
    """Immutable description of one support-service container.

    Field order is part of the interface (positional construction), so it
    must not be rearranged.
    """

    # Registry key of the service, e.g. "parakeet".
    name: str
    # Service category: 'stt' | 'tts' | …
    kind: str
    # SSH host the container runs on; empty means "not configured".
    host: str
    # SSH user for that host; empty means "not configured".
    user: str
    # Docker container name passed to `docker inspect/start/stop/restart`.
    container: str
    # Port of the service; not read by the lifecycle helpers in this module.
    port: int
|
|
|
|
|
|
def services_from_settings(s: Settings) -> dict[str, ServiceDef]:
    """Build the registry of controllable services, keyed by service name.

    The two built-in services ("parakeet" and "magpie") are read from the
    ``parakeet_*`` / ``magpie_*`` attributes of ``s``. Entries returned by
    ``load_custom_services()`` are added afterwards; an entry is skipped when
    it has no ``key`` or when its key is already registered, so a custom
    entry can never replace a built-in.

    A custom entry whose ``port`` is ``None`` or not numeric is registered
    with port 0 (the same value used when the port is absent) instead of
    raising, so one malformed entry cannot break the whole registry.
    """
    # Kept function-local as in the original.
    # NOTE(review): presumably avoids an import cycle — confirm.
    from .custom_services import load_custom_services

    registry: dict[str, ServiceDef] = {
        "parakeet": ServiceDef(
            name="parakeet",
            kind="stt",
            host=s.parakeet_host,
            user=s.parakeet_user,
            container=s.parakeet_container,
            port=s.parakeet_port,
        ),
        "magpie": ServiceDef(
            name="magpie",
            kind="tts",
            host=s.magpie_host,
            user=s.magpie_user,
            container=s.magpie_container,
            port=s.magpie_port,
        ),
    }

    for entry in load_custom_services():
        key = entry.get("key")
        if not key or key in registry:
            continue
        try:
            port = int(entry.get("port", 0))
        except (TypeError, ValueError):
            # Malformed port in the custom-service entry: fall back to the
            # "no port" default rather than failing every caller.
            port = 0
        registry[key] = ServiceDef(
            name=key,
            kind=entry.get("kind", ""),
            host=entry.get("host", ""),
            user=entry.get("user", ""),
            # The container name defaults to the service key.
            container=entry.get("container", key),
            port=port,
        )
    return registry
|
|
|
|
|
|
async def docker_state(settings: Settings, svc: ServiceDef) -> dict:
    """Get docker state (running, exited, restarting, etc.) + restart count.

    Runs ``docker inspect`` for ``svc.container`` over SSH as ``svc.user`` on
    ``svc.host`` with a 6 s timeout.

    The returned dict always has a ``"state"`` key:

    * ``"unconfigured"`` - host or user is empty; nothing is executed.
    * ``"unreachable"`` - the SSH call timed out (rc 124 or "timeout after"
      in the output), or it did so recently and the verdict is still cached.
    * ``"missing"`` - the command failed, or docker reported no such object;
      the raw output is included under ``"raw"``.
    * ``"unknown"`` - the output did not contain the expected fields.
    * otherwise the docker status string, together with ``started_at``,
      ``restart_count``, ``exit_code`` and ``error``.
    """
    import shlex

    if not svc.host or not svc.user:
        return {"state": "unconfigured", "restart_count": None, "uptime": None}
    if _is_recently_unreachable(svc.host, svc.user):
        return {"state": "unreachable", "host_unreachable": True, "restart_count": None, "uptime": None}

    # Go-template passed to `docker inspect`; fields are '|'-separated and
    # .State.Error is deliberately last because it is free-form text.
    inspect_format = (
        "{{.State.Status}}|{{.State.StartedAt}}|{{.RestartCount}}"
        "|{{.State.ExitCode}}|{{.State.Error}}"
    )
    # The container name may come from a custom-service entry, so quote it
    # before handing it to the remote shell. Ordinary names are unchanged.
    cmd = (
        f"docker inspect {shlex.quote(svc.container)} "
        f"--format '{inspect_format}' "
        "2>&1 || echo 'NOT_FOUND'"
    )
    rc, out, _ = await ssh_run(svc.host, svc.user, cmd, settings, timeout=6)
    out = out.strip()

    if rc == 124 or "timeout after" in out.lower():
        # Remember the timeout so the next calls don't wait for SSH again.
        _mark_unreachable(svc.host, svc.user)
        return {"state": "unreachable", "host_unreachable": True, "restart_count": None, "uptime": None}
    _clear_unreachable(svc.host, svc.user)

    no_such_object = "Error" in out and "no such object" in out.lower()
    if rc != 0 or out.startswith("NOT_FOUND") or no_such_object:
        return {"state": "missing", "restart_count": None, "uptime": None, "raw": out}

    # maxsplit=4 keeps any '|' inside the trailing error text intact.
    parts = out.split("|", 4)
    if len(parts) < 4:
        return {"state": "unknown", "raw": out}
    status, started_at, restart_count, exit_code = parts[:4]
    error = parts[4] if len(parts) > 4 else ""
    return {
        "state": status,
        "started_at": started_at,
        "restart_count": int(restart_count) if restart_count.isdigit() else None,
        # Exit codes may be printed with a leading '-'.
        "exit_code": int(exit_code) if exit_code.lstrip("-").isdigit() else None,
        "error": error or None,
    }
|
|
|
|
|
|
async def run_action(settings: Settings, svc: ServiceDef, action: ServiceAction) -> dict:
    """Run docker start/stop/restart on the target host.

    Executes ``docker <action> <svc.container>`` over SSH as ``svc.user`` on
    ``svc.host`` with a 30 s timeout.

    Returns ``{"ok": False, "error": ...}`` without running anything when the
    host or user is empty; otherwise a dict with ``ok`` (True when the exit
    code is 0), ``rc``, and the stripped ``stdout`` / ``stderr``.
    """
    import shlex

    if not svc.host or not svc.user:
        return {"ok": False, "error": "service host not configured"}

    # Both values end up in a remote shell command line; quoting leaves the
    # usual values (start/stop/restart, plain container names) unchanged and
    # neutralises anything containing shell metacharacters.
    cmd = f"docker {shlex.quote(action)} {shlex.quote(svc.container)}"
    rc, out, err = await ssh_run(svc.host, svc.user, cmd, settings, timeout=30)
    return {
        "ok": rc == 0,
        "rc": rc,
        "stdout": out.strip(),
        "stderr": err.strip(),
    }
|