Files
spark-control/image/app/services.py
T
Keysat 5a0bfba6a3 v0.12.0:0 - WhisperX as a one-click dashboard install + managed service
Replaces the manual rsync+build+run with a proper spark-control feature.
First in the audio path that doesn't require shell access on Spark 2.

What's in the box
─────────────────
* image/whisperx_container/   - the build context (Dockerfile, requirements,
  app/main.py FastAPI wrapper). Mainline pipeline: faster-whisper for STT +
  pyannote 3.1 for diarization + wav2vec2 forced alignment. Single endpoint
  /v1/audio/transcribe-with-speakers returns the exact same shape spark-
  control's existing endpoint does, so the recap-relay PR spec needs no
  changes when we cut over.

* image/app/whisperx_install.py - install manager. Ships the build context to
  Spark 2 over SSH, runs `docker build`, runs `docker run` with a 40 GB
  memory cap (vs Sortformer's unbounded memory use, which thrashed Spark 2 on
  a 90-min file), polls /health until both Whisper + pyannote report loaded.

* Audio proxy: /api/audio/transcribe-with-speakers now prefers WhisperX
  when its /health reports diarizer_loaded=true, falls back to the legacy
  Parakeet + Sortformer path otherwise. Same response shape either way.
  Clean cutover, easy rollback (`docker rm whisperx-asr`).

* Dashboard (Audio / Speech tab):
  - "Add WhisperX" banner appears when not installed, with a primary
    "Install WhisperX" button. One click triggers the install.
  - Build progress dialog with phase + elapsed timer + live build log via
    SSE (`/api/whisperx/install/{job_id}/stream`).
  - After install, WhisperX auto-registers as a managed service alongside
    Parakeet and Magpie (Start/Restart/Stop, deep-check, auto-restart).
  - Banner self-hides once /api/whisperx/status reports healthy.

New endpoints
─────────────
  GET  /api/whisperx/status
  POST /api/whisperx/install
  GET  /api/whisperx/install/{job_id}
  GET  /api/whisperx/install/{job_id}/stream  (SSE phase + log)

Config additions (env)
──────────────────────
  WHISPERX_HOST       (defaults to spark2_host)
  WHISPERX_USER       (defaults to spark2_user)
  WHISPERX_CONTAINER  (default: whisperx-asr)
  WHISPERX_PORT       (default: 8002)
  WHISPERX_MODEL      (default: medium; tiny/base/small/medium/large-v3)

Dockerfile
──────────
Added COPY whisperx_container /app/whisperx_container so the runtime
install manager can read the build context from inside the spark-control
image and ship it over SSH.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-18 21:02:26 -05:00

137 lines
4.7 KiB
Python

"""Lifecycle controls for support-service containers (Parakeet, Magpie, etc.).
These are independent always-on containers that don't go through the LLM-swap
machinery. We just run `docker start|stop|restart <container>` via SSH on the
appropriate host.
"""
from __future__ import annotations
import time
from dataclasses import dataclass
from typing import Literal, Optional
from .config import Settings
from .ssh import ssh_run
# Cache the "unreachable" verdict per (host, user) for a short period so that a
# repeated docker_state call doesn't re-pay the 6 s SSH connect timeout each time.
# Lifetime of a cached verdict, in seconds (compared against time.monotonic() deltas).
_UNREACHABLE_TTL = 25.0
# Maps (host, user) -> time.monotonic() reading taken when the host was last
# marked unreachable.
_unreachable_cache: dict[tuple[str, str], float] = {}
def _is_recently_unreachable(host: str, user: str) -> bool:
    """Tell whether (host, user) was marked unreachable less than the TTL ago."""
    marked_at = _unreachable_cache.get((host, user))
    if not marked_at:
        # No cached verdict (or a falsy timestamp): treat the host as reachable.
        return False
    elapsed = time.monotonic() - marked_at
    return elapsed < _UNREACHABLE_TTL
def _mark_unreachable(host: str, user: str) -> None:
    """Stamp (host, user) as unreachable as of the current monotonic clock reading."""
    cache_key = (host, user)
    _unreachable_cache[cache_key] = time.monotonic()
def _clear_unreachable(host: str, user: str) -> None:
    """Drop any cached "unreachable" verdict for (host, user); no-op when absent."""
    cache_key = (host, user)
    if cache_key in _unreachable_cache:
        del _unreachable_cache[cache_key]
# Keys of the built-in services registered by services_from_settings().
# Custom services use arbitrary keys and therefore are not listed here.
ServiceName = Literal["parakeet", "magpie", "whisperx"]
# Docker lifecycle verbs that run_action() is typed to take.
ServiceAction = Literal["start", "stop", "restart"]
@dataclass(frozen=True)
class ServiceDef:
    """Immutable description of one support-service container and where it runs."""

    # Registry key of the service (e.g. "parakeet").
    name: str
    # Service category: 'stt' | 'tts' | …
    kind: str
    # Host and user handed to ssh_run() to reach the machine running the container.
    host: str
    user: str
    # Container name given to `docker inspect` / `docker <action>`.
    container: str
    # Port associated with the service; not read by the code visible in this module.
    port: int
def services_from_settings(s: Settings) -> dict[str, ServiceDef]:
    """Build the registry of managed support services, keyed by service name.

    The built-in services (parakeet, magpie, whisperx) are read from the
    matching ``<name>_host`` / ``_user`` / ``_container`` / ``_port``
    attributes of *s*. Entries returned by ``load_custom_services()`` are
    added afterwards; an entry without a ``key``, or whose key is already
    registered, is skipped, so a custom entry never replaces an existing one.

    Missing or null custom fields fall back to their defaults (``""`` for
    kind/host/user, the key itself for container, ``0`` for port). A port
    that cannot be converted to ``int`` also becomes ``0``, so that a single
    malformed custom entry does not break the whole listing.
    """
    # Kept as a function-scope import, as in the original code.
    # NOTE(review): presumably this avoids an import cycle — confirm.
    from .custom_services import load_custom_services

    out: dict[str, ServiceDef] = {
        "parakeet": ServiceDef(
            name="parakeet",
            kind="stt",
            host=s.parakeet_host,
            user=s.parakeet_user,
            container=s.parakeet_container,
            port=s.parakeet_port,
        ),
        "magpie": ServiceDef(
            name="magpie",
            kind="tts",
            host=s.magpie_host,
            user=s.magpie_user,
            container=s.magpie_container,
            port=s.magpie_port,
        ),
        "whisperx": ServiceDef(
            name="whisperx",
            kind="stt+diarize",
            host=s.whisperx_host,
            user=s.whisperx_user,
            container=s.whisperx_container,
            port=s.whisperx_port,
        ),
    }
    for entry in load_custom_services():
        key = entry.get("key")
        if not key or key in out:
            continue
        # A null or non-numeric port must not abort the whole registry build.
        try:
            port = int(entry.get("port") or 0)
        except (TypeError, ValueError):
            port = 0
        # `or` (rather than a .get default) also covers fields stored as null.
        out[key] = ServiceDef(
            name=key,
            kind=entry.get("kind") or "",
            host=entry.get("host") or "",
            user=entry.get("user") or "",
            container=entry.get("container") or key,
            port=port,
        )
    return out
async def docker_state(settings: Settings, svc: ServiceDef) -> dict:
    """Get docker state (running, exited, restarting, etc.) + restart count.

    Runs ``docker inspect`` for ``svc.container`` over SSH on ``svc.host`` and
    returns a dict whose ``"state"`` key is one of:

    * ``"unconfigured"`` - the service has no host or user; nothing is run.
    * ``"unreachable"``  - the SSH call timed out, or the host was marked
      unreachable within the cache TTL (``"host_unreachable": True`` is set).
    * ``"missing"``      - the command failed or reported no such object;
      the captured output is returned under ``"raw"``.
    * ``"unknown"``      - the output did not have the expected field count.
    * otherwise the container's ``State.Status``, together with
      ``started_at``, ``restart_count``, ``exit_code`` and ``error``.

    Note that only the non-success results carry an ``"uptime"`` key
    (always ``None``); the success result exposes ``"started_at"`` instead.
    """
    import shlex  # function-scope so this block needs no new module-level import

    if not svc.host or not svc.user:
        return {"state": "unconfigured", "restart_count": None, "uptime": None}
    if _is_recently_unreachable(svc.host, svc.user):
        # Skip the SSH round-trip while the cached verdict is still fresh.
        return {"state": "unreachable", "host_unreachable": True, "restart_count": None, "uptime": None}

    # The container name is interpolated into a remote shell command line and
    # may come from a user-defined custom service, so it must be shell-quoted.
    container = shlex.quote(str(svc.container))
    cmd = (
        f"docker inspect {container} "
        f"--format '{{{{.State.Status}}}}|{{{{.State.StartedAt}}}}|{{{{.RestartCount}}}}|{{{{.State.ExitCode}}}}|{{{{.State.Error}}}}' "
        f"2>&1 || echo 'NOT_FOUND'"
    )
    rc, out, _ = await ssh_run(svc.host, svc.user, cmd, settings, timeout=6)
    out = out.strip()

    # NOTE(review): rc 124 / "timeout after" is presumably how ssh_run reports
    # a timeout — confirm against its implementation.
    if rc == 124 or "timeout after" in out.lower():
        _mark_unreachable(svc.host, svc.user)
        return {"state": "unreachable", "host_unreachable": True, "restart_count": None, "uptime": None}
    _clear_unreachable(svc.host, svc.user)

    # stderr is merged into stdout (2>&1), so docker's own error text can
    # precede the NOT_FOUND sentinel; check for both forms explicitly.
    no_such_object = "Error" in out and "no such object" in out.lower()
    if rc != 0 or out.startswith("NOT_FOUND") or no_such_object:
        return {"state": "missing", "restart_count": None, "uptime": None, "raw": out}

    # maxsplit=4 keeps any '|' inside State.Error within the last field.
    parts = out.split("|", 4)
    if len(parts) < 4:
        return {"state": "unknown", "raw": out}
    status, started_at, restart_count, exit_code = parts[:4]
    error = parts[4] if len(parts) > 4 else ""
    return {
        "state": status,
        "started_at": started_at,
        "restart_count": int(restart_count) if restart_count.isdigit() else None,
        # Strip a leading '-' so a negative exit code still parses as an int.
        "exit_code": int(exit_code) if exit_code.lstrip("-").isdigit() else None,
        "error": error or None,
    }
async def run_action(settings: Settings, svc: ServiceDef, action: ServiceAction) -> dict:
    """Run docker start/stop/restart on the target host.

    Returns ``{"ok": False, "error": ...}`` without contacting the host when
    the service has no host/user configured or when *action* is not one of
    ``"start"``, ``"stop"`` or ``"restart"``. Otherwise runs
    ``docker <action> <container>`` over SSH (30 s timeout) and returns
    ``{"ok", "rc", "stdout", "stderr"}``, where ``ok`` is ``rc == 0``.
    """
    import shlex  # function-scope so this block needs no new module-level import

    if not svc.host or not svc.user:
        return {"ok": False, "error": "service host not configured"}
    # The Literal annotation is not enforced at runtime, and the verb ends up
    # in a remote shell command, so reject anything but the supported verbs.
    if action not in ("start", "stop", "restart"):
        return {"ok": False, "error": f"unsupported action: {action!r}"}

    # Shell-quote the container name; it may come from a custom service entry.
    cmd = f"docker {action} {shlex.quote(str(svc.container))}"
    rc, out, err = await ssh_run(svc.host, svc.user, cmd, settings, timeout=30)
    return {
        "ok": rc == 0,
        "rc": rc,
        "stdout": out.strip(),
        "stderr": err.strip(),
    }