diff --git a/image/app/config.py b/image/app/config.py index eebd6a1..1e92871 100644 --- a/image/app/config.py +++ b/image/app/config.py @@ -29,6 +29,12 @@ class Settings: spark1_user: str spark2_host: str spark2_user: str + parakeet_host: str + parakeet_user: str + parakeet_container: str + magpie_host: str + magpie_user: str + magpie_container: str ssh_key_path: str ssh_known_hosts: str models_yaml: str @@ -39,11 +45,20 @@ class Settings: @classmethod def from_env(cls) -> "Settings": + spark2_host = _env("SPARK2_HOST") + spark2_user = _env("SPARK2_USER") + # Parakeet and Magpie default to Spark 2 unless explicitly overridden. return cls( spark1_host=_env("SPARK1_HOST"), - spark1_user=_env("SPARK1_USER", ""), - spark2_host=_env("SPARK2_HOST"), - spark2_user=_env("SPARK2_USER", ""), + spark1_user=_env("SPARK1_USER"), + spark2_host=spark2_host, + spark2_user=spark2_user, + parakeet_host=_env("PARAKEET_HOST") or spark2_host, + parakeet_user=_env("PARAKEET_USER") or spark2_user, + parakeet_container=_env("PARAKEET_CONTAINER", "parakeet-asr"), + magpie_host=_env("MAGPIE_HOST") or spark2_host, + magpie_user=_env("MAGPIE_USER") or spark2_user, + magpie_container=_env("MAGPIE_CONTAINER", "magpie-tts"), ssh_key_path=_env("SSH_KEY_PATH"), ssh_known_hosts=_env("SSH_KNOWN_HOSTS"), models_yaml=_resolve_models_yaml(), diff --git a/image/app/health.py b/image/app/health.py index 9b4d948..57a4c60 100644 --- a/image/app/health.py +++ b/image/app/health.py @@ -31,15 +31,15 @@ async def check_vllm(settings: Settings) -> dict: async def check_parakeet(settings: Settings) -> dict: base_url = ( - f"http://{settings.spark2_host}:{settings.parakeet_port}" - if settings.spark2_host + f"http://{settings.parakeet_host}:{settings.parakeet_port}" + if settings.parakeet_host else None ) - if not settings.spark2_host: - return {"ok": False, "error": "spark2 not configured", "base_url": base_url} + if not settings.parakeet_host: + return {"ok": False, "error": "parakeet host not configured", 
async def _no_health_probe(_settings) -> dict:
    """Stand-in probe for a service that has no HTTP health check registered."""
    return {"ok": False, "error": "no health check registered", "base_url": None}


# HTTP readiness probe for each support service, keyed by service name.
# Registering probes here (instead of an if/else on the name) keeps a newly
# added service from silently reporting another service's health.
_SERVICE_HEALTH_PROBES = {
    "parakeet": check_parakeet,
    "magpie": check_magpie,
}

# Lifecycle actions accepted by POST /api/services/{name}/{action}.
_SERVICE_ACTIONS = ("start", "stop", "restart")


@app.get("/api/services")
async def get_services() -> dict:
    """Report the lifecycle state of the support services (Parakeet, Magpie, …).

    Returns a mapping of service name to a dict with:
      - host / user / port / container / kind: configured values
      - docker_state: container status from ``docker_state`` (running |
        exited | restarting | missing | unconfigured | unknown)
      - restart_count / started_at / exit_code / error: taken from the docker
        state; ``None`` when docker did not report them
      - http_ready: whether the service's HTTP health probe reported ok
      - http_error: error reported by the HTTP health probe, if any
      - base_url: base URL reported by the health probe
      - detail: raw payload returned by the health probe, if any
      - model: ``detail["model"]`` when ``detail`` is a dict, else ``None``
    """
    services = services_from_settings(settings)

    async def describe(svc) -> dict:
        probe = _SERVICE_HEALTH_PROBES.get(svc.name, _no_health_probe)
        # The docker inspect (over SSH) and the HTTP probe are independent,
        # so run them concurrently rather than back to back.
        docker, http = await asyncio.gather(
            docker_state(settings, svc),
            probe(settings),
        )
        detail = http.get("detail")
        return {
            "host": svc.host,
            "user": svc.user,
            "port": svc.port,
            "container": svc.container,
            "kind": svc.kind,
            "base_url": http.get("base_url"),
            "http_ready": bool(http.get("ok")),
            "model": detail.get("model") if isinstance(detail, dict) else None,
            "docker_state": docker.get("state"),
            "restart_count": docker.get("restart_count"),
            "started_at": docker.get("started_at"),
            "exit_code": docker.get("exit_code"),
            "error": docker.get("error"),
            "http_error": http.get("error"),
            "detail": detail,
        }

    names = list(services)
    infos = await asyncio.gather(*(describe(services[n]) for n in names))
    return dict(zip(names, infos))


@app.post("/api/services/{name}/{action}")
async def service_action(name: str, action: str) -> dict:
    """Run a docker lifecycle action (start, stop or restart) on one service.

    Returns ``{"name": ..., "action": ..., **result}`` where ``result`` is the
    dict produced by ``run_action``.

    Raises:
        HTTPException(404): ``name`` is not a configured service.
        HTTPException(400): ``action`` is not start, stop or restart.
        HTTPException(500): the action did not succeed; the detail is the
            command's stderr, else the reported error, else "action failed".
    """
    svc = services_from_settings(settings).get(name)
    if svc is None:
        raise HTTPException(404, f"unknown service: {name}")
    if action not in _SERVICE_ACTIONS:
        raise HTTPException(400, f"unknown action: {action}")
    result = await run_action(settings, svc, action)  # type: ignore[arg-type]
    if not result.get("ok"):
        raise HTTPException(
            500, result.get("stderr") or result.get("error") or "action failed"
        )
    return {"name": name, "action": action, **result}
ServiceName = Literal["parakeet", "magpie"]
ServiceAction = Literal["start", "stop", "restart"]

# Actions run_action is willing to splice into a remote shell command.
_ALLOWED_ACTIONS = ("start", "stop", "restart")

# Go-template passed to `docker inspect --format`; fields are "|"-separated
# and the free-text error field is deliberately last.
_INSPECT_FORMAT = (
    "{{.State.Status}}|{{.State.StartedAt}}|{{.RestartCount}}"
    "|{{.State.ExitCode}}|{{.State.Error}}"
)

# Echoed by the remote shell when `docker inspect` exits non-zero.
_NOT_FOUND_MARKER = "NOT_FOUND"


@dataclass(frozen=True)
class ServiceDef:
    """Where a support-service container runs and how to reach it."""

    name: str
    kind: str  # 'stt' | 'tts' | …
    host: str
    user: str
    container: str
    port: int


def services_from_settings(s: Settings) -> dict[str, ServiceDef]:
    """Build the service table (keyed by service name) from the settings."""
    return {
        "parakeet": ServiceDef(
            name="parakeet",
            kind="stt",
            host=s.parakeet_host,
            user=s.parakeet_user,
            container=s.parakeet_container,
            port=s.parakeet_port,
        ),
        "magpie": ServiceDef(
            name="magpie",
            kind="tts",
            host=s.magpie_host,
            user=s.magpie_user,
            container=s.magpie_container,
            port=s.magpie_port,
        ),
    }


def _quote(arg: str) -> str:
    """Quote *arg* so the remote shell treats it as a single literal word."""
    import shlex  # only needed when building remote commands

    return shlex.quote(arg)


def _to_int(text: str) -> Optional[int]:
    """Return *text* parsed as an int, or None when it is not a number."""
    try:
        return int(text)
    except ValueError:
        return None


def _parse_inspect_output(out: str) -> dict:
    """Turn the output of the remote `docker inspect` command into a state dict.

    Returns ``state: "missing"`` when docker reported no such object (or
    printed nothing before the failure marker), ``state: "unknown"`` for any
    other failure or unparsable output, and otherwise the parsed fields.
    """
    text = out.strip()
    no_such_object = "Error" in text and "no such object" in text.lower()
    if text.startswith(_NOT_FOUND_MARKER) or no_such_object:
        return {"state": "missing", "restart_count": None, "uptime": None, "raw": text}
    # Any other docker failure still ends with the marker line. Report it as
    # unknown instead of trying to parse an error message as state fields.
    if any(line.strip() == _NOT_FOUND_MARKER for line in text.splitlines()):
        return {"state": "unknown", "raw": text}
    # maxsplit keeps a "|" inside the trailing error message intact.
    fields = text.split("|", 4)
    if len(fields) < 4:
        return {"state": "unknown", "raw": text}
    status, started_at, restart_count, exit_code = fields[:4]
    error = fields[4] if len(fields) > 4 else ""
    return {
        "state": status,
        "started_at": started_at,
        "restart_count": _to_int(restart_count),
        "exit_code": _to_int(exit_code),
        "error": error or None,
    }


async def docker_state(settings: Settings, svc: ServiceDef) -> dict:
    """Get docker state (running, exited, restarting, etc.) + restart count.

    Runs `docker inspect` for the service's container over SSH.

    Returns a dict whose ``state`` is ``"unconfigured"`` when the service has
    no host or user, ``"missing"`` when the SSH command failed or docker knows
    no such object, ``"unknown"`` when the output could not be interpreted,
    and otherwise the container status together with ``started_at``,
    ``restart_count``, ``exit_code`` and ``error``.
    """
    if not svc.host or not svc.user:
        return {"state": "unconfigured", "restart_count": None, "uptime": None}
    # stderr is folded into stdout and a marker is echoed on failure, so a
    # docker error is distinguishable from a real status line.
    cmd = (
        f"docker inspect {_quote(svc.container)} "
        f"--format {_quote(_INSPECT_FORMAT)} "
        f"2>&1 || echo {_quote(_NOT_FOUND_MARKER)}"
    )
    rc, out, err = await ssh_run(svc.host, svc.user, cmd, settings, timeout=10)
    if rc != 0:
        # Keep whatever diagnostics are available; stdout is usually empty
        # when the SSH command itself failed.
        return {
            "state": "missing",
            "restart_count": None,
            "uptime": None,
            "raw": out.strip() or err.strip(),
        }
    return _parse_inspect_output(out)


async def run_action(settings: Settings, svc: ServiceDef, action: ServiceAction) -> dict:
    """Run docker start/stop/restart on the target host.

    Returns ``{"ok": False, "error": ...}`` without contacting the host when
    the service has no host/user or *action* is not a supported action;
    otherwise ``{"ok", "rc", "stdout", "stderr"}`` for the executed command.
    """
    if not svc.host or not svc.user:
        return {"ok": False, "error": "service host not configured"}
    # The action is spliced into a shell command, so never trust the caller
    # to have validated it.
    if action not in _ALLOWED_ACTIONS:
        return {"ok": False, "error": f"unsupported action: {action}"}
    cmd = f"docker {action} {_quote(svc.container)}"
    rc, out, err = await ssh_run(svc.host, svc.user, cmd, settings, timeout=30)
    return {
        "ok": rc == 0,
        "rc": rc,
        "stdout": out.strip(),
        "stderr": err.strip(),
    }
"parakeet:restart" configured: true, timer_handle: null, }; @@ -83,6 +85,107 @@ function renderCurrent(status) { c.innerHTML = `${label}`; } +function classifyService(s) { + // returns one of: running | unhealthy | missing | unconfigured | starting + if (!s.host) return 'unconfigured'; + if (s.docker_state === 'missing') return 'missing'; + if (s.docker_state === 'restarting') return 'unhealthy'; + if (s.docker_state === 'exited') return 'unhealthy'; + if (s.docker_state === 'running' && !s.http_ready) return 'starting'; + if (s.docker_state === 'running' && s.http_ready) return 'running'; + return s.docker_state || 'unknown'; +} + +function statusLabel(cls) { + return { + running: 'Healthy', + unhealthy: 'Unhealthy', + starting: 'Starting…', + missing: 'Not installed', + unconfigured: 'Not configured', + unknown: 'Unknown', + }[cls] || cls; +} + +async function renderServices() { + let services = state.services; + // First render: fetch. + if (!services || Object.keys(services).length === 0) { + try { + services = await fetchJSON('/api/services'); + state.services = services; + } catch (e) { console.error('services fetch failed', e); return; } + } + const panel = el('#services-panel'); + const grid = el('#services-grid'); + const entries = Object.entries(services); + if (entries.length === 0) { panel.classList.add('hidden'); return; } + panel.classList.remove('hidden'); + grid.innerHTML = ''; + for (const [name, s] of entries) { + const cls = classifyService(s); + const card = document.createElement('div'); + card.className = `service-card ${cls}`; + const inFlight = state.service_action_in_flight && state.service_action_in_flight.startsWith(name + ':'); + const disable = (action) => { + // Disable buttons that don't make sense for the current state + if (inFlight) return true; + if (cls === 'unconfigured' || cls === 'missing') return true; + if (action === 'start' && (cls === 'running' || cls === 'starting')) return true; + if (action === 'stop' && cls !== 
'running' && cls !== 'starting' && cls !== 'unhealthy') return true; + return false; + }; + const hostRow = s.host + ? `
Host${escapeHtml(s.host)}:${s.port}
` + : `
Hostnot configured
`; + const modelRow = s.model + ? `
Model${escapeHtml(s.model)}
` + : ''; + const restartsRow = s.restart_count != null && s.restart_count > 1 + ? `
Restarts${s.restart_count}
` + : ''; + card.innerHTML = ` +
+ ${escapeHtml(name)} + ${escapeHtml(s.kind || '')} + ${statusLabel(cls)} +
+ ${hostRow} + ${modelRow} + ${restartsRow} +
+ + + +
+ `; + grid.appendChild(card); + } + for (const btn of grid.querySelectorAll('.btn[data-svc-action]')) { + btn.addEventListener('click', () => onServiceAction(btn.dataset.svcAction)); + } +} + +async function onServiceAction(key) { + if (state.service_action_in_flight) return; + const [name, action] = key.split(':'); + state.service_action_in_flight = key; + renderServices(); + try { + await fetchJSON(`/api/services/${name}/${action}`, { method: 'POST' }); + } catch (e) { + alert(`${action} ${name} failed: ${e.message}`); + } finally { + state.service_action_in_flight = null; + // Refresh services state + try { + state.services = await fetchJSON('/api/services'); + } catch {} + renderServices(); + pollStatus(); + } +} + function renderEndpoint(status) { const v = status.vllm || {}; const panel = el('#endpoint-panel'); @@ -269,6 +372,11 @@ async function pollStatus() { renderCurrent(status); renderEndpoint(status); renderHealth(status); + // Refresh services state lazily — every 5s poll triggers this too. + try { + state.services = await fetchJSON('/api/services'); + renderServices(); + } catch {} if (status.current_swap_job && status.current_swap_job !== state.swap_job_id) { attachToSwap(status.current_swap_job, /*needsBackfill=*/true); } else if (!status.current_swap_job && state.swap_job_id && !state.swap_eventsource) { @@ -392,6 +500,7 @@ async function init() { setupCopyButtons(); await loadModels(); await pollStatus(); + await renderServices(); setInterval(pollStatus, 5000); } diff --git a/image/app/static/index.html b/image/app/static/index.html index 98c4190..7fa3ba4 100644 --- a/image/app/static/index.html +++ b/image/app/static/index.html @@ -63,7 +63,15 @@ -
+ + +
+

LLM swap

+
+