v0.2.2 - spark-vllm-docker update checks + Apply Update

Backend:
- updates.py: get_update_status() runs git fetch + git rev-list --left-right --count HEAD...origin/main to learn ahead/behind/dirty, plus git log for pending commits
- UpdateManager class with asyncio.Lock; one update at a time
- POST /api/updates/apply triggers "git pull --ff-only && ./build-and-copy.sh -c" over SSH with streamed log + phase detection (Pulling / Building the vLLM container / Copying to peer Sparks)
- GET /api/updates returns {ok, behind, ahead, dirty, current, log[], branch}

Frontend:
- Persistent banner near footer: hidden when up-to-date, blue when N commits behind, warn (orange) when local dirty changes block update
- 'Show details' expands a list of pending commits
- 'Apply update' triggers the long-running build with phase + elapsed timer + collapsible logs
- Confirmation dialog explains the 5–40 min duration

Package: bump 0.2.2:0
This commit is contained in:
Grant
2026-05-12 11:26:55 -05:00
parent 9dde938348
commit 474417b458
6 changed files with 408 additions and 2 deletions
+65
View File
@@ -16,12 +16,14 @@ from .models import load_catalog
from .services import docker_state, run_action, services_from_settings from .services import docker_state, run_action, services_from_settings
from .ssh import ssh_run from .ssh import ssh_run
from .swap import SwapManager from .swap import SwapManager
from .updates import UpdateManager, get_update_status
settings = Settings.from_env() settings = Settings.from_env()
catalog = load_catalog(settings.models_yaml) catalog = load_catalog(settings.models_yaml)
swap_manager = SwapManager(settings, catalog) swap_manager = SwapManager(settings, catalog)
download_manager = DownloadManager(settings) download_manager = DownloadManager(settings)
update_manager = UpdateManager(settings)
app = FastAPI(title="spark-control", version="0.1.0") app = FastAPI(title="spark-control", version="0.1.0")
@@ -307,6 +309,69 @@ async def stream_download(job_id: str):
return StreamingResponse(gen(), media_type="text/event-stream") return StreamingResponse(gen(), media_type="text/event-stream")
@app.get("/api/updates")
async def get_updates() -> dict:
return await get_update_status(settings)
class UpdateRequest(BaseModel):
mode: Literal["solo", "cluster"] = "cluster"
@app.post("/api/updates/apply")
async def post_update_apply(req: UpdateRequest) -> dict:
if not settings.configured:
raise HTTPException(503, "spark1 not configured")
try:
job = await update_manager.trigger(req.mode)
except RuntimeError as e:
raise HTTPException(409, str(e))
return {"job_id": job.id, "mode": job.mode, "state": job.state}
@app.get("/api/updates/{job_id}")
async def get_update_job(job_id: str) -> dict:
job = update_manager.get(job_id)
if job is None:
raise HTTPException(404, "no such job")
return {
"id": job.id,
"mode": job.mode,
"state": job.state,
"phase": job.phase,
"started_at": job.started_at,
"finished_at": job.finished_at,
"returncode": job.returncode,
"lines": job.lines,
}
@app.get("/api/updates/{job_id}/stream")
async def stream_update(job_id: str):
job = update_manager.get(job_id)
if job is None:
raise HTTPException(404, "no such job")
async def gen():
sent = 0
last_phase = None
while True:
n = len(job.lines)
if n > sent:
for line in job.lines[sent:n]:
yield f"data: {json.dumps({'line': line})}\n\n"
sent = n
if job.phase != last_phase:
yield f"event: phase\ndata: {json.dumps({'state': job.state, 'phase': job.phase})}\n\n"
last_phase = job.phase
if job.returncode is not None and sent >= len(job.lines):
yield f"event: done\ndata: {json.dumps({'state': job.state, 'returncode': job.returncode})}\n\n"
return
await asyncio.sleep(0.5)
return StreamingResponse(gen(), media_type="text/event-stream")
@app.post("/api/test-connection") @app.post("/api/test-connection")
async def test_connection() -> dict: async def test_connection() -> dict:
"""Probe both Sparks with a `hostname` command. Useful for the StartOS setup flow.""" """Probe both Sparks with a `hostname` command. Useful for the StartOS setup flow."""
+136
View File
@@ -629,16 +629,152 @@ function handleDownloadDone(d) {
dlState.job_id = null; dlState.job_id = null;
} }
// ===================== updates (spark-vllm-docker) =====================
const updState = {
info: null,
job_id: null,
eventsource: null,
started_at: null,
timer_handle: null,
};
async function pollUpdates() {
try {
const info = await fetchJSON('/api/updates');
updState.info = info;
renderUpdateBanner();
} catch (e) {
console.warn('updates poll failed', e);
}
}
function renderUpdateBanner() {
const banner = el('#update-banner');
const info = updState.info;
const text = el('#ub-text');
const details = el('#ub-details');
const apply = el('#ub-apply');
const list = el('#ub-list');
const log = el('#ub-log');
if (!info || !info.ok) {
banner.classList.add('hidden');
return;
}
banner.classList.remove('hidden');
const behind = info.behind || 0;
const dirty = info.dirty || 0;
banner.classList.toggle('up-to-date', behind === 0 && !dirty);
banner.classList.toggle('warn', !!dirty);
if (dirty > 0) {
text.textContent = `${dirty} local change${dirty === 1 ? '' : 's'} in ~/spark-vllm-docker. Resolve before updating.`;
details.classList.add('hidden');
apply.classList.add('hidden');
} else if (behind === 0) {
text.textContent = `spark-vllm-docker is up to date (${info.current || ''})`;
details.classList.add('hidden');
apply.classList.add('hidden');
list.classList.add('hidden');
} else {
text.textContent = `${behind} commit${behind === 1 ? '' : 's'} behind upstream`;
details.classList.remove('hidden');
apply.classList.remove('hidden');
log.textContent = (info.log || []).join('\n') || '(no log)';
}
}
function ubTimerStart(startedAt) {
updState.started_at = startedAt;
if (updState.timer_handle) clearInterval(updState.timer_handle);
const tick = () => {
if (!updState.started_at) return;
const sec = Math.max(0, Math.floor((Date.now() - updState.started_at) / 1000));
const m = Math.floor(sec / 60);
const s = sec % 60;
el('#ub-elapsed').textContent = `${m}:${s.toString().padStart(2, '0')}`;
};
tick();
updState.timer_handle = setInterval(tick, 500);
}
async function applyUpdate() {
if (!confirm('This pulls the latest spark-vllm-docker and rebuilds the vLLM container. Can take 540 minutes; the cluster is unaffected until you swap to a different model. Continue?')) return;
try {
const r = await fetchJSON('/api/updates/apply', {
method: 'POST',
headers: { 'content-type': 'application/json' },
body: JSON.stringify({ mode: 'cluster' }),
});
attachToUpdate(r.job_id);
} catch (e) {
alert('Failed to start update: ' + e.message);
}
}
async function attachToUpdate(jobId) {
updState.job_id = jobId;
el('#ub-progress').classList.remove('hidden');
el('#ub-apply').classList.add('hidden');
el('#ub-stream').textContent = '';
el('#ub-phase').textContent = 'Starting…';
try {
const snap = await fetchJSON(`/api/updates/${jobId}`);
ubTimerStart(Date.parse(snap.started_at));
el('#ub-phase').textContent = snap.phase || 'Working…';
el('#ub-stream').textContent = (snap.lines || []).join('\n');
if (snap.returncode !== null) { handleUpdateDone(snap); return; }
} catch (e) {
ubTimerStart(Date.now());
}
const es = new EventSource(`/api/updates/${jobId}/stream`);
updState.eventsource = es;
es.onmessage = (ev) => {
try {
const d = JSON.parse(ev.data);
if (d.line !== undefined) {
const log = el('#ub-stream');
log.textContent += d.line + '\n';
log.scrollTop = log.scrollHeight;
}
} catch {}
};
es.addEventListener('phase', (ev) => {
try { el('#ub-phase').textContent = JSON.parse(ev.data).phase; } catch {}
});
es.addEventListener('done', (ev) => {
let d = {}; try { d = JSON.parse(ev.data); } catch {}
handleUpdateDone(d);
});
es.onerror = () => { es.close(); updState.eventsource = null; };
}
function handleUpdateDone(d) {
if (updState.eventsource) { updState.eventsource.close(); updState.eventsource = null; }
if (updState.timer_handle) { clearInterval(updState.timer_handle); updState.timer_handle = null; }
el('#ub-phase').textContent = d.state === 'failed' ? `Failed (rc=${d.returncode})` : 'Done ✓ — re-check from the banner.';
setTimeout(pollUpdates, 2000);
}
async function init() { async function init() {
setupCopyButtons(); setupCopyButtons();
el('#open-download').addEventListener('click', openDownloadForm); el('#open-download').addEventListener('click', openDownloadForm);
el('#dl-cancel').addEventListener('click', closeDownloadPanel); el('#dl-cancel').addEventListener('click', closeDownloadPanel);
el('#dl-start').addEventListener('click', startDownload); el('#dl-start').addEventListener('click', startDownload);
el('#dl-repo').addEventListener('keydown', (e) => { if (e.key === 'Enter') startDownload(); }); el('#dl-repo').addEventListener('keydown', (e) => { if (e.key === 'Enter') startDownload(); });
el('#ub-details').addEventListener('click', () => {
const list = el('#ub-list');
list.classList.toggle('hidden');
list.open = !list.open;
});
el('#ub-apply').addEventListener('click', applyUpdate);
await loadModels(); await loadModels();
await pollStatus(); await pollStatus();
await renderServices(); await renderServices();
pollUpdates();
setInterval(pollStatus, 5000); setInterval(pollStatus, 5000);
setInterval(pollUpdates, 300000); // every 5 min
} }
init(); init();
+24
View File
@@ -116,6 +116,30 @@
<section id="cards" class="cards"></section> <section id="cards" class="cards"></section>
</section> </section>
<section id="update-banner" class="update-banner hidden">
<div class="ub-row">
<span id="ub-text">Checking for updates…</span>
<span class="spacer"></span>
<button id="ub-details" class="btn small-btn hidden">Show details</button>
<button id="ub-apply" class="btn small-btn primary hidden">Apply update</button>
</div>
<details id="ub-list" class="hidden">
<summary class="muted small">Pending commits</summary>
<pre id="ub-log" class="snippet"></pre>
</details>
<div id="ub-progress" class="hidden">
<div class="phase-row">
<div class="phase" id="ub-phase">Applying update…</div>
<span class="spacer"></span>
<span class="timer" id="ub-elapsed">0:00</span>
</div>
<details>
<summary class="muted small">Show technical logs</summary>
<pre id="ub-stream" class="log"></pre>
</details>
</div>
</section>
<footer class="footer"> <footer class="footer">
<div class="health"> <div class="health">
<span class="health-item" id="h-vllm"><span class="dot"></span> vLLM</span> <span class="health-item" id="h-vllm"><span class="dot"></span> vLLM</span>
+21
View File
@@ -217,6 +217,27 @@ main {
word-break: break-word; word-break: break-word;
} }
/* ===== Update banner ===== */
.update-banner {
background: var(--surface);
border: 1px solid rgba(96, 165, 250, 0.4);
border-radius: var(--radius);
padding: 10px 14px;
margin-top: 18px;
font-size: 13px;
}
.update-banner.up-to-date {
border-color: var(--border);
color: var(--muted);
}
.update-banner.warn { border-color: var(--warn); }
.ub-row { display: flex; align-items: center; gap: 8px; flex-wrap: wrap; }
.ub-row .spacer { flex: 1; }
#ub-list { margin-top: 8px; }
#ub-list summary { cursor: pointer; padding: 4px 0; }
#ub-progress { margin-top: 10px; }
/* ===== Section header (title + action button) ===== */ /* ===== Section header (title + action button) ===== */
.section-header { .section-header {
+160
View File
@@ -0,0 +1,160 @@
"""Check for and apply updates to the upstream `eugr/spark-vllm-docker` checkout
on Spark 1. We don't auto-update — only display what's available and let the user
explicitly apply when they're ready.
"""
from __future__ import annotations
import asyncio
import uuid
from dataclasses import dataclass, field
from datetime import datetime, timezone
from typing import Optional
from .config import Settings
from .ssh import ssh_run, ssh_stream, StreamHandle
async def get_update_status(settings: Settings) -> dict:
"""Return upstream-vs-local commit info. Runs `git fetch` then `git log`.
No mutation other than refreshing remote refs (fetch). Safe to call frequently.
"""
if not settings.spark1_host or not settings.spark1_user:
return {"ok": False, "error": "spark1 not configured"}
script = (
"cd ~/spark-vllm-docker && "
"branch=$(git rev-parse --abbrev-ref HEAD) && "
"git fetch --quiet 2>&1 || true; "
"ahead_behind=$(git rev-list --left-right --count HEAD...origin/$branch 2>/dev/null || echo '0\t0'); "
"behind=$(echo \"$ahead_behind\" | awk '{print $2}'); "
"ahead=$(echo \"$ahead_behind\" | awk '{print $1}'); "
"current_hash=$(git rev-parse --short HEAD); "
"current_msg=$(git log -1 --pretty=format:%s); "
"dirty=$(git status --porcelain | wc -l); "
"echo BRANCH=\"$branch\"; "
"echo BEHIND=\"$behind\"; "
"echo AHEAD=\"$ahead\"; "
"echo DIRTY=\"$dirty\"; "
"echo CURRENT=\"$current_hash $current_msg\"; "
"echo ---LOG---; "
"git log HEAD..origin/$branch --pretty=format:'%h %s (%ar)' 2>/dev/null | head -30"
)
rc, out, err = await ssh_run(
settings.spark1_host,
settings.spark1_user,
script,
settings,
timeout=30,
)
if rc != 0:
return {"ok": False, "error": err.strip() or out.strip() or f"rc={rc}"}
info: dict = {"ok": True, "log": []}
in_log = False
for line in out.splitlines():
if line == "---LOG---":
in_log = True
continue
if in_log:
if line.strip():
info["log"].append(line)
continue
if "=" in line:
k, v = line.split("=", 1)
key = k.lower().strip()
val = v.strip()
if key in ("behind", "ahead", "dirty"):
info[key] = int(val) if val.isdigit() else 0
else:
info[key] = val
return info
@dataclass
class UpdateJob:
id: str
mode: str # 'cluster' | 'solo'
started_at: str
state: str = "starting"
lines: list[str] = field(default_factory=list)
returncode: Optional[int] = None
finished_at: Optional[str] = None
phase: str = "Starting…"
def append(self, line: str) -> None:
self.lines.append(line)
if len(self.lines) > 1000:
del self.lines[: len(self.lines) - 1000]
class UpdateManager:
def __init__(self, settings: Settings) -> None:
self.settings = settings
self.lock = asyncio.Lock()
self.jobs: dict[str, UpdateJob] = {}
self.current_job_id: Optional[str] = None
def get(self, job_id: str) -> UpdateJob | None:
return self.jobs.get(job_id)
async def trigger(self, mode: str = "cluster") -> UpdateJob:
if mode not in ("cluster", "solo"):
raise ValueError("mode must be 'cluster' or 'solo'")
if self.lock.locked():
raise RuntimeError("An update is already in progress")
job = UpdateJob(
id=uuid.uuid4().hex[:8],
mode=mode,
started_at=datetime.now(timezone.utc).isoformat(),
)
self.jobs[job.id] = job
self.current_job_id = job.id
asyncio.create_task(self._run(job))
return job
async def _run(self, job: UpdateJob) -> None:
async with self.lock:
try:
await self._do(job)
if job.state != "failed":
job.state = "done"
job.returncode = 0
job.phase = "Done"
except Exception as e:
job.append(f"[error] {type(e).__name__}: {e}")
job.state = "failed"
if job.returncode is None:
job.returncode = 1
finally:
job.finished_at = datetime.now(timezone.utc).isoformat()
if self.current_job_id == job.id:
self.current_job_id = None
async def _do(self, job: UpdateJob) -> None:
s = self.settings
if not s.spark1_host or not s.spark1_user:
raise RuntimeError("spark1 not configured")
flag = "-c" if job.mode == "cluster" else ""
cmd = (
f"cd ~/spark-vllm-docker && "
f"echo '=== git pull ===' && git pull --ff-only && "
f"echo '=== build-and-copy ===' && ./build-and-copy.sh {flag}"
).strip()
job.append(f"$ {cmd}")
job.state = "running"
job.phase = "Pulling latest changes…"
handle = StreamHandle()
async for line in ssh_stream(s.spark1_host, s.spark1_user, cmd, s, handle=handle):
job.append(line)
if "=== build-and-copy ===" in line:
job.phase = "Building the vLLM container…"
elif "Copy" in line and "complete" in line.lower():
job.phase = "Copying to peer Sparks…"
elif "Already up to date" in line:
job.phase = "No changes to pull; rebuilding…"
rc = handle.returncode or 0
if rc != 0:
job.state = "failed"
job.returncode = rc
+2 -2
View File
@@ -1,10 +1,10 @@
import { VersionInfo, IMPOSSIBLE } from '@start9labs/start-sdk' import { VersionInfo, IMPOSSIBLE } from '@start9labs/start-sdk'
export const v0_1_0 = VersionInfo.of({ export const v0_1_0 = VersionInfo.of({
version: '0.2.1:0', version: '0.2.2:0',
releaseNotes: { releaseNotes: {
en_US: en_US:
'Model download from the dashboard. Click "+ Download a new model", paste a Hugging Face repo (e.g. RedHatAI/...), choose solo or cluster, and watch %% progress with bytes, rate, and ETA. Drives ./hf-download.sh on Spark 1 over SSH and parses the tqdm output.', 'Update checking for spark-vllm-docker. Dashboard footer shows "N commits behind upstream" when applicable; click for the commit log, then "Apply update" runs git pull + ./build-and-copy.sh -c on Spark 1 with a streamed log and elapsed timer. No auto-apply — you confirm each update.',
}, },
migrations: { migrations: {
up: async ({ effects }) => {}, up: async ({ effects }) => {},