diff --git a/image/app/server.py b/image/app/server.py index ef73b30..bb496fe 100644 --- a/image/app/server.py +++ b/image/app/server.py @@ -16,12 +16,14 @@ from .models import load_catalog from .services import docker_state, run_action, services_from_settings from .ssh import ssh_run from .swap import SwapManager +from .updates import UpdateManager, get_update_status settings = Settings.from_env() catalog = load_catalog(settings.models_yaml) swap_manager = SwapManager(settings, catalog) download_manager = DownloadManager(settings) +update_manager = UpdateManager(settings) app = FastAPI(title="spark-control", version="0.1.0") @@ -307,6 +309,69 @@ async def stream_download(job_id: str): return StreamingResponse(gen(), media_type="text/event-stream") +@app.get("/api/updates") +async def get_updates() -> dict: + return await get_update_status(settings) + + +class UpdateRequest(BaseModel): + mode: Literal["solo", "cluster"] = "cluster" + + +@app.post("/api/updates/apply") +async def post_update_apply(req: UpdateRequest) -> dict: + if not settings.configured: + raise HTTPException(503, "spark1 not configured") + try: + job = await update_manager.trigger(req.mode) + except RuntimeError as e: + raise HTTPException(409, str(e)) + return {"job_id": job.id, "mode": job.mode, "state": job.state} + + +@app.get("/api/updates/{job_id}") +async def get_update_job(job_id: str) -> dict: + job = update_manager.get(job_id) + if job is None: + raise HTTPException(404, "no such job") + return { + "id": job.id, + "mode": job.mode, + "state": job.state, + "phase": job.phase, + "started_at": job.started_at, + "finished_at": job.finished_at, + "returncode": job.returncode, + "lines": job.lines, + } + + +@app.get("/api/updates/{job_id}/stream") +async def stream_update(job_id: str): + job = update_manager.get(job_id) + if job is None: + raise HTTPException(404, "no such job") + + async def gen(): + sent = 0 + last_phase = None + while True: + n = len(job.lines) + if n > sent: + for line in job.lines[sent:n]: + yield f"data: {json.dumps({'line': line})}\n\n" + sent = n + if job.phase != last_phase: + yield f"event: phase\ndata: {json.dumps({'state': job.state, 'phase': job.phase})}\n\n" + last_phase = job.phase + if job.returncode is not None and sent >= len(job.lines): + yield f"event: done\ndata: {json.dumps({'state': job.state, 'returncode': job.returncode})}\n\n" + return + await asyncio.sleep(0.5) + + return StreamingResponse(gen(), media_type="text/event-stream") + + @app.post("/api/test-connection") async def test_connection() -> dict: """Probe both Sparks with a `hostname` command. Useful for the StartOS setup flow.""" diff --git a/image/app/static/app.js b/image/app/static/app.js index 8efc98f..8b6adba 100644 --- a/image/app/static/app.js +++ b/image/app/static/app.js @@ -629,16 +629,152 @@ function handleDownloadDone(d) { dlState.job_id = null; } +// ===================== updates (spark-vllm-docker) ===================== + +const updState = { + info: null, + job_id: null, + eventsource: null, + started_at: null, + timer_handle: null, +}; + +async function pollUpdates() { + try { + const info = await fetchJSON('/api/updates'); + updState.info = info; + renderUpdateBanner(); + } catch (e) { + console.warn('updates poll failed', e); + } +} + +function renderUpdateBanner() { + const banner = el('#update-banner'); + const info = updState.info; + const text = el('#ub-text'); + const details = el('#ub-details'); + const apply = el('#ub-apply'); + const list = el('#ub-list'); + const log = el('#ub-log'); + + if (!info || !info.ok) { + banner.classList.add('hidden'); + return; + } + banner.classList.remove('hidden'); + const behind = info.behind || 0; + const dirty = info.dirty || 0; + banner.classList.toggle('up-to-date', behind === 0 && !dirty); + banner.classList.toggle('warn', !!dirty); + + if (dirty > 0) { + text.textContent = `${dirty} local change${dirty === 1 ? '' : 's'} in ~/spark-vllm-docker. Resolve before updating.`; + details.classList.add('hidden'); + apply.classList.add('hidden'); + } else if (behind === 0) { + text.textContent = `spark-vllm-docker is up to date (${info.current || ''})`; + details.classList.add('hidden'); + apply.classList.add('hidden'); + list.classList.add('hidden'); + } else { + text.textContent = `${behind} commit${behind === 1 ? '' : 's'} behind upstream`; + details.classList.remove('hidden'); + apply.classList.remove('hidden'); + log.textContent = (info.log || []).join('\n') || '(no log)'; + } +} + +function ubTimerStart(startedAt) { + updState.started_at = startedAt; + if (updState.timer_handle) clearInterval(updState.timer_handle); + const tick = () => { + if (!updState.started_at) return; + const sec = Math.max(0, Math.floor((Date.now() - updState.started_at) / 1000)); + const m = Math.floor(sec / 60); + const s = sec % 60; + el('#ub-elapsed').textContent = `${m}:${s.toString().padStart(2, '0')}`; + }; + tick(); + updState.timer_handle = setInterval(tick, 500); +} + +async function applyUpdate() { + if (!confirm('This pulls the latest spark-vllm-docker and rebuilds the vLLM container. Can take 5–40 minutes; the cluster is unaffected until you swap to a different model. Continue?')) return; + try { + const r = await fetchJSON('/api/updates/apply', { + method: 'POST', + headers: { 'content-type': 'application/json' }, + body: JSON.stringify({ mode: 'cluster' }), + }); + attachToUpdate(r.job_id); + } catch (e) { + alert('Failed to start update: ' + e.message); + } +} + +async function attachToUpdate(jobId) { + updState.job_id = jobId; + el('#ub-progress').classList.remove('hidden'); + el('#ub-apply').classList.add('hidden'); + el('#ub-stream').textContent = ''; + el('#ub-phase').textContent = 'Starting…'; + try { + const snap = await fetchJSON(`/api/updates/${jobId}`); + ubTimerStart(Date.parse(snap.started_at)); + el('#ub-phase').textContent = snap.phase || 'Working…'; + el('#ub-stream').textContent = (snap.lines || []).join('\n'); + if (snap.returncode !== null) { handleUpdateDone(snap); return; } + } catch (e) { + ubTimerStart(Date.now()); + } + const es = new EventSource(`/api/updates/${jobId}/stream`); + updState.eventsource = es; + es.onmessage = (ev) => { + try { + const d = JSON.parse(ev.data); + if (d.line !== undefined) { + const log = el('#ub-stream'); + log.textContent += d.line + '\n'; + log.scrollTop = log.scrollHeight; + } + } catch {} + }; + es.addEventListener('phase', (ev) => { + try { el('#ub-phase').textContent = JSON.parse(ev.data).phase; } catch {} + }); + es.addEventListener('done', (ev) => { + let d = {}; try { d = JSON.parse(ev.data); } catch {} + handleUpdateDone(d); + }); + es.onerror = () => { es.close(); updState.eventsource = null; }; +} + +function handleUpdateDone(d) { + if (updState.eventsource) { updState.eventsource.close(); updState.eventsource = null; } + if (updState.timer_handle) { clearInterval(updState.timer_handle); updState.timer_handle = null; } + el('#ub-phase').textContent = d.state === 'failed' ? `Failed (rc=${d.returncode})` : 'Done ✓ — re-check from the banner.'; + setTimeout(pollUpdates, 2000); +} + async function init() { setupCopyButtons(); el('#open-download').addEventListener('click', openDownloadForm); el('#dl-cancel').addEventListener('click', closeDownloadPanel); el('#dl-start').addEventListener('click', startDownload); el('#dl-repo').addEventListener('keydown', (e) => { if (e.key === 'Enter') startDownload(); }); + el('#ub-details').addEventListener('click', () => { + const list = el('#ub-list'); + list.classList.toggle('hidden'); + list.open = !list.open; + }); + el('#ub-apply').addEventListener('click', applyUpdate); await loadModels(); await pollStatus(); await renderServices(); + pollUpdates(); setInterval(pollStatus, 5000); + setInterval(pollUpdates, 300000); // every 5 min } init(); diff --git a/image/app/static/index.html b/image/app/static/index.html index 4f5dc18..33ed6bd 100644 --- a/image/app/static/index.html +++ b/image/app/static/index.html @@ -116,6 +116,30 @@
+ +