Compare commits
3 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 6434b01a95 | |||
| 5827683a09 | |||
| ee8c2406b8 |
@@ -84,6 +84,24 @@ Other services on your LAN can hit `GET /api/endpoints` to learn where the curre
|
||||
|
||||
`base_url` is filled in whenever Configure Sparks has been completed (even if the underlying service isn't currently up). Pair the URL with `ready: true` to safely route traffic.
|
||||
|
||||
## Reporting failures from external apps
|
||||
|
||||
Spark Control polls every 5 s, so a brief blip in Parakeet/Magpie/vLLM availability can slip between polls and never make it into the connectivity log. To capture short failures, an external app (e.g. Open WebUI) can POST whenever a call fails (or succeeds):
|
||||
|
||||
```bash
|
||||
curl -X POST http://<dashboard-url>/api/health-event \
|
||||
-H 'content-type: application/json' \
|
||||
-d '{
|
||||
"service": "parakeet",
|
||||
"ok": false,
|
||||
"source": "open-webui",
|
||||
"error": "HTTP 503",
|
||||
"ms": 420
|
||||
}'
|
||||
```
|
||||
|
||||
Fields: `service` (required), `ok` (required), `source` (optional, free-form), `error` (optional), `ms` (optional latency). Each POST appends a `report` event to the connectivity log alongside the polling-based transition events.
|
||||
|
||||
## Status
|
||||
|
||||
**v0.2.3** — installed and verified on a Start9 server. Five bundled LLMs in the catalog (qwen3-vl, gemma4, qwen36, qwen3-235b-fp8, qwen2.5-72b), plus any custom models added through the UI.
|
||||
|
||||
+87
-23
@@ -1,17 +1,28 @@
|
||||
"""Track Spark up/down transitions and cache discovered MAC addresses.
|
||||
"""Track up/down transitions for any subject (Sparks AND services) and cache MACs.
|
||||
|
||||
Persisted to /data/connectivity.json so history survives package restarts:
|
||||
Persisted to /data/connectivity.json. Schema:
|
||||
|
||||
{
|
||||
"macs": { "spark1": "aa:bb:..", "spark2": "11:22:.." },
|
||||
"current": { "spark1": "up", "spark2": "down" },
|
||||
"last_change": { "spark1": "2026-05-12T15:00:00Z", ... },
|
||||
"current": { "spark1": "up", "parakeet": "up", "magpie": "down", ... },
|
||||
"last_change": { ... },
|
||||
"events": [
|
||||
{ "spark": "spark2", "at": "2026-05-12T17:30:00Z", "transition": "down" },
|
||||
{ "spark": "spark2", "at": "2026-05-12T18:45:00Z", "transition": "up", "down_seconds": 4500 },
|
||||
...
|
||||
# Active-probe transition (logged when state flips during polling)
|
||||
{ "subject": "spark2", "at": "...", "kind": "transition",
|
||||
"transition": "down" },
|
||||
{ "subject": "spark2", "at": "...", "kind": "transition",
|
||||
"transition": "up", "down_seconds": 4500 },
|
||||
|
||||
# Passive report (logged whenever an external app POSTs to
|
||||
# /api/health-event regardless of state change)
|
||||
{ "subject": "parakeet", "at": "...", "kind": "report",
|
||||
"ok": false, "source": "open-webui",
|
||||
"detail": "Connection refused", "latency_ms": 320 },
|
||||
]
|
||||
}
|
||||
|
||||
Legacy events from v0.5 with `spark` instead of `subject` and no `kind` field
|
||||
are read transparently as kind="transition".
|
||||
"""
|
||||
from __future__ import annotations
|
||||
import json
|
||||
@@ -59,21 +70,24 @@ def load() -> dict:
|
||||
return d
|
||||
|
||||
|
||||
def record_mac(spark: str, mac: Optional[str]) -> None:
|
||||
def record_mac(subject: str, mac: Optional[str]) -> None:
|
||||
if not mac:
|
||||
return
|
||||
with _lock:
|
||||
d = _read()
|
||||
d.setdefault("macs", {})
|
||||
if d["macs"].get(spark) != mac:
|
||||
d["macs"][spark] = mac
|
||||
if d["macs"].get(subject) != mac:
|
||||
d["macs"][subject] = mac
|
||||
_write(d)
|
||||
|
||||
|
||||
def record_state(spark: str, reachable: bool) -> Optional[dict]:
|
||||
"""Update current state. If it differs from the last seen state, append an event.
|
||||
def record_state(subject: str, reachable: bool) -> Optional[dict]:
|
||||
"""Update current state for `subject`. If it differs from the last seen
|
||||
state, append a transition event. Returns the event dict if a transition
|
||||
was recorded, else None.
|
||||
|
||||
Returns the event dict if a transition was recorded, else None.
|
||||
`subject` can be a Spark host key (spark1/spark2) or a service name
|
||||
(parakeet/magpie/vllm).
|
||||
"""
|
||||
new_state = "up" if reachable else "down"
|
||||
now = datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")
|
||||
@@ -83,12 +97,17 @@ def record_state(spark: str, reachable: bool) -> Optional[dict]:
|
||||
d.setdefault("current", {})
|
||||
d.setdefault("last_change", {})
|
||||
d.setdefault("events", [])
|
||||
prev = d["current"].get(spark)
|
||||
prev = d["current"].get(subject)
|
||||
if prev == new_state:
|
||||
return None
|
||||
event: dict = {"spark": spark, "at": now, "transition": new_state}
|
||||
event: dict = {
|
||||
"subject": subject,
|
||||
"at": now,
|
||||
"kind": "transition",
|
||||
"transition": new_state,
|
||||
}
|
||||
# When we have a previous state and timestamp, compute duration
|
||||
last_change = d["last_change"].get(spark)
|
||||
last_change = d["last_change"].get(subject)
|
||||
if prev and last_change:
|
||||
try:
|
||||
prev_dt = datetime.fromisoformat(last_change.replace("Z", "+00:00"))
|
||||
@@ -99,28 +118,73 @@ def record_state(spark: str, reachable: bool) -> Optional[dict]:
|
||||
event["up_seconds"] = round(duration)
|
||||
except ValueError:
|
||||
pass
|
||||
d["current"][spark] = new_state
|
||||
d["last_change"][spark] = now
|
||||
d["current"][subject] = new_state
|
||||
d["last_change"][subject] = now
|
||||
d["events"].append(event)
|
||||
# Keep rolling window
|
||||
if len(d["events"]) > MAX_EVENTS:
|
||||
d["events"] = d["events"][-MAX_EVENTS:]
|
||||
_write(d)
|
||||
return event
|
||||
|
||||
|
||||
def get_mac(spark: str) -> Optional[str]:
|
||||
def record_report(
|
||||
subject: str,
|
||||
*,
|
||||
ok: bool,
|
||||
source: str = "external",
|
||||
detail: str = "",
|
||||
latency_ms: Optional[int] = None,
|
||||
) -> dict:
|
||||
"""Record a passive report from an external caller (e.g. Open WebUI got a
|
||||
503 calling Parakeet). Always appended to the events list; does NOT change
|
||||
the active-probe state (which only the polling probe is authoritative on).
|
||||
"""
|
||||
now = datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")
|
||||
with _lock:
|
||||
d = _read()
|
||||
d.setdefault("events", [])
|
||||
event: dict = {
|
||||
"subject": subject,
|
||||
"at": now,
|
||||
"kind": "report",
|
||||
"ok": bool(ok),
|
||||
"source": source or "external",
|
||||
}
|
||||
if detail:
|
||||
event["detail"] = detail
|
||||
if latency_ms is not None:
|
||||
event["latency_ms"] = int(latency_ms)
|
||||
d["events"].append(event)
|
||||
if len(d["events"]) > MAX_EVENTS:
|
||||
d["events"] = d["events"][-MAX_EVENTS:]
|
||||
_write(d)
|
||||
return event
|
||||
|
||||
|
||||
def get_mac(subject: str) -> Optional[str]:
|
||||
d = load()
|
||||
return d.get("macs", {}).get(spark)
|
||||
return d.get("macs", {}).get(subject)
|
||||
|
||||
|
||||
def _normalize_event(e: dict) -> dict:
|
||||
"""Promote legacy v0.5 events to the v0.6 shape so the UI sees one schema."""
|
||||
if "subject" in e:
|
||||
e.setdefault("kind", "transition")
|
||||
return e
|
||||
# Legacy: had "spark" + "transition" only
|
||||
if "spark" in e:
|
||||
e["subject"] = e.pop("spark")
|
||||
e.setdefault("kind", "transition")
|
||||
return e
|
||||
|
||||
|
||||
def summary() -> dict:
|
||||
"""Compact summary for the UI: known MACs, current state, recent events."""
|
||||
d = load()
|
||||
events = d.get("events", [])
|
||||
events = [_normalize_event(dict(e)) for e in d.get("events", [])]
|
||||
return {
|
||||
"macs": d.get("macs", {}),
|
||||
"current": d.get("current", {}),
|
||||
"last_change": d.get("last_change", {}),
|
||||
"events": events[-50:],
|
||||
"events": events[-80:],
|
||||
}
|
||||
|
||||
+48
-1
@@ -10,7 +10,7 @@ from pydantic import BaseModel
|
||||
from typing import Literal
|
||||
|
||||
from .config import Settings
|
||||
from .connectivity import get_mac, summary as connectivity_summary
|
||||
from .connectivity import get_mac, record_report, record_state, summary as connectivity_summary
|
||||
from .custom_services import add_custom_service, delete_custom_service
|
||||
from .download import DownloadManager
|
||||
from .hardware import HardwareProbe
|
||||
@@ -22,6 +22,7 @@ from .services import docker_state, run_action, services_from_settings
|
||||
from .ssh import ssh_run
|
||||
from .swap import SwapManager
|
||||
from .updates import UpdateManager, get_update_status
|
||||
from .validate import validate_launch
|
||||
from .wol import send_local_broadcast, send_via_peer
|
||||
|
||||
|
||||
@@ -136,6 +137,37 @@ async def get_connectivity() -> dict:
|
||||
return connectivity_summary()
|
||||
|
||||
|
||||
class HealthEventBody(BaseModel):
|
||||
service: str # e.g. "parakeet", "magpie", "vllm"
|
||||
ok: bool # true on success, false on failure
|
||||
source: str | None = None # what app reported (e.g. "open-webui")
|
||||
error: str | None = None # optional detail
|
||||
ms: int | None = None # optional latency
|
||||
|
||||
|
||||
@app.post("/api/health-event")
|
||||
async def post_health_event(body: HealthEventBody) -> dict:
|
||||
"""Passive endpoint: any LAN app can POST here when its call to one of our
|
||||
services succeeds or (more usefully) fails. We log the report into the
|
||||
connectivity history so a brief blip that polling misses still surfaces.
|
||||
|
||||
Example:
|
||||
curl -X POST http://<dashboard>/api/health-event \\
|
||||
-H 'content-type: application/json' \\
|
||||
-d '{"service":"parakeet","ok":false,"error":"503","source":"open-webui","ms":420}'
|
||||
"""
|
||||
if not body.service.strip():
|
||||
raise HTTPException(400, "service is required")
|
||||
event = record_report(
|
||||
body.service.strip(),
|
||||
ok=body.ok,
|
||||
source=(body.source or "external").strip(),
|
||||
detail=(body.error or "").strip(),
|
||||
latency_ms=body.ms,
|
||||
)
|
||||
return {"ok": True, "recorded": event}
|
||||
|
||||
|
||||
@app.post("/api/spark/{name}/wake")
|
||||
async def wake_spark(name: str) -> dict:
|
||||
"""Send a Wake-on-LAN magic packet for the named Spark.
|
||||
@@ -216,6 +248,8 @@ async def get_services() -> dict:
|
||||
results = await asyncio.gather(*[one(n) for n in services.keys()])
|
||||
for name, info in results:
|
||||
out[name] = info
|
||||
# Feed http reachability into the connectivity log (transition-only)
|
||||
record_state(name, bool(info.get("http_ready")))
|
||||
return out
|
||||
|
||||
|
||||
@@ -372,6 +406,10 @@ async def get_status() -> dict:
|
||||
check_parakeet(settings),
|
||||
check_magpie(settings),
|
||||
)
|
||||
# Feed health into the connectivity log (deduped — only logs on transition)
|
||||
record_state("vllm", bool(vllm.get("ok")))
|
||||
record_state("parakeet", bool(parakeet.get("ok")))
|
||||
record_state("magpie", bool(magpie.get("ok")))
|
||||
current_key = _identify_current_model(vllm.get("current_model"))
|
||||
return {
|
||||
"configured": settings.configured,
|
||||
@@ -397,6 +435,15 @@ class SwapRequest(BaseModel):
|
||||
dry_run: bool = False
|
||||
|
||||
|
||||
@app.post("/api/swap/{key}/validate")
|
||||
async def validate_swap(key: str) -> dict:
|
||||
"""Pre-flight check: run vLLM's argparse layer against the proposed launch
|
||||
command WITHOUT starting an engine. Cheap (~5 s) and doesn't disturb the
|
||||
currently-loaded model.
|
||||
"""
|
||||
return await validate_launch(key, catalog, settings)
|
||||
|
||||
|
||||
@app.post("/api/swap")
|
||||
async def post_swap(req: SwapRequest) -> dict:
|
||||
if not settings.configured and not req.dry_run:
|
||||
|
||||
+89
-15
@@ -73,8 +73,10 @@ function renderCards() {
|
||||
<button class="btn ${isActive ? '' : 'primary'}" data-swap-key="${key}" ${isActive || isSwapping ? 'disabled' : ''}>
|
||||
${isActive ? 'Current' : 'Switch to this'}
|
||||
</button>
|
||||
<button class="btn test-btn" data-test-key="${key}" title="Pre-flight check the launch command without starting the engine">Test</button>
|
||||
<button class="btn adv-btn" data-adv-key="${key}" title="Advanced settings">Advanced</button>
|
||||
</div>
|
||||
<div class="test-result hidden" data-test-result-for="${key}"></div>
|
||||
`;
|
||||
root.appendChild(card);
|
||||
}
|
||||
@@ -84,6 +86,37 @@ function renderCards() {
|
||||
for (const btn of root.querySelectorAll('[data-adv-key]')) {
|
||||
btn.addEventListener('click', () => openAdvanced(btn.dataset.advKey));
|
||||
}
|
||||
for (const btn of root.querySelectorAll('[data-test-key]')) {
|
||||
btn.addEventListener('click', () => testLaunch(btn.dataset.testKey, btn));
|
||||
}
|
||||
}
|
||||
|
||||
async function testLaunch(key, btn) {
|
||||
const resultEl = document.querySelector(`[data-test-result-for="${key}"]`);
|
||||
if (!resultEl) return;
|
||||
const originalText = btn.textContent;
|
||||
btn.disabled = true;
|
||||
btn.textContent = 'Testing…';
|
||||
resultEl.classList.remove('hidden', 'ok', 'fail');
|
||||
resultEl.innerHTML = '<span class="muted small">Checking launch args against vLLM\'s parser…</span>';
|
||||
try {
|
||||
const r = await fetchJSON(`/api/swap/${encodeURIComponent(key)}/validate`, { method: 'POST' });
|
||||
if (r.ok) {
|
||||
resultEl.classList.add('ok');
|
||||
resultEl.innerHTML = `<span class="ok-mark">✓</span> Launch args parse OK. <span class="muted small">(Doesn't guarantee runtime success — only catches argparse-level issues.)</span>`;
|
||||
} else {
|
||||
resultEl.classList.add('fail');
|
||||
const err = escapeHtml(r.error || 'unknown error');
|
||||
const stage = r.stage ? ` <span class="muted small">(${escapeHtml(r.stage)})</span>` : '';
|
||||
resultEl.innerHTML = `<span class="fail-mark">✗</span> Would fail: ${err}${stage}`;
|
||||
}
|
||||
} catch (e) {
|
||||
resultEl.classList.add('fail');
|
||||
resultEl.innerHTML = `<span class="fail-mark">✗</span> Test failed: ${escapeHtml(e.message)}`;
|
||||
} finally {
|
||||
btn.disabled = false;
|
||||
btn.textContent = originalText;
|
||||
}
|
||||
}
|
||||
|
||||
function renderCurrent(status) {
|
||||
@@ -146,28 +179,42 @@ function openConnectivityDialog() {
|
||||
const c = state.connectivity || {};
|
||||
const events = c.events || [];
|
||||
if (events.length === 0) {
|
||||
content.innerHTML = '<div class="muted small">No transitions recorded yet. Once a Spark goes down and comes back, you\'ll see entries here.</div>';
|
||||
content.innerHTML = '<div class="muted small">No events recorded yet. Once a Spark or service goes down and back up (or an external app reports a failure), entries appear here.</div>';
|
||||
dlg.showModal();
|
||||
return;
|
||||
}
|
||||
const bySpark = {};
|
||||
const bySubject = {};
|
||||
for (const e of events) {
|
||||
(bySpark[e.spark] = bySpark[e.spark] || []).push(e);
|
||||
const subj = e.subject || e.spark || 'unknown'; // legacy fallback
|
||||
(bySubject[subj] = bySubject[subj] || []).push(e);
|
||||
}
|
||||
const html = Object.entries(bySpark).map(([spark, evs]) => {
|
||||
const downs = evs.filter(e => e.transition === 'down').length;
|
||||
const mac = c.macs?.[spark];
|
||||
// Sort subjects: hosts first, then services, alphabetical
|
||||
const hostOrder = ['spark1', 'spark2'];
|
||||
const subjects = Object.keys(bySubject).sort((a, b) => {
|
||||
const ia = hostOrder.indexOf(a);
|
||||
const ib = hostOrder.indexOf(b);
|
||||
if (ia >= 0 && ib >= 0) return ia - ib;
|
||||
if (ia >= 0) return -1;
|
||||
if (ib >= 0) return 1;
|
||||
return a.localeCompare(b);
|
||||
});
|
||||
|
||||
const html = subjects.map((subj) => {
|
||||
const evs = bySubject[subj];
|
||||
const transitions = evs.filter(e => (e.kind || 'transition') === 'transition');
|
||||
const reports = evs.filter(e => e.kind === 'report');
|
||||
const downs = transitions.filter(e => e.transition === 'down').length;
|
||||
const failedReports = reports.filter(e => !e.ok).length;
|
||||
const mac = c.macs?.[subj];
|
||||
const summaryParts = [];
|
||||
if (transitions.length) summaryParts.push(`${transitions.length} probe transition${transitions.length===1?'':'s'} (${downs} down)`);
|
||||
if (reports.length) summaryParts.push(`${reports.length} app report${reports.length===1?'':'s'} (${failedReports} failed)`);
|
||||
const isHost = hostOrder.includes(subj);
|
||||
return `
|
||||
<div class="conn-spark">
|
||||
<h4>${escapeHtml(spark)}${mac ? ` <span class="muted small">${escapeHtml(mac)}</span>` : ''}</h4>
|
||||
<div class="conn-summary">${evs.length} transition${evs.length===1?'':'s'} · ${downs} down event${downs===1?'':'s'} in window</div>
|
||||
${evs.slice(-25).reverse().map(e => `
|
||||
<div class="conn-event ${e.transition}">
|
||||
<span class="when">${escapeHtml(e.at.replace('T', ' ').replace('Z', ''))}</span>
|
||||
<span class="what">${e.transition === 'up' ? '↑ came back online' : '↓ dropped offline'}</span>
|
||||
<span class="dur">${e.down_seconds != null ? `was down ${fmtDuration(e.down_seconds)}` : ''}${e.up_seconds != null ? `was up ${fmtDuration(e.up_seconds)}` : ''}</span>
|
||||
</div>
|
||||
`).join('')}
|
||||
<h4>${escapeHtml(subj)}${isHost ? ' <span class="muted small">[host]</span>' : ' <span class="muted small">[service]</span>'}${mac ? ` <span class="muted small">${escapeHtml(mac)}</span>` : ''}</h4>
|
||||
<div class="conn-summary">${summaryParts.join(' · ') || 'no events'}</div>
|
||||
${evs.slice(-30).reverse().map(e => renderConnEvent(e)).join('')}
|
||||
</div>
|
||||
`;
|
||||
}).join('');
|
||||
@@ -175,6 +222,33 @@ function openConnectivityDialog() {
|
||||
dlg.showModal();
|
||||
}
|
||||
|
||||
function renderConnEvent(e) {
|
||||
const when = escapeHtml((e.at || '').replace('T', ' ').replace('Z', ''));
|
||||
const kind = e.kind || 'transition';
|
||||
if (kind === 'report') {
|
||||
const ok = !!e.ok;
|
||||
const source = escapeHtml(e.source || 'external');
|
||||
const detail = e.detail ? ` — ${escapeHtml(e.detail)}` : '';
|
||||
const latency = e.latency_ms != null ? ` (${e.latency_ms} ms)` : '';
|
||||
return `
|
||||
<div class="conn-event ${ok ? 'up' : 'down'} report">
|
||||
<span class="when">${when}</span>
|
||||
<span class="what">${ok ? '◷ report: ok' : '◷ report: failed'} <span class="muted">from</span> ${source}${detail}</span>
|
||||
<span class="dur">${latency}</span>
|
||||
</div>
|
||||
`;
|
||||
}
|
||||
const down = e.down_seconds != null ? `was down ${fmtDuration(e.down_seconds)}` : '';
|
||||
const up = e.up_seconds != null ? `was up ${fmtDuration(e.up_seconds)}` : '';
|
||||
return `
|
||||
<div class="conn-event ${e.transition}">
|
||||
<span class="when">${when}</span>
|
||||
<span class="what">${e.transition === 'up' ? '↑ came back online' : '↓ dropped offline'}</span>
|
||||
<span class="dur">${down}${up}</span>
|
||||
</div>
|
||||
`;
|
||||
}
|
||||
|
||||
async function wakeSpark(name) {
|
||||
try {
|
||||
const r = await fetchJSON(`/api/spark/${name}/wake`, { method: 'POST' });
|
||||
|
||||
@@ -411,6 +411,8 @@ main {
|
||||
.conn-event .what { flex: 1; }
|
||||
.conn-event.up .what { color: var(--accent); }
|
||||
.conn-event.down .what { color: var(--error); }
|
||||
.conn-event.report .what { font-style: italic; }
|
||||
.conn-event .muted { color: var(--muted); font-style: normal; }
|
||||
.conn-event .dur { color: var(--muted); }
|
||||
.conn-summary { color: var(--muted); font-size: 11px; padding: 4px 0 10px; }
|
||||
.hw-metric { display: flex; align-items: center; gap: 10px; font-size: 12px; }
|
||||
@@ -699,9 +701,24 @@ main {
|
||||
.card.active .btn { background: rgba(74, 222, 128, 0.12); color: var(--accent); border-color: rgba(74, 222, 128, 0.4); }
|
||||
.card-actions { display: flex; gap: 6px; }
|
||||
.card-actions .btn.primary { flex: 1; }
|
||||
.card .adv-btn { padding: 8px 12px; font-size: 12px; }
|
||||
.card .adv-btn,
|
||||
.card .test-btn { padding: 8px 12px; font-size: 12px; }
|
||||
.card .custom-pill { color: var(--info); border-color: rgba(96, 165, 250, 0.4); }
|
||||
|
||||
.test-result {
|
||||
font-size: 12px;
|
||||
line-height: 1.45;
|
||||
padding: 8px 10px;
|
||||
border-radius: 5px;
|
||||
margin-top: 4px;
|
||||
border: 1px solid var(--border);
|
||||
background: var(--surface-2);
|
||||
}
|
||||
.test-result.ok { border-color: rgba(74, 222, 128, 0.4); background: rgba(74, 222, 128, 0.04); }
|
||||
.test-result.fail { border-color: rgba(239, 68, 68, 0.45); background: rgba(239, 68, 68, 0.06); word-break: break-word; }
|
||||
.test-result .ok-mark { color: var(--accent); font-weight: 600; }
|
||||
.test-result .fail-mark { color: var(--error); font-weight: 600; }
|
||||
|
||||
.footer {
|
||||
margin-top: 28px;
|
||||
padding-top: 16px;
|
||||
|
||||
@@ -0,0 +1,137 @@
|
||||
"""Pre-flight validation of a proposed vLLM launch command.
|
||||
|
||||
Runs vLLM's own argparse layer (EngineArgs) inside the vllm_node container WITHOUT
|
||||
starting the engine. Catches:
|
||||
|
||||
* unknown flag names (typos)
|
||||
* bad types / values that argparse rejects
|
||||
* deprecated flags removed in the installed vLLM version
|
||||
|
||||
Does NOT catch (these surface only during real engine init):
|
||||
* model-architecture-specific constraints (e.g. Qwen3.6 Mamba block_size)
|
||||
* OOM at weight-loading time
|
||||
* Triton / CUDA-kernel compatibility errors
|
||||
|
||||
A pre-flight check that returns "ok" is therefore NOT a guarantee — but a
|
||||
"failed" verdict is a definitive 'don't bother with the real swap'.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
import json
|
||||
import shlex
|
||||
from typing import Any
|
||||
|
||||
from .config import Settings
|
||||
from .models import Catalog, build_launch_command
|
||||
from .ssh import ssh_run
|
||||
|
||||
|
||||
# Validates the proposed args against the same combined parser vLLM uses for
|
||||
# `vllm serve` (engine args + server args + frontend args). Returns one JSON
|
||||
# line on stdout: {"ok": true, ...} or {"ok": false, ...}.
|
||||
_VALIDATOR_SCRIPT = r"""
|
||||
import argparse, json, sys
|
||||
|
||||
# Mirror what `vllm serve` does internally: FlexibleArgumentParser (which is
|
||||
# more lenient about dashes vs underscores) wrapped with make_arg_parser
|
||||
# (which adds engine + server + frontend args).
|
||||
parser = None
|
||||
try:
|
||||
# Newer vLLM path
|
||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
||||
except Exception:
|
||||
try:
|
||||
# Older fallback
|
||||
from vllm.engine.arg_utils import FlexibleArgumentParser
|
||||
except Exception:
|
||||
FlexibleArgumentParser = argparse.ArgumentParser # type: ignore
|
||||
|
||||
try:
|
||||
from vllm.entrypoints.openai.cli_args import make_arg_parser
|
||||
parser = make_arg_parser(FlexibleArgumentParser(add_help=False))
|
||||
except Exception:
|
||||
pass
|
||||
if parser is None:
|
||||
try:
|
||||
from vllm.engine.arg_utils import EngineArgs
|
||||
parser = FlexibleArgumentParser(add_help=False)
|
||||
EngineArgs.add_cli_args(parser)
|
||||
except Exception as e:
|
||||
print(json.dumps({"ok": False, "stage": "import", "error": f"{type(e).__name__}: {e}"}))
|
||||
sys.exit(0)
|
||||
|
||||
class _ArgError(Exception):
|
||||
pass
|
||||
|
||||
def _err(message):
|
||||
raise _ArgError(message)
|
||||
|
||||
parser.error = _err # capture argparse errors instead of sys.exit(2)
|
||||
|
||||
try:
|
||||
raw = sys.stdin.read()
|
||||
arglist = json.loads(raw)
|
||||
ns = parser.parse_args(arglist)
|
||||
print(json.dumps({"ok": True, "model": getattr(ns, "model", None)}))
|
||||
except _ArgError as e:
|
||||
print(json.dumps({"ok": False, "stage": "parse", "error": str(e)}))
|
||||
except SystemExit as e:
|
||||
print(json.dumps({"ok": False, "stage": "parse", "error": f"argparse exit {e.code}"}))
|
||||
except Exception as e:
|
||||
print(json.dumps({"ok": False, "stage": "parse", "error": f"{type(e).__name__}: {e}"}))
|
||||
"""
|
||||
|
||||
|
||||
def _vllm_arg_list(key: str, model_def, catalog: Catalog) -> list[str]:
|
||||
"""Reconstruct the args list passed to `vllm serve` (without the positional model)."""
|
||||
cmd = build_launch_command(key, model_def, catalog.defaults)
|
||||
# build_launch_command yields:
|
||||
# ./launch-cluster.sh [--solo] -d exec vllm serve <repo> <args...>
|
||||
# We just want the bits after `vllm serve <repo>`.
|
||||
tokens = shlex.split(cmd)
|
||||
if "serve" not in tokens:
|
||||
return []
|
||||
i = tokens.index("serve")
|
||||
after = tokens[i + 1 :] # repo, then args
|
||||
if not after:
|
||||
return []
|
||||
args = after[1:] # drop the repo
|
||||
# EngineArgs expects --model=REPO rather than positional, so prepend it.
|
||||
return [f"--model={after[0]}", *args]
|
||||
|
||||
|
||||
async def validate_launch(key: str, catalog: Catalog, settings: Settings) -> dict:
|
||||
if key not in catalog.models:
|
||||
return {"ok": False, "stage": "lookup", "error": f"unknown model: {key}"}
|
||||
if not settings.spark1_host or not settings.spark1_user:
|
||||
return {"ok": False, "stage": "config", "error": "spark1 not configured"}
|
||||
|
||||
model = catalog.models[key]
|
||||
arg_list = _vllm_arg_list(key, model, catalog)
|
||||
if not arg_list:
|
||||
return {"ok": False, "stage": "build", "error": "failed to build args list"}
|
||||
|
||||
payload = json.dumps(arg_list).replace("'", "'\\''")
|
||||
# Pipe the JSON args list to a here-doc Python invocation. The validator
|
||||
# reads from stdin to avoid shell-escaping the args themselves.
|
||||
cmd = (
|
||||
f"echo '{payload}' | docker exec -i vllm_node python3 -c "
|
||||
+ shlex.quote(_VALIDATOR_SCRIPT)
|
||||
)
|
||||
|
||||
rc, out, err = await ssh_run(settings.spark1_host, settings.spark1_user, cmd, settings, timeout=20)
|
||||
if rc != 0 and not out.strip():
|
||||
return {
|
||||
"ok": False,
|
||||
"stage": "ssh",
|
||||
"error": err.strip() or f"rc={rc}",
|
||||
"cmd_args": arg_list,
|
||||
"launch_cmd": build_launch_command(key, model, catalog.defaults),
|
||||
}
|
||||
last = out.strip().splitlines()[-1] if out.strip() else ""
|
||||
try:
|
||||
result: dict[str, Any] = json.loads(last)
|
||||
except json.JSONDecodeError:
|
||||
result = {"ok": False, "stage": "decode", "error": "validator did not return JSON", "raw": out[-500:]}
|
||||
result["cmd_args"] = arg_list
|
||||
result["launch_cmd"] = build_launch_command(key, model, catalog.defaults)
|
||||
return result
|
||||
@@ -66,6 +66,7 @@ models:
|
||||
vllm_args:
|
||||
- --gpu-memory-utilization=0.85
|
||||
- --max-model-len=65536
|
||||
- --max-num-batched-tokens=16384
|
||||
- --reasoning-parser=qwen3
|
||||
- --moe_backend=flashinfer_cutlass
|
||||
- --load-format=fastsafetensors
|
||||
|
||||
@@ -20,6 +20,10 @@ The trick is the `docker run --rm alpine chown` — it runs as root inside the t
|
||||
|
||||
This flag is Blackwell-specific. If vLLM in the container reports `unrecognized arguments: --moe_backend` or similar, edit `models.yaml` for `qwen36` and drop that flag. The swap UI does NOT auto-fallback in v0.1 — failure surfaces in the log stream.
|
||||
|
||||
## Qwen3.6 Mamba block-size assertion (fixed in v0.6.0:1)
|
||||
|
||||
Qwen3.6 uses a Mamba-attention hybrid that requires `--max-num-batched-tokens >= 2096`. vLLM's default is 2048, which trips `AssertionError: In Mamba cache align mode, block_size (2096) must be <= max_num_batched_tokens (2048)`. Fix: bake `--max-num-batched-tokens=16384` into the bundled qwen36 entry — matches the upstream qwen3.5-35b-a3b-fp8 recipe.
|
||||
|
||||
## Two SSH paths to Spark 1 from the laptop
|
||||
|
||||
`ssh <spark-user>@<spark-1-ip>` does NOT work from the laptop because the NVIDIA Sync ssh_config only has a Host entry for `<spark-1-host>.local`. Always use the `.local` hostname or `<spark-2-ip>`-style entries that ARE matched.
|
||||
|
||||
@@ -1,10 +1,10 @@
|
||||
import { VersionInfo, IMPOSSIBLE } from '@start9labs/start-sdk'
|
||||
|
||||
export const v0_1_0 = VersionInfo.of({
|
||||
version: '0.5.0:0',
|
||||
version: '0.7.0:2',
|
||||
releaseNotes: {
|
||||
en_US:
|
||||
'v0.5: Wake-on-LAN + connectivity history. Each Spark\'s MAC is now auto-discovered during the normal hardware sweep and cached in /data/connectivity.json. Up/down transitions are logged with duration. Unreachable hardware cards get a "Wake (WoL)" button that sends a magic packet (preferring the other Spark as the sender so it originates on the right LAN segment). New "Connectivity log" button in the hardware section shows the recent transitions for each Spark — useful for spotting patterns (e.g. always-at-noon dropouts).',
|
||||
'v0.7: pre-flight launch validation. New "Test" button on every model card runs vLLM\'s argparse against the proposed launch command inside the running vllm_node container — without starting an engine. Catches unknown flags, bad types, and version-removed flags in about 5 seconds, before disrupting the currently-loaded model. (Runtime-only failures like the Qwen3.6 Mamba block-size assertion still only surface during a real swap, but argparse-stage bugs are now caught up front.)',
|
||||
},
|
||||
migrations: {
|
||||
up: async ({ effects }) => {},
|
||||
|
||||
Reference in New Issue
Block a user