7ae6ab3ba8
GPU-arbiter safety layer for when automation, not just the dashboard, swaps models: - swap reservation lock (POST/GET/DELETE /api/swap/lock); 423-enforced in post_swap via a single-read gate, TTL-bounded, secret-token auth, human force-release override + dashboard banner - swap webhook (swap_complete/swap_failed) fired outside the swap lock, optional HMAC signature, configurable URL+secret - read-only schedule registry (GET/POST/DELETE /api/schedule) + dashboard panel New module image/app/coordination.py; docs/COORDINATION.md for consumers; 22 offline tests in test_coordination.py.
220 lines
8.8 KiB
TypeScript
220 lines
8.8 KiB
TypeScript
import { sdk } from '../sdk'
|
|
import { sparkConfigYaml } from '../fileModels/sparkConfig.yaml'
|
|
|
|
const { InputSpec, Value } = sdk

/**
 * Builds a mandatory, unmasked free-text field with no default value.
 */
const requiredText = (field: {
  name: string
  description: string
  placeholder: string
}) =>
  Value.text({
    name: field.name,
    description: field.description,
    required: true,
    default: null,
    placeholder: field.placeholder,
    masked: false,
  })

/**
 * Builds an optional free-text field with no default value.
 * Pass `masked: true` for values that should be hidden in the form.
 */
const optionalText = (field: {
  name: string
  description: string
  placeholder: string
  masked?: boolean
}) =>
  Value.text({
    name: field.name,
    description: field.description,
    required: false,
    default: null,
    placeholder: field.placeholder,
    masked: field.masked ?? false,
  })

/**
 * Form definition for the "Configure Sparks" action.
 *
 * Only the four Spark host/user fields are required; every other field is
 * optional. `ngc_api_key` and `swap_webhook_secret` are the only masked
 * fields. Key order here is the declaration order of the form fields.
 */
const inputSpec = InputSpec.of({
  // --- Spark nodes (required) ---
  spark1_host: requiredText({
    name: 'Spark 1 hostname or IP',
    description:
      'The head node of your DGX Spark cluster — the one that has ~/spark-vllm-docker cloned and runs the vLLM container. Enter its LAN IP (recommended) or hostname.',
    placeholder: 'e.g. 192.168.1.10',
  }),
  spark1_user: requiredText({
    name: 'Spark 1 SSH user',
    description:
      'The user account on Spark 1 to SSH in as — whatever you log in as when you ssh into it manually.',
    placeholder: 'your SSH username',
  }),
  spark2_host: requiredText({
    name: 'Spark 2 hostname or IP',
    description:
      'The worker node of your DGX Spark cluster (also runs always-on services like Parakeet and Kokoro). Enter its LAN IP or hostname.',
    placeholder: 'e.g. 192.168.1.11',
  }),
  spark2_user: requiredText({
    name: 'Spark 2 SSH user',
    description:
      'The user account on Spark 2 to SSH in as. Usually the same as Spark 1.',
    placeholder: 'your SSH username',
  }),

  // --- vLLM ---
  vllm_port: optionalText({
    name: 'vLLM port (optional)',
    description:
      "The port your vLLM server listens on, on Spark 1 — used by the health check and the chat proxy. Leave blank to use 8888, which is what the bundled launch-cluster.sh wrapper uses. Set this to 8000 (vLLM's own default) or another port if your vLLM listens elsewhere.",
    placeholder: 'leave blank for 8888',
  }),
  vllm_container: optionalText({
    name: 'vLLM container name (optional)',
    description:
      'Docker container name for the swappable vLLM on Spark 1. Defaults to "vllm_node" (what the bundled launch-cluster.sh creates). Change this only if you run your vLLM under a different container name — the model-swap log view and the pre-flight validator exec into it by name.',
    placeholder: 'leave blank for vllm_node',
  }),

  // --- Built-in service visibility ---
  disabled_services: optionalText({
    name: 'Services to hide (optional)',
    description:
      "Comma-separated list of built-in services your cluster doesn't run, so Spark Control hides their tiles and stops probing them. Valid names: parakeet, kokoro, embeddings, qdrant. Example: if you only run vLLM, set this to 'parakeet,kokoro,embeddings,qdrant'. Leave blank to monitor all of them. (Useful when, say, your vLLM shares port 8000 with Parakeet's default — hide Parakeet so its probe doesn't hit vLLM.)",
    placeholder: 'e.g. parakeet,kokoro',
  }),

  // --- Parakeet ---
  parakeet_host: optionalText({
    name: 'Parakeet host (optional)',
    description:
      "Override the host running the Parakeet STT container. Leave blank if Parakeet runs on Spark 2 — that's the default. Set this if you run Parakeet on Spark 1 or a different machine.",
    placeholder: 'leave blank to use Spark 2',
  }),
  parakeet_container: optionalText({
    name: 'Parakeet container name (optional)',
    description:
      'Docker container name for Parakeet. Defaults to "parakeet-asr" — change only if you named yours something else.',
    placeholder: 'parakeet-asr',
  }),

  // --- Kokoro ---
  kokoro_host: optionalText({
    name: 'Kokoro host (optional)',
    description:
      'Override the host running the Kokoro TTS container. Leave blank if Kokoro runs on Spark 2.',
    placeholder: 'leave blank to use Spark 2',
  }),
  kokoro_container: optionalText({
    name: 'Kokoro container name (optional)',
    description: 'Docker container name for Kokoro. Defaults to "kokoro-tts".',
    placeholder: 'kokoro-tts',
  }),

  // --- Embedding server ---
  embed_host: optionalText({
    name: 'Embedding server host (optional)',
    description:
      'Override the host running the spark-embed container (bge-m3 dense embeddings + reranker). Leave blank if it runs on Spark 2.',
    placeholder: 'leave blank to use Spark 2',
  }),
  embed_container: optionalText({
    name: 'Embedding container name (optional)',
    description:
      'Docker container name for the embedding server. Defaults to "spark-embed".',
    placeholder: 'spark-embed',
  }),

  // --- Qdrant ---
  qdrant_host: optionalText({
    name: 'Qdrant host (optional)',
    description:
      'Override the host running the Qdrant vector database. Leave blank if it runs on Spark 2.',
    placeholder: 'leave blank to use Spark 2',
  }),
  qdrant_container: optionalText({
    name: 'Qdrant container name (optional)',
    description: 'Docker container name for Qdrant. Defaults to "qdrant".',
    placeholder: 'qdrant',
  }),
  qdrant_collection: optionalText({
    name: 'Default Qdrant collection (optional)',
    description:
      'Default collection name used by /api/search when a request does not specify one. Leave blank to require callers to pass a collection.',
    placeholder: 'e.g. crm_chunks',
  }),

  // --- Optional integrations ---
  matrix_bridge_user: optionalText({
    name: 'matrix-bridge bot SSH user (optional)',
    description:
      "If you run the matrix-bridge Matrix bot on Spark 2, enter the SSH user that owns its ~/matrix-bridge folder (e.g. 'modelo'). Spark Control then shows a tile to update, restart, and view logs for the bot. Leave blank if you don't run the bot — the tile stays hidden. Note: this package's SSH public key must be authorized for that user (Show Public Key action) unless it's the same as your Spark 2 user.",
    placeholder: 'e.g. modelo',
  }),
  open_webui_url: optionalText({
    name: 'Open WebUI URL (optional)',
    description:
      'If you also run Open WebUI on your LAN, paste its URL here. Spark Control will then show a one-click "Open chat" button next to the current model so you can jump straight to it.',
    placeholder: 'e.g. https://open-webui.yourserver.local',
  }),
  ngc_api_key: optionalText({
    name: 'NGC API key (optional)',
    description:
      'NVIDIA NGC personal API key — needed to install NIM containers (Parakeet, etc.) from nvcr.io. Get one free at https://ngc.nvidia.com/setup/personal-key. Stored only on this Start9 server; passed to docker as the NGC_API_KEY env var when installing NIM services. (Kokoro TTS is Apache 2.0 and does not need an NGC key.)',
    placeholder: 'starts with "nvapi-..."',
    masked: true,
  }),

  // --- Swap webhook ---
  swap_webhook_url: optionalText({
    name: 'Swap webhook URL (optional)',
    description:
      'If you run automation that needs to know when the loaded model changes, paste a URL here. Spark Control POSTs a small JSON event (swap_complete / swap_failed) to it after every model swap, so the consumer can re-point its config to the new model. Leave blank to disable. Only needed if something other than this dashboard cares about swaps.',
    placeholder: 'e.g. https://my-service.local/spark-swap',
  }),
  swap_webhook_secret: optionalText({
    name: 'Swap webhook secret (optional)',
    description:
      'Optional shared secret. If set, each webhook is signed with an "X-Spark-Signature: sha256=…" header (HMAC of the body) so the receiver can verify it really came from Spark Control. Leave blank to send the webhook unsigned.',
    placeholder: 'a random string the receiver also knows',
    masked: true,
  }),
})
|
|
|
|
/**
 * The "Configure Sparks" action (id `configure-sparks`).
 *
 * Presents the `inputSpec` form, pre-filled from the stored Spark config
 * file, and merges the submitted values back into that file.
 */
export const configureSparks = sdk.Action.withInput(
  'configure-sparks',

  // Action metadata.
  async () => {
    return {
      name: 'Configure Sparks',
      description: 'Set the hostnames and SSH users for your two Spark nodes.',
      warning: null,
      visibility: 'enabled',
      allowedStatuses: 'any',
      group: null,
    }
  },

  // Form definition.
  async () => inputSpec,

  // Prefill: the current config file contents, or null when the read yields
  // nothing.
  async () => (await sparkConfigYaml.read().once()) ?? null,

  // Submit handler.
  async ({ effects, input }) => {
    // Fields left blank arrive as `null`; the config schema expects strings,
    // so substitute an empty string before merging.
    const withBlanks = Object.entries(input).map(([field, value]) => [
      field,
      value ?? '',
    ])
    const normalized = Object.fromEntries(withBlanks) as Record<string, string>

    await sparkConfigYaml.merge(effects, normalized)
    return null
  },
)
|