1889ab45fb
Hotfix (was v0.3.1):
- services.py: cache 'unreachable' per (host,user) for 25s so a dead Spark doesn't hang every /api/services call behind a 6s SSH timeout
- ssh_run timeout reduced from 10s to 6s for docker_state probes
- hardware probe: shorter SSH timeout (6s), longer cache TTL for failures (25s)
- JS pollStatus retries loadModels() if state.models is empty (recovers from cold-start proxy timeout)
- Unreachable hardware card now includes troubleshooting steps (Spark Control cannot SSH into an unreachable Spark to restart it)
v0.4 NIM installer:
- nim.py module: curated SUGGESTED_NIMS list (Parakeet, Magpie, Riva) + NimManager that runs docker login nvcr.io + docker pull + docker run -d --gpus all -p PORT:PORT -v VOL:/opt/nim/.cache -e NGC_API_KEY -e ... --restart=unless-stopped + chown the volume to uid 1000 + restart. Streams all output via SSE; redacts the API key from log lines.
- custom_services.py: persists installed NIMs to /data/services-overrides.yaml so they appear in the services panel after install
- services.py: merges custom services into the panel
- /api/nim/catalog GET, /api/nim/install POST + GET/SSE
- /api/services/{name} DELETE for custom services
- UI: '+ Install NIM' button next to 'Always-on services'; modal lists curated images each with a 'Pick' button + a custom-image form; installation runs in a second dialog with phase + elapsed timer + collapsible log
- NGC API key field added to Configure Sparks (masked); injected as NGC_API_KEY env var into the container
Package: bump 0.4.0:0; main.ts adds SERVICES_OVERRIDES + NGC_API_KEY env vars
123 lines
4.2 KiB
TypeScript
123 lines
4.2 KiB
TypeScript
import { sdk } from '../sdk'
|
|
import { sparkConfigYaml } from '../fileModels/sparkConfig.yaml'
|
|
|
|
const { InputSpec, Value } = sdk

/**
 * Form definition for the "Configure Sparks" action.
 *
 * Every field is a free-text input. The four Spark host/user fields are
 * required; all remaining fields are optional. Only the NGC API key is
 * masked in the UI. All fields default to `null`.
 */
const inputSpec = InputSpec.of({
  // --- Spark 1 (described to the user as the head node) ---
  spark1_host: Value.text({
    name: 'Spark 1 hostname or IP',
    description:
      'The head node of your DGX Spark cluster — the one that has ~/spark-vllm-docker cloned and runs the vLLM container. Enter its LAN IP (recommended) or hostname.',
    placeholder: 'e.g. 192.168.1.10',
    default: null,
    required: true,
    masked: false,
  }),
  spark1_user: Value.text({
    name: 'Spark 1 SSH user',
    description:
      'The user account on Spark 1 to SSH in as — whatever you log in as when you ssh into it manually.',
    placeholder: 'your SSH username',
    default: null,
    required: true,
    masked: false,
  }),

  // --- Spark 2 (described to the user as the worker node) ---
  spark2_host: Value.text({
    name: 'Spark 2 hostname or IP',
    description:
      'The worker node of your DGX Spark cluster (also runs always-on services like Parakeet/Magpie). Enter its LAN IP or hostname.',
    placeholder: 'e.g. 192.168.1.11',
    default: null,
    required: true,
    masked: false,
  }),
  spark2_user: Value.text({
    name: 'Spark 2 SSH user',
    description:
      'The user account on Spark 2 to SSH in as. Usually the same as Spark 1.',
    placeholder: 'your SSH username',
    default: null,
    required: true,
    masked: false,
  }),

  // --- Optional overrides for the Parakeet service ---
  parakeet_host: Value.text({
    name: 'Parakeet host (optional)',
    description:
      "Override the host running the Parakeet STT container. Leave blank if Parakeet runs on Spark 2 — that's the default. Set this if you run Parakeet on Spark 1 or a different machine.",
    placeholder: 'leave blank to use Spark 2',
    default: null,
    required: false,
    masked: false,
  }),
  parakeet_container: Value.text({
    name: 'Parakeet container name (optional)',
    description:
      'Docker container name for Parakeet. Defaults to "parakeet-asr" — change only if you named yours something else.',
    placeholder: 'parakeet-asr',
    default: null,
    required: false,
    masked: false,
  }),

  // --- Optional overrides for the Magpie service ---
  magpie_host: Value.text({
    name: 'Magpie host (optional)',
    description:
      'Override the host running the Magpie TTS container. Leave blank if Magpie runs on Spark 2.',
    placeholder: 'leave blank to use Spark 2',
    default: null,
    required: false,
    masked: false,
  }),
  magpie_container: Value.text({
    name: 'Magpie container name (optional)',
    description: 'Docker container name for Magpie. Defaults to "magpie-tts".',
    placeholder: 'magpie-tts',
    default: null,
    required: false,
    masked: false,
  }),

  // --- Optional integrations ---
  open_webui_url: Value.text({
    name: 'Open WebUI URL (optional)',
    description:
      'If you also run Open WebUI on your LAN, paste its URL here. Spark Control will then show a one-click "Open chat" button next to the current model so you can jump straight to it.',
    placeholder: 'e.g. https://open-webui.yourserver.local',
    default: null,
    required: false,
    masked: false,
  }),
  // The only masked field in this form: the value is a credential.
  ngc_api_key: Value.text({
    name: 'NGC API key (optional)',
    description:
      'NVIDIA NGC personal API key — needed to install NIM containers (Parakeet, Magpie, etc.) from nvcr.io. Get one free at https://ngc.nvidia.com/setup/personal-key. Stored only on this Start9 server; passed to docker as the NGC_API_KEY env var when installing NIM services.',
    placeholder: 'starts with "nvapi-..."',
    default: null,
    required: false,
    masked: true,
  }),
})
|
|
|
|
/**
 * The "Configure Sparks" action.
 *
 * Presents `inputSpec` as a form, pre-populates it from whatever
 * `sparkConfigYaml` currently holds (or nothing when no config is read),
 * and on submit merges the submitted values back into that file.
 */
export const configureSparks = sdk.Action.withInput(
  // Action id
  'configure-sparks',

  // Action metadata
  async () => ({
    name: 'Configure Sparks',
    description: 'Set the hostnames and SSH users for your two Spark nodes.',
    warning: null,
    visibility: 'enabled',
    allowedStatuses: 'any',
    group: null,
  }),

  // Form definition
  async () => inputSpec,

  // Prefill: use the stored config when one is read, otherwise null.
  async () => (await sparkConfigYaml.read().once()) ?? null,

  // Submit handler
  async ({ effects, input }) => {
    // Fields left blank arrive as `null`; store them as '' instead so every
    // value written to the config file is a string.
    const stringEntries = Object.entries(input).map(
      ([field, value]): [string, string] => [field, value ?? ''],
    )
    const normalized: Record<string, string> = Object.fromEntries(stringEntries)

    await sparkConfigYaml.merge(effects, normalized)
    return null
  },
)
|