1889ab45fb
Hotfix (was v0.3.1):
- services.py: cache 'unreachable' per (host,user) for 25s so a dead Spark doesn't hang every /api/services call behind a 6s SSH timeout
- ssh_run timeout reduced 10 -> 6s for docker_state probes
- hardware probe: shorter SSH timeout (6s), longer cache TTL for failures (25s)
- JS pollStatus retries loadModels() if state.models is empty (recovers from cold-start proxy timeout)
- Unreachable hardware card now includes troubleshooting steps (Spark Control cannot SSH into an unreachable Spark to restart it)
v0.4 NIM installer:
- nim.py module: curated SUGGESTED_NIMS list (Parakeet, Magpie, Riva) + NimManager that runs docker login nvcr.io + docker pull + docker run -d --gpus all -p PORT:PORT -v VOL:/opt/nim/.cache -e NGC_API_KEY -e ... --restart=unless-stopped + chown the volume to uid 1000 + restart. Streams all output via SSE; redacts the API key from log lines.
- custom_services.py: persists installed NIMs to /data/services-overrides.yaml so they appear in the services panel after install
- services.py: merges custom services into the panel
- /api/nim/catalog GET, /api/nim/install POST + GET/SSE
- /api/services/{name} DELETE for custom services
- UI: '+ Install NIM' button next to 'Always-on services'; modal lists curated images each with a 'Pick' button + a custom-image form; installation runs in a second dialog with phase + elapsed timer + collapsible log
- NGC API key field added to Configure Sparks (masked); injected as NGC_API_KEY env var into the container
Package: bump 0.4.0:0; main.ts adds SERVICES_OVERRIDES + NGC_API_KEY env vars
69 lines
2.1 KiB
TypeScript
69 lines
2.1 KiB
TypeScript
import { i18n } from './i18n'
|
|
import { sdk } from './sdk'
|
|
import { uiPort } from './utils'
|
|
import { sparkConfigYaml } from './fileModels/sparkConfig.yaml'
|
|
|
|
/**
 * Package entry point: registers the single 'primary' daemon, which runs
 * /app/entrypoint.sh inside a subcontainer built from the 'spark-control'
 * image with the 'main' volume mounted read-write at /data.
 *
 * Every setting read from the Spark config yaml is passed to the entrypoint
 * as an environment variable, together with two fixed override-file paths
 * under /data and the UI port. Readiness is reported by checking that the UI
 * port is listening.
 */
export const main = sdk.setupMain(async ({ effects }) => {
  console.info(i18n('Starting Spark Control…'))

  // Fallback used only when the yaml read yields null/undefined.
  const blankConfig = {
    spark1_host: '',
    spark1_user: '',
    spark2_host: '',
    spark2_user: '',
    parakeet_host: '',
    parakeet_user: '',
    parakeet_container: '',
    magpie_host: '',
    magpie_user: '',
    magpie_container: '',
    open_webui_url: '',
    ngc_api_key: '',
  }

  // Reactive read of the user-configured yaml file: editing it through the
  // "Configure Sparks" action restarts the daemon.
  const storedConfig = await sparkConfigYaml.read().const(effects)
  const cfg = storedConfig ?? blankConfig

  const daemons = sdk.Daemons.of(effects)

  // Persistent volume, writable, exposed to the container at /data.
  const dataMount = sdk.Mounts.of().mountVolume({
    volumeId: 'main',
    subpath: null,
    mountpoint: '/data',
    readonly: false,
  })

  const subcontainer = await sdk.SubContainer.of(
    effects,
    { imageId: 'spark-control' },
    dataMount,
    'spark-control-sub',
  )

  // Environment handed to /app/entrypoint.sh.
  const env = {
    // SSH targets
    SPARK1_HOST: cfg.spark1_host,
    SPARK1_USER: cfg.spark1_user,
    SPARK2_HOST: cfg.spark2_host,
    SPARK2_USER: cfg.spark2_user,
    // Parakeet / Magpie service locations
    PARAKEET_HOST: cfg.parakeet_host,
    PARAKEET_USER: cfg.parakeet_user,
    PARAKEET_CONTAINER: cfg.parakeet_container,
    MAGPIE_HOST: cfg.magpie_host,
    MAGPIE_USER: cfg.magpie_user,
    MAGPIE_CONTAINER: cfg.magpie_container,
    // Override files kept on the mounted /data volume
    MODELS_OVERRIDES: '/data/models-overrides.yaml',
    SERVICES_OVERRIDES: '/data/services-overrides.yaml',
    OPEN_WEBUI_URL: cfg.open_webui_url,
    NGC_API_KEY: cfg.ngc_api_key,
    // Passed as a string because the port travels as an env var.
    BIND_PORT: String(uiPort),
  }

  // Health check: ready once something is listening on the UI port.
  const checkWebInterface = () =>
    sdk.healthCheck.checkPortListening(effects, uiPort, {
      successMessage: i18n('The web interface is ready'),
      errorMessage: i18n('The web interface is not ready'),
    })

  return daemons.addDaemon('primary', {
    subcontainer,
    exec: {
      command: ['/app/entrypoint.sh'],
      env,
    },
    ready: {
      display: i18n('Web Interface'),
      fn: checkWebInterface,
    },
    requires: [],
  })
})
|