From 1e1e1cb5687fee5221815c432fc4bb818f9a7bdf Mon Sep 17 00:00:00 2001 From: Keysat Date: Thu, 18 Jun 2026 16:44:07 -0500 Subject: [PATCH] v0.27.1:0 - fix model download: prepend ~/.local/bin so SSH finds uvx hf-download.sh shells out to uvx (the uv installer drops it in ~/.local/bin), but the non-interactive SSH session doesn't source the user's profile, so ~/.local/bin was off PATH and downloads died with "uvx: command not found". build_download_command now prepends $HOME/.local/bin. Adds test_download.py. --- AGENTS.md | 1 + image/app/download.py | 16 +++++++++++++- image/tests/test_download.py | 35 ++++++++++++++++++++++++++++++ package/startos/versions/v0_1_0.ts | 4 ++-- 4 files changed, 53 insertions(+), 3 deletions(-) create mode 100644 image/tests/test_download.py diff --git a/AGENTS.md b/AGENTS.md index 8a64937..1e66cc9 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -55,6 +55,7 @@ Subsystem guidance lives in `docs/guides/` and loads when matching files are tou ## Current state +- **Live: v0.27.1:0 — fix: "Download a new model" button (uvx PATH).** Installed on `immense-voyage` (`start-cli package list` confirms `0.27.1:0`); **not yet committed/pushed/published as of this writing.** Root cause: `hf-download.sh` shells out to `uvx`, which the uv installer puts in `~/.local/bin`; Spark Control's *non-interactive* SSH session doesn't source the user's profile, so `~/.local/bin` is off PATH and the download died with "uvx: command not found" (same class as the matrix-bridge non-interactive-SSH gotcha). Fix: `download.build_download_command` prepends `export PATH="$HOME/.local/bin:$PATH"` (server-side `$HOME`, generic for any adopter); extracted to a pure helper with regression tests (`test_download.py`: PATH prefix, no-trailing-space, cluster flags, shlex round-trip). 161 pytest green; verified live (`uvx` resolves under the shipped PATH on Spark 1). 
Prompted by Grant hitting it while adding **Gemma-4-26B** — next: download `nvidia/Gemma-4-26B-A4B-NVFP4` (recipe `gemma4-26b` already in catalog) via the now-fixed button, then swap-test + business-card OCR. - **Live: v0.27.0:0 — in-app Settings gear + two bug fixes** (commit `7e07598`; installed on `immense-voyage` — `start-cli package list` confirms `0.27.0:0`; published to Clankistry; pushed to gitea master). Prompted by the second adopter's v0.25 feedback. (1) StartOS "Configure Sparks" action trimmed to the **four required fields**; all optional knobs moved to a **⚙ Settings gear** in the dashboard, backed by a `/data/app_settings.json` overlay (`app_settings.py`) keyed by env-var names, overlaid on `os.environ`, applied **live** via in-place `Settings.reload()` (architecture + the snapshot-holder gotcha are in the fastapi-image guide). Existing installs' values **migrate automatically** on first boot (`seed_from_env`). (2) **Support-service ports now configurable** (`PARAKEET_PORT`/`KOKORO_PORT`/`EMBED_PORT`/`QDRANT_PORT`; `VLLM_PORT` surfaced) — fixes the adopter's false "vLLM down" (theirs is on 8000, not launch-cluster.sh's 8888) and Parakeet 404 (remapped off 8000). (3) **Bug fix:** `GET /api/swap/lock` 404 (was shadowed by `/api/swap/{job_id}`; lock routes now register first). Code review caught a real P1 (the `WebhookNotifier` snapshot — fixed via `swap_webhook.update()` after reload, regression-tested). 157 pytest + live smoke all green. - **Next on this thread (small, externally gated):** (a) **adopter reply is drafted** (in the session — corrects the vLLM-port misconception → set 8000 in the gear, confirms the port knobs + swap/lock fix, asks the disk-scan diagnostic) — **pending Grant to send** + pick the distribution-channel wording. 
(b) **Optional Gitea tag + `make release`** so the adopter can pull v0.27 from Gitea Releases (NOT done this session — only registry + sideload shipped); do it only if that adopter pulls from Gitea Releases rather than subscribing to Clankistry. (c) **Un-diagnosed:** adopter's disk-scan shows Gemma "not on disk" — needs them to run `ls ~/.cache/huggingface/hub` as the SSH user vs `disk.py`'s `$HOME/.cache/huggingface/hub` assumption (likely a custom `HF_HOME`/container-volume/different-user cache path → would need a configurable cache path). - **Live: v0.26.0:0 — disk-driven model menu** (installed on the server 2026-06-18, `installed-version` confirms; also published to the self-hosted StartOS registry). The dashboard lists what's *actually downloaded* on the Sparks; `models.yaml`/overrides are **launch recipes** matched by `repo`, not the menu; an on-disk model with no recipe shows `needs_setup` and infers its launch flags from `config.json` (operator confirms once). Delete removes weights **and** the card; dropped the two legacy Qwen recipes. Architecture (`discovery.py`/`build_menu`/`infer_recipe`, the recipe-vs-disk split) is in the fastapi-image guide. diff --git a/image/app/download.py b/image/app/download.py index 8d010f3..6cfaf1c 100644 --- a/image/app/download.py +++ b/image/app/download.py @@ -23,6 +23,20 @@ from .ssh import ssh_stream, StreamHandle Mode = Literal["spark1", "spark2", "cluster"] +def build_download_command(repo: str, flags: str = "") -> str: + """Remote shell command that drives hf-download.sh on a Spark. + + Prepends ~/.local/bin to PATH. hf-download.sh shells out to `uvx` (Astral's + uv), and the official uv installer drops its binaries in ~/.local/bin — but + our SSH session is non-interactive, so it never sources the user's profile + and ~/.local/bin is off PATH, leaving `uvx` as "command not found". $HOME + expands server-side, so this stays correct for any adopter/user. 
`repo` is + shlex-quoted at the sink (validate_repo gates the charset upstream). + """ + serve = f"./hf-download.sh {quote_arg(repo)} {flags}".strip() + return f'export PATH="$HOME/.local/bin:$PATH" && cd ~/spark-vllm-docker && {serve}' + + _TQDM_RE = re.compile( r"(\d+(?:\.\d+)?)\s*%\s*\|.*?\|\s*" r"([\d.]+[KMG]?B?)\s*/\s*([\d.]+[KMG]?B?)\s*" @@ -126,7 +140,7 @@ class DownloadManager: if not target_host or not target_user: raise RuntimeError(f"{job.mode} host not configured") - cmd = f"cd ~/spark-vllm-docker && ./hf-download.sh {quote_arg(job.repo)} {flags}".strip() + cmd = build_download_command(job.repo, flags) job.append(f"$ {cmd}") job.state = "downloading" job.progress.phase = "Connecting to Hugging Face…" diff --git a/image/tests/test_download.py b/image/tests/test_download.py new file mode 100644 index 0000000..4b4bdc2 --- /dev/null +++ b/image/tests/test_download.py @@ -0,0 +1,35 @@ +"""build_download_command: the ~/.local/bin PATH fix + shell-injection quoting. + +hf-download.sh on the Spark shells out to `uvx`, which the uv installer puts in +~/.local/bin — off the PATH of our non-interactive SSH session. The command must +prepend ~/.local/bin (via $HOME, expanded server-side) or the download dies with +"uvx: command not found". The repo value must also be shlex-quoted at the sink so +a crafted value can't break out of the command (validate_repo gates it upstream). 
+""" +import shlex + +from app.download import build_download_command + + +def test_prepends_local_bin_to_path(): + cmd = build_download_command("org/name") + assert cmd.startswith('export PATH="$HOME/.local/bin:$PATH" && ') + assert "cd ~/spark-vllm-docker" in cmd + assert "./hf-download.sh org/name" in cmd + + +def test_no_trailing_space_without_flags(): + assert build_download_command("org/name", "").endswith("./hf-download.sh org/name") + + +def test_cluster_flags_appended(): + cmd = build_download_command("org/name", "-c --copy-parallel") + assert cmd.endswith("./hf-download.sh org/name -c --copy-parallel") + + +def test_repo_is_shlex_quoted(): + # Everything after the script name must shlex-split back to the exact repo, + # the same round-trip invariant build_launch_command relies on. + cmd = build_download_command("org/na;me") + after = cmd.split("./hf-download.sh ", 1)[1] + assert shlex.split(after) == ["org/na;me"] diff --git a/package/startos/versions/v0_1_0.ts b/package/startos/versions/v0_1_0.ts index 4efdc36..30c7abf 100644 --- a/package/startos/versions/v0_1_0.ts +++ b/package/startos/versions/v0_1_0.ts @@ -1,10 +1,10 @@ import { VersionInfo, IMPOSSIBLE } from '@start9labs/start-sdk' export const v0_1_0 = VersionInfo.of({ - version: '0.27.0:0', + version: '0.27.1:0', releaseNotes: { en_US: - 'v0.27.0:0 — settings move into the dashboard, plus two bug fixes. (1) New ⚙ Settings gear in the dashboard: all the optional cluster knobs — vLLM and support-service ports, container names, Parakeet/Kokoro/embeddings/Qdrant hosts, Open WebUI link, NGC key, swap webhook — are now edited here, in plain English, and apply immediately without a restart. The StartOS "Configure Sparks" action is now just the four required fields (two Spark IPs + SSH users); your existing optional values migrate into the gear automatically on first launch, and the settings are stored on the server and included in StartOS backups. (2) NEW: support-service ports are now configurable. 
If your vLLM runs on 8000 (vLLM\'s own default) and you moved Parakeet to another port, set them under ⚙ Settings → that fixes the false "vLLM down" and the Parakeet 404 some setups saw. (3) Bug fix: GET /api/swap/lock returned 404 (a routing bug where it was shadowed by the swap-job lookup); the swap reservation status now reads correctly. No breaking consumer-API changes; the /v1 proxy and swap API are unchanged.', + 'v0.27.1:0 — bug fix: "Download a new model" now works on its own. The downloader on the Spark relies on a helper tool (uvx, part of Astral\'s uv) that the standard installer places under your home directory in ~/.local/bin. Spark Control runs downloads over an automated SSH session that wasn\'t looking there, so a download failed immediately with "uvx: command not found" even though the tool was installed. Spark Control now includes ~/.local/bin on the path when it runs a download, so the Download button works with no manual setup. No other changes; the /v1 proxy, swap, and coordination APIs are unchanged.', }, migrations: { up: async ({ effects }) => {},