v0.8.1:1 - fix disk probe: $HOME wasn't expanding inside shlex.quote

The 0.8.1:0 probe wrapped the entire path (including $HOME) in
shlex.quote, which produces single quotes — preventing shell
variable expansion. The resulting `[ -d '$HOME/.cache/...' ]` test
looked for a literal path starting with the string $HOME and
always failed, so every model reported as "not downloaded" and no
trash icons rendered.

Fix: embed $HOME in a double-quoted shell context (which allows
expansion) and validate the cache dirname against a whitelist
[A-Za-z0-9._-]+ rather than relying on shlex quoting. The dirname
is fully constrained by HF's naming rules + our org--name munging,
so the whitelist is tight enough.

Verified against Spark 1: probe now correctly reports the
25,075,981,924 bytes (23.4 GB) of Qwen3.6's cache dir.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Keysat
2026-05-13 17:58:43 -05:00
parent 9ff7ee9c1e
commit 513c78bfa5
2 changed files with 31 additions and 27 deletions
+29 -25
View File
@@ -10,7 +10,7 @@ model or one tied to an in-flight swap/download.
""" """
from __future__ import annotations from __future__ import annotations
import asyncio import asyncio
import shlex import re
from dataclasses import dataclass from dataclasses import dataclass
from typing import Optional from typing import Optional
@@ -18,17 +18,21 @@ from .config import Settings
from .ssh import ssh_run from .ssh import ssh_run
# HF cache dirnames are `models--<org>--<name>` where <org> and <name> only contain
# Hugging Face's allowed identifier chars: letters, digits, dot, dash, underscore.
# Validate against this whitelist so we can safely embed the dirname into a shell
# command without quoting (we need $HOME outside the quotes to expand).
_SAFE_DIRNAME = re.compile(r"^[A-Za-z0-9._\-]+$")
def repo_to_cache_dirname(repo: str) -> str: def repo_to_cache_dirname(repo: str) -> str:
"""Convert 'org/name' to 'models--org--name' (the HF hub cache directory).""" """Convert 'org/name' to 'models--org--name' (the HF hub cache directory)."""
if "/" not in repo: if "/" not in repo:
raise ValueError(f"repo must be in 'org/name' form: {repo!r}") raise ValueError(f"repo must be in 'org/name' form: {repo!r}")
return "models--" + repo.replace("/", "--") dn = "models--" + repo.replace("/", "--")
if not _SAFE_DIRNAME.fullmatch(dn):
raise ValueError(f"unsafe cache dirname (rejected by whitelist): {dn!r}")
def _cache_path(repo: str) -> str: return dn
"""Full remote path to the model's cache directory."""
# Use $HOME so it resolves correctly regardless of the SSH user's home.
return f"$HOME/.cache/huggingface/hub/{repo_to_cache_dirname(repo)}"
@dataclass @dataclass
@@ -51,13 +55,13 @@ async def probe_host(host: str, user: str, repo: str, settings: Settings) -> Hos
"""Return whether the model's cache dir exists on this host and its size.""" """Return whether the model's cache dir exists on this host and its size."""
if not host or not user: if not host or not user:
return HostDiskResult(host=host or "?", on_disk=False, error="host not configured") return HostDiskResult(host=host or "?", on_disk=False, error="host not configured")
path = _cache_path(repo) dn = repo_to_cache_dirname(repo) # whitelisted; safe to embed
# `du -sb` prints bytes; if the dir doesn't exist, `du` returns non-zero. # $HOME must expand server-side, so we build the path with double quotes
# We test existence explicitly first so we can report on_disk=False cleanly. # (which DO allow variable expansion) rather than shlex.quote single quotes.
cmd = ( cmd = (
f"if [ -d {shlex.quote(path)} ]; then " f'P="$HOME/.cache/huggingface/hub/{dn}"; '
f"du -sb {shlex.quote(path)} 2>/dev/null | awk '{{print $1}}'; " f'if [ -d "$P" ]; then du -sb "$P" 2>/dev/null | cut -f1; '
f"else echo MISSING; fi" f'else echo MISSING; fi'
) )
rc, out, err = await ssh_run(host, user, cmd, settings, timeout=20.0) rc, out, err = await ssh_run(host, user, cmd, settings, timeout=20.0)
if rc != 0: if rc != 0:
@@ -88,19 +92,19 @@ async def delete_host(host: str, user: str, repo: str, settings: Settings) -> Ho
"""Probe + rm -rf on one host. Returns bytes freed (0 if the dir wasn't there).""" """Probe + rm -rf on one host. Returns bytes freed (0 if the dir wasn't there)."""
if not host or not user: if not host or not user:
return HostDiskResult(host=host or "?", on_disk=False, error="host not configured") return HostDiskResult(host=host or "?", on_disk=False, error="host not configured")
path = _cache_path(repo) dn = repo_to_cache_dirname(repo) # whitelisted; safe to embed
# Safety: hard-code the prefix in the command so a bad `repo` can never escape.
# Compute size first, then remove. If absent, still return success (idempotent). # Compute size first, then remove. If absent, still return success (idempotent).
# $HOME is in double-quoted context so it expands; the dirname is whitelisted.
cmd = ( cmd = (
f"set -e; " f'set -e; '
f"P={shlex.quote(path)}; " f'P="$HOME/.cache/huggingface/hub/{dn}"; '
f"if [ -d \"$P\" ]; then " f'if [ -d "$P" ]; then '
f" SIZE=$(du -sb \"$P\" 2>/dev/null | awk '{{print $1}}'); " f' SIZE=$(du -sb "$P" 2>/dev/null | cut -f1); '
f" rm -rf -- \"$P\"; " f' rm -rf -- "$P"; '
f" echo FREED $SIZE; " f' echo "FREED $SIZE"; '
f"else " f'else '
f" echo FREED 0; " f' echo "FREED 0"; '
f"fi" f'fi'
) )
rc, out, err = await ssh_run(host, user, cmd, settings, timeout=120.0) rc, out, err = await ssh_run(host, user, cmd, settings, timeout=120.0)
if rc != 0: if rc != 0:
+2 -2
View File
@@ -1,10 +1,10 @@
import { VersionInfo, IMPOSSIBLE } from '@start9labs/start-sdk' import { VersionInfo, IMPOSSIBLE } from '@start9labs/start-sdk'
export const v0_1_0 = VersionInfo.of({ export const v0_1_0 = VersionInfo.of({
version: '0.8.1:0', version: '0.8.1:1',
releaseNotes: { releaseNotes: {
en_US: en_US:
'v0.8.1: model weights can now be deleted from disk directly from the dashboard. Each model card shows whether the weights are present (with on-disk GB size) or not yet downloaded. When present and the model is NOT currently loaded, a small trash icon appears on the card; clicking it pops a confirmation showing how many GB will be freed and on which Spark(s), then runs `rm -rf` on the Hugging Face cache directory via SSH. Cluster-mode models are deleted from both Sparks; solo-mode from Spark 1 only. Safety rails: refuses to delete the currently-loaded model, refuses during an in-flight swap or download, and the catalog entry stays — you can always re-download. Disk status is probed once on dashboard load and re-checked every 60s.', 'v0.8.1:1 — fix: the disk-status probe shipped in 0.8.1:0 was wrapping $HOME in single quotes via shlex.quote, which prevented shell variable expansion. Result: every model reported as "not downloaded" even when weights were on disk, so no trash icons appeared. Rewritten to embed $HOME in double-quoted shell context and validate the cache dirname against a whitelist. The trash icons now show up correctly. v0.8.1:0 features: per-card disk-presence pills (on disk · GB / not downloaded), trash icon to rm -rf the HF cache directory via SSH with a confirmation dialog. Safety rails unchanged: refuses to delete the currently-loaded model or during an in-flight swap/download; catalog entry persists for re-download.',
}, },
migrations: { migrations: {
up: async ({ effects }) => {}, up: async ({ effects }) => {},