v0.8.1:1 - fix disk probe: $HOME wasn't expanding inside shlex.quote
The 0.8.1:0 probe wrapped the entire path (including $HOME) in shlex.quote, which produces single quotes — preventing shell variable expansion. The resulting `[ -d '$HOME/.cache/...' ]` test looked for a literal path starting with the string $HOME and always failed, so every model reported as "not downloaded" and no trash icons rendered. Fix: embed $HOME in a double-quoted shell context (which allows expansion) and validate the cache dirname against a whitelist [A-Za-z0-9._-]+ rather than relying on shlex quoting. The dirname is fully constrained by HF's naming rules + our org--name munging, so the whitelist is tight enough. Verified against Spark 1: probe now correctly reports the 25,075,981,924 bytes (23.4 GB) of Qwen3.6's cache dir. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
+29
-25
@@ -10,7 +10,7 @@ model or one tied to an in-flight swap/download.
|
|||||||
"""
|
"""
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
import asyncio
|
import asyncio
|
||||||
import shlex
|
import re
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
@@ -18,17 +18,21 @@ from .config import Settings
|
|||||||
from .ssh import ssh_run
|
from .ssh import ssh_run
|
||||||
|
|
||||||
|
|
||||||
|
# HF cache dirnames are `models--<org>--<name>` where <org> and <name> only contain
|
||||||
|
# Hugging Face's allowed identifier chars: letters, digits, dot, dash, underscore.
|
||||||
|
# Validate against this whitelist so we can safely embed the dirname into a shell
|
||||||
|
# command without quoting (we need $HOME outside the quotes to expand).
|
||||||
|
_SAFE_DIRNAME = re.compile(r"^[A-Za-z0-9._\-]+$")
|
||||||
|
|
||||||
|
|
||||||
def repo_to_cache_dirname(repo: str) -> str:
|
def repo_to_cache_dirname(repo: str) -> str:
|
||||||
"""Convert 'org/name' to 'models--org--name' (the HF hub cache directory)."""
|
"""Convert 'org/name' to 'models--org--name' (the HF hub cache directory)."""
|
||||||
if "/" not in repo:
|
if "/" not in repo:
|
||||||
raise ValueError(f"repo must be in 'org/name' form: {repo!r}")
|
raise ValueError(f"repo must be in 'org/name' form: {repo!r}")
|
||||||
return "models--" + repo.replace("/", "--")
|
dn = "models--" + repo.replace("/", "--")
|
||||||
|
if not _SAFE_DIRNAME.fullmatch(dn):
|
||||||
|
raise ValueError(f"unsafe cache dirname (rejected by whitelist): {dn!r}")
|
||||||
def _cache_path(repo: str) -> str:
|
return dn
|
||||||
"""Full remote path to the model's cache directory."""
|
|
||||||
# Use $HOME so it resolves correctly regardless of the SSH user's home.
|
|
||||||
return f"$HOME/.cache/huggingface/hub/{repo_to_cache_dirname(repo)}"
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
@@ -51,13 +55,13 @@ async def probe_host(host: str, user: str, repo: str, settings: Settings) -> Hos
|
|||||||
"""Return whether the model's cache dir exists on this host and its size."""
|
"""Return whether the model's cache dir exists on this host and its size."""
|
||||||
if not host or not user:
|
if not host or not user:
|
||||||
return HostDiskResult(host=host or "?", on_disk=False, error="host not configured")
|
return HostDiskResult(host=host or "?", on_disk=False, error="host not configured")
|
||||||
path = _cache_path(repo)
|
dn = repo_to_cache_dirname(repo) # whitelisted; safe to embed
|
||||||
# `du -sb` prints bytes; if the dir doesn't exist, `du` returns non-zero.
|
# $HOME must expand server-side, so we build the path with double quotes
|
||||||
# We test existence explicitly first so we can report on_disk=False cleanly.
|
# (which DO allow variable expansion) rather than shlex.quote single quotes.
|
||||||
cmd = (
|
cmd = (
|
||||||
f"if [ -d {shlex.quote(path)} ]; then "
|
f'P="$HOME/.cache/huggingface/hub/{dn}"; '
|
||||||
f"du -sb {shlex.quote(path)} 2>/dev/null | awk '{{print $1}}'; "
|
f'if [ -d "$P" ]; then du -sb "$P" 2>/dev/null | cut -f1; '
|
||||||
f"else echo MISSING; fi"
|
f'else echo MISSING; fi'
|
||||||
)
|
)
|
||||||
rc, out, err = await ssh_run(host, user, cmd, settings, timeout=20.0)
|
rc, out, err = await ssh_run(host, user, cmd, settings, timeout=20.0)
|
||||||
if rc != 0:
|
if rc != 0:
|
||||||
@@ -88,19 +92,19 @@ async def delete_host(host: str, user: str, repo: str, settings: Settings) -> Ho
|
|||||||
"""Probe + rm -rf on one host. Returns bytes freed (0 if the dir wasn't there)."""
|
"""Probe + rm -rf on one host. Returns bytes freed (0 if the dir wasn't there)."""
|
||||||
if not host or not user:
|
if not host or not user:
|
||||||
return HostDiskResult(host=host or "?", on_disk=False, error="host not configured")
|
return HostDiskResult(host=host or "?", on_disk=False, error="host not configured")
|
||||||
path = _cache_path(repo)
|
dn = repo_to_cache_dirname(repo) # whitelisted; safe to embed
|
||||||
# Safety: hard-code the prefix in the command so a bad `repo` can never escape.
|
|
||||||
# Compute size first, then remove. If absent, still return success (idempotent).
|
# Compute size first, then remove. If absent, still return success (idempotent).
|
||||||
|
# $HOME is in double-quoted context so it expands; the dirname is whitelisted.
|
||||||
cmd = (
|
cmd = (
|
||||||
f"set -e; "
|
f'set -e; '
|
||||||
f"P={shlex.quote(path)}; "
|
f'P="$HOME/.cache/huggingface/hub/{dn}"; '
|
||||||
f"if [ -d \"$P\" ]; then "
|
f'if [ -d "$P" ]; then '
|
||||||
f" SIZE=$(du -sb \"$P\" 2>/dev/null | awk '{{print $1}}'); "
|
f' SIZE=$(du -sb "$P" 2>/dev/null | cut -f1); '
|
||||||
f" rm -rf -- \"$P\"; "
|
f' rm -rf -- "$P"; '
|
||||||
f" echo FREED $SIZE; "
|
f' echo "FREED $SIZE"; '
|
||||||
f"else "
|
f'else '
|
||||||
f" echo FREED 0; "
|
f' echo "FREED 0"; '
|
||||||
f"fi"
|
f'fi'
|
||||||
)
|
)
|
||||||
rc, out, err = await ssh_run(host, user, cmd, settings, timeout=120.0)
|
rc, out, err = await ssh_run(host, user, cmd, settings, timeout=120.0)
|
||||||
if rc != 0:
|
if rc != 0:
|
||||||
|
|||||||
@@ -1,10 +1,10 @@
|
|||||||
import { VersionInfo, IMPOSSIBLE } from '@start9labs/start-sdk'
|
import { VersionInfo, IMPOSSIBLE } from '@start9labs/start-sdk'
|
||||||
|
|
||||||
export const v0_1_0 = VersionInfo.of({
|
export const v0_1_0 = VersionInfo.of({
|
||||||
version: '0.8.1:0',
|
version: '0.8.1:1',
|
||||||
releaseNotes: {
|
releaseNotes: {
|
||||||
en_US:
|
en_US:
|
||||||
'v0.8.1: model weights can now be deleted from disk directly from the dashboard. Each model card shows whether the weights are present (with on-disk GB size) or not yet downloaded. When present and the model is NOT currently loaded, a small trash icon appears on the card; clicking it pops a confirmation showing how many GB will be freed and on which Spark(s), then runs `rm -rf` on the Hugging Face cache directory via SSH. Cluster-mode models are deleted from both Sparks; solo-mode from Spark 1 only. Safety rails: refuses to delete the currently-loaded model, refuses during an in-flight swap or download, and the catalog entry stays — you can always re-download. Disk status is probed once on dashboard load and re-checked every 60s.',
|
'v0.8.1:1 — fix: the disk-status probe shipped in 0.8.1:0 was wrapping $HOME in single quotes via shlex.quote, which prevented shell variable expansion. Result: every model reported as "not downloaded" even when weights were on disk, so no trash icons appeared. Rewritten to embed $HOME in double-quoted shell context and validate the cache dirname against a whitelist. The trash icons now show up correctly. v0.8.1:0 features: per-card disk-presence pills (on disk · GB / not downloaded), trash icon to rm -rf the HF cache directory via SSH with a confirmation dialog. Safety rails unchanged: refuses to delete the currently-loaded model or during an in-flight swap/download; catalog entry persists for re-download.',
|
||||||
},
|
},
|
||||||
migrations: {
|
migrations: {
|
||||||
up: async ({ effects }) => {},
|
up: async ({ effects }) => {},
|
||||||
|
|||||||
Reference in New Issue
Block a user