v0.23.0:0 - local / fine-tuned model support

Add models that live as a directory on a Spark (e.g. LoRA-merged fine-tunes),
not just Hugging Face repos.

- ModelDef gains local_path; a model must set exactly one of repo / local_path.
  The validator also enforces the local-path whitelist and that any
  --chat-template lives inside local_path (only that dir is mounted).
- build_launch_command bind-mounts the dir into the vLLM container at the SAME
  host==container path via the launch script's VLLM_SPARK_EXTRA_DOCKER_ARGS hook,
  then `vllm serve <dir>`. No launch-cluster.sh change (verified the upstream
  expands that var unquoted; contract noted in runbook.md).
- shellsafe.validate_local_path: absolute path, charset whitelist, no '.'/'..'.
- POST /api/models validates the full entry via ModelDef before persisting, so a
  bad entry can't be written and then break catalog load; _merge_overrides skips
  an invalid override entry instead of failing the whole catalog.
- disk.py size-probes a local path with du; disk-delete refused for local models.
- UI: "+ Add local model" dialog, `local` badge, path shown instead of an HF
  link, delete button hidden for local models.
- Tests: local launch + injection round-trip, chat-template location, traversal,
  exactly-one-source, _merge_overrides skip-invalid (94 pass). Reviewer-agent
  pass; findings addressed.
This commit is contained in:
Keysat
2026-06-17 22:27:41 -05:00
parent 57a893000e
commit e783653ef0
14 changed files with 402 additions and 26 deletions
+41 -4
View File
@@ -15,6 +15,7 @@ from dataclasses import dataclass
from typing import Optional
from .config import Settings
from .shellsafe import quote_arg
from .ssh import ssh_run
@@ -76,16 +77,52 @@ async def probe_host(host: str, user: str, repo: str, settings: Settings) -> Hos
return HostDiskResult(host=host, on_disk=True, size_bytes=size)
async def probe_disk(repo: str, mode: str, settings: Settings) -> DiskStatus:
"""Probe one model across the relevant Sparks based on its mode (solo|cluster)."""
async def probe_local_host(host: str, user: str, path: str, settings: Settings) -> HostDiskResult:
"""Return whether a local model directory exists on this host and its size.
For locally fine-tuned models (a Spark directory, not an HF cache entry). The
path is whitelisted at the API boundary (shellsafe.validate_local_path); we
shlex-quote it here in depth.
"""
if not host or not user:
return HostDiskResult(host=host or "?", on_disk=False, error="host not configured")
qp = quote_arg(path)
cmd = f"if [ -d {qp} ]; then du -sb {qp} 2>/dev/null | cut -f1; else echo MISSING; fi"
rc, out, err = await ssh_run(host, user, cmd, settings, timeout=20.0)
if rc != 0:
return HostDiskResult(host=host, on_disk=False, error=(err or out).strip() or f"rc={rc}")
raw = out.strip()
if raw == "MISSING" or raw == "":
return HostDiskResult(host=host, on_disk=False)
try:
size = int(raw.splitlines()[-1])
except ValueError:
return HostDiskResult(host=host, on_disk=False, error=f"unparsable du output: {raw!r}")
return HostDiskResult(host=host, on_disk=True, size_bytes=size)
async def probe_disk(
repo: str, mode: str, settings: Settings, *, local_path: str | None = None
) -> DiskStatus:
"""Probe one model across the relevant Sparks based on its mode (solo|cluster).
A local model (local_path set) is probed by directory; otherwise by HF cache.
"""
hosts: list[tuple[str, str]] = [(settings.spark1_host, settings.spark1_user)]
if mode == "cluster" and settings.spark2_host:
hosts.append((settings.spark2_host, settings.spark2_user))
results = await asyncio.gather(*(probe_host(h, u, repo, settings) for h, u in hosts))
if local_path:
results = await asyncio.gather(
*(probe_local_host(h, u, local_path, settings) for h, u in hosts)
)
key = local_path
else:
results = await asyncio.gather(*(probe_host(h, u, repo, settings) for h, u in hosts))
key = repo
on_disk = any(r.on_disk for r in results)
total = sum(r.size_bytes for r in results)
return DiskStatus(repo=repo, on_disk=on_disk, total_bytes=total, per_host=list(results))
return DiskStatus(repo=key, on_disk=on_disk, total_bytes=total, per_host=list(results))
async def delete_host(host: str, user: str, repo: str, settings: Settings) -> HostDiskResult: