v0.23.0:0 - local / fine-tuned model support
Add models that live as a directory on a Spark (e.g. LoRA-merged fine-tunes), not just Hugging Face repos.
- ModelDef gains local_path; a model must set exactly one of repo / local_path. The validator also enforces the local-path whitelist and that any --chat-template lives inside local_path (only that dir is mounted).
- build_launch_command bind-mounts the dir into the vLLM container at the SAME host==container path via the launch script's VLLM_SPARK_EXTRA_DOCKER_ARGS hook, then `vllm serve <dir>`. No launch-cluster.sh change (verified that upstream expands that variable unquoted; contract noted in runbook.md).
- shellsafe.validate_local_path: absolute path, charset whitelist, no '.'/'..'.
- POST /api/models validates the full entry via ModelDef before persisting, so a bad entry can't be written and then break catalog load; _merge_overrides skips an invalid override entry instead of failing the whole catalog.
- disk.py size-probes a local path with du; disk-delete is refused for local models.
- UI: "+ Add local model" dialog, `local` badge, path shown instead of an HF link, delete button hidden for local models.
- Tests: local launch + injection round-trip, chat-template location, traversal, exactly-one-source, _merge_overrides skip-invalid (94 pass). Reviewer-agent pass; findings addressed.
This commit is contained in:
+41
-4
@@ -15,6 +15,7 @@ from dataclasses import dataclass
|
||||
from typing import Optional
|
||||
|
||||
from .config import Settings
|
||||
from .shellsafe import quote_arg
|
||||
from .ssh import ssh_run
|
||||
|
||||
|
||||
@@ -76,16 +77,52 @@ async def probe_host(host: str, user: str, repo: str, settings: Settings) -> Hos
|
||||
return HostDiskResult(host=host, on_disk=True, size_bytes=size)
|
||||
|
||||
|
||||
async def probe_disk(repo: str, mode: str, settings: Settings) -> DiskStatus:
|
||||
"""Probe one model across the relevant Sparks based on its mode (solo|cluster)."""
|
||||
async def probe_local_host(host: str, user: str, path: str, settings: Settings) -> HostDiskResult:
    """Check one host for a local model directory and measure its size.

    Used for locally fine-tuned models, which live in a plain directory on a
    Spark rather than in the HF cache. ``path`` is expected to have been
    whitelisted at the API boundary (shellsafe.validate_local_path); it is
    shell-quoted again here as defense in depth.

    Returns a HostDiskResult with ``on_disk=True`` and ``size_bytes`` set when
    the remote command prints a parsable byte count. Otherwise ``on_disk`` is
    False, with ``error`` filled in when the host/user is missing, the SSH
    command exits non-zero, or the output is not an integer.
    """
    if not (host and user):
        return HostDiskResult(host=host or "?", on_disk=False, error="host not configured")

    quoted = quote_arg(path)
    # Single round trip: print the byte count when the directory exists,
    # otherwise print a sentinel the caller can recognise.
    cmd = f"if [ -d {quoted} ]; then du -sb {quoted} 2>/dev/null | cut -f1; else echo MISSING; fi"
    rc, out, err = await ssh_run(host, user, cmd, settings, timeout=20.0)
    if rc != 0:
        detail = (err or out).strip() or f"rc={rc}"
        return HostDiskResult(host=host, on_disk=False, error=detail)

    text = out.strip()
    # Empty output is treated the same as the sentinel: not on disk, no error.
    if text in ("MISSING", ""):
        return HostDiskResult(host=host, on_disk=False)

    try:
        # Only the last output line is parsed as the size.
        size = int(text.splitlines()[-1])
    except ValueError:
        return HostDiskResult(host=host, on_disk=False, error=f"unparsable du output: {text!r}")
    return HostDiskResult(host=host, on_disk=True, size_bytes=size)
|
||||
|
||||
|
||||
async def probe_disk(
    repo: str, mode: str, settings: Settings, *, local_path: str | None = None
) -> DiskStatus:
    """Probe one model across the relevant Sparks based on its mode (solo|cluster).

    A local model (local_path set) is probed by directory; otherwise by HF cache.

    Spark 1 is always probed; Spark 2 is added only when ``mode`` is
    ``"cluster"`` and ``settings.spark2_host`` is set. Hosts are probed
    concurrently. The returned DiskStatus is keyed (its ``repo`` field) by
    ``local_path`` for local models and by ``repo`` otherwise; ``on_disk`` is
    True if any host has the model, and ``total_bytes`` sums the per-host sizes.
    """
    hosts: list[tuple[str, str]] = [(settings.spark1_host, settings.spark1_user)]
    if mode == "cluster" and settings.spark2_host:
        hosts.append((settings.spark2_host, settings.spark2_user))

    # Exactly one probe per host: by directory for a local model, by HF cache
    # otherwise. (No unconditional HF-cache probe before this branch.)
    if local_path:
        results = await asyncio.gather(
            *(probe_local_host(h, u, local_path, settings) for h, u in hosts)
        )
        key = local_path
    else:
        results = await asyncio.gather(*(probe_host(h, u, repo, settings) for h, u in hosts))
        key = repo

    on_disk = any(r.on_disk for r in results)
    total = sum(r.size_bytes for r in results)
    return DiskStatus(repo=key, on_disk=on_disk, total_bytes=total, per_host=list(results))
|
||||
|
||||
|
||||
async def delete_host(host: str, user: str, repo: str, settings: Settings) -> HostDiskResult:
|
||||
|
||||
Reference in New Issue
Block a user