e783653ef0
Add models that live as a directory on a Spark (e.g. LoRA-merged fine-tunes), not just Hugging Face repos. - ModelDef gains local_path; a model must set exactly one of repo / local_path. The validator also enforces the local-path whitelist and that any --chat-template lives inside local_path (only that dir is mounted). - build_launch_command bind-mounts the dir into the vLLM container at the SAME host==container path via the launch script's VLLM_SPARK_EXTRA_DOCKER_ARGS hook, then `vllm serve <dir>`. No launch-cluster.sh change (verified the upstream expands that var unquoted; contract noted in runbook.md). - shellsafe.validate_local_path: absolute path, charset whitelist, no '.'/'..'. - POST /api/models validates the full entry via ModelDef before persisting, so a bad entry can't be written and then break catalog load; _merge_overrides skips an invalid override entry instead of failing the whole catalog. - disk.py size-probes a local path with du; disk-delete refused for local models. - UI: "+ Add local model" dialog, `local` badge, path shown instead of an HF link, delete button hidden for local models. - Tests: local launch + injection round-trip, chat-template location, traversal, exactly-one-source, _merge_overrides skip-invalid (94 pass). Reviewer-agent pass; findings addressed.
152 lines
4.9 KiB
Python
152 lines
4.9 KiB
Python
"""User overrides + custom catalog entries, persisted on the package's main volume.
|
|
|
|
File: /data/models-overrides.yaml (written only once something has been customized; the path can be overridden with the MODELS_OVERRIDES environment variable).
|
|
Survives package updates because /data is the StartOS volume.
|
|
|
|
Shape:
|
|
knobs:
|
|
qwen36:
|
|
max_model_len: 65536
|
|
gpu_memory_utilization: 0.85
|
|
fastsafetensors: true
|
|
prefix_caching: true
|
|
kv_cache_dtype: fp8 # one of: 'fp8' | 'auto' | null
|
|
custom:
|
|
- key: my-new-model
|
|
display_name: My New Model (from download)
|
|
repo: my-org/my-model # an HF repo; OR set local_path instead (exactly one)
|
|
size_gb: 20
|
|
mode: solo
|
|
description: null
|
|
capabilities: []
|
|
knobs:
|
|
max_model_len: 32768
|
|
gpu_memory_utilization: 0.85
|
|
fastsafetensors: true
|
|
prefix_caching: true
|
|
kv_cache_dtype: fp8
|
|
- key: my-finetune # a local/fine-tuned model (a directory on the Spark)
|
|
display_name: My Fine-tune
|
|
local_path: /home/you/models/my-finetune
|
|
size_gb: 59
|
|
mode: solo
|
|
vllm_args: [--chat-template=/home/you/models/my-finetune/chat_template.jinja]
|
|
"""
|
|
from __future__ import annotations
|
|
import os
|
|
from pathlib import Path
|
|
from typing import Any, Optional
|
|
import yaml
|
|
|
|
|
|
# Knob name -> vLLM CLI flag, for knobs that are rendered as `<flag>=<value>`.
# apply_knobs_to_args() iterates this table both to strip the stale flag and to
# append the overriding one, so insertion order is the order flags are emitted.
KNOWN_FLAG_MAP: dict[str, str] = {
    "max_model_len": "--max-model-len",
    "gpu_memory_utilization": "--gpu-memory-utilization",
}
|
|
|
|
|
|
def _path() -> str:
    """Return the location of the overrides YAML file.

    The ``MODELS_OVERRIDES`` environment variable wins whenever it is set
    (useful for local development); otherwise the in-container default under
    ``/data`` is returned.
    """
    try:
        return os.environ["MODELS_OVERRIDES"]
    except KeyError:
        return "/data/models-overrides.yaml"
|
|
|
|
|
|
def load_overrides() -> dict:
    """Load the overrides file, always returning both sections populated.

    Returns:
        The parsed YAML mapping, with ``knobs`` guaranteed to be present and
        not ``None`` (default ``{}``) and ``custom`` likewise (default ``[]``).
        A missing or empty file yields ``{"knobs": {}, "custom": []}``.

    Raises:
        ValueError: if the file parses to something other than a mapping
            (e.g. a bare list or scalar at the top level).
    """
    try:
        # YAML is UTF-8; don't depend on the process locale for decoding.
        with open(_path(), encoding="utf-8") as f:
            data = yaml.safe_load(f) or {}
    except FileNotFoundError:
        return {"knobs": {}, "custom": []}

    if not isinstance(data, dict):
        # Fail with a clear message rather than an AttributeError further
        # down; silently treating the file as empty would let the next save
        # overwrite whatever the user had in it.
        raise ValueError(
            f"overrides file {_path()!r} must contain a YAML mapping, "
            f"got {type(data).__name__}"
        )

    # A hand-edited file can leave a section empty ("knobs:"), which YAML
    # parses as None. setdefault() would keep that None and every caller that
    # does data["knobs"].pop(...) or iterates data["custom"] would crash.
    if data.get("knobs") is None:
        data["knobs"] = {}
    if data.get("custom") is None:
        data["custom"] = []
    return data
|
|
|
|
|
|
def save_overrides(data: dict) -> None:
    """Persist *data* as YAML at the overrides path, creating parent dirs.

    The document is dumped to a sibling temporary file first and then moved
    over the real file with ``os.replace``. Opening the real file with ``"w"``
    truncates it immediately, so a dump that fails part-way (unserializable
    value, full disk, process killed) would otherwise leave an empty or
    partial overrides file behind.

    Args:
        data: the full overrides document (``knobs`` + ``custom``).

    Raises:
        OSError: if the directory or file cannot be written.
        yaml.YAMLError: if *data* contains values ``safe_dump`` cannot represent.
    """
    target = Path(_path())
    target.parent.mkdir(parents=True, exist_ok=True)

    # Same directory as the target so the final rename stays on one filesystem.
    tmp = target.with_name(target.name + ".tmp")
    try:
        with open(tmp, "w", encoding="utf-8") as f:
            yaml.safe_dump(data, f, sort_keys=False)
        os.replace(tmp, target)
    except BaseException:
        # Don't leave a stray temp file around; the previous overrides file
        # (if any) is still intact at this point.
        try:
            os.unlink(tmp)
        except OSError:
            pass
        raise
|
|
|
|
|
|
def set_knobs(key: str, knobs: dict) -> dict:
    """Store or clear the knob overrides for one model key, then persist.

    A falsy *knobs* (empty dict / None) removes any saved overrides for *key*;
    otherwise *knobs* replaces the saved entry wholesale.

    Returns:
        The full overrides document as it was saved.
    """
    overrides = load_overrides()
    saved = overrides["knobs"]
    if knobs:
        saved[key] = knobs
    else:
        saved.pop(key, None)
    save_overrides(overrides)
    return overrides
|
|
|
|
|
|
def add_custom(entry: dict) -> dict:
    """Add a custom catalog entry, replacing any existing one with the same key.

    The new entry is always placed at the end of the ``custom`` list.

    Returns:
        The full overrides document as it was saved.
    """
    overrides = load_overrides()
    kept = []
    for existing in overrides["custom"]:
        # Drop whatever currently occupies this key; everything else stays.
        if existing.get("key") != entry["key"]:
            kept.append(existing)
    kept.append(entry)
    overrides["custom"] = kept
    save_overrides(overrides)
    return overrides
|
|
|
|
|
|
def delete_custom(key: str) -> dict:
    """Remove the custom entry named *key* along with its saved knobs.

    Deleting a key that does not exist is not an error; the file is rewritten
    either way.

    Returns:
        The full overrides document as it was saved.
    """
    overrides = load_overrides()
    overrides["custom"] = list(
        filter(lambda item: item.get("key") != key, overrides["custom"])
    )
    # Knob overrides are keyed by the same model key; don't leave them orphaned.
    overrides["knobs"].pop(key, None)
    save_overrides(overrides)
    return overrides
|
|
|
|
|
|
def apply_knobs_to_args(vllm_args: list[str], knobs: Optional[dict]) -> list[str]:
    """Return a new vllm_args list with knob values overriding matching flags.

    Every flag governed by a knob that is *present* in ``knobs`` is first
    stripped from ``vllm_args`` and then, depending on the knob's value,
    re-appended at the end. Flags whose knob is absent are left untouched.

    Args:
        vllm_args: the model's bundled vLLM arguments. Not mutated.
        knobs: user overrides. Recognised keys are those of KNOWN_FLAG_MAP
            (emitted as ``<flag>=<value>`` unless the value is None), plus
            ``fastsafetensors`` and ``prefix_caching`` (truthy adds the flag)
            and ``kv_cache_dtype`` (added unless falsy or ``"auto"``).

    Returns:
        A fresh list; a plain copy of ``vllm_args`` when ``knobs`` is falsy.
    """
    if not knobs:
        return list(vllm_args)

    # Flags that carry a value and are being overridden.
    value_flags = [flag for knob, flag in KNOWN_FLAG_MAP.items() if knob in knobs]
    if "fastsafetensors" in knobs:
        value_flags.append("--load-format")
    if "kv_cache_dtype" in knobs:
        value_flags.append("--kv-cache-dtype")
    # "--flag=value" and the rare single-string "--flag value" spellings.
    inline_prefixes = tuple(
        f"{flag}{sep}" for flag in value_flags for sep in ("=", " ")
    )
    strip_prefix_caching = "prefix_caching" in knobs

    filtered: list[str] = []
    drop_next_value = False
    for arg in vllm_args:
        if drop_next_value:
            drop_next_value = False
            if not arg.startswith("--"):
                # Value token of a bare "--flag value" pair that was just
                # removed; keeping it would leave a stray positional argument.
                continue
        if strip_prefix_caching and arg.startswith("--enable-prefix-caching"):
            continue
        if arg in value_flags:
            # Bare flag given as its own token: its value is the next token.
            drop_next_value = True
            continue
        if arg.startswith(inline_prefixes):
            continue
        filtered.append(arg)

    for knob, flag in KNOWN_FLAG_MAP.items():
        if knob in knobs and knobs[knob] is not None:
            filtered.append(f"{flag}={knobs[knob]}")

    if knobs.get("fastsafetensors"):
        filtered.append("--load-format=fastsafetensors")
    if knobs.get("prefix_caching"):
        filtered.append("--enable-prefix-caching")
    kv_dtype = knobs.get("kv_cache_dtype")
    # "auto" is expressed by omitting the flag altogether.
    if kv_dtype and kv_dtype != "auto":
        filtered.append(f"--kv-cache-dtype={kv_dtype}")

    return filtered
|
|
|
|
|
|
def extract_knobs_from_args(vllm_args: list[str]) -> dict:
    """Derive default knob values from a model's bundled vllm_args.

    This is the reverse of applying knobs: it gives the UI a starting point
    before the user has saved any overrides. Only the ``--flag=value``
    spelling (and the bare ``--enable-prefix-caching`` switch) is recognised.
    Numeric values that fail to parse are skipped, and a later occurrence of
    a flag overrides an earlier one.
    """
    # Flags whose value is converted with a numeric constructor.
    numeric_flags = {
        "--max-model-len": ("max_model_len", int),
        "--gpu-memory-utilization": ("gpu_memory_utilization", float),
    }

    found: dict[str, Any] = {}
    for arg in vllm_args:
        name, eq, value = arg.partition("=")
        if eq and name in numeric_flags:
            knob, convert = numeric_flags[name]
            try:
                found[knob] = convert(value)
            except ValueError:
                # Unparseable default: leave the knob unset.
                continue
        elif arg == "--load-format=fastsafetensors":
            found["fastsafetensors"] = True
        elif arg == "--enable-prefix-caching":
            found["prefix_caching"] = True
        elif eq and name == "--kv-cache-dtype":
            found["kv_cache_dtype"] = value
    return found
|