"""User overrides + custom catalog entries, persisted on the package's main volume. File: /data/models-overrides.yaml (only when something has been customized). Survives package updates because /data is the StartOS volume. Shape: knobs: qwen36: max_model_len: 65536 gpu_memory_utilization: 0.85 fastsafetensors: true prefix_caching: true kv_cache_dtype: fp8 # one of: 'fp8' | 'auto' | null custom: - key: my-new-model display_name: My New Model (from download) repo: my-org/my-model size_gb: 20 mode: solo description: null capabilities: [] knobs: max_model_len: 32768 gpu_memory_utilization: 0.85 fastsafetensors: true prefix_caching: true kv_cache_dtype: fp8 """ from __future__ import annotations import os from pathlib import Path from typing import Any, Optional import yaml KNOWN_FLAG_MAP = { "max_model_len": "--max-model-len", "gpu_memory_utilization": "--gpu-memory-utilization", } def _path() -> str: # Container: /data/models-overrides.yaml. Local dev: cwd or env override. return os.environ.get("MODELS_OVERRIDES", "/data/models-overrides.yaml") def load_overrides() -> dict: p = _path() try: with open(p) as f: data = yaml.safe_load(f) or {} except FileNotFoundError: return {"knobs": {}, "custom": []} data.setdefault("knobs", {}) data.setdefault("custom", []) return data def save_overrides(data: dict) -> None: p = _path() parent = Path(p).parent parent.mkdir(parents=True, exist_ok=True) with open(p, "w") as f: yaml.safe_dump(data, f, sort_keys=False) def set_knobs(key: str, knobs: dict) -> dict: data = load_overrides() if not knobs: data["knobs"].pop(key, None) else: data["knobs"][key] = knobs save_overrides(data) return data def add_custom(entry: dict) -> dict: data = load_overrides() # Replace if same key already exists data["custom"] = [c for c in data["custom"] if c.get("key") != entry["key"]] data["custom"].append(entry) save_overrides(data) return data def delete_custom(key: str) -> dict: data = load_overrides() data["custom"] = [c for c in data["custom"] if c.get("key") != key] data["knobs"].pop(key, None) save_overrides(data) return data def apply_knobs_to_args(vllm_args: list[str], knobs: Optional[dict]) -> list[str]: """Return a new vllm_args list with knob values overriding matching flags.""" if not knobs: return list(vllm_args) skip_prefixes: list[str] = [] for k, flag in KNOWN_FLAG_MAP.items(): if k in knobs: skip_prefixes.append(f"{flag}=") skip_prefixes.append(flag + " ") # rare style # opt-in flag toggles if "fastsafetensors" in knobs: skip_prefixes.append("--load-format=") if "prefix_caching" in knobs: skip_prefixes.append("--enable-prefix-caching") if "kv_cache_dtype" in knobs: skip_prefixes.append("--kv-cache-dtype=") filtered = [a for a in vllm_args if not any(a.startswith(p) or a == p.rstrip("= ") for p in skip_prefixes)] for k, flag in KNOWN_FLAG_MAP.items(): if k in knobs and knobs[k] is not None: filtered.append(f"{flag}={knobs[k]}") if knobs.get("fastsafetensors"): filtered.append("--load-format=fastsafetensors") if knobs.get("prefix_caching"): filtered.append("--enable-prefix-caching") kvd = knobs.get("kv_cache_dtype") if kvd and kvd != "auto": filtered.append(f"--kv-cache-dtype={kvd}") return filtered def extract_knobs_from_args(vllm_args: list[str]) -> dict: """Reverse: read default knob values from a model's bundled vllm_args, so the UI has a starting point even before the user has saved any overrides.""" knobs: dict[str, Any] = {} for a in vllm_args: if a.startswith("--max-model-len="): try: knobs["max_model_len"] = int(a.split("=", 1)[1]) except ValueError: pass elif a.startswith("--gpu-memory-utilization="): try: knobs["gpu_memory_utilization"] = float(a.split("=", 1)[1]) except ValueError: pass elif a == "--load-format=fastsafetensors": knobs["fastsafetensors"] = True elif a == "--enable-prefix-caching": knobs["prefix_caching"] = True elif a.startswith("--kv-cache-dtype="): knobs["kv_cache_dtype"] = a.split("=", 1)[1] return knobs