"""Validation + safe-quoting for user-supplied values that cross into SSH shell
commands on the Sparks.

Two layers of defense (same spirit as disk.py's `_SAFE_DIRNAME`):

  1. Validate at the API boundary against a strict whitelist — rejects junk
     early with a clear error, and guarantees the value carries no shell
     metacharacters (so it is also safe to drop into echo/log lines).
  2. `quote_arg` / `quote_args` at the actual interpolation site — the real
     guarantee: even a value that somehow skips validation cannot break out of
     the command.

Rule: anything user-controlled that ends up in an `ssh_run` / `ssh_stream`
command string must go through one of these, never be raw f-string'd.
"""

from __future__ import annotations

import re
import shlex

# Hugging Face repo 'org/name'. HF identifiers allow letters, digits, dot, dash,
# underscore; exactly one slash separates org from name.
# NOTE(review): the pattern itself lets either side start with '.' or '-' (so
# '../x' and '-x/y' match) — confirm no caller joins the repo into a filesystem
# path or passes it where a leading '-' would be read as an option.
_HF_REPO_RE = re.compile(r"^[A-Za-z0-9._-]+/[A-Za-z0-9._-]+$")

# Docker/OCI image reference: registry/path/name[:tag][@sha256:digest].
# Conservative charset covering e.g. nvcr.io/nim/nvidia/parakeet-...:latest and
# @digest pins; excludes every shell metacharacter. The first character must be
# alphanumeric, so a reference can never begin with '-'.
_IMAGE_RE = re.compile(r"^[A-Za-z0-9][A-Za-z0-9._:/@-]*$")

# Docker container / volume name (Docker's own rule). Leading character is
# alphanumeric; the rest may add '_', '.', and '-'.
_CONTAINER_RE = re.compile(r"^[A-Za-z0-9][A-Za-z0-9_.-]*$")

# Absolute filesystem path to a local model directory on a Spark. Conservative
# charset (letters, digits, and safe path punctuation) with a required leading
# '/', so it carries no shell metacharacters and no whitespace. Traversal ('.'
# and '..' segments) is rejected separately in validate_local_path.
_LOCAL_PATH_RE = re.compile(r"^/[A-Za-z0-9._+/-]+$")


def validate_repo(repo: str) -> str:
    """Return `repo` if it is a well-formed 'org/name'; else raise ValueError.

    Non-string input (None, bytes, numbers, containers) is rejected with the
    same ValueError rather than leaking a TypeError from the regex engine, so
    API-boundary callers only ever have one exception type to handle.

    NOTE(review): unlike the other validators in this module, no length cap
    is applied here — confirm whether one is wanted.
    """
    if not isinstance(repo, str) or not _HF_REPO_RE.fullmatch(repo):
        raise ValueError(f"invalid model repo (expected 'org/name'): {repo!r}")
    return repo


def validate_image(image: str) -> str:
    """Return `image` if it is a well-formed container image ref; else ValueError.

    Rejects non-string, empty, over-long (> 512 characters) and
    pattern-violating values, always with ValueError.
    """
    if (
        not isinstance(image, str)  # bytes/int/list would otherwise raise TypeError below
        or not image
        or len(image) > 512
        or not _IMAGE_RE.fullmatch(image)
    ):
        raise ValueError(f"invalid container image reference: {image!r}")
    return image


def validate_container(name: str) -> str:
    """Return `name` if it is a valid Docker container/volume name; else ValueError.

    Rejects non-string, empty, over-long (> 128 characters) and
    pattern-violating values, always with ValueError.
    """
    if (
        not isinstance(name, str)  # bytes/int/list would otherwise raise TypeError below
        or not name
        or len(name) > 128
        or not _CONTAINER_RE.fullmatch(name)
    ):
        raise ValueError(f"invalid container name: {name!r}")
    return name


def validate_local_path(path: str) -> str:
    """Return `path` if it is a safe absolute model directory path; else ValueError.

    For locally fine-tuned models served by directory (not an HF repo).
    Requires a string holding an absolute path of at most 512 characters, a
    metacharacter-free charset, and no '.'/'..' segments so a caller cannot
    traverse out of an intended models directory. The `quote_arg` sink still
    quotes it as defense in depth — this is the boundary check.

    Raises:
        ValueError: if `path` is not a string (including None), is empty, too
            long, not absolute, contains a character outside the allowed set,
            or contains a '.' or '..' segment.
    """
    if (
        not isinstance(path, str)  # keep the documented ValueError contract
        or len(path) > 512
        or not _LOCAL_PATH_RE.fullmatch(path)
    ):
        raise ValueError(
            f"invalid local model path (expected an absolute path, no spaces or "
            f"shell metacharacters): {path!r}"
        )
    # The charset permits dots, so traversal segments need their own check.
    if any(seg in (".", "..") for seg in path.split("/")):
        raise ValueError(f"local model path must not contain '.' or '..' segments: {path!r}")
    return path


def quote_arg(value: object) -> str:
    """shlex.quote a single token for safe embedding in a shell command string.

    `value` is converted with `str()` first, so numbers and other objects are
    accepted.
    """
    return shlex.quote(str(value))


def quote_args(values: object) -> str:
    """shlex.quote each token and join with spaces.

    `values` is an iterable of tokens; each is converted with `str()` and
    quoted individually. A bare `str` is treated as ONE token rather than
    being iterated character by character (which would turn "a b" into
    "a ' ' b"); the result is then the same as `quote_arg(values)`.
    """
    if isinstance(values, str):
        return shlex.quote(values)
    return " ".join(shlex.quote(str(v)) for v in values)  # type: ignore[union-attr]