Files
spark-control/image/models.yaml
T
Grant 72bf754baa Pack spark-control_x86_64.s9pk (55 MB)
- Move models.yaml into image/ so the docker build context is self-contained
- Fix manifest: dockerfile=../image/Dockerfile, workdir=../image
- Add LICENSE (MIT) and assets/README.md (StartOS marketplace listing)
- s9pk validates: id=spark-control, version=0.1.0:0, osVersion=0.4.0-beta.6, sdkVersion=1.3.3
- Image embeds python:3.12-slim + openssh-client + FastAPI app + models.yaml
2026-05-12 09:52:53 -05:00

87 lines
2.3 KiB
YAML

# spark-control model catalog
#
# Edit this file (or override at runtime via the StartOS "Edit Model Catalog"
# action) to add or change available models.
#
# Each model entry produces this command on Spark 1:
#   cd ~/spark-vllm-docker
#   ./launch-cluster.sh [--solo] -d exec vllm serve <repo> \
#     --port=<defaults.port> --host=<defaults.host> <vllm_args...>
# Shared settings substituted into every generated `vllm serve` command
# as --port=<defaults.port> and --host=<defaults.host>.
defaults:
  port: 8888
  # Quoted so the address is always read as a string.
  host: "0.0.0.0"
# Catalog of launchable models, keyed by a short model id.
#
# Fields of each entry:
#   display_name            label string for the model
#   repo                    repository passed to `vllm serve` as <repo>
#   size_gb                 model size, in GB per the key name
#   mode                    `cluster` or `solo` (the two values used below)
#                           NOTE(review): presumably `solo` adds --solo to
#                           launch-cluster.sh -- confirm against the app
#   capabilities            list of capability tags; may be empty ([])
#   expected_ready_seconds  NOTE(review): presumably the expected startup
#                           time in seconds -- confirm against the app
#   vllm_args               extra arguments placed after --port/--host in
#                           the `vllm serve` command
models:
  qwen3-vl:
    display_name: "Qwen3-VL 235B (vision)"
    repo: RedHatAI/Qwen3-VL-235B-A22B-Instruct-NVFP4
    size_gb: 135
    mode: cluster
    capabilities: [vision, multilingual]
    expected_ready_seconds: 300
    vllm_args:
      - --gpu-memory-utilization=0.7
      - -tp=2
      - --distributed-executor-backend=ray
      - --max-model-len=32768

  gemma4:
    display_name: "Gemma 4 31B"
    repo: RedHatAI/gemma-4-31B-it-NVFP4
    size_gb: 23
    mode: solo
    capabilities: [vision, reasoning, tools]
    expected_ready_seconds: 240
    vllm_args:
      - --gpu-memory-utilization=0.8
      - --max-model-len=32768
      - --reasoning-parser=gemma4
      - --tool-call-parser=gemma4
      - --enable-auto-tool-choice
      - --load-format=fastsafetensors
      - --enable-prefix-caching
      - --kv-cache-dtype=fp8

  qwen36:
    display_name: "Qwen3.6 35B-A3B (daily driver)"
    repo: RedHatAI/Qwen3.6-35B-A3B-NVFP4
    size_gb: 20
    mode: solo
    capabilities: [reasoning]
    expected_ready_seconds: 300
    vllm_args:
      - --gpu-memory-utilization=0.85
      - --max-model-len=65536
      - --reasoning-parser=qwen3
      - --moe_backend=flashinfer_cutlass
      - --load-format=fastsafetensors
      - --enable-prefix-caching
      - --kv-cache-dtype=fp8

  qwen3-235b-fp8:
    display_name: "Qwen3 235B-A22B FP8 (legacy)"
    repo: Qwen/Qwen3-235B-A22B-FP8
    size_gb: 220
    mode: cluster
    capabilities: []
    expected_ready_seconds: 360
    vllm_args:
      - --gpu-memory-utilization=0.7
      - -tp=2
      - --distributed-executor-backend=ray
      - --max-model-len=32768

  qwen25-72b:
    display_name: "Qwen2.5 72B (legacy)"
    repo: Qwen/Qwen2.5-72B-Instruct
    size_gb: 145
    mode: cluster
    capabilities: []
    expected_ready_seconds: 360
    vllm_args:
      - --gpu-memory-utilization=0.7
      - -tp=2
      - --distributed-executor-backend=ray
      - --max-model-len=32768