342e150266
Aligned with sibling recipes in eugr/spark-vllm-docker. Applies on next swap to each model. First real swap gemma4 -> qwen36 succeeded in 5:30 with --moe_backend=flashinfer_cutlass.
87 lines
2.3 KiB
YAML
87 lines
2.3 KiB
YAML
# spark-control model catalog
#
# Edit this file (or override at runtime via the StartOS "Edit Model Catalog"
# action) to add or change available models.
#
# Each model entry produces this command on Spark 1:
#   cd ~/spark-vllm-docker
#   ./launch-cluster.sh [--solo] -d exec vllm serve <repo> \
#     --port=<defaults.port> --host=<defaults.host> <vllm_args...>

# Settings shared by every model entry. They are substituted into the
# generated `vllm serve` command as --port=<defaults.port> and
# --host=<defaults.host>.
defaults:
  port: 8888
  # Quoted so the address is always read as a string, never as a number.
  host: "0.0.0.0"

# Catalog of launchable models, keyed by a short model id.
#
# For each entry, `repo` is the argument given to `vllm serve`; it is
# followed by the shared --port/--host defaults and then by every item
# of `vllm_args`, in the order listed.
#
# NOTE(review): `mode` (solo | cluster) presumably controls whether
# `--solo` is passed to launch-cluster.sh — confirm against spark-control.
# NOTE(review): `size_gb`, `capabilities` and `expected_ready_seconds`
# are not used by the launch command; presumably they are read by the
# spark-control UI / readiness logic — confirm before relying on them.
models:
  # Cluster entry: tensor-parallel size 2 (-tp=2) on the Ray executor.
  qwen3-vl:
    display_name: "Qwen3-VL 235B (vision)"
    repo: RedHatAI/Qwen3-VL-235B-A22B-Instruct-NVFP4
    size_gb: 135
    mode: cluster
    capabilities: [vision, multilingual]
    expected_ready_seconds: 300
    vllm_args:
      - --gpu-memory-utilization=0.7
      - -tp=2
      - --distributed-executor-backend=ray
      - --max-model-len=32768

  # Solo entry with reasoning and tool-call parsers enabled.
  gemma4:
    display_name: "Gemma 4 31B"
    repo: RedHatAI/gemma-4-31B-it-NVFP4
    size_gb: 23
    mode: solo
    capabilities: [vision, reasoning, tools]
    expected_ready_seconds: 240
    vllm_args:
      - --gpu-memory-utilization=0.8
      - --max-model-len=32768
      - --reasoning-parser=gemma4
      - --tool-call-parser=gemma4
      - --enable-auto-tool-choice
      - --load-format=fastsafetensors
      - --enable-prefix-caching
      - --kv-cache-dtype=fp8

  # Solo entry; the only one with a 65536-token context and an explicit
  # MoE backend.
  qwen36:
    display_name: "Qwen3.6 35B-A3B (daily driver)"
    repo: RedHatAI/Qwen3.6-35B-A3B-NVFP4
    size_gb: 20
    mode: solo
    capabilities: [reasoning]
    # NOTE(review): a gemma4 -> qwen36 swap was reported to take 5:30
    # (330 s); confirm whether this estimate should cover a full swap.
    expected_ready_seconds: 300
    vllm_args:
      - --gpu-memory-utilization=0.85
      - --max-model-len=65536
      - --reasoning-parser=qwen3
      # Underscore spelling kept exactly as deployed.
      # NOTE(review): every other flag is hyphenated — confirm vLLM
      # accepts both spellings before normalizing this one.
      - --moe_backend=flashinfer_cutlass
      - --load-format=fastsafetensors
      - --enable-prefix-caching
      - --kv-cache-dtype=fp8

  # Legacy cluster entry; same vllm_args as qwen3-vl.
  qwen3-235b-fp8:
    display_name: "Qwen3 235B-A22B FP8 (legacy)"
    repo: Qwen/Qwen3-235B-A22B-FP8
    size_gb: 220
    mode: cluster
    capabilities: []
    expected_ready_seconds: 360
    vllm_args:
      - --gpu-memory-utilization=0.7
      - -tp=2
      - --distributed-executor-backend=ray
      - --max-model-len=32768

  # Legacy cluster entry; same vllm_args as qwen3-vl.
  qwen25-72b:
    display_name: "Qwen2.5 72B (legacy)"
    repo: Qwen/Qwen2.5-72B-Instruct
    size_gb: 145
    mode: cluster
    capabilities: []
    expected_ready_seconds: 360
    vllm_args:
      - --gpu-memory-utilization=0.7
      - -tp=2
      - --distributed-executor-backend=ray
      - --max-model-len=32768