Initial scaffold: image/ FastAPI app, models.yaml, docs

- image/ FastAPI app: /api/status, /api/swap, /api/swap/{id}/stream, /api/test-connection
- models.yaml: 5-model catalog (qwen3-vl, gemma4, qwen36, qwen3-235b-fp8, qwen25-72b)
- README, runbook, known-issues
- Dry-run swap verified against live Spark 1 (gemma4 currently loaded)
2026-05-12 09:29:13 -05:00
commit ae8efa1754
19 changed files with 1500 additions and 0 deletions
@@ -0,0 +1,80 @@
+# spark-control model catalog
+#
+# Edit this file (or override at runtime via the StartOS "Edit Model Catalog"
+# action) to add or change available models.
+#
+# Each model entry produces this command on Spark 1:
+#   cd ~/spark-vllm-docker
+#   ./launch-cluster.sh [--solo] -d exec vllm serve <repo> \
+#     --port=<defaults.port> --host=<defaults.host> <vllm_args...>
+
+defaults:
+  port: 8888  # fills --port=<defaults.port> in the command above
+  host: "0.0.0.0"  # fills --host=<defaults.host>; quoted so it is visibly a string
+
+models:
+  qwen3-vl:
+    display_name: "Qwen3-VL 235B (vision)"
+    repo: "RedHatAI/Qwen3-VL-235B-A22B-Instruct-NVFP4"  # <repo> given to `vllm serve`
+    size_gb: 135
+    mode: cluster  # presumably launched without --solo; confirm against the app
+    capabilities: [vision, multilingual]
+    expected_ready_seconds: 300
+    vllm_args:  # appended after --port/--host
+      - "--gpu-memory-utilization=0.7"
+      - "-tp=2"  # vLLM short form of --tensor-parallel-size
+      - "--distributed-executor-backend=ray"
+      - "--max-model-len=32768"
+
+  gemma4:
+    display_name: "Gemma 4 31B"
+    repo: "RedHatAI/gemma-4-31B-it-NVFP4"  # <repo> given to `vllm serve`
+    size_gb: 23
+    mode: solo  # presumably adds --solo to launch-cluster.sh; confirm against the app
+    capabilities: [vision, reasoning, tools]
+    expected_ready_seconds: 240
+    vllm_args:  # appended after --port/--host
+      - "--gpu-memory-utilization=0.8"
+      - "--max-model-len=32768"
+      - "--reasoning-parser=gemma4"
+      - "--tool-call-parser=gemma4"
+      - "--enable-auto-tool-choice"
+
+  qwen36:
+    display_name: "Qwen3.6 35B-A3B (daily driver)"
+    repo: "RedHatAI/Qwen3.6-35B-A3B-NVFP4"  # <repo> given to `vllm serve`
+    size_gb: 20
+    mode: solo  # presumably adds --solo to launch-cluster.sh; confirm against the app
+    capabilities: [reasoning]
+    expected_ready_seconds: 300
+    vllm_args:  # appended after --port/--host
+      - "--gpu-memory-utilization=0.85"
+      - "--max-model-len=65536"
+      - "--reasoning-parser=qwen3"
+      - "--moe_backend=flashinfer_cutlass"  # NOTE(review): only underscore-spelled flag here; confirm
+
+  qwen3-235b-fp8:
+    display_name: "Qwen3 235B-A22B FP8 (legacy)"
+    repo: "Qwen/Qwen3-235B-A22B-FP8"  # <repo> given to `vllm serve`
+    size_gb: 220  # NOTE(review): if weight size, -tp=2 at 0.7 implies ~157 GB/node; confirm fit
+    mode: cluster  # presumably launched without --solo; confirm against the app
+    capabilities: []
+    expected_ready_seconds: 360
+    vllm_args:  # appended after --port/--host
+      - "--gpu-memory-utilization=0.7"
+      - "-tp=2"  # vLLM short form of --tensor-parallel-size
+      - "--distributed-executor-backend=ray"
+      - "--max-model-len=32768"
+
+  qwen25-72b:
+    display_name: "Qwen2.5 72B (legacy)"
+    repo: "Qwen/Qwen2.5-72B-Instruct"  # <repo> given to `vllm serve`
+    size_gb: 145
+    mode: cluster  # presumably launched without --solo; confirm against the app
+    capabilities: []
+    expected_ready_seconds: 360
+    vllm_args:  # appended after --port/--host
+      - "--gpu-memory-utilization=0.7"
+      - "-tp=2"  # vLLM short form of --tensor-parallel-size
+      - "--distributed-executor-backend=ray"
+      - "--max-model-len=32768"