diff --git a/image/models.yaml b/image/models.yaml index 1e07480..a27d85f 100644 --- a/image/models.yaml +++ b/image/models.yaml @@ -30,6 +30,7 @@ models: - -tp=2 - --distributed-executor-backend=ray - --max-model-len=32768 + - --max-num-batched-tokens=16384 gemma4: display_name: "Gemma 4 31B" @@ -45,6 +46,7 @@ models: vllm_args: - --gpu-memory-utilization=0.8 - --max-model-len=32768 + - --max-num-batched-tokens=16384 - --reasoning-parser=gemma4 - --tool-call-parser=gemma4 - --enable-auto-tool-choice diff --git a/known-issues.md b/known-issues.md index 0f57065..8f5f7f2 100644 --- a/known-issues.md +++ b/known-issues.md @@ -24,6 +24,10 @@ This flag is Blackwell-specific. If vLLM in the container reports `unrecognized Qwen3.6 uses a Mamba-attention hybrid that requires `--max-num-batched-tokens >= 2096`. vLLM's default is 2048, which trips `AssertionError: In Mamba cache align mode, block_size (2096) must be <= max_num_batched_tokens (2048)`. Fix: bake `--max-num-batched-tokens=16384` into the bundled qwen36 entry — matches the upstream qwen3.5-35b-a3b-fp8 recipe. +## Multimodal token budget for vision models (fixed in v0.8.0:3) + +After the eugr/spark-vllm-docker update, vLLM became stricter about multimodal token budgets. Vision-capable models like Gemma 4 31B and Qwen3-VL crash at engine init with `ValueError: Chunked MM input disabled but max_tokens_per_mm_item (2496) is larger than max_num_batched_tokens (2048)`. Fix: bake `--max-num-batched-tokens=16384` into every model that has the `vision` capability. The flag is now applied to qwen3-vl, gemma4, and qwen36 (which was already set for the Mamba issue). + ## Two SSH paths to Spark 1 from the laptop `ssh @` does NOT work from the laptop because the NVIDIA Sync ssh_config only has a Host entry for `.local`. Always use the `.local` hostname or ``-style entries that ARE matched. 
diff --git a/package/startos/versions/v0_1_0.ts b/package/startos/versions/v0_1_0.ts index a8015fa..1470123 100644 --- a/package/startos/versions/v0_1_0.ts +++ b/package/startos/versions/v0_1_0.ts @@ -1,7 +1,7 @@ import { VersionInfo, IMPOSSIBLE } from '@start9labs/start-sdk' export const v0_1_0 = VersionInfo.of({ - version: '0.8.0:2', + version: '0.8.0:3', releaseNotes: { en_US: 'v0.8: deep health probes. Every 5 minutes, Spark Control sends a tiny synthetic inference request to each service (1 second of silent audio to Parakeet, short text to Magpie, 1-token completion to vLLM). All payloads are generated in-memory and never written to disk. If a probe returns CUDA-error / 5xx signals while the container is still "up" — i.e. the classic Triton-wedge pattern where /health stays green but real inference fails — Spark Control automatically restarts the affected container. Rate-limited to 3 auto-restarts per service per 30 minutes. Each service card now shows the last deep-check timestamp, latency, and an inline "Run now" button. Failures and recoveries are logged into the connectivity history with source=deep-health.',