From 8ac455f5f54ba91ff339a7715d05304583de985f Mon Sep 17 00:00:00 2001 From: Grant Date: Tue, 12 May 2026 14:47:32 -0500 Subject: [PATCH] v0.8.0:3 - add --max-num-batched-tokens=16384 to vision models (gemma4, qwen3-vl) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After the recent eugr/spark-vllm-docker update, vLLM became stricter about multimodal token budgets: ValueError: Chunked MM input disabled but max_tokens_per_mm_item (2496) is larger than max_num_batched_tokens (2048). Please increase max_num_batched_tokens. Each image input produces 2496 tokens, but vLLM's default --max-num-batched-tokens of 2048 is below that. Same class of bug as the Qwen3.6 Mamba block-size assertion we fixed in 0.6.0:1, surfacing on different models. Fix: bake --max-num-batched-tokens=16384 into every multimodal model entry. Now applied to: - qwen36 (already had it for the Mamba constraint; works for multimodal too since Qwen3.6 has vision) - gemma4 (crashed today on engine init) - qwen3-vl (would crash with the same error if anyone tried it) The pre-flight Test button validates argparse but the 2048<2496 check happens at runtime engine init, so it's not caught by Test — only by actually trying to load. This is exactly the kind of bug where v0.7's Test catches the *syntax* but not the *semantics*; runtime errors like this still surface only on real swap. Known limitation documented in v0.7 release notes. 
--- image/models.yaml | 2 ++ known-issues.md | 4 ++++ package/startos/versions/v0_1_0.ts | 2 +- 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/image/models.yaml b/image/models.yaml index 1e07480..a27d85f 100644 --- a/image/models.yaml +++ b/image/models.yaml @@ -30,6 +30,7 @@ models: - -tp=2 - --distributed-executor-backend=ray - --max-model-len=32768 + - --max-num-batched-tokens=16384 gemma4: display_name: "Gemma 4 31B" @@ -45,6 +46,7 @@ models: vllm_args: - --gpu-memory-utilization=0.8 - --max-model-len=32768 + - --max-num-batched-tokens=16384 - --reasoning-parser=gemma4 - --tool-call-parser=gemma4 - --enable-auto-tool-choice diff --git a/known-issues.md b/known-issues.md index 0f57065..8f5f7f2 100644 --- a/known-issues.md +++ b/known-issues.md @@ -24,6 +24,10 @@ This flag is Blackwell-specific. If vLLM in the container reports `unrecognized Qwen3.6 uses a Mamba-attention hybrid that requires `--max-num-batched-tokens >= 2096`. vLLM's default is 2048, which trips `AssertionError: In Mamba cache align mode, block_size (2096) must be <= max_num_batched_tokens (2048)`. Fix: bake `--max-num-batched-tokens=16384` into the bundled qwen36 entry — matches the upstream qwen3.5-35b-a3b-fp8 recipe. +## Multimodal token budget for vision models (fixed in v0.8.0:3) + +After the eugr/spark-vllm-docker update, vLLM became stricter about multimodal token budgets. Vision-capable models like Gemma 4 31B and Qwen3-VL crash at engine init with `ValueError: Chunked MM input disabled but max_tokens_per_mm_item (2496) is larger than max_num_batched_tokens (2048)`. Fix: bake `--max-num-batched-tokens=16384` into every model that has the `vision` capability. Now applied to qwen3-vl, gemma4, and qwen36 (which was already set for the Mamba issue). + ## Two SSH paths to Spark 1 from the laptop `ssh @` does NOT work from the laptop because the NVIDIA Sync ssh_config only has a Host entry for `.local`. Always use the `.local` hostname or ``-style entries that ARE matched. 
diff --git a/package/startos/versions/v0_1_0.ts b/package/startos/versions/v0_1_0.ts index a8015fa..1470123 100644 --- a/package/startos/versions/v0_1_0.ts +++ b/package/startos/versions/v0_1_0.ts @@ -1,7 +1,7 @@ import { VersionInfo, IMPOSSIBLE } from '@start9labs/start-sdk' export const v0_1_0 = VersionInfo.of({ - version: '0.8.0:2', + version: '0.8.0:3', releaseNotes: { en_US: 'v0.8: deep health probes. Every 5 minutes, Spark Control sends a tiny synthetic inference request to each service (1 second of silent audio to Parakeet, short text to Magpie, 1-token completion to vLLM). All payloads are generated in-memory and never written to disk. If a probe returns CUDA-error / 5xx signals while the container is still "up" — i.e. the classic Triton-wedge pattern where /health stays green but real inference fails — Spark Control automatically restarts the affected container. Rate-limited to 3 auto-restarts per service per 30 minutes. Each service card now shows the last deep-check timestamp, latency, and an inline "Run now" button. Failures and recoveries are logged into the connectivity history with source=deep-health.',