From 8ac455f5f54ba91ff339a7715d05304583de985f Mon Sep 17 00:00:00 2001
From: Grant <grant@ten31.xyz>
Date: Tue, 12 May 2026 14:47:32 -0500
Subject: [PATCH] v0.8.0:3 - add --max-num-batched-tokens=16384 to vision
 models (gemma4, qwen3-vl)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

After the recent eugr/spark-vllm-docker update, vLLM became stricter about multimodal token budgets:

  ValueError: Chunked MM input disabled but max_tokens_per_mm_item (2496) is
  larger than max_num_batched_tokens (2048). Please increase max_num_batched_tokens.

Each image input can produce up to 2496 tokens, which exceeds vLLM's default --max-num-batched-tokens of 2048. Same class of bug as the Qwen3.6 Mamba block-size assertion we fixed in 0.6.0:1, surfacing on different models.

Fix: bake --max-num-batched-tokens=16384 into every multimodal model entry. Now applied to:
  - qwen36 (already had it for the Mamba constraint; works for multimodal too since Qwen3.6 has vision)
  - gemma4 (crashed today on engine init)
  - qwen3-vl (would crash with the same error if anyone tried it)

The pre-flight Test button validates argparse, but the 2048 < 2496 check happens at runtime engine init, so Test does not catch it — only an actual model load does. v0.7's Test catches *syntax* errors in the arguments but not *semantic* ones; runtime errors like this still surface only on a real swap. This is a known limitation documented in the v0.7 release notes.
---
 image/models.yaml                  | 2 ++
 known-issues.md                    | 4 ++++
 package/startos/versions/v0_1_0.ts | 2 +-
 3 files changed, 7 insertions(+), 1 deletion(-)
diff --git a/image/models.yaml b/image/models.yaml
index 1e07480..a27d85f 100644
--- a/image/models.yaml
+++ b/image/models.yaml
@@ -30,6 +30,7 @@ models:
       - -tp=2
       - --distributed-executor-backend=ray
       - --max-model-len=32768
+      - --max-num-batched-tokens=16384
 
   gemma4:
     display_name: "Gemma 4 31B"
@@ -45,6 +46,7 @@ models:
     vllm_args:
       - --gpu-memory-utilization=0.8
       - --max-model-len=32768
+      - --max-num-batched-tokens=16384
       - --reasoning-parser=gemma4
       - --tool-call-parser=gemma4
       - --enable-auto-tool-choice
diff --git a/known-issues.md b/known-issues.md
index 0f57065..8f5f7f2 100644
--- a/known-issues.md
+++ b/known-issues.md
@@ -24,6 +24,10 @@ This flag is Blackwell-specific. If vLLM in the container reports `unrecognized
 
 Qwen3.6 uses a Mamba-attention hybrid that requires `--max-num-batched-tokens >= 2096`. vLLM's default is 2048, which trips `AssertionError: In Mamba cache align mode, block_size (2096) must be <= max_num_batched_tokens (2048)`. Fix: bake `--max-num-batched-tokens=16384` into the bundled qwen36 entry — matches the upstream qwen3.5-35b-a3b-fp8 recipe.
 
+## Multimodal token budget for vision models (fixed in v0.8.0:3)
+
+After the eugr/spark-vllm-docker update, vLLM became stricter about multimodal token budgets. Vision-capable models like Gemma 4 31B and Qwen3-VL crash at engine init with `ValueError: Chunked MM input disabled but max_tokens_per_mm_item (2496) is larger than max_num_batched_tokens (2048)`. Fix: bake `--max-num-batched-tokens=16384` into every model that has the `vision` capability. Now applied to qwen3-vl, gemma4, and qwen36 (which was already set for the Mamba issue).
+
 ## Two SSH paths to Spark 1 from the laptop
 
 `ssh <spark-user>@<spark-1-ip>` does NOT work from the laptop because the NVIDIA Sync ssh_config only has a Host entry for `<spark-1-host>.local`. Always use the `.local` hostname or `<spark-2-ip>`-style entries that ARE matched.
diff --git a/package/startos/versions/v0_1_0.ts b/package/startos/versions/v0_1_0.ts
index a8015fa..1470123 100644
--- a/package/startos/versions/v0_1_0.ts
+++ b/package/startos/versions/v0_1_0.ts
@@ -1,7 +1,7 @@
 import { VersionInfo, IMPOSSIBLE } from '@start9labs/start-sdk'
 
 export const v0_1_0 = VersionInfo.of({
-  version: '0.8.0:2',
+  version: '0.8.0:3',
   releaseNotes: {
     en_US:
       'v0.8: deep health probes. Every 5 minutes, Spark Control sends a tiny synthetic inference request to each service (1 second of silent audio to Parakeet, short text to Magpie, 1-token completion to vLLM). All payloads are generated in-memory and never written to disk. If a probe returns CUDA-error / 5xx signals while the container is still "up" — i.e. the classic Triton-wedge pattern where /health stays green but real inference fails — Spark Control automatically restarts the affected container. Rate-limited to 3 auto-restarts per service per 30 minutes. Each service card now shows the last deep-check timestamp, latency, and an inline "Run now" button. Failures and recoveries are logged into the connectivity history with source=deep-health.',