diff --git a/image/models.yaml b/image/models.yaml
index 1e07480..a27d85f 100644
--- a/image/models.yaml
+++ b/image/models.yaml
@@ -30,6 +30,7 @@ models:
       - -tp=2
       - --distributed-executor-backend=ray
       - --max-model-len=32768
+      - --max-num-batched-tokens=16384
 
   gemma4:
     display_name: "Gemma 4 31B"
@@ -45,6 +46,7 @@ models:
     vllm_args:
       - --gpu-memory-utilization=0.8
       - --max-model-len=32768
+      - --max-num-batched-tokens=16384
       - --reasoning-parser=gemma4
       - --tool-call-parser=gemma4
       - --enable-auto-tool-choice
diff --git a/known-issues.md b/known-issues.md
index 0f57065..8f5f7f2 100644
--- a/known-issues.md
+++ b/known-issues.md
@@ -24,6 +24,10 @@ This flag is Blackwell-specific. If vLLM in the container reports `unrecognized
 
 Qwen3.6 uses a Mamba-attention hybrid that requires `--max-num-batched-tokens >= 2096`. vLLM's default is 2048, which trips `AssertionError: In Mamba cache align mode, block_size (2096) must be <= max_num_batched_tokens (2048)`. Fix: bake `--max-num-batched-tokens=16384` into the bundled qwen36 entry — matches the upstream qwen3.5-35b-a3b-fp8 recipe.
 
+## Multimodal token budget for vision models (fixed in v0.8.0:3)
+
+After the eugr/spark-vllm-docker update, vLLM became stricter about multimodal token budgets. Vision-capable models such as Gemma 4 31B and Qwen3-VL crash at engine init with `ValueError: Chunked MM input disabled but max_tokens_per_mm_item (2496) is larger than max_num_batched_tokens (2048)`, because vLLM's default `max_num_batched_tokens` of 2048 is smaller than a single multimodal item. Fix: bake `--max-num-batched-tokens=16384` into every model that has the `vision` capability. The flag is now set on qwen3-vl, gemma4, and qwen36 (qwen36 already had it for the Mamba issue).
+
 ## Two SSH paths to Spark 1 from the laptop
 
 `ssh <spark-user>@<spark-1-ip>` does NOT work from the laptop because the NVIDIA Sync ssh_config only has a Host entry for `<spark-1-host>.local`. Always use the `.local` hostname or `<spark-2-ip>`-style entries that ARE matched.
diff --git a/package/startos/versions/v0_1_0.ts b/package/startos/versions/v0_1_0.ts
index a8015fa..1470123 100644
--- a/package/startos/versions/v0_1_0.ts
+++ b/package/startos/versions/v0_1_0.ts
@@ -1,7 +1,7 @@
 import { VersionInfo, IMPOSSIBLE } from '@start9labs/start-sdk'
 
 export const v0_1_0 = VersionInfo.of({
-  version: '0.8.0:2',
+  version: '0.8.0:3',
   releaseNotes: {
     en_US:
       'v0.8: deep health probes. Every 5 minutes, Spark Control sends a tiny synthetic inference request to each service (1 second of silent audio to Parakeet, short text to Magpie, 1-token completion to vLLM). All payloads are generated in-memory and never written to disk. If a probe returns CUDA-error / 5xx signals while the container is still "up" — i.e. the classic Triton-wedge pattern where /health stays green but real inference fails — Spark Control automatically restarts the affected container. Rate-limited to 3 auto-restarts per service per 30 minutes. Each service card now shows the last deep-check timestamp, latency, and an inline "Run now" button. Failures and recoveries are logged into the connectivity history with source=deep-health.',