diff --git a/image/app/deep_health.py b/image/app/deep_health.py
index 6695e04..243c362 100644
--- a/image/app/deep_health.py
+++ b/image/app/deep_health.py
@@ -173,16 +173,38 @@ class DeepHealth:
         if not s.spark1_host:
             return ProbeResult(ok=False, at=now_iso, error="not configured")
         base = f"http://{s.spark1_host}:{s.vllm_port}"
+        # Step 1: is there a model loaded?
         try:
             async with httpx.AsyncClient(timeout=5.0) as c:
                 r = await c.get(f"{base}/v1/models")
-                r.raise_for_status()
+            if 200 <= r.status_code < 300:
                 models = r.json().get("data") or []
-            if not models:
-                return ProbeResult(ok=False, at=now_iso, error="no model loaded")
-            model_id = models[0]["id"]
-        except Exception as e:
-            return ProbeResult(ok=False, at=now_iso, error=f"list models: {type(e).__name__}: {e}")
+            else:
+                # Non-2xx on /v1/models (typically 5xx) suggests something wedged after a model loaded
+                return ProbeResult(
+                    ok=False,
+                    at=now_iso,
+                    error=f"list_models HTTP {r.status_code}: {r.text[:240]}",
+                )
+        except Exception:
+            # Connection refused / timeout: usually means no vLLM process listening
+            # (the vllm_node container is alive but no `vllm serve` is running yet) — an idle
+            # state, not a wedge, so no auto-restart. NOTE(review): a bad 2xx JSON body lands here too.
+            return ProbeResult(
+                ok=True,
+                at=now_iso,
+                note="no model currently loaded (idle)",
+            )
+
+        if not models:
+            return ProbeResult(
+                ok=True,
+                at=now_iso,
+                note="no model currently loaded (idle)",
+            )
+
+        model_id = models[0]["id"]
+        # Step 2: model is loaded; verify it can actually complete a 1-token request.
         t0 = time.monotonic()
         try:
             async with httpx.AsyncClient(timeout=PROBE_TIMEOUT_SEC) as c:
@@ -197,7 +219,7 @@ class DeepHealth:
                 )
             latency = round((time.monotonic() - t0) * 1000)
             if 200 <= r.status_code < 300:
-                return ProbeResult(ok=True, at=now_iso, latency_ms=latency)
+                return ProbeResult(ok=True, at=now_iso, latency_ms=latency, note=f"model={model_id}")
             return ProbeResult(
                 ok=False,
                 at=now_iso,
diff --git a/package/startos/versions/v0_1_0.ts b/package/startos/versions/v0_1_0.ts
index 1470123..340ea79 100644
--- a/package/startos/versions/v0_1_0.ts
+++ b/package/startos/versions/v0_1_0.ts
@@ -1,7 +1,7 @@
 import { VersionInfo, IMPOSSIBLE } from '@start9labs/start-sdk'
 
 export const v0_1_0 = VersionInfo.of({
-  version: '0.8.0:3',
+  version: '0.8.0:4',
   releaseNotes: {
     en_US:
       'v0.8: deep health probes. Every 5 minutes, Spark Control sends a tiny synthetic inference request to each service (1 second of silent audio to Parakeet, short text to Magpie, 1-token completion to vLLM). All payloads are generated in-memory and never written to disk. If a probe returns CUDA-error / 5xx signals while the container is still "up" — i.e. the classic Triton-wedge pattern where /health stays green but real inference fails — Spark Control automatically restarts the affected container. Rate-limited to 3 auto-restarts per service per 30 minutes. Each service card now shows the last deep-check timestamp, latency, and an inline "Run now" button. Failures and recoveries are logged into the connectivity history with source=deep-health.',