v0.3.0:1 - hotfix: parallel SSH probes + longer timeout

- Hardware probes for spark1 and spark2 now run via asyncio.gather (parallel), so the worst-case wall time is max(per-probe), not the sum.
- Bump per-probe SSH timeout from 8s to 12s to absorb first-call overhead (StrictHostKeyChecking=accept-new on first connect + nvidia-smi cold start).
- Unreachable Spark now shows up cleanly in the UI as a single 'unreachable' card with the error message.
2026-05-12 12:14:36 -05:00
parent 64ce0fca10
commit e88fdcfde4
2 changed files with 7 additions and 6 deletions
@@ -96,10 +96,11 @@ class HardwareProbe:
        return self._locks[key]

    async def fetch(self) -> dict:
-        return {
-            "spark1": await self._one("spark1", self.settings.spark1_host, self.settings.spark1_user),
-            "spark2": await self._one("spark2", self.settings.spark2_host, self.settings.spark2_user),
-        }
+        s1, s2 = await asyncio.gather(
+            self._one("spark1", self.settings.spark1_host, self.settings.spark1_user),
+            self._one("spark2", self.settings.spark2_host, self.settings.spark2_user),
+        )
+        return {"spark1": s1, "spark2": s2}

    async def _one(self, key: str, host: str, user: str) -> dict:
        if not host or not user:
@@ -109,7 +110,7 @@ class HardwareProbe:
            cached = self._cache.get(key)
            if cached and (now - cached[0] < self.ttl_sec):
                return cached[1]
-            rc, out, err = await ssh_run(host, user, _PROBE, self.settings, timeout=8)
+            rc, out, err = await ssh_run(host, user, _PROBE, self.settings, timeout=12)
            if rc != 0:
                result = {"reachable": False, "configured": True, "host": host, "error": err.strip() or out.strip() or f"rc={rc}"}
            else: