v0.3.0:1 - hotfix: parallel SSH probes + longer timeout
- Hardware probes for spark1 and spark2 now run in parallel via asyncio.gather, so the worst-case wall time is the maximum of the per-probe times rather than their sum.
- Bump the per-probe SSH timeout from 8s to 12s to absorb first-call overhead (StrictHostKeyChecking=accept-new on first connect + nvidia-smi cold start).
- An unreachable Spark now shows up cleanly in the UI as a single 'unreachable' card with the error message.
This commit is contained in:
@@ -96,10 +96,11 @@ class HardwareProbe:
|
|||||||
return self._locks[key]
|
return self._locks[key]
|
||||||
|
|
||||||
async def fetch(self) -> dict:
|
async def fetch(self) -> dict:
|
||||||
return {
|
s1, s2 = await asyncio.gather(
|
||||||
"spark1": await self._one("spark1", self.settings.spark1_host, self.settings.spark1_user),
|
self._one("spark1", self.settings.spark1_host, self.settings.spark1_user),
|
||||||
"spark2": await self._one("spark2", self.settings.spark2_host, self.settings.spark2_user),
|
self._one("spark2", self.settings.spark2_host, self.settings.spark2_user),
|
||||||
}
|
)
|
||||||
|
return {"spark1": s1, "spark2": s2}
|
||||||
|
|
||||||
async def _one(self, key: str, host: str, user: str) -> dict:
|
async def _one(self, key: str, host: str, user: str) -> dict:
|
||||||
if not host or not user:
|
if not host or not user:
|
||||||
@@ -109,7 +110,7 @@ class HardwareProbe:
|
|||||||
cached = self._cache.get(key)
|
cached = self._cache.get(key)
|
||||||
if cached and (now - cached[0] < self.ttl_sec):
|
if cached and (now - cached[0] < self.ttl_sec):
|
||||||
return cached[1]
|
return cached[1]
|
||||||
rc, out, err = await ssh_run(host, user, _PROBE, self.settings, timeout=8)
|
rc, out, err = await ssh_run(host, user, _PROBE, self.settings, timeout=12)
|
||||||
if rc != 0:
|
if rc != 0:
|
||||||
result = {"reachable": False, "configured": True, "host": host, "error": err.strip() or out.strip() or f"rc={rc}"}
|
result = {"reachable": False, "configured": True, "host": host, "error": err.strip() or out.strip() or f"rc={rc}"}
|
||||||
else:
|
else:
|
||||||
|
|||||||
@@ -1,7 +1,7 @@
|
|||||||
import { VersionInfo, IMPOSSIBLE } from '@start9labs/start-sdk'
|
import { VersionInfo, IMPOSSIBLE } from '@start9labs/start-sdk'
|
||||||
|
|
||||||
export const v0_1_0 = VersionInfo.of({
|
export const v0_1_0 = VersionInfo.of({
|
||||||
version: '0.3.0:0',
|
version: '0.3.0:1',
|
||||||
releaseNotes: {
|
releaseNotes: {
|
||||||
en_US:
|
en_US:
|
||||||
'v0.3: Spark hardware dashboard (RAM, disk, GPU memory + utilization, CPU load, uptime per Spark). Per-model Advanced settings now show plain-English hints tied to your actual GPU memory (e.g. "0.85 GPU util leaves ~18 GB free"). "Explain context" button on the update banner asks the loaded LLM to summarize pending commits in plain English. Optional Open WebUI URL in Configure Sparks shows a one-click "Open chat" button in the top bar. Downloads can now target Spark 1, Spark 2, or both. Each model card links out to its Hugging Face page.',
|
'v0.3: Spark hardware dashboard (RAM, disk, GPU memory + utilization, CPU load, uptime per Spark). Per-model Advanced settings now show plain-English hints tied to your actual GPU memory (e.g. "0.85 GPU util leaves ~18 GB free"). "Explain context" button on the update banner asks the loaded LLM to summarize pending commits in plain English. Optional Open WebUI URL in Configure Sparks shows a one-click "Open chat" button in the top bar. Downloads can now target Spark 1, Spark 2, or both. Each model card links out to its Hugging Face page.',
|
||||||
|
|||||||
Reference in New Issue
Block a user