474417b458
Backend:
- updates.py: get_update_status() runs git fetch + git rev-list --left-right --count HEAD...origin/main to learn ahead/behind/dirty, plus git log for pending commits
- UpdateManager class with asyncio.Lock; one update at a time
- POST /api/updates/apply triggers "git pull --ff-only && ./build-and-copy.sh -c" over SSH with streamed log + phase detection (Pulling / Building the vLLM container / Copying to peer Sparks)
- GET /api/updates returns {ok, behind, ahead, dirty, current, log[], branch}
Frontend:
- Persistent banner near footer: hidden when up-to-date, blue when N commits behind, warn (orange) when local dirty changes block update
- 'Show details' expands a list of pending commits
- 'Apply update' triggers the long-running build with phase + elapsed timer + collapsible logs
- Confirmation dialog explains the 5–40 min duration
Package: bump 0.2.2:0
161 lines
5.6 KiB
Python
161 lines
5.6 KiB
Python
"""Check for and apply updates to the upstream `eugr/spark-vllm-docker` checkout
|
|
on Spark 1. We don't auto-update — only display what's available and let the user
|
|
explicitly apply when they're ready.
|
|
"""
|
|
from __future__ import annotations
|
|
import asyncio
|
|
import uuid
|
|
from dataclasses import dataclass, field
|
|
from datetime import datetime, timezone
|
|
from typing import Optional
|
|
|
|
from .config import Settings
|
|
from .ssh import ssh_run, ssh_stream, StreamHandle
|
|
|
|
|
|
async def get_update_status(settings: Settings) -> dict:
    """Report how the Spark 1 checkout compares with its upstream branch.

    Runs one shell script over SSH that refreshes remote refs with
    `git fetch`, prints `KEY=value` summary lines, then a `---LOG---` marker
    followed by up to 30 pending commits from `git log`.

    No mutation other than refreshing remote refs (fetch). Safe to call frequently.

    Returns:
        On success, ``{"ok": True, "log": [...]}`` plus whichever of
        ``branch``, ``behind``, ``ahead``, ``dirty`` and ``current`` the
        script printed. On failure, ``{"ok": False, "error": "<message>"}``.
    """
    if not settings.spark1_host or not settings.spark1_user:
        return {"ok": False, "error": "spark1 not configured"}
    script = (
        # `|| exit 1` rather than `&&`: in an `a && b && c || true; ...` chain
        # the `|| true` also swallows a failed `cd`/`rev-parse`, after which the
        # remaining commands run outside the checkout and the script still
        # exits 0, reporting a missing checkout as "up to date".
        "cd ~/spark-vllm-docker || exit 1; "
        "branch=$(git rev-parse --abbrev-ref HEAD) || exit 1; "
        # A failed fetch (e.g. offline) is tolerated: compare against stale refs.
        "git fetch --quiet 2>&1 || true; "
        "ahead_behind=$(git rev-list --left-right --count HEAD...origin/$branch 2>/dev/null || echo '0\t0'); "
        "behind=$(echo \"$ahead_behind\" | awk '{print $2}'); "
        "ahead=$(echo \"$ahead_behind\" | awk '{print $1}'); "
        "current_hash=$(git rev-parse --short HEAD); "
        "current_msg=$(git log -1 --pretty=format:%s); "
        "dirty=$(git status --porcelain | wc -l); "
        "echo BRANCH=\"$branch\"; "
        "echo BEHIND=\"$behind\"; "
        "echo AHEAD=\"$ahead\"; "
        "echo DIRTY=\"$dirty\"; "
        "echo CURRENT=\"$current_hash $current_msg\"; "
        "echo ---LOG---; "
        "git log HEAD..origin/$branch --pretty=format:'%h %s (%ar)' 2>/dev/null | head -30"
    )
    rc, out, err = await ssh_run(
        settings.spark1_host,
        settings.spark1_user,
        script,
        settings,
        timeout=30,
    )
    if rc != 0:
        return {"ok": False, "error": err.strip() or out.strip() or f"rc={rc}"}
    return _parse_update_status(out)


def _parse_update_status(out: str) -> dict:
    """Convert the status script's stdout into the dict handed back to callers."""
    counters = ("behind", "ahead", "dirty")
    texts = ("branch", "current")

    info: dict = {"ok": True, "log": []}
    in_log = False
    for line in out.splitlines():
        if line == "---LOG---":
            in_log = True
            continue
        if in_log:
            # Everything after the marker is one pending commit per line.
            if line.strip():
                info["log"].append(line)
            continue
        name, sep, value = line.partition("=")
        if not sep:
            continue
        key = name.lower().strip()
        val = value.strip()
        if key in counters:
            info[key] = int(val) if val.isdigit() else 0
        elif key in texts:
            info[key] = val
        # Any other `x=y` line is ignored: `git fetch` stderr is merged into
        # stdout, so stray output must not be able to add keys or overwrite
        # "ok" / "log".
    return info
|
|
|
|
|
|
@dataclass
class UpdateJob:
    """Record of one update run: identity, progress state and a rolling log."""

    id: str
    mode: str  # 'cluster' | 'solo'
    started_at: str
    state: str = "starting"
    lines: list[str] = field(default_factory=list)
    returncode: Optional[int] = None
    finished_at: Optional[str] = None
    phase: str = "Starting…"

    def append(self, line: str) -> None:
        """Add one log line, dropping the oldest so at most 1000 are kept."""
        self.lines.append(line)
        excess = len(self.lines) - 1000
        if excess > 0:
            # Trim in place so anything holding a reference to the list sees it.
            del self.lines[:excess]
|
|
|
|
|
|
class UpdateManager:
    """Starts update jobs in the background, one at a time, and keeps their records."""

    def __init__(self, settings: Settings) -> None:
        self.settings = settings
        self.lock = asyncio.Lock()
        self.jobs: dict[str, UpdateJob] = {}
        self.current_job_id: Optional[str] = None
        # The event loop only holds weak references to tasks, so a
        # fire-and-forget task can be garbage-collected mid-run. Keep a strong
        # reference until each one completes.
        self._tasks: set[asyncio.Task] = set()

    def get(self, job_id: str) -> UpdateJob | None:
        """Return the job with this id, or None if it is unknown."""
        return self.jobs.get(job_id)

    async def trigger(self, mode: str = "cluster") -> UpdateJob:
        """Create a job and start it in the background; returns immediately.

        Raises:
            ValueError: if `mode` is not 'cluster' or 'solo'.
            RuntimeError: if another update is pending or running.
        """
        if mode not in ("cluster", "solo"):
            raise ValueError("mode must be 'cluster' or 'solo'")
        # The lock is only acquired once the background task gets its first
        # turn on the loop, so `lock.locked()` alone lets a second trigger
        # slip in before that. `current_job_id` is set synchronously below
        # and covers that window.
        if self.current_job_id is not None or self.lock.locked():
            raise RuntimeError("An update is already in progress")
        job = UpdateJob(
            id=uuid.uuid4().hex[:8],
            mode=mode,
            started_at=datetime.now(timezone.utc).isoformat(),
        )
        self.jobs[job.id] = job
        self.current_job_id = job.id
        task = asyncio.create_task(self._run(job))
        self._tasks.add(task)
        task.add_done_callback(self._tasks.discard)
        return job

    async def _run(self, job: UpdateJob) -> None:
        """Run the job under the lock and record its final state on the job."""
        async with self.lock:
            try:
                await self._do(job)
                # _do marks the job failed itself on a non-zero exit code.
                if job.state != "failed":
                    job.state = "done"
                    job.returncode = 0
                    job.phase = "Done"
            except Exception as e:
                # Task boundary: surface the error in the job log instead of
                # losing it in an un-awaited task.
                job.append(f"[error] {type(e).__name__}: {e}")
                job.state = "failed"
                if job.returncode is None:
                    job.returncode = 1
            finally:
                job.finished_at = datetime.now(timezone.utc).isoformat()
                if self.current_job_id == job.id:
                    self.current_job_id = None

    async def _do(self, job: UpdateJob) -> None:
        """Stream `git pull --ff-only` + `build-and-copy.sh` over SSH into the job log.

        Updates `job.phase` from marker text seen in the output and marks the
        job failed when the stream handle reports a non-zero return code.

        Raises:
            RuntimeError: if the Spark 1 host or user is not configured.
        """
        s = self.settings
        if not s.spark1_host or not s.spark1_user:
            raise RuntimeError("spark1 not configured")

        flag = "-c" if job.mode == "cluster" else ""
        # strip() removes the trailing space left when no flag is passed (solo).
        cmd = (
            "cd ~/spark-vllm-docker && "
            "echo '=== git pull ===' && git pull --ff-only && "
            f"echo '=== build-and-copy ===' && ./build-and-copy.sh {flag}"
        ).strip()
        job.append(f"$ {cmd}")
        job.state = "running"
        job.phase = "Pulling latest changes…"

        handle = StreamHandle()
        async for line in ssh_stream(s.spark1_host, s.spark1_user, cmd, s, handle=handle):
            job.append(line)
            if "=== build-and-copy ===" in line:
                job.phase = "Building the vLLM container…"
            elif "Copy" in line and "complete" in line.lower():
                job.phase = "Copying to peer Sparks…"
            elif "Already up to date" in line:
                job.phase = "No changes to pull; rebuilding…"

        # NOTE(review): a returncode of None is treated as success here —
        # confirm ssh_stream always sets it on the handle before finishing.
        rc = handle.returncode or 0
        if rc != 0:
            job.state = "failed"
            job.returncode = rc
|