187 lines
7.2 KiB
Python
187 lines
7.2 KiB
Python
"""Update + logs for the matrix-bridge bot container on the Spark.
|
|
|
|
matrix-bridge is a single Docker container managed by docker compose out of a
|
|
git clone at `~matrix_bridge_user/matrix-bridge`. Status (the badge) and
|
|
start/stop/restart ride the generic service machinery in `services.py`
|
|
(`docker_state` / `run_action`). The two things that don't fit that mould live
|
|
here:
|
|
|
|
- **Update** — `git fetch && git reset --hard origin/<branch> && docker
|
|
compose up -d --build`. Long-running (docker build), so it streams like the
|
|
vLLM `UpdateManager`: fire-and-forget job, SSE stream, fail-loud rc.
|
|
- **Logs** — a one-shot `docker logs --tail N` for diagnosing a red badge.
|
|
|
|
We connect **directly as the configured user** (`modelo` — the repo owner), so
|
|
git never trips its dubious-ownership guard and docker runs via the user's
|
|
docker-group membership. We deliberately do NOT `sudo -iu modelo`: this Spark
|
|
has no passwordless sudo, so a sudo wrap would hang in SSH BatchMode.
|
|
"""
|
|
from __future__ import annotations
|
|
import asyncio
|
|
import time
|
|
import uuid
|
|
from dataclasses import dataclass, field
|
|
from datetime import datetime, timezone
|
|
from typing import Optional
|
|
|
|
from .config import Settings
|
|
from .shellsafe import quote_arg
|
|
from .ssh import ssh_run, ssh_stream, StreamHandle
|
|
|
|
# Hard ceiling on a single update. A first build after a base-image bump is
|
|
# slow (minutes); the cache makes later ones quick. 25 min is generous headroom
|
|
# without letting a genuinely wedged build spin forever.
|
|
_UPDATE_TIMEOUT_S = 1500
|
|
|
|
|
|
def build_update_command(directory: str, branch: str) -> str:
    """Compose the shell one-liner that updates the bot from its git clone.

    The command changes into `directory`, fetches `origin`, hard-resets the
    working tree to `origin/<branch>`, then rebuilds and restarts the compose
    project. The steps are chained with `&&`, so the first failing step stops
    the rest.

    Both arguments are operator configuration rather than request input and
    are interpolated as-is. `directory` in particular must stay unquoted: it
    may be written as `~/...`, and only an unquoted tilde is expanded by the
    remote shell.

    Args:
        directory: Path of the git clone on the remote host (may start with `~`).
        branch: Branch name under `origin/` to reset to.

    Returns:
        The full command string, ready to be sent over SSH.
    """
    steps = (
        f"cd {directory}",
        "git fetch origin",
        f"git reset --hard origin/{branch}",
        "docker compose up -d --build",
    )
    return " && ".join(steps)
|
|
|
|
|
|
def _phase_for(line: str) -> Optional[str]:
    """Translate one line of streamed update output into a display phase.

    Matching is deliberately fuzzy (case-insensitive substring checks) because
    compose/buildkit output differs between versions. Rules are tried in
    order and the first hit wins.

    Args:
        line: A single line of output from the remote update command.

    Returns:
        The human-readable phase label for this line, or ``None`` when the
        line carries no phase information and the current phase should stay.
    """
    lowered = line.lower()

    def mentions(*needles: str) -> bool:
        # True when any needle occurs in the lower-cased line.
        return any(needle in lowered for needle in needles)

    if mentions("git reset", "head is now at"):
        return "Resetting to the latest release…"

    looks_like_build = (
        mentions("docker compose", "buildkit")
        or lowered.startswith("step ")  # "Step N/M" style build output
        or "=> " in line  # buildkit progress marker
        or mentions("building ")
    )
    if looks_like_build:
        return "Building the bot image…"

    if mentions("recreate", "starting", "started", "container matrix-bridge"):
        return "Recreating the container…"

    if mentions("already up to date"):
        return "No new code; rebuilding…"

    return None
|
|
|
|
|
|
@dataclass
class UpdateJob:
    """In-memory record of one update run and its captured output."""

    # Identifier used to look the job up.
    id: str
    # Timestamp string supplied by the creator of the job.
    started_at: str
    # Lifecycle marker; a new job begins as "starting".
    state: str = "starting"
    # Captured output lines, capped by `append`.
    lines: list[str] = field(default_factory=list)
    # Exit code once known; None until then.
    returncode: Optional[int] = None
    # Timestamp string set when the run ends; None until then.
    finished_at: Optional[str] = None
    # Human-readable progress label.
    phase: str = "Starting…"

    def append(self, line: str) -> None:
        """Record `line`, keeping only the most recent 1000 lines."""
        self.lines.append(line)
        overflow = len(self.lines) - 1000
        if overflow > 0:
            # Drop the oldest entries in place so the list object is reused.
            del self.lines[:overflow]
|
|
|
|
|
|
class MatrixBridgeManager:
    """Runs updates and fetches logs for the matrix-bridge container over SSH.

    At most one update runs at a time. Each update is tracked as an
    `UpdateJob` stored in `jobs` under its id, and `current_job_id` names the
    job that is pending or running (``None`` when idle).
    """

    def __init__(self, settings: Settings) -> None:
        """Store `settings` and initialise empty job bookkeeping."""
        self.settings = settings
        # Held for the whole duration of an update run.
        self.lock = asyncio.Lock()
        self.jobs: dict[str, UpdateJob] = {}
        self.current_job_id: Optional[str] = None
        # Strong references to in-flight update tasks. The event loop only
        # keeps weak references, so an unreferenced task may be collected
        # before it finishes.
        self._tasks: set[asyncio.Task] = set()

    def _configured(self) -> bool:
        """Return True when both the SSH host and user are set."""
        s = self.settings
        return bool(s.matrix_bridge_host and s.matrix_bridge_user)

    def get(self, job_id: str) -> UpdateJob | None:
        """Return the job registered under `job_id`, or None if unknown."""
        return self.jobs.get(job_id)

    async def fetch_logs(self, tail: int = 100) -> dict:
        """One-shot `docker logs --tail N <container>` (stderr merged in).

        Args:
            tail: Number of trailing log lines requested; clamped to 1..1000.

        Returns:
            ``{"ok": False, "error": ...}`` when the host is not configured,
            otherwise a dict with ``ok`` (rc == 0), ``rc``, ``container`` and
            the stripped ``output``.
        """
        s = self.settings
        if not self._configured():
            return {"ok": False, "error": "matrix-bridge host not configured"}
        tail = max(1, min(int(tail), 1000))
        # tail is already int-clamped, but quote at the sink anyway so the
        # shellsafe convention (no raw interpolation into an SSH command) holds
        # regardless of caller.
        cmd = f"docker logs --tail {quote_arg(str(tail))} {quote_arg(s.matrix_bridge_container)} 2>&1"
        rc, out, err = await ssh_run(
            s.matrix_bridge_host, s.matrix_bridge_user, cmd, s, timeout=20
        )
        return {
            "ok": rc == 0,
            "rc": rc,
            "container": s.matrix_bridge_container,
            # Fall back to "" so a run that produced no output at all cannot
            # raise on `.strip()`.
            "output": (out or err or "").strip(),
        }

    async def trigger_update(self) -> UpdateJob:
        """Register a new update job and start it in the background.

        Returns:
            The freshly created job; it runs in a background task.

        Raises:
            RuntimeError: If the host is not configured, or an update is
                already pending or running.
        """
        if not self._configured():
            raise RuntimeError("matrix-bridge host not configured")
        # The lock is only taken once the background task first runs, so also
        # consult `current_job_id`, which is set synchronously below. Without
        # it a second call arriving before the task starts would slip through
        # and queue a second update.
        if self.lock.locked() or self.current_job_id is not None:
            raise RuntimeError("An update is already in progress")
        job = UpdateJob(
            id=uuid.uuid4().hex[:8],
            started_at=datetime.now(timezone.utc).isoformat(),
        )
        self.jobs[job.id] = job
        self.current_job_id = job.id

        task = asyncio.create_task(self._run(job))
        self._tasks.add(task)

        def _release(done: asyncio.Task, job_id: str = job.id) -> None:
            # Drop our reference, and clear the in-progress marker in case the
            # task was cancelled before `_run` ever started (its `finally`
            # would not have run then).
            self._tasks.discard(done)
            if self.current_job_id == job_id:
                self.current_job_id = None

        task.add_done_callback(_release)
        return job

    async def _run(self, job: UpdateJob) -> None:
        """Execute `job` under the lock and record its final outcome.

        Sets `state`, `returncode` and `phase` for success, timeout
        (returncode 124) and unexpected errors (returncode 1 unless one was
        already set). `finished_at` is always stamped and the in-progress
        marker is cleared.
        """
        async with self.lock:
            try:
                await self._do(job)
                if job.state == "failed":
                    # `_do` reported a non-zero exit; make the phase terminal
                    # too instead of leaving the last progress label showing.
                    job.phase = "Failed"
                else:
                    job.state = "done"
                    job.returncode = 0
                    job.phase = "Done"
            except asyncio.TimeoutError:
                job.append(f"[error] update timed out after {_UPDATE_TIMEOUT_S}s")
                job.state = "failed"
                job.returncode = 124
                job.phase = "Timed out"
            except Exception as e:
                # Boundary handler: record the failure on the job rather than
                # letting it vanish inside a background task.
                job.append(f"[error] {type(e).__name__}: {e}")
                job.state = "failed"
                job.phase = "Failed"
                if job.returncode is None:
                    job.returncode = 1
            finally:
                job.finished_at = datetime.now(timezone.utc).isoformat()
                if self.current_job_id == job.id:
                    self.current_job_id = None

    async def _do(self, job: UpdateJob) -> None:
        """Stream the update command's output into `job`.

        Each line is appended to the job and may advance its phase. The whole
        stream is bounded by `_UPDATE_TIMEOUT_S`. A non-zero exit code marks
        the job failed.

        Raises:
            asyncio.TimeoutError: If the overall deadline passes.
        """
        s = self.settings
        cmd = build_update_command(s.matrix_bridge_dir, s.matrix_bridge_branch)
        job.append(f"$ {cmd}")
        job.state = "running"
        job.phase = "Fetching latest code…"

        handle = StreamHandle()
        gen = ssh_stream(s.matrix_bridge_host, s.matrix_bridge_user, cmd, s, handle=handle)
        # One deadline for the whole run; each read gets the time left.
        deadline = time.monotonic() + _UPDATE_TIMEOUT_S
        try:
            while True:
                remaining = deadline - time.monotonic()
                if remaining <= 0:
                    raise asyncio.TimeoutError
                try:
                    line = await asyncio.wait_for(gen.__anext__(), timeout=remaining)
                except StopAsyncIteration:
                    break
                job.append(line)
                phase = _phase_for(line)
                if phase:
                    job.phase = phase
        finally:
            # Closing the generator terminates the underlying ssh process and
            # populates handle.returncode via ssh_stream's finally block.
            await gen.aclose()

        # NOTE(review): a returncode of None is treated as success here —
        # confirm ssh_stream always sets it once the stream has ended.
        rc = handle.returncode or 0
        if rc != 0:
            job.append(f"[error] update exited with rc={rc}")
            job.state = "failed"
            job.returncode = rc
|