Files
spark-control/image/app/matrix_bridge.py
T

187 lines
7.2 KiB
Python

"""Update + logs for the matrix-bridge bot container on the Spark.
matrix-bridge is a single Docker container managed by docker compose out of a
git clone at `~matrix_bridge_user/matrix-bridge`. Status (the badge) and
start/stop/restart ride the generic service machinery in `services.py`
(`docker_state` / `run_action`). The two things that don't fit that mould live
here:
- **Update** — `git fetch && git reset --hard origin/<branch> && docker
compose up -d --build`. Long-running (docker build), so it streams like the
vLLM `UpdateManager`: fire-and-forget job, SSE stream, fail-loud rc.
- **Logs** — a one-shot `docker logs --tail N` for diagnosing a red badge.
We connect **directly as the configured user** (`modelo` — the repo owner), so
git never trips its dubious-ownership guard and docker runs via the user's
docker-group membership. We deliberately do NOT `sudo -iu modelo`: this Spark
has no passwordless sudo, so a sudo wrap would hang in SSH BatchMode.
"""
from __future__ import annotations
import asyncio
import time
import uuid
from dataclasses import dataclass, field
from datetime import datetime, timezone
from typing import Optional
from .config import Settings
from .shellsafe import quote_arg
from .ssh import ssh_run, ssh_stream, StreamHandle
# Hard ceiling on a single update. A first build after a base-image bump is
# slow (minutes); the cache makes later ones quick. 25 min is generous headroom
# without letting a genuinely wedged build spin forever.
_UPDATE_TIMEOUT_S = 1500
def build_update_command(directory: str, branch: str) -> str:
    """Compose the shell one-liner that updates the bot from its git clone.

    Both arguments are operator configuration rather than request input, so
    they are spliced into the command verbatim (the same trust model as the
    Spark hostnames in `health`/`updates`). `directory` is intentionally left
    unquoted: it may be written as `~/...`, and the remote login shell can
    only expand the tilde when it is not quoted.

    Returns the steps chained with `&&`, so a failing step stops the rest.
    """
    steps = (
        f"cd {directory}",
        "git fetch origin",
        f"git reset --hard origin/{branch}",
        "docker compose up -d --build",
    )
    return " && ".join(steps)
def _phase_for(line: str) -> Optional[str]:
"""Map a streamed output line to a human-readable phase, or None to keep
the current phase. Kept loose — compose/buildkit output varies by version."""
low = line.lower()
if "git reset" in low or "head is now at" in low:
return "Resetting to the latest release…"
if "docker compose" in low or "buildkit" in low or low.startswith("step ") or "=> " in line or "building " in low:
return "Building the bot image…"
if "recreate" in low or "starting" in low or "started" in low or "container matrix-bridge" in low:
return "Recreating the container…"
if "already up to date" in low:
return "No new code; rebuilding…"
return None
@dataclass
class UpdateJob:
    """State of one update run, including a bounded tail of its output."""

    # Short identifier used to look the job up later.
    id: str
    # Timestamp string recorded when the job was created.
    started_at: str
    # Lifecycle marker; this module sets "starting", "running", "done", "failed".
    state: str = "starting"
    # Most recent output lines, capped by `append`.
    lines: list[str] = field(default_factory=list)
    # Exit status once known; None until then.
    returncode: Optional[int] = None
    # Timestamp string recorded when the job ended; None until then.
    finished_at: Optional[str] = None
    # Human-readable description of what the job is currently doing.
    phase: str = "Starting…"

    def append(self, line: str) -> None:
        """Record an output line, discarding the oldest beyond 1000 kept."""
        self.lines.append(line)
        overflow = len(self.lines) - 1000
        if overflow > 0:
            del self.lines[:overflow]
class MatrixBridgeManager:
    """Runs matrix-bridge updates and fetches container logs over SSH.

    At most one update is allowed in flight at a time. Every job ever started
    stays in ``jobs`` so it can still be looked up by id through ``get`` after
    it has finished.
    """

    def __init__(self, settings: Settings) -> None:
        """Store the settings and initialise empty job bookkeeping."""
        self.settings = settings
        # Held by `_run` for the full duration of an update.
        self.lock = asyncio.Lock()
        self.jobs: dict[str, UpdateJob] = {}
        # Id of the job that is queued or running; None when idle.
        self.current_job_id: Optional[str] = None
        # Strong references to in-flight update tasks. The event loop only
        # keeps weak references, so an otherwise unreferenced task could be
        # garbage-collected before it finishes.
        self._tasks: set[asyncio.Task[None]] = set()

    def _configured(self) -> bool:
        """Return True when both the SSH host and user are set."""
        s = self.settings
        return bool(s.matrix_bridge_host and s.matrix_bridge_user)

    def get(self, job_id: str) -> UpdateJob | None:
        """Return the job with this id, or None if it is unknown."""
        return self.jobs.get(job_id)

    async def fetch_logs(self, tail: int = 100) -> dict:
        """One-shot `docker logs --tail N <container>` (stderr merged in).

        `tail` is clamped to the range 1..1000. Returns a dict with `ok`,
        `rc`, `container` and `output`; when the host is not configured it
        returns `ok: False` with an `error` message instead of running
        anything.
        """
        if not self._configured():
            return {"ok": False, "error": "matrix-bridge host not configured"}
        s = self.settings
        tail = max(1, min(int(tail), 1000))
        # tail is already int-clamped, but quote at the sink anyway so the
        # shellsafe convention (no raw interpolation into an SSH command) holds
        # regardless of caller.
        cmd = (
            f"docker logs --tail {quote_arg(str(tail))} "
            f"{quote_arg(s.matrix_bridge_container)} 2>&1"
        )
        rc, out, err = await ssh_run(
            s.matrix_bridge_host, s.matrix_bridge_user, cmd, s, timeout=20
        )
        return {
            "ok": rc == 0,
            "rc": rc,
            "container": s.matrix_bridge_container,
            # Fall back to "" so a run that produced no output at all cannot
            # fail on `.strip()`.
            "output": (out or err or "").strip(),
        }

    async def trigger_update(self) -> UpdateJob:
        """Start an update in the background and return its job immediately.

        Raises:
            RuntimeError: if the host is not configured, or if an update is
                already queued or running.
        """
        if not self._configured():
            raise RuntimeError("matrix-bridge host not configured")
        # The lock is only taken once the background task actually starts, so
        # checking it alone would let a second call slip in between
        # `create_task` and the lock acquisition. `current_job_id` is set
        # synchronously below and cleared when the job ends, which closes
        # that window.
        if self.lock.locked() or self.current_job_id is not None:
            raise RuntimeError("An update is already in progress")
        job = UpdateJob(
            id=uuid.uuid4().hex[:8],
            started_at=datetime.now(timezone.utc).isoformat(),
        )
        self.jobs[job.id] = job
        self.current_job_id = job.id
        task = asyncio.create_task(self._run(job))
        # Hold the task until it completes so it cannot be collected mid-run.
        self._tasks.add(task)
        task.add_done_callback(self._tasks.discard)
        return job

    async def _run(self, job: UpdateJob) -> None:
        """Run one update under the lock and record its final outcome.

        Failures are captured on the job (state, return code and an `[error]`
        line) rather than raised, since nothing awaits this task.
        """
        async with self.lock:
            try:
                await self._do(job)
                # `_do` marks the job failed itself on a non-zero exit code.
                if job.state != "failed":
                    job.state = "done"
                    job.returncode = 0
                    job.phase = "Done"
            except asyncio.TimeoutError:
                job.append(f"[error] update timed out after {_UPDATE_TIMEOUT_S}s")
                job.state = "failed"
                job.returncode = 124
                job.phase = "Timed out"
            except Exception as e:
                job.append(f"[error] {type(e).__name__}: {e}")
                job.state = "failed"
                if job.returncode is None:
                    job.returncode = 1
            finally:
                job.finished_at = datetime.now(timezone.utc).isoformat()
                if self.current_job_id == job.id:
                    self.current_job_id = None

    async def _do(self, job: UpdateJob) -> None:
        """Stream the update command over SSH into the job.

        Each output line is appended to the job and may advance its phase.
        Raises asyncio.TimeoutError once the overall deadline has passed.
        """
        s = self.settings
        cmd = build_update_command(s.matrix_bridge_dir, s.matrix_bridge_branch)
        job.append(f"$ {cmd}")
        job.state = "running"
        job.phase = "Fetching latest code…"
        handle = StreamHandle()
        gen = ssh_stream(
            s.matrix_bridge_host, s.matrix_bridge_user, cmd, s, handle=handle
        )
        # One deadline for the whole run: each read waits only for the time
        # still left, so a slow trickle of output cannot extend the limit.
        deadline = time.monotonic() + _UPDATE_TIMEOUT_S
        try:
            while True:
                remaining = deadline - time.monotonic()
                if remaining <= 0:
                    raise asyncio.TimeoutError
                try:
                    line = await asyncio.wait_for(gen.__anext__(), timeout=remaining)
                except StopAsyncIteration:
                    break
                job.append(line)
                phase = _phase_for(line)
                if phase:
                    job.phase = phase
        finally:
            # Closing the generator terminates the underlying ssh process and
            # populates handle.returncode via ssh_stream's finally block.
            await gen.aclose()
        # NOTE(review): a returncode of None is treated as success here;
        # confirm ssh_stream always sets it on completion.
        rc = handle.returncode or 0
        if rc != 0:
            job.state = "failed"
            job.returncode = rc