"""Update + logs for the matrix-bridge bot container on the Spark. matrix-bridge is a single Docker container managed by docker compose out of a git clone at `~matrix_bridge_user/matrix-bridge`. Status (the badge) and start/stop/restart ride the generic service machinery in `services.py` (`docker_state` / `run_action`). The two things that don't fit that mould live here: - **Update** — `git fetch && git reset --hard origin/ && docker compose up -d --build`. Long-running (docker build), so it streams like the vLLM `UpdateManager`: fire-and-forget job, SSE stream, fail-loud rc. - **Logs** — a one-shot `docker logs --tail N` for diagnosing a red badge. We connect **directly as the configured user** (`modelo` — the repo owner), so git never trips its dubious-ownership guard and docker runs via the user's docker-group membership. We deliberately do NOT `sudo -iu modelo`: this Spark has no passwordless sudo, so a sudo wrap would hang in SSH BatchMode. """ from __future__ import annotations import asyncio import time import uuid from dataclasses import dataclass, field from datetime import datetime, timezone from typing import Optional from .config import Settings from .shellsafe import quote_arg from .ssh import ssh_run, ssh_stream, StreamHandle # Hard ceiling on a single update. A first build after a base-image bump is # slow (minutes); the cache makes later ones quick. 25 min is generous headroom # without letting a genuinely wedged build spin forever. _UPDATE_TIMEOUT_S = 1500 def build_update_command(directory: str, branch: str) -> str: """The update one-liner, run from the bot's git clone as its owner. `directory` and `branch` come from operator config (not request input), so they're interpolated directly — same trust model as the Spark hostnames in `health`/`updates`. `directory` may be `~/...`, which must stay unquoted so the remote login shell expands it; quoting would defeat that. """ return ( f"cd {directory} && " f"git fetch origin && " f"git reset --hard origin/{branch} && " f"docker compose up -d --build" ) def _phase_for(line: str) -> Optional[str]: """Map a streamed output line to a human-readable phase, or None to keep the current phase. Kept loose — compose/buildkit output varies by version.""" low = line.lower() if "git reset" in low or "head is now at" in low: return "Resetting to the latest release…" if "docker compose" in low or "buildkit" in low or low.startswith("step ") or "=> " in line or "building " in low: return "Building the bot image…" if "recreate" in low or "starting" in low or "started" in low or "container matrix-bridge" in low: return "Recreating the container…" if "already up to date" in low: return "No new code; rebuilding…" return None @dataclass class UpdateJob: id: str started_at: str state: str = "starting" lines: list[str] = field(default_factory=list) returncode: Optional[int] = None finished_at: Optional[str] = None phase: str = "Starting…" def append(self, line: str) -> None: self.lines.append(line) if len(self.lines) > 1000: del self.lines[: len(self.lines) - 1000] class MatrixBridgeManager: def __init__(self, settings: Settings) -> None: self.settings = settings self.lock = asyncio.Lock() self.jobs: dict[str, UpdateJob] = {} self.current_job_id: Optional[str] = None def _configured(self) -> bool: s = self.settings return bool(s.matrix_bridge_host and s.matrix_bridge_user) def get(self, job_id: str) -> UpdateJob | None: return self.jobs.get(job_id) async def fetch_logs(self, tail: int = 100) -> dict: """One-shot `docker logs --tail N ` (stderr merged in).""" s = self.settings if not self._configured(): return {"ok": False, "error": "matrix-bridge host not configured"} tail = max(1, min(int(tail), 1000)) # tail is already int-clamped, but quote at the sink anyway so the # shellsafe convention (no raw interpolation into an SSH command) holds # regardless of caller. cmd = f"docker logs --tail {quote_arg(str(tail))} {quote_arg(s.matrix_bridge_container)} 2>&1" rc, out, err = await ssh_run( s.matrix_bridge_host, s.matrix_bridge_user, cmd, s, timeout=20 ) return { "ok": rc == 0, "rc": rc, "container": s.matrix_bridge_container, "output": (out or err).strip(), } async def trigger_update(self) -> UpdateJob: if not self._configured(): raise RuntimeError("matrix-bridge host not configured") if self.lock.locked(): raise RuntimeError("An update is already in progress") job = UpdateJob( id=uuid.uuid4().hex[:8], started_at=datetime.now(timezone.utc).isoformat(), ) self.jobs[job.id] = job self.current_job_id = job.id asyncio.create_task(self._run(job)) return job async def _run(self, job: UpdateJob) -> None: async with self.lock: try: await self._do(job) if job.state != "failed": job.state = "done" job.returncode = 0 job.phase = "Done" except asyncio.TimeoutError: job.append(f"[error] update timed out after {_UPDATE_TIMEOUT_S}s") job.state = "failed" job.returncode = 124 job.phase = "Timed out" except Exception as e: job.append(f"[error] {type(e).__name__}: {e}") job.state = "failed" if job.returncode is None: job.returncode = 1 finally: job.finished_at = datetime.now(timezone.utc).isoformat() if self.current_job_id == job.id: self.current_job_id = None async def _do(self, job: UpdateJob) -> None: s = self.settings cmd = build_update_command(s.matrix_bridge_dir, s.matrix_bridge_branch) job.append(f"$ {cmd}") job.state = "running" job.phase = "Fetching latest code…" handle = StreamHandle() gen = ssh_stream(s.matrix_bridge_host, s.matrix_bridge_user, cmd, s, handle=handle) deadline = time.monotonic() + _UPDATE_TIMEOUT_S try: while True: remaining = deadline - time.monotonic() if remaining <= 0: raise asyncio.TimeoutError try: line = await asyncio.wait_for(gen.__anext__(), timeout=remaining) except StopAsyncIteration: break job.append(line) phase = _phase_for(line) if phase: job.phase = phase finally: # Closing the generator terminates the underlying ssh process and # populates handle.returncode via ssh_stream's finally block. await gen.aclose() rc = handle.returncode or 0 if rc != 0: job.state = "failed" job.returncode = rc