// recap-relay/server/backends/hardware.js

// Operator-hardware fallback backend. Forwards transcribe requests to
// a Parakeet endpoint (or any Whisper-API-compatible server — same wire
// format) and analyze requests to a Gemma endpoint (or any
// OpenAI-compatible chat-completions server).
//
// Used when a Pro/Max user has exceeded their monthly Gemini cap.
// Returns the same shape gemini.js produces so route handlers don't
// need a backend-specific branch downstream:
//   transcribeAudio → { text, segments, duration_seconds, usage, model }
//   analyzeText     → { text, usage, model }
//
// Both endpoints are reached via plain fetch — no SDK dependency keeps
// the relay container slim and the upstream wire format is dead-simple
// for these two well-known shapes.

// Sent as `max_tokens` in the chat-completions request body built by
// analyzeText.
const ANALYZE_MAX_TOKENS = 16000;
// Gemma served locally tends to live on the host's LAN, not the public
// internet, so generous timeouts. Same scale as Recap's defaults.
// 900_000 ms = 15 minutes. Used as the default `timeoutMs`, which is
// handed to AbortSignal.timeout() separately for each fetch call.
const DEFAULT_TIMEOUT_MS = 900_000;

// Defaults used only when the route handler doesn't supply explicit
// model names (e.g. a unit test instantiating the backend directly).
// In production the model names come from relay-config.json via
// setParakeetUrl / setGemmaUrl, so the operator can swap models on
// their Ollama deployment without rebuilding the relay.
// createHardwareBackend also falls back to these when it is handed a
// falsy model name (e.g. an empty string).
const DEFAULT_TRANSCRIBE_MODEL = "parakeet-tdt-0.6b-v3";
const DEFAULT_ANALYZE_MODEL = "gemma3:27b";

/**
 * Normalize an OpenAI-API-compatible base URL so that `/v1/...` can be
 * appended to it safely.
 *
 * Strips surrounding whitespace, ALL trailing slashes, and a single
 * trailing `/v1` segment if the operator pasted one, because callers
 * always append `/v1/...` themselves. Without this, a base URL of
 * `http://192.168.1.87:8000/v1` would produce
 * `http://192.168.1.87:8000/v1/v1/audio/transcriptions` → 404.
 *
 * @param {string|URL|null|undefined} url - Base URL as configured. Falsy
 *   values are treated as "not configured"; anything else is coerced to
 *   a string first, so a `URL` instance works too.
 * @returns {string} The normalized base, or "" when nothing was configured.
 */
function normalizeApiBase(url) {
  const stripTrailingSlashes = (s) => s.replace(/\/+$/, "");
  // Slashes are stripped twice: once so `/v1/` and `/v1//` are recognized
  // as a trailing `/v1`, and once more so `host//v1` does not leave a
  // dangling slash behind after the `/v1` segment is removed.
  const trimmed = stripTrailingSlashes(String(url || "").trim());
  return stripTrailingSlashes(trimmed.replace(/\/v1$/, ""));
}

/**
 * Create an Error tagged with an HTTP-style `status`; every failure in
 * this backend reports its status code to the caller that way.
 *
 * @param {string} message - Human-readable description.
 * @param {number} status - Value exposed as `err.status`.
 * @param {unknown} [cause] - Underlying error, preserved as `err.cause`.
 * @returns {Error}
 */
function statusError(message, status, cause) {
  const err =
    cause === undefined ? new Error(message) : new Error(message, { cause });
  err.status = status;
  return err;
}

/**
 * POST to `url`, turning any failure to obtain a response (request
 * construction, connection error, timeout abort) into a 502 error.
 *
 * @param {string} url - Absolute endpoint URL.
 * @param {() => object} buildInit - Returns the per-request fetch options
 *   (body, headers). Called inside the try so a failure while building
 *   the request is reported the same way as a failed fetch.
 * @param {number} timeoutMs - Passed to AbortSignal.timeout for this call.
 * @param {string} errorLabel - Prefix for the error message.
 * @returns {Promise<Response>}
 * @throws {Error} with `status === 502` and the original error as `cause`.
 */
async function postOrNetworkError(url, buildInit, timeoutMs, errorLabel) {
  try {
    return await fetch(url, {
      method: "POST",
      ...buildInit(),
      signal: AbortSignal.timeout(timeoutMs),
    });
  } catch (err) {
    throw statusError(
      `${errorLabel} at ${url}: ${err?.message || err}`,
      502,
      err
    );
  }
}

/**
 * POST to each `base + path` candidate in order, moving on to the next
 * candidate only when the current one answers 404. Any other status
 * (success, rate-limit, 500, ...) stops the search.
 *
 * @param {object} opts
 * @param {string} opts.base - Normalized base URL.
 * @param {string[]} opts.paths - Path candidates, most standard first.
 * @param {() => object} opts.buildInit - Fresh fetch options per attempt.
 * @param {number} opts.timeoutMs - Per-attempt timeout.
 * @param {string} opts.errorLabel - Prefix for network-error messages.
 * @returns {Promise<{res: Response, url: string, notFoundBody: string|null}>}
 *   `res`/`url` describe the last attempt. `notFoundBody` is non-null only
 *   when EVERY candidate returned 404; it then holds the last 404's body,
 *   which has already been consumed from `res`.
 */
async function postWithPathFallback({
  base,
  paths,
  buildInit,
  timeoutMs,
  errorLabel,
}) {
  let res = null;
  let url = null;
  let notFoundBody = null;
  for (const path of paths) {
    url = `${base}${path}`;
    res = await postOrNetworkError(url, buildInit, timeoutMs, errorLabel);
    if (res.status !== 404) {
      notFoundBody = null;
      break;
    }
    // Read the 404 body now: it both drains the response and keeps the
    // text available for the final error message if all candidates 404.
    notFoundBody = await safeBody(res);
    console.warn(`[hardware] 404 at ${url} — trying next path candidate`);
  }
  return { res, url, notFoundBody };
}

/**
 * Parse a successful response as JSON, reporting an unparseable body as
 * a 502 so it carries a `status` like every other upstream failure.
 *
 * @param {Response} res - Response whose body has not been read yet.
 * @param {string} what - Upstream name used in the error message.
 * @param {string} url - Endpoint URL used in the error message.
 * @returns {Promise<any>} The parsed JSON value.
 * @throws {Error} with `status === 502` when the body is not valid JSON.
 */
async function readJsonBody(res, what, url) {
  try {
    return await res.json();
  } catch (err) {
    throw statusError(
      `${what} returned a non-JSON body at ${url}: ${err?.message || err}`,
      502,
      err
    );
  }
}

/**
 * Build the operator-hardware backend.
 *
 * @param {object} [options]
 * @param {string} [options.parakeetBaseURL=""] - Base URL of the
 *   Whisper-API-compatible transcription server; empty disables transcribe.
 * @param {string} [options.gemmaBaseURL=""] - Base URL of the
 *   OpenAI-compatible chat-completions server; empty disables analyze.
 * @param {string} [options.parakeetModel] - Model name sent with
 *   transcribe requests; falsy values fall back to the default.
 * @param {string} [options.gemmaModel] - Model name sent with analyze
 *   requests; falsy values fall back to the default.
 * @param {number} [options.timeoutMs] - Timeout applied to each fetch.
 * @returns {{
 *   hasTranscribe: boolean,
 *   hasAnalyze: boolean,
 *   transcribeAudio: Function,
 *   analyzeText: Function,
 * }} Both methods reject with an Error carrying a numeric `status`:
 *   503 when the endpoint is not configured, 502 when no usable response
 *   could be obtained, otherwise the upstream's own status code.
 */
export function createHardwareBackend({
  parakeetBaseURL = "",
  gemmaBaseURL = "",
  parakeetModel = DEFAULT_TRANSCRIBE_MODEL,
  gemmaModel = DEFAULT_ANALYZE_MODEL,
  timeoutMs = DEFAULT_TIMEOUT_MS,
} = {}) {
  const parakeet = normalizeApiBase(parakeetBaseURL);
  const gemma = normalizeApiBase(gemmaBaseURL);
  const transcribeModel = parakeetModel || DEFAULT_TRANSCRIBE_MODEL;
  const analyzeModel = gemmaModel || DEFAULT_ANALYZE_MODEL;

  return {
    hasTranscribe: !!parakeet,
    hasAnalyze: !!gemma,

    // POST <parakeet>/v1/audio/transcriptions with the OpenAI Whisper
    // multipart shape, asking for verbose_json with segment timestamps.
    // Falls back to a bare request (file + model only) if the rich
    // request is answered with a 4xx/5xx other than 404.
    //
    // Resolves to { text, segments, duration_seconds, usage, model }
    // where `text` is one "[M:SS] ..." line per segment and segment
    // times have `offsetSeconds` added.
    async transcribeAudio({
      audio,
      mimeType = "application/octet-stream",
      offsetSeconds = 0,
    }) {
      if (!parakeet) {
        throw statusError(
          "operator-hardware transcribe is not configured (relay_parakeet_base_url is empty)",
          503
        );
      }

      // FormData/Blob globals are available in Node 20+. The audio bytes
      // are wrapped in a Blob so they travel as a binary multipart file
      // part. A fresh form is built for every attempt.
      const buildForm = (richMode) => {
        const form = new FormData();
        const blob = new Blob([audio], { type: mimeType });
        form.append("file", blob, "audio.bin");
        form.append("model", transcribeModel);
        if (richMode) {
          form.append("response_format", "verbose_json");
          form.append("timestamp_granularities[]", "segment");
        }
        return form;
      };

      // Path candidates, in order. The OpenAI Whisper standard is
      // `/v1/audio/transcriptions`; some self-hosted wrappers expose the
      // endpoint at `/audio/transcriptions` instead. Only a 404 moves on
      // to the next candidate.
      const pathCandidates = [
        "/v1/audio/transcriptions",
        "/audio/transcriptions",
      ];
      const attempt = await postWithPathFallback({
        base: parakeet,
        paths: pathCandidates,
        buildInit: () => ({ body: buildForm(true) }),
        timeoutMs,
        errorLabel: "Parakeet transcribe network error",
      });
      const lastUrl = attempt.url;
      let res = attempt.res;

      // If the wrapper rejects the rich params (any 4xx/5xx except the
      // 404s already exhausted above), retry bare-bones at the same URL.
      if (res.status >= 400 && res.status !== 404) {
        const richBody = await safeBody(res);
        console.warn(
          `[hardware] rich Parakeet request to ${lastUrl} returned ${res.status}: ${richBody.slice(0, 200)} — retrying bare`
        );
        res = await postOrNetworkError(
          lastUrl,
          () => ({ body: buildForm(false) }),
          timeoutMs,
          "Parakeet transcribe network error (fallback)"
        );
      }

      if (!res.ok) {
        // When every path 404'd, the body was already consumed by the
        // path loop; reuse that text instead of re-reading (which would
        // yield an empty string).
        const body = attempt.notFoundBody ?? (await safeBody(res));
        const hint =
          res.status === 404
            ? ` (tried ${pathCandidates.join(" and ")} on base ${parakeet} — wrapper may expose the endpoint at a different path; check the Parakeet URL or container logs)`
            : "";
        throw statusError(
          `Parakeet transcribe ${res.status} at ${lastUrl}: ${body.slice(0, 300)}${hint}`,
          res.status
        );
      }

      const data =
        (await readJsonBody(res, "Parakeet transcribe", lastUrl)) ?? {};
      const segments = Array.isArray(data.segments) ? data.segments : [];

      // Offset support: shift the timestamps returned for this piece of
      // audio up by offsetSeconds so a caller processing chunked audio
      // gets times on the overall timeline. Coerced to a number so a
      // string offset cannot turn the addition into concatenation.
      const offset = Number(offsetSeconds) || 0;
      const shifted = segments.map((s) => ({
        start: (s?.start || 0) + offset,
        end: (s?.end || 0) + offset,
        text: (s?.text || "").trim(),
      }));

      // One "[M:SS] text" line per segment; with no segments, the whole
      // transcript is emitted as a single line stamped [0:00].
      const lines = shifted.length
        ? shifted.map((s) => `[${formatMmSs(s.start)}] ${s.text}`)
        : [`[0:00] ${(data.text || "").trim()}`];

      return {
        text: lines.join("\n"),
        segments: shifted,
        duration_seconds: data.duration || 0,
        usage: null, // hardware backend doesn't expose token counts
        model: transcribeModel,
      };
    },

    // POST <gemma>/v1/chat/completions with the OpenAI chat shape (single
    // user message, non-streaming), so the relay is not locked to one
    // specific Gemma deployment.
    //
    // Resolves to { text, usage, model }; `text` is the first choice's
    // message content, or "" when the response carries none.
    async analyzeText({ prompt }) {
      if (!gemma) {
        throw statusError(
          "operator-hardware analyze is not configured (relay_gemma_base_url is empty)",
          503
        );
      }

      // Same path-fallback shape as transcribeAudio: the standard
      // /v1/chat/completions first, then /chat/completions on 404.
      const pathCandidates = ["/v1/chat/completions", "/chat/completions"];
      const {
        res,
        url: lastUrl,
        notFoundBody,
      } = await postWithPathFallback({
        base: gemma,
        paths: pathCandidates,
        buildInit: () => ({
          headers: { "Content-Type": "application/json" },
          body: JSON.stringify({
            model: analyzeModel,
            max_tokens: ANALYZE_MAX_TOKENS,
            messages: [{ role: "user", content: prompt }],
            stream: false,
          }),
        }),
        timeoutMs,
        errorLabel: "Gemma analyze network error",
      });

      if (!res.ok) {
        // A non-null notFoundBody means the 404 body was already read.
        const body = notFoundBody ?? (await safeBody(res));
        const hint =
          res.status === 404
            ? ` (tried ${pathCandidates.join(" and ")} on base ${gemma} — check the Gemma/Ollama URL)`
            : "";
        throw statusError(
          `Gemma analyze ${res.status} at ${lastUrl}: ${body.slice(0, 300)}${hint}`,
          res.status
        );
      }

      const data = await readJsonBody(res, "Gemma analyze", lastUrl);
      const text = data?.choices?.[0]?.message?.content || "";
      return {
        text,
        usage: null,
        model: analyzeModel,
      };
    },
  };
}

/**
 * Render a second count as a clock-style label: `M:SS` below one hour,
 * `H:MM:SS` from one hour up. Fractions are floored and negative values
 * are clamped to zero. The leading field is never zero-padded.
 *
 * @param {number} seconds - Elapsed time in seconds.
 * @returns {string} e.g. "0:07", "12:34", "1:02:03".
 */
function formatMmSs(seconds) {
  const twoDigits = (value) => String(value).padStart(2, "0");

  const total = Math.max(0, Math.floor(seconds));
  const hours = Math.floor(total / 3600);
  const minutes = Math.floor((total % 3600) / 60);
  const secs = total % 60;

  const fields =
    hours > 0
      ? [String(hours), twoDigits(minutes), twoDigits(secs)]
      : [String(minutes), twoDigits(secs)];
  return fields.join(":");
}

/**
 * Best-effort read of a response body as text, for use in log and error
 * messages. Any failure while reading (for example a body that was
 * already consumed) yields an empty string instead of an exception.
 *
 * @param {{ text: () => Promise<string> }} res - Response-like object.
 * @returns {Promise<string>} The body text, or "" if it cannot be read.
 */
async function safeBody(res) {
  let bodyText = "";
  try {
    bodyText = await res.text();
  } catch {
    // Deliberately ignored: callers only want the text for diagnostics.
  }
  return bodyText;
}