v0.2 hardware backend

2026-05-11 20:14:50 -05:00
parent b9d86fa303
commit cccbee27e4
9 changed files with 607 additions and 40 deletions
@@ -1,55 +1,219 @@
 // Operator-hardware fallback backend. Forwards transcribe requests to
-// the operator's Parakeet (or any Whisper-API-compatible) endpoint and
-// analyze requests to their Gemma (or any OpenAI-API-compatible) endpoint.
+// a Parakeet endpoint (or any Whisper-API-compatible server — same wire
+// format) and analyze requests to a Gemma endpoint (or any
+// OpenAI-compatible chat-completions server).
 //
-// v0.1 is a stub — the endpoints are wired up, but no operator has
-// pointed a real Parakeet/Gemma at the relay yet. Returns a 503
-// "hardware fallback not yet wired" so the credits.js routing logic
-// still applies but users get a clear message instead of a silent
-// failure.
+// Used when a Pro/Max user has exceeded their monthly Gemini cap.
+// Returns the same shape gemini.js produces so route handlers don't
+// need a backend-specific branch downstream:
+//   transcribeAudio → { text, segments, duration_seconds }
+//   analyzeText     → { text }
+//
+// Both endpoints are reached via plain fetch. Avoiding an SDK dependency
+// keeps the relay container slim, and the upstream wire format for these
+// two well-known shapes is simple enough not to need one.
+
+// Upper bound on generated tokens; sent as `max_tokens` with every
+// analyze (chat-completions) request.
+const ANALYZE_MAX_TOKENS = 16000;
+// Gemma served locally tends to live on the host's LAN, not the public
+// internet, so generous timeouts. Same scale as Recap's defaults.
+// 900_000 ms is 15 minutes; it is the default for the backend's
+// per-request `timeoutMs` option.
+const DEFAULT_TIMEOUT_MS = 900_000;
+
+// Model identifier sent with every analyze request. Defaults to
+// "gemma3:27b", which is the typical Ollama tag for the
+// analysis-capable Gemma model. Operators with a different deployment
+// can override it through the RELAY_GEMMA_MODEL env var (an unset or
+// empty value falls back to the default because of `||`). A StartOS
+// action may replace the env var in a future version.
+const HARDWARE_ANALYZE_MODEL = process.env.RELAY_GEMMA_MODEL || "gemma3:27b";
+
+// Parakeet's typical model identifier. Mirrors what Recap's whisper.js
+// sends when the operator points the relay at a NeMo Parakeet HTTP
+// wrapper. Configurable via the RELAY_PARAKEET_MODEL env var for
+// non-default deployments (unset or empty falls back to the default).
+const HARDWARE_TRANSCRIBE_MODEL =
+  process.env.RELAY_PARAKEET_MODEL || "parakeet-tdt-0.6b-v3";

 export function createHardwareBackend({
  parakeetBaseURL = "",
  gemmaBaseURL = "",
+  timeoutMs = DEFAULT_TIMEOUT_MS,
 } = {}) {
+  // Builds the operator-hardware backend.
+  //
+  // Options:
+  //   parakeetBaseURL  base URL of the transcription server; when empty,
+  //                    transcribeAudio rejects with status 503.
+  //   gemmaBaseURL     base URL of the chat-completions server; when
+  //                    empty, analyzeText rejects with status 503.
+  //   timeoutMs        per-request fetch timeout in milliseconds.
+  //
+  // Returns { hasTranscribe, hasAnalyze, transcribeAudio, analyzeText }.
+  // Both methods reject with an Error carrying a numeric `status`:
+  // 503 when unconfigured, 502 for a network failure or an unparseable
+  // upstream body, otherwise the upstream HTTP status.
-  const hasParakeet = !!parakeetBaseURL;
-  const hasGemma = !!gemmaBaseURL;
+  // Strip every trailing slash (not just one) so a base URL ending in
+  // "//" never yields a double-slash request path.
+  const parakeet = parakeetBaseURL ? parakeetBaseURL.replace(/\/+$/, "") : "";
+  const gemma = gemmaBaseURL ? gemmaBaseURL.replace(/\/+$/, "") : "";
+
+  // Creates an Error tagged with the HTTP status to report. The
+  // underlying error, when there is one, is preserved as `cause` so
+  // logs keep the original stack.
+  const statusError = (message, status, cause) => {
+    const e = new Error(message, cause === undefined ? undefined : { cause });
+    e.status = status;
+    return e;
+  };
+
+  // Parses an upstream response as JSON. A body that is not valid JSON
+  // becomes a 502 instead of an untagged SyntaxError.
+  const readJson = async (res, label) => {
+    try {
+      return await res.json();
+    } catch (err) {
+      throw statusError(
+        `${label} returned a non-JSON body: ${err?.message || err}`,
+        502,
+        err
+      );
+    }
+  };
+
+  // Coerces an upstream value to a finite number of seconds. Missing,
+  // non-numeric, or non-finite values become 0, and numeric strings are
+  // converted so that adding the offset never string-concatenates.
+  const toSeconds = (value) => {
+    const n = Number(value);
+    return Number.isFinite(n) ? n : 0;
+  };

  return {
-    hasTranscribe: hasParakeet,
-    hasAnalyze: hasGemma,
+    hasTranscribe: !!parakeet,
+    hasAnalyze: !!gemma,

-    async transcribeAudio() {
-      if (!hasParakeet) {
+    // POST <parakeet>/v1/audio/transcriptions with the OpenAI Whisper
+    // multipart shape. Parakeet wrappers (NeMo + the patched one Recap
+    // already talks to) honor this format and return segments with
+    // per-segment timestamps when timestamp_granularities=segment is
+    // requested. Falls back to a bare request if the rich shape 4xx/5xxs.
+    //
+    // Params:
+    //   audio          payload placed in the multipart "file" part.
+    //   mimeType       content type attached to that part.
+    //   offsetSeconds  base time added to every returned timestamp.
+    // Resolves to { text, segments, duration_seconds }, where `text` is
+    // one "[M:SS] ..." line per segment.
+    async transcribeAudio({
+      audio,
+      mimeType = "application/octet-stream",
+      offsetSeconds = 0,
+    }) {
+      if (!parakeet) {
        const err = new Error(
-          "operator-hardware transcribe path is not configured (relay_parakeet_base_url is empty)"
+          "operator-hardware transcribe is not configured (relay_parakeet_base_url is empty)"
        );
        err.status = 503;
        throw err;
      }
-      // TODO v0.2: POST audio to parakeetBaseURL using the OpenAI
-      // audio-transcriptions wire format Recap already speaks. Return
-      // { text, segments, duration_seconds } in the same shape as
-      // gemini.js's transcribeAudio.
-      const err = new Error("operator-hardware transcribe path not yet implemented in relay v0.1");
-      err.status = 503;
-      throw err;
+
+      // Try the rich request first (verbose_json + segment timestamps).
+      // FormData/Blob globals are available in Node 20+. Wrap the
+      // received Buffer in a Blob so the multipart body is properly
+      // chunked instead of falling back to base64.
+      const buildForm = (richMode) => {
+        const form = new FormData();
+        const blob = new Blob([audio], { type: mimeType });
+        form.append("file", blob, "audio.bin");
+        form.append("model", HARDWARE_TRANSCRIBE_MODEL);
+        if (richMode) {
+          form.append("response_format", "verbose_json");
+          form.append("timestamp_granularities[]", "segment");
+        }
+        return form;
+      };
+
+      const url = `${parakeet}/v1/audio/transcriptions`;
+
+      // One upload attempt. Each attempt gets its own form and its own
+      // timeout signal; a fetch failure is reported as a 502.
+      const post = async (richMode, errorLabel) => {
+        try {
+          return await fetch(url, {
+            method: "POST",
+            body: buildForm(richMode),
+            signal: AbortSignal.timeout(timeoutMs),
+          });
+        } catch (err) {
+          throw statusError(`${errorLabel}: ${err?.message || err}`, 502, err);
+        }
+      };
+
+      let res = await post(true, "Parakeet transcribe network error");
+
+      // If the wrapper rejects the rich params, retry with bare-bones.
+      if (!res.ok && res.status >= 400 && res.status < 600) {
+        const richBody = await safeBody(res);
+        console.warn(
+          `[hardware] rich Parakeet request returned ${res.status}: ${richBody.slice(0, 200)} — retrying bare`
+        );
+        res = await post(false, "Parakeet transcribe network error (fallback)");
+      }
+
+      if (!res.ok) {
+        const body = await safeBody(res);
+        throw statusError(
+          `Parakeet transcribe ${res.status}: ${body.slice(0, 300)}`,
+          res.status
+        );
+      }
+
+      // A literal JSON `null` body is treated like an empty object.
+      const data = (await readJson(res, "Parakeet transcribe")) ?? {};
+      const segments = Array.isArray(data.segments) ? data.segments : [];
+      const baseSeconds = toSeconds(offsetSeconds);
+
+      // Offset support: when the relay caller is processing a chunked
+      // audio file, it asks for transcripts at a non-zero base time.
+      // Parakeet returns timestamps relative to the chunk; shift them
+      // up by offsetSeconds so the combined transcript downstream
+      // lines up with the real video timeline.
+      const shifted = segments.map((s) => ({
+        start: toSeconds(s?.start) + baseSeconds,
+        end: toSeconds(s?.end) + baseSeconds,
+        text: String(s?.text ?? "").trim(),
+      }));
+
+      // Build the [MM:SS] text format Recap's parseTimestampedTranscript
+      // already speaks. The route handler will pass this straight back
+      // to Recap, which parses it on the client side. When the server
+      // returned no segments (bare fallback), the single line is stamped
+      // with the chunk's base time rather than a fixed 0:00, so chunked
+      // transcripts still line up.
+      const lines = shifted.length
+        ? shifted.map((s) => `[${formatMmSs(s.start)}] ${s.text}`)
+        : [`[${formatMmSs(baseSeconds)}] ${String(data.text ?? "").trim()}`];
+
+      return {
+        text: lines.join("\n"),
+        segments: shifted,
+        duration_seconds: toSeconds(data.duration),
+      };
    },

-    async analyzeText() {
-      if (!hasGemma) {
+    // POST <gemma>/v1/chat/completions with the OpenAI shape. Ollama's
+    // server, vLLM, llama.cpp's HTTP server, and most other OSS LLM
+    // runners support this wire format — so we don't lock the relay
+    // to one specific Gemma deployment.
+    //
+    // Params:
+    //   prompt  sent as the content of a single "user" message.
+    // Resolves to { text }; `text` is "" when the response carries no
+    // choices[0].message.content.
+    async analyzeText({ prompt }) {
+      if (!gemma) {
        const err = new Error(
-          "operator-hardware analyze path is not configured (relay_gemma_base_url is empty)"
+          "operator-hardware analyze is not configured (relay_gemma_base_url is empty)"
        );
        err.status = 503;
        throw err;
      }
-      // TODO v0.2: POST prompt to gemmaBaseURL using either /api/generate
-      // (Ollama native) or /v1/chat/completions (OpenAI-compatible).
-      // Return { text } matching gemini.js's analyzeText.
-      const err = new Error("operator-hardware analyze path not yet implemented in relay v0.1");
-      err.status = 503;
-      throw err;
+
+      const url = `${gemma}/v1/chat/completions`;
+      let res;
+      try {
+        res = await fetch(url, {
+          method: "POST",
+          headers: { "Content-Type": "application/json" },
+          body: JSON.stringify({
+            model: HARDWARE_ANALYZE_MODEL,
+            max_tokens: ANALYZE_MAX_TOKENS,
+            messages: [{ role: "user", content: prompt }],
+            stream: false,
+          }),
+          signal: AbortSignal.timeout(timeoutMs),
+        });
+      } catch (err) {
+        throw statusError(
+          `Gemma analyze network error: ${err?.message || err}`,
+          502,
+          err
+        );
+      }
+
+      if (!res.ok) {
+        const body = await safeBody(res);
+        throw statusError(
+          `Gemma analyze ${res.status}: ${body.slice(0, 300)}`,
+          res.status
+        );
+      }
+
+      const data = await readJson(res, "Gemma analyze");
+      const text = data?.choices?.[0]?.message?.content || "";
+      return { text };
    },
  };
 }
+
+/**
+ * Formats a second count as a transcript timestamp: "M:SS" below one
+ * hour, "H:MM:SS" from one hour up. Fractions are floored. Negative,
+ * NaN, infinite, or non-numeric input renders as "0:00" rather than
+ * leaking "NaN:NaN" into the transcript text.
+ *
+ * @param {number} seconds - Number of seconds to format.
+ * @returns {string} The timestamp, without surrounding brackets.
+ */
+function formatMmSs(seconds) {
+  const value = Number(seconds);
+  const total = Number.isFinite(value) ? Math.max(0, Math.floor(value)) : 0;
+  const hours = Math.floor(total / 3600);
+  const minutes = String(Math.floor((total % 3600) / 60));
+  const secs = String(total % 60).padStart(2, "0");
+  if (hours > 0) {
+    return `${hours}:${minutes.padStart(2, "0")}:${secs}`;
+  }
+  return `${minutes}:${secs}`;
+}
+
+// Best-effort read of a response body as text, used when building
+// error and log messages. Any failure while reading (including a
+// response object without a usable text()) yields "" so the caller's
+// original error path is never masked by a secondary one.
+async function safeBody(res) {
+  let bodyText = "";
+  try {
+    bodyText = await res.text();
+  } catch {
+    // Deliberately ignored: the empty string is the fallback.
+  }
+  return bodyText;
+}