// Operator-hardware fallback backend.
//
// Forwards transcribe requests to a Parakeet endpoint (or any
// Whisper-API-compatible server — same wire format) and analyze requests
// to a Gemma endpoint (or any OpenAI-compatible chat-completions server).
//
// Used when a Pro/Max user has exceeded their monthly Gemini cap.
// Returns the same shape gemini.js produces so route handlers don't need
// a backend-specific branch downstream:
//   transcribeAudio → { text, segments, duration_seconds }
//   analyzeText     → { text }
//
// Both endpoints are reached via plain fetch — no SDK dependency keeps the
// relay container slim, and the upstream wire format is dead-simple for
// these two well-known shapes.

// Upper bound on completion tokens requested from the analyze endpoint.
const ANALYZE_MAX_TOKENS = 16_000;

// Gemma served locally tends to live on the host's LAN, not the public
// internet, so timeouts are generous (15 minutes). Same scale as Recap's
// defaults.
const DEFAULT_TIMEOUT_MS = 15 * 60 * 1000;

// Defaults used only when the route handler doesn't supply explicit model
// names (e.g. a unit test instantiating the backend directly). In
// production the model names come from relay-config.json via
// setParakeetUrl / setGemmaUrl, so the operator can swap models on their
// Ollama deployment without rebuilding the relay.
const DEFAULT_TRANSCRIBE_MODEL = "parakeet-tdt-0.6b-v3";
const DEFAULT_ANALYZE_MODEL = "gemma3:27b";

// Normalize an OpenAI-API-compatible base URL: strip trailing slash AND
// strip a trailing `/v1` segment if the operator pasted one, because we
// always append `/v1/...` below. Without this, a base URL of
// `http://192.168.1.87:8000/v1` would produce
// `http://192.168.1.87:8000/v1/v1/audio/transcriptions` → 404.
/**
 * Canonicalize an OpenAI-compatible base URL so `/v1/...` can be appended.
 *
 * Trims whitespace, removes every trailing slash, and drops one trailing
 * `/v1` segment (plus any slashes that precede it).
 *
 * @param {string} [url] - Operator-supplied base URL; falsy values become "".
 * @returns {string} Base URL with no trailing slash and no trailing `/v1`.
 */
function normalizeApiBase(url) {
  return String(url || "")
    .trim()
    .replace(/\/+$/, "")
    .replace(/\/v1$/, "")
    .replace(/\/+$/, "");
}

/**
 * Build an Error carrying an HTTP-style `status` and, optionally, the
 * underlying error as `cause`.
 *
 * @param {string} message
 * @param {number} status
 * @param {unknown} [cause]
 * @returns {Error & { status: number }}
 */
function statusError(message, status, cause) {
  const err =
    cause === undefined ? new Error(message) : new Error(message, { cause });
  err.status = status;
  return err;
}

/**
 * POST to `base + path` for each candidate path, stopping at the first
 * response that is not a 404. Only 404 advances to the next candidate;
 * any other status is returned to the caller as-is.
 *
 * @param {object} opts
 * @param {string} opts.base - Normalized base URL.
 * @param {string[]} opts.paths - Candidate paths, in priority order.
 * @param {() => object} opts.buildInit - Produces a fresh fetch init per
 *   attempt (request bodies and timeout signals must not be reused).
 * @param {string} opts.label - Prefix for the network-error message.
 * @returns {Promise<{ res: Response, url: string, notFoundBody: string }>}
 *   The last response, the URL it came from, and the body text of the most
 *   recent 404 (that body has already been consumed from its response).
 * @throws {Error} status 502 when fetch itself rejects.
 */
async function postToFirstAvailablePath({ base, paths, buildInit, label }) {
  let res = null;
  let url = null;
  let notFoundBody = "";
  for (const path of paths) {
    url = `${base}${path}`;
    try {
      res = await fetch(url, buildInit());
    } catch (err) {
      throw statusError(
        `${label} network error at ${url}: ${err?.message || err}`,
        502,
        err
      );
    }
    if (res.status !== 404) break;
    // Reading the body consumes it, so keep the text for the final error.
    notFoundBody = await safeBody(res);
    console.warn(`[hardware] 404 at ${url} — trying next path candidate`);
  }
  return { res, url, notFoundBody };
}

/**
 * Parse a successful response as JSON, converting a parse failure into a
 * 502 so callers always get an error with a `status`.
 *
 * @param {Response} res
 * @param {string} label
 * @param {string} url
 * @returns {Promise<any>}
 * @throws {Error} status 502 when the body is not valid JSON.
 */
async function readJsonOrThrow(res, label, url) {
  try {
    return await res.json();
  } catch (err) {
    throw statusError(
      `${label} returned a non-JSON body at ${url}: ${err?.message || err}`,
      502,
      err
    );
  }
}

/**
 * Create the operator-hardware backend.
 *
 * @param {object} [options]
 * @param {string} [options.parakeetBaseURL=""] - Base URL of the
 *   Whisper-API-compatible transcription server; empty disables transcribe.
 * @param {string} [options.gemmaBaseURL=""] - Base URL of the
 *   OpenAI-compatible chat-completions server; empty disables analyze.
 * @param {string} [options.parakeetModel] - Model name sent with transcribe
 *   requests; falsy falls back to DEFAULT_TRANSCRIBE_MODEL.
 * @param {string} [options.gemmaModel] - Model name sent with analyze
 *   requests; falsy falls back to DEFAULT_ANALYZE_MODEL.
 * @param {number} [options.timeoutMs] - Per-request timeout in milliseconds.
 * @returns {{
 *   hasTranscribe: boolean,
 *   hasAnalyze: boolean,
 *   transcribeAudio: Function,
 *   analyzeText: Function
 * }}
 */
export function createHardwareBackend({
  parakeetBaseURL = "",
  gemmaBaseURL = "",
  parakeetModel = DEFAULT_TRANSCRIBE_MODEL,
  gemmaModel = DEFAULT_ANALYZE_MODEL,
  timeoutMs = DEFAULT_TIMEOUT_MS,
} = {}) {
  const parakeet = normalizeApiBase(parakeetBaseURL);
  const gemma = normalizeApiBase(gemmaBaseURL);
  const transcribeModel = parakeetModel || DEFAULT_TRANSCRIBE_MODEL;
  const analyzeModel = gemmaModel || DEFAULT_ANALYZE_MODEL;

  return {
    hasTranscribe: !!parakeet,
    hasAnalyze: !!gemma,

    /**
     * POST /v1/audio/transcriptions using the OpenAI Whisper multipart
     * shape. The first attempt asks for verbose_json with segment
     * timestamps; if that is rejected with any non-404 error status, the
     * request is retried once without those extras at the same URL.
     *
     * @param {object} args
     * @param {Buffer|Uint8Array|Blob} args.audio - Raw audio bytes.
     * @param {string} [args.mimeType="application/octet-stream"]
     * @param {number} [args.offsetSeconds=0] - Added to every timestamp so
     *   a chunk's transcript lines up with the full recording's timeline.
     * @returns {Promise<{ text: string, segments: Array<{start: number,
     *   end: number, text: string}>, duration_seconds: number, usage: null,
     *   model: string }>}
     * @throws {Error} with `status` 503 (not configured), 502 (network or
     *   non-JSON response), or the upstream HTTP status.
     */
    async transcribeAudio({
      audio,
      mimeType = "application/octet-stream",
      offsetSeconds = 0,
    }) {
      if (!parakeet) {
        throw statusError(
          "operator-hardware transcribe is not configured (relay_parakeet_base_url is empty)",
          503
        );
      }

      // FormData/Blob globals are available in Node 20+. The audio is
      // wrapped in a Blob so it is sent as a multipart file part.
      const buildForm = (richMode) => {
        const form = new FormData();
        form.append("file", new Blob([audio], { type: mimeType }), "audio.bin");
        form.append("model", transcribeModel);
        if (richMode) {
          form.append("response_format", "verbose_json");
          form.append("timestamp_granularities[]", "segment");
        }
        return form;
      };
      const buildInit = (richMode) => ({
        method: "POST",
        body: buildForm(richMode),
        signal: AbortSignal.timeout(timeoutMs),
      });

      // The OpenAI Whisper standard path comes first; some self-hosted
      // wrappers expose the endpoint without the `/v1` prefix.
      const pathCandidates = [
        "/v1/audio/transcriptions",
        "/audio/transcriptions",
      ];

      const attempt = await postToFirstAvailablePath({
        base: parakeet,
        paths: pathCandidates,
        buildInit: () => buildInit(true),
        label: "Parakeet transcribe",
      });
      let res = attempt.res;
      const lastUrl = attempt.url;

      // The wrapper rejected the rich params (4xx other than 404, or 5xx):
      // retry bare-bones at the URL that answered.
      if (
        !res.ok &&
        res.status >= 400 &&
        res.status < 600 &&
        res.status !== 404
      ) {
        const richBody = await safeBody(res);
        console.warn(
          `[hardware] rich Parakeet request to ${lastUrl} returned ${res.status}: ${richBody.slice(0, 200)} — retrying bare`
        );
        try {
          res = await fetch(lastUrl, buildInit(false));
        } catch (err) {
          throw statusError(
            `Parakeet transcribe network error (fallback) at ${lastUrl}: ${err?.message || err}`,
            502,
            err
          );
        }
      }

      if (!res.ok) {
        // A 404 from the path loop already had its body read; reading it a
        // second time would yield "", so reuse the captured text instead.
        const body = res.bodyUsed
          ? attempt.notFoundBody
          : await safeBody(res);
        const hint =
          res.status === 404
            ? ` (tried ${pathCandidates.join(" and ")} on base ${parakeet} — wrapper may expose the endpoint at a different path; check the Parakeet URL or container logs)`
            : "";
        throw statusError(
          `Parakeet transcribe ${res.status} at ${lastUrl}: ${body.slice(0, 300)}${hint}`,
          res.status
        );
      }

      const data = await readJsonOrThrow(res, "Parakeet transcribe", lastUrl);
      const segments = Array.isArray(data?.segments) ? data.segments : [];

      // Timestamps are taken as relative to the submitted chunk; shift
      // them by offsetSeconds onto the caller's overall timeline.
      const shifted = segments.map((s) => ({
        start: (s?.start || 0) + offsetSeconds,
        end: (s?.end || 0) + offsetSeconds,
        text: (s?.text || "").trim(),
      }));

      // One "[M:SS] text" line per segment. Without segments, emit the
      // whole text as a single line stamped at the chunk's offset so it
      // still lands at the right place on the timeline.
      const lines = shifted.length
        ? shifted.map((s) => `[${formatMmSs(s.start)}] ${s.text}`)
        : [`[${formatMmSs(offsetSeconds)}] ${(data?.text || "").trim()}`];

      return {
        text: lines.join("\n"),
        segments: shifted,
        duration_seconds: data?.duration || 0,
        usage: null, // hardware backend doesn't expose token counts
        model: transcribeModel,
      };
    },

    /**
     * POST /v1/chat/completions using the OpenAI chat shape, with the
     * prompt as a single user message and streaming disabled.
     *
     * @param {object} args
     * @param {string} args.prompt
     * @returns {Promise<{ text: string, usage: null, model: string }>}
     *   `text` is "" when the response has no first-choice message content.
     * @throws {Error} with `status` 503 (not configured), 502 (network or
     *   non-JSON response), or the upstream HTTP status.
     */
    async analyzeText({ prompt }) {
      if (!gemma) {
        throw statusError(
          "operator-hardware analyze is not configured (relay_gemma_base_url is empty)",
          503
        );
      }

      // Same path fallback as transcribe: standard path first, then the
      // variant without the `/v1` prefix.
      const pathCandidates = ["/v1/chat/completions", "/chat/completions"];

      const { res, url: lastUrl, notFoundBody } = await postToFirstAvailablePath({
        base: gemma,
        paths: pathCandidates,
        buildInit: () => ({
          method: "POST",
          headers: { "Content-Type": "application/json" },
          body: JSON.stringify({
            model: analyzeModel,
            max_tokens: ANALYZE_MAX_TOKENS,
            messages: [{ role: "user", content: prompt }],
            stream: false,
          }),
          signal: AbortSignal.timeout(timeoutMs),
        }),
        label: "Gemma analyze",
      });

      if (!res.ok) {
        const body = res.bodyUsed ? notFoundBody : await safeBody(res);
        const hint =
          res.status === 404
            ? ` (tried ${pathCandidates.join(" and ")} on base ${gemma} — check the Gemma/Ollama URL)`
            : "";
        throw statusError(
          `Gemma analyze ${res.status} at ${lastUrl}: ${body.slice(0, 300)}${hint}`,
          res.status
        );
      }

      const data = await readJsonOrThrow(res, "Gemma analyze", lastUrl);
      const text = data?.choices?.[0]?.message?.content || "";
      return {
        text,
        usage: null,
        model: analyzeModel,
      };
    },
  };
}

/**
 * Format a number of seconds as `M:SS`, or `H:MM:SS` from one hour up.
 * Fractions are floored; negative and non-finite inputs render as `0:00`.
 *
 * @param {number} seconds
 * @returns {string}
 */
function formatMmSs(seconds) {
  const total = Number.isFinite(seconds) ? Math.max(0, Math.floor(seconds)) : 0;
  const h = Math.floor(total / 3600);
  const m = Math.floor((total % 3600) / 60);
  const ss = String(total % 60).padStart(2, "0");
  if (h > 0) {
    return `${h}:${String(m).padStart(2, "0")}:${ss}`;
  }
  return `${m}:${ss}`;
}

/**
 * Read a response body as text for diagnostics. Deliberately best-effort:
 * any read failure (including an already-consumed body) yields "".
 *
 * @param {Response} res
 * @returns {Promise<string>}
 */
async function safeBody(res) {
  try {
    return await res.text();
  } catch {
    return "";
  }
}