Pluggable AI providers, relay credit system, picker UX overhaul

Captures roughly forty version bumps (v0.2.6 → v0.2.47) of work that
accumulated without commits.

- Pluggable provider system under server/providers/: gemini, anthropic,
  openai, openai-compatible, ollama, whisper-compatible, relay. Mix and
  match transcription + analysis per request via the picker UI.
- Relay backend integration. Hardcoded relay URL in server/relay-default.js
  (operator-controlled at build time, not user-configurable). New
  /api/relay/{status,policy} endpoints proxy to the relay; balance pings
  populate a cached credit display.
- Per-install identity in server/install-id.js for relay credit accounting.
  Sent to the relay as X-Recap-Install-Id; persists across upgrades, lost
  on a full uninstall + reinstall. Not surfaced in the UI.
- Admin login gate (server/admin-auth.js + setAdminPassword action). Scrypt
  password hash + HMAC-signed session cookie.
- Entitlement scheme rename: pro / max (each paired with subscriptions and
  relay_pro / relay_max), replacing the misleading "core" entitlement
  that conflicted with the user-facing "Core" tier name.
- Activation screen: dynamic credit count pulled from /api/relay/policy,
  "Skip — use free mode" button, accurate paid-feature list.
- Top toolbar: inline credit-balance pill (or "BYO configured" fallback),
  Upgrade + "I have a key" buttons.
- Picker UI: per-provider sections with Save/Test/Delete buttons, sections
  collapsible by chevron, default-collapsed unless currently selected,
  "Use comped credits (reset to relay)" link when the user has strayed,
  green hint under inputs whose values are server-configured.
- Activity log: chevron-collapsible groups per video, refresh-survival via
  localStorage + a 500-entry server-side buffer, explicit Clear button.
- YouTube captions fast-path with user toggle (skips audio download + AI
  transcription when captions are available — uncheck for speaker labels).
- Cancel button: AbortController plumbed through every provider SDK call;
  retryAPI short-circuits on AbortError; cancellation events surface in
  the activity log instead of silent retries.
- Long-video analysis: auto-coalesce transcript entries before building the
  analysis prompt so local-model context windows (32k-ish) don't overflow.
  Original entries preserved for transcript display via an index map; the
  analyzer sees a coarser view but click-to-seek timestamps stay precise.
- StartOS action grouping (Setup / AI Providers) so the actions list is
  navigable.
- Manifest description rewritten to reflect multi-provider support and
  free-tier relay credits.
- Smaller fixes: summarize-button enablement no longer requires a Gemini
  key when other providers are configured; analysis fallback chain handles
  context-length and 503 capacity errors; single-segment expansion for
  providers that don't return per-segment timestamps (Parakeet et al.);
  many other UX polish items.
This commit is contained in:
Keysat
2026-05-11 23:46:20 -05:00
parent 2544cf7dde
commit 373d10595b
79 changed files with 6322 additions and 397 deletions
+300
View File
@@ -0,0 +1,300 @@
// Admin login gate.
//
// Reads username + scrypt password hash + session secret out of
// /data/config/startos-config.json (set by the "Set Admin Password"
// StartOS action). When a hash is set, gates /api/* behind a signed
// HttpOnly cookie. The static frontend stays open so the login screen
// can paint, but every API endpoint except the gate's own + /api/health
// returns 401 admin_login_required until the cookie validates.
//
// Cookie format: <base64url(payload)>.<base64url(hmac)>
// payload: { u: username, iat: epoch_ms, fp: fingerprint(passwordHash) }
// hmac: HMAC-SHA256(payload, sessionSecret)
//
// Changing the password rotates fp, which invalidates all existing
// sessions on the next request. Changing/clearing the session secret
// has the same effect.
//
// ADMIN is exported as a `let` binding so importers see the live value
// after each config-poll refresh.
import fs from "fs/promises";
import path from "path";
import {
randomBytes,
scryptSync,
createHmac,
timingSafeEqual,
} from "crypto";
// Name of the session cookie written on login and read back on every
// gated request.
const COOKIE_NAME = "recap_admin_session";
// Session lifetime. Used both as the cookie's Max-Age and as the maximum
// accepted age of the `iat` claim when a session token is verified.
const COOKIE_MAX_AGE_MS = 30 * 24 * 60 * 60 * 1000; // 30 days
// Derived-key length (bytes) passed to scryptSync at login. The stored
// hex hash must decode to this many bytes or the comparison fails.
const SCRYPT_KEYLEN = 64;
// Endpoints reachable WITHOUT an admin session, even when the gate is
// enabled. The login flow itself + the bare-minimum status endpoints.
// Matched against req.path by exact Set membership.
const ADMIN_OPEN_PATHS = new Set([
"/api/admin/status",
"/api/admin/login",
"/api/admin/logout",
"/api/health",
]);
// ── Module state ────────────────────────────────────────────────────────────
// Live snapshot of the admin-auth config. Refreshed from
// /data/config/startos-config.json every CONFIG_POLL_MS. `enabled` is
// derived (true iff a hash is set).
// Replaced wholesale (never mutated in place) by refreshAdminConfig, so
// readers must access fields through the live `ADMIN` binding rather than
// caching the object.
export let ADMIN = {
enabled: false,
username: "",
passwordHash: "",
passwordSalt: "",
sessionSecret: "",
};
// Absolute path of startos-config.json; null until initAdminAuth() runs.
let startosConfigPath = null;
// ── Init ────────────────────────────────────────────────────────────────────
/**
 * Resolve the StartOS config path under `dataDir`, load the admin-auth
 * config once, then keep re-reading it on a timer.
 *
 * The poll interval comes from RECAP_CONFIG_POLL_MS (milliseconds). A
 * missing, non-numeric, zero or negative value falls back to 3000 ms:
 * Node coerces an invalid setInterval delay to 1 ms, which would turn
 * the poll into a tight disk-read loop.
 *
 * @param {{ dataDir: string }} options - directory containing `config/`.
 * @returns {Promise<void>} resolves after the initial config load.
 */
export async function initAdminAuth({ dataDir }) {
  const DEFAULT_POLL_MS = 3000;

  startosConfigPath = path.join(dataDir, "config", "startos-config.json");
  await refreshAdminConfig("startup");

  const rawPollMs = process.env.RECAP_CONFIG_POLL_MS;
  const parsedPollMs = Number.parseInt(rawPollMs || "", 10);
  const pollMs =
    Number.isFinite(parsedPollMs) && parsedPollMs > 0
      ? parsedPollMs
      : DEFAULT_POLL_MS;
  if (rawPollMs && pollMs !== parsedPollMs) {
    console.warn(
      `[admin-auth] ignoring invalid RECAP_CONFIG_POLL_MS=${JSON.stringify(rawPollMs)}; using ${DEFAULT_POLL_MS}ms`
    );
  }

  setInterval(() => {
    // Best effort: a failed poll must never take the server down, but it
    // should not vanish silently either.
    refreshAdminConfig("config poll").catch((err) => {
      console.warn(`[admin-auth] config poll failed: ${err?.message || err}`);
    });
  }, pollMs);
}
// Message of the most recent non-ENOENT refresh failure. Used only to
// avoid logging the same failure again on every poll tick.
let lastAdminRefreshError = null;

/**
 * Re-read the admin-auth fields from the StartOS config file and publish
 * them as the new `ADMIN` snapshot.
 *
 * - File missing (ENOENT): no config exists, so the gate is disabled.
 * - Any other failure (unreadable file, half-written or invalid JSON):
 *   the previous snapshot is kept. Dropping to "disabled" here would
 *   open every gated endpoint until the next good poll.
 *
 * @param {string} reason - label used in log lines ("startup",
 *   "config poll", ...). Routine poll refreshes that leave the gate
 *   disabled are not logged.
 * @returns {Promise<void>}
 */
async function refreshAdminConfig(reason) {
  let next;
  try {
    const content = await fs.readFile(startosConfigPath, "utf-8");
    const cfg = JSON.parse(content);
    next = {
      // The gate is on only when both halves of the credential exist.
      enabled: !!(cfg.recap_admin_password_hash && cfg.recap_admin_password_salt),
      username: cfg.recap_admin_username || "",
      passwordHash: cfg.recap_admin_password_hash || "",
      passwordSalt: cfg.recap_admin_password_salt || "",
      sessionSecret: cfg.recap_admin_session_secret || "",
    };
    lastAdminRefreshError = null;
  } catch (err) {
    if (err?.code !== "ENOENT") {
      // Fail closed: keep the last known-good config.
      const detail = String(err?.message || err);
      if (detail !== lastAdminRefreshError) {
        lastAdminRefreshError = detail;
        console.warn(
          `[admin-auth] refresh (${reason}) failed; keeping previous config: ${detail}`
        );
      }
      return;
    }
    lastAdminRefreshError = null;
    next = {
      enabled: false,
      username: "",
      passwordHash: "",
      passwordSalt: "",
      sessionSecret: "",
    };
  }

  const changed =
    next.enabled !== ADMIN.enabled ||
    next.username !== ADMIN.username ||
    next.passwordHash !== ADMIN.passwordHash ||
    next.sessionSecret !== ADMIN.sessionSecret;

  // Always publish the fresh snapshot (the salt may change on its own).
  ADMIN = next;

  if (changed && (reason !== "config poll" || ADMIN.enabled !== false)) {
    console.log(
      `[admin-auth] refresh (${reason}): enabled=${ADMIN.enabled} user=${ADMIN.username || "(unset)"}`
    );
  }
}
// ── Cookie helpers ──────────────────────────────────────────────────────────
/**
 * Encode a string or byte sequence as unpadded base64url
 * (`-` and `_` in place of `+` and `/`, no trailing `=`).
 *
 * @param {string|Buffer|Uint8Array} buf - data to encode; strings are UTF-8.
 * @returns {string} base64url text.
 */
function b64url(buf) {
  const bytes = Buffer.from(buf);
  return bytes.toString("base64url");
}
/**
 * Decode unpadded base64url text back into bytes.
 *
 * @param {string} s - base64url text, with or without `=` padding.
 * @returns {Buffer} decoded bytes.
 */
function b64urlDecode(s) {
  const remainder = s.length % 4;
  const padding = remainder === 0 ? "" : "=".repeat(4 - remainder);
  // Map the URL-safe alphabet back to standard base64 before decoding.
  const standard = (s + padding).replace(/-/g, "+").replace(/_/g, "/");
  return Buffer.from(standard, "base64");
}
/**
 * Short fingerprint of the stored password hash: its first 16 characters.
 * It is embedded in every session cookie, so a new password hash no
 * longer matches the cookies issued under the old one.
 *
 * @param {string} [hash] - stored password hash (hex); falsy yields "".
 * @returns {string} up to 16 leading characters of `hash`.
 */
function hashFingerprint(hash) {
  const source = hash || "";
  return source.slice(0, 16);
}
/**
 * Mint a session token of the form `<payload>.<signature>`, both parts
 * base64url-encoded.
 *
 * The payload carries the username (`u`), the issue time in epoch ms
 * (`iat`) and the password-hash fingerprint (`fp`). The signature is
 * HMAC-SHA256 over the *encoded* payload text, keyed with `secret`.
 *
 * @param {{ username: string, hash: string, secret: string }} params
 * @returns {string} signed token suitable for the session cookie value.
 */
function signSession({ username, hash, secret }) {
  const claims = {
    u: username,
    iat: Date.now(),
    fp: hashFingerprint(hash),
  };
  const encodedClaims = b64url(JSON.stringify(claims));

  const mac = createHmac("sha256", secret);
  mac.update(encodedClaims);

  return [encodedClaims, b64url(mac.digest())].join(".");
}
/**
 * Check a session token produced by signSession().
 *
 * A token is accepted only when all of these hold:
 *   1. it has the shape `<payload>.<signature>` with both parts non-empty;
 *   2. the signature equals HMAC-SHA256(payload, secret), compared in
 *      constant time;
 *   3. the payload's `u` equals `username` and its `fp` equals the
 *      fingerprint of the current password `hash`;
 *   4. `iat` is a number no more than COOKIE_MAX_AGE_MS in the past.
 *
 * @param {unknown} token - raw cookie value.
 * @param {{ username: string, hash: string, secret: string }} expected
 * @returns {boolean} true when the token is valid for the current config.
 */
function verifySession(token, { username, hash, secret }) {
  if (typeof token !== "string" || token === "") return false;

  const separator = token.indexOf(".");
  if (separator < 0) return false;

  const encodedClaims = token.slice(0, separator);
  const encodedMac = token.slice(separator + 1);
  if (encodedClaims === "" || encodedMac === "") return false;

  let expectedMac;
  let suppliedMac;
  try {
    expectedMac = createHmac("sha256", secret).update(encodedClaims).digest();
    suppliedMac = b64urlDecode(encodedMac);
  } catch {
    return false;
  }

  // timingSafeEqual throws on length mismatch, so check lengths first.
  const macMatches =
    suppliedMac.length === expectedMac.length &&
    timingSafeEqual(suppliedMac, expectedMac);
  if (!macMatches) return false;

  let claims;
  try {
    claims = JSON.parse(b64urlDecode(encodedClaims).toString("utf-8"));
  } catch {
    return false;
  }
  if (!claims) return false;

  if (claims.u !== username) return false;
  if (claims.fp !== hashFingerprint(hash)) return false;
  if (typeof claims.iat !== "number") return false;

  const expired = Date.now() - claims.iat > COOKIE_MAX_AGE_MS;
  return !expired;
}
/**
 * Parse a `Cookie` request header into a `{ name: value }` object.
 *
 * Values are percent-decoded. A value with a malformed escape sequence
 * (for example a cookie set by another app on the same host) is kept
 * verbatim instead of letting decodeURIComponent's URIError escape;
 * that error would otherwise fail every request that inspects cookies.
 * Parts without `=` or with an empty name are ignored. When a name
 * repeats, the last occurrence wins.
 *
 * @param {unknown} header - raw `Cookie` header value.
 * @returns {Record<string, string>} parsed cookies; `{}` for a missing or
 *   non-string header.
 */
function parseCookies(header) {
  const out = {};
  if (!header || typeof header !== "string") return out;

  for (const part of header.split(";")) {
    const eq = part.indexOf("=");
    if (eq < 0) continue;

    const name = part.slice(0, eq).trim();
    if (!name) continue;

    const rawValue = part.slice(eq + 1).trim();
    let value;
    try {
      value = decodeURIComponent(rawValue);
    } catch {
      // Malformed percent-encoding: fall back to the raw text.
      value = rawValue;
    }
    out[name] = value;
  }
  return out;
}
/**
 * Build the `Set-Cookie` header value for the admin session cookie.
 *
 * The cookie is always HttpOnly, SameSite=Lax and scoped to `/`.
 * `Secure` is added only when the request arrived over HTTPS (directly,
 * or per `X-Forwarded-Proto`), so plain-HTTP LAN access keeps working.
 *
 * @param {string} value - cookie value (empty string when clearing).
 * @param {{ req: object, maxAgeMs: number }} options - the incoming
 *   request and the lifetime in ms; a non-positive lifetime emits
 *   `Max-Age=0`, which expires the cookie.
 * @returns {string} header value.
 */
function buildSetCookie(value, { req, maxAgeMs }) {
  const maxAgeSeconds = maxAgeMs > 0 ? Math.floor(maxAgeMs / 1000) : 0;

  const forwardedProto = (req.headers["x-forwarded-proto"] || "")
    .toString()
    .toLowerCase();
  const overHttps = req.secure || forwardedProto.includes("https");

  const attributes = [
    `${COOKIE_NAME}=${value}`,
    "HttpOnly",
    "SameSite=Lax",
    "Path=/",
    `Max-Age=${maxAgeSeconds}`,
  ];
  if (overHttps) attributes.push("Secure");

  return attributes.join("; ");
}
/**
 * Decide whether a request carries a valid admin session.
 *
 * With the gate disabled every request counts as authenticated.
 * Otherwise the session cookie is verified against the live ADMIN
 * snapshot.
 *
 * @param {object} req - incoming request (reads `req.headers.cookie`).
 * @returns {boolean}
 */
function isSessionAuthed(req) {
  if (ADMIN.enabled) {
    const token = parseCookies(req.headers.cookie)[COOKIE_NAME];
    const { username, passwordHash: hash, sessionSecret: secret } = ADMIN;
    return verifySession(token, { username, hash, secret });
  }
  return true;
}
// ── Middleware ──────────────────────────────────────────────────────────────
// Register BEFORE setupLicenseMiddleware. When the admin gate is
// disabled, this is a no-op pass-through.
/**
 * Install the admin gate on `app`.
 *
 * While the gate is enabled, every `/api/*` request other than the
 * open paths must carry a valid session cookie or it gets
 * `401 admin_login_required`. Non-API paths (the static frontend) are
 * always let through.
 *
 * The path is lower-cased and stripped of trailing slashes before
 * matching. Express routing is case-insensitive and non-strict by
 * default, so `/API/history` reaches the same handler as `/api/history`;
 * a case-sensitive check here would let such requests skip the gate.
 *
 * @param {object} app - Express application (uses `app.use`).
 */
export function setupAdminAuthMiddleware(app) {
  app.use((req, res, next) => {
    if (!ADMIN.enabled) return next();

    const loweredPath = req.path.toLowerCase();
    if (!loweredPath.startsWith("/api/")) return next();

    const normalizedPath = loweredPath.replace(/\/+$/, "");
    if (ADMIN_OPEN_PATHS.has(normalizedPath)) return next();

    if (isSessionAuthed(req)) return next();

    return res.status(401).json({
      error: "admin_login_required",
      message: "Admin login required.",
    });
  });
}
// ── Routes ──────────────────────────────────────────────────────────────────
/**
 * Register the admin-gate endpoints on `app`:
 *
 *   GET  /api/admin/status - whether the gate is on, whether this request
 *                            is authenticated, and the admin username.
 *   POST /api/admin/login  - verify `{ username, password }` from the
 *                            request body and set the session cookie.
 *   POST /api/admin/logout - expire the session cookie.
 *
 * Login reads `req.body`, so a body parser must be installed upstream.
 * Credentials must be non-empty strings; anything else is a 400 client
 * error, so it never reaches scrypt and surfaces as a 500.
 *
 * @param {object} app - Express application.
 */
export function setupAdminAuthRoutes(app) {
  // Same response for a wrong username and a wrong password, so the
  // reply body does not reveal which one was wrong.
  const rejectCredentials = (res) =>
    res
      .status(401)
      .json({ error: "invalid_credentials", message: "Invalid username or password." });

  app.get("/api/admin/status", (req, res) => {
    res.json({
      enabled: ADMIN.enabled,
      authed: isSessionAuthed(req),
      username: ADMIN.enabled ? ADMIN.username : null,
    });
  });

  app.post("/api/admin/login", (req, res) => {
    if (!ADMIN.enabled) {
      // No password set; treat as success so the frontend doesn't get
      // stuck on the login screen if the admin clears the password.
      return res.json({ ok: true, enabled: false });
    }

    const { username, password } = req.body ?? {};
    const credentialsPresent =
      typeof username === "string" &&
      typeof password === "string" &&
      username !== "" &&
      password !== "";
    if (!credentialsPresent) {
      return res
        .status(400)
        .json({ error: "missing_credentials", message: "Username and password required." });
    }

    if (username !== ADMIN.username) return rejectCredentials(res);

    let computed;
    try {
      computed = scryptSync(password, ADMIN.passwordSalt, SCRYPT_KEYLEN);
    } catch {
      return res
        .status(500)
        .json({ error: "hash_failed", message: "Could not verify password." });
    }

    let stored;
    try {
      stored = Buffer.from(ADMIN.passwordHash, "hex");
    } catch {
      return res
        .status(500)
        .json({ error: "stored_hash_invalid", message: "Stored password hash is unreadable." });
    }

    // timingSafeEqual throws on unequal lengths, so compare lengths first.
    if (computed.length !== stored.length || !timingSafeEqual(computed, stored)) {
      return rejectCredentials(res);
    }

    const token = signSession({
      username: ADMIN.username,
      hash: ADMIN.passwordHash,
      secret: ADMIN.sessionSecret,
    });
    res.setHeader(
      "Set-Cookie",
      buildSetCookie(token, { req, maxAgeMs: COOKIE_MAX_AGE_MS })
    );
    res.json({ ok: true, enabled: true, username: ADMIN.username });
  });

  app.post("/api/admin/logout", (req, res) => {
    // Max-Age=0 tells the browser to drop the cookie.
    res.setHeader("Set-Cookie", buildSetCookie("", { req, maxAgeMs: 0 }));
    res.json({ ok: true });
  });
}
+44
View File
@@ -89,3 +89,47 @@ export function resolveApiKey(clientKey) {
/**
 * Accessor for the module-level `envPath` value (assigned elsewhere in
 * this file; not visible in this excerpt).
 */
export function getEnvPath() {
return envPath;
}
// Snapshot of the full StartOS config blob — keys for every provider
// (gemini, anthropic, openai, openai-compatible, ollama) plus the
// admin-auth fields. Each request reads it once and passes it into
// resolveProviderOpts() per provider. Returns {} if the file doesn't
// exist or is unreadable.
/**
 * Read and parse the StartOS config file.
 *
 * @returns {Promise<object>} the parsed config, or `{}` when the file is
 *   missing, unreadable, not valid JSON, or parses to a falsy value.
 */
export async function getConfigSnapshot() {
  let raw;
  try {
    raw = await fs.readFile(startosConfigPath, "utf-8");
  } catch {
    return {};
  }

  try {
    const parsed = JSON.parse(raw);
    return parsed ? parsed : {};
  } catch {
    return {};
  }
}
// Patch the StartOS config file in place. Reads current, merges in the
// given fields, writes atomically (tmp + rename). Used by the picker
// UI's Delete button to clear server-side credentials for a provider.
// The next config poll picks up the changes within CONFIG_POLL_MS;
// resolveProviderOpts already reads getConfigSnapshot per-request, so
// effectively the change is immediate.
//
// `patch` is a plain object of { config_field: value } pairs.
// Pass empty strings to clear a field rather than deleting the key —
// the StartOS schema declares every field with a default of '', so
// empty string is the canonical "unset" representation.
/**
 * Shallow-merge `patch` into the StartOS config file and write the result
 * atomically (temp file + rename).
 *
 * Only a missing file (ENOENT) is treated as "start from empty". If the
 * file exists but cannot be read or parsed, the call throws rather than
 * writing: merging onto `{}` would silently discard every other stored
 * field. The temp file name is unique per call, so overlapping calls
 * cannot write into each other's temp file.
 *
 * @param {object} patch - `{ config_field: value }` pairs. Non-objects
 *   and arrays are ignored. Pass "" to clear a field.
 * @returns {Promise<void>}
 * @throws {Error} when the existing config is unreadable or not valid
 *   JSON, or when writing fails.
 */
export async function mergeConfig(patch) {
  if (!patch || typeof patch !== "object" || Array.isArray(patch)) return;

  let current = {};
  try {
    const content = await fs.readFile(startosConfigPath, "utf-8");
    const parsed = JSON.parse(content);
    if (parsed && typeof parsed === "object" && !Array.isArray(parsed)) {
      current = parsed;
    }
  } catch (err) {
    if (err?.code !== "ENOENT") {
      throw new Error(
        `mergeConfig: refusing to overwrite unreadable config at ${startosConfigPath}`,
        { cause: err }
      );
    }
  }

  const merged = { ...current, ...patch };
  const tmp = `${startosConfigPath}.${process.pid}.${Date.now()}.tmp`;
  await fs.mkdir(path.dirname(startosConfigPath), { recursive: true });
  try {
    await fs.writeFile(tmp, JSON.stringify(merged, null, 2), { mode: 0o600 });
    await fs.rename(tmp, startosConfigPath);
  } catch (err) {
    // Don't leave a stray temp file (it may hold credentials) behind.
    await fs.unlink(tmp).catch(() => {});
    throw err;
  }

  // Re-run the gemini-key refresher so serverApiKey reflects the
  // patch immediately (otherwise it'd lag until the poll tick).
  if (Object.prototype.hasOwnProperty.call(patch, "gemini_api_key")) {
    await refreshServerApiKey("merge config");
  }
}
+967 -167
View File
File diff suppressed because it is too large Load Diff
+75
View File
@@ -0,0 +1,75 @@
// Persistent per-install identifier. Generated once on first boot and
// stashed at `<DATA_DIR>/install-id` (typically /data/install-id on
// StartOS). Survives container restarts and Recap upgrades; lost on a
// full uninstall + reinstall.
//
// What it's for: the upcoming relay backend will use this ID as the
// owner of comped/paid relay credits. Without a stable client identity
// the relay can't tell whether a request belongs to a credited install
// or a fresh one. Direct install-ID auth is the v1 choice (see the
// project roadmap discussion) — simple, sufficient for low-count free
// credits, can be hardened later with license-server-minted JWTs.
//
// What it is NOT: a license key. The license system (./license.js) is
// completely separate — license keys are user-facing strings that
// authorize Pro features, while install-IDs are opaque per-install
// UUIDs the relay backend uses for credit accounting.
import fs from "fs/promises";
import path from "path";
import { randomUUID } from "crypto";
let cachedId = null;
// Initialize on boot. Reads the existing ID off disk; if there's no
// file, generates a fresh UUIDv4 and writes it. Subsequent calls to
// getInstallId() return the cached value without touching disk.
//
// `dataDir` must be writable — on StartOS that's /data (the persistent
// volume), on local dev it's the project root.
/**
 * Load the persistent install ID from `<dataDir>/install-id`, or mint
 * and store a new one when the file is missing or holds an invalid value.
 * The result is cached for getInstallId().
 *
 * @param {{ dataDir: string }} options - writable data directory; it is
 *   created if it does not exist yet.
 * @returns {Promise<string>} the install ID now in effect.
 * @throws {Error} when `dataDir` is missing, or when the new ID cannot be
 *   written to disk.
 */
export async function initInstallId({ dataDir }) {
  if (!dataDir) throw new Error("initInstallId: dataDir is required");
  const filePath = path.join(dataDir, "install-id");

  try {
    const raw = await fs.readFile(filePath, "utf8");
    const trimmed = raw.trim();
    if (isValidInstallId(trimmed)) {
      cachedId = trimmed;
      console.log(`[install-id] loaded ${redact(cachedId)} from ${filePath}`);
      return cachedId;
    }
    console.warn(
      `[install-id] file at ${filePath} contained an invalid value — regenerating`
    );
  } catch (err) {
    // A missing file is the normal first-boot case; anything else is
    // worth a warning before falling through to regeneration.
    if (err.code !== "ENOENT") {
      console.warn(`[install-id] read failed (${err.code}): ${err.message}`);
    }
  }

  // No valid file — mint a new one. UUIDv4 is plenty: 122 bits of
  // randomness, no collision risk across realistic install counts, and
  // it's opaque enough to share over the wire without leaking system
  // info (unlike e.g. a machine-id).
  const fresh = randomUUID();
  await fs.mkdir(dataDir, { recursive: true });
  await fs.writeFile(filePath, `${fresh}\n`, { mode: 0o600 });
  cachedId = fresh;
  console.log(`[install-id] generated ${redact(cachedId)} at ${filePath}`);
  return cachedId;
}
/**
 * Return the cached install ID without touching disk. The value is null
 * until initInstallId() has stored one.
 */
export function getInstallId() {
return cachedId;
}
// Loose UUID shape check — accepts any reasonable UUID-ish string.
// Avoids requiring v4 specifically in case operators want to seed
// non-standard IDs.
/**
 * Shape check for a stored install ID: a string of 32 to 40 characters
 * drawn from hex digits and `-`, case-insensitive.
 *
 * @param {unknown} s - candidate value.
 * @returns {boolean}
 */
function isValidInstallId(s) {
  if (typeof s !== "string") return false;
  return /^[0-9a-f-]{32,40}$/i.test(s);
}
// Log-safe display: first 8 + last 4 chars only.
/**
 * Log-safe rendering of an install ID: the first 8 and last 4 characters
 * joined by an ellipsis, so the output is visibly truncated.
 *
 * IDs of 12 characters or fewer return "(short)": showing 8 + 4
 * characters of such an ID would print it in full.
 *
 * @param {string} [id] - install ID.
 * @returns {string} redacted form, or "(short)".
 */
function redact(id) {
  const VISIBLE_HEAD = 8;
  const VISIBLE_TAIL = 4;
  if (!id || id.length <= VISIBLE_HEAD + VISIBLE_TAIL) return "(short)";
  return `${id.slice(0, VISIBLE_HEAD)}…${id.slice(-VISIBLE_TAIL)}`;
}
+133 -27
View File
@@ -20,9 +20,16 @@ console.log(
// Free-tier concurrency lock. Unlicensed users may process one video at
// a time — second submission while another is in flight returns 409 from
// /api/process. The /api/process handler calls tryAcquireFreeSlot() at
// entry and releaseFreeSlot() in its finally block.
let freeJobInFlight = false;
// /api/process with details about what's running. The /api/process
// handler calls tryAcquireFreeSlot() at entry and releaseFreeSlot() in
// its finally block.
//
// The current-job object also drives:
// - /api/process/current — UI status banner after a browser refresh
// - /api/process/cancel — sets `aborted: true` AND fires the request's
// AbortController so in-flight model API calls are interrupted
// immediately (not just at the next pipeline checkpoint).
let currentFreeJob = null; // { url, title, startedAt, aborted, abortController } | null
// ── Online validation tunables ──────────────────────────────────────────────
// 30 min default scheduled cycle catches revocations / suspensions /
@@ -103,22 +110,101 @@ export function startLicenseRefresh() {
}
// ── Free-tier slot management ───────────────────────────────────────────────
// Whether the current LIC counts as a "free" (unlicensed / no core) user.
// Whether the current LIC counts as a paid user — i.e. holds either the
// `pro` or `max` entitlement. Keysat policy cards mint Pro licenses with
// `pro` and Max licenses with `max`; both unlock the same Recap-side
// gates today (subscriptions, no free-tier concurrency lock), with the
// relay layer responsible for the Pro-vs-Max quota split.
/**
 * True when the current license is in the "licensed" state and carries
 * the `pro` or the `max` entitlement.
 *
 * @returns {boolean}
 */
export function isPaidUser() {
  const licensed = LIC.state === "licensed";
  return licensed && ["pro", "max"].some((name) => LIC.entitlements.has(name));
}
// Inverse of isPaidUser — kept as a separate export because that's how
// most callers phrase the check ("if free, apply rate limits / show
// upgrade banner / etc.").
export function isFreeUser() {
return !(LIC.state === "licensed" && LIC.entitlements.has("core"));
return !isPaidUser();
}
// Returns true if the slot was acquired, false if another free job is in
// flight. The /api/process handler must release via releaseFreeSlot()
// in a finally block on every exit path.
export function tryAcquireFreeSlot() {
if (freeJobInFlight) return false;
freeJobInFlight = true;
//
// `abortController` is the request's AbortController — abortCurrentFreeJob
// calls .abort() on it so in-flight provider SDK calls are interrupted at
// the network layer, not just at the next pipeline checkpoint.
//
// `logs` is a server-side buffer the pipeline appends to (via
// appendCurrentJobLog) as each progress message is sent over SSE. After
// a browser refresh the client re-fetches /api/process/current and uses
// these to repopulate the activity log — without it, a refresh during
// a long pipeline silently drops everything the user has already seen.
export function tryAcquireFreeSlot({ url = "", title = "", abortController = null } = {}) {
if (currentFreeJob) return false;
currentFreeJob = {
url,
title,
startedAt: Date.now(),
aborted: false,
abortController,
logs: [],
};
return true;
}
// Push one entry onto the in-flight job's log buffer. No-op if there's
// no current job (e.g. licensed user — no free-tier tracking). Kept
// bounded so a multi-hour run doesn't grow the buffer without limit.
// Upper bound on log entries retained for the in-flight job.
const MAX_LIVE_LOG_ENTRIES = 500;

/**
 * Append `entry` to the in-flight job's log buffer, dropping the oldest
 * entries once the buffer exceeds MAX_LIVE_LOG_ENTRIES. Does nothing
 * when there is no current job or `entry` is falsy.
 *
 * @param {*} entry - log record to retain.
 */
export function appendCurrentJobLog(entry) {
  if (!currentFreeJob || !entry) return;

  const { logs } = currentFreeJob;
  logs.push(entry);

  const overflow = logs.length - MAX_LIVE_LOG_ENTRIES;
  if (overflow > 0) logs.splice(0, overflow);
}
export function releaseFreeSlot() {
freeJobInFlight = false;
currentFreeJob = null;
}
// Returns a JSON-friendly snapshot of the in-flight free job, or null.
// `includeLogs` is opt-in because the typical poll (banner refresh) only
// cares about the small header fields — logs are only needed when the
// client is rehydrating after a browser refresh.
/**
 * Snapshot of the in-flight free job as a plain, JSON-friendly object.
 *
 * @param {{ includeLogs?: boolean }} [options] - when `includeLogs` is
 *   true, a copy of the job's log buffer is attached as `logs`.
 * @returns {{ url: string, title: string, startedAt: number,
 *   elapsedMs: number, aborted: boolean, logs?: Array }|null} the
 *   snapshot, or null when no job is running.
 */
export function getCurrentFreeJob({ includeLogs = false } = {}) {
  const job = currentFreeJob;
  if (!job) return null;

  const { url, title, startedAt, aborted } = job;
  const snapshot = {
    url,
    title,
    startedAt,
    elapsedMs: Date.now() - startedAt,
    aborted,
  };
  // Copy the buffer so callers can't mutate the live job's logs.
  return includeLogs ? { ...snapshot, logs: [...job.logs] } : snapshot;
}
// Mark the current job as cancelled AND fire its AbortController so any
// in-flight provider SDK call rejects immediately. Pipeline code also
// polls isFreeJobAborted() at major checkpoints — that handles the gaps
// between awaitable calls (e.g. while looping over yt-dlp retry delays).
// The handler's finally block runs releaseFreeSlot(), so we don't clear
// currentFreeJob here — that avoids a race where a follow-up /api/process
// request acquires the slot while the cancelled call is still cleaning up.
// Returns true if there was a job to cancel.
/**
 * Flag the in-flight job as aborted and trigger its AbortController, if
 * it has one. The job slot itself is left occupied.
 *
 * @returns {boolean} true when there was a job to cancel, else false.
 */
export function abortCurrentFreeJob() {
  const job = currentFreeJob;
  if (!job) return false;

  job.aborted = true;
  try {
    if (job.abortController != null) job.abortController.abort();
  } catch {
    // Best effort: the `aborted` flag above still records the cancel.
  }
  return true;
}
/**
 * True when a job is in flight and it has been flagged as aborted.
 *
 * @returns {boolean}
 */
export function isFreeJobAborted() {
  return Boolean(currentFreeJob?.aborted);
}
// ── Endpoints reachable without a license ───────────────────────────────────
@@ -132,34 +218,53 @@ const LICENSE_OPEN_PATHS = new Set([
"/api/license-status",
"/api/license/activate",
"/api/license/deactivate",
"/api/process",
// Install identity — needed by the relay client before any license
// exists, and by the UI's settings panel for verification.
"/api/install-id",
// Relay balance display — the UI needs to render credit counts even
// for unlicensed (Core) users since they get free lifetime credits.
"/api/relay/status",
// Tier-policy lookup powers dynamic copy on the activation screen
// (e.g. "N relay credits" pulled live from the relay). Unlicensed
// users see the activation screen, so this must be open to them.
"/api/relay/policy",
]);
// Prefix-based open list: any /api/* path that startsWith one of these
// is reachable without a license. Library + saved summaries are part of
// the free experience (the app would feel broken without them — you'd
// summarize a video and never be able to find it again). Subscriptions,
// clips, and the relay remain paid. /api/providers/* is open so any
// user (including unlicensed) can test connectivity to their LLM
// providers before deciding whether to buy. /api/process is a prefix
// (not an exact-match in LICENSE_OPEN_PATHS) because /api/process,
// /api/process/current, and /api/process/cancel all need to be reachable
// for the free-tier flow — without /current the in-flight banner can't
// clear after the pipeline finishes, and without /cancel the Cancel
// button silently fails for unlicensed users.
// NOTE(review): the license middleware matches these with a bare
// req.path.startsWith(prefix), with no path-segment boundary. A prefix
// such as "/api/process" therefore also opens any sibling route whose
// name merely begins with it (e.g. a hypothetical "/api/processing").
// Confirm no paid route shares a prefix with an entry here.
const LICENSE_OPEN_PREFIXES = [
"/api/history",
"/api/library",
"/api/providers",
"/api/process",
];
// ── Pro-tier feature gates ──────────────────────────────────────────────────
// Each entry maps URL prefixes → required entitlement; first match wins.
// A licensed user without the right entitlement gets a clean 402
// feature_not_in_tier (vs. the generic activation gate above).
//
// History + library used to be gated here. They moved to the free tier
// (see LICENSE_OPEN_PREFIXES above) — without saved summaries the app
// feels broken on first use, and the real paid value is auto-queue +
// relay credits.
const PRO_FEATURE_GATES = [
{
prefixes: ["/api/subscriptions", "/api/auto-queue", "/api/sub-check-log"],
entitlement: "subscriptions",
feature: "subscriptions",
message:
"Channel subscriptions and auto-queue require a Pro license. Upgrade to unlock.",
},
{
prefixes: ["/api/history"],
entitlement: "history",
feature: "history",
message:
"Summary history requires a Pro license. Upgrade to unlock.",
},
{
prefixes: ["/api/library"],
entitlement: "library",
feature: "library",
message:
"Library import/export requires a Pro license. Upgrade to unlock.",
"Channel subscriptions and auto-queue require a paid license. Upgrade to unlock.",
},
];
@@ -174,12 +279,13 @@ export function setupLicenseMiddleware(app) {
app.use((req, res, next) => {
if (!req.path.startsWith("/api/")) return next();
if (LICENSE_OPEN_PATHS.has(req.path)) return next();
if (LIC.state === "licensed" && LIC.entitlements.has("core")) return next();
if (LICENSE_OPEN_PREFIXES.some((p) => req.path.startsWith(p))) return next();
if (isPaidUser()) return next();
return res.status(402).json({
error: "license_required",
message:
LIC.state === "licensed"
? "Your license is missing the 'core' entitlement. Contact the seller."
? "Your license is missing the 'pro' or 'max' entitlement. Contact the seller."
: "This feature requires a Recap license. Upgrade to unlock.",
state: LIC.state,
reason: LIC.reason,
+28 -10
View File
@@ -10,17 +10,26 @@
// PRODUCT_SLUG → must match the product slug created in Keysat
// KEYSAT_BASE_URL → optional, only used by online validate() / purchase
//
// Tier model for this app (see KEYSAT_INTEGRATION.md §0):
// "core" required for any business endpoint; unlocks
// summarization and BYO Gemini API key
// "history" — saved summary library: /api/history*
// "library" — bulk import/export: /api/library/*
// "subscriptions" — Pro: channel subs, auto-queue, sub-check log
// "clips" — Pro: paperclip / clip-collection panel
// Tier model for this app:
//   Core / Free — no license. Library + history + lifetime relay
// credits (count set by recap-relay's Adjust Tier
// Quotas action; default 10). BYO API keys or
// self-hosted model URL gives unlimited use.
// Pro — license with entitlements:
// "pro" flags the license as paid; unlocks the
// activation gate
// "subscriptions" — channel + podcast subs, auto-queue, sub-check log
// "relay_pro" — recap-relay reads this and applies the Pro
// monthly cap (defaults: 50/mo, 25 Gemini-served)
// Max — license with entitlements:
// "max" — same gate as "pro" (server treats either as paid)
// "subscriptions" — same feature set as Pro
// "relay_max" — recap-relay applies the Max monthly cap (default
// unlimited, with 50/month Gemini sub-cap)
//
// Tier policies:
// Core → ["core", "history", "library"]
// Pro → ["core", "history", "library", "subscriptions", "clips"]
// Older entitlements ("core", "library", "history", "clips") were used
// in pre-1.0 builds. They are unused by the current server; harmless to
// ship in legacy keys.
import fs from "fs";
import path from "path";
@@ -83,6 +92,15 @@ function getOnlineClient() {
// 1. RECAP_LICENSE_KEY env var (overrides everything; useful for tests)
// 2. license.txt at LICENSE_PATH (web-UI activation writes here)
// 3. recap_license_key in startos-config.json ("Set Recap License" action)
// Read-only accessor for the raw license key (LIC1-...). Returns null
// when no license is configured. Used by the relay provider when
// attaching the Authorization header so the relay can do its cached
// online check against keysat. Don't add this to publicView — it's
// server-side only and should never reach the browser.
/**
 * Server-side accessor for the raw license string. Returns whatever
 * readLicenseString() resolves.
 */
export function getRawLicenseKey() {
return readLicenseString();
}
function readLicenseString() {
const fromEnv = (process.env.RECAP_LICENSE_KEY || "").trim();
if (fromEnv) return fromEnv;
+96 -2
View File
@@ -8,15 +8,17 @@
"name": "youtube-summarizer-server",
"version": "1.0.0",
"dependencies": {
"@anthropic-ai/sdk": "^0.95.0",
"@google/genai": "^1.41.0",
"@keysat/licensing-client": "file:../vendor/keysat-licensing-client",
"cors": "^2.8.5",
"express": "^4.21.0"
"express": "^4.21.0",
"openai": "^6.37.0"
}
},
"../vendor/keysat-licensing-client": {
"name": "@keysat/licensing-client",
"version": "0.1.0",
"version": "0.2.0",
"license": "MIT",
"dependencies": {
"@noble/ed25519": "^2.0.0",
@@ -27,6 +29,36 @@
"node": ">=18"
}
},
"node_modules/@anthropic-ai/sdk": {
"version": "0.95.1",
"resolved": "https://registry.npmjs.org/@anthropic-ai/sdk/-/sdk-0.95.1.tgz",
"integrity": "sha512-OO9AF7hmAoU492c/mD7Q2cPqI2WNAj7rAPHlawgBeUgpwiboLRiDs+grsErGWeHHP9ZRWfzq2OVrODTt8aITVg==",
"license": "MIT",
"dependencies": {
"json-schema-to-ts": "^3.1.1",
"standardwebhooks": "^1.0.0"
},
"bin": {
"anthropic-ai-sdk": "bin/cli"
},
"peerDependencies": {
"zod": "^3.25.0 || ^4.0.0"
},
"peerDependenciesMeta": {
"zod": {
"optional": true
}
}
},
"node_modules/@babel/runtime": {
"version": "7.29.2",
"resolved": "https://registry.npmjs.org/@babel/runtime/-/runtime-7.29.2.tgz",
"integrity": "sha512-JiDShH45zKHWyGe4ZNVRrCjBz8Nh9TMmZG1kh4QTK8hCBTWBi8Da+i7s1fJw7/lYpM4ccepSNfqzZ/QvABBi5g==",
"license": "MIT",
"engines": {
"node": ">=6.9.0"
}
},
"node_modules/@google/genai": {
"version": "1.52.0",
"resolved": "https://registry.npmjs.org/@google/genai/-/genai-1.52.0.tgz",
@@ -119,6 +151,12 @@
"integrity": "sha512-oOAWABowe8EAbMyWKM0tYDKi8Yaox52D+HWZhAIJqQXbqe0xI/GV7FhLWqlEKreMkfDjshR5FKgi3mnle0h6Eg==",
"license": "BSD-3-Clause"
},
"node_modules/@stablelib/base64": {
"version": "1.0.1",
"resolved": "https://registry.npmjs.org/@stablelib/base64/-/base64-1.0.1.tgz",
"integrity": "sha512-1bnPQqSxSuc3Ii6MhBysoWCg58j97aUjuCSZrGSmDxNqtytIi0k8utUenAwTZN4V5mXXYGsVUI9zeBqy+jBOSQ==",
"license": "MIT"
},
"node_modules/@types/node": {
"version": "25.6.2",
"resolved": "https://registry.npmjs.org/@types/node/-/node-25.6.2.tgz",
@@ -499,6 +537,12 @@
"integrity": "sha512-fjquC59cD7CyW6urNXK0FBufkZcoiGG80wTuPujX590cB5Ttln20E2UB4S/WARVqhXffZl2LNgS+gQdPIIim/g==",
"license": "MIT"
},
"node_modules/fast-sha256": {
"version": "1.3.0",
"resolved": "https://registry.npmjs.org/fast-sha256/-/fast-sha256-1.3.0.tgz",
"integrity": "sha512-n11RGP/lrWEFI/bWdygLxhI+pVeo1ZYIVwvvPkW7azl/rOy+F3HYRZ2K5zeE9mmkhQppyv9sQFx0JM9UabnpPQ==",
"license": "Unlicense"
},
"node_modules/fetch-blob": {
"version": "3.2.0",
"resolved": "https://registry.npmjs.org/fetch-blob/-/fetch-blob-3.2.0.tgz",
@@ -798,6 +842,19 @@
"bignumber.js": "^9.0.0"
}
},
"node_modules/json-schema-to-ts": {
"version": "3.1.1",
"resolved": "https://registry.npmjs.org/json-schema-to-ts/-/json-schema-to-ts-3.1.1.tgz",
"integrity": "sha512-+DWg8jCJG2TEnpy7kOm/7/AxaYoaRbjVB4LFZLySZlWn8exGs3A4OLJR966cVvU26N7X9TWxl+Jsw7dzAqKT6g==",
"license": "MIT",
"dependencies": {
"@babel/runtime": "^7.18.3",
"ts-algebra": "^2.0.0"
},
"engines": {
"node": ">=16"
}
},
"node_modules/jwa": {
"version": "2.0.1",
"resolved": "https://registry.npmjs.org/jwa/-/jwa-2.0.1.tgz",
@@ -980,6 +1037,27 @@
"node": ">= 0.8"
}
},
"node_modules/openai": {
"version": "6.37.0",
"resolved": "https://registry.npmjs.org/openai/-/openai-6.37.0.tgz",
"integrity": "sha512-0H5dEGFmmLv6KSd0W1w2nyL8WsLkX6yoLeQpU+dZAOuGcany5qkYQMmj35ZrKgb6yiyYqpUzFOpR8mZQkgqeEQ==",
"license": "Apache-2.0",
"bin": {
"openai": "bin/cli"
},
"peerDependencies": {
"ws": "^8.18.0",
"zod": "^3.25 || ^4.0"
},
"peerDependenciesMeta": {
"ws": {
"optional": true
},
"zod": {
"optional": true
}
}
},
"node_modules/p-retry": {
"version": "4.6.2",
"resolved": "https://registry.npmjs.org/p-retry/-/p-retry-4.6.2.tgz",
@@ -1242,6 +1320,16 @@
"url": "https://github.com/sponsors/ljharb"
}
},
"node_modules/standardwebhooks": {
"version": "1.0.0",
"resolved": "https://registry.npmjs.org/standardwebhooks/-/standardwebhooks-1.0.0.tgz",
"integrity": "sha512-BbHGOQK9olHPMvQNHWul6MYlrRTAOKn03rOe4A8O3CLWhNf4YHBqq2HJKKC+sfqpxiBY52pNeesD6jIiLDz8jg==",
"license": "MIT",
"dependencies": {
"@stablelib/base64": "^1.0.0",
"fast-sha256": "^1.3.0"
}
},
"node_modules/statuses": {
"version": "2.0.2",
"resolved": "https://registry.npmjs.org/statuses/-/statuses-2.0.2.tgz",
@@ -1260,6 +1348,12 @@
"node": ">=0.6"
}
},
"node_modules/ts-algebra": {
"version": "2.0.0",
"resolved": "https://registry.npmjs.org/ts-algebra/-/ts-algebra-2.0.0.tgz",
"integrity": "sha512-FPAhNPFMrkwz76P7cdjdmiShwMynZYN6SgOujD1urY4oNm80Ou9oMdmbR45LotcKOXoy7wSmHkRFE6Mxbrhefw==",
"license": "MIT"
},
"node_modules/type-is": {
"version": "1.6.18",
"resolved": "https://registry.npmjs.org/type-is/-/type-is-1.6.18.tgz",
+3 -1
View File
@@ -8,9 +8,11 @@
"test": "node --test --test-reporter=spec 'test/**/*.test.js'"
},
"dependencies": {
"@anthropic-ai/sdk": "^0.95.0",
"@google/genai": "^1.41.0",
"@keysat/licensing-client": "file:../vendor/keysat-licensing-client",
"cors": "^2.8.5",
"express": "^4.21.0"
"express": "^4.21.0",
"openai": "^6.37.0"
}
}
+116
View File
@@ -0,0 +1,116 @@
// Anthropic (Claude) provider — analysis only.
//
// Claude does not natively transcribe audio, so transcribeAudio() throws.
// Mix-and-match users can pair this provider for analysis with Gemini
// (or future OpenAI Whisper) for transcription.
//
// Pricing reflects standard-context rates as of 2026-04-29 (cached in
// the claude-api skill). Update when Anthropic changes published rates.
import Anthropic from "@anthropic-ai/sdk";
import { retryAPI } from "../util.js";
import { formatCost, ratesFor } from "./cost.js";
// Per-1M-token rates in USD, keyed by model id. Anthropic does not expose
// a separate "thinking" rate — thinking tokens are billed as output, so we
// let formatCost default thinking → output by omitting the thinking field.
export const ANTHROPIC_PRICING = {
"claude-opus-4-7": { input: 5.00, output: 25.00 },
"claude-opus-4-6": { input: 5.00, output: 25.00 },
"claude-sonnet-4-6": { input: 3.00, output: 15.00 },
"claude-haiku-4-5": { input: 1.00, output: 5.00 },
// Fallback for unknown / future models. ratesFor() reads this row when
// the requested model has no entry of its own.
"default": { input: 3.00, output: 15.00 },
};
// Analysis model list. Order = default fallback chain (most capable first).
// Every id listed here has a matching row in ANTHROPIC_PRICING.
export const ANTHROPIC_ANALYSIS_MODELS = [
"claude-opus-4-7",
"claude-opus-4-6",
"claude-sonnet-4-6",
"claude-haiku-4-5",
];
// Analysis output cap, sent as `max_tokens` on the messages.create call.
// Generous — the topic-analysis prompt produces a JSON document scaled to
// transcript length, and truncation here loses trailing sections.
const ANALYSIS_MAX_TOKENS = 16000;
/**
 * Create an Anthropic (Claude) provider bound to a single API key.
 *
 * The provider is analysis-only: `transcribeAudio()` always throws.
 *
 * @param {object} [options]
 * @param {string} options.apiKey - Anthropic API key (required).
 * @param {number} [options.timeoutMs=900000] - Handed to the SDK client as
 *   its `timeout` option.
 * @returns {object} Provider with `name`, `capabilities`, the two model
 *   listings, `transcribeAudio` and `analyzeText`.
 * @throws {Error} When `apiKey` is missing or empty.
 */
export function createAnthropicProvider({ apiKey, timeoutMs = 900_000 } = {}) {
  if (!apiKey) {
    throw new Error("createAnthropicProvider: apiKey is required");
  }

  const client = new Anthropic({ apiKey, timeout: timeoutMs });

  return {
    name: "anthropic",

    capabilities: { transcribe: false, analyze: true, listModels: true },

    // Fresh copy each call so callers cannot mutate the shared list.
    listAnalysisModels() {
      return ANTHROPIC_ANALYSIS_MODELS.slice();
    },

    listTranscriptionModels() {
      return [];
    },

    async transcribeAudio() {
      throw new Error(
        "Anthropic models do not natively transcribe audio. Use Gemini or OpenAI (Whisper) for the transcription step."
      );
    },

    /**
     * Send one user prompt to the Messages API and normalize the reply.
     *
     * @returns {Promise<object>} `{ text, usage, cost, finishReason, raw }`
     *   where `text` is the concatenation of every "text" content block.
     */
    async analyzeText({
      prompt,
      model,
      onProgress = () => {},
      retries = 2,
      signal,
    }) {
      // The Anthropic SDK takes per-call options as the second argument;
      // passing the signal there lets abort() reject the in-flight request.
      const requestOptions = signal ? { signal } : undefined;

      const sendRequest = () =>
        client.messages.create(
          {
            model,
            max_tokens: ANALYSIS_MAX_TOKENS,
            messages: [{ role: "user", content: prompt }],
          },
          requestOptions
        );

      const response = await retryAPI(sendRequest, {
        retries,
        delayMs: 5000,
        label: "Anthropic analysis",
        log: (msg) => onProgress(msg),
      });

      // Keep only the text blocks; other block types carry no transcript text.
      const text = (response.content || [])
        .reduce((parts, block) => {
          if (block.type === "text") parts.push(block.text);
          return parts;
        }, [])
        .join("");

      const usage = {
        inputTokens: response.usage?.input_tokens || 0,
        outputTokens: response.usage?.output_tokens || 0,
        thinkingTokens: 0,
      };

      return {
        text,
        usage,
        cost: formatCost(ratesFor(ANTHROPIC_PRICING, model), usage),
        finishReason: response.stop_reason || null,
        raw: response,
      };
    },
  };
}
+66
View File
@@ -0,0 +1,66 @@
// Shared cost-calculation helper for the provider abstraction.
//
// Each provider knows two things:
// 1. Its pricing table (per-1M-token rates per model).
// 2. How to map its native usage shape into the normalized
// { inputTokens, outputTokens, thinkingTokens, totalTokens } shape.
//
// This module then turns (rates, normalized usage) → the cost record
// the rest of the app already understands. Same shape gemini-helpers
// `calcCost` produces, so dashboards / logs don't care which provider
// was used.
/**
 * Price a normalized usage record against one model's rates and return the
 * shared cost record.
 *
 * @param {{input: number, output: number, thinking?: number}} rates
 *   USD per 1M tokens. When `thinking` is null or undefined, thinking
 *   tokens are priced at the `output` rate.
 * @param {{inputTokens?: number, outputTokens?: number,
 *   thinkingTokens?: number, totalTokens?: number}} usage
 *   Token counts; missing or falsy counts are treated as 0.
 * @returns {object} The token counts, each cost as a six-decimal string,
 *   and a `totalCostDisplay` string.
 */
export function formatCost(rates, usage) {
  // USD for `tokens` billed at `ratePerMillion` dollars per 1M tokens.
  const toUsd = (tokens, ratePerMillion) => (tokens / 1_000_000) * ratePerMillion;
  const sixDecimals = (amount) => amount.toFixed(6);

  const inputTokens = usage.inputTokens || 0;
  const outputTokens = usage.outputTokens || 0;
  const thinkingTokens = usage.thinkingTokens || 0;

  const inputCost = toUsd(inputTokens, rates.input);
  const outputCost = toUsd(outputTokens, rates.output);
  const thinkingCost = toUsd(thinkingTokens, rates.thinking ?? rates.output);
  const totalCost = inputCost + outputCost + thinkingCost;

  // Totals under one cent are shown scaled by 100 with three decimals;
  // everything else is shown in dollars with four decimals.
  let totalCostDisplay;
  if (totalCost < 0.01) {
    totalCostDisplay = `$${(totalCost * 100).toFixed(3)}¢`;
  } else {
    totalCostDisplay = `$${totalCost.toFixed(4)}`;
  }

  return {
    inputTokens,
    outputTokens,
    thinkingTokens,
    // Prefer the provider-reported total; otherwise sum the three counts.
    totalTokens: usage.totalTokens || (inputTokens + outputTokens + thinkingTokens),
    inputCost: sixDecimals(inputCost),
    outputCost: sixDecimals(outputCost),
    thinkingCost: sixDecimals(thinkingCost),
    totalCost: sixDecimals(totalCost),
    totalCostDisplay,
  };
}
/**
 * Look up the rates for a model in a provider's pricing table.
 *
 * Only the table's own keys are consulted. Model ids can come from
 * free-text input, and a plain `pricingTable[model]` lookup would resolve
 * ids such as "constructor" or "toString" to inherited Object.prototype
 * members instead of falling through to the "default" row.
 *
 * @param {Object<string, {input: number, output: number, thinking?: number}>} pricingTable
 *   Per-model rates; each provider defines its own table.
 * @param {string} model - Model id to price.
 * @returns {{input: number, output: number, thinking?: number}} The model's
 *   row, else the table's "default" row, else zero rates.
 */
export function ratesFor(pricingTable, model) {
  const ownRow = Object.hasOwn(pricingTable, model) ? pricingTable[model] : undefined;
  return ownRow || pricingTable["default"] || { input: 0, output: 0 };
}
/**
 * Build a cost record whose every cost is zero, for providers that don't
 * charge (Ollama, local, openai-compatible without a known pricing table).
 *
 * @param {{inputTokens?: number, outputTokens?: number,
 *   thinkingTokens?: number, totalTokens?: number}} [usage={}]
 *   Token counts to carry through; missing or falsy counts become 0.
 * @returns {object} Same shape as formatCost's record, with all cost
 *   strings fixed at zero.
 */
export function zeroCost(usage = {}) {
  const {
    inputTokens: rawInput,
    outputTokens: rawOutput,
    thinkingTokens: rawThinking,
    totalTokens: rawTotal,
  } = usage;

  const inputTokens = rawInput || 0;
  const outputTokens = rawOutput || 0;
  const thinkingTokens = rawThinking || 0;
  const NO_COST = "0.000000";

  return {
    inputTokens,
    outputTokens,
    thinkingTokens,
    // Prefer the reported total; otherwise sum the three counts.
    totalTokens: rawTotal || (inputTokens + outputTokens + thinkingTokens),
    inputCost: NO_COST,
    outputCost: NO_COST,
    thinkingCost: NO_COST,
    totalCost: NO_COST,
    totalCostDisplay: "$0.0000",
  };
}
+364
View File
@@ -0,0 +1,364 @@
// Gemini provider — wraps @google/genai behind the shared Provider
// interface. Stateless helpers + a per-request factory: each call to
// createGeminiProvider({ apiKey }) returns a provider instance bound to
// that key, mirroring how `new GoogleGenAI({ apiKey })` was used before.
//
// What lives here:
// - SDK init + per-request HTTP timeouts
// - File API upload + processing-state polling
// - generateContent calls for transcription + analysis
// - Empty-response retry loop
// - Safety settings + thinking-config selection
// - Cost calculation (delegated to gemini-helpers.calcCost)
// - Model lists for the two pipelines (transcription vs. analysis)
//
// What does NOT live here (stays in server/index.js as orchestration):
// - Audio chunking decisions + transcript merging
// - Analysis-output JSON parsing
// - Topic-analysis prompt construction (provider-neutral, in
// gemini-helpers.js)
import { GoogleGenAI } from "@google/genai";
import { safeText, retryGemini, formatTime } from "../util.js";
import { calcCost } from "../gemini-helpers.js";
// Models exposed to the analysis fallback chain. Order matters — first
// is the preferred default, the rest are tried in order if it fails.
export const GEMINI_ANALYSIS_MODELS = [
"gemini-3.1-pro-preview",
"gemini-3-pro-preview",
"gemini-3-flash-preview",
"gemini-2.5-flash",
];
// Transcription models, in fallback order. Flash is best speed/cost
// for audio → text; 2.5 Flash is the stable previous-gen multimodal
// model and works well as a fallback when Gemini 3 Flash returns 503
// (capacity / overload). gemini-2.0-flash is the last entry in the chain.
// The orchestration layer in server/index.js iterates this list, retrying
// with the next model when one fails.
export const GEMINI_TRANSCRIPTION_MODELS = [
"gemini-3-flash-preview",
"gemini-2.5-flash",
"gemini-2.0-flash",
];
// Empty-response retries: when the SDK returns 200 with no text (which
// happens periodically with audio inputs), the transcription loop makes
// at most this many attempts in total, with linear backoff between them
// (10s after the first empty attempt, 20s after the second), before
// giving up.
const EMPTY_RETRIES = 3;
/**
 * Race an SDK promise against a caller-supplied AbortSignal.
 *
 * The @google/genai SDK does not accept a per-call AbortSignal, so when the
 * user cancels a request we interrupt the in-flight promise ourselves: the
 * returned promise rejects with an AbortError as soon as the signal aborts.
 * The underlying HTTP request is not cancelled — it is left to finish or to
 * hit the SDK's own timeout.
 *
 * @param {Promise<*>} promise - The already-started SDK call.
 * @param {AbortSignal} [signal] - Optional; without it this is a no-op
 *   passthrough that returns `promise` itself.
 * @returns {Promise<*>} Settles like `promise`, or rejects with an Error
 *   named "AbortError" if the signal is (or becomes) aborted first.
 */
function withAbort(promise, signal) {
  if (!signal) return promise;

  const abortError = () =>
    Object.assign(new Error("aborted"), { name: "AbortError" });

  if (signal.aborted) {
    // The caller has already started the SDK call by the time we run. We
    // are abandoning it, so attach a no-op handler: otherwise a later
    // failure of that call surfaces as an unhandled rejection, which
    // terminates the Node process by default.
    Promise.resolve(promise).catch(() => {});
    return Promise.reject(abortError());
  }

  return new Promise((resolve, reject) => {
    const onAbort = () => {
      reject(abortError());
    };
    signal.addEventListener("abort", onAbort, { once: true });
    // Both outcomes are handled here, so an SDK failure that arrives after
    // an abort is absorbed (the outer promise has already been rejected).
    promise.then(
      (value) => {
        signal.removeEventListener("abort", onAbort);
        resolve(value);
      },
      (error) => {
        signal.removeEventListener("abort", onAbort);
        reject(error);
      }
    );
  });
}
// Safety filters disabled for transcription so the model doesn't refuse
// to transcribe sensitive but legitimate spoken content. Passed as
// `safetySettings` on the transcription generateContent call only;
// analysis sends no safety settings and so inherits whatever Gemini's
// defaults are.
const TRANSCRIPTION_SAFETY = [
{ category: "HARM_CATEGORY_HARASSMENT", threshold: "BLOCK_NONE" },
{ category: "HARM_CATEGORY_HATE_SPEECH", threshold: "BLOCK_NONE" },
{ category: "HARM_CATEGORY_SEXUALLY_EXPLICIT", threshold: "BLOCK_NONE" },
{ category: "HARM_CATEGORY_DANGEROUS_CONTENT", threshold: "BLOCK_NONE" },
];
/**
 * Create a Gemini provider bound to a single API key.
 *
 * @param {object} [options]
 * @param {string} options.apiKey - Gemini API key (required).
 * @param {number} [options.timeoutMs=900000] - Used for both the SDK's
 *   `timeout` and `headersTimeout` HTTP options.
 * @returns {object} Provider with `name`, `capabilities`, the two model
 *   listings, `transcribeAudio` and `analyzeText`.
 * @throws {Error} When `apiKey` is missing or empty.
 */
export function createGeminiProvider({ apiKey, timeoutMs = 900_000 } = {}) {
  if (!apiKey) {
    throw new Error("createGeminiProvider: apiKey is required");
  }
  const ai = new GoogleGenAI({
    apiKey,
    httpOptions: { timeout: timeoutMs, headersTimeout: timeoutMs },
  });
  // Analysis uses the same client — legitimate analysis on long
  // transcripts can genuinely take 3-5+ minutes, so an aggressive
  // timeout cuts off real work. The double-retry-of-overloaded-model
  // waste that 0.2.22 was trying to fix is already handled by
  // retries=1 below: a 503 fast-fails in seconds, and the outer
  // fallback chain (Pro → Pro older → Flash → Flash 2.5) moves
  // on immediately.
  const aiAnalyze = ai;

  return {
    name: "gemini",
    capabilities: {
      transcribe: true,
      analyze: true,
      listModels: true,
    },
    listAnalysisModels() {
      return [...GEMINI_ANALYSIS_MODELS];
    },
    listTranscriptionModels() {
      return [...GEMINI_TRANSCRIPTION_MODELS];
    },

    // Transcribe a single audio file. The caller handles chunking +
    // merging — this is the atomic unit. Returns:
    //   { text, usage, cost, finishReason, blockReason, raw }
    // `text` is the raw model output (with [MM:SS] markers); the caller
    // parses it into entries. `cost` uses the same shape calcCost
    // already produces, so existing accounting code is unchanged.
    // `finishReason` / `blockReason` describe the final response received
    // ("UNKNOWN" / "none" when the response does not carry them).
    // Rejects with an AbortError when `signal` aborts; the uploaded file
    // is cleaned up on every exit path once the upload has completed.
    async transcribeAudio({
      filePath,
      mimeType,
      titleHint,
      channelHint = "",
      descriptionHint = "",
      chaptersHint = [],
      model,
      offsetSeconds = 0,
      onProgress = () => {},
      signal,
    }) {
      // Sleep that ends early (with an AbortError) when the caller cancels,
      // so Cancel does not have to sit out a polling or backoff delay.
      const pause = (ms) =>
        withAbort(new Promise((resolve) => setTimeout(resolve, ms)), signal);

      const upStart = Date.now();
      onProgress(
        `Uploading audio${offsetSeconds > 0 ? ` (offset ${formatTime(offsetSeconds)})` : ""} to Gemini File API...`
      );
      const uploaded = await withAbort(
        ai.files.upload({
          file: filePath,
          config: { mimeType },
        }),
        signal
      );
      const upTime = ((Date.now() - upStart) / 1000).toFixed(1);
      onProgress(`Audio uploaded in ${upTime}s`);

      let f = uploaded;

      // Best-effort cleanup of the uploaded file. Failure here is
      // harmless — Gemini garbage-collects on its own schedule — so the
      // error is deliberately swallowed.
      const deleteUploaded = async () => {
        try {
          await ai.files.delete({ name: f.name });
        } catch {}
      };

      let result;
      let finishReason = "UNKNOWN";
      let blockReason = "none";

      try {
        // Wait for the File API to finish ingesting before generation.
        const pStart = Date.now();
        while (f.state === "PROCESSING") {
          if (signal?.aborted) {
            throw Object.assign(new Error("aborted"), { name: "AbortError" });
          }
          const ws = ((Date.now() - pStart) / 1000).toFixed(0);
          onProgress(`Waiting for Gemini to process audio... (${ws}s)`);
          await pause(3000);
          f = await withAbort(ai.files.get({ name: f.name }), signal);
        }
        if (f.state === "FAILED") {
          throw new Error("Gemini failed to process audio file.");
        }
        const pTime = ((Date.now() - pStart) / 1000).toFixed(1);
        onProgress(`Audio processed in ${pTime}s. Transcribing with ${model}...`);

        const prompt = buildTranscriptionPrompt({
          title: titleHint,
          channel: channelHint,
          description: descriptionHint,
          chapters: chaptersHint,
        });

        // thinkingLevel: "minimal" is only valid for Flash. Pro models
        // reject it. Match prior behavior precisely.
        const txConfig = model.includes("flash")
          ? { thinkingConfig: { thinkingLevel: "minimal" } }
          : {};

        for (let attempt = 0; attempt < EMPTY_RETRIES; attempt++) {
          if (signal?.aborted) {
            throw Object.assign(new Error("aborted"), { name: "AbortError" });
          }
          result = await retryGemini(
            () =>
              withAbort(
                ai.models.generateContent({
                  model,
                  config: {
                    ...txConfig,
                    safetySettings: TRANSCRIPTION_SAFETY,
                  },
                  contents: [
                    {
                      role: "user",
                      parts: [
                        { fileData: { fileUri: f.uri, mimeType } },
                        { text: prompt },
                      ],
                    },
                  ],
                }),
                signal
              ),
            {
              retries: 3,
              delayMs: 5000,
              label: `Transcription${offsetSeconds > 0 ? ` (chunk@${formatTime(offsetSeconds)})` : ""}`,
              log: (msg) => onProgress(msg),
            }
          );

          // Record the reasons for THIS response before deciding whether to
          // stop, so the returned values always describe the final attempt
          // rather than a previous empty one (or the initial placeholders).
          finishReason = result?.candidates?.[0]?.finishReason || "UNKNOWN";
          blockReason = result?.promptFeedback?.blockReason || "none";

          const text = safeText(result);
          if (text) break;

          onProgress(
            `⚠ Empty response (attempt ${attempt + 1}/${EMPTY_RETRIES}) — finishReason: ${finishReason}, blockReason: ${blockReason}`
          );
          if (attempt < EMPTY_RETRIES - 1) {
            const waitSec = 10 * (attempt + 1);
            onProgress(`Waiting ${waitSec}s before retry...`);
            await pause(waitSec * 1000);
          }
        }
      } catch (err) {
        // Failure or cancellation after a successful upload: still remove
        // the file, but do not wait for the delete so the error (and any
        // Cancel) reaches the caller immediately.
        void deleteUploaded();
        throw err;
      }

      await deleteUploaded();

      const usage = result.usageMetadata || {};
      const cost = calcCost(model, usage);
      return {
        text: safeText(result) || "",
        usage,
        cost,
        finishReason,
        blockReason,
        // Pass-through for callers that still want the raw SDK response
        // (e.g. existing logging code). Will be removed once nothing
        // depends on it.
        raw: result,
      };
    },

    // Generate text from a prompt (no audio). Used by the topic-analysis
    // step today, but generic enough for any text→text model call.
    // Returns: { text, usage, cost, finishReason, raw }
    async analyzeText({
      prompt,
      model,
      onProgress = () => {},
      // Default to 1 attempt (no per-model retry). Analysis-step 503s
      // ("model overloaded") almost never clear in 5-10 seconds —
      // they're capacity-shaped, not transient-blip-shaped. Better
      // UX: fail fast on a single model and let the outer fallback
      // chain in server/index.js walk to the next model (Pro → Pro
      // older → Flash → Flash 2.5) immediately. Caller can override
      // with retries: 2 if they want the old behavior.
      retries = 1,
      signal,
    }) {
      const result = await retryGemini(
        () =>
          withAbort(
            aiAnalyze.models.generateContent({
              model,
              contents: [
                {
                  role: "user",
                  parts: [{ text: prompt }],
                },
              ],
            }),
            signal
          ),
        {
          retries,
          delayMs: 5000,
          label: "Analysis",
          log: (msg) => onProgress(msg),
        }
      );
      const text = safeText(result);
      const usage = result.usageMetadata || {};
      const cost = calcCost(model, usage);
      const finishReason = result?.candidates?.[0]?.finishReason || null;
      return {
        text: text || "",
        usage,
        cost,
        finishReason,
        raw: result,
      };
    },
  };
}
/**
 * Build the Gemini transcription prompt.
 *
 * Gemini-specific because the instructions demand timestamp-formatted
 * output that is parsed afterwards; other providers may need a
 * differently-shaped prompt, so each provider owns its own.
 *
 * Optional metadata is prepended as context because it helps the model
 * label speakers by name: descriptions usually list host and guest,
 * channel names are often the host's name, and chapter titles sometimes
 * label introductions ("Conversation with John Doe"). With no metadata the
 * prompt is just the instructions.
 *
 * @param {object} [meta]
 * @param {string} [meta.title] - Video title.
 * @param {string} [meta.channel] - Channel name.
 * @param {string} [meta.description] - Video description; cut to 1500
 *   characters plus an ellipsis when longer.
 * @param {Array<{start_time?: number, title?: string}>} [meta.chapters]
 *   Chapter markers; only the first 30 are used.
 * @returns {string} The full prompt text.
 */
function buildTranscriptionPrompt({ title, channel, description, chapters } = {}) {
  // Render one chapter as an indented "[M:SS] title" line; a non-numeric
  // start time is shown as 0:00.
  const chapterLine = (chapter) => {
    const seconds = typeof chapter.start_time === "number" ? chapter.start_time : 0;
    const minutes = Math.floor(seconds / 60);
    const paddedSeconds = String(Math.floor(seconds % 60)).padStart(2, "0");
    return ` [${minutes}:${paddedSeconds}] ${chapter.title || ""}`;
  };

  const sections = [];

  if (title) sections.push(`Video title: "${title}"`);
  if (channel) sections.push(`Channel: ${channel}`);

  if (description) {
    // Trim to keep prompt size sane on hours-long podcasts whose
    // descriptions can include full sponsor lists + show notes.
    const shortened =
      description.length > 1500 ? description.slice(0, 1500) + "…" : description;
    sections.push(`Video description (use to identify speakers by name):\n${shortened}`);
  }

  if (Array.isArray(chapters) && chapters.length > 0) {
    const markers = chapters.slice(0, 30).map(chapterLine).join("\n");
    sections.push(`Chapter markers (titles often name speakers or topics):\n${markers}`);
  }

  // Each section ends with a newline, and one blank line separates the
  // context from the instructions. No sections means no prefix at all.
  const context =
    sections.length > 0 ? `${sections.map((section) => `${section}\n`).join("")}\n` : "";

  return `${context}Transcribe this audio completely and verbatim. Include timestamps at regular intervals (every 15-30 seconds or at natural pauses).
Format each line as:
[MM:SS] The spoken text here...
Rules:
- Transcribe EVERY word spoken, do not skip or summarize anything.
- Use [MM:SS] or [H:MM:SS] timestamp format at the start of each line.
- Start a new timestamped line every 15-30 seconds or at natural speech pauses.
- Include filler words (um, uh, you know) for accuracy.
- Speaker identification: FIRST consult the metadata above — descriptions and chapter titles usually name the host(s) and guest(s) explicitly, and the channel name is often the host's name. Match those names to the voices in the audio (introductions, "I'm Dax", "this is Will", first-person references) and use them as speaker labels. Format as: [MM:SS] Name: text. Only fall back to "Host"/"Guest" if no names appear in the metadata AND nobody is introduced by name in the audio.
Return ONLY the timestamped transcript, nothing else.`;
}
+154
View File
@@ -0,0 +1,154 @@
// Provider registry. Each provider wraps a single LLM/SDK behind a
// uniform interface (see ./gemini.js for the reference shape). The rest
// of the server talks to providers through getProvider() and never
// imports SDKs directly.
//
// Adding a new provider:
// 1. Create ./<name>.js exporting createXxxProvider({ apiKey, ... }).
// 2. Add it to PROVIDER_NAMES + the switch in getProvider().
// 3. Add the matching opts shape to PROVIDER_KEY_FIELDS so
// resolveProviderOpts() can pull the right key/baseURL out of the
// StartOS config.
// 4. Wire its config field into startos/file-models/config.json.ts
// and add a "Set <Provider> Key" StartOS action.
//
// Capabilities (see provider.capabilities) signal what each one can do.
// Some providers analyze but can't transcribe (Claude, OpenAI-compat,
// Ollama); the orchestration layer in server/index.js can mix providers
// across the transcription + analysis pipelines.
import { createGeminiProvider } from "./gemini.js";
import { createAnthropicProvider } from "./anthropic.js";
import { createOpenAIProvider } from "./openai.js";
import { createOpenAICompatibleProvider } from "./openai-compatible.js";
import { createOllamaProvider } from "./ollama.js";
import { createWhisperProvider } from "./whisper.js";
import { createRelayProvider } from "./relay.js";
import { getInstallId } from "../install-id.js";
import { getRawLicenseKey } from "../license.js";
import { getRelayBaseURL } from "../relay-default.js";
// Every provider name getProvider() can construct. Also printed in
// getProvider()'s "Unknown provider" error as the list of valid names.
export const PROVIDER_NAMES = [
"gemini",
"anthropic",
"openai",
"openai-compatible",
"ollama",
"whisper",
"relay",
];
// Map provider name → which fields to read from the StartOS config blob
// when resolving its construction opts. Used by resolveProviderOpts().
// A provider with no `apiKey` (or `baseURL`) entry never gets that opt set
// from config or from the client.
export const PROVIDER_KEY_FIELDS = {
gemini: { apiKey: "gemini_api_key" },
anthropic: { apiKey: "anthropic_api_key" },
openai: { apiKey: "openai_api_key" },
"openai-compatible": {
apiKey: "openai_compatible_api_key",
baseURL: "openai_compatible_base_url",
},
ollama: { baseURL: "ollama_base_url" },
whisper: {
apiKey: "whisper_api_key",
baseURL: "whisper_base_url",
},
// Relay is operator-only — base URL is HARDCODED in
// server/relay-default.js, NOT read from StartOS config. The empty
// object is intentional: resolveProviderOpts looks the provider name up
// in this map to recognise it, then the relay-specific block at the
// bottom of resolveProviderOpts injects baseURL + installId +
// licenseKey server-side. Without this entry the lookup throws
// "Unknown provider: relay" before reaching the injection block.
relay: {},
};
/**
 * Construct the provider registered under `name`.
 *
 * @param {string} name - One of PROVIDER_NAMES.
 * @param {object} [opts={}] - Construction opts, handed unchanged to the
 *   provider's factory.
 * @returns {object} The provider instance built by the matching factory.
 * @throws {Error} "Unknown provider: …" when `name` is not registered; any
 *   error the factory itself throws also propagates.
 */
export function getProvider(name, opts = {}) {
  // Name → factory lookup. A Map matches keys without string coercion and
  // has no inherited entries, so only these exact names resolve.
  const factories = new Map([
    ["gemini", createGeminiProvider],
    ["anthropic", createAnthropicProvider],
    ["openai", createOpenAIProvider],
    ["openai-compatible", createOpenAICompatibleProvider],
    ["ollama", createOllamaProvider],
    ["whisper", createWhisperProvider],
    ["relay", createRelayProvider],
  ]);

  const factory = factories.get(name);
  if (!factory) {
    throw new Error(
      `Unknown provider: ${name}. Available: ${PROVIDER_NAMES.join(", ")}`
    );
  }
  return factory(opts);
}
/**
 * Pull the construction opts for a provider out of the StartOS config
 * blob, optionally overridden per-provider by client-side opts the web UI
 * passed in the request body.
 *
 * Resolution priority for each field: client opt → config opt.
 *
 * `clientOpts` is request-body data, so it is treated as untrusted: only
 * string values are honoured for apiKey / baseURL / models, and a null
 * blob is treated as empty. Anything else is ignored rather than thrown on.
 *
 * @param {string} name - Provider name; must be an own key of
 *   PROVIDER_KEY_FIELDS.
 * @param {object} [sources]
 * @param {object} [sources.config={}] - Parsed startos-config.json snapshot.
 * @param {object} [sources.clientOpts={}] - `{ apiKey?, baseURL?, models? }`
 *   for THIS provider only — typically req.body.providerOpts[name].
 * @returns {object} `{ apiKey?, baseURL?, defaultModels?, installId?,
 *   licenseKey? }` as appropriate for the provider.
 * @throws {Error} "Unknown provider: …" when `name` is not registered.
 */
export function resolveProviderOpts(name, { config = {}, clientOpts = {} } = {}) {
  // Own-key check: a bare property lookup would let inherited names such
  // as "constructor" slip past the unknown-provider guard.
  const fields = Object.hasOwn(PROVIDER_KEY_FIELDS, name)
    ? PROVIDER_KEY_FIELDS[name]
    : undefined;
  if (!fields) {
    throw new Error(`Unknown provider: ${name}`);
  }

  // Default parameters only cover `undefined`; normalize explicit nulls too.
  const storedConfig = config ?? {};
  const client = clientOpts ?? {};

  // Trimmed client-supplied string, or "" when absent or not a string.
  const clientString = (key) =>
    typeof client[key] === "string" ? client[key].trim() : "";

  const opts = {};

  if (fields.apiKey) {
    opts.apiKey = clientString("apiKey") || storedConfig[fields.apiKey] || "";
  }

  if (fields.baseURL) {
    opts.baseURL = clientString("baseURL") || storedConfig[fields.baseURL] || "";
    // Last-resort fallback for Ollama: the canonical StartOS internal
    // hostname. Reachable when the optional Ollama dependency is
    // installed alongside Recap on the same StartOS server, even if
    // the user hasn't run the "Set Ollama Server URL" action.
    if (!opts.baseURL && name === "ollama") {
      opts.baseURL = "http://ollama.startos:11434";
    }
  }

  // User-defined model list: providers with dynamic catalogs (ollama,
  // openai-compatible, whisper) accept a comma- or newline-separated
  // list of model names in clientOpts.models. Parse and pass through
  // as `defaultModels` so listTranscriptionModels / listAnalysisModels
  // return the right thing AND so the orchestration layer's fallback
  // chain knows what to walk through if the user's chosen model fails.
  const rawModels = clientString("models");
  if (rawModels) {
    // A Set keeps first-occurrence order while dropping duplicates.
    const models = [
      ...new Set(
        rawModels
          .split(/[,\n]/)
          .map((entry) => entry.trim())
          .filter(Boolean)
      ),
    ];
    if (models.length > 0) {
      opts.defaultModels = models;
    }
  }

  // Relay-specific injections: baseURL (hardcoded constant or env
  // override) + install-id (always) + license key (when present).
  // None of these come from clientOpts — relay identity + endpoint
  // must not be spoofable from a request body.
  if (name === "relay") {
    opts.baseURL = getRelayBaseURL();
    opts.installId = getInstallId();
    const rawKey = getRawLicenseKey();
    if (rawKey) opts.licenseKey = rawKey;
  }

  return opts;
}
+125
View File
@@ -0,0 +1,125 @@
// Ollama provider — analysis only, raw HTTP to a local Ollama server.
//
// Ollama runs LLMs locally; there is no per-request cost. Default
// baseURL is the conventional `http://localhost:11434`. Users on a
// LAN-hosted Ollama point at it explicitly via the StartOS action.
//
// We don't ship a hardcoded model list — Ollama's catalog is whatever
// the user has `pull`ed locally. listAnalysisModels() can optionally
// query /api/tags at config time, but for v1 we expose a free-text
// model field in the picker UI.
import { retryAPI } from "../util.js";
import { zeroCost } from "./cost.js";
// Used when no baseURL is supplied at construction time.
const DEFAULT_BASE_URL = "http://localhost:11434";

/**
 * Create an Ollama provider that talks raw HTTP to an Ollama server.
 *
 * Analysis-only: `transcribeAudio()` always throws. Costs are always
 * reported as zero.
 *
 * @param {object} [options]
 * @param {string} [options.baseURL] - Server root; falls back to
 *   DEFAULT_BASE_URL. Trailing slashes are stripped.
 * @param {string[]} [options.defaultModels=[]] - User-defined model names,
 *   as produced by resolveProviderOpts from the picker's free-text list.
 *   Returned by listAnalysisModels(); a non-array value is ignored.
 * @param {number} [options.timeoutMs=900000] - Per-request timeout for
 *   analysis calls.
 * @returns {object} Provider with `name`, `capabilities`, model listings,
 *   `listInstalledModels`, `transcribeAudio` and `analyzeText`.
 */
export function createOllamaProvider({
  baseURL,
  defaultModels = [],
  timeoutMs = 900_000,
} = {}) {
  // Strip every trailing slash so `${base}/api/...` never doubles one up.
  const base = (baseURL || DEFAULT_BASE_URL).replace(/\/+$/, "");
  // Snapshot the caller's list so later mutation of their array can't leak in.
  const analysisModels = Array.isArray(defaultModels) ? [...defaultModels] : [];

  return {
    name: "ollama",
    capabilities: {
      transcribe: false,
      analyze: true,
      listModels: true,
    },

    // The user-defined list (empty when none was configured). Returns a
    // fresh copy on every call.
    listAnalysisModels() {
      return [...analysisModels];
    },

    listTranscriptionModels() {
      return [];
    },

    async transcribeAudio() {
      throw new Error(
        "Ollama is wired for analysis only. Use Gemini or OpenAI Whisper for transcription."
      );
    },

    // Lists models the Ollama server reports via /api/tags (5s timeout).
    // Best-effort — returns [] on any error or non-OK status so the picker
    // can fall back to the free-text input.
    async listInstalledModels() {
      try {
        const res = await fetch(`${base}/api/tags`, {
          signal: AbortSignal.timeout(5000),
        });
        if (!res.ok) return [];
        const data = await res.json();
        return (data.models || []).map((m) => m.name).filter(Boolean);
      } catch {
        return [];
      }
    },

    /**
     * Run one non-streaming /api/generate call and normalize the reply.
     *
     * @returns {Promise<object>} `{ text, usage, cost, finishReason, raw }`.
     *   A non-OK response throws an Error carrying the HTTP `status`.
     */
    async analyzeText({
      prompt,
      model,
      onProgress = () => {},
      retries = 2,
      signal,
    }) {
      const result = await retryAPI(
        async () => {
          // Combine the per-request timeout with the caller-supplied
          // cancel signal so a user-pressed Cancel button aborts the
          // fetch immediately instead of waiting for the (long) timeout.
          // Built per attempt so each retry gets a fresh timeout.
          const timeoutSignal = AbortSignal.timeout(timeoutMs);
          const combinedSignal = signal
            ? AbortSignal.any([signal, timeoutSignal])
            : timeoutSignal;
          const res = await fetch(`${base}/api/generate`, {
            method: "POST",
            headers: { "Content-Type": "application/json" },
            body: JSON.stringify({
              model,
              prompt,
              stream: false,
            }),
            signal: combinedSignal,
          });
          if (!res.ok) {
            // Body is best-effort context only; cap it so a huge error
            // page doesn't flood the log.
            const errText = await res.text().catch(() => "");
            const err = new Error(
              `Ollama ${res.status} ${res.statusText}: ${errText.slice(0, 200)}`
            );
            err.status = res.status;
            throw err;
          }
          return res.json();
        },
        {
          retries,
          delayMs: 5000,
          label: "Ollama analysis",
          log: (msg) => onProgress(msg),
        }
      );

      const text = result.response || "";
      // Ollama's /api/generate returns prompt_eval_count + eval_count.
      const usage = {
        inputTokens: result.prompt_eval_count || 0,
        outputTokens: result.eval_count || 0,
        thinkingTokens: 0,
      };
      const cost = zeroCost(usage);
      return {
        text,
        usage,
        cost,
        finishReason: result.done ? "stop" : null,
        raw: result,
      };
    },
  };
}
+110
View File
@@ -0,0 +1,110 @@
// OpenAI-compatible provider — analysis only.
//
// Same wire format as OpenAI's chat.completions endpoint, but pointed
// at a user-supplied baseURL: DeepSeek, Together, Groq, Fireworks, your
// own self-hosted vLLM, etc. The user provides baseURL + apiKey + model
// name; we don't ship a hardcoded model list (each backend's catalog
// differs), and we don't have pricing (varies wildly per backend).
//
// Structurally this is a thin re-export of the OpenAI SDK with the
// pricing table forced to zero — costs are reported as $0.0000 since we
// can't know the backend's rates without per-deploy configuration.
import OpenAI from "openai";
import { retryAPI } from "../util.js";
import { zeroCost } from "./cost.js";
// Upper bound on generated tokens, sent as `max_tokens` on every
// chat.completions request.
const ANALYSIS_MAX_TOKENS = 16000;

/**
 * Create a provider for any backend that speaks OpenAI's chat.completions
 * wire format at a user-supplied baseURL.
 *
 * Analysis-only: `transcribeAudio()` always throws. There is no built-in
 * model catalog — the picker UI surfaces a free-text model field, and
 * listAnalysisModels() returns whatever `defaultModels` were passed in at
 * construction time. Costs are always reported as zero.
 *
 * @param {object} [options]
 * @param {string} [options.apiKey] - Backend API key; a sentinel is sent
 *   when empty.
 * @param {string} options.baseURL - Backend root URL (required).
 * @param {string[]} [options.defaultModels=[]] - Caller-provided model names.
 * @param {number} [options.timeoutMs=900000] - Passed to the SDK client as
 *   `timeout`.
 * @returns {object} Provider with `name`, `capabilities`, model listings,
 *   `transcribeAudio` and `analyzeText`.
 * @throws {Error} When `baseURL` is missing or empty.
 */
export function createOpenAICompatibleProvider({
  apiKey,
  baseURL,
  defaultModels = [],
  timeoutMs = 900_000,
} = {}) {
  if (!baseURL) {
    throw new Error(
      "createOpenAICompatibleProvider: baseURL is required (e.g. https://api.deepseek.com/v1)"
    );
  }

  // Some self-hosted backends accept any non-empty key. Default to a
  // sentinel so the SDK's auth header stays well-formed.
  const effectiveKey = apiKey || "no-auth";
  const client = new OpenAI({
    apiKey: effectiveKey,
    baseURL,
    timeout: timeoutMs,
  });

  // Model listing is only advertised when the caller supplied a list.
  const hasModelList = defaultModels.length > 0;

  return {
    name: "openai-compatible",

    capabilities: {
      transcribe: false,
      analyze: true,
      listModels: hasModelList,
    },

    listAnalysisModels() {
      return [...defaultModels];
    },

    listTranscriptionModels() {
      return [];
    },

    async transcribeAudio() {
      throw new Error(
        "openai-compatible providers are wired for analysis only. Use Gemini or OpenAI Whisper for transcription."
      );
    },

    /**
     * Send one user prompt to chat.completions and normalize the reply.
     *
     * @returns {Promise<object>} `{ text, usage, cost, finishReason, raw }`,
     *   taken from the first choice of the response.
     */
    async analyzeText({
      prompt,
      model,
      onProgress = () => {},
      retries = 2,
      signal,
    }) {
      // Per-call SDK options; only present when the caller can cancel.
      const requestOptions = signal ? { signal } : undefined;

      const sendRequest = () =>
        client.chat.completions.create(
          {
            model,
            max_tokens: ANALYSIS_MAX_TOKENS,
            messages: [{ role: "user", content: prompt }],
          },
          requestOptions
        );

      const completion = await retryAPI(sendRequest, {
        retries,
        delayMs: 5000,
        label: "openai-compatible analysis",
        log: (msg) => onProgress(msg),
      });

      const firstChoice = completion.choices?.[0];

      const usage = {
        inputTokens: completion.usage?.prompt_tokens || 0,
        outputTokens: completion.usage?.completion_tokens || 0,
        thinkingTokens: 0,
      };

      return {
        text: firstChoice?.message?.content || "",
        usage,
        // Per-backend pricing varies — report zero. UI can warn that cost
        // tracking is not available for this provider.
        cost: zeroCost(usage),
        finishReason: firstChoice?.finish_reason || null,
        raw: completion,
      };
    },
  };
}
+201
View File
@@ -0,0 +1,201 @@
// OpenAI provider — analysis (chat.completions) + transcription (Whisper).
//
// Whisper (whisper-1) has a 25 MB per-request file size cap. The
// orchestration layer's audio chunking is currently sized for Gemini's
// much larger cap; long podcasts at high bitrate can push individual
// chunks over Whisper's cap. We surface that as a clear error rather
// than silently truncating — users can mix providers (Whisper for
// short audio, Gemini for long) per-request via the picker.
//
// Pricing values are placeholders — verify against current OpenAI
// pricing before billing-sensitive use.
import { createReadStream, statSync } from "fs";
import OpenAI from "openai";
import { retryAPI, formatTime } from "../util.js";
import { formatCost, ratesFor } from "./cost.js";
// Per-1M-token rates in USD for chat.completions models, keyed by model id.
// VERIFY against current OpenAI pricing before relying on these for billing.
export const OPENAI_PRICING = {
"gpt-4o": { input: 2.50, output: 10.00 },
"gpt-4o-mini": { input: 0.15, output: 0.60 },
"gpt-4-turbo": { input: 10.00, output: 30.00 },
"o3-mini": { input: 1.10, output: 4.40 },
// Fallback for unknown / future models.
"default": { input: 2.50, output: 10.00 },
};
// Whisper bills per minute of audio, not per token. The cost record
// reuses the token cost shape, but stores minute-based math in the
// `inputCost` field (and in `totalCost`); all token counts stay 0.
const WHISPER_USD_PER_MINUTE = 0.006;
// Files larger than this are rejected before upload with an explanatory
// error instead of being sent to the API.
const WHISPER_MAX_BYTES = 25 * 1024 * 1024; // OpenAI hard limit
// Analysis models offered by listAnalysisModels(). Each id has a matching
// row in OPENAI_PRICING.
export const OPENAI_ANALYSIS_MODELS = [
"gpt-4o",
"gpt-4o-mini",
"gpt-4-turbo",
"o3-mini",
];
// Transcription models offered by listTranscriptionModels(); "whisper-1"
// is also transcribeAudio()'s default model.
export const OPENAI_TRANSCRIPTION_MODELS = ["whisper-1"];
// NOTE(review): presumably the chat.completions output cap, mirroring the
// other providers' ANALYSIS_MAX_TOKENS — its use is outside this view.
const ANALYSIS_MAX_TOKENS = 16000;
// Builds the OpenAI provider: Whisper transcription through
// audio.transcriptions and text analysis through chat.completions.
//
// Options:
//   apiKey    - required; handed to the OpenAI SDK client.
//   baseURL   - optional endpoint override; falsy values use the SDK default.
//   timeoutMs - request timeout handed to the SDK client (default 15 min).
//
// Returns a provider object exposing name, capabilities, the model
// listings, transcribeAudio() and analyzeText(). Throws when apiKey is
// missing.
export function createOpenAIProvider({
  apiKey,
  baseURL,
  timeoutMs = 900_000,
} = {}) {
  if (!apiKey) {
    throw new Error("createOpenAIProvider: apiKey is required");
  }
  const client = new OpenAI({
    apiKey,
    baseURL: baseURL || undefined,
    timeout: timeoutMs,
  });

  // OpenAI's o-series reasoning models (o1, o3-mini, ...) reject the
  // legacy `max_tokens` request field and require `max_completion_tokens`.
  // Every other model keeps `max_tokens`, which is also what older
  // OpenAI-compatible servers behind a custom baseURL understand.
  function tokenLimitParamFor(model) {
    return /^o\d/i.test(String(model ?? ""))
      ? "max_completion_tokens"
      : "max_tokens";
  }

  return {
    name: "openai",
    capabilities: {
      transcribe: true,
      analyze: true,
      listModels: true,
    },
    listAnalysisModels() {
      return [...OPENAI_ANALYSIS_MODELS];
    },
    listTranscriptionModels() {
      return [...OPENAI_TRANSCRIPTION_MODELS];
    },
    // Whisper-based transcription. Returns the same [MM:SS] formatted
    // text shape Gemini produces, so the orchestration layer's
    // parseTimestampedTranscript() works unchanged.
    //
    // Params: filePath (audio on disk), model, offsetSeconds (only used
    // in log labels), onProgress (one-line status callback), signal
    // (AbortSignal forwarded to the SDK call).
    // Resolves to { text, usage, cost, finishReason, blockReason, raw }.
    // Throws when the file exceeds the 25 MB Whisper limit or when the
    // API call fails after retries.
    async transcribeAudio({
      filePath,
      model = "whisper-1",
      offsetSeconds = 0,
      onProgress = () => {},
      signal,
    }) {
      // Best-effort size probe: when stat fails the size stays 0, the cap
      // check is skipped, and the upload below reports the real error.
      let bytes = 0;
      try {
        bytes = statSync(filePath).size;
      } catch {}
      if (bytes > WHISPER_MAX_BYTES) {
        const sizeMB = (bytes / (1024 * 1024)).toFixed(1);
        throw new Error(
          `OpenAI Whisper file size limit is 25 MB. This chunk is ${sizeMB} MB. Try Gemini for transcription, or split the audio more aggressively.`
        );
      }
      onProgress(
        `Uploading audio${offsetSeconds > 0 ? ` (offset ${formatTime(offsetSeconds)})` : ""} to OpenAI Whisper (${model})...`
      );
      const start = Date.now();
      const result = await retryAPI(
        // A fresh read stream per attempt: a stream consumed by a failed
        // upload cannot be replayed.
        () =>
          client.audio.transcriptions.create(
            {
              file: createReadStream(filePath),
              model,
              response_format: "verbose_json",
              timestamp_granularities: ["segment"],
            },
            signal ? { signal } : undefined
          ),
        {
          retries: 3,
          delayMs: 5000,
          label: `Whisper transcription${offsetSeconds > 0 ? ` (chunk@${formatTime(offsetSeconds)})` : ""}`,
          log: (msg) => onProgress(msg),
        }
      );
      const elapsed = ((Date.now() - start) / 1000).toFixed(1);
      onProgress(`Whisper transcription complete in ${elapsed}s`);
      // One "[time] text" line per segment; when the response carries no
      // segments the whole text becomes a single line at [0:00].
      const segments = Array.isArray(result.segments) ? result.segments : [];
      const lines = segments.length
        ? segments.map((s) => `[${formatTime(s.start || 0)}] ${(s.text || "").trim()}`)
        : [`[0:00] ${(result.text || "").trim()}`];
      const text = lines.join("\n");
      // Whisper bills by audio duration in minutes, not tokens.
      const durationSeconds = result.duration || 0;
      const minutes = durationSeconds / 60;
      const usdCost = minutes * WHISPER_USD_PER_MINUTE;
      const cost = {
        inputTokens: 0,
        outputTokens: 0,
        thinkingTokens: 0,
        totalTokens: 0,
        inputCost: usdCost.toFixed(6),
        outputCost: "0.000000",
        thinkingCost: "0.000000",
        totalCost: usdCost.toFixed(6),
        // NOTE(review): the sub-cent branch renders both "$" and "¢"
        // (e.g. "$0.300¢"). Left untouched in case it mirrors the shared
        // cost formatter — confirm before changing.
        totalCostDisplay: usdCost < 0.01
          ? `$${(usdCost * 100).toFixed(3)}¢`
          : `$${usdCost.toFixed(4)}`,
      };
      return {
        text,
        usage: { inputTokens: 0, outputTokens: 0, thinkingTokens: 0, totalTokens: 0 },
        cost,
        finishReason: null,
        blockReason: "none",
        raw: result,
      };
    },
    // Single-turn chat completion used for the analysis step.
    //
    // Params: prompt (sent as one user message), model, onProgress,
    // retries (attempt budget passed to retryAPI), signal.
    // Resolves to { text, usage, cost, finishReason, raw }; text is "" when
    // the response has no message content.
    async analyzeText({
      prompt,
      model,
      onProgress = () => {},
      retries = 2,
      signal,
    }) {
      const tokenLimitParam = tokenLimitParamFor(model);
      const result = await retryAPI(
        () =>
          client.chat.completions.create(
            {
              model,
              [tokenLimitParam]: ANALYSIS_MAX_TOKENS,
              messages: [{ role: "user", content: prompt }],
            },
            signal ? { signal } : undefined
          ),
        {
          retries,
          delayMs: 5000,
          label: "OpenAI analysis",
          log: (msg) => onProgress(msg),
        }
      );
      const choice = result.choices?.[0];
      const text = choice?.message?.content || "";
      const usage = {
        inputTokens: result.usage?.prompt_tokens || 0,
        outputTokens: result.usage?.completion_tokens || 0,
        thinkingTokens: 0,
      };
      const cost = formatCost(ratesFor(OPENAI_PRICING, model), usage);
      return {
        text,
        usage,
        cost,
        finishReason: choice?.finish_reason || null,
        raw: result,
      };
    },
  };
}
+341
View File
@@ -0,0 +1,341 @@
// Relay provider — proxies transcription + analysis calls through the
// operator's relay backend (operator-side service, not in this repo).
// The relay handles billing/credit accounting, picks the actual backing
// model (Gemini, the operator's Parakeet+Gemma, etc.), and returns a
// uniform response with the user's remaining credit balance.
//
// Auth shape:
// - `X-Recap-Install-Id` header on every call (required; identifies
// the credit owner). Comes from ./install-id.js.
// - `Authorization: Bearer <license_key>` header when a license is
// present. Absent = treat as Core (free) tier.
// - `X-Recap-Job-Id` header on every call, common across the
// transcribe + analyze pair that make up one full summary. The
// relay decrements credits on the FIRST call with a given job_id;
// subsequent calls with the same id are free (until job_id
// expires, ~1h). Means "1 full summary = 1 credit" regardless of
// whether one or both pipeline steps go through the relay.
//
// Response envelope (success and error both):
// {
// "result": { ...endpoint-specific payload... },
// "credits_remaining": <number>,
// "tier": "core" | "pro" | "max",
// "credit_charged": <number, 0 if reused job_id>
// }
//
// What this provider does NOT do:
// - Validate the user's license. Relay does that server-side.
// - Track historical credit usage. Relay's DB owns the ledger.
// - Choose which backing model the relay uses. Operator's call.
import { createReadStream } from "fs";
import { retryAPI, formatTime } from "../util.js";
import { zeroCost } from "./cost.js";
import { updateRelayState, recordRelayError } from "../relay-state.js";
// Provider name shown in logs + chunk pagination labels. "relay" rather
// than e.g. "keysat-relay" because operators may run their own relay
// using a different backend brand — the name should describe the
// architecture, not the operator.
const NAME = "relay";
// Models exposed to the picker. The relay decides what actually runs —
// these labels are placeholders so the picker can show something.
const RELAY_TRANSCRIPTION_MODELS = ["relay-default"];
const RELAY_ANALYSIS_MODELS = ["relay-default"];
// Builds the relay provider: transcription, analysis and balance checks
// proxied through the relay at `baseURL`.
//
// Options:
//   baseURL    - required relay origin; one trailing slash is stripped.
//   installId  - required; sent as X-Recap-Install-Id on every call.
//   licenseKey - optional; sent as a Bearer token when present.
//   timeoutMs  - per-attempt deadline for POST calls (default 15 min);
//                a non-positive or non-finite value disables it.
//
// Returns a provider object exposing name, capabilities, the model
// listings, transcribeAudio(), analyzeText() and pingBalance(). Throws
// when baseURL or installId is missing.
export function createRelayProvider({
  baseURL,
  installId,
  licenseKey,
  timeoutMs = 900_000,
} = {}) {
  if (!baseURL) {
    throw new Error(
      "createRelayProvider: baseURL is required (e.g. https://relay.keysat.xyz)"
    );
  }
  if (!installId) {
    throw new Error(
      "createRelayProvider: installId is required (boot must initInstallId first)"
    );
  }
  const base = baseURL.replace(/\/$/, "");
  // AbortSignal.timeout() only accepts a non-negative integer delay, so
  // normalise once here; 0 means "no deadline".
  const requestTimeoutMs =
    Number.isFinite(timeoutMs) && timeoutMs > 0
      ? Math.min(Math.ceil(timeoutMs), 0x7fffffff)
      : 0;

  // Build the auth/identity headers attached to every relay call.
  // job_id is optional but the orchestration layer should always pass
  // one — without it the relay can't bundle the transcribe + analyze
  // pair into a single credit charge.
  function buildHeaders({ extra = {}, jobId } = {}) {
    const h = {
      "X-Recap-Install-Id": installId,
      ...extra,
    };
    if (licenseKey) h["Authorization"] = `Bearer ${licenseKey}`;
    if (jobId) h["X-Recap-Job-Id"] = jobId;
    return h;
  }

  // Picks the most specific failure text available and always returns a
  // string: a structured `error` field in the body would otherwise reach
  // recordRelayError()/Error messages as a non-string value.
  function describeFailure(parsed, text, status) {
    const detail =
      parsed?.error ||
      parsed?.message ||
      text?.slice(0, 300) ||
      `HTTP ${status}`;
    return typeof detail === "string" ? detail : JSON.stringify(detail);
  }

  // Common error-handling wrapper. The relay's contract is that ANY
  // response (success or failure) carries the standard envelope so
  // Recap can keep its balance display accurate even on errors. We
  // try to parse error bodies to harvest that.
  //
  // Each call gets its own deadline, so every retry attempt has the full
  // timeout budget. Resolves to the parsed envelope; throws an Error with
  // `.status`/`.envelope` on non-2xx responses.
  async function postRelay({ path, body, headers, signal }) {
    const deadline = requestTimeoutMs > 0 ? AbortSignal.timeout(requestTimeoutMs) : null;
    let requestSignal = signal;
    if (deadline) {
      requestSignal = signal ? AbortSignal.any([signal, deadline]) : deadline;
    }
    let res;
    let text;
    try {
      res = await fetch(`${base}${path}`, {
        method: "POST",
        headers,
        body,
        signal: requestSignal,
      });
      // Reading the body can fail too (connection reset, deadline), so it
      // shares the error handling below.
      text = await res.text();
    } catch (err) {
      // A cancellation requested by the caller is not a relay failure.
      if (signal?.aborted) throw err;
      if (deadline?.aborted) {
        const timeoutErr = new Error(
          `Relay ${path} timed out after ${requestTimeoutMs}ms`,
          { cause: err }
        );
        recordRelayError(timeoutErr.message);
        throw timeoutErr;
      }
      recordRelayError(err?.message || String(err));
      throw err;
    }
    // Non-JSON bodies (proxy error pages, empty replies) are expected
    // here; they simply leave `parsed` null.
    let parsed = null;
    try {
      parsed = text ? JSON.parse(text) : null;
    } catch {}
    if (parsed && (typeof parsed.credits_remaining === "number" || parsed.tier)) {
      updateRelayState(parsed);
    }
    if (!res.ok) {
      const msg = describeFailure(parsed, text, res.status);
      const err = new Error(`Relay ${path} ${res.status}: ${msg}`);
      err.status = res.status;
      err.envelope = parsed;
      if (!parsed) recordRelayError(msg);
      throw err;
    }
    // A 2xx reply without a JSON object breaks the envelope contract;
    // fail with a clear message instead of letting callers crash on
    // `envelope.credits_remaining`.
    if (!parsed || typeof parsed !== "object") {
      const msg = `Relay ${path} returned a ${res.status} response without a JSON envelope`;
      recordRelayError(msg);
      throw new Error(msg);
    }
    return parsed;
  }

  return {
    name: NAME,
    capabilities: {
      transcribe: true,
      analyze: true,
      // The relay's model catalog is internal — Recap doesn't pick.
      // listModels: false signals the picker to skip the dropdown.
      listModels: false,
    },
    listAnalysisModels() {
      return [...RELAY_ANALYSIS_MODELS];
    },
    listTranscriptionModels() {
      return [...RELAY_TRANSCRIPTION_MODELS];
    },
    // Uploads one audio file to /relay/transcribe as multipart form data
    // and converts the reply into "[time] text" lines.
    //
    // Params: filePath, mimeType, optional title/channel/description/
    // chapters hints (forwarded as form fields when non-empty),
    // offsetSeconds, onProgress, signal, jobId.
    // Resolves to { text, usage, cost, finishReason, blockReason, raw }.
    async transcribeAudio({
      filePath,
      mimeType,
      titleHint,
      channelHint = "",
      descriptionHint = "",
      chaptersHint = [],
      offsetSeconds = 0,
      onProgress = () => {},
      signal,
      jobId,
    }) {
      onProgress(
        `Uploading audio${offsetSeconds > 0 ? ` (offset ${formatTime(offsetSeconds)})` : ""} to relay (${base})...`
      );
      const start = Date.now();
      // Use multipart form encoding so the audio binary doesn't have
      // to be base64-blown-up. Node 20+ provides global FormData. The
      // audio is read fully into memory by fileToBlob(); the resulting
      // Blob can be re-sent as-is on every retry attempt.
      const form = new FormData();
      const blob = await fileToBlob(filePath, mimeType);
      form.append("audio", blob, "audio.bin");
      form.append("mime_type", mimeType || "application/octet-stream");
      if (titleHint) form.append("title", titleHint);
      if (channelHint) form.append("channel", channelHint);
      if (descriptionHint) form.append("description", descriptionHint);
      if (Array.isArray(chaptersHint) && chaptersHint.length > 0) {
        form.append("chapters", JSON.stringify(chaptersHint));
      }
      form.append("offset_seconds", String(offsetSeconds));
      const envelope = await retryAPI(
        () =>
          postRelay({
            path: "/relay/transcribe",
            body: form,
            // No Content-Type here: fetch derives the multipart boundary
            // from the FormData body.
            headers: buildHeaders({ jobId }),
            signal,
          }),
        {
          retries: 2,
          delayMs: 5000,
          label: `Relay transcribe${offsetSeconds > 0 ? ` (chunk@${formatTime(offsetSeconds)})` : ""}`,
          log: (msg) => onProgress(msg),
        }
      );
      const elapsed = ((Date.now() - start) / 1000).toFixed(1);
      const remaining =
        typeof envelope.credits_remaining === "number"
          ? `, ${envelope.credits_remaining} credits left`
          : "";
      onProgress(`Relay transcribe complete in ${elapsed}s${remaining}`);
      // Relay's transcribe result shape: { text, segments?: [{start,
      // end, text}], duration? }. We don't fabricate segment timestamps
      // here — the orchestration layer's synthesizeEntriesFromText
      // handles single-segment-only responses.
      const result = envelope.result || {};
      const segments = Array.isArray(result.segments) ? result.segments : [];
      const lines = segments.length
        ? segments.map(
            (s) => `[${formatTime(s.start || 0)}] ${(s.text || "").trim()}`
          )
        : [`[0:00] ${(result.text || "").trim()}`];
      const text = lines.join("\n");
      // Cost from Recap's POV is always zero — credits are the unit,
      // and they're tracked separately. The orchestration layer's
      // cost-summing code keeps working unchanged.
      const cost = zeroCost({
        inputTokens: 0,
        outputTokens: 0,
        thinkingTokens: 0,
      });
      return {
        text,
        usage: { inputTokens: 0, outputTokens: 0, thinkingTokens: 0, totalTokens: 0 },
        cost,
        finishReason: null,
        blockReason: "none",
        raw: envelope,
      };
    },
    // Peek at the install's current credit balance without charging.
    // Used by /api/relay/status to populate the picker banner on boot
    // (or on demand) so the user sees real numbers before running any
    // summarize. Short timeout — if the relay is unreachable we want
    // the UI to fall back to "balance unknown" quickly, not hang.
    //
    // Resolves to the parsed response body (null when the body is empty
    // or not JSON). Rejects on non-2xx, network failure, timeout, or
    // caller cancellation.
    async pingBalance({ timeoutMs = 5000, signal } = {}) {
      const headers = buildHeaders({});
      const ac = new AbortController();
      // Remember whether OUR timer fired so a caller-initiated abort is
      // not misreported as a relay timeout.
      let timedOut = false;
      const timer = setTimeout(() => {
        timedOut = true;
        ac.abort();
      }, timeoutMs);
      const combined = signal
        ? AbortSignal.any([signal, ac.signal])
        : ac.signal;
      try {
        const res = await fetch(`${base}/relay/balance`, {
          method: "GET",
          headers,
          signal: combined,
        });
        const text = await res.text();
        let parsed = null;
        try {
          parsed = text ? JSON.parse(text) : null;
        } catch {}
        if (parsed && (typeof parsed.credits_remaining === "number" || parsed.tier)) {
          updateRelayState(parsed);
        }
        if (!res.ok) {
          const msg = describeFailure(parsed, text, res.status);
          recordRelayError(msg);
          const err = new Error(`Relay /balance ${res.status}: ${msg}`);
          err.status = res.status;
          throw err;
        }
        return parsed;
      } catch (err) {
        // Errors carrying `.status` were already recorded above.
        if (!err?.status) {
          if (timedOut) {
            recordRelayError(`balance ping timed out after ${timeoutMs}ms`);
          } else if (!signal?.aborted) {
            recordRelayError(err?.message || String(err));
          }
          // else: the caller cancelled — nothing wrong with the relay.
        }
        throw err;
      } finally {
        clearTimeout(timer);
      }
    },
    // Sends the prompt to /relay/analyze as JSON.
    //
    // Params: prompt, onProgress, retries (attempt budget passed to
    // retryAPI), signal, jobId.
    // Resolves to { text, usage, cost, finishReason, raw }; text is "" when
    // the envelope's result has no string `text`.
    async analyzeText({
      prompt,
      onProgress = () => {},
      retries = 2,
      signal,
      jobId,
    }) {
      const start = Date.now();
      const headers = buildHeaders({
        extra: { "Content-Type": "application/json" },
        jobId,
      });
      const envelope = await retryAPI(
        () =>
          postRelay({
            path: "/relay/analyze",
            body: JSON.stringify({ prompt }),
            headers,
            signal,
          }),
        {
          retries,
          delayMs: 5000,
          label: "Relay analyze",
          log: (msg) => onProgress(msg),
        }
      );
      const elapsed = ((Date.now() - start) / 1000).toFixed(1);
      const remaining =
        typeof envelope.credits_remaining === "number"
          ? `, ${envelope.credits_remaining} credits left`
          : "";
      onProgress(`Relay analyze complete in ${elapsed}s${remaining}`);
      const result = envelope.result || {};
      const text = typeof result.text === "string" ? result.text : "";
      const cost = zeroCost({
        inputTokens: 0,
        outputTokens: 0,
        thinkingTokens: 0,
      });
      return {
        text,
        usage: { inputTokens: 0, outputTokens: 0, thinkingTokens: 0, totalTokens: 0 },
        cost,
        finishReason: null,
        raw: envelope,
      };
    },
  };
}
// Loads a file from disk and wraps its bytes in a Blob tagged with the
// given MIME type, ready for a FormData upload. The whole file is read
// into a Buffer first, because Node's global Blob/File cannot be built
// from a stream the way a browser File can. Acceptable for typical
// podcast chunk sizes (~50 MB at most after the orchestration layer's
// 45-min split). Falls back to application/octet-stream when no MIME
// type is supplied.
async function fileToBlob(filePath, mimeType) {
  const fsModule = await import("fs");
  const contents = await fsModule.promises.readFile(filePath);
  const type = mimeType || "application/octet-stream";
  return new Blob([contents], { type });
}
+180
View File
@@ -0,0 +1,180 @@
// Whisper provider — transcription via any OpenAI-Audio-Transcription-API-
// compatible endpoint. OpenAI's audio.transcriptions.create wire format
// is the de facto standard; whisper.cpp's HTTP server, faster-whisper-
// server, NVIDIA Parakeet behind speaches, Groq's Whisper API, and most
// other self-hosted implementations honor it. So this provider is
// effectively "OpenAI for transcription with a custom baseURL" —
// distinct from the `openai` provider so users can wire a self-hosted
// transcription engine alongside their cloud OpenAI key (used for GPT
// analysis).
//
// Implementation note: although the wire format matches OpenAI's, this
// provider has its OWN transcribeAudio (rather than reusing the OpenAI
// provider's). Reasons:
// - Log messages should say "Whisper at host:port (model)" not
// "OpenAI Whisper" — Parakeet/whisper.cpp behind a custom URL is
// not "OpenAI" and showing that in logs is misleading.
// - No 25 MB chunk cap. Self-hosted Whisper / Parakeet typically
// handles much larger inputs than the OpenAI cloud API.
// - Zero per-minute cost reporting (self-hosted by definition).
import { createReadStream } from "fs";
import OpenAI from "openai";
import { retryAPI, formatTime } from "../util.js";
const FALLBACK_MODEL = "whisper-1";
// Builds a transcription-only provider for any endpoint that speaks the
// OpenAI audio.transcriptions wire format.
//
// Options:
//   apiKey        - optional; a placeholder is sent when omitted.
//   baseURL       - required endpoint (e.g. http://localhost:8000/v1).
//   defaultModels - optional list of model ids offered by the picker;
//                   anything that is not an array is treated as empty.
//   timeoutMs     - request timeout handed to the SDK client (default 15 min).
//
// Returns a provider object exposing name, capabilities, the model
// listings, transcribeAudio() and an analyzeText() that always throws.
// Throws when baseURL is missing.
export function createWhisperProvider({
  apiKey,
  baseURL,
  defaultModels = [],
  timeoutMs = 900_000,
} = {}) {
  if (!baseURL) {
    throw new Error(
      "createWhisperProvider: baseURL is required (e.g. http://localhost:8000/v1)"
    );
  }
  // Self-hosted Whisper servers commonly skip auth — pass a sentinel
  // string so the SDK's authorization header is well-formed.
  const client = new OpenAI({
    apiKey: apiKey || "no-auth",
    baseURL,
    timeout: timeoutMs,
  });
  // Pretty-print the host for log messages: strip protocol, ignore /v1
  // suffix, trim trailing slash.
  const displayHost = baseURL
    .replace(/^https?:\/\//, "")
    .replace(/\/v\d+\/?$/, "")
    .replace(/\/$/, "");
  // The parameter default only covers `undefined`; guard against an
  // explicit null (or other non-array) as well.
  const configuredModels = Array.isArray(defaultModels) ? defaultModels : [];
  // HTTP statuses that cannot be caused by the optional request params:
  // auth failures and rate limiting. Retrying with the bare request shape
  // would only repeat the failure (and, for 429, add load).
  const STATUSES_UNRELATED_TO_REQUEST_SHAPE = new Set([401, 403, 429]);

  return {
    name: "whisper",
    capabilities: {
      transcribe: true,
      analyze: false,
      listModels: configuredModels.length > 0,
    },
    listTranscriptionModels() {
      return configuredModels.length > 0 ? [...configuredModels] : [FALLBACK_MODEL];
    },
    listAnalysisModels() {
      return [];
    },
    // Transcribes one audio file and returns "[time] text" lines.
    //
    // Params: filePath, model, offsetSeconds (only used in log labels),
    // onProgress (one-line status callback), signal (AbortSignal forwarded
    // to the SDK call).
    // Resolves to { text, usage, cost, finishReason, blockReason, raw }.
    async transcribeAudio({
      filePath,
      model = FALLBACK_MODEL,
      offsetSeconds = 0,
      onProgress = () => {},
      signal,
    }) {
      // Use the model + host directly in the log — "Whisper" was
      // misleading when a user wires up Parakeet (or any non-Whisper
      // model) at a custom endpoint.
      onProgress(
        `Uploading audio${offsetSeconds > 0 ? ` (offset ${formatTime(offsetSeconds)})` : ""} to ${model} at ${displayHost}...`
      );
      const start = Date.now();
      // Try the rich request first (verbose_json + per-segment
      // timestamps — needed to render the transcript with timestamps
      // and let the analysis step build sections). If the wrapper
      // rejects those params (some Whisper-API-compatible servers,
      // including some Parakeet wrappers, don't implement them and
      // return 500), retry once with the bare-bones request shape.
      let result;
      let usedFallbackShape = false;
      try {
        result = await retryAPI(
          // A fresh read stream per attempt: a consumed stream cannot be
          // replayed.
          () =>
            client.audio.transcriptions.create(
              {
                file: createReadStream(filePath),
                model,
                response_format: "verbose_json",
                timestamp_granularities: ["segment"],
              },
              signal ? { signal } : undefined
            ),
          {
            retries: 2,
            delayMs: 5000,
            label: `${model} transcription${offsetSeconds > 0 ? ` (chunk@${formatTime(offsetSeconds)})` : ""}`,
            log: (msg) => onProgress(msg),
          }
        );
      } catch (richErr) {
        const richStatus = richErr?.status || 0;
        // Only fall back on HTTP errors where the params themselves can
        // be the culprit. Connection / timeout / abort errors (no status)
        // and auth or rate-limit responses are rethrown untouched.
        const isHttpError = richStatus >= 400 && richStatus < 600;
        if (!isHttpError || STATUSES_UNRELATED_TO_REQUEST_SHAPE.has(richStatus)) {
          throw richErr;
        }
        onProgress(
          `Rich-request failed (status ${richStatus}); retrying with bare request shape (no verbose_json, no segment timestamps)...`
        );
        usedFallbackShape = true;
        result = await retryAPI(
          () =>
            client.audio.transcriptions.create(
              {
                file: createReadStream(filePath),
                model,
              },
              signal ? { signal } : undefined
            ),
          {
            retries: 2,
            delayMs: 5000,
            label: `${model} transcription (fallback)${offsetSeconds > 0 ? ` (chunk@${formatTime(offsetSeconds)})` : ""}`,
            log: (msg) => onProgress(msg),
          }
        );
      }
      const elapsed = ((Date.now() - start) / 1000).toFixed(1);
      onProgress(
        `${model} transcription complete in ${elapsed}s${usedFallbackShape ? " (bare request — no segment timestamps)" : ""}`
      );
      // One "[time] text" line per segment; without segments the whole
      // text becomes a single line at [0:00].
      const segments = Array.isArray(result.segments) ? result.segments : [];
      const lines = segments.length
        ? segments.map((s) => `[${formatTime(s.start || 0)}] ${(s.text || "").trim()}`)
        : [`[0:00] ${(result.text || "").trim()}`];
      const text = lines.join("\n");
      // Self-hosted Whisper / Parakeet are free at the API layer
      // (you've already paid for the hardware), so zero cost.
      const cost = {
        inputTokens: 0,
        outputTokens: 0,
        thinkingTokens: 0,
        totalTokens: 0,
        inputCost: "0.000000",
        outputCost: "0.000000",
        thinkingCost: "0.000000",
        totalCost: "0.000000",
        totalCostDisplay: "$0.0000",
      };
      return {
        text,
        usage: { inputTokens: 0, outputTokens: 0, thinkingTokens: 0, totalTokens: 0 },
        cost,
        finishReason: null,
        blockReason: "none",
        raw: result,
      };
    },
    // Present only so the provider interface is uniform; always rejects.
    async analyzeText() {
      throw new Error(
        "Whisper provider is transcription-only. Use a different provider (Gemini / Anthropic / OpenAI / Ollama / OpenAI-compatible) for analysis."
      );
    },
  };
}
+18
View File
@@ -0,0 +1,18 @@
// Hardcoded default relay URL. Recap users never see or configure this
// — it's baked into the build so Grant (operator) controls relay
// routing entirely through Recap version updates. Bump this constant
// in a new Recap release to point everyone at a different relay host.
//
// Empty string disables the relay path entirely (the "Relay (comped
// credits)" picker option will fail with "baseURL required" — caught
// by the provider-not-configured guard upstream).
//
// Override at runtime via the RECAP_RELAY_BASE_URL env var for local
// dev testing only — there's no StartOS action exposed for this, so
// production installs always use the hardcoded value.
const DEFAULT_RELAY_BASE_URL = "https://relay.keysat.xyz";

// Resolves the relay base URL: a non-blank RECAP_RELAY_BASE_URL
// environment variable (whitespace-trimmed) takes precedence; otherwise
// the hardcoded default is returned.
export function getRelayBaseURL() {
  const raw = process.env.RECAP_RELAY_BASE_URL;
  const override = raw ? raw.trim() : "";
  if (override) {
    return override;
  }
  return DEFAULT_RELAY_BASE_URL;
}
+52
View File
@@ -0,0 +1,52 @@
// In-memory cache of the most recent relay-reported credit balance + tier.
// Updated every time a relay provider call lands (success or 4xx error
// that includes the standard envelope). Exposed via /api/relay/status
// so the UI can render the "N credits remaining · Tier: X" banner
// without re-hitting the relay just for status.
//
// Not persisted to disk — the relay is the source of truth. We just cache
// the last response so the UI doesn't have to wait for the next request
// to refresh the display. On a fresh boot the cache is empty until the
// first /api/relay/status call, which can optionally probe the relay.
// Module-level cache of the latest relay status.
//   creditsRemaining: number | null
//   tier:             "core" | "pro" | "max" | null
//   lastUpdated:      ms-epoch | null
//   lastError:        string | null
let lastSnapshot = {
  creditsRemaining: null,
  tier: null,
  lastUpdated: null,
  lastError: null,
};

// Folds a parsed relay envelope ({ credits_remaining, tier, ... }) into
// the cache. Non-object input is ignored entirely. Each field is copied
// only when it has the expected type; any accepted envelope stamps
// lastUpdated and clears the previously recorded error.
export function updateRelayState(envelope) {
  const isEnvelope = Boolean(envelope) && typeof envelope === "object";
  if (!isEnvelope) {
    return;
  }
  const { credits_remaining: credits, tier } = envelope;
  if (typeof credits === "number") {
    lastSnapshot.creditsRemaining = credits;
  }
  if (typeof tier === "string") {
    lastSnapshot.tier = tier;
  }
  lastSnapshot.lastError = null;
  lastSnapshot.lastUpdated = Date.now();
}
// Record a relay error (network failure, 5xx with no envelope, etc.).
// Surfaced in the UI status so the user knows the balance display is stale.
//
// `message` is normally a string, but relay error bodies can carry a
// structured `error` value, so anything else is converted to text rather
// than crashing on `.slice`: Error-like objects contribute their
// `.message`, other values are JSON-encoded. Falsy input is stored as
// "Unknown relay error". The stored text is capped at 300 characters.
export function recordRelayError(message) {
  let text = "";
  if (typeof message === "string") {
    text = message;
  } else if (message) {
    if (typeof message.message === "string") {
      text = message.message;
    } else {
      try {
        text = JSON.stringify(message) ?? String(message);
      } catch {
        // Circular structures / BigInt: fall back to plain coercion.
        text = String(message);
      }
    }
  }
  lastSnapshot.lastError = (text || "Unknown relay error").slice(0, 300);
  lastSnapshot.lastUpdated = Date.now();
}
// Returns a shallow copy of the cached relay status, so callers cannot
// mutate the module-level snapshot through the returned object.
export function getRelayState() {
  const copy = Object.assign({}, lastSnapshot);
  return copy;
}
// Replaces the cached relay status with a fresh all-null snapshot.
// Objects previously handed out by getRelayState() are unaffected.
export function resetRelayState() {
  const blank = {
    creditsRemaining: null,
    tier: null,
    lastUpdated: null,
    lastError: null,
  };
  lastSnapshot = blank;
}
+112 -6
View File
@@ -113,18 +113,50 @@ export function fetchUrl(url) {
});
}
// ── Retry helper for transient Gemini API errors ────────────────────────────
// Retries on 503/429 and on common transient network errors. Linear backoff
// (delayMs * attempt). The optional `log` callback receives a one-line
// status message per retry — useful for streaming progress to a UI.
export async function retryGemini(fn, { retries = 3, delayMs = 3000, label = "Gemini call", log: logFn } = {}) {
// ── Retry helper for transient API errors ──────────────────────────────────
// Retries on 503/429/529 and on common transient network errors. Linear
// backoff (delayMs * attempt). The optional `log` callback receives a
// one-line status message per retry — useful for streaming progress to a
// UI. Provider-neutral: error shapes from @google/genai, @anthropic-ai/sdk,
// openai, and raw fetch all expose `.status` (or message text) we can match.
export async function retryAPI(fn, { retries = 3, delayMs = 3000, label = "API call", log: logFn } = {}) {
let lastErr;
for (let attempt = 1; attempt <= retries; attempt++) {
// Announce each retry attempt (attempt 2 onward — the first attempt
// is not logged here) so the user sees that a retry is in flight
// rather than a frozen-looking activity log between "failed,
// retrying in 5s" and the final outcome.
if (attempt > 1 && logFn) {
logFn(`Retrying ${label}... (attempt ${attempt}/${retries})`);
}
try {
return await fn();
} catch (err) {
// User-cancelled requests must not be retried — re-throw so the
// outer handler can treat it as a clean cancellation rather than
// letting the retry loop log noise and burn time.
if (err?.name === "AbortError" || /aborted|operation was aborted/i.test(err?.message || "")) {
throw err;
}
lastErr = err;
const msg = err?.message || String(err);
const status = err?.status || err?.httpStatusCode || 0;
const isRetryable = status === 503 || status === 429 || /overloaded|unavailable|capacity|high demand|rate limit|fetch failed|ECONNRESET|ETIMEDOUT|socket hang up|network/i.test(msg);
const isRetryable = status === 503 || status === 429 || status === 529 || /overloaded|unavailable|capacity|high demand|rate limit|fetch failed|ECONNRESET|ETIMEDOUT|socket hang up|network/i.test(msg);
// Dump every detail we can pry out of the error so generic
// messages like "500 status code (no body)" become debuggable
// server-side. Anthropic/OpenAI SDK errors expose .response,
// .body, .headers, .cause; Node stream errors expose .code.
const richDetail = {
status,
code: err?.code,
type: err?.type,
body: err?.body || err?.response?.body || err?.error,
cause: err?.cause?.message || err?.cause?.code || err?.cause,
};
console.error(
`[retryAPI] ${label} failed (attempt ${attempt}/${retries}, status=${status || "n/a"}): ${msg}`,
JSON.stringify(richDetail, (_k, v) => (typeof v === "bigint" ? v.toString() : v))
);
if (isRetryable && attempt < retries) {
const waitSec = (delayMs * attempt / 1000).toFixed(0);
if (logFn) logFn(`${label} failed (${status || "error"}), retrying in ${waitSec}s... (attempt ${attempt}/${retries})`);
@@ -134,4 +166,78 @@ export async function retryGemini(fn, { retries = 3, delayMs = 3000, label = "Ge
}
}
}
throw lastErr;
}
// Back-compat alias: pre-existing call sites used `retryGemini`. Keep
// the name working so this rename is non-breaking.
export const retryGemini = retryAPI;
// Split a plain-text transcript into synthetic sentence-based entries
// with interpolated timestamps. Used when a transcription provider
// returns just text (no per-segment timing) — e.g. NVIDIA Parakeet
// behind an OpenAI-compatible wrapper. Without this, the entire
// transcript lands in one entry at [0:00] and the analyzer can only
// produce a single section spanning the whole audio.
//
// Strategy:
// 1. Split on sentence terminators (. ! ?). Keep the punctuation.
//    A line that ends without a terminator is kept as its own chunk.
// 2. If that yields at most one chunk, fall back to 30-word chunks.
// 3. Merge each short chunk (< 40 chars) with the chunk that follows it.
// 4. Distribute timestamps proportionally by character count —
//    sentence N starts at (cum_chars_so_far / total_chars) *
//    audio_duration. Not perfectly accurate, but good enough to
//    let the analyzer carve out coherent topic sections.
//
// Params:
//   text                 - transcript text; null/undefined is treated as "".
//   totalDurationSeconds - audio length used to scale the offsets.
// Returns an array of { offset, text, duration } (seconds). When the text
// is empty or the duration is missing / non-positive, a single entry at
// offset 0 holding the trimmed text is returned.
export function synthesizeEntriesFromText(text, totalDurationSeconds) {
  const t = (text || "").trim();
  if (!t || !totalDurationSeconds || totalDurationSeconds <= 0) {
    return [{ offset: 0, text: t, duration: totalDurationSeconds || 0 }];
  }
  // Sentence split — keep the terminator on each sentence. The `m` flag
  // lets `$` match at every line end, so an unterminated line followed by
  // a newline is captured by the second alternative instead of being
  // silently dropped from the transcript.
  const sentenceMatches = t.match(/[^.!?\n]+[.!?]+|[^.!?\n]+$/gm) || [];
  let chunks = sentenceMatches.map((s) => s.trim()).filter(Boolean);
  // If we couldn't find sentence boundaries (unpunctuated transcript),
  // fall back to fixed-size word chunks.
  if (chunks.length <= 1) {
    const words = t.split(/\s+/).filter(Boolean);
    if (words.length <= 1) {
      return [{ offset: 0, text: t, duration: totalDurationSeconds }];
    }
    const wordsPerChunk = 30;
    chunks = [];
    for (let i = 0; i < words.length; i += wordsPerChunk) {
      chunks.push(words.slice(i, i + wordsPerChunk).join(" "));
    }
  }
  // Coalesce extremely short sentences (single words like "Yeah." or
  // "Right."): while the most recent chunk is still under the minimum
  // length, the next chunk is appended to it, so we don't end up with
  // hundreds of useless 5-char entries. A short final chunk has nothing
  // after it and stays on its own.
  const COALESCE_MIN_CHARS = 40;
  const coalesced = [];
  for (const c of chunks) {
    const lastIndex = coalesced.length - 1;
    if (lastIndex >= 0 && coalesced[lastIndex].length < COALESCE_MIN_CHARS) {
      coalesced[lastIndex] = `${coalesced[lastIndex]} ${c}`.trim();
    } else {
      coalesced.push(c);
    }
  }
  // Distribute timestamps proportionally by character length. The `|| 1`
  // guards the division below against a zero total.
  const totalChars = coalesced.reduce((sum, c) => sum + c.length, 0) || 1;
  const entries = [];
  let cumChars = 0;
  for (const c of coalesced) {
    const startRatio = cumChars / totalChars;
    cumChars += c.length;
    const endRatio = cumChars / totalChars;
    entries.push({
      offset: startRatio * totalDurationSeconds,
      text: c,
      // Floor at 0.1s so no entry reports a zero-length span.
      duration: Math.max(0.1, (endRatio - startRatio) * totalDurationSeconds),
    });
  }
  return entries;
}