Pluggable AI providers, relay credit system, picker UX overhaul

Captures roughly forty version bumps (v0.2.6 → v0.2.47) of work that
accumulated without commits.

- Pluggable provider system under server/providers/: gemini, anthropic,
  openai, openai-compatible, ollama, whisper-compatible, relay. Mix and
  match transcription + analysis per request via the picker UI.
- Relay backend integration. Hardcoded relay URL in server/relay-default.js
  (operator-controlled at build time, not user-configurable). New
  /api/relay/{status,policy} endpoints proxy to the relay; balance pings
  populate a cached credit display.
- Per-install identity in server/install-id.js for relay credit accounting.
  Sent to the relay as X-Recap-Install-Id; persists across upgrades, lost
  on a full uninstall + reinstall. Not surfaced in the UI.
- Admin login gate (server/admin-auth.js + setAdminPassword action). Scrypt
  password hash + HMAC-signed session cookie.
- Entitlement scheme rename: pro / max (each paired with subscriptions and
  relay_pro / relay_max), replacing the misleading "core" entitlement
  that conflicted with the user-facing "Core" tier name.
- Activation screen: dynamic credit count pulled from /api/relay/policy,
  "Skip — use free mode" button, accurate paid-feature list.
- Top toolbar: inline credit-balance pill (or "BYO configured" fallback),
  Upgrade + "I have a key" buttons.
- Picker UI: per-provider sections with Save/Test/Delete buttons, sections
  collapsible by chevron, default-collapsed unless currently selected,
  "Use comped credits (reset to relay)" link when the user has strayed,
  green hint under inputs whose values are server-configured.
- Activity log: chevron-collapsible groups per video, refresh-survival via
  localStorage + a 500-entry server-side buffer, explicit Clear button.
- YouTube captions fast-path with user toggle (skips audio download + AI
  transcription when captions are available — uncheck for speaker labels).
- Cancel button: AbortController plumbed through every provider SDK call;
  retryAPI short-circuits on AbortError; cancellation events surface in
  the activity log instead of silent retries.
- Long-video analysis: auto-coalesce transcript entries before building the
  analysis prompt so local-model context windows (32k-ish) don't overflow.
  Original entries preserved for transcript display via an index map; the
  analyzer sees a coarser view but click-to-seek timestamps stay precise.
- StartOS action grouping (Setup / AI Providers) so the actions list is
  navigable.
- Manifest description rewritten to reflect multi-provider support and
  free-tier relay credits.
- Smaller fixes: summarize-button enablement no longer requires a Gemini
  key when other providers are configured; analysis fallback chain handles
  context-length and 503 capacity errors; single-segment expansion for
  providers that don't return per-segment timestamps (Parakeet et al.);
  many other UX polish items.
This commit is contained in:
Keysat
2026-05-11 23:46:20 -05:00
parent 2544cf7dde
commit 373d10595b
79 changed files with 6322 additions and 397 deletions
+116
View File
@@ -0,0 +1,116 @@
// Anthropic (Claude) provider — analysis only.
//
// Claude does not natively transcribe audio, so transcribeAudio() throws.
// Mix-and-match users can pair this provider for analysis with Gemini
// (or OpenAI Whisper) for transcription.
//
// Pricing reflects standard-context rates as of 2026-04-29 (cached in
// the claude-api skill). Update when Anthropic changes published rates.
import Anthropic from "@anthropic-ai/sdk";
import { retryAPI } from "../util.js";
import { formatCost, ratesFor } from "./cost.js";
// Per-1M-token rates in USD, keyed by Anthropic model id. Anthropic does
// not expose a separate "thinking" rate — thinking tokens are billed as
// output, so we let formatCost default thinking → output by omitting the
// thinking field from every row.
export const ANTHROPIC_PRICING = {
"claude-opus-4-7": { input: 5.00, output: 25.00 },
"claude-opus-4-6": { input: 5.00, output: 25.00 },
"claude-sonnet-4-6": { input: 3.00, output: 15.00 },
"claude-haiku-4-5": { input: 1.00, output: 5.00 },
// Fallback for unknown / future models — the row ratesFor() returns when
// the requested model has no entry of its own.
"default": { input: 3.00, output: 15.00 },
};
// Analysis model list. Order = default fallback chain (most capable first).
// listAnalysisModels() hands callers a copy of this array.
export const ANTHROPIC_ANALYSIS_MODELS = [
"claude-opus-4-7",
"claude-opus-4-6",
"claude-sonnet-4-6",
"claude-haiku-4-5",
];
// Analysis output cap, sent as max_tokens on every analyzeText() request.
// Generous — the topic-analysis prompt produces a JSON document scaled to
// transcript length, and truncation here loses trailing sections.
const ANALYSIS_MAX_TOKENS = 16000;
// Build an analysis-only provider backed by the Anthropic Messages API.
//
// `apiKey` is mandatory (throws without it); `timeoutMs` is handed to the
// SDK client as its request timeout. The returned object follows the
// shared provider shape: name, capabilities, the two model-list getters,
// transcribeAudio (always rejects — Claude cannot transcribe) and
// analyzeText.
export function createAnthropicProvider({ apiKey, timeoutMs = 900_000 } = {}) {
  if (!apiKey) {
    throw new Error("createAnthropicProvider: apiKey is required");
  }

  const anthropic = new Anthropic({ apiKey, timeout: timeoutMs });

  // One Messages API round-trip. The SDK takes per-call request options
  // (including an AbortSignal) as its second argument; we only pass them
  // when the caller actually supplied a signal.
  const sendPrompt = (model, prompt, signal) => {
    const body = {
      model,
      max_tokens: ANALYSIS_MAX_TOKENS,
      messages: [{ role: "user", content: prompt }],
    };
    const requestOptions = signal ? { signal } : undefined;
    return anthropic.messages.create(body, requestOptions);
  };

  // Concatenate the text of every "text" content block, skipping any
  // other block types the response may contain.
  const collectText = (blocks) => {
    const pieces = [];
    for (const block of blocks || []) {
      if (block.type === "text") {
        pieces.push(block.text);
      }
    }
    return pieces.join("");
  };

  return {
    name: "anthropic",
    capabilities: {
      transcribe: false,
      analyze: true,
      listModels: true,
    },

    // Copy of the static fallback chain, so callers can't mutate it.
    listAnalysisModels() {
      return ANTHROPIC_ANALYSIS_MODELS.slice();
    },

    listTranscriptionModels() {
      return [];
    },

    async transcribeAudio() {
      throw new Error(
        "Anthropic models do not natively transcribe audio. Use Gemini or OpenAI (Whisper) for the transcription step."
      );
    },

    // Run `prompt` through `model` and normalize the response into
    // { text, usage, cost, finishReason, raw }. Retries are delegated to
    // retryAPI; its log lines are forwarded to onProgress.
    async analyzeText({
      prompt,
      model,
      onProgress = () => {},
      retries = 2,
      signal,
    }) {
      const retryOptions = {
        retries,
        delayMs: 5000,
        label: "Anthropic analysis",
        log: (msg) => onProgress(msg),
      };
      const response = await retryAPI(
        () => sendPrompt(model, prompt, signal),
        retryOptions
      );

      const { input_tokens: promptTokens, output_tokens: replyTokens } =
        response.usage ?? {};
      const usage = {
        inputTokens: promptTokens || 0,
        outputTokens: replyTokens || 0,
        thinkingTokens: 0,
      };

      return {
        text: collectText(response.content),
        usage,
        cost: formatCost(ratesFor(ANTHROPIC_PRICING, model), usage),
        finishReason: response.stop_reason || null,
        raw: response,
      };
    },
  };
}
+66
View File
@@ -0,0 +1,66 @@
// Shared cost-calculation helper for the provider abstraction.
//
// Each provider knows two things:
// 1. Its pricing table (per-1M-token rates per model).
// 2. How to map its native usage shape into the normalized
// { inputTokens, outputTokens, thinkingTokens, totalTokens } shape.
//
// This module then turns (rates, normalized usage) → the cost record
// the rest of the app already understands. Same shape gemini-helpers
// `calcCost` produces, so dashboards / logs don't care which provider
// was used.
// Price a normalized usage record against one model's rate row and
// return the shared cost record.
//
// `rates` is { input, output, thinking? } in USD per 1M tokens; when
// `thinking` is null/undefined, thinking tokens are billed at the output
// rate. `usage` is { inputTokens, outputTokens, thinkingTokens,
// totalTokens }; missing or falsy counts are treated as 0, and a missing
// totalTokens is replaced by the sum of the three counts.
//
// Cost fields are strings fixed to six decimals. totalCostDisplay shows
// four decimals, or a cents figure for totals under one cent.
// NOTE(review): the sub-cent form carries both a "$" prefix and a "¢"
// suffix (e.g. "$0.250¢") — confirm that is the intended display.
export function formatCost(rates, usage) {
  const perMillion = (tokens, rate) => (tokens / 1_000_000) * rate;

  const inputTokens = usage.inputTokens || 0;
  const outputTokens = usage.outputTokens || 0;
  const thinkingTokens = usage.thinkingTokens || 0;

  const inputCost = perMillion(inputTokens, rates.input);
  const outputCost = perMillion(outputTokens, rates.output);
  const thinkingCost = perMillion(thinkingTokens, rates.thinking ?? rates.output);
  const totalCost = inputCost + outputCost + thinkingCost;

  let totalCostDisplay;
  if (totalCost < 0.01) {
    totalCostDisplay = `$${(totalCost * 100).toFixed(3)}¢`;
  } else {
    totalCostDisplay = `$${totalCost.toFixed(4)}`;
  }

  const fixed6 = (amount) => amount.toFixed(6);
  return {
    inputTokens,
    outputTokens,
    thinkingTokens,
    totalTokens: usage.totalTokens || (inputTokens + outputTokens + thinkingTokens),
    inputCost: fixed6(inputCost),
    outputCost: fixed6(outputCost),
    thinkingCost: fixed6(thinkingCost),
    totalCost: fixed6(totalCost),
    totalCostDisplay,
  };
}
// Look up the rate row for `model` in a provider's pricing table.
//
// Resolution order: the model's own row → the table's "default" row →
// an all-zero row. Only the table's OWN properties are consulted, so a
// model name that collides with an inherited Object.prototype member
// ("constructor", "toString", …) falls through to the default row
// instead of returning a function and turning every cost into NaN.
//
// Returns { input, output, thinking? } in USD per 1M tokens.
export function ratesFor(pricingTable, model) {
  const ownRow = (key) =>
    Object.hasOwn(pricingTable, key) ? pricingTable[key] : undefined;
  return ownRow(model) || ownRow("default") || { input: 0, output: 0 };
}
// Cost record for providers that report no charge (Ollama, local,
// openai-compatible without a known pricing table). Token counts are
// carried over from `usage` (missing/falsy counts become 0, a missing
// totalTokens becomes the sum of the three counts); every cost field is
// the fixed zero string.
export function zeroCost(usage = {}) {
  const ZERO = "0.000000";
  const orZero = (count) => count || 0;

  const inputTokens = orZero(usage.inputTokens);
  const outputTokens = orZero(usage.outputTokens);
  const thinkingTokens = orZero(usage.thinkingTokens);
  const summed = inputTokens + outputTokens + thinkingTokens;

  return {
    inputTokens,
    outputTokens,
    thinkingTokens,
    totalTokens: usage.totalTokens || summed,
    inputCost: ZERO,
    outputCost: ZERO,
    thinkingCost: ZERO,
    totalCost: ZERO,
    totalCostDisplay: "$0.0000",
  };
}
+364
View File
@@ -0,0 +1,364 @@
// Gemini provider — wraps @google/genai behind the shared Provider
// interface. Stateless helpers + a per-request factory: each call to
// createGeminiProvider({ apiKey }) returns a provider instance bound to
// that key, mirroring how `new GoogleGenAI({ apiKey })` was used before.
//
// What lives here:
// - SDK init + per-request HTTP timeouts
// - File API upload + processing-state polling
// - generateContent calls for transcription + analysis
// - Empty-response retry loop
// - Safety settings + thinking-config selection
// - Cost calculation (delegated to gemini-helpers.calcCost)
// - Model lists for the two pipelines (transcription vs. analysis)
//
// What does NOT live here (stays in server/index.js as orchestration):
// - Audio chunking decisions + transcript merging
// - Analysis-output JSON parsing
// - Topic-analysis prompt construction (provider-neutral, in
// gemini-helpers.js)
import { GoogleGenAI } from "@google/genai";
import { safeText, retryGemini, formatTime } from "../util.js";
import { calcCost } from "../gemini-helpers.js";
// Models exposed to the analysis fallback chain. Order matters — first
// is the preferred default, the rest are tried in order if it fails.
// listAnalysisModels() returns a copy of this array.
export const GEMINI_ANALYSIS_MODELS = [
"gemini-3.1-pro-preview",
"gemini-3-pro-preview",
"gemini-3-flash-preview",
"gemini-2.5-flash",
];
// Transcription models, in fallback order. Flash is best speed/cost
// for audio → text; 2.5 Flash is the stable previous-gen multimodal
// model and works well as a fallback when Gemini 3 Flash returns 503
// (capacity / overload); 2.0 Flash is the last entry in the chain. The
// orchestration layer in server/index.js iterates this list, retrying
// with the next model when one fails. listTranscriptionModels() returns
// a copy of this array.
export const GEMINI_TRANSCRIPTION_MODELS = [
"gemini-3-flash-preview",
"gemini-2.5-flash",
"gemini-2.0-flash",
];
// Empty-response retries: when the SDK returns 200 with no text (which
// happens periodically with audio inputs), transcribeAudio() makes up to
// this many generateContent attempts in total, waiting 10s × attempt
// number between them (linear backoff), before giving up.
const EMPTY_RETRIES = 3;
// The @google/genai SDK does not accept a per-call AbortSignal, so when
// the user cancels a request we need to interrupt the in-flight promise
// ourselves. Race the SDK call against a promise that rejects when the
// caller's signal aborts — the rejection bubbles up immediately and the
// underlying HTTP request is left to finish (or time out) on its own.
//
// `promise` is the already-started SDK call; `signal` is optional.
//   - No signal: the original promise is returned untouched.
//   - Signal already aborted: rejects with an AbortError right away.
//   - Otherwise: settles with whichever happens first — the promise
//     settling or the signal aborting (AbortError).
// The abort listener is removed as soon as the promise settles so a
// long-lived signal does not accumulate listeners across calls.
function withAbort(promise, signal) {
  if (!signal) return promise;

  const makeAbortError = () =>
    Object.assign(new Error("aborted"), { name: "AbortError" });

  if (signal.aborted) {
    // The SDK call was started before we got here and nobody will await
    // it now. Mark it handled so a later rejection (network error, SDK
    // timeout) does not surface as an unhandledRejection, which takes
    // the whole Node process down under the default settings.
    Promise.resolve(promise).catch(() => {});
    return Promise.reject(makeAbortError());
  }

  return new Promise((resolve, reject) => {
    const onAbort = () => {
      reject(makeAbortError());
    };
    signal.addEventListener("abort", onAbort, { once: true });
    // Both handlers are attached, so a rejection that arrives after an
    // abort is still consumed here (the extra reject() is a no-op).
    Promise.resolve(promise).then(
      (value) => {
        signal.removeEventListener("abort", onAbort);
        resolve(value);
      },
      (error) => {
        signal.removeEventListener("abort", onAbort);
        reject(error);
      }
    );
  });
}
// Safety filters disabled for transcription so the model doesn't refuse
// to transcribe sensitive but legitimate spoken content. Passed as
// config.safetySettings on the transcription generateContent call only;
// the analysis call sends no safetySettings, so it inherits whatever
// Gemini's defaults are.
const TRANSCRIPTION_SAFETY = [
{ category: "HARM_CATEGORY_HARASSMENT", threshold: "BLOCK_NONE" },
{ category: "HARM_CATEGORY_HATE_SPEECH", threshold: "BLOCK_NONE" },
{ category: "HARM_CATEGORY_SEXUALLY_EXPLICIT", threshold: "BLOCK_NONE" },
{ category: "HARM_CATEGORY_DANGEROUS_CONTENT", threshold: "BLOCK_NONE" },
];
// Build a Gemini-backed provider bound to one API key.
//
// `apiKey` is mandatory (throws without it); `timeoutMs` is applied to
// the SDK's HTTP timeout and headers timeout. The returned object
// follows the shared provider shape: name, capabilities, the two
// model-list getters, transcribeAudio and analyzeText.
export function createGeminiProvider({ apiKey, timeoutMs = 900_000 } = {}) {
  if (!apiKey) {
    throw new Error("createGeminiProvider: apiKey is required");
  }
  const ai = new GoogleGenAI({
    apiKey,
    httpOptions: { timeout: timeoutMs, headersTimeout: timeoutMs },
  });
  // Analysis uses the same client — legitimate analysis on long
  // transcripts can genuinely take 3–5+ minutes, so an aggressive
  // timeout cuts off real work. The double-retry-of-overloaded-model
  // waste that 0.2.22 was trying to fix is already handled by
  // retries=1 below: a 503 fast-fails in seconds, and the outer
  // fallback chain (Pro → Pro older → Flash → Flash 2.5) moves
  // on immediately.
  const aiAnalyze = ai;

  // Delay that ends early (with AbortError) when the caller cancels, so
  // Cancel does not have to wait out a polling interval or a retry
  // back-off before it is noticed.
  const pause = (ms, signal) =>
    withAbort(new Promise((resolve) => setTimeout(resolve, ms)), signal);

  const abortError = () =>
    Object.assign(new Error("aborted"), { name: "AbortError" });

  return {
    name: "gemini",
    capabilities: {
      transcribe: true,
      analyze: true,
      listModels: true,
    },
    listAnalysisModels() {
      return [...GEMINI_ANALYSIS_MODELS];
    },
    listTranscriptionModels() {
      return [...GEMINI_TRANSCRIPTION_MODELS];
    },
    // Transcribe a single audio file. The caller handles chunking +
    // merging — this is the atomic unit. Returns:
    //   { text, usage, cost, finishReason, blockReason, raw }
    // `text` is the raw model output (with [MM:SS] markers); the caller
    // parses it into entries. `cost` uses the same shape calcCost
    // already produces, so existing accounting code is unchanged.
    // `finishReason` / `blockReason` describe the LAST generateContent
    // attempt ("UNKNOWN" / "none" when the response does not carry them).
    // Rejects with an AbortError when `signal` aborts, and with the
    // underlying error when upload, processing or generation fails.
    async transcribeAudio({
      filePath,
      mimeType,
      titleHint,
      channelHint = "",
      descriptionHint = "",
      chaptersHint = [],
      model,
      offsetSeconds = 0,
      onProgress = () => {},
      signal,
    }) {
      const upStart = Date.now();
      onProgress(
        `Uploading audio${offsetSeconds > 0 ? ` (offset ${formatTime(offsetSeconds)})` : ""} to Gemini File API...`
      );
      const uploaded = await withAbort(
        ai.files.upload({
          file: filePath,
          config: { mimeType },
        }),
        signal
      );
      const upTime = ((Date.now() - upStart) / 1000).toFixed(1);
      onProgress(`Audio uploaded in ${upTime}s`);

      let f = uploaded;
      // From here on the file exists on Gemini's side, so everything runs
      // under try/finally: the upload is removed whether we return a
      // transcript, fail, or get cancelled.
      try {
        // Wait for the File API to finish ingesting before generation.
        const pStart = Date.now();
        while (f.state === "PROCESSING") {
          if (signal?.aborted) {
            throw abortError();
          }
          const ws = ((Date.now() - pStart) / 1000).toFixed(0);
          onProgress(`Waiting for Gemini to process audio... (${ws}s)`);
          await pause(3000, signal);
          f = await withAbort(ai.files.get({ name: f.name }), signal);
        }
        if (f.state === "FAILED") {
          throw new Error("Gemini failed to process audio file.");
        }
        const pTime = ((Date.now() - pStart) / 1000).toFixed(1);
        onProgress(`Audio processed in ${pTime}s. Transcribing with ${model}...`);

        const prompt = buildTranscriptionPrompt({
          title: titleHint,
          channel: channelHint,
          description: descriptionHint,
          chapters: chaptersHint,
        });
        // thinkingLevel: "minimal" is only valid for Flash. Pro models
        // reject it. Match prior behavior precisely.
        const txConfig = model.includes("flash")
          ? { thinkingConfig: { thinkingLevel: "minimal" } }
          : {};

        let result;
        let finishReason = "UNKNOWN";
        let blockReason = "none";
        for (let attempt = 0; attempt < EMPTY_RETRIES; attempt++) {
          if (signal?.aborted) {
            throw abortError();
          }
          result = await retryGemini(
            () =>
              withAbort(
                ai.models.generateContent({
                  model,
                  config: {
                    ...txConfig,
                    safetySettings: TRANSCRIPTION_SAFETY,
                  },
                  contents: [
                    {
                      role: "user",
                      parts: [
                        { fileData: { fileUri: f.uri, mimeType } },
                        { text: prompt },
                      ],
                    },
                  ],
                }),
                signal
              ),
            {
              retries: 3,
              delayMs: 5000,
              label: `Transcription${offsetSeconds > 0 ? ` (chunk@${formatTime(offsetSeconds)})` : ""}`,
              log: (msg) => onProgress(msg),
            }
          );
          // Record the reasons for THIS attempt before deciding whether
          // to stop, so a successful response reports its own
          // finishReason rather than "UNKNOWN" or a value left over from
          // an earlier empty attempt.
          finishReason = result?.candidates?.[0]?.finishReason || "UNKNOWN";
          blockReason = result?.promptFeedback?.blockReason || "none";
          if (safeText(result)) break;

          onProgress(
            `⚠ Empty response (attempt ${attempt + 1}/${EMPTY_RETRIES}) — finishReason: ${finishReason}, blockReason: ${blockReason}`
          );
          if (attempt < EMPTY_RETRIES - 1) {
            const waitSec = 10 * (attempt + 1);
            onProgress(`Waiting ${waitSec}s before retry...`);
            await pause(waitSec * 1000, signal);
          }
        }

        const usage = result.usageMetadata || {};
        const cost = calcCost(model, usage);
        return {
          text: safeText(result) || "",
          usage,
          cost,
          finishReason,
          blockReason,
          // Pass-through for callers that still want the raw SDK response
          // (e.g. existing logging code). Will be removed once nothing
          // depends on it.
          raw: result,
        };
      } finally {
        // Best-effort cleanup of the uploaded file. Failure here is
        // harmless — Gemini garbage-collects on its own schedule — so
        // errors are deliberately ignored and never mask the real outcome.
        try {
          await ai.files.delete({ name: f.name });
        } catch {}
      }
    },
    // Generate text from a prompt (no audio). Used by the topic-analysis
    // step today, but generic enough for any text→text model call.
    // Returns: { text, usage, cost, finishReason, raw }
    async analyzeText({
      prompt,
      model,
      onProgress = () => {},
      // Default to 1 attempt (no per-model retry). Analysis-step 503s
      // ("model overloaded") almost never clear in 5–10 seconds —
      // they're capacity-shaped, not transient-blip-shaped. Better
      // UX: fail fast on a single model and let the outer fallback
      // chain in server/index.js walk to the next model (Pro → Pro
      // older → Flash → Flash 2.5) immediately. Caller can override
      // with retries: 2 if they want the old behavior.
      retries = 1,
      signal,
    }) {
      const result = await retryGemini(
        () =>
          withAbort(
            aiAnalyze.models.generateContent({
              model,
              contents: [
                {
                  role: "user",
                  parts: [{ text: prompt }],
                },
              ],
            }),
            signal
          ),
        {
          retries,
          delayMs: 5000,
          label: "Analysis",
          log: (msg) => onProgress(msg),
        }
      );
      const text = safeText(result);
      const usage = result.usageMetadata || {};
      const cost = calcCost(model, usage);
      const finishReason = result?.candidates?.[0]?.finishReason || null;
      return {
        text: text || "",
        usage,
        cost,
        finishReason,
        raw: result,
      };
    },
  };
}
// Transcription prompt — Gemini-specific because it relies on
// timestamp-formatted output we then parse. Other providers may need a
// differently-shaped prompt, so each provider owns its own.
//
// Optional context (title, channel name, video description, YouTube
// chapter markers) is prepended as a metadata preamble, followed by one
// blank line and then the fixed instructions. That context dramatically
// improves speaker-name extraction — most podcast descriptions list host
// and guest by name, channel names are often the host's name, and
// chapter titles sometimes label introductions ("Conversation with John
// Doe"). Without it, the model falls back to "Host"/"Guest".
//
// Descriptions are cut to 1500 characters (plus "…") and at most the
// first 30 chapters are listed. With no context at all the prompt is
// just the instructions.
function buildTranscriptionPrompt({ title, channel, description, chapters } = {}) {
  const DESCRIPTION_LIMIT = 1500;
  const CHAPTER_LIMIT = 30;

  // Render one chapter as "  [M:SS] title"; a non-numeric start_time is
  // treated as 0 and minutes are not folded into hours.
  const renderChapter = (chapter) => {
    const seconds = typeof chapter.start_time === "number" ? chapter.start_time : 0;
    const minutes = Math.floor(seconds / 60);
    const remainder = Math.floor(seconds % 60).toString().padStart(2, "0");
    return `  [${minutes}:${remainder}] ${chapter.title || ""}`;
  };

  const sections = [];
  if (title) {
    sections.push(`Video title: "${title}"\n`);
  }
  if (channel) {
    sections.push(`Channel: ${channel}\n`);
  }
  if (description) {
    // Trim to keep prompt size sane on hours-long podcasts whose
    // descriptions can include full sponsor lists + show notes.
    const clipped =
      description.length > DESCRIPTION_LIMIT
        ? description.slice(0, DESCRIPTION_LIMIT) + "…"
        : description;
    sections.push(`Video description (use to identify speakers by name):\n${clipped}\n`);
  }
  if (Array.isArray(chapters) && chapters.length > 0) {
    const markers = chapters.slice(0, CHAPTER_LIMIT).map((c) => renderChapter(c)).join("\n");
    sections.push(`Chapter markers (titles often name speakers or topics):\n${markers}\n`);
  }

  // A blank line separates the metadata from the instructions, but only
  // when there is metadata to separate.
  const preamble = sections.length > 0 ? `${sections.join("")}\n` : "";

  return `${preamble}Transcribe this audio completely and verbatim. Include timestamps at regular intervals (every 15-30 seconds or at natural pauses).
Format each line as:
[MM:SS] The spoken text here...
Rules:
- Transcribe EVERY word spoken, do not skip or summarize anything.
- Use [MM:SS] or [H:MM:SS] timestamp format at the start of each line.
- Start a new timestamped line every 15-30 seconds or at natural speech pauses.
- Include filler words (um, uh, you know) for accuracy.
- Speaker identification: FIRST consult the metadata above — descriptions and chapter titles usually name the host(s) and guest(s) explicitly, and the channel name is often the host's name. Match those names to the voices in the audio (introductions, "I'm Dax", "this is Will", first-person references) and use them as speaker labels. Format as: [MM:SS] Name: text. Only fall back to "Host"/"Guest" if no names appear in the metadata AND nobody is introduced by name in the audio.
Return ONLY the timestamped transcript, nothing else.`;
}
+154
View File
@@ -0,0 +1,154 @@
// Provider registry. Each provider wraps a single LLM/SDK behind a
// uniform interface (see ./gemini.js for the reference shape). The rest
// of the server talks to providers through getProvider() and never
// imports SDKs directly.
//
// Adding a new provider:
// 1. Create ./<name>.js exporting createXxxProvider({ apiKey, ... }).
// 2. Add it to PROVIDER_NAMES + the switch in getProvider().
// 3. Add the matching opts shape to PROVIDER_KEY_FIELDS so
// resolveProviderOpts() can pull the right key/baseURL out of the
// StartOS config.
// 4. Wire its config field into startos/file-models/config.json.ts
// and add a "Set <Provider> Key" StartOS action.
//
// Capabilities (see provider.capabilities) signal what each one can do.
// Some providers analyze but can't transcribe (Claude, OpenAI-compat,
// Ollama); the orchestration layer in server/index.js can mix providers
// across the transcription + analysis pipelines.
import { createGeminiProvider } from "./gemini.js";
import { createAnthropicProvider } from "./anthropic.js";
import { createOpenAIProvider } from "./openai.js";
import { createOpenAICompatibleProvider } from "./openai-compatible.js";
import { createOllamaProvider } from "./ollama.js";
import { createWhisperProvider } from "./whisper.js";
import { createRelayProvider } from "./relay.js";
import { getInstallId } from "../install-id.js";
import { getRawLicenseKey } from "../license.js";
import { getRelayBaseURL } from "../relay-default.js";
// Every provider name getProvider() can construct. Also used to build
// the "Available: …" hint in getProvider()'s unknown-provider error.
export const PROVIDER_NAMES = [
"gemini",
"anthropic",
"openai",
"openai-compatible",
"ollama",
"whisper",
"relay",
];
// Map provider name → which fields to read from the StartOS config blob
// when resolving its construction opts. Used by resolveProviderOpts().
// `apiKey` / `baseURL` values are the config keys to read; a provider
// that omits one of them never gets that option resolved.
export const PROVIDER_KEY_FIELDS = {
gemini: { apiKey: "gemini_api_key" },
anthropic: { apiKey: "anthropic_api_key" },
openai: { apiKey: "openai_api_key" },
"openai-compatible": {
apiKey: "openai_compatible_api_key",
baseURL: "openai_compatible_base_url",
},
ollama: { baseURL: "ollama_base_url" },
whisper: {
apiKey: "whisper_api_key",
baseURL: "whisper_base_url",
},
// Relay is operator-only — base URL is HARDCODED in
// server/relay-default.js, NOT read from StartOS config. The empty
// object is intentional: resolveProviderOpts looks the provider name
// up in PROVIDER_KEY_FIELDS to recognise it, then the
// relay-specific block at the bottom of resolveProviderOpts
// injects baseURL + installId + licenseKey server-side. Without
// this entry the lookup throws "Unknown provider: relay" before
// reaching the injection block.
relay: {},
};
// Provider name → factory. Each entry is a thin arrow so the factory
// is only looked up when a provider is actually requested.
const PROVIDER_FACTORIES = new Map([
  ["gemini", (opts) => createGeminiProvider(opts)],
  ["anthropic", (opts) => createAnthropicProvider(opts)],
  ["openai", (opts) => createOpenAIProvider(opts)],
  ["openai-compatible", (opts) => createOpenAICompatibleProvider(opts)],
  ["ollama", (opts) => createOllamaProvider(opts)],
  ["whisper", (opts) => createWhisperProvider(opts)],
  ["relay", (opts) => createRelayProvider(opts)],
]);

// Construct the provider registered under `name`, handing `opts`
// (default {}) straight to its factory. Throws for any name that is not
// registered; the message lists the names in PROVIDER_NAMES.
export function getProvider(name, opts = {}) {
  const factory = PROVIDER_FACTORIES.get(name);
  if (factory === undefined) {
    throw new Error(
      `Unknown provider: ${name}. Available: ${PROVIDER_NAMES.join(", ")}`
    );
  }
  return factory(opts);
}
// Pull the construction opts for a provider out of the StartOS config
// blob, optionally overridden per-provider by client-side opts the web
// UI passed in the request body.
//
// `name` is the provider name; it must be an own key of
// PROVIDER_KEY_FIELDS, otherwise this throws "Unknown provider: <name>".
// `config` is the parsed startos-config.json snapshot.
// `clientOpts` is { apiKey?, baseURL?, models? } for THIS provider only —
// typically a value out of req.body.providerOpts[name]. Because it comes
// from a request body it is treated as untrusted: a null/absent object
// or non-string fields are ignored rather than crashing the request.
//
// Resolution priority for each field: client opt → config opt.
// Returns { apiKey?, baseURL?, defaultModels? } as appropriate for the
// provider, plus { installId, licenseKey? } for the relay.
//
// NOTE(review): a client-supplied baseURL is combined with the
// server-configured apiKey when the client sends no key of its own, so
// whoever can post providerOpts can direct the stored key at a URL of
// their choosing — confirm that the admin login gate covers this route.
export function resolveProviderOpts(name, { config = {}, clientOpts = {} } = {}) {
  // Own-property check: a plain bracket lookup would also accept
  // inherited names such as "constructor" or "toString".
  if (!Object.hasOwn(PROVIDER_KEY_FIELDS, name)) {
    throw new Error(`Unknown provider: ${name}`);
  }
  const fields = PROVIDER_KEY_FIELDS[name];
  const serverConfig = config ?? {};
  const client = clientOpts ?? {};
  // Trimmed client value, or "" when the field is missing / not a string.
  const clientString = (value) => (typeof value === "string" ? value.trim() : "");

  const opts = {};
  if (fields.apiKey) {
    const fromConfig = serverConfig[fields.apiKey] || "";
    opts.apiKey = clientString(client.apiKey) || fromConfig;
  }
  if (fields.baseURL) {
    const fromConfig = serverConfig[fields.baseURL] || "";
    opts.baseURL = clientString(client.baseURL) || fromConfig;
    // Last-resort fallback for Ollama: the canonical StartOS internal
    // hostname. Reachable when the optional Ollama dependency is
    // installed alongside Recap on the same StartOS server, even if
    // the user hasn't run the "Set Ollama Server URL" action.
    if (!opts.baseURL && name === "ollama") {
      opts.baseURL = "http://ollama.startos:11434";
    }
  }
  // User-defined model list: providers with dynamic catalogs (ollama,
  // openai-compatible, whisper) accept a comma- or newline-separated
  // list of model names in clientOpts.models. Parse and pass through
  // as `defaultModels` so listTranscriptionModels / listAnalysisModels
  // return the right thing AND so the orchestration layer's fallback
  // chain knows what to walk through if the user's chosen model fails.
  // Blank entries are dropped and duplicates keep their first position.
  if (typeof client.models === "string" && client.models.trim()) {
    const unique = new Set(
      client.models
        .split(/[,\n]/)
        .map((entry) => entry.trim())
        .filter(Boolean)
    );
    if (unique.size > 0) {
      opts.defaultModels = [...unique];
    }
  }
  // Relay-specific injections: baseURL (hardcoded constant or env
  // override) + install-id (always) + license key (when present).
  // None of these come from clientOpts — relay identity + endpoint
  // must not be spoofable from a request body.
  if (name === "relay") {
    opts.baseURL = getRelayBaseURL();
    opts.installId = getInstallId();
    const rawKey = getRawLicenseKey();
    if (rawKey) opts.licenseKey = rawKey;
  }
  return opts;
}
+125
View File
@@ -0,0 +1,125 @@
// Ollama provider — analysis only, raw HTTP to a local Ollama server.
//
// Ollama runs LLMs locally; there is no per-request cost. Default
// baseURL is the conventional `http://localhost:11434`. Users on a
// LAN-hosted Ollama point at it explicitly via the StartOS action.
//
// We don't ship a hardcoded model list — Ollama's catalog is whatever
// the user has `pull`ed locally. listInstalledModels() can query
// /api/tags on a best-effort basis, but for v1 we expose a free-text
// model field in the picker UI.
import { retryAPI } from "../util.js";
import { zeroCost } from "./cost.js";
// Used when the caller passes no baseURL at all.
const DEFAULT_BASE_URL = "http://localhost:11434";

// Build an analysis-only provider that talks raw HTTP to an Ollama
// server.
//
//   baseURL       – server root; one trailing slash is stripped. Falls
//                   back to DEFAULT_BASE_URL.
//   defaultModels – optional user-supplied model names (the provider
//                   registry parses them from the picker's free-text
//                   field). Returned by listAnalysisModels() so the
//                   fallback chain has something to walk; defaults to [].
//   timeoutMs     – per-request timeout for /api/generate.
//
// Costs are always reported as zero — local inference is not billed.
export function createOllamaProvider({
  baseURL,
  defaultModels = [],
  timeoutMs = 900_000,
} = {}) {
  const base = (baseURL || DEFAULT_BASE_URL).replace(/\/$/, "");
  // Snapshot the list so later mutation by the caller can't leak in.
  const configuredModels = Array.isArray(defaultModels) ? [...defaultModels] : [];

  return {
    name: "ollama",
    capabilities: {
      transcribe: false,
      analyze: true,
      listModels: true,
    },
    // The user-configured model names (empty when none were supplied).
    listAnalysisModels() {
      return [...configuredModels];
    },
    listTranscriptionModels() {
      return [];
    },
    async transcribeAudio() {
      throw new Error(
        "Ollama is wired for analysis only. Use Gemini or OpenAI Whisper for transcription."
      );
    },
    // Lists models the local Ollama server has pulled. Best-effort —
    // returns [] on any error so the picker can fall back to the
    // free-text input.
    async listInstalledModels() {
      try {
        const res = await fetch(`${base}/api/tags`, {
          signal: AbortSignal.timeout(5000),
        });
        if (!res.ok) return [];
        const data = await res.json();
        return (data.models || []).map((m) => m.name).filter(Boolean);
      } catch {
        return [];
      }
    },
    // Run `prompt` through `model` via POST /api/generate (non-streaming)
    // and normalize the reply into { text, usage, cost, finishReason,
    // raw }. A non-2xx response becomes an Error carrying `status`.
    async analyzeText({
      prompt,
      model,
      onProgress = () => {},
      retries = 2,
      signal,
    }) {
      const result = await retryAPI(
        async () => {
          // Combine the per-request timeout with the caller-supplied
          // cancel signal so a user-pressed Cancel button aborts the
          // fetch immediately instead of waiting for the (long) timeout.
          const timeoutSignal = AbortSignal.timeout(timeoutMs);
          const combinedSignal = signal
            ? AbortSignal.any([signal, timeoutSignal])
            : timeoutSignal;
          const res = await fetch(`${base}/api/generate`, {
            method: "POST",
            headers: { "Content-Type": "application/json" },
            body: JSON.stringify({
              model,
              prompt,
              stream: false,
            }),
            signal: combinedSignal,
          });
          if (!res.ok) {
            const errText = await res.text().catch(() => "");
            const err = new Error(
              `Ollama ${res.status} ${res.statusText}: ${errText.slice(0, 200)}`
            );
            err.status = res.status;
            throw err;
          }
          return res.json();
        },
        {
          retries,
          delayMs: 5000,
          label: "Ollama analysis",
          log: (msg) => onProgress(msg),
        }
      );
      const text = result.response || "";
      // Ollama's /api/generate returns prompt_eval_count + eval_count.
      const usage = {
        inputTokens: result.prompt_eval_count || 0,
        outputTokens: result.eval_count || 0,
        thinkingTokens: 0,
      };
      const cost = zeroCost(usage);
      return {
        text,
        usage,
        cost,
        // Prefer the server's own done_reason (e.g. "length" when the
        // output was cut off) so truncation isn't reported as a clean
        // "stop"; servers that don't send it keep the old done → "stop"
        // mapping.
        finishReason: result.done_reason || (result.done ? "stop" : null),
        raw: result,
      };
    },
  };
}
+110
View File
@@ -0,0 +1,110 @@
// OpenAI-compatible provider — analysis only.
//
// Same wire format as OpenAI's chat.completions endpoint, but pointed
// at a user-supplied baseURL: DeepSeek, Together, Groq, Fireworks, your
// own self-hosted vLLM, etc. The user provides baseURL + apiKey + model
// name; we don't ship a hardcoded model list (each backend's catalog
// differs), and we don't have pricing (varies wildly per backend).
//
// Structurally this is a thin wrapper around the OpenAI SDK with cost
// reporting forced to zero — costs are reported as $0.0000 since we
// can't know the backend's rates without per-deploy configuration.
import OpenAI from "openai";
import { retryAPI } from "../util.js";
import { zeroCost } from "./cost.js";
// Output cap for analysis responses, sent as max_tokens on every call.
const ANALYSIS_MAX_TOKENS = 16000;

// Build an analysis-only provider for any backend that speaks OpenAI's
// chat.completions wire format.
//
//   apiKey        – optional; some self-hosted backends accept any
//                   non-empty key, so a "no-auth" sentinel is sent when
//                   none is given to keep the SDK's auth header
//                   well-formed.
//   baseURL       – required; throws without it.
//   defaultModels – optional caller-supplied model names. There is no
//                   built-in list (the picker UI exposes a free-text
//                   model field); listAnalysisModels() returns a copy of
//                   whatever was passed here.
//   timeoutMs     – request timeout handed to the SDK client.
export function createOpenAICompatibleProvider({
  apiKey,
  baseURL,
  defaultModels = [],
  timeoutMs = 900_000,
} = {}) {
  if (!baseURL) {
    throw new Error(
      "createOpenAICompatibleProvider: baseURL is required (e.g. https://api.deepseek.com/v1)"
    );
  }

  const sdk = new OpenAI({
    apiKey: apiKey || "no-auth",
    baseURL,
    timeout: timeoutMs,
  });

  // One chat.completions round-trip; request options (the AbortSignal)
  // are only passed when the caller supplied a signal.
  const requestCompletion = (model, prompt, signal) => {
    const payload = {
      model,
      max_tokens: ANALYSIS_MAX_TOKENS,
      messages: [{ role: "user", content: prompt }],
    };
    const requestOptions = signal ? { signal } : undefined;
    return sdk.chat.completions.create(payload, requestOptions);
  };

  const hasModelList = defaultModels.length > 0;

  return {
    name: "openai-compatible",
    capabilities: {
      transcribe: false,
      analyze: true,
      listModels: hasModelList,
    },

    listAnalysisModels() {
      return defaultModels.slice();
    },

    listTranscriptionModels() {
      return [];
    },

    async transcribeAudio() {
      throw new Error(
        "openai-compatible providers are wired for analysis only. Use Gemini or OpenAI Whisper for transcription."
      );
    },

    // Run `prompt` through `model` and normalize the first choice into
    // { text, usage, cost, finishReason, raw }.
    async analyzeText({
      prompt,
      model,
      onProgress = () => {},
      retries = 2,
      signal,
    }) {
      const completion = await retryAPI(
        () => requestCompletion(model, prompt, signal),
        {
          retries,
          delayMs: 5000,
          label: "openai-compatible analysis",
          log: (msg) => onProgress(msg),
        }
      );

      const firstChoice = completion.choices?.[0];
      const { prompt_tokens: promptTokens, completion_tokens: replyTokens } =
        completion.usage ?? {};
      const usage = {
        inputTokens: promptTokens || 0,
        outputTokens: replyTokens || 0,
        thinkingTokens: 0,
      };

      return {
        text: firstChoice?.message?.content || "",
        usage,
        // Per-backend pricing varies — report zero. UI can warn that
        // cost tracking is not available for this provider.
        cost: zeroCost(usage),
        finishReason: firstChoice?.finish_reason || null,
        raw: completion,
      };
    },
  };
}
+201
View File
@@ -0,0 +1,201 @@
// OpenAI provider — analysis (chat.completions) + transcription (Whisper).
//
// Whisper (whisper-1) has a 25 MB per-request file size cap. The
// orchestration layer's audio chunking is currently sized for Gemini's
// much larger cap; long podcasts at high bitrate can push individual
// chunks over Whisper's cap. We surface that as a clear error rather
// than silently truncating — users can mix providers (Whisper for
// short audio, Gemini for long) per-request via the picker.
//
// Pricing values are placeholders — verify against current OpenAI
// pricing before billing-sensitive use.
import { createReadStream, statSync } from "fs";
import OpenAI from "openai";
import { retryAPI, formatTime } from "../util.js";
import { formatCost, ratesFor } from "./cost.js";
// Per-1M-token rates in USD for chat.completions models, keyed by model id.
// Each entry carries an `input` and an `output` rate; analyzeText passes this
// table to ratesFor() together with the requested model id.
// VERIFY against current OpenAI pricing before relying on these for billing.
export const OPENAI_PRICING = {
"gpt-4o": { input: 2.50, output: 10.00 },
"gpt-4o-mini": { input: 0.15, output: 0.60 },
"gpt-4-turbo": { input: 10.00, output: 30.00 },
"o3-mini": { input: 1.10, output: 4.40 },
// Fallback for unknown / future models.
"default": { input: 2.50, output: 10.00 },
};
// Whisper bills per minute of audio, not per token. The cost record
// reuses the token cost shape, but stores minute-based math in the
// `inputCost` field.
const WHISPER_USD_PER_MINUTE = 0.006;
// Upload size ceiling checked before every Whisper request (25 MiB).
const WHISPER_MAX_BYTES = 25 * 1024 * 1024; // OpenAI hard limit
// Model ids returned by listAnalysisModels().
export const OPENAI_ANALYSIS_MODELS = [
"gpt-4o",
"gpt-4o-mini",
"gpt-4-turbo",
"o3-mini",
];
// Model ids returned by listTranscriptionModels().
export const OPENAI_TRANSCRIPTION_MODELS = ["whisper-1"];
// Output-token cap sent with every analysis request.
const ANALYSIS_MAX_TOKENS = 16000;
/**
 * Build the OpenAI provider: chat.completions for analysis and Whisper for
 * transcription.
 *
 * @param {object} [options]
 * @param {string} options.apiKey - OpenAI API key (required).
 * @param {string} [options.baseURL] - Optional API base URL override; the SDK
 *   default is used when omitted.
 * @param {number} [options.timeoutMs=900000] - Timeout handed to the SDK client.
 * @returns {object} Provider exposing name, capabilities, listAnalysisModels,
 *   listTranscriptionModels, transcribeAudio and analyzeText.
 * @throws {Error} When apiKey is missing.
 */
export function createOpenAIProvider({
  apiKey,
  baseURL,
  timeoutMs = 900_000,
} = {}) {
  if (!apiKey) {
    throw new Error("createOpenAIProvider: apiKey is required");
  }
  const client = new OpenAI({
    apiKey,
    baseURL: baseURL || undefined,
    timeout: timeoutMs,
  });

  // o-series reasoning models (o1, o3-mini, ...) reject the legacy
  // `max_tokens` field and require `max_completion_tokens`. Every other
  // model keeps the legacy field so older OpenAI-style backends reached
  // through a custom baseURL continue to work.
  function outputCapFor(model) {
    const isReasoningModel = /^o\d/i.test(String(model ?? ""));
    return isReasoningModel
      ? { max_completion_tokens: ANALYSIS_MAX_TOKENS }
      : { max_tokens: ANALYSIS_MAX_TOKENS };
  }

  return {
    name: "openai",
    capabilities: {
      transcribe: true,
      analyze: true,
      listModels: true,
    },
    listAnalysisModels() {
      return [...OPENAI_ANALYSIS_MODELS];
    },
    listTranscriptionModels() {
      return [...OPENAI_TRANSCRIPTION_MODELS];
    },
    // Whisper-based transcription. Returns the same [MM:SS] formatted
    // text shape Gemini produces, so the orchestration layer's
    // parseTimestampedTranscript() works unchanged.
    //
    // Throws before uploading when the file exceeds WHISPER_MAX_BYTES.
    async transcribeAudio({
      filePath,
      model = "whisper-1",
      offsetSeconds = 0,
      onProgress = () => {},
      signal,
    }) {
      let bytes = 0;
      try {
        bytes = statSync(filePath).size;
      } catch {
        // Best-effort size probe: if stat fails, the upload below surfaces
        // the real filesystem error.
      }
      if (bytes > WHISPER_MAX_BYTES) {
        const sizeMB = (bytes / (1024 * 1024)).toFixed(1);
        throw new Error(
          `OpenAI Whisper file size limit is 25 MB. This chunk is ${sizeMB} MB. Try Gemini for transcription, or split the audio more aggressively.`
        );
      }
      onProgress(
        `Uploading audio${offsetSeconds > 0 ? ` (offset ${formatTime(offsetSeconds)})` : ""} to OpenAI Whisper (${model})...`
      );
      const start = Date.now();
      const result = await retryAPI(
        () =>
          client.audio.transcriptions.create(
            {
              // A fresh stream per attempt: a consumed stream cannot be replayed.
              file: createReadStream(filePath),
              model,
              response_format: "verbose_json",
              timestamp_granularities: ["segment"],
            },
            signal ? { signal } : undefined
          ),
        {
          retries: 3,
          delayMs: 5000,
          label: `Whisper transcription${offsetSeconds > 0 ? ` (chunk@${formatTime(offsetSeconds)})` : ""}`,
          log: (msg) => onProgress(msg),
        }
      );
      const elapsed = ((Date.now() - start) / 1000).toFixed(1);
      onProgress(`Whisper transcription complete in ${elapsed}s`);
      // One "[time] text" line per segment; a response without segments
      // collapses to a single line stamped 0:00.
      const segments = Array.isArray(result.segments) ? result.segments : [];
      const lines = segments.length
        ? segments.map((s) => `[${formatTime(s.start || 0)}] ${(s.text || "").trim()}`)
        : [`[0:00] ${(result.text || "").trim()}`];
      const text = lines.join("\n");
      // Whisper bills by audio duration in minutes, not tokens.
      const durationSeconds = result.duration || 0;
      const minutes = durationSeconds / 60;
      const usdCost = minutes * WHISPER_USD_PER_MINUTE;
      const cost = {
        inputTokens: 0,
        outputTokens: 0,
        thinkingTokens: 0,
        totalTokens: 0,
        inputCost: usdCost.toFixed(6),
        outputCost: "0.000000",
        thinkingCost: "0.000000",
        totalCost: usdCost.toFixed(6),
        // NOTE(review): the sub-cent branch prints both "$" and "¢" —
        // confirm this matches the display format produced by ./cost.js.
        totalCostDisplay: usdCost < 0.01
          ? `$${(usdCost * 100).toFixed(3)}¢`
          : `$${usdCost.toFixed(4)}`,
      };
      return {
        text,
        usage: { inputTokens: 0, outputTokens: 0, thinkingTokens: 0, totalTokens: 0 },
        cost,
        finishReason: null,
        blockReason: "none",
        raw: result,
      };
    },
    // Single-turn chat completion: the prompt is sent as one user message.
    // Returns the first choice's text plus token usage and the cost
    // computed from OPENAI_PRICING.
    async analyzeText({
      prompt,
      model,
      onProgress = () => {},
      retries = 2,
      signal,
    }) {
      const result = await retryAPI(
        () =>
          client.chat.completions.create(
            {
              model,
              ...outputCapFor(model),
              messages: [{ role: "user", content: prompt }],
            },
            signal ? { signal } : undefined
          ),
        {
          retries,
          delayMs: 5000,
          label: "OpenAI analysis",
          log: (msg) => onProgress(msg),
        }
      );
      const choice = result.choices?.[0];
      const text = choice?.message?.content || "";
      const usage = {
        inputTokens: result.usage?.prompt_tokens || 0,
        outputTokens: result.usage?.completion_tokens || 0,
        thinkingTokens: 0,
      };
      const cost = formatCost(ratesFor(OPENAI_PRICING, model), usage);
      return {
        text,
        usage,
        cost,
        finishReason: choice?.finish_reason || null,
        raw: result,
      };
    },
  };
}
+341
View File
@@ -0,0 +1,341 @@
// Relay provider — proxies transcription + analysis calls through the
// operator's relay backend (operator-side service, not in this repo).
// The relay handles billing/credit accounting, picks the actual backing
// model (Gemini, the operator's Parakeet+Gemma, etc.), and returns a
// uniform response with the user's remaining credit balance.
//
// Auth shape:
// - `X-Recap-Install-Id` header on every call (required; identifies
// the credit owner). Comes from ./install-id.js.
// - `Authorization: Bearer <license_key>` header when a license is
// present. Absent = treat as Core (free) tier.
// - `X-Recap-Job-Id` header on every call, common across the
// transcribe + analyze pair that make up one full summary. The
// relay decrements credits on the FIRST call with a given job_id;
// subsequent calls with the same id are free (until job_id
// expires, ~1h). Means "1 full summary = 1 credit" regardless of
// whether one or both pipeline steps go through the relay.
//
// Response envelope (success and error both):
// {
// "result": { ...endpoint-specific payload... },
// "credits_remaining": <number>,
// "tier": "core" | "pro" | "max",
// "credit_charged": <number, 0 if reused job_id>
// }
//
// What this provider does NOT do:
// - Validate the user's license. Relay does that server-side.
// - Track historical credit usage. Relay's DB owns the ledger.
// - Choose which backing model the relay uses. Operator's call.
import { createReadStream } from "fs";
import { retryAPI, formatTime } from "../util.js";
import { zeroCost } from "./cost.js";
import { updateRelayState, recordRelayError } from "../relay-state.js";
// Provider name shown in logs + chunk pagination labels. "relay" rather
// than e.g. "keysat-relay" because operators may run their own relay
// using a different backend brand — the name should describe the
// architecture, not the operator.
const NAME = "relay";
// Models exposed to the picker. The relay decides what actually runs —
// these labels are placeholders so the picker can show something.
// Neither list is sent to the relay: the transcribe/analyze requests
// built by this provider carry no model field.
const RELAY_TRANSCRIPTION_MODELS = ["relay-default"];
const RELAY_ANALYSIS_MODELS = ["relay-default"];
/**
 * Build the relay provider, which forwards transcription and analysis
 * requests to the relay backend at `baseURL`.
 *
 * @param {object} [options]
 * @param {string} options.baseURL - Relay origin (required); one trailing
 *   slash is stripped.
 * @param {string} options.installId - Sent as X-Recap-Install-Id (required).
 * @param {string} [options.licenseKey] - Sent as a Bearer token when present.
 * @param {number} [options.timeoutMs=900000] - Per-request timeout applied to
 *   every transcribe/analyze POST. Non-finite or non-positive values disable it.
 * @returns {object} Provider exposing name, capabilities, listAnalysisModels,
 *   listTranscriptionModels, transcribeAudio, pingBalance and analyzeText.
 * @throws {Error} When baseURL or installId is missing.
 */
export function createRelayProvider({
  baseURL,
  installId,
  licenseKey,
  timeoutMs = 900_000,
} = {}) {
  if (!baseURL) {
    throw new Error(
      "createRelayProvider: baseURL is required (e.g. https://relay.keysat.xyz)"
    );
  }
  if (!installId) {
    throw new Error(
      "createRelayProvider: installId is required (boot must initInstallId first)"
    );
  }
  const base = baseURL.replace(/\/$/, "");
  // setTimeout misbehaves above a signed 32-bit delay, so clamp to it.
  const MAX_TIMER_MS = 2_147_483_647;

  // Build the auth/identity headers attached to every relay call.
  // job_id is optional but the orchestration layer should always pass
  // one — without it the relay can't bundle the transcribe + analyze
  // pair into a single credit charge.
  function buildHeaders({ extra = {}, jobId } = {}) {
    const h = {
      "X-Recap-Install-Id": installId,
      ...extra,
    };
    if (licenseKey) h["Authorization"] = `Bearer ${licenseKey}`;
    if (jobId) h["X-Recap-Job-Id"] = jobId;
    return h;
  }

  // Common error-handling wrapper. The relay's contract is that ANY
  // response (success or failure) carries the standard envelope so
  // Recap can keep its balance display accurate even on errors. We
  // try to parse error bodies to harvest that.
  //
  // Resolves with the parsed envelope object. Rejects on network failure,
  // on timeout (error name "TimeoutError"), on a non-2xx status (error
  // carries `status` and `envelope`), and on a 2xx body that is not a JSON
  // object. A caller-initiated abort is rethrown untouched and is not
  // recorded as a relay error.
  async function postRelay({ path, body, headers, signal }) {
    const timeoutCtl = new AbortController();
    let timedOut = false;
    const timer =
      Number.isFinite(timeoutMs) && timeoutMs > 0
        ? setTimeout(() => {
            timedOut = true;
            timeoutCtl.abort();
          }, Math.min(timeoutMs, MAX_TIMER_MS))
        : undefined;
    const combined = signal
      ? AbortSignal.any([signal, timeoutCtl.signal])
      : timeoutCtl.signal;

    let res;
    let text;
    try {
      res = await fetch(`${base}${path}`, {
        method: "POST",
        headers,
        body,
        signal: combined,
      });
      // Reading the body stays inside the timeout window too.
      text = await res.text();
    } catch (err) {
      if (timedOut) {
        const msg = `Relay ${path} timed out after ${timeoutMs}ms`;
        recordRelayError(msg);
        const timeoutErr = new Error(msg, { cause: err });
        timeoutErr.name = "TimeoutError";
        throw timeoutErr;
      }
      // A user cancel says nothing about the relay's health.
      if (!signal?.aborted) recordRelayError(err?.message || String(err));
      throw err;
    } finally {
      clearTimeout(timer);
    }

    let parsed = null;
    try {
      parsed = text ? JSON.parse(text) : null;
    } catch {
      // Non-JSON body: handled below for both the error and success paths.
    }
    if (parsed && (typeof parsed.credits_remaining === "number" || parsed.tier)) {
      updateRelayState(parsed);
    }
    if (!res.ok) {
      const msg =
        parsed?.error ||
        parsed?.message ||
        text?.slice(0, 300) ||
        `HTTP ${res.status}`;
      const err = new Error(`Relay ${path} ${res.status}: ${msg}`);
      err.status = res.status;
      err.envelope = parsed;
      if (!parsed) recordRelayError(msg);
      throw err;
    }
    // A 2xx without a JSON object (empty body, HTML from a proxy, ...) is
    // not a usable envelope; fail loudly instead of letting callers
    // dereference null.
    if (parsed === null || typeof parsed !== "object" || Array.isArray(parsed)) {
      const msg = `Relay ${path} ${res.status}: response was not a JSON envelope`;
      recordRelayError(msg);
      throw new Error(msg);
    }
    return parsed;
  }

  return {
    name: NAME,
    capabilities: {
      transcribe: true,
      analyze: true,
      // The relay's model catalog is internal — Recap doesn't pick.
      // listModels: false signals the picker to skip the dropdown.
      listModels: false,
    },
    listAnalysisModels() {
      return [...RELAY_ANALYSIS_MODELS];
    },
    listTranscriptionModels() {
      return [...RELAY_TRANSCRIPTION_MODELS];
    },
    // Upload one audio file (plus optional title/channel/description/chapter
    // hints) to /relay/transcribe and return the transcript as
    // "[time] text" lines.
    async transcribeAudio({
      filePath,
      mimeType,
      titleHint,
      channelHint = "",
      descriptionHint = "",
      chaptersHint = [],
      offsetSeconds = 0,
      onProgress = () => {},
      signal,
      jobId,
    }) {
      onProgress(
        `Uploading audio${offsetSeconds > 0 ? ` (offset ${formatTime(offsetSeconds)})` : ""} to relay (${base})...`
      );
      const start = Date.now();
      // Use multipart form encoding so the audio binary doesn't have
      // to be base64-blown-up. Node 20+ provides global FormData. The
      // file is read fully into memory as a Blob (see fileToBlob), which
      // also lets the same form be re-sent on retry.
      const form = new FormData();
      const blob = await fileToBlob(filePath, mimeType);
      form.append("audio", blob, "audio.bin");
      form.append("mime_type", mimeType || "application/octet-stream");
      if (titleHint) form.append("title", titleHint);
      if (channelHint) form.append("channel", channelHint);
      if (descriptionHint) form.append("description", descriptionHint);
      if (Array.isArray(chaptersHint) && chaptersHint.length > 0) {
        form.append("chapters", JSON.stringify(chaptersHint));
      }
      form.append("offset_seconds", String(offsetSeconds));
      const envelope = await retryAPI(
        () =>
          postRelay({
            path: "/relay/transcribe",
            body: form,
            headers: buildHeaders({ jobId }),
            signal,
          }),
        {
          retries: 2,
          delayMs: 5000,
          label: `Relay transcribe${offsetSeconds > 0 ? ` (chunk@${formatTime(offsetSeconds)})` : ""}`,
          log: (msg) => onProgress(msg),
        }
      );
      const elapsed = ((Date.now() - start) / 1000).toFixed(1);
      const remaining =
        typeof envelope.credits_remaining === "number"
          ? `, ${envelope.credits_remaining} credits left`
          : "";
      onProgress(`Relay transcribe complete in ${elapsed}s${remaining}`);
      // Relay's transcribe result shape: { text, segments?: [{start,
      // end, text}], duration? }. We don't fabricate segment timestamps
      // here — the orchestration layer's synthesizeEntriesFromText
      // handles single-segment-only responses.
      const result = envelope.result || {};
      const segments = Array.isArray(result.segments) ? result.segments : [];
      const lines = segments.length
        ? segments.map(
            (s) => `[${formatTime(s.start || 0)}] ${(s.text || "").trim()}`
          )
        : [`[0:00] ${(result.text || "").trim()}`];
      const text = lines.join("\n");
      // Cost from Recap's POV is always zero — credits are the unit,
      // and they're tracked separately. The orchestration layer's
      // cost-summing code keeps working unchanged.
      const cost = zeroCost({
        inputTokens: 0,
        outputTokens: 0,
        thinkingTokens: 0,
      });
      return {
        text,
        usage: { inputTokens: 0, outputTokens: 0, thinkingTokens: 0, totalTokens: 0 },
        cost,
        finishReason: null,
        blockReason: "none",
        raw: envelope,
      };
    },
    // Peek at the install's current credit balance without charging.
    // Used by /api/relay/status to populate the picker banner on boot
    // (or on demand) so the user sees real numbers before running any
    // summarize. Short timeout — if the relay is unreachable we want
    // the UI to fall back to "balance unknown" quickly, not hang.
    //
    // Resolves with the parsed body (null when the body is not JSON).
    // Only a real timeout is recorded as "timed out"; a caller abort is
    // rethrown without being recorded as a relay error.
    async pingBalance({ timeoutMs = 5000, signal } = {}) {
      const headers = buildHeaders({});
      const ac = new AbortController();
      let timedOut = false;
      const timer = setTimeout(() => {
        timedOut = true;
        ac.abort();
      }, timeoutMs);
      const combined = signal
        ? AbortSignal.any([signal, ac.signal])
        : ac.signal;
      try {
        const res = await fetch(`${base}/relay/balance`, {
          method: "GET",
          headers,
          signal: combined,
        });
        const text = await res.text();
        let parsed = null;
        try {
          parsed = text ? JSON.parse(text) : null;
        } catch {
          // Non-JSON body: fall through with parsed === null.
        }
        if (parsed && (typeof parsed.credits_remaining === "number" || parsed.tier)) {
          updateRelayState(parsed);
        }
        if (!res.ok) {
          const msg =
            parsed?.error ||
            parsed?.message ||
            text?.slice(0, 300) ||
            `HTTP ${res.status}`;
          recordRelayError(msg);
          const err = new Error(`Relay /balance ${res.status}: ${msg}`);
          err.status = res.status;
          throw err;
        }
        return parsed;
      } catch (err) {
        if (err?.status) {
          // HTTP failure: already recorded in the !res.ok branch.
        } else if (timedOut) {
          recordRelayError(`balance ping timed out after ${timeoutMs}ms`);
        } else if (!signal?.aborted) {
          recordRelayError(err?.message || String(err));
        }
        throw err;
      } finally {
        clearTimeout(timer);
      }
    },
    // POST the prompt as JSON to /relay/analyze and return the text found
    // at envelope.result.text (empty string when absent).
    async analyzeText({
      prompt,
      onProgress = () => {},
      retries = 2,
      signal,
      jobId,
    }) {
      const start = Date.now();
      const headers = buildHeaders({
        extra: { "Content-Type": "application/json" },
        jobId,
      });
      const envelope = await retryAPI(
        () =>
          postRelay({
            path: "/relay/analyze",
            body: JSON.stringify({ prompt }),
            headers,
            signal,
          }),
        {
          retries,
          delayMs: 5000,
          label: "Relay analyze",
          log: (msg) => onProgress(msg),
        }
      );
      const elapsed = ((Date.now() - start) / 1000).toFixed(1);
      const remaining =
        typeof envelope.credits_remaining === "number"
          ? `, ${envelope.credits_remaining} credits left`
          : "";
      onProgress(`Relay analyze complete in ${elapsed}s${remaining}`);
      const result = envelope.result || {};
      const text = typeof result.text === "string" ? result.text : "";
      const cost = zeroCost({
        inputTokens: 0,
        outputTokens: 0,
        thinkingTokens: 0,
      });
      return {
        text,
        usage: { inputTokens: 0, outputTokens: 0, thinkingTokens: 0, totalTokens: 0 },
        cost,
        finishReason: null,
        raw: envelope,
      };
    },
  };
}
// Loads a file from disk into a Blob tagged with the given MIME type so
// it can be appended to a FormData upload. The whole file is read into a
// single Buffer first (Node's global Blob can't be built from a stream
// here), so memory use equals the file size. Acceptable for typical
// podcast chunk sizes (~50 MB at most after the orchestration layer's
// 45-min split). Falls back to "application/octet-stream" when no MIME
// type is supplied. Rejects with the underlying fs error if the read fails.
async function fileToBlob(filePath, mimeType) {
  const fs = await import("fs");
  const contents = await fs.promises.readFile(filePath);
  const type = mimeType || "application/octet-stream";
  return new Blob([contents], { type });
}
+180
View File
@@ -0,0 +1,180 @@
// Whisper provider — transcription via any OpenAI-Audio-Transcription-API-
// compatible endpoint. OpenAI's audio.transcriptions.create wire format
// is the de facto standard; whisper.cpp's HTTP server, faster-whisper-
// server, NVIDIA Parakeet behind speaches, Groq's Whisper API, and most
// other self-hosted implementations honor it. So this provider is
// effectively "OpenAI for transcription with a custom baseURL" —
// distinct from the `openai` provider so users can wire a self-hosted
// transcription engine alongside their cloud OpenAI key (used for GPT
// analysis).
//
// Implementation note: although the wire format matches OpenAI's, this
// provider has its OWN transcribeAudio (rather than reusing the OpenAI
// provider's). Reasons:
// - Log messages should say "Whisper at host:port (model)" not
// "OpenAI Whisper" — Parakeet/whisper.cpp behind a custom URL is
// not "OpenAI" and showing that in logs is misleading.
// - No 25 MB chunk cap. Self-hosted Whisper / Parakeet typically
// handles much larger inputs than the OpenAI cloud API.
// - Zero per-minute cost reporting (self-hosted by definition).
import { createReadStream } from "fs";
import OpenAI from "openai";
import { retryAPI, formatTime } from "../util.js";
// Model id used when the caller passes no model to transcribeAudio, and the
// sole entry in listTranscriptionModels() when no defaultModels are configured.
const FALLBACK_MODEL = "whisper-1";
export function createWhisperProvider({
apiKey,
baseURL,
defaultModels = [],
timeoutMs = 900_000,
} = {}) {
if (!baseURL) {
throw new Error(
"createWhisperProvider: baseURL is required (e.g. http://localhost:8000/v1)"
);
}
// Self-hosted Whisper servers commonly skip auth — pass a sentinel
// string so the SDK's authorization header is well-formed.
const client = new OpenAI({
apiKey: apiKey || "no-auth",
baseURL,
timeout: timeoutMs,
});
// Pretty-print the host for log messages: strip protocol, ignore /v1
// suffix, trim trailing slash.
const displayHost = baseURL
.replace(/^https?:\/\//, "")
.replace(/\/v\d+\/?$/, "")
.replace(/\/$/, "");
return {
name: "whisper",
capabilities: {
transcribe: true,
analyze: false,
listModels: defaultModels.length > 0,
},
listTranscriptionModels() {
return defaultModels.length > 0 ? [...defaultModels] : [FALLBACK_MODEL];
},
listAnalysisModels() {
return [];
},
async transcribeAudio({
filePath,
model = FALLBACK_MODEL,
offsetSeconds = 0,
onProgress = () => {},
signal,
}) {
// Use the model + host directly in the log — "Whisper" was
// misleading when a user wires up Parakeet (or any non-Whisper
// model) at a custom endpoint.
onProgress(
`Uploading audio${offsetSeconds > 0 ? ` (offset ${formatTime(offsetSeconds)})` : ""} to ${model} at ${displayHost}...`
);
const start = Date.now();
// Try the rich request first (verbose_json + per-segment
// timestamps — needed to render the transcript with timestamps
// and let the analysis step build sections). If the wrapper
// rejects those params (some Whisper-API-compatible servers,
// including some Parakeet wrappers, don't implement them and
// return 500), retry once with the bare-bones request shape.
let result;
let usedFallbackShape = false;
try {
result = await retryAPI(
() =>
client.audio.transcriptions.create(
{
file: createReadStream(filePath),
model,
response_format: "verbose_json",
timestamp_granularities: ["segment"],
},
signal ? { signal } : undefined
),
{
retries: 2,
delayMs: 5000,
label: `${model} transcription${offsetSeconds > 0 ? ` (chunk@${formatTime(offsetSeconds)})` : ""}`,
log: (msg) => onProgress(msg),
}
);
} catch (richErr) {
const richStatus = richErr?.status || 0;
// Only fall back on 4xx / 5xx where the params themselves are
// the likely culprit. Connection / timeout errors get thrown.
if (richStatus >= 400 && richStatus < 600) {
onProgress(
`Rich-request failed (status ${richStatus}); retrying with bare request shape (no verbose_json, no segment timestamps)...`
);
usedFallbackShape = true;
result = await retryAPI(
() =>
client.audio.transcriptions.create(
{
file: createReadStream(filePath),
model,
},
signal ? { signal } : undefined
),
{
retries: 2,
delayMs: 5000,
label: `${model} transcription (fallback)${offsetSeconds > 0 ? ` (chunk@${formatTime(offsetSeconds)})` : ""}`,
log: (msg) => onProgress(msg),
}
);
} else {
throw richErr;
}
}
const elapsed = ((Date.now() - start) / 1000).toFixed(1);
onProgress(
`${model} transcription complete in ${elapsed}s${usedFallbackShape ? " (bare request — no segment timestamps)" : ""}`
);
const segments = Array.isArray(result.segments) ? result.segments : [];
const lines = segments.length
? segments.map((s) => `[${formatTime(s.start || 0)}] ${(s.text || "").trim()}`)
: [`[0:00] ${(result.text || "").trim()}`];
const text = lines.join("\n");
// Self-hosted Whisper / Parakeet are free at the API layer
// (you've already paid for the hardware), so zero cost.
const cost = {
inputTokens: 0,
outputTokens: 0,
thinkingTokens: 0,
totalTokens: 0,
inputCost: "0.000000",
outputCost: "0.000000",
thinkingCost: "0.000000",
totalCost: "0.000000",
totalCostDisplay: "$0.0000",
};
return {
text,
usage: { inputTokens: 0, outputTokens: 0, thinkingTokens: 0, totalTokens: 0 },
cost,
finishReason: null,
blockReason: "none",
raw: result,
};
},
async analyzeText() {
throw new Error(
"Whisper provider is transcription-only. Use a different provider (Gemini / Anthropic / OpenAI / Ollama / OpenAI-compatible) for analysis."
);
},
};
}