Files
recap/server/providers/whisper.js
T
Keysat 373d10595b Pluggable AI providers, relay credit system, picker UX overhaul
Captures roughly forty version bumps (v0.2.6 → v0.2.47) of work that
accumulated without commits.

- Pluggable provider system under server/providers/: gemini, anthropic,
  openai, openai-compatible, ollama, whisper-compatible, relay. Mix and
  match transcription + analysis per request via the picker UI.
- Relay backend integration. Hardcoded relay URL in server/relay-default.js
  (operator-controlled at build time, not user-configurable). New
  /api/relay/{status,policy} endpoints proxy to the relay; balance pings
  populate a cached credit display.
- Per-install identity in server/install-id.js for relay credit accounting.
  Sent to the relay as X-Recap-Install-Id; persists across upgrades, lost
  on a full uninstall + reinstall. Not surfaced in the UI.
- Admin login gate (server/admin-auth.js + setAdminPassword action). Scrypt
  password hash + HMAC-signed session cookie.
- Entitlement scheme rename: pro / max (each paired with subscriptions and
  relay_pro / relay_max), replacing the misleading "core" entitlement
  that conflicted with the user-facing "Core" tier name.
- Activation screen: dynamic credit count pulled from /api/relay/policy,
  "Skip — use free mode" button, accurate paid-feature list.
- Top toolbar: inline credit-balance pill (or "BYO configured" fallback),
  Upgrade + "I have a key" buttons.
- Picker UI: per-provider sections with Save/Test/Delete buttons, sections
  collapsible by chevron, default-collapsed unless currently selected,
  "Use comped credits (reset to relay)" link when the user has strayed,
  green hint under inputs whose values are server-configured.
- Activity log: chevron-collapsible groups per video, refresh-survival via
  localStorage + a 500-entry server-side buffer, explicit Clear button.
- YouTube captions fast-path with user toggle (skips audio download + AI
  transcription when captions are available — uncheck for speaker labels).
- Cancel button: AbortController plumbed through every provider SDK call;
  retryAPI short-circuits on AbortError; cancellation events surface in
  the activity log instead of silent retries.
- Long-video analysis: auto-coalesce transcript entries before building the
  analysis prompt so local-model context windows (32k-ish) don't overflow.
  Original entries preserved for transcript display via an index map; the
  analyzer sees a coarser view but click-to-seek timestamps stay precise.
- StartOS action grouping (Setup / AI Providers) so the actions list is
  navigable.
- Manifest description rewritten to reflect multi-provider support and
  free-tier relay credits.
- Smaller fixes: summarize-button enablement no longer requires a Gemini
  key when other providers are configured; analysis fallback chain handles
  context-length and 503 capacity errors; single-segment expansion for
  providers that don't return per-segment timestamps (Parakeet et al.);
  many other UX polish items.
2026-05-11 23:46:20 -05:00

181 lines
6.3 KiB
JavaScript

// Whisper provider — transcription via any OpenAI-Audio-Transcription-API-
// compatible endpoint. OpenAI's audio.transcriptions.create wire format
// is the de facto standard; whisper.cpp's HTTP server, faster-whisper-
// server, NVIDIA Parakeet behind speaches, Groq's Whisper API, and most
// other self-hosted implementations honor it. So this provider is
// effectively "OpenAI for transcription with a custom baseURL" —
// distinct from the `openai` provider so users can wire a self-hosted
// transcription engine alongside their cloud OpenAI key (used for GPT
// analysis).
//
// Implementation note: although the wire format matches OpenAI's, this
// provider has its OWN transcribeAudio (rather than reusing the OpenAI
// provider's). Reasons:
// - Log messages should say "Whisper at host:port (model)" not
// "OpenAI Whisper" — Parakeet/whisper.cpp behind a custom URL is
// not "OpenAI" and showing that in logs is misleading.
// - No 25 MB chunk cap. Self-hosted Whisper / Parakeet typically
// handles much larger inputs than the OpenAI cloud API.
// - Zero per-minute cost reporting (self-hosted by definition).
import { createReadStream } from "fs";
import OpenAI from "openai";
import { retryAPI, formatTime } from "../util.js";
// Model name used when the caller passes no model and no defaultModels were
// configured for this provider.
const FALLBACK_MODEL = "whisper-1";

/**
 * Build a transcription-only provider that talks to any endpoint speaking
 * the OpenAI audio-transcription wire format.
 *
 * @param {object} [options]
 * @param {string} [options.apiKey] - Optional credential; a placeholder is
 *   sent when omitted so the SDK still emits a well-formed auth header.
 * @param {string} options.baseURL - Endpoint root, e.g. http://localhost:8000/v1.
 * @param {string[]} [options.defaultModels=[]] - Model names to advertise.
 *   Anything that is not an array is treated as "none configured".
 * @param {number} [options.timeoutMs=900_000] - Per-request timeout handed to the SDK client.
 * @returns {object} Provider with `name`, `capabilities`,
 *   `listTranscriptionModels`, `listAnalysisModels`, `transcribeAudio` and
 *   `analyzeText`.
 * @throws {Error} When `baseURL` is missing.
 */
export function createWhisperProvider({
  apiKey,
  baseURL,
  defaultModels = [],
  timeoutMs = 900_000,
} = {}) {
  if (!baseURL) {
    throw new Error(
      "createWhisperProvider: baseURL is required (e.g. http://localhost:8000/v1)"
    );
  }

  // An explicit `null` bypasses the destructuring default, so normalise here
  // instead of crashing on `.length` below. The copy also shields the
  // provider from later mutation of the caller's array.
  const configuredModels = Array.isArray(defaultModels) ? [...defaultModels] : [];

  // Self-hosted Whisper servers commonly skip auth — pass a sentinel
  // string so the SDK's authorization header is well-formed.
  const client = new OpenAI({
    apiKey: apiKey || "no-auth",
    baseURL,
    timeout: timeoutMs,
  });

  // Pretty-print the host for log messages: strip protocol, drop any
  // `user:pass@` userinfo (it must never reach the activity log), ignore a
  // trailing /vN segment, trim the trailing slash.
  const displayHost = String(baseURL)
    .replace(/^https?:\/\//, "")
    .replace(/^[^\/@]*@/, "")
    .replace(/\/v\d+\/?$/, "")
    .replace(/\/$/, "");

  return {
    name: "whisper",
    capabilities: {
      transcribe: true,
      analyze: false,
      listModels: configuredModels.length > 0,
    },

    /** @returns {string[]} Configured model names, or the single fallback model. */
    listTranscriptionModels() {
      return configuredModels.length > 0 ? [...configuredModels] : [FALLBACK_MODEL];
    },

    /** @returns {string[]} Always empty — this provider cannot analyze. */
    listAnalysisModels() {
      return [];
    },

    /**
     * Upload one audio file and return a timestamped transcript.
     *
     * @param {object} args
     * @param {string} args.filePath - Path of the audio file to upload.
     * @param {string} [args.model] - Model name sent to the endpoint.
     * @param {number} [args.offsetSeconds=0] - Only used in log text here.
     *   NOTE(review): emitted timestamps are relative to the uploaded file;
     *   presumably the caller shifts them for chunked input — confirm.
     * @param {(msg: string) => void} [args.onProgress] - Receives progress/log lines.
     * @param {AbortSignal} [args.signal] - Forwarded to the SDK request options.
     * @returns {Promise<object>} `{ text, usage, cost, finishReason, blockReason, raw }`
     *   where `text` has one `[m:ss] ...` line per segment.
     */
    async transcribeAudio({
      filePath,
      model = FALLBACK_MODEL,
      offsetSeconds = 0,
      onProgress = () => {},
      signal,
    }) {
      const hasOffset = offsetSeconds > 0;
      const offsetNote = hasOffset ? ` (offset ${formatTime(offsetSeconds)})` : "";
      const chunkNote = hasOffset ? ` (chunk@${formatTime(offsetSeconds)})` : "";

      // Use the model + host directly in the log — "Whisper" was
      // misleading when a user wires up Parakeet (or any non-Whisper
      // model) at a custom endpoint.
      onProgress(`Uploading audio${offsetNote} to ${model} at ${displayHost}...`);
      const start = Date.now();

      // One place for the request + retry policy. A fresh read stream is
      // opened on every attempt because a consumed stream cannot be re-sent.
      const requestTranscription = (extraParams, labelSuffix) =>
        retryAPI(
          () =>
            client.audio.transcriptions.create(
              { file: createReadStream(filePath), model, ...extraParams },
              signal ? { signal } : undefined
            ),
          {
            retries: 2,
            delayMs: 5000,
            label: `${model} transcription${labelSuffix}${chunkNote}`,
            log: (msg) => onProgress(msg),
          }
        );

      // Try the rich request first (verbose_json + per-segment timestamps).
      // Some compatible servers reject those params, so on an HTTP error we
      // retry once with the bare request shape.
      let result;
      let usedFallbackShape = false;
      try {
        result = await requestTranscription(
          { response_format: "verbose_json", timestamp_granularities: ["segment"] },
          ""
        );
      } catch (richErr) {
        const richStatus = richErr?.status || 0;
        const isHttpError = richStatus >= 400 && richStatus < 600;
        // 401/403 cannot be caused by the request params; re-uploading the
        // audio with a different shape would only delay the same failure.
        const isAuthFailure = richStatus === 401 || richStatus === 403;
        // Connection / timeout / abort errors carry no status and are rethrown.
        if (!isHttpError || isAuthFailure) {
          throw richErr;
        }
        onProgress(
          `Rich-request failed (status ${richStatus}); retrying with bare request shape (no verbose_json, no segment timestamps)...`
        );
        usedFallbackShape = true;
        result = await requestTranscription({}, " (fallback)");
      }

      const elapsed = ((Date.now() - start) / 1000).toFixed(1);
      onProgress(
        `${model} transcription complete in ${elapsed}s${usedFallbackShape ? " (bare request — no segment timestamps)" : ""}`
      );

      // The response may be a verbose_json object with segments, a plain
      // `{ text }` object, or a bare string when the server answers with a
      // non-JSON body. Handle all three instead of emitting an empty line.
      const segments = Array.isArray(result?.segments) ? result.segments : [];
      const plainText = typeof result === "string" ? result : result?.text || "";
      const lines = segments.length
        ? segments.map((s) => `[${formatTime(s.start || 0)}] ${(s.text || "").trim()}`)
        : [`[0:00] ${String(plainText).trim()}`];
      const text = lines.join("\n");

      // Self-hosted Whisper / Parakeet are free at the API layer
      // (you've already paid for the hardware), so zero cost.
      const cost = {
        inputTokens: 0,
        outputTokens: 0,
        thinkingTokens: 0,
        totalTokens: 0,
        inputCost: "0.000000",
        outputCost: "0.000000",
        thinkingCost: "0.000000",
        totalCost: "0.000000",
        totalCostDisplay: "$0.0000",
      };

      return {
        text,
        usage: { inputTokens: 0, outputTokens: 0, thinkingTokens: 0, totalTokens: 0 },
        cost,
        finishReason: null,
        blockReason: "none",
        raw: result,
      };
    },

    /**
     * Analysis is not supported by this provider.
     * @throws {Error} Always.
     */
    async analyzeText() {
      throw new Error(
        "Whisper provider is transcription-only. Use a different provider (Gemini / Anthropic / OpenAI / Ollama / OpenAI-compatible) for analysis."
      );
    },
  };
}