Pluggable AI providers, relay credit system, picker UX overhaul

Captures roughly forty version bumps (v0.2.6 → v0.2.47) of work that accumulated without commits.

- Pluggable provider system under server/providers/: gemini, anthropic, openai, openai-compatible, ollama, whisper-compatible, relay. Mix and match transcription + analysis per request via the picker UI.
- Relay backend integration. Hardcoded relay URL in server/relay-default.js (operator-controlled at build time, not user-configurable). New /api/relay/{status,policy} endpoints proxy to the relay; balance pings populate a cached credit display.
- Per-install identity in server/install-id.js for relay credit accounting. Sent to the relay as X-Recap-Install-Id; persists across upgrades, lost on a full uninstall + reinstall. Not surfaced in the UI.
- Admin login gate (server/admin-auth.js + setAdminPassword action). Scrypt password hash + HMAC-signed session cookie.
- Entitlement scheme rename: pro / max (each paired with subscriptions and relay_pro / relay_max), replacing the misleading "core" entitlement that conflicted with the user-facing "Core" tier name.
- Activation screen: dynamic credit count pulled from /api/relay/policy, "Skip — use free mode" button, accurate paid-feature list.
- Top toolbar: inline credit-balance pill (or "BYO configured" fallback), Upgrade + "I have a key" buttons.
- Picker UI: per-provider sections with Save/Test/Delete buttons, sections collapsible by chevron, default-collapsed unless currently selected, "Use comped credits (reset to relay)" link when the user has strayed, green hint under inputs whose values are server-configured.
- Activity log: chevron-collapsible groups per video, refresh-survival via localStorage + a 500-entry server-side buffer, explicit Clear button.
- YouTube captions fast-path with user toggle (skips audio download + AI transcription when captions are available — uncheck for speaker labels).
- Cancel button: AbortController plumbed through every provider SDK call; retryAPI short-circuits on AbortError; cancellation events surface in the activity log instead of silent retries.
- Long-video analysis: auto-coalesce transcript entries before building the analysis prompt so local-model context windows (32k-ish) don't overflow. Original entries preserved for transcript display via an index map; the analyzer sees a coarser view but click-to-seek timestamps stay precise.
- StartOS action grouping (Setup / AI Providers) so the actions list is navigable.
- Manifest description rewritten to reflect multi-provider support and free-tier relay credits.
- Smaller fixes: summarize-button enablement no longer requires a Gemini key when other providers are configured; analysis fallback chain handles context-length and 503 capacity errors; single-segment expansion for providers that don't return per-segment timestamps (Parakeet et al.); many other UX polish items.
2026-05-11 23:46:20 -05:00
parent 2544cf7dde
commit 373d10595b
79 changed files with 6322 additions and 397 deletions
@@ -0,0 +1,180 @@
+// Whisper provider — transcription via any OpenAI-Audio-Transcription-API-
+// compatible endpoint. OpenAI's audio.transcriptions.create wire format
+// is the de facto standard; whisper.cpp's HTTP server, faster-whisper-
+// server, NVIDIA Parakeet behind speaches, Groq's Whisper API, and most
+// other self-hosted implementations honor it. So this provider is
+// effectively "OpenAI for transcription with a custom baseURL" —
+// distinct from the `openai` provider so users can wire a self-hosted
+// transcription engine alongside their cloud OpenAI key (used for GPT
+// analysis).
+//
+// Implementation note: although the wire format matches OpenAI's, this
+// provider has its OWN transcribeAudio (rather than reusing the OpenAI
+// provider's). Reasons:
+//   - Log messages should say "Whisper at host:port (model)" not
+//     "OpenAI Whisper" — Parakeet/whisper.cpp behind a custom URL is
+//     not "OpenAI" and showing that in logs is misleading.
+//   - No 25 MB chunk cap. Self-hosted Whisper / Parakeet typically
+//     handles much larger inputs than the OpenAI cloud API.
+//   - Zero per-minute cost reporting (self-hosted by definition).
+
+import { createReadStream } from "fs";
+import OpenAI from "openai";
+import { retryAPI, formatTime } from "../util.js";
+
+// Model name used when the caller supplies none: it is the default for
+// transcribeAudio's `model` argument and the single entry returned by
+// listTranscriptionModels() when no defaultModels were configured.
+const FALLBACK_MODEL = "whisper-1";
+
+export function createWhisperProvider({
+  apiKey,
+  baseURL,
+  defaultModels = [],
+  timeoutMs = 900_000,
+} = {}) {
+  if (!baseURL) {
+    throw new Error(
+      "createWhisperProvider: baseURL is required (e.g. http://localhost:8000/v1)"
+    );
+  }
+  // Self-hosted Whisper servers commonly skip auth — pass a sentinel
+  // string so the SDK's authorization header is well-formed.
+  const client = new OpenAI({
+    apiKey: apiKey || "no-auth",
+    baseURL,
+    timeout: timeoutMs,
+  });
+  // Pretty-print the host for log messages: strip protocol, ignore /v1
+  // suffix, trim trailing slash.
+  const displayHost = baseURL
+    .replace(/^https?:\/\//, "")
+    .replace(/\/v\d+\/?$/, "")
+    .replace(/\/$/, "");
+
+  return {
+    name: "whisper",
+
+    capabilities: {
+      transcribe: true,
+      analyze: false,
+      listModels: defaultModels.length > 0,
+    },
+
+    listTranscriptionModels() {
+      return defaultModels.length > 0 ? [...defaultModels] : [FALLBACK_MODEL];
+    },
+
+    listAnalysisModels() {
+      return [];
+    },
+
+    async transcribeAudio({
+      filePath,
+      model = FALLBACK_MODEL,
+      offsetSeconds = 0,
+      onProgress = () => {},
+      signal,
+    }) {
+      // Use the model + host directly in the log — "Whisper" was
+      // misleading when a user wires up Parakeet (or any non-Whisper
+      // model) at a custom endpoint.
+      onProgress(
+        `Uploading audio${offsetSeconds > 0 ? ` (offset ${formatTime(offsetSeconds)})` : ""} to ${model} at ${displayHost}...`
+      );
+      const start = Date.now();
+      // Try the rich request first (verbose_json + per-segment
+      // timestamps — needed to render the transcript with timestamps
+      // and let the analysis step build sections). If the wrapper
+      // rejects those params (some Whisper-API-compatible servers,
+      // including some Parakeet wrappers, don't implement them and
+      // return 500), retry once with the bare-bones request shape.
+      let result;
+      let usedFallbackShape = false;
+      try {
+        result = await retryAPI(
+          () =>
+            client.audio.transcriptions.create(
+              {
+                file: createReadStream(filePath),
+                model,
+                response_format: "verbose_json",
+                timestamp_granularities: ["segment"],
+              },
+              signal ? { signal } : undefined
+            ),
+          {
+            retries: 2,
+            delayMs: 5000,
+            label: `${model} transcription${offsetSeconds > 0 ? ` (chunk@${formatTime(offsetSeconds)})` : ""}`,
+            log: (msg) => onProgress(msg),
+          }
+        );
+      } catch (richErr) {
+        const richStatus = richErr?.status || 0;
+        // Only fall back on 4xx / 5xx where the params themselves are
+        // the likely culprit. Connection / timeout errors get thrown.
+        if (richStatus >= 400 && richStatus < 600) {
+          onProgress(
+            `Rich-request failed (status ${richStatus}); retrying with bare request shape (no verbose_json, no segment timestamps)...`
+          );
+          usedFallbackShape = true;
+          result = await retryAPI(
+            () =>
+              client.audio.transcriptions.create(
+                {
+                  file: createReadStream(filePath),
+                  model,
+                },
+                signal ? { signal } : undefined
+              ),
+            {
+              retries: 2,
+              delayMs: 5000,
+              label: `${model} transcription (fallback)${offsetSeconds > 0 ? ` (chunk@${formatTime(offsetSeconds)})` : ""}`,
+              log: (msg) => onProgress(msg),
+            }
+          );
+        } else {
+          throw richErr;
+        }
+      }
+      const elapsed = ((Date.now() - start) / 1000).toFixed(1);
+      onProgress(
+        `${model} transcription complete in ${elapsed}s${usedFallbackShape ? " (bare request — no segment timestamps)" : ""}`
+      );
+
+      const segments = Array.isArray(result.segments) ? result.segments : [];
+      const lines = segments.length
+        ? segments.map((s) => `[${formatTime(s.start || 0)}] ${(s.text || "").trim()}`)
+        : [`[0:00] ${(result.text || "").trim()}`];
+      const text = lines.join("\n");
+
+      // Self-hosted Whisper / Parakeet are free at the API layer
+      // (you've already paid for the hardware), so zero cost.
+      const cost = {
+        inputTokens: 0,
+        outputTokens: 0,
+        thinkingTokens: 0,
+        totalTokens: 0,
+        inputCost: "0.000000",
+        outputCost: "0.000000",
+        thinkingCost: "0.000000",
+        totalCost: "0.000000",
+        totalCostDisplay: "$0.0000",
+      };
+
+      return {
+        text,
+        usage: { inputTokens: 0, outputTokens: 0, thinkingTokens: 0, totalTokens: 0 },
+        cost,
+        finishReason: null,
+        blockReason: "none",
+        raw: result,
+      };
+    },
+
+    async analyzeText() {
+      throw new Error(
+        "Whisper provider is transcription-only. Use a different provider (Gemini / Anthropic / OpenAI / Ollama / OpenAI-compatible) for analysis."
+      );
+    },
+  };
+}