Pluggable AI providers, relay credit system, picker UX overhaul

Captures roughly forty version bumps (v0.2.6 → v0.2.47) of work that accumulated without commits. - Pluggable provider system under server/providers/: gemini, anthropic, openai, openai-compatible, ollama, whisper-compatible, relay. Mix and match transcription + analysis per request via the picker UI. - Relay backend integration. Hardcoded relay URL in server/relay-default.js (operator-controlled at build time, not user-configurable). New /api/relay/{status,policy} endpoints proxy to the relay; balance pings populate a cached credit display. - Per-install identity in server/install-id.js for relay credit accounting. Sent to the relay as X-Recap-Install-Id; persists across upgrades, lost on a full uninstall + reinstall. Not surfaced in the UI. - Admin login gate (server/admin-auth.js + setAdminPassword action). Scrypt password hash + HMAC-signed session cookie. - Entitlement scheme rename: pro / max (each paired with subscriptions and relay_pro / relay_max), replacing the misleading "core" entitlement that conflicted with the user-facing "Core" tier name. - Activation screen: dynamic credit count pulled from /api/relay/policy, "Skip — use free mode" button, accurate paid-feature list. - Top toolbar: inline credit-balance pill (or "BYO configured" fallback), Upgrade + "I have a key" buttons. - Picker UI: per-provider sections with Save/Test/Delete buttons, sections collapsible by chevron, default-collapsed unless currently selected, "Use comped credits (reset to relay)" link when the user has strayed, green hint under inputs whose values are server-configured. - Activity log: chevron-collapsible groups per video, refresh-survival via localStorage + a 500-entry server-side buffer, explicit Clear button. - YouTube captions fast-path with user toggle (skips audio download + AI transcription when captions are available — uncheck for speaker labels). - Cancel button: AbortController plumbed through every provider SDK call; retryAPI short-circuits on AbortError; cancellation events surface in the activity log instead of silent retries. - Long-video analysis: auto-coalesce transcript entries before building the analysis prompt so local-model context windows (32k-ish) don't overflow. Original entries preserved for transcript display via an index map; the analyzer sees a coarser view but click-to-seek timestamps stay precise. - StartOS action grouping (Setup / AI Providers) so the actions list is navigable. - Manifest description rewritten to reflect multi-provider support and free-tier relay credits. - Smaller fixes: summarize-button enablement no longer requires a Gemini key when other providers are configured; analysis fallback chain handles context-length and 503 capacity errors; single-segment expansion for providers that don't return per-segment timestamps (Parakeet et al.); many other UX polish items.
2026-05-11 23:46:20 -05:00
parent 2544cf7dde
commit 373d10595b
79 changed files with 6322 additions and 397 deletions
@@ -0,0 +1,201 @@
+// OpenAI provider — analysis (chat.completions) + transcription (Whisper).
+//
+// Whisper (whisper-1) has a 25 MB per-request file size cap. The
+// orchestration layer's audio chunking is currently sized for Gemini's
+// much larger cap; long podcasts at high bitrate can push individual
+// chunks over Whisper's cap. We surface that as a clear error rather
+// than silently truncating — users can mix providers (Whisper for
+// short audio, Gemini for long) per-request via the picker.
+//
+// Pricing values are placeholders — verify against current OpenAI
+// pricing before billing-sensitive use.
+
+import { createReadStream, statSync } from "fs";
+import OpenAI from "openai";
+import { retryAPI, formatTime } from "../util.js";
+import { formatCost, ratesFor } from "./cost.js";
+
+// Per-1M-token rates in USD for chat.completions models.
+// VERIFY against current OpenAI pricing before relying on these for billing.
+export const OPENAI_PRICING = {
+  "gpt-4o":         { input: 2.50,  output: 10.00 },
+  "gpt-4o-mini":    { input: 0.15,  output: 0.60 },
+  "gpt-4-turbo":    { input: 10.00, output: 30.00 },
+  "o3-mini":        { input: 1.10,  output: 4.40 },
+  // Fallback for unknown / future models.
+  "default":        { input: 2.50,  output: 10.00 },
+};
+
+// Whisper bills per minute of audio, not per token. The cost record
+// reuses the token cost shape, but stores minute-based math in the
+// `inputCost` field.
+const WHISPER_USD_PER_MINUTE = 0.006;
+const WHISPER_MAX_BYTES = 25 * 1024 * 1024; // OpenAI hard limit
+
+export const OPENAI_ANALYSIS_MODELS = [
+  "gpt-4o",
+  "gpt-4o-mini",
+  "gpt-4-turbo",
+  "o3-mini",
+];
+
+export const OPENAI_TRANSCRIPTION_MODELS = ["whisper-1"];
+
+const ANALYSIS_MAX_TOKENS = 16000;
+
+export function createOpenAIProvider({
+  apiKey,
+  baseURL,
+  timeoutMs = 900_000,
+} = {}) {
+  if (!apiKey) {
+    throw new Error("createOpenAIProvider: apiKey is required");
+  }
+  const client = new OpenAI({
+    apiKey,
+    baseURL: baseURL || undefined,
+    timeout: timeoutMs,
+  });
+
+  return {
+    name: "openai",
+
+    capabilities: {
+      transcribe: true,
+      analyze: true,
+      listModels: true,
+    },
+
+    listAnalysisModels() {
+      return [...OPENAI_ANALYSIS_MODELS];
+    },
+
+    listTranscriptionModels() {
+      return [...OPENAI_TRANSCRIPTION_MODELS];
+    },
+
+    // Whisper-based transcription. Returns the same [MM:SS] formatted
+    // text shape Gemini produces, so the orchestration layer's
+    // parseTimestampedTranscript() works unchanged.
+    async transcribeAudio({
+      filePath,
+      model = "whisper-1",
+      offsetSeconds = 0,
+      onProgress = () => {},
+      signal,
+    }) {
+      let bytes = 0;
+      try {
+        bytes = statSync(filePath).size;
+      } catch {}
+      if (bytes > WHISPER_MAX_BYTES) {
+        const sizeMB = (bytes / (1024 * 1024)).toFixed(1);
+        throw new Error(
+          `OpenAI Whisper file size limit is 25 MB. This chunk is ${sizeMB} MB. Try Gemini for transcription, or split the audio more aggressively.`
+        );
+      }
+
+      onProgress(
+        `Uploading audio${offsetSeconds > 0 ? ` (offset ${formatTime(offsetSeconds)})` : ""} to OpenAI Whisper (${model})...`
+      );
+      const start = Date.now();
+      const result = await retryAPI(
+        () =>
+          client.audio.transcriptions.create(
+            {
+              file: createReadStream(filePath),
+              model,
+              response_format: "verbose_json",
+              timestamp_granularities: ["segment"],
+            },
+            signal ? { signal } : undefined
+          ),
+        {
+          retries: 3,
+          delayMs: 5000,
+          label: `Whisper transcription${offsetSeconds > 0 ? ` (chunk@${formatTime(offsetSeconds)})` : ""}`,
+          log: (msg) => onProgress(msg),
+        }
+      );
+      const elapsed = ((Date.now() - start) / 1000).toFixed(1);
+      onProgress(`Whisper transcription complete in ${elapsed}s`);
+
+      const segments = Array.isArray(result.segments) ? result.segments : [];
+      const lines = segments.length
+        ? segments.map((s) => `[${formatTime(s.start || 0)}] ${(s.text || "").trim()}`)
+        : [`[0:00] ${(result.text || "").trim()}`];
+      const text = lines.join("\n");
+
+      // Whisper bills by audio duration in minutes, not tokens.
+      const durationSeconds = result.duration || 0;
+      const minutes = durationSeconds / 60;
+      const usdCost = minutes * WHISPER_USD_PER_MINUTE;
+      const cost = {
+        inputTokens: 0,
+        outputTokens: 0,
+        thinkingTokens: 0,
+        totalTokens: 0,
+        inputCost: usdCost.toFixed(6),
+        outputCost: "0.000000",
+        thinkingCost: "0.000000",
+        totalCost: usdCost.toFixed(6),
+        totalCostDisplay: usdCost < 0.01
+          ? `$${(usdCost * 100).toFixed(3)}¢`
+          : `$${usdCost.toFixed(4)}`,
+      };
+
+      return {
+        text,
+        usage: { inputTokens: 0, outputTokens: 0, thinkingTokens: 0, totalTokens: 0 },
+        cost,
+        finishReason: null,
+        blockReason: "none",
+        raw: result,
+      };
+    },
+
+    async analyzeText({
+      prompt,
+      model,
+      onProgress = () => {},
+      retries = 2,
+      signal,
+    }) {
+      const result = await retryAPI(
+        () =>
+          client.chat.completions.create(
+            {
+              model,
+              max_tokens: ANALYSIS_MAX_TOKENS,
+              messages: [{ role: "user", content: prompt }],
+            },
+            signal ? { signal } : undefined
+          ),
+        {
+          retries,
+          delayMs: 5000,
+          label: "OpenAI analysis",
+          log: (msg) => onProgress(msg),
+        }
+      );
+
+      const choice = result.choices?.[0];
+      const text = choice?.message?.content || "";
+
+      const usage = {
+        inputTokens: result.usage?.prompt_tokens || 0,
+        outputTokens: result.usage?.completion_tokens || 0,
+        thinkingTokens: 0,
+      };
+      const cost = formatCost(ratesFor(OPENAI_PRICING, model), usage);
+
+      return {
+        text,
+        usage,
+        cost,
+        finishReason: choice?.finish_reason || null,
+        raw: result,
+      };
+    },
+  };
+}