Files
Keysat 373d10595b Pluggable AI providers, relay credit system, picker UX overhaul
Captures roughly forty version bumps (v0.2.6 → v0.2.47) of work that
accumulated without commits.

- Pluggable provider system under server/providers/: gemini, anthropic,
  openai, openai-compatible, ollama, whisper-compatible, relay. Mix and
  match transcription + analysis per request via the picker UI.
- Relay backend integration. Hardcoded relay URL in server/relay-default.js
  (operator-controlled at build time, not user-configurable). New
  /api/relay/{status,policy} endpoints proxy to the relay; balance pings
  populate a cached credit display.
- Per-install identity in server/install-id.js for relay credit accounting.
  Sent to the relay as X-Recap-Install-Id; persists across upgrades, lost
  on a full uninstall + reinstall. Not surfaced in the UI.
- Admin login gate (server/admin-auth.js + setAdminPassword action). Scrypt
  password hash + HMAC-signed session cookie.
- Entitlement scheme rename: pro / max (each paired with subscriptions and
  relay_pro / relay_max), replacing the misleading "core" entitlement
  that conflicted with the user-facing "Core" tier name.
- Activation screen: dynamic credit count pulled from /api/relay/policy,
  "Skip — use free mode" button, accurate paid-feature list.
- Top toolbar: inline credit-balance pill (or "BYO configured" fallback),
  Upgrade + "I have a key" buttons.
- Picker UI: per-provider sections with Save/Test/Delete buttons, sections
  collapsible by chevron, default-collapsed unless currently selected,
  "Use comped credits (reset to relay)" link when the user has strayed,
  green hint under inputs whose values are server-configured.
- Activity log: chevron-collapsible groups per video, refresh-survival via
  localStorage + a 500-entry server-side buffer, explicit Clear button.
- YouTube captions fast-path with user toggle (skips audio download + AI
  transcription when captions are available — uncheck for speaker labels).
- Cancel button: AbortController plumbed through every provider SDK call;
  retryAPI short-circuits on AbortError; cancellation events surface in
  the activity log instead of silent retries.
- Long-video analysis: auto-coalesce transcript entries before building the
  analysis prompt so local-model context windows (32k-ish) don't overflow.
  Original entries preserved for transcript display via an index map; the
  analyzer sees a coarser view but click-to-seek timestamps stay precise.
- StartOS action grouping (Setup / AI Providers) so the actions list is
  navigable.
- Manifest description rewritten to reflect multi-provider support and
  free-tier relay credits.
- Smaller fixes: summarize-button enablement no longer requires a Gemini
  key when other providers are configured; analysis fallback chain handles
  context-length and 503 capacity errors; single-segment expansion for
  providers that don't return per-segment timestamps (Parakeet et al.);
  many other UX polish items.
2026-05-11 23:46:20 -05:00

202 lines
6.0 KiB
JavaScript

// OpenAI provider — analysis (chat.completions) + transcription (Whisper).
//
// Whisper (whisper-1) has a 25 MB per-request file size cap. The
// orchestration layer's audio chunking is currently sized for Gemini's
// much larger cap; long podcasts at high bitrate can push individual
// chunks over Whisper's cap. We surface that as a clear error rather
// than silently truncating — users can mix providers (Whisper for
// short audio, Gemini for long) per-request via the picker.
//
// Pricing values are placeholders — verify against current OpenAI
// pricing before billing-sensitive use.
import { createReadStream, statSync } from "fs";
import OpenAI from "openai";
import { retryAPI, formatTime } from "../util.js";
import { formatCost, ratesFor } from "./cost.js";
// USD rates per one million tokens, keyed by chat.completions model id.
// These numbers are placeholders: VERIFY them against current OpenAI
// pricing before relying on them for anything billing-sensitive.
export const OPENAI_PRICING = {
  "gpt-4o": { input: 2.5, output: 10 },
  "gpt-4o-mini": { input: 0.15, output: 0.6 },
  "gpt-4-turbo": { input: 10, output: 30 },
  "o3-mini": { input: 1.1, output: 4.4 },
  // Catch-all entry used for unknown / future model ids.
  default: { input: 2.5, output: 10 },
};
// Upload ceiling enforced by OpenAI for a single Whisper request (25 MB).
const WHISPER_MAX_BYTES = 25 * 1024 ** 2;
// Whisper is priced per minute of audio rather than per token. The cost
// record keeps the token-cost shape and carries the minute-based amount
// in its `inputCost` field.
const WHISPER_USD_PER_MINUTE = 0.006;
// Chat models offered for analysis, in the order they are listed to callers.
export const OPENAI_ANALYSIS_MODELS = ["gpt-4o", "gpt-4o-mini", "gpt-4-turbo", "o3-mini"];

// Speech-to-text models offered for transcription.
export const OPENAI_TRANSCRIPTION_MODELS = [
  "whisper-1",
];

// Output-token cap sent with every analysis request.
const ANALYSIS_MAX_TOKENS = 16_000;
/**
 * Reports whether a chat model must receive its output cap as
 * `max_completion_tokens`. OpenAI's o-series reasoning models (o1, o3-mini,
 * ...) reject the legacy `max_tokens` parameter.
 *
 * @param {unknown} model - Model id as passed to analyzeText.
 * @returns {boolean} True for ids that start with "o" followed by a digit.
 */
function requiresMaxCompletionTokens(model) {
  return typeof model === "string" && /^o\d/.test(model);
}

/**
 * Renders a Whisper verbose_json response as "[timestamp] text" lines.
 * Falls back to a single "[0:00]" line built from `result.text` when the
 * response carries no segments.
 *
 * @param {object} result - Response from audio.transcriptions.create.
 * @returns {string} Newline-joined transcript lines.
 */
function formatWhisperTranscript(result) {
  const segments = Array.isArray(result.segments) ? result.segments : [];
  if (segments.length === 0) {
    return `[0:00] ${(result.text || "").trim()}`;
  }
  return segments
    .map((s) => `[${formatTime(s.start || 0)}] ${(s.text || "").trim()}`)
    .join("\n");
}

/**
 * Builds the cost record for a Whisper call. Whisper bills per minute of
 * audio, so every token field is zero and the minute-based amount is stored
 * in `inputCost` / `totalCost`.
 *
 * @param {number} durationSeconds - Audio duration reported by the API.
 * @returns {object} Cost record in the token-cost shape.
 */
function buildWhisperCost(durationSeconds) {
  const usdCost = (durationSeconds / 60) * WHISPER_USD_PER_MINUTE;
  const totalCost = usdCost.toFixed(6);
  return {
    inputTokens: 0,
    outputTokens: 0,
    thinkingTokens: 0,
    totalTokens: 0,
    inputCost: totalCost,
    outputCost: "0.000000",
    thinkingCost: "0.000000",
    totalCost,
    // Sub-cent amounts are shown in cents; the cents form carries only the
    // cent sign (previously it rendered as e.g. "$0.300¢").
    totalCostDisplay: usdCost < 0.01
      ? `${(usdCost * 100).toFixed(3)}¢`
      : `$${usdCost.toFixed(4)}`,
  };
}

/**
 * Creates the OpenAI provider: analysis via chat.completions and
 * transcription via the audio transcription endpoint.
 *
 * @param {object} [options]
 * @param {string} options.apiKey - OpenAI API key (required).
 * @param {string} [options.baseURL] - Optional API base URL override; an
 *   empty value leaves the SDK default in place.
 * @param {number} [options.timeoutMs=900000] - Timeout handed to the SDK client.
 * @returns {object} Provider with `name`, `capabilities`,
 *   `listAnalysisModels`, `listTranscriptionModels`, `transcribeAudio` and
 *   `analyzeText`.
 * @throws {Error} When `apiKey` is missing.
 */
export function createOpenAIProvider({
  apiKey,
  baseURL,
  timeoutMs = 900_000,
} = {}) {
  if (!apiKey) {
    throw new Error("createOpenAIProvider: apiKey is required");
  }

  const client = new OpenAI({
    apiKey,
    baseURL: baseURL || undefined,
    timeout: timeoutMs,
  });

  return {
    name: "openai",

    capabilities: {
      transcribe: true,
      analyze: true,
      listModels: true,
    },

    // Both list methods return copies so callers cannot mutate the
    // module-level arrays.
    listAnalysisModels() {
      return [...OPENAI_ANALYSIS_MODELS];
    },

    listTranscriptionModels() {
      return [...OPENAI_TRANSCRIPTION_MODELS];
    },

    /**
     * Whisper-based transcription. Returns the same [MM:SS] formatted text
     * shape Gemini produces, so the orchestration layer's
     * parseTimestampedTranscript() works unchanged.
     *
     * @param {object} params
     * @param {string} params.filePath - Path of the audio file to upload.
     * @param {string} [params.model="whisper-1"] - Transcription model id.
     * @param {number} [params.offsetSeconds=0] - Chunk offset; used only in
     *   progress messages and the retry label, not added to timestamps here.
     * @param {(msg: string) => void} [params.onProgress] - Progress sink.
     * @param {AbortSignal} [params.signal] - Forwarded to the SDK request.
     * @returns {Promise<object>} `{ text, usage, cost, finishReason,
     *   blockReason, raw }`.
     * @throws {Error} When the file is larger than the 25 MB Whisper cap.
     */
    async transcribeAudio({
      filePath,
      model = "whisper-1",
      offsetSeconds = 0,
      onProgress = () => {},
      signal,
    }) {
      let bytes = 0;
      try {
        bytes = statSync(filePath).size;
      } catch {
        // Best-effort pre-flight only: if the size cannot be read the check
        // is skipped and any file problem surfaces from the upload itself.
      }
      if (bytes > WHISPER_MAX_BYTES) {
        const sizeMB = (bytes / (1024 * 1024)).toFixed(1);
        throw new Error(
          `OpenAI Whisper file size limit is 25 MB. This chunk is ${sizeMB} MB. Try Gemini for transcription, or split the audio more aggressively.`
        );
      }

      onProgress(
        `Uploading audio${offsetSeconds > 0 ? ` (offset ${formatTime(offsetSeconds)})` : ""} to OpenAI Whisper (${model})...`
      );

      const start = Date.now();
      const result = await retryAPI(
        () =>
          client.audio.transcriptions.create(
            {
              // Opened inside the callback so every attempt gets a fresh stream.
              file: createReadStream(filePath),
              model,
              response_format: "verbose_json",
              timestamp_granularities: ["segment"],
            },
            signal ? { signal } : undefined
          ),
        {
          retries: 3,
          delayMs: 5000,
          label: `Whisper transcription${offsetSeconds > 0 ? ` (chunk@${formatTime(offsetSeconds)})` : ""}`,
          log: (msg) => onProgress(msg),
        }
      );
      const elapsed = ((Date.now() - start) / 1000).toFixed(1);
      onProgress(`Whisper transcription complete in ${elapsed}s`);

      // Coerce defensively so a missing or non-numeric duration yields a
      // zero cost instead of "NaN" strings.
      const durationSeconds = Number(result.duration) || 0;

      return {
        text: formatWhisperTranscript(result),
        usage: { inputTokens: 0, outputTokens: 0, thinkingTokens: 0, totalTokens: 0 },
        cost: buildWhisperCost(durationSeconds),
        finishReason: null,
        blockReason: "none",
        raw: result,
      };
    },

    /**
     * Sends `prompt` as a single user message to chat.completions.
     *
     * @param {object} params
     * @param {string} params.prompt - Full prompt text.
     * @param {string} params.model - Chat model id.
     * @param {(msg: string) => void} [params.onProgress] - Receives retry log lines.
     * @param {number} [params.retries=2] - Retry count handed to retryAPI.
     * @param {AbortSignal} [params.signal] - Forwarded to the SDK request.
     * @returns {Promise<object>} `{ text, usage, cost, finishReason, raw }`.
     */
    async analyzeText({
      prompt,
      model,
      onProgress = () => {},
      retries = 2,
      signal,
    }) {
      // o-series models only accept `max_completion_tokens`; everything else
      // keeps the original `max_tokens` parameter.
      // NOTE(review): gpt-4-turbo may enforce a lower output cap than
      // ANALYSIS_MAX_TOKENS — confirm against the model's documented limits.
      const tokenCapParam = requiresMaxCompletionTokens(model)
        ? "max_completion_tokens"
        : "max_tokens";

      const result = await retryAPI(
        () =>
          client.chat.completions.create(
            {
              model,
              [tokenCapParam]: ANALYSIS_MAX_TOKENS,
              messages: [{ role: "user", content: prompt }],
            },
            signal ? { signal } : undefined
          ),
        {
          retries,
          delayMs: 5000,
          label: "OpenAI analysis",
          log: (msg) => onProgress(msg),
        }
      );

      const choice = result.choices?.[0];
      const text = choice?.message?.content || "";
      const inputTokens = result.usage?.prompt_tokens || 0;
      const outputTokens = result.usage?.completion_tokens || 0;
      const usage = {
        inputTokens,
        outputTokens,
        thinkingTokens: 0,
        // Included so this usage record has the same fields as the one
        // returned by transcribeAudio.
        totalTokens: result.usage?.total_tokens || inputTokens + outputTokens,
      };
      const cost = formatCost(ratesFor(OPENAI_PRICING, model), usage);

      return {
        text,
        usage,
        cost,
        finishReason: choice?.finish_reason || null,
        raw: result,
      };
    },
  };
}