// Whisper provider — transcription via any OpenAI-Audio-Transcription-API-
// compatible endpoint. OpenAI's audio.transcriptions.create wire format
// is the de facto standard; whisper.cpp's HTTP server, faster-whisper-
// server, NVIDIA Parakeet behind speaches, Groq's Whisper API, and most
// other self-hosted implementations honor it. So this provider is
// effectively "OpenAI for transcription with a custom baseURL" —
// distinct from the `openai` provider so users can wire a self-hosted
// transcription engine alongside their cloud OpenAI key (used for GPT
// analysis).
//
// Implementation note: although the wire format matches OpenAI's, this
// provider has its OWN transcribeAudio (rather than reusing the OpenAI
// provider's). Reasons:
//   - Log messages should say "Whisper at host:port (model)" not
//     "OpenAI Whisper" — Parakeet/whisper.cpp behind a custom URL is
//     not "OpenAI" and showing that in logs is misleading.
//   - No 25 MB chunk cap. Self-hosted Whisper / Parakeet typically
//     handles much larger inputs than the OpenAI cloud API.
//   - Zero per-minute cost reporting (self-hosted by definition).

import { createReadStream } from "fs";
import OpenAI from "openai";
import { retryAPI, formatTime } from "../util.js";

const FALLBACK_MODEL = "whisper-1";

// HTTP statuses for which retrying with the bare request shape cannot help:
// authentication / authorization failures and rate limiting have nothing to
// do with verbose_json or timestamp_granularities, so the original error is
// surfaced instead of spending another upload on a request that will fail
// the same way (or succeed later, but needlessly without timestamps).
const NO_FALLBACK_STATUSES = new Set([401, 403, 429]);

// Pretty-print the endpoint for log messages: strip protocol, strip any
// `user:pass@` userinfo (credentials must never reach progress logs),
// ignore a trailing /vN suffix, trim trailing slash.
function toDisplayHost(baseURL) {
  return baseURL
    .replace(/^https?:\/\//, "")
    .replace(/^[^/]*@/, "")
    .replace(/\/v\d+\/?$/, "")
    .replace(/\/$/, "");
}

// Render an API result as transcript text: one "[time] text" line per
// segment when segments are present, otherwise a single "[0:00]" line built
// from the plain text. Tolerates a missing result and a result that is
// itself a string (a server answering the bare request with a non-JSON
// body may surface that way — TODO confirm against the SDK in use).
function formatTranscript(result) {
  const segments = Array.isArray(result?.segments) ? result.segments : [];
  if (segments.length > 0) {
    return segments
      .map((s) => `[${formatTime(s.start || 0)}] ${(s.text || "").trim()}`)
      .join("\n");
  }
  const plainText = typeof result === "string" ? result : result?.text;
  return `[0:00] ${(plainText || "").trim()}`;
}

/**
 * Build a transcription-only provider backed by an OpenAI-Audio-compatible
 * HTTP endpoint.
 *
 * @param {object} [options]
 * @param {string} [options.apiKey] - API key; when falsy the sentinel
 *   "no-auth" is sent so the SDK's authorization header is well-formed.
 * @param {string} options.baseURL - Endpoint root, e.g. http://localhost:8000/v1.
 * @param {string[]} [options.defaultModels=[]] - Models to advertise; when
 *   empty (or not an array) only FALLBACK_MODEL is listed.
 * @param {number} [options.timeoutMs=900000] - Passed to the SDK client as `timeout`.
 * @returns {object} Provider with `name`, `capabilities`,
 *   `listTranscriptionModels`, `listAnalysisModels`, `transcribeAudio`
 *   and `analyzeText`.
 * @throws {Error} When `baseURL` is missing.
 */
export function createWhisperProvider({
  apiKey,
  baseURL,
  defaultModels = [],
  timeoutMs = 900_000,
} = {}) {
  if (!baseURL) {
    throw new Error(
      "createWhisperProvider: baseURL is required (e.g. http://localhost:8000/v1)"
    );
  }

  // Self-hosted Whisper servers commonly skip auth — pass a sentinel
  // string so the SDK's authorization header is well-formed.
  const client = new OpenAI({
    apiKey: apiKey || "no-auth",
    baseURL,
    timeout: timeoutMs,
  });

  const displayHost = toDisplayHost(baseURL);

  // An explicit `null` bypasses the parameter default, so normalise here
  // rather than crashing on `.length`.
  const models = Array.isArray(defaultModels) ? defaultModels : [];

  return {
    name: "whisper",
    capabilities: {
      transcribe: true,
      analyze: false,
      listModels: models.length > 0,
    },

    /** @returns {string[]} A copy of the configured models, or [FALLBACK_MODEL]. */
    listTranscriptionModels() {
      return models.length > 0 ? [...models] : [FALLBACK_MODEL];
    },

    /** @returns {string[]} Always empty — this provider does no analysis. */
    listAnalysisModels() {
      return [];
    },

    /**
     * Upload an audio file and return its transcript.
     *
     * @param {object} params
     * @param {string} params.filePath - Path of the audio file to upload.
     * @param {string} [params.model=FALLBACK_MODEL] - Model name sent to the server.
     * @param {number} [params.offsetSeconds=0] - Only used to annotate
     *   progress / retry messages in this provider.
     * @param {(msg: string) => void} [params.onProgress] - Progress sink.
     * @param {AbortSignal} [params.signal] - Forwarded to the SDK request options.
     * @returns {Promise<object>} `{ text, usage, cost, finishReason,
     *   blockReason, raw }`; usage and cost are always zero.
     * @throws Whatever the request rejects with when no fallback applies,
     *   or the fallback request's own error when that fails too.
     */
    async transcribeAudio({
      filePath,
      model = FALLBACK_MODEL,
      offsetSeconds = 0,
      onProgress = () => {},
      signal,
    }) {
      // NOTE(review): offsetSeconds is not added to the segment timestamps
      // below — confirm the caller shifts them if chunked input is ever
      // routed through this provider.
      const hasOffset = offsetSeconds > 0;
      const chunkSuffix = hasOffset ? ` (chunk@${formatTime(offsetSeconds)})` : "";

      // One transcription call wrapped in retryAPI. A fresh read stream is
      // created on every attempt because a consumed stream cannot be re-sent.
      const requestTranscription = (extraParams, labelTag) =>
        retryAPI(
          () =>
            client.audio.transcriptions.create(
              { file: createReadStream(filePath), model, ...extraParams },
              signal ? { signal } : undefined
            ),
          {
            retries: 2,
            delayMs: 5000,
            label: `${model} transcription${labelTag}${chunkSuffix}`,
            log: (msg) => onProgress(msg),
          }
        );

      // Use the model + host directly in the log — "Whisper" was
      // misleading when a user wires up Parakeet (or any non-Whisper
      // model) at a custom endpoint.
      onProgress(
        `Uploading audio${hasOffset ? ` (offset ${formatTime(offsetSeconds)})` : ""} to ${model} at ${displayHost}...`
      );
      const start = Date.now();

      // Try the rich request first (verbose_json + per-segment
      // timestamps — needed to render the transcript with timestamps
      // and let the analysis step build sections). If the wrapper
      // rejects those params (some Whisper-API-compatible servers,
      // including some Parakeet wrappers, don't implement them and
      // return 500), retry once with the bare-bones request shape.
      let result;
      let usedFallbackShape = false;
      try {
        result = await requestTranscription(
          { response_format: "verbose_json", timestamp_granularities: ["segment"] },
          ""
        );
      } catch (richErr) {
        const richStatus = richErr?.status ?? 0;
        // Only fall back on 4xx / 5xx where the params themselves are
        // the likely culprit. Connection / timeout errors (no status) and
        // auth / rate-limit statuses get thrown unchanged.
        const paramsMayBeCulprit =
          richStatus >= 400 &&
          richStatus < 600 &&
          !NO_FALLBACK_STATUSES.has(richStatus);
        if (!paramsMayBeCulprit) {
          throw richErr;
        }
        onProgress(
          `Rich-request failed (status ${richStatus}); retrying with bare request shape (no verbose_json, no segment timestamps)...`
        );
        usedFallbackShape = true;
        result = await requestTranscription({}, " (fallback)");
      }

      const elapsed = ((Date.now() - start) / 1000).toFixed(1);
      onProgress(
        `${model} transcription complete in ${elapsed}s${usedFallbackShape ? " (bare request — no segment timestamps)" : ""}`
      );

      const text = formatTranscript(result);

      // Self-hosted Whisper / Parakeet are free at the API layer
      // (you've already paid for the hardware), so zero cost.
      const cost = {
        inputTokens: 0,
        outputTokens: 0,
        thinkingTokens: 0,
        totalTokens: 0,
        inputCost: "0.000000",
        outputCost: "0.000000",
        thinkingCost: "0.000000",
        totalCost: "0.000000",
        totalCostDisplay: "$0.0000",
      };

      return {
        text,
        usage: { inputTokens: 0, outputTokens: 0, thinkingTokens: 0, totalTokens: 0 },
        cost,
        finishReason: null,
        blockReason: "none",
        raw: result,
      };
    },

    /** Always rejects: analysis is not supported by this provider. */
    async analyzeText() {
      throw new Error(
        "Whisper provider is transcription-only. Use a different provider (Gemini / Anthropic / OpenAI / Ollama / OpenAI-compatible) for analysis."
      );
    },
  };
}