// recap/server/providers/whisper.js

// Whisper provider — transcription via any OpenAI-Audio-Transcription-API-
// compatible endpoint. OpenAI's audio.transcriptions.create wire format
// is the de facto standard; whisper.cpp's HTTP server, faster-whisper-
// server, NVIDIA Parakeet behind speaches, Groq's Whisper API, and most
// other self-hosted implementations honor it. So this provider is
// effectively "OpenAI for transcription with a custom baseURL" —
// distinct from the `openai` provider so users can wire a self-hosted
// transcription engine alongside their cloud OpenAI key (used for GPT
// analysis).
//
// Implementation note: although the wire format matches OpenAI's, this
// provider has its OWN transcribeAudio (rather than reusing the OpenAI
// provider's). Reasons:
//   - Log messages should say "Whisper at host:port (model)" not
//     "OpenAI Whisper" — Parakeet/whisper.cpp behind a custom URL is
//     not "OpenAI" and showing that in logs is misleading.
//   - No 25 MB chunk cap. Self-hosted Whisper / Parakeet typically
//     handles much larger inputs than the OpenAI cloud API.
//   - Zero per-minute cost reporting (self-hosted by definition).

import { createReadStream } from "fs";
import OpenAI from "openai";
import { retryAPI, formatTime } from "../util.js";

// Model id advertised by listTranscriptionModels() when no default models
// are configured, and the last-resort model for transcribeAudio() when the
// caller does not name one.
const FALLBACK_MODEL = "whisper-1";

/**
 * Create a transcription-only provider for any endpoint that speaks the
 * OpenAI audio-transcription wire format (requests go through the OpenAI
 * SDK client pointed at `baseURL`).
 *
 * @param {object} [options]
 * @param {string} [options.apiKey] - API key; the sentinel "no-auth" is sent
 *   when omitted so the SDK's authorization header stays well-formed.
 * @param {string} options.baseURL - Endpoint base URL (required).
 * @param {string[]} [options.defaultModels=[]] - Model ids to advertise. The
 *   first non-empty string entry is also the model transcribeAudio() uses
 *   when the caller does not name one. A non-array value is treated as empty.
 * @param {number} [options.timeoutMs=900000] - Passed to the SDK client as
 *   its `timeout` option.
 * @returns {object} Provider exposing `name`, `capabilities`,
 *   `listTranscriptionModels`, `listAnalysisModels`, `transcribeAudio` and
 *   `analyzeText`.
 * @throws {Error} When `baseURL` is missing.
 */
export function createWhisperProvider({
  apiKey,
  baseURL,
  defaultModels = [],
  timeoutMs = 900_000,
} = {}) {
  if (!baseURL) {
    throw new Error(
      "createWhisperProvider: baseURL is required (e.g. http://localhost:8000/v1)"
    );
  }

  // The parameter default only covers `undefined`; an explicit null (or any
  // other non-array) must not crash the `.length` checks below.
  const configuredModels = Array.isArray(defaultModels) ? defaultModels : [];
  // A server configured with its own model list may not know FALLBACK_MODEL
  // at all, so prefer the first configured id when the caller names no model.
  const defaultModel =
    configuredModels.find((m) => typeof m === "string" && m !== "") ??
    FALLBACK_MODEL;

  // Self-hosted Whisper servers commonly skip auth — pass a sentinel
  // string so the SDK's authorization header is well-formed.
  const client = new OpenAI({
    apiKey: apiKey || "no-auth",
    baseURL,
    timeout: timeoutMs,
  });

  // Pretty-print the host for log messages: strip protocol, drop any
  // `user:pass@` credentials (they must never reach progress logs), ignore
  // the /v1 suffix, trim the trailing slash.
  const displayHost = String(baseURL)
    .replace(/^https?:\/\//, "")
    .replace(/^[^/?#]*@/, "")
    .replace(/\/v\d+\/?$/, "")
    .replace(/\/$/, "");

  // HTTP statuses that the request shape cannot be responsible for (auth
  // failures, rate limiting): retrying with fewer params would only
  // multiply failing requests.
  const statusesUnrelatedToShape = new Set([401, 403, 429]);

  // Send one transcription request through retryAPI. The read stream is
  // created inside the callback so every invocation uploads a fresh stream.
  const sendTranscription = ({ filePath, model, extraParams, signal, label, report }) =>
    retryAPI(
      () =>
        client.audio.transcriptions.create(
          { file: createReadStream(filePath), model, ...extraParams },
          signal ? { signal } : undefined
        ),
      {
        retries: 2,
        delayMs: 5000,
        label,
        log: (msg) => report(msg),
      }
    );

  return {
    name: "whisper",

    capabilities: {
      transcribe: true,
      analyze: false,
      listModels: configuredModels.length > 0,
    },

    /** @returns {string[]} A copy of the configured models, or [FALLBACK_MODEL]. */
    listTranscriptionModels() {
      return configuredModels.length > 0 ? [...configuredModels] : [FALLBACK_MODEL];
    },

    /** @returns {Array} Always empty — this provider cannot analyze. */
    listAnalysisModels() {
      return [];
    },

    /**
     * Transcribe one audio file and return the transcript as
     * `[timestamp] text` lines.
     *
     * @param {object} args
     * @param {string} args.filePath - Path of the audio file to upload.
     * @param {string} [args.model] - Model id; defaults to the first
     *   configured default model, else FALLBACK_MODEL.
     * @param {number} [args.offsetSeconds=0] - Chunk offset; used in
     *   progress/log text only.
     * @param {(msg: string) => void} [args.onProgress] - Progress sink.
     * @param {AbortSignal} [args.signal] - Forwarded to the SDK request.
     * @returns {Promise<object>} `{ text, usage, cost, finishReason,
     *   blockReason, raw }` with zeroed usage and cost.
     * @throws {TypeError} When `filePath` is missing.
     * @throws Whatever the request rejects with when it fails and the
     *   bare-shape fallback does not apply or also fails.
     */
    async transcribeAudio({
      filePath,
      model,
      offsetSeconds = 0,
      onProgress = () => {},
      signal,
    } = {}) {
      // Fail fast: otherwise the bad path only surfaces from inside the
      // retry wrapper as an opaque createReadStream error.
      if (!filePath) {
        throw new TypeError("transcribeAudio: filePath is required");
      }
      const report = typeof onProgress === "function" ? onProgress : () => {};
      const modelName = model || defaultModel;

      // Use the model + host directly in the log — "Whisper" was
      // misleading when a user wires up Parakeet (or any non-Whisper
      // model) at a custom endpoint.
      report(
        `Uploading audio${offsetSeconds > 0 ? ` (offset ${formatTime(offsetSeconds)})` : ""} to ${modelName} at ${displayHost}...`
      );
      const chunkSuffix =
        offsetSeconds > 0 ? ` (chunk@${formatTime(offsetSeconds)})` : "";
      const start = Date.now();

      // Try the rich request first (verbose_json + per-segment
      // timestamps — needed to render the transcript with timestamps
      // and let the analysis step build sections). If the server rejects
      // those params (some Whisper-API-compatible servers, including some
      // Parakeet wrappers, don't implement them and return 500), retry
      // once with the bare-bones request shape.
      let result;
      let usedFallbackShape = false;
      try {
        result = await sendTranscription({
          filePath,
          model: modelName,
          extraParams: {
            response_format: "verbose_json",
            timestamp_granularities: ["segment"],
          },
          signal,
          label: `${modelName} transcription${chunkSuffix}`,
          report,
        });
      } catch (richErr) {
        const richStatus = richErr?.status || 0;
        const isHttpError = richStatus >= 400 && richStatus < 600;
        // Connection / timeout errors carry no HTTP status and are
        // rethrown, as are statuses the params cannot have caused and any
        // failure after the caller has already aborted.
        if (
          !isHttpError ||
          statusesUnrelatedToShape.has(richStatus) ||
          signal?.aborted
        ) {
          throw richErr;
        }
        report(
          `Rich-request failed (status ${richStatus}); retrying with bare request shape (no verbose_json, no segment timestamps)...`
        );
        usedFallbackShape = true;
        result = await sendTranscription({
          filePath,
          model: modelName,
          extraParams: {},
          signal,
          label: `${modelName} transcription (fallback)${chunkSuffix}`,
          report,
        });
      }
      const elapsed = ((Date.now() - start) / 1000).toFixed(1);
      report(
        `${modelName} transcription complete in ${elapsed}s${usedFallbackShape ? " (bare request — no segment timestamps)" : ""}`
      );

      // Bare responses vary between servers: tolerate a plain-string body
      // or a missing/empty payload instead of crashing on property access.
      const segments = Array.isArray(result?.segments) ? result.segments : [];
      const plainText =
        typeof result === "string" ? result : String(result?.text || "");
      // NOTE(review): offsetSeconds is not added to the segment timestamps
      // here — confirm the caller shifts chunk timestamps itself.
      const lines = segments.length
        ? segments.map(
            (s) => `[${formatTime(s?.start || 0)}] ${String(s?.text || "").trim()}`
          )
        : [`[0:00] ${plainText.trim()}`];
      const text = lines.join("\n");

      // Self-hosted Whisper / Parakeet are free at the API layer
      // (you've already paid for the hardware), so zero cost.
      const cost = {
        inputTokens: 0,
        outputTokens: 0,
        thinkingTokens: 0,
        totalTokens: 0,
        inputCost: "0.000000",
        outputCost: "0.000000",
        thinkingCost: "0.000000",
        totalCost: "0.000000",
        totalCostDisplay: "$0.0000",
      };

      return {
        text,
        usage: { inputTokens: 0, outputTokens: 0, thinkingTokens: 0, totalTokens: 0 },
        cost,
        finishReason: null,
        blockReason: "none",
        raw: result,
      };
    },

    /** Always rejects: this provider is transcription-only. */
    async analyzeText() {
      throw new Error(
        "Whisper provider is transcription-only. Use a different provider (Gemini / Anthropic / OpenAI / Ollama / OpenAI-compatible) for analysis."
      );
    },
  };
}