// OpenAI provider โ€” analysis (chat.completions) + transcription (Whisper). // // Whisper (whisper-1) has a 25 MB per-request file size cap. The // orchestration layer's audio chunking is currently sized for Gemini's // much larger cap; long podcasts at high bitrate can push individual // chunks over Whisper's cap. We surface that as a clear error rather // than silently truncating โ€” users can mix providers (Whisper for // short audio, Gemini for long) per-request via the picker. // // Pricing values are placeholders โ€” verify against current OpenAI // pricing before billing-sensitive use. import { createReadStream, statSync } from "fs"; import OpenAI from "openai"; import { retryAPI, formatTime } from "../util.js"; import { formatCost, ratesFor } from "./cost.js"; // Per-1M-token rates in USD for chat.completions models. // VERIFY against current OpenAI pricing before relying on these for billing. export const OPENAI_PRICING = { "gpt-4o": { input: 2.50, output: 10.00 }, "gpt-4o-mini": { input: 0.15, output: 0.60 }, "gpt-4-turbo": { input: 10.00, output: 30.00 }, "o3-mini": { input: 1.10, output: 4.40 }, // Fallback for unknown / future models. "default": { input: 2.50, output: 10.00 }, }; // Whisper bills per minute of audio, not per token. The cost record // reuses the token cost shape, but stores minute-based math in the // `inputCost` field. const WHISPER_USD_PER_MINUTE = 0.006; const WHISPER_MAX_BYTES = 25 * 1024 * 1024; // OpenAI hard limit export const OPENAI_ANALYSIS_MODELS = [ "gpt-4o", "gpt-4o-mini", "gpt-4-turbo", "o3-mini", ]; export const OPENAI_TRANSCRIPTION_MODELS = ["whisper-1"]; const ANALYSIS_MAX_TOKENS = 16000; export function createOpenAIProvider({ apiKey, baseURL, timeoutMs = 900_000, } = {}) { if (!apiKey) { throw new Error("createOpenAIProvider: apiKey is required"); } const client = new OpenAI({ apiKey, baseURL: baseURL || undefined, timeout: timeoutMs, }); return { name: "openai", capabilities: { transcribe: true, analyze: true, listModels: true, }, listAnalysisModels() { return [...OPENAI_ANALYSIS_MODELS]; }, listTranscriptionModels() { return [...OPENAI_TRANSCRIPTION_MODELS]; }, // Whisper-based transcription. Returns the same [MM:SS] formatted // text shape Gemini produces, so the orchestration layer's // parseTimestampedTranscript() works unchanged. async transcribeAudio({ filePath, model = "whisper-1", offsetSeconds = 0, onProgress = () => {}, signal, }) { let bytes = 0; try { bytes = statSync(filePath).size; } catch {} if (bytes > WHISPER_MAX_BYTES) { const sizeMB = (bytes / (1024 * 1024)).toFixed(1); throw new Error( `OpenAI Whisper file size limit is 25 MB. This chunk is ${sizeMB} MB. Try Gemini for transcription, or split the audio more aggressively.` ); } onProgress( `Uploading audio${offsetSeconds > 0 ? ` (offset ${formatTime(offsetSeconds)})` : ""} to OpenAI Whisper (${model})...` ); const start = Date.now(); const result = await retryAPI( () => client.audio.transcriptions.create( { file: createReadStream(filePath), model, response_format: "verbose_json", timestamp_granularities: ["segment"], }, signal ? { signal } : undefined ), { retries: 3, delayMs: 5000, label: `Whisper transcription${offsetSeconds > 0 ? ` (chunk@${formatTime(offsetSeconds)})` : ""}`, log: (msg) => onProgress(msg), } ); const elapsed = ((Date.now() - start) / 1000).toFixed(1); onProgress(`Whisper transcription complete in ${elapsed}s`); const segments = Array.isArray(result.segments) ? result.segments : []; const lines = segments.length ? segments.map((s) => `[${formatTime(s.start || 0)}] ${(s.text || "").trim()}`) : [`[0:00] ${(result.text || "").trim()}`]; const text = lines.join("\n"); // Whisper bills by audio duration in minutes, not tokens. const durationSeconds = result.duration || 0; const minutes = durationSeconds / 60; const usdCost = minutes * WHISPER_USD_PER_MINUTE; const cost = { inputTokens: 0, outputTokens: 0, thinkingTokens: 0, totalTokens: 0, inputCost: usdCost.toFixed(6), outputCost: "0.000000", thinkingCost: "0.000000", totalCost: usdCost.toFixed(6), totalCostDisplay: usdCost < 0.01 ? `$${(usdCost * 100).toFixed(3)}ยข` : `$${usdCost.toFixed(4)}`, }; return { text, usage: { inputTokens: 0, outputTokens: 0, thinkingTokens: 0, totalTokens: 0 }, cost, finishReason: null, blockReason: "none", raw: result, }; }, async analyzeText({ prompt, model, onProgress = () => {}, retries = 2, signal, }) { const result = await retryAPI( () => client.chat.completions.create( { model, max_tokens: ANALYSIS_MAX_TOKENS, messages: [{ role: "user", content: prompt }], }, signal ? { signal } : undefined ), { retries, delayMs: 5000, label: "OpenAI analysis", log: (msg) => onProgress(msg), } ); const choice = result.choices?.[0]; const text = choice?.message?.content || ""; const usage = { inputTokens: result.usage?.prompt_tokens || 0, outputTokens: result.usage?.completion_tokens || 0, thinkingTokens: 0, }; const cost = formatCost(ratesFor(OPENAI_PRICING, model), usage); return { text, usage, cost, finishReason: choice?.finish_reason || null, raw: result, }; }, }; }