Files
recap-relay/server/backends/elevenlabs.js
T

110 lines
3.6 KiB
JavaScript

// ElevenLabs TTS backend — the cloud alternative to operator-hardware
// Kokoro, mirroring how the transcribe/analyze pipelines treat Gemini
// as a swappable cloud fallback to the Parakeet/vLLM hardware path.
//
// Selected when relay_tts_backend_preference routes here (or when
// Kokoro is unavailable and a fallback is allowed) AND
// relay_elevenlabs_api_key is configured. Like Kokoro, ElevenLabs
// handles full multi-sentence paragraphs in one call — one request per
// topic summary, no chunking.
//
// NOTE: This path is implemented to ElevenLabs' documented API but is
// UNTESTED against a live key (the operator hadn't supplied one at
// build time). The Kokoro path is the tested default. Before relying on
// ElevenLabs in production, set relay_elevenlabs_api_key +
// relay_elevenlabs_voice_id and smoke-test one /relay/tts call.
//
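// Output: { audio: Buffer, contentType: "audio/mpeg", durationSeconds,
// sentenceCount, attempts, voice, model } — durationSeconds and
// sentenceCount are always null and attempts is always 1 (one request,
// no chunking, no retry). We don't decode the MP3 frame count here; the
// Recap server measures the duration when transcoding/caching.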
const API_BASE = "https://api.elevenlabs.io/v1";
const DEFAULT_MODEL = "eleven_turbo_v2_5";
const DEFAULT_TIMEOUT_MS = 120_000;

/**
 * Build an Error that carries an HTTP-style `status` property, optionally
 * chaining the underlying failure as `cause`.
 *
 * @param {string} message - Human-readable error message.
 * @param {number} status - Value assigned to `error.status`.
 * @param {unknown} [cause] - Underlying error, when there is one.
 * @returns {Error} The decorated error (not thrown here).
 */
function ttsError(message, status, cause) {
  const err =
    cause === undefined ? new Error(message) : new Error(message, { cause });
  err.status = status;
  return err;
}

/**
 * Create the ElevenLabs text-to-speech backend.
 *
 * @param {object} [options]
 * @param {string} [options.apiKey=""] - ElevenLabs API key, sent as `xi-api-key`.
 * @param {string} [options.voiceId=""] - Default voice id used when the caller
 *   supplies no usable `voice`.
 * @param {string} [options.model=DEFAULT_MODEL] - Model id sent as `model_id`;
 *   a falsy value falls back to DEFAULT_MODEL.
 * @param {number} [options.timeoutMs=DEFAULT_TIMEOUT_MS] - Abort timeout for
 *   the whole request (headers and body).
 * @returns {{ hasTts: boolean, kind: "elevenlabs", synthesize: Function }}
 *   `hasTts` is true only when both `apiKey` and `voiceId` are set.
 *   `synthesize({ text, voice })` resolves to
 *   `{ audio, contentType, durationSeconds, sentenceCount, attempts, voice, model }`
 *   and rejects with an Error whose `status` is 503 (missing key or voice),
 *   400 (empty text), 502 (transport failure), or the upstream HTTP status
 *   (non-2xx response).
 */
export function createElevenLabsBackend({
  apiKey = "",
  voiceId = "",
  model = DEFAULT_MODEL,
  timeoutMs = DEFAULT_TIMEOUT_MS,
} = {}) {
  const configured = Boolean(apiKey && voiceId);
  const resolvedModel = model || DEFAULT_MODEL;

  return {
    hasTts: configured,
    kind: "elevenlabs",

    async synthesize({ text, voice } = {}) {
      if (!apiKey) {
        throw ttsError(
          "ElevenLabs TTS is not configured — set relay_elevenlabs_api_key",
          503
        );
      }

      // `voice` from the client overrides the operator default voice id
      // when present. Trim BEFORE choosing so a blank or whitespace-only
      // client value falls back to the default instead of masking it, and
      // ignore non-string values rather than crashing on `.trim()`.
      const requestedVoice = typeof voice === "string" ? voice.trim() : "";
      const defaultVoice = typeof voiceId === "string" ? voiceId.trim() : "";
      const chosenVoice = requestedVoice || defaultVoice;
      if (!chosenVoice) {
        throw ttsError(
          "ElevenLabs TTS has no voice — set relay_elevenlabs_voice_id",
          503
        );
      }

      // Collapse every whitespace run to a single space. Non-string input
      // is treated as empty so it yields a 400 instead of a TypeError.
      const cleaned =
        typeof text === "string" ? text.replace(/\s+/g, " ").trim() : "";
      if (!cleaned) {
        throw ttsError("TTS input text is empty", 400);
      }

      const url = `${API_BASE}/text-to-speech/${encodeURIComponent(chosenVoice)}`;

      let res;
      try {
        // Public-internet call — use the global fetch with full cert
        // validation (NOT lanFetch, which is scoped to LAN/Spark Control).
        res = await fetch(url, {
          method: "POST",
          headers: {
            "xi-api-key": apiKey,
            "Content-Type": "application/json",
            Accept: "audio/mpeg",
          },
          body: JSON.stringify({
            text: cleaned,
            model_id: resolvedModel,
          }),
          signal: AbortSignal.timeout(timeoutMs),
        });
      } catch (err) {
        throw ttsError(
          `ElevenLabs TTS network error: ${err?.message || err}`,
          502,
          err
        );
      }

      if (!res.ok) {
        let body = "";
        try {
          body = await res.text();
        } catch {
          // Best effort: the status code alone is still worth reporting
          // when the error body cannot be read.
        }
        throw ttsError(
          `ElevenLabs TTS ${res.status}: ${body.slice(0, 300)}`,
          res.status
        );
      }

      // Reading the body can still fail after the headers arrived
      // (connection drop, or the timeout signal firing mid-download).
      // Report that the same way as any other transport failure.
      let audio;
      try {
        audio = Buffer.from(await res.arrayBuffer());
      } catch (err) {
        throw ttsError(
          `ElevenLabs TTS network error: ${err?.message || err}`,
          502,
          err
        );
      }

      return {
        audio,
        contentType: "audio/mpeg",
        durationSeconds: null,
        sentenceCount: null,
        attempts: 1,
        voice: chosenVoice,
        model: resolvedModel,
      };
    },
  };
}