110 lines
3.6 KiB
JavaScript
110 lines
3.6 KiB
JavaScript
// ElevenLabs TTS backend — the cloud alternative to operator-hardware
// Kokoro, mirroring how the transcribe/analyze pipelines treat Gemini
// as a swappable cloud fallback to the Parakeet/vLLM hardware path.
//
// Selected when relay_tts_backend_preference routes here (or when
// Kokoro is unavailable and a fallback is allowed) AND
// relay_elevenlabs_api_key is configured. Like Kokoro, ElevenLabs
// handles full multi-sentence paragraphs in one call — one request per
// topic summary, no chunking.
//
// NOTE: This path is implemented to ElevenLabs' documented API but is
// UNTESTED against a live key (the operator hadn't supplied one at
// build time). The Kokoro path is the tested default. Before relying on
// ElevenLabs in production, set relay_elevenlabs_api_key +
// relay_elevenlabs_voice_id and smoke-test one /relay/tts call.
//
// Output: { audio: Buffer, contentType: "audio/mpeg", durationSeconds,
//           sentenceCount, attempts, voice, model } — durationSeconds is
//           null (we don't decode the MP3 frame count here; the Recap
//           server measures it when transcoding/caching). sentenceCount
//           is also null and attempts is always 1, since the text goes
//           out as a single request.

/** Root of the ElevenLabs REST API (v1); endpoint paths are appended to it. */
const API_BASE = "https://api.elevenlabs.io/v1";

/** Model id sent as `model_id` when the caller does not pick one. */
const DEFAULT_MODEL = "eleven_turbo_v2_5";

/** Default request deadline in milliseconds (two minutes). */
const DEFAULT_TIMEOUT_MS = 2 * 60 * 1_000;
|
|
|
|
/**
 * Create the ElevenLabs text-to-speech backend.
 *
 * The returned object exposes `hasTts` (true only when both an API key and
 * a default voice id were supplied), `kind` ("elevenlabs"), and an async
 * `synthesize` method that sends the whole text in a single POST to
 * `${API_BASE}/text-to-speech/{voice}`.
 *
 * @param {object} [options]
 * @param {string} [options.apiKey=""] - ElevenLabs API key, sent as the
 *   `xi-api-key` header.
 * @param {string} [options.voiceId=""] - Operator default voice id, used
 *   when a request carries no usable `voice`.
 * @param {string} [options.model=DEFAULT_MODEL] - Model id sent as
 *   `model_id`; a falsy value falls back to DEFAULT_MODEL.
 * @param {number} [options.timeoutMs=DEFAULT_TIMEOUT_MS] - Deadline handed
 *   to `AbortSignal.timeout` for the request.
 * @returns {{ hasTts: boolean, kind: string, synthesize: Function }}
 */
export function createElevenLabsBackend({
  apiKey = "",
  voiceId = "",
  model = DEFAULT_MODEL,
  timeoutMs = DEFAULT_TIMEOUT_MS,
} = {}) {
  const configured = !!(apiKey && voiceId);

  // Every failure leaves this module as an Error carrying an HTTP-style
  // `status`; the underlying error (if any) is preserved as `cause`.
  const failure = (status, message, cause) => {
    const e =
      cause === undefined ? new Error(message) : new Error(message, { cause });
    e.status = status;
    return e;
  };

  // Non-strings count as "not provided" instead of crashing on `.trim()`.
  const trimmedOrEmpty = (value) =>
    typeof value === "string" ? value.trim() : "";

  return {
    hasTts: configured,
    kind: "elevenlabs",

    /**
     * Synthesize speech for `text`.
     *
     * @param {object} [request]
     * @param {string} [request.text] - Text to speak; whitespace runs are
     *   collapsed to single spaces before sending.
     * @param {string} [request.voice] - Optional voice id overriding the
     *   operator default.
     * @returns {Promise<{ audio: Buffer, contentType: string,
     *   durationSeconds: null, sentenceCount: null, attempts: number,
     *   voice: string, model: string }>}
     * @throws {Error} `status` 503 when the API key or voice is missing,
     *   400 when the text is empty, 502 on transport failures, or the
     *   upstream HTTP status when ElevenLabs answers with a non-2xx.
     */
    async synthesize({ text, voice } = {}) {
      if (!apiKey) {
        throw failure(
          503,
          "ElevenLabs TTS is not configured — set relay_elevenlabs_api_key"
        );
      }

      // `voice` from the client overrides the operator default voice id
      // when present (the Recap client may let a user pick a voice). Each
      // candidate is trimmed BEFORE the fallback so a blank override does
      // not mask a configured default.
      const chosenVoice = trimmedOrEmpty(voice) || trimmedOrEmpty(voiceId);
      if (!chosenVoice) {
        throw failure(
          503,
          "ElevenLabs TTS has no voice — set relay_elevenlabs_voice_id"
        );
      }

      const cleaned =
        typeof text === "string" ? text.replace(/\s+/g, " ").trim() : "";
      if (!cleaned) {
        throw failure(400, "TTS input text is empty");
      }

      const modelId = model || DEFAULT_MODEL;
      const url = `${API_BASE}/text-to-speech/${encodeURIComponent(chosenVoice)}`;

      let res;
      try {
        // Public-internet call — use the global fetch with full cert
        // validation (NOT lanFetch, which is scoped to LAN/Spark Control).
        res = await fetch(url, {
          method: "POST",
          headers: {
            "xi-api-key": apiKey,
            "Content-Type": "application/json",
            Accept: "audio/mpeg",
          },
          body: JSON.stringify({
            text: cleaned,
            model_id: modelId,
          }),
          signal: AbortSignal.timeout(timeoutMs),
        });
      } catch (err) {
        throw failure(
          502,
          `ElevenLabs TTS network error: ${err?.message || err}`,
          err
        );
      }

      if (!res.ok) {
        let detail = "";
        try {
          detail = await res.text();
        } catch {
          // Best effort only: the upstream status is still reported even
          // when the error body cannot be read.
        }
        throw failure(
          res.status,
          `ElevenLabs TTS ${res.status}: ${detail.slice(0, 300)}`
        );
      }

      let audio;
      try {
        audio = Buffer.from(await res.arrayBuffer());
      } catch (err) {
        // The connection can still drop (or the timeout fire) while the
        // body streams; report it like any other transport failure.
        throw failure(
          502,
          `ElevenLabs TTS network error: ${err?.message || err}`,
          err
        );
      }

      return {
        audio,
        contentType: "audio/mpeg",
        durationSeconds: null,
        sentenceCount: null,
        attempts: 1,
        voice: chosenVoice,
        model: modelId,
      };
    },
  };
}
|