// ElevenLabs TTS backend — the cloud alternative to operator-hardware
// Kokoro, mirroring how the transcribe/analyze pipelines treat Gemini
// as a swappable cloud fallback to the Parakeet/vLLM hardware path.
//
// Selected when relay_tts_backend_preference routes here (or when
// Kokoro is unavailable and a fallback is allowed) AND
// relay_elevenlabs_api_key is configured. Like Kokoro, ElevenLabs
// handles full multi-sentence paragraphs in one call — one request per
// topic summary, no chunking.
//
// NOTE: This path is implemented to ElevenLabs' documented API but is
// UNTESTED against a live key (the operator hadn't supplied one at
// build time). The Kokoro path is the tested default. Before relying on
// ElevenLabs in production, set relay_elevenlabs_api_key +
// relay_elevenlabs_voice_id and smoke-test one /relay/tts call.
//
// Output: { audio: Buffer, contentType: "audio/mpeg", durationSeconds,
//           sentenceCount, attempts, voice, model } — durationSeconds
//           and sentenceCount are null (we don't decode the MP3 frame
//           count here; the Recap server measures it when
//           transcoding/caching). attempts is always 1: this backend
//           issues a single request and does not retry.

const API_BASE = "https://api.elevenlabs.io/v1";
const DEFAULT_MODEL = "eleven_turbo_v2_5";
const DEFAULT_TIMEOUT_MS = 120_000;

// Builds an Error tagged with the HTTP status to report for it.
function ttsError(message, status, options) {
  const e = new Error(message, options);
  e.status = status;
  return e;
}

/**
 * Creates the ElevenLabs text-to-speech backend.
 *
 * @param {object} [options]
 * @param {string} [options.apiKey=""] - ElevenLabs API key, sent as the
 *   `xi-api-key` header.
 * @param {string} [options.voiceId=""] - Default voice id, used when the
 *   caller of `synthesize` does not supply a usable `voice`.
 * @param {string} [options.model] - ElevenLabs model id; an empty value
 *   falls back to DEFAULT_MODEL.
 * @param {number} [options.timeoutMs] - Abort the request after this many
 *   milliseconds.
 * @returns {{hasTts: boolean, kind: string, synthesize: Function}}
 *   `hasTts` is true only when both `apiKey` and `voiceId` are set.
 *   `synthesize({ text, voice })` resolves to the Output object described
 *   in the file header, or rejects with an Error carrying `.status`:
 *   503 when the api key or voice is missing, 400 for empty or non-string
 *   input, 502 when the request itself fails (including timeout), or the
 *   upstream status when ElevenLabs answers with a non-2xx response.
 */
export function createElevenLabsBackend({
  apiKey = "",
  voiceId = "",
  model = DEFAULT_MODEL,
  timeoutMs = DEFAULT_TIMEOUT_MS,
} = {}) {
  const configured = !!(apiKey && voiceId);

  return {
    hasTts: configured,
    kind: "elevenlabs",

    async synthesize({ text, voice } = {}) {
      if (!apiKey) {
        throw ttsError(
          "ElevenLabs TTS is not configured — set relay_elevenlabs_api_key",
          503
        );
      }

      // `voice` from the client overrides the operator default voice id
      // when present (the Recap client may let a user pick a voice).
      if (voice && typeof voice !== "string") {
        throw ttsError("TTS voice must be a string", 400);
      }
      // Trim BEFORE falling back so a whitespace-only client voice does
      // not mask the configured default.
      const chosenVoice = (voice || "").trim() || (voiceId || "").trim();
      if (!chosenVoice) {
        throw ttsError(
          "ElevenLabs TTS has no voice — set relay_elevenlabs_voice_id",
          503
        );
      }

      if (text && typeof text !== "string") {
        throw ttsError("TTS input text must be a string", 400);
      }
      // Collapse every whitespace run (including newlines) to one space.
      const cleaned = (text || "").replace(/\s+/g, " ").trim();
      if (!cleaned) {
        throw ttsError("TTS input text is empty", 400);
      }

      const modelId = model || DEFAULT_MODEL;
      const url = `${API_BASE}/text-to-speech/${encodeURIComponent(chosenVoice)}`;

      let res;
      try {
        // Public-internet call — use the global fetch with full cert
        // validation (NOT lanFetch, which is scoped to LAN/Spark Control).
        res = await fetch(url, {
          method: "POST",
          headers: {
            "xi-api-key": apiKey,
            "Content-Type": "application/json",
            Accept: "audio/mpeg",
          },
          body: JSON.stringify({
            text: cleaned,
            model_id: modelId,
          }),
          signal: AbortSignal.timeout(timeoutMs),
        });
      } catch (err) {
        // Covers connection failures and the timeout abort alike; keep
        // the original error reachable through `cause`.
        throw ttsError(
          `ElevenLabs TTS network error: ${err?.message || err}`,
          502,
          { cause: err }
        );
      }

      if (!res.ok) {
        let body = "";
        try {
          body = await res.text();
        } catch {
          // Best effort: an unreadable error body still reports the status.
        }
        // Cap the echoed upstream body at 300 characters.
        throw ttsError(
          `ElevenLabs TTS ${res.status}: ${body.slice(0, 300)}`,
          res.status
        );
      }

      const audio = Buffer.from(await res.arrayBuffer());
      return {
        audio,
        contentType: "audio/mpeg",
        durationSeconds: null,
        sentenceCount: null,
        attempts: 1,
        voice: chosenVoice,
        model: modelId,
      };
    },
  };
}