// recap-relay/server/backends/elevenlabs.js

// ElevenLabs TTS backend — the cloud alternative to operator-hardware
// Kokoro, mirroring how the transcribe/analyze pipelines treat Gemini
// as a swappable cloud fallback to the Parakeet/vLLM hardware path.
//
// Selected when relay_tts_backend_preference routes here (or when
// Kokoro is unavailable and a fallback is allowed) AND
// relay_elevenlabs_api_key is configured. Like Kokoro, ElevenLabs
// handles full multi-sentence paragraphs in one call — one request per
// topic summary, no chunking.
//
// NOTE: This path is implemented to ElevenLabs' documented API but is
// UNTESTED against a live key (the operator hadn't supplied one at
// build time). The Kokoro path is the tested default. Before relying on
// ElevenLabs in production, set relay_elevenlabs_api_key +
// relay_elevenlabs_voice_id and smoke-test one /relay/tts call.
//
// Output: { audio: Buffer, contentType: "audio/mpeg", durationSeconds,
//           sentenceCount, attempts, voice, model } — durationSeconds
//           and sentenceCount are null (we don't decode the MP3 frame
//           count here; the Recap server measures it when
//           transcoding/caching) and attempts is always 1.

// Root of the ElevenLabs REST API (v1); endpoint paths are appended to it.
const API_BASE = "https://api.elevenlabs.io/v1";
// Model id sent as `model_id` when the caller does not supply one.
const DEFAULT_MODEL = "eleven_turbo_v2_5";
// Per-request timeout: two minutes, expressed in milliseconds.
const DEFAULT_TIMEOUT_MS = 2 * 60 * 1000;

/**
 * Build an Error tagged with the HTTP status the relay should answer with.
 *
 * @param {string} message - Human-readable failure description.
 * @param {number} status - HTTP status stored on `err.status`.
 * @param {unknown} [cause] - Underlying error, kept on `err.cause` when given.
 * @returns {Error} The tagged error (not thrown here).
 */
function elevenLabsError(message, status, cause) {
  const err =
    cause === undefined ? new Error(message) : new Error(message, { cause });
  err.status = status;
  return err;
}

/**
 * Return the first candidate that is a non-blank string, trimmed.
 *
 * @param {...unknown} candidates - Values in priority order.
 * @returns {string} The trimmed winner, or "" when none qualifies.
 */
function firstNonBlank(...candidates) {
  for (const candidate of candidates) {
    if (typeof candidate !== "string") continue;
    const trimmed = candidate.trim();
    if (trimmed) return trimmed;
  }
  return "";
}

/**
 * Create the ElevenLabs text-to-speech backend.
 *
 * @param {object} [options]
 * @param {string} [options.apiKey] - ElevenLabs API key, sent as `xi-api-key`.
 * @param {string} [options.voiceId] - Operator default voice id.
 * @param {string} [options.model] - ElevenLabs model id; falls back to
 *   DEFAULT_MODEL when empty.
 * @param {number|string} [options.timeoutMs] - Per-request timeout in
 *   milliseconds; anything that is not a positive number falls back to
 *   DEFAULT_TIMEOUT_MS.
 * @returns {{ hasTts: boolean, kind: string, synthesize: Function }}
 *   `hasTts` is true only when both `apiKey` and `voiceId` are set.
 */
export function createElevenLabsBackend({
  apiKey = "",
  voiceId = "",
  model = DEFAULT_MODEL,
  timeoutMs = DEFAULT_TIMEOUT_MS,
} = {}) {
  const configured = !!(apiKey && voiceId);
  const modelId = model || DEFAULT_MODEL;

  // AbortSignal.timeout() rejects strings, fractions, and out-of-range
  // values; normalize here so a sloppy setting is not later misreported
  // as a "network error".
  const requestedTimeout = Math.floor(Number(timeoutMs));
  const effectiveTimeoutMs =
    Number.isFinite(requestedTimeout) && requestedTimeout >= 1
      ? Math.min(requestedTimeout, 0xffffffff)
      : DEFAULT_TIMEOUT_MS;

  return {
    hasTts: configured,
    kind: "elevenlabs",

    /**
     * Synthesize `text` to MP3 with a single ElevenLabs request.
     *
     * @param {object} [request]
     * @param {string} [request.text] - Text to speak; runs of whitespace are
     *   collapsed to single spaces before sending.
     * @param {string} [request.voice] - Optional voice id that overrides the
     *   operator default when it is a non-blank string.
     * @returns {Promise<{ audio: Buffer, contentType: string,
     *   durationSeconds: null, sentenceCount: null, attempts: number,
     *   voice: string, model: string }>}
     * @throws {Error} With `status` 503 (missing key or voice), 400 (missing
     *   or non-string text), 502 (transport failure or empty audio), or the
     *   upstream HTTP status for a non-2xx response.
     */
    async synthesize({ text, voice } = {}) {
      if (!apiKey) {
        throw elevenLabsError(
          "ElevenLabs TTS is not configured — set relay_elevenlabs_api_key",
          503
        );
      }

      // `voice` from the client overrides the operator default voice id,
      // but only when it is a real (non-blank) string: a whitespace-only
      // or non-string value must not shadow the configured default.
      const chosenVoice = firstNonBlank(voice, voiceId);
      if (!chosenVoice) {
        throw elevenLabsError(
          "ElevenLabs TTS has no voice — set relay_elevenlabs_voice_id",
          503
        );
      }

      if (text != null && typeof text !== "string") {
        throw elevenLabsError("TTS input text must be a string", 400);
      }
      const cleaned = (text ?? "").replace(/\s+/g, " ").trim();
      if (!cleaned) {
        throw elevenLabsError("TTS input text is empty", 400);
      }

      const url = `${API_BASE}/text-to-speech/${encodeURIComponent(chosenVoice)}`;
      let res;
      try {
        // Public-internet call — use the global fetch with full cert
        // validation (NOT lanFetch, which is scoped to LAN/Spark Control).
        res = await fetch(url, {
          method: "POST",
          headers: {
            "xi-api-key": apiKey,
            "Content-Type": "application/json",
            Accept: "audio/mpeg",
          },
          body: JSON.stringify({
            text: cleaned,
            model_id: modelId,
          }),
          signal: AbortSignal.timeout(effectiveTimeoutMs),
        });
      } catch (err) {
        throw elevenLabsError(
          `ElevenLabs TTS network error: ${err?.message || err}`,
          502,
          err
        );
      }

      if (!res.ok) {
        let body = "";
        try {
          body = await res.text();
        } catch {
          // Best effort: the upstream status alone is still worth reporting
          // when the error body cannot be read.
        }
        throw elevenLabsError(
          `ElevenLabs TTS ${res.status}: ${body.slice(0, 300)}`,
          res.status
        );
      }

      // The timeout signal stays armed while the body streams, so the
      // download can still fail; report that like any other transport error
      // instead of leaking a status-less exception.
      let audio;
      try {
        audio = Buffer.from(await res.arrayBuffer());
      } catch (err) {
        throw elevenLabsError(
          `ElevenLabs TTS network error: ${err?.message || err}`,
          502,
          err
        );
      }
      if (audio.length === 0) {
        throw elevenLabsError("ElevenLabs TTS returned an empty audio body", 502);
      }

      return {
        audio,
        contentType: "audio/mpeg",
        durationSeconds: null,
        sentenceCount: null,
        attempts: 1,
        voice: chosenVoice,
        model: modelId,
      };
    },
  };
}