v0.2.11 /relay/capabilities + /relay/transcribe-url (yt-dlp in container)

2026-05-12 01:33:34 -05:00
parent 0a0fa009ec
commit b7f75904bb
7 changed files with 521 additions and 7 deletions
@@ -0,0 +1,415 @@
+// POST /relay/transcribe-url — like /relay/transcribe but the relay
+// fetches the audio itself instead of accepting it in the request
+// body. Saves the buyer's upstream-bandwidth bottleneck: a 100-MB
+// podcast that takes 60s to upload from a home connection takes <5s
+// for the operator's relay (typically on a Start9 server with much
+// fatter pipe) to download from the original source.
+//
+// Request body (application/json):
+//   {
+//     media_url:   string,       // YouTube URL OR direct audio (.mp3 / .m4a / etc.)
+//     type?:       "youtube" | "podcast",  // hint; we sniff URL shape if absent
+//     mime_type?:  string,       // hint for the transcribe backend; we sniff from
+//                                //   the downloaded file if absent
+//     title?:      string,       // metadata for Gemini's transcription prompt
+//     channel?:    string,
+//     description?: string,
+//     chapters?:   any[]
+//   }
+//
+// Same auth as /relay/transcribe (X-Recap-Install-Id required,
+// X-Recap-Job-Id optional, Authorization optional Bearer license).
+// Same standard envelope on response. Same job-id dedup + credit
+// accounting + audit log. Adds `download_ms` to the audit row so
+// dashboard can show how long the relay's own download took
+// separately from the backend's inference time.
+
+import express from "express";
+import fs from "fs/promises";
+import { createWriteStream } from "fs";
+import os from "os";
+import path from "path";
+import { execFile } from "child_process";
+import { promisify } from "util";
+import { Readable } from "stream";
+import { pipeline } from "stream/promises";
+import { resolveLicense } from "../keysat-client.js";
+import { getOrCreateRow, planBackend, commitCredit } from "../credits.js";
+import { lookupJob, markJobCharged, refundJob } from "../job-credits.js";
+import { getConfigSnapshot, getTierQuotas } from "../config.js";
+import { createGeminiBackend } from "../backends/gemini.js";
+import { createHardwareBackend } from "../backends/hardware.js";
+import { envelope, errorEnvelope } from "./envelope.js";
+import { recordCall } from "../audit-log.js";
+import { calcGeminiCost } from "../pricing.js";
+
+const execFileAsync = promisify(execFile);
+
+// Max file size the relay is willing to download. Generous enough for
+// 4-hour podcasts at ~256 kbps but caps DOS exposure.
+const MAX_DOWNLOAD_BYTES = 500 * 1024 * 1024;
+
+// Per-request safety timeout on the download leg alone (separate from
+// the transcribe call's own timeout). yt-dlp can be slow when YouTube
+// rate-limits; a hard ceiling avoids holding the request open forever.
+const DOWNLOAD_TIMEOUT_MS = 10 * 60 * 1000;
+
+function looksLikeYouTube(url) {
+  if (!url) return false;
+  return /(?:^|\.)(youtube\.com|youtu\.be)\b/i.test(url);
+}
+
+function guessMimeFromExt(filePath) {
+  const ext = path.extname(filePath).toLowerCase().replace(/^\./, "");
+  return (
+    {
+      mp3: "audio/mpeg",
+      m4a: "audio/mp4",
+      mp4: "audio/mp4",
+      aac: "audio/aac",
+      ogg: "audio/ogg",
+      opus: "audio/opus",
+      wav: "audio/wav",
+      webm: "audio/webm",
+      flac: "audio/flac",
+    }[ext] || "audio/mpeg"
+  );
+}
+
+// Download an HTTP(S) audio URL to a temp file. Stops if the file
+// would exceed MAX_DOWNLOAD_BYTES. Returns { filePath, bytes,
+// mimeType }.
+async function downloadDirect(url, tmpDir) {
+  const res = await fetch(url, {
+    redirect: "follow",
+    signal: AbortSignal.timeout(DOWNLOAD_TIMEOUT_MS),
+  });
+  if (!res.ok) {
+    throw new Error(`Download ${url} returned HTTP ${res.status}`);
+  }
+  const contentType = res.headers.get("content-type") || "";
+  const isAudioy =
+    contentType.startsWith("audio/") ||
+    contentType === "application/octet-stream" ||
+    contentType.includes("mpeg") ||
+    contentType.includes("mp4");
+  if (!isAudioy) {
+    // Don't enforce strictly — some podcast CDNs serve audio with
+    // generic content-types. Log + continue; the transcription backend
+    // will reject if it's truly not audio.
+    console.warn(
+      `[transcribe-url] non-audio content-type "${contentType}" for ${url} — proceeding anyway`
+    );
+  }
+  const ext =
+    contentType.includes("mp4") ? "m4a" :
+    contentType.includes("ogg") ? "ogg" :
+    contentType.includes("opus") ? "opus" :
+    "mp3";
+  const filePath = path.join(tmpDir, `audio.${ext}`);
+
+  // Stream the response body into the temp file with a running byte
+  // count so we can abort if it gets too large mid-download.
+  if (!res.body) throw new Error("response has no body");
+  let bytes = 0;
+  const out = createWriteStream(filePath);
+  const counted = new ReadableStream({
+    async start(controller) {
+      const reader = res.body.getReader();
+      try {
+        while (true) {
+          const { done, value } = await reader.read();
+          if (done) break;
+          bytes += value.byteLength;
+          if (bytes > MAX_DOWNLOAD_BYTES) {
+            controller.error(
+              new Error(
+                `Download exceeded ${MAX_DOWNLOAD_BYTES} bytes — refusing to continue`
+              )
+            );
+            return;
+          }
+          controller.enqueue(value);
+        }
+        controller.close();
+      } catch (err) {
+        controller.error(err);
+      }
+    },
+  });
+  await pipeline(Readable.fromWeb(counted), out);
+  return { filePath, bytes, mimeType: contentType || guessMimeFromExt(filePath) };
+}
+
+// Download a YouTube URL via yt-dlp. Picks the audio-only m4a/mp3.
+// Logs the chosen path back as the file. Caller manages tmpDir.
+async function downloadYouTube(url, tmpDir) {
+  const outTemplate = path.join(tmpDir, "audio.%(ext)s");
+  const args = [
+    "-x", // extract audio
+    "--audio-format",
+    "mp3",
+    "--audio-quality",
+    "5",
+    "-o",
+    outTemplate,
+    "--no-playlist",
+    "--no-simulate",
+    "--no-warnings",
+    url,
+  ];
+  try {
+    await execFileAsync("yt-dlp", args, {
+      timeout: DOWNLOAD_TIMEOUT_MS,
+      maxBuffer: 10 * 1024 * 1024,
+    });
+  } catch (err) {
+    const stderr = (err?.stderr || "").toString();
+    const stdout = (err?.stdout || "").toString();
+    throw new Error(
+      `yt-dlp failed: ${stderr.trim() || stdout.trim() || err?.message}`
+    );
+  }
+  // Find the produced file — yt-dlp's audio-format=mp3 means it ends
+  // up at audio.mp3, but be defensive in case it landed at a
+  // different extension.
+  const files = await fs.readdir(tmpDir);
+  const audioFile = files.find((f) => /^audio\.(mp3|m4a|opus|webm|aac|ogg)$/i.test(f));
+  if (!audioFile) {
+    throw new Error(`yt-dlp ran but no audio file found in ${tmpDir}`);
+  }
+  const filePath = path.join(tmpDir, audioFile);
+  const stat = await fs.stat(filePath);
+  if (stat.size > MAX_DOWNLOAD_BYTES) {
+    throw new Error(
+      `YouTube download exceeded ${MAX_DOWNLOAD_BYTES} bytes — refusing to continue`
+    );
+  }
+  return {
+    filePath,
+    bytes: stat.size,
+    mimeType: guessMimeFromExt(filePath),
+  };
+}
+
+export function transcribeUrlRouter() {
+  const router = express.Router();
+
+  router.post("/transcribe-url", express.json({ limit: "1mb" }), async (req, res) => {
+    const t0 = Date.now();
+    const installId = req.header("X-Recap-Install-Id");
+    const jobId = req.header("X-Recap-Job-Id") || null;
+    const auth = req.header("Authorization");
+
+    if (!installId) {
+      const e = await errorEnvelope({
+        error: "missing X-Recap-Install-Id header",
+        statusHint: 400,
+      });
+      return res.status(400).json(e.body);
+    }
+    const {
+      media_url: mediaUrl,
+      type,
+      mime_type: bodyMime,
+      title,
+      channel,
+      description,
+      chapters,
+    } = req.body || {};
+    if (!mediaUrl || typeof mediaUrl !== "string") {
+      const e = await errorEnvelope({
+        error: "missing or non-string body.media_url",
+        installId,
+        statusHint: 400,
+      });
+      return res.status(400).json(e.body);
+    }
+
+    const license = await resolveLicense(auth);
+    const tier = license.tier;
+    const row = await getOrCreateRow(installId);
+    row.tier_snapshot = tier;
+
+    // Quota check + backend choice. Same as /relay/transcribe.
+    let reusedJob = false;
+    let chosenBackend = null;
+    const existingJob = lookupJob(installId, jobId);
+    if (existingJob) {
+      reusedJob = true;
+      chosenBackend = existingJob.backend;
+    } else {
+      const cfg = await getConfigSnapshot();
+      const hasHardware = !!cfg.relay_parakeet_base_url;
+      const quota = await getTierQuotas();
+      const preference =
+        cfg.relay_transcribe_backend_preference || "gemini_first";
+      const plan = planBackend(row, quota, { hasHardware, preference });
+      if (!plan.allowed) {
+        await recordCall({
+          install_id: installId,
+          tier,
+          pipeline: "transcribe",
+          backend: null,
+          model: null,
+          status: "refused",
+          credit_charged: 0,
+          duration_ms: Date.now() - t0,
+          cost_usd: 0,
+          job_id: jobId,
+          error: plan.reason,
+        });
+        const e = await errorEnvelope({
+          error: plan.reason,
+          installId,
+          tier,
+          statusHint: 402,
+        });
+        return res.status(402).json(e.body);
+      }
+      chosenBackend = plan.backend;
+    }
+
+    // ── Download phase ─────────────────────────────────────────────
+    const tmpDir = await fs.mkdtemp(path.join(os.tmpdir(), "relay-dl-"));
+    const isYT = type === "youtube" || (!type && looksLikeYouTube(mediaUrl));
+    const dlStart = Date.now();
+    let audio;
+    let downloadMs = 0;
+    try {
+      audio = isYT
+        ? await downloadYouTube(mediaUrl, tmpDir)
+        : await downloadDirect(mediaUrl, tmpDir);
+      downloadMs = Date.now() - dlStart;
+      console.log(
+        `[transcribe-url] downloaded ${audio.bytes} bytes from ${isYT ? "youtube" : "direct"} in ${downloadMs}ms (${mediaUrl.slice(0, 80)})`
+      );
+    } catch (err) {
+      try { await fs.rm(tmpDir, { recursive: true, force: true }); } catch {}
+      console.error(`[transcribe-url] download failed: ${err?.message || err}`);
+      await recordCall({
+        install_id: installId,
+        tier,
+        pipeline: "transcribe",
+        backend: chosenBackend,
+        model: null,
+        status: "error",
+        credit_charged: 0,
+        duration_ms: Date.now() - t0,
+        download_ms: Date.now() - dlStart,
+        cost_usd: 0,
+        job_id: jobId,
+        error: ("download_failed: " + (err?.message || String(err))).slice(0, 200),
+      });
+      const e = await errorEnvelope({
+        error: "download_failed: " + (err?.message || String(err)).slice(0, 200),
+        installId,
+        tier,
+        statusHint: 502,
+      });
+      return res.status(502).json(e.body);
+    }
+
+    // ── Transcription phase ────────────────────────────────────────
+    const cfg = await getConfigSnapshot();
+    let result;
+    try {
+      const audioBuf = await fs.readFile(audio.filePath);
+      const mimeType = bodyMime || audio.mimeType;
+      if (chosenBackend === "gemini") {
+        const backend = createGeminiBackend({
+          apiKey: cfg.relay_gemini_api_key,
+          transcriptionModel: cfg.relay_gemini_transcription_model,
+          analysisModel: cfg.relay_gemini_analysis_model,
+        });
+        result = await backend.transcribeAudio({
+          audio: audioBuf,
+          mimeType,
+          title: title || "",
+          channel: channel || "",
+          description: description || "",
+          chapters: Array.isArray(chapters) ? chapters : [],
+          offsetSeconds: 0,
+        });
+      } else {
+        const backend = createHardwareBackend({
+          parakeetBaseURL: cfg.relay_parakeet_base_url,
+          gemmaBaseURL: cfg.relay_gemma_base_url,
+          parakeetModel: cfg.relay_parakeet_model,
+          gemmaModel: cfg.relay_gemma_model,
+        });
+        result = await backend.transcribeAudio({
+          audio: audioBuf,
+          mimeType,
+          offsetSeconds: 0,
+        });
+      }
+    } catch (err) {
+      try { await fs.rm(tmpDir, { recursive: true, force: true }); } catch {}
+      if (reusedJob) refundJob(installId, jobId);
+      console.error(`[transcribe-url] transcribe failed: ${err?.message}`);
+      await recordCall({
+        install_id: installId,
+        tier,
+        pipeline: "transcribe",
+        backend: chosenBackend,
+        model:
+          chosenBackend === "gemini"
+            ? cfg.relay_gemini_transcription_model
+            : cfg.relay_parakeet_model,
+        status: "error",
+        credit_charged: 0,
+        duration_ms: Date.now() - t0,
+        download_ms: downloadMs,
+        cost_usd: 0,
+        job_id: jobId,
+        error: (err?.message || String(err)).slice(0, 200),
+      });
+      const e = await errorEnvelope({
+        error: err?.message || "backend_error",
+        installId,
+        tier,
+        statusHint: err?.status || 502,
+      });
+      return res.status(e.statusHint).json(e.body);
+    } finally {
+      try { await fs.rm(tmpDir, { recursive: true, force: true }); } catch {}
+    }
+
+    // ── Commit + audit ─────────────────────────────────────────────
+    let creditCharged = 0;
+    if (!reusedJob) {
+      await commitCredit(installId, { backend: chosenBackend, tier });
+      markJobCharged(installId, jobId, { backend: chosenBackend, tier });
+      creditCharged = 1;
+    }
+    const costDetails =
+      chosenBackend === "gemini" && result.usage
+        ? calcGeminiCost(result.model, result.usage)
+        : {
+            input_tokens: 0,
+            output_tokens: 0,
+            thinking_tokens: 0,
+            cost_usd: 0,
+          };
+    await recordCall({
+      install_id: installId,
+      tier,
+      pipeline: "transcribe",
+      backend: chosenBackend,
+      model: result?.model || null,
+      status: "success",
+      credit_charged: creditCharged,
+      duration_ms: Date.now() - t0,
+      download_ms: downloadMs,
+      audio_bytes: audio.bytes,
+      job_id: jobId,
+      ...costDetails,
+    });
+
+    const body = await envelope({ result, installId, tier, creditCharged });
+    res.json(body);
+  });
+
+  return router;
+}