v0.2.11 /relay/capabilities + /relay/transcribe-url (yt-dlp in container)

2026-05-12 01:33:34 -05:00
parent 0a0fa009ec
commit b7f75904bb
7 changed files with 521 additions and 7 deletions
@@ -1,9 +1,10 @@
 # ─────────────────────────────────────────────────────────
 #  Recap Relay — StartOS 0.4 Docker image
 #
-#  Includes: Node.js 20 only. No yt-dlp / ffmpeg / Python — the relay
+#  Includes: Node.js 20 + yt-dlp + ffmpeg + Python. yt-dlp + ffmpeg are
-#  receives audio buffers from Recap clients and forwards to Gemini's
+#  needed by the /relay/transcribe-url endpoint, which downloads
-#  File API; no local audio processing.
+#  YouTube / direct-audio URLs server-side so Recap clients don't have
 #  to ship the audio over their own (often slow) upstream.
 #
 #  Uses Debian slim for the same reason Recap does — C deps pulled in
 #  by npm packages prefer glibc over musl.
@@ -32,10 +33,20 @@ WORKDIR /app
 # Runtime deps:
 #   - dumb-init: PID 1 signal handling
 #   - ca-certificates: HTTPS for Gemini + Keysat
 #   - python3 + pip: yt-dlp installation
 #   - ffmpeg: audio extraction (yt-dlp invokes it for -x)
 #   - curl: yt-dlp self-update fallback
 RUN apt-get update && apt-get install -y --no-install-recommends \
    dumb-init \
    ca-certificates \
-  && rm -rf /var/lib/apt/lists/*
+    curl \
    python3 \
    python3-pip \
    python3-venv \
    ffmpeg \
  && rm -rf /var/lib/apt/lists/* \
  && pip3 install --break-system-packages yt-dlp \
  && yt-dlp --version
 # Copy installed deps + app code from builder
 COPY --from=builder /app/vendor ./vendor/
@@ -20,10 +20,12 @@ import {
  setupAdminAuthRoutes,
 } from "./admin-auth.js";
 import { transcribeRouter } from "./routes/transcribe.js";
 import { transcribeUrlRouter } from "./routes/transcribe-url.js";
 import { analyzeRouter } from "./routes/analyze.js";
 import { healthRouter } from "./routes/health.js";
 import { balanceRouter } from "./routes/balance.js";
 import { policyRouter } from "./routes/policy.js";
 import { capabilitiesRouter } from "./routes/capabilities.js";
 import { adminRouter } from "./routes/admin.js";
 const __dirname = path.dirname(fileURLToPath(import.meta.url));
@@ -51,8 +53,10 @@ setupAdminAuthRoutes(app);
 // Authorization optional).
 app.use("/relay", healthRouter());
 app.use("/relay", policyRouter());
 app.use("/relay", capabilitiesRouter());
 app.use("/relay", balanceRouter());
 app.use("/relay", transcribeRouter());
 app.use("/relay", transcribeUrlRouter());
 app.use("/relay", analyzeRouter());
 // Admin dashboard endpoints (cookie-gated).
@@ -1,6 +1,6 @@
 {
  "name": "recap-relay-server",
-  "version": "0.2.10",
+  "version": "0.2.11",
  "type": "module",
  "private": true,
  "dependencies": {
@@ -0,0 +1,70 @@
 // GET /relay/capabilities — operator-aware metadata for Recap clients
 // to plan their audio handling. Returns the upper bounds the relay's
 // CURRENT routing config can comfortably accept, so Recap can decide
 // whether to chunk a long video before sending it.
 //
 // Today's logic:
 //   - When the operator's transcribe_backend_preference routes through
 //     Gemini at all (gemini_first / gemini_only), we report Gemini-safe
 //     limits (60 min / 30 MB / 2700 s chunks). Even with hardware as
 //     overflow, the FIRST attempt is Gemini, which needs the chunk
 //     budget.
 //   - When the operator's preference is hardware-only (or hardware-
 //     first with overflow to Gemini disabled in spirit), we report
 //     "unbounded" — the operator's Parakeet wrapper can typically
 //     ingest 2+ hour podcasts in a single shot, so chunking just adds
 //     extra inference passes and timestamp-stitching overhead.
 //
 // Recap reads this once on boot + on policy refresh; when its
 // transcriptionProvider is "relay", it honors these limits instead of
 // its own hardcoded thresholds. For non-relay providers, Recap's
 // internal per-provider thresholds apply.
 import express from "express";
 import { getConfigSnapshot } from "../config.js";
 /**
  * Build the router that serves GET /capabilities.
  *
  * The handler reads the current config snapshot and answers with one of
  * two limit shapes: a "hardware" shape (large, finite ceilings and no
  * preferred chunk size) or a Gemini-safe shape (30 MB / 60 min / 45-min
  * chunks). Every response also carries a human-readable `reason`.
  *
  * @returns {import("express").Router} router to mount under /relay
  */
 export function capabilitiesRouter() {
  const router = express.Router();

  router.get("/capabilities", async (_req, res) => {
    const snapshot = await getConfigSnapshot();
    const preference =
      snapshot.relay_transcribe_backend_preference || "gemini_first";
    const parakeetConfigured = Boolean(snapshot.relay_parakeet_base_url);
    const prefersHardware = ["hardware_only", "hardware_first"].includes(
      preference
    );

    // Stay conservative unless the operator both asked for hardware
    // (only or first) AND wired up a Parakeet endpoint. Without that
    // endpoint Gemini is the only path, so advertising bigger inputs
    // would mislead the client.
    if (!(parakeetConfigured && prefersHardware)) {
      // Gemini File-API + practical reliability limits; these mirror
      // Recap's pre-relay defaults so its chunking behaves as before.
      res.json({
        max_audio_mb: 30,
        max_audio_minutes: 60,
        preferred_chunk_seconds: 2700, // 45 min chunks
        reason: `Gemini-backed preference (${preference})`,
      });
      return;
    }

    // Effectively unbounded, but with finite ceilings so an extreme
    // (e.g. 24-hour) file can't silently exhaust the operator's GPU box.
    res.json({
      max_audio_mb: 500,
      max_audio_minutes: 240,
      preferred_chunk_seconds: null,
      // Diagnostic only: tells a dashboard or a curious operator which
      // limit shape was returned and why.
      reason: `hardware-capable backend preference (${preference})`,
    });
  });

  return router;
 }
@@ -0,0 +1,415 @@
 // POST /relay/transcribe-url — like /relay/transcribe but the relay
 // fetches the audio itself instead of accepting it in the request
 // body. Saves the buyer's upstream-bandwidth bottleneck: a 100-MB
 // podcast that takes 60s to upload from a home connection takes <5s
 // for the operator's relay (typically on a Start9 server with much
 // fatter pipe) to download from the original source.
 //
 // Request body (application/json):
 //   {
 //     media_url:   string,       // YouTube URL OR direct audio (.mp3 / .m4a / etc.)
 //     type?:       "youtube" | "podcast",  // hint; we sniff URL shape if absent
 //     mime_type?:  string,       // hint for the transcribe backend; we sniff from
 //                                //   the downloaded file if absent
 //     title?:      string,       // metadata for Gemini's transcription prompt
 //     channel?:    string,
 //     description?: string,
 //     chapters?:   any[]
 //   }
 //
 // Same auth as /relay/transcribe (X-Recap-Install-Id required,
 // X-Recap-Job-Id optional, Authorization optional Bearer license).
 // Same standard envelope on response. Same job-id dedup + credit
 // accounting + audit log. Adds `download_ms` to the audit row so
 // dashboard can show how long the relay's own download took
 // separately from the backend's inference time.
 import express from "express";
 import fs from "fs/promises";
 import { createWriteStream } from "fs";
 import os from "os";
 import path from "path";
 import { execFile } from "child_process";
 import { promisify } from "util";
 import { Readable } from "stream";
 import { pipeline } from "stream/promises";
 import { resolveLicense } from "../keysat-client.js";
 import { getOrCreateRow, planBackend, commitCredit } from "../credits.js";
 import { lookupJob, markJobCharged, refundJob } from "../job-credits.js";
 import { getConfigSnapshot, getTierQuotas } from "../config.js";
 import { createGeminiBackend } from "../backends/gemini.js";
 import { createHardwareBackend } from "../backends/hardware.js";
 import { envelope, errorEnvelope } from "./envelope.js";
 import { recordCall } from "../audit-log.js";
 import { calcGeminiCost } from "../pricing.js";
 const execFileAsync = promisify(execFile);
 // Max file size the relay is willing to download. Generous enough for
 // 4-hour podcasts at ~256 kbps but caps DOS exposure.
 const MAX_DOWNLOAD_BYTES = 500 * 1024 * 1024;
 // Per-request safety timeout on the download leg alone (separate from
 // the transcribe call's own timeout). yt-dlp can be slow when YouTube
 // rate-limits; a hard ceiling avoids holding the request open forever.
 const DOWNLOAD_TIMEOUT_MS = 10 * 60 * 1000;
 /**
  * Decide whether a URL points at YouTube by inspecting its hostname.
  *
  * Matches `youtube.com`, `youtu.be` and any of their subdomains
  * (`www.`, `m.`, `music.`, …). Scheme-less input such as
  * `youtube.com/watch?v=…` is accepted by assuming https. The decision is
  * made on the parsed host only, so a YouTube domain that merely appears
  * in another site's path or query string does not count.
  *
  * @param {string} url - candidate media URL (untrusted client input)
  * @returns {boolean} true when the host is a YouTube host
  */
 function looksLikeYouTube(url) {
  if (!url || typeof url !== "string") return false;

  const trimmed = url.trim();
  // Prefix a scheme when none is present so `new URL` can parse
  // bare "youtube.com/…" style input.
  const hasScheme = /^[a-z][a-z0-9+.-]*:\/\//i.test(trimmed);
  const candidate = hasScheme ? trimmed : `https://${trimmed}`;

  let host;
  try {
    // Drop a trailing root dot ("youtube.com.") before comparing.
    host = new URL(candidate).hostname.toLowerCase().replace(/\.$/, "");
  } catch {
    // Unparseable input is simply "not YouTube"; the caller decides
    // what to do with it next.
    return false;
  }

  return ["youtube.com", "youtu.be"].some(
    (domain) => host === domain || host.endsWith(`.${domain}`)
  );
 }
 /**
  * Map a file path's extension to an audio MIME type.
  *
  * The extension is compared case-insensitively. Unknown or missing
  * extensions fall back to "audio/mpeg".
  *
  * @param {string} filePath - path or file name whose extension is inspected
  * @returns {string} MIME type string
  */
 function guessMimeFromExt(filePath) {
  const mimeByExt = {
    mp3: "audio/mpeg",
    m4a: "audio/mp4",
    mp4: "audio/mp4",
    aac: "audio/aac",
    ogg: "audio/ogg",
    opus: "audio/opus",
    wav: "audio/wav",
    webm: "audio/webm",
    flac: "audio/flac",
  };
  const ext = path.extname(filePath).toLowerCase().replace(/^\./, "");
  // Own-property check: a plain `table[ext] || fallback` lookup would hit
  // inherited keys such as "constructor" or "__proto__" and hand back a
  // function/object instead of a MIME string.
  return Object.hasOwn(mimeByExt, ext) ? mimeByExt[ext] : "audio/mpeg";
 }
 /**
  * Download an HTTP(S) audio URL into `tmpDir`.
  *
  * The body is streamed to disk with a running byte count; the download
  * is aborted as soon as it would exceed MAX_DOWNLOAD_BYTES (and refused
  * up front when the server declares a larger Content-Length). The whole
  * request, including the body transfer, is bounded by
  * DOWNLOAD_TIMEOUT_MS. The caller owns `tmpDir` and is responsible for
  * removing it, including any partial file left behind by a failure.
  *
  * @param {string} url - absolute http(s) URL (untrusted client input)
  * @param {string} tmpDir - existing directory to write the file into
  * @returns {Promise<{filePath: string, bytes: number, mimeType: string}>}
  * @throws {Error} on an invalid/non-http(s) URL, a non-2xx response, a
  *   missing body, an oversized download, a timeout, or an I/O failure
  */
 async function downloadDirect(url, tmpDir) {
  let parsed;
  try {
    parsed = new URL(url);
  } catch (err) {
    throw new Error(`Invalid download URL: ${url}`, { cause: err });
  }
  // fetch() also understands data:/blob: URLs; only real web downloads
  // make sense for this endpoint.
  if (parsed.protocol !== "http:" && parsed.protocol !== "https:") {
    throw new Error(
      `Unsupported URL scheme "${parsed.protocol}" — only http(s) is allowed`
    );
  }
  // NOTE(review): this still lets a client point the relay at hosts on
  // the operator's private network (SSRF). Blocking that needs a
  // resolved-address check, which is out of scope for this function.

  const res = await fetch(url, {
    redirect: "follow",
    signal: AbortSignal.timeout(DOWNLOAD_TIMEOUT_MS),
  });
  if (!res.ok) {
    // Best-effort release of the connection; the HTTP status is the
    // error worth reporting, not a failure to cancel.
    await res.body?.cancel().catch(() => {});
    throw new Error(`Download ${url} returned HTTP ${res.status}`);
  }

  // Refuse early when the server already tells us the file is too big.
  // A missing or malformed header yields 0/NaN and falls through to the
  // streaming check below.
  const declaredLength = Number(res.headers.get("content-length"));
  if (declaredLength > MAX_DOWNLOAD_BYTES) {
    await res.body?.cancel().catch(() => {});
    throw new Error(
      `Download exceeded ${MAX_DOWNLOAD_BYTES} bytes — refusing to continue`
    );
  }

  const contentType = res.headers.get("content-type") || "";
  const loweredType = contentType.toLowerCase();
  // Media type without parameters, e.g. "audio/ogg; codecs=opus" -> "audio/ogg".
  const essence = loweredType.split(";")[0].trim();
  const isAudioy =
    essence.startsWith("audio/") ||
    essence === "application/octet-stream" ||
    loweredType.includes("mpeg") ||
    loweredType.includes("mp4");
  if (!isAudioy) {
    // Don't enforce strictly — some podcast CDNs serve audio with
    // generic content-types. Log + continue; the transcription backend
    // will reject if it's truly not audio.
    console.warn(
      `[transcribe-url] non-audio content-type "${contentType}" for ${url} — proceeding anyway`
    );
  }

  // Pick the file extension from the content-type when it is specific;
  // otherwise fall back to the (post-redirect) URL's own extension so a
  // .wav/.flac served with a generic type isn't mislabelled as mp3.
  const finalUrl = res.url ? new URL(res.url) : parsed;
  const urlExt = path
    .extname(finalUrl.pathname)
    .toLowerCase()
    .replace(/^\./, "");
  const knownExts = ["mp3", "m4a", "mp4", "aac", "ogg", "opus", "wav", "webm", "flac"];
  const ext =
    loweredType.includes("mp4") ? "m4a" :
    loweredType.includes("ogg") ? "ogg" :
    loweredType.includes("opus") ? "opus" :
    loweredType.includes("mpeg") ? "mp3" :
    knownExts.includes(urlExt) ? urlExt :
    "mp3";
  const filePath = path.join(tmpDir, `audio.${ext}`);

  if (!res.body) throw new Error("response has no body");

  // Stream to disk through a counting stage. pipeline() propagates
  // backpressure from the file write back to the socket (so a slow disk
  // doesn't buffer the whole body in memory) and destroys both ends —
  // cancelling the HTTP body — when the size cap throws.
  let bytes = 0;
  await pipeline(
    Readable.fromWeb(res.body),
    async function* (source) {
      for await (const chunk of source) {
        bytes += chunk.byteLength;
        if (bytes > MAX_DOWNLOAD_BYTES) {
          throw new Error(
            `Download exceeded ${MAX_DOWNLOAD_BYTES} bytes — refusing to continue`
          );
        }
        yield chunk;
      }
    },
    createWriteStream(filePath)
  );

  // Only trust the server's content-type when it is a media type; a
  // generic or missing type is replaced by a guess from the file name.
  const mimeType =
    essence.startsWith("audio/") || essence.startsWith("video/")
      ? essence
      : guessMimeFromExt(filePath);
  return { filePath, bytes, mimeType };
 }
 /**
  * Download a YouTube URL's audio track via yt-dlp into `tmpDir`.
  *
  * yt-dlp is asked to extract audio and convert it to mp3, writing to
  * `audio.<ext>` inside `tmpDir`. The produced file is then located and
  * size-checked against MAX_DOWNLOAD_BYTES. The caller owns `tmpDir` and
  * is responsible for removing it.
  *
  * @param {string} url - media URL handed to yt-dlp (untrusted client input)
  * @param {string} tmpDir - existing directory to download into
  * @returns {Promise<{filePath: string, bytes: number, mimeType: string}>}
  * @throws {Error} when yt-dlp fails or times out, produces no audio
  *   file, or the resulting file exceeds MAX_DOWNLOAD_BYTES
  */
 async function downloadYouTube(url, tmpDir) {
  const outTemplate = path.join(tmpDir, "audio.%(ext)s");
  const args = [
    "-x", // extract audio
    "--audio-format",
    "mp3",
    "--audio-quality",
    "5",
    "-o",
    outTemplate,
    "--no-playlist",
    "--no-simulate",
    "--no-warnings",
    // Let yt-dlp refuse oversized sources before downloading when the
    // size is known up front; the stat() check below still guards the
    // converted file.
    "--max-filesize",
    String(MAX_DOWNLOAD_BYTES),
    // End of options: the URL is client-supplied, and without this a
    // value beginning with "-" would be parsed as a yt-dlp flag.
    "--",
    url,
  ];
  try {
    await execFileAsync("yt-dlp", args, {
      timeout: DOWNLOAD_TIMEOUT_MS,
      maxBuffer: 10 * 1024 * 1024,
    });
  } catch (err) {
    const stderr = (err?.stderr || "").toString();
    const stdout = (err?.stdout || "").toString();
    throw new Error(
      `yt-dlp failed: ${stderr.trim() || stdout.trim() || err?.message}`,
      { cause: err }
    );
  }
  // Find the produced file — yt-dlp's audio-format=mp3 means it ends
  // up at audio.mp3, but be defensive in case it landed at a
  // different extension.
  const files = await fs.readdir(tmpDir);
  const audioFile = files.find((f) => /^audio\.(mp3|m4a|opus|webm|aac|ogg)$/i.test(f));
  if (!audioFile) {
    throw new Error(`yt-dlp ran but no audio file found in ${tmpDir}`);
  }
  const filePath = path.join(tmpDir, audioFile);
  const stat = await fs.stat(filePath);
  if (stat.size > MAX_DOWNLOAD_BYTES) {
    throw new Error(
      `YouTube download exceeded ${MAX_DOWNLOAD_BYTES} bytes — refusing to continue`
    );
  }
  return {
    filePath,
    bytes: stat.size,
    mimeType: guessMimeFromExt(filePath),
  };
 }
 /**
  * Build the router that serves POST /transcribe-url.
  *
  * Request flow:
  *   1. Validate the X-Recap-Install-Id header and body.media_url.
  *   2. Resolve the license tier, then either reuse the backend recorded
  *      for an already-known job id or ask planBackend() for one
  *      (answering 402 when the plan is refused).
  *   3. Download the media into a fresh temp dir (yt-dlp for YouTube,
  *      plain HTTP otherwise); a failure answers 502.
  *   4. Transcribe with the chosen backend; the temp dir is always
  *      removed afterwards.
  *   5. Commit a credit for new jobs, write the audit row, and reply
  *      with the standard envelope.
  *
  * Every terminal path except the two 400 validation errors writes an
  * audit row via recordCall().
  *
  * @returns {import("express").Router} router to mount under /relay
  */
 export function transcribeUrlRouter() {
  const router = express.Router();
  router.post("/transcribe-url", express.json({ limit: "1mb" }), async (req, res) => {
    const t0 = Date.now();
    const installId = req.header("X-Recap-Install-Id");
    const jobId = req.header("X-Recap-Job-Id") || null;
    const auth = req.header("Authorization");
    if (!installId) {
      const e = await errorEnvelope({
        error: "missing X-Recap-Install-Id header",
        statusHint: 400,
      });
      return res.status(400).json(e.body);
    }
    const {
      media_url: mediaUrl,
      type,
      mime_type: bodyMime,
      title,
      channel,
      description,
      chapters,
    } = req.body || {};
    if (!mediaUrl || typeof mediaUrl !== "string") {
      const e = await errorEnvelope({
        error: "missing or non-string body.media_url",
        installId,
        statusHint: 400,
      });
      return res.status(400).json(e.body);
    }
    const license = await resolveLicense(auth);
    const tier = license.tier;
    const row = await getOrCreateRow(installId);
    row.tier_snapshot = tier;
    // Quota check + backend choice. Same as /relay/transcribe.
    let reusedJob = false;
    let chosenBackend = null;
    const existingJob = lookupJob(installId, jobId);
    if (existingJob) {
      // Known job id: stick with the backend recorded for it and skip
      // the quota plan (no new credit is committed further down).
      reusedJob = true;
      chosenBackend = existingJob.backend;
    } else {
      const cfg = await getConfigSnapshot();
      const hasHardware = !!cfg.relay_parakeet_base_url;
      const quota = await getTierQuotas();
      const preference =
        cfg.relay_transcribe_backend_preference || "gemini_first";
      const plan = planBackend(row, quota, { hasHardware, preference });
      if (!plan.allowed) {
        await recordCall({
          install_id: installId,
          tier,
          pipeline: "transcribe",
          backend: null,
          model: null,
          status: "refused",
          credit_charged: 0,
          duration_ms: Date.now() - t0,
          cost_usd: 0,
          job_id: jobId,
          error: plan.reason,
        });
        const e = await errorEnvelope({
          error: plan.reason,
          installId,
          tier,
          statusHint: 402,
        });
        return res.status(402).json(e.body);
      }
      chosenBackend = plan.backend;
    }
    // ── Download phase ─────────────────────────────────────────────
    const tmpDir = await fs.mkdtemp(path.join(os.tmpdir(), "relay-dl-"));
    // An explicit `type` wins; URL sniffing only applies when it is absent.
    const isYT = type === "youtube" || (!type && looksLikeYouTube(mediaUrl));
    const dlStart = Date.now();
    let audio;
    let downloadMs = 0;
    try {
      audio = isYT
        ? await downloadYouTube(mediaUrl, tmpDir)
        : await downloadDirect(mediaUrl, tmpDir);
      downloadMs = Date.now() - dlStart;
      console.log(
        `[transcribe-url] downloaded ${audio.bytes} bytes from ${isYT ? "youtube" : "direct"} in ${downloadMs}ms (${mediaUrl.slice(0, 80)})`
      );
    } catch (err) {
      // Best-effort cleanup: a failed removal must not mask the download error.
      try { await fs.rm(tmpDir, { recursive: true, force: true }); } catch {}
      console.error(`[transcribe-url] download failed: ${err?.message || err}`);
      // One truncated message shared by the audit row and the client
      // response, so the two always agree.
      const downloadError = (
        "download_failed: " + (err?.message || String(err))
      ).slice(0, 200);
      // NOTE(review): unlike the transcription failure path below, a
      // reused job is not passed to refundJob() here — confirm whether
      // that asymmetry is intended.
      await recordCall({
        install_id: installId,
        tier,
        pipeline: "transcribe",
        backend: chosenBackend,
        model: null,
        status: "error",
        credit_charged: 0,
        duration_ms: Date.now() - t0,
        download_ms: Date.now() - dlStart,
        cost_usd: 0,
        job_id: jobId,
        error: downloadError,
      });
      const e = await errorEnvelope({
        error: downloadError,
        installId,
        tier,
        statusHint: 502,
      });
      return res.status(502).json(e.body);
    }
    // ── Transcription phase ────────────────────────────────────────
    const cfg = await getConfigSnapshot();
    let result;
    try {
      const audioBuf = await fs.readFile(audio.filePath);
      // A client-supplied mime_type hint overrides what the download detected.
      const mimeType = bodyMime || audio.mimeType;
      if (chosenBackend === "gemini") {
        const backend = createGeminiBackend({
          apiKey: cfg.relay_gemini_api_key,
          transcriptionModel: cfg.relay_gemini_transcription_model,
          analysisModel: cfg.relay_gemini_analysis_model,
        });
        result = await backend.transcribeAudio({
          audio: audioBuf,
          mimeType,
          title: title || "",
          channel: channel || "",
          description: description || "",
          chapters: Array.isArray(chapters) ? chapters : [],
          offsetSeconds: 0,
        });
      } else {
        const backend = createHardwareBackend({
          parakeetBaseURL: cfg.relay_parakeet_base_url,
          gemmaBaseURL: cfg.relay_gemma_base_url,
          parakeetModel: cfg.relay_parakeet_model,
          gemmaModel: cfg.relay_gemma_model,
        });
        result = await backend.transcribeAudio({
          audio: audioBuf,
          mimeType,
          offsetSeconds: 0,
        });
      }
    } catch (err) {
      // Temp-dir removal is handled by the `finally` below.
      if (reusedJob) refundJob(installId, jobId);
      console.error(`[transcribe-url] transcribe failed: ${err?.message}`);
      await recordCall({
        install_id: installId,
        tier,
        pipeline: "transcribe",
        backend: chosenBackend,
        model:
          chosenBackend === "gemini"
            ? cfg.relay_gemini_transcription_model
            : cfg.relay_parakeet_model,
        status: "error",
        credit_charged: 0,
        duration_ms: Date.now() - t0,
        download_ms: downloadMs,
        cost_usd: 0,
        job_id: jobId,
        error: (err?.message || String(err)).slice(0, 200),
      });
      const e = await errorEnvelope({
        error: err?.message || "backend_error",
        installId,
        tier,
        statusHint: err?.status || 502,
      });
      return res.status(e.statusHint).json(e.body);
    } finally {
      // Runs on success and failure alike; best-effort so a cleanup
      // error never replaces the real outcome.
      try { await fs.rm(tmpDir, { recursive: true, force: true }); } catch {}
    }
    // ── Commit + audit ─────────────────────────────────────────────
    // Only a job seen for the first time is charged.
    let creditCharged = 0;
    if (!reusedJob) {
      await commitCredit(installId, { backend: chosenBackend, tier });
      markJobCharged(installId, jobId, { backend: chosenBackend, tier });
      creditCharged = 1;
    }
    // Guard `result` the same way the audit row's `model` field does.
    const costDetails =
      chosenBackend === "gemini" && result?.usage
        ? calcGeminiCost(result.model, result.usage)
        : {
            input_tokens: 0,
            output_tokens: 0,
            thinking_tokens: 0,
            cost_usd: 0,
          };
    await recordCall({
      install_id: installId,
      tier,
      pipeline: "transcribe",
      backend: chosenBackend,
      model: result?.model || null,
      status: "success",
      credit_charged: creditCharged,
      duration_ms: Date.now() - t0,
      download_ms: downloadMs,
      audio_bytes: audio.bytes,
      job_id: jobId,
      ...costDetails,
    });
    const body = await envelope({ result, installId, tier, creditCharged });
    res.json(body);
  });
  return router;
 }
@@ -11,8 +11,9 @@ import { v_0_2_7 } from './v0.2.7'
 import { v_0_2_8 } from './v0.2.8'
 import { v_0_2_9 } from './v0.2.9'
 import { v_0_2_10 } from './v0.2.10'
 import { v_0_2_11 } from './v0.2.11'
 export const versionGraph = VersionGraph.of({
-  current: v_0_2_10,
+  current: v_0_2_11,
-  other: [v_0_2_9, v_0_2_8, v_0_2_7, v_0_2_6, v_0_2_5, v_0_2_4, v_0_2_3, v_0_2_2, v_0_2_1, v_0_2_0, v_0_1_0],
+  other: [v_0_2_10, v_0_2_9, v_0_2_8, v_0_2_7, v_0_2_6, v_0_2_5, v_0_2_4, v_0_2_3, v_0_2_2, v_0_2_1, v_0_2_0, v_0_1_0],
 })
@@ -0,0 +1,13 @@
 import { VersionInfo } from '@start9labs/start-sdk'
 // Version descriptor for release 0.2.11. The release notes describe the
 // two new relay endpoints and the larger Docker image.
 export const v_0_2_11 = VersionInfo.of({
  version: '0.2.11:0',
  releaseNotes: {
    en_US:
      'New GET /relay/capabilities endpoint tells Recap clients the max audio size/duration the relay can comfortably handle given its current backend-routing preference. New POST /relay/transcribe-url endpoint downloads YouTube or direct audio URLs server-side so Recap clients no longer have to upload large audio files from their home connection. Docker image now ships yt-dlp + ffmpeg + python (~150MB larger).',
  },
  // Both migrations are intentionally empty: nothing is transformed when
  // moving up to or down from this version.
  migrations: {
    up: async ({ effects }) => {},
    down: async ({ effects }) => {},
  },
 })