recap-relay/server/youtube-captions.js

// YouTube captions fast-path. Uses yt-dlp to fetch auto-generated
// (or manual) subtitle tracks WITHOUT downloading the video audio.
// For YouTube videos that have captions, this is dramatically
// faster than running audio through Gemini transcribe — captions
// download is typically 2-5 seconds vs 60-300+ seconds for audio.
//
// We then transform the .vtt subtitle file into a bracketed [MM:SS]
// transcript matching the shape Gemini's transcribe path produces,
// so the analyze step receives the same input format regardless of
// which source provided the transcript.
//
// Public entry point: fetchYouTubeCaptions({ url, tmpDir }) → {
//   text:              bracketed transcript string
//   segments:          [] (parity with transcribe; analyze doesn't use)
//   duration_seconds:  audio duration in seconds
//   captions_source:   "manual" | "auto"  (which track yt-dlp picked)
// }

import fs from "fs/promises";
import path from "path";
import { execFile } from "child_process";
import { promisify } from "util";

// Promise-returning form of child_process.execFile, so yt-dlp
// invocations can be awaited. Callers in this file read `stdout` and
// `stderr` off the resolved value, and `stderr` / `message` off the
// rejection.
const execFileAsync = promisify(execFile);

// Hard cap on the captions fetch — yt-dlp can occasionally hang on
// YouTube rate-limit pages; better to fail fast than wait forever.
// Passed as the `timeout` option (milliseconds) to every yt-dlp call.
const CAPTIONS_TIMEOUT_MS = 90_000;

// Fetch a YouTube video's captions via yt-dlp (no media download) and
// return them as a bracketed transcript.
//
// Parameters (single options object):
//   url     URL handed to yt-dlp.
//   tmpDir  Existing directory yt-dlp writes subtitle files into. Every
//           ".vtt" found in it afterwards is treated as a candidate, so
//           it should be dedicated to this call.
//
// Resolves to:
//   text              bracketed "[M:SS] ..." transcript
//   segments          always [] (shape parity with the transcribe path)
//   duration_seconds  number parsed from `--print duration`, or null
//   captions_source   "auto" when the chosen filename contains "auto" or
//                     "orig", otherwise "manual"
//
// Throws Error when pass 1 fails (non-zero exit / timeout; the original
// error is attached as `cause`) or when neither pass produced a .vtt.
//
// Two-pass strategy:
//
//   Pass 1: --sub-langs "en.*" (covers en, en-US, en-GB, en-orig, ...).
//           Handles the common case fast.
//
//   Pass 2: If pass 1 produced no .vtt, retry with every subtitle
//           language except the live-chat replay, which is not a caption
//           track. If that still yields nothing, a diagnostic
//           `--list-subs` call is made so the operator sees which
//           language codes yt-dlp actually reports.
export async function fetchYouTubeCaptions({ url, tmpDir }) {
  const fallbackLangs = "all,-live_chat";
  const outTemplate = path.join(tmpDir, "captions.%(ext)s");
  const baseArgs = [
    "--write-subs",
    "--write-auto-subs",
    "--convert-subs",
    "vtt",
    "--skip-download",
    // `--print` at its default stage implies `--simulate`, which stops
    // yt-dlp from writing ANY file, subtitles included. `--no-simulate`
    // keeps the subtitle write; `--skip-download` still prevents the
    // media download.
    "--no-simulate",
    "--no-warnings",
    "-o",
    outTemplate,
    "--print",
    "duration",
  ];

  let durationSec = null;
  let lastStderr = "";

  // Pass 1: english-only. A failure here is fatal.
  try {
    const first = await runYtDlp([...baseArgs, "--sub-langs", "en.*", url]);
    lastStderr = first.stderr;
    durationSec = parseDuration(first.stdout);
  } catch (err) {
    lastStderr = (err?.stderr || "").toString();
    throw new Error(
      `yt-dlp captions fetch failed: ${lastStderr.slice(0, 200) || err?.message}`,
      { cause: err }
    );
  }

  let vttFiles = await listVttFiles(tmpDir);

  // Pass 2: widen the language filter if english-only produced nothing
  // (some tracks carry language codes that do not start with "en").
  if (vttFiles.length === 0) {
    console.warn(
      `[captions] pass 1 (en.*) produced no .vtt for ${url} — retrying with --sub-langs ${fallbackLangs}`
    );
    try {
      const second = await runYtDlp([...baseArgs, "--sub-langs", fallbackLangs, url]);
      lastStderr = second.stderr;
      // `??` rather than `||`: only fall back to the pass-1 value when
      // this pass printed no parseable duration at all.
      durationSec = parseDuration(second.stdout) ?? durationSec;
    } catch (err) {
      // Deliberately not fatal: fall through to the "still no .vtt"
      // diagnostic below, which reports this stderr.
      lastStderr = (err?.stderr || "").toString();
    }
    vttFiles = await listVttFiles(tmpDir);
  }

  if (vttFiles.length === 0) {
    // Last-resort diagnostic so the operator can tell whether the video
    // genuinely lacks captions or yt-dlp's extractor is being blocked
    // (rate limit, geo, sign-in wall).
    const summary = await summarizeAvailableSubs(url);
    throw new Error(
      `yt-dlp produced no .vtt subtitle file. ` +
        `yt-dlp --list-subs output: ${summary || "(empty)"}. ` +
        `Last stderr: ${lastStderr.slice(0, 200)}`
    );
  }

  const chosenName = pickPreferredVtt(vttFiles);
  const captionsSource = /auto|orig/i.test(chosenName) ? "auto" : "manual";

  const vtt = await fs.readFile(path.join(tmpDir, chosenName), "utf8");
  return {
    text: vttToBracketedTranscript(vtt),
    segments: [],
    duration_seconds: durationSec,
    captions_source: captionsSource,
  };
}

// Names of the ".vtt" files currently present in `dir`.
async function listVttFiles(dir) {
  const entries = await fs.readdir(dir);
  return entries.filter((name) => name.endsWith(".vtt"));
}

// Run `yt-dlp --list-subs` and condense its output (or, on failure, its
// stderr / error message) into a short single-line summary: yt-dlp's
// "[...]" progress lines are dropped, at most 12 lines and 400
// characters are kept. Never throws.
async function summarizeAvailableSubs(url) {
  let listOut = "";
  try {
    const { stdout } = await runYtDlp([
      "--list-subs",
      "--skip-download",
      "--no-warnings",
      url,
    ]);
    listOut = stdout;
  } catch (lsErr) {
    listOut = (lsErr?.stderr || lsErr?.message || "").toString();
  }
  return listOut
    .split("\n")
    .filter((l) => l.trim() && !/^\[/.test(l))
    .slice(0, 12)
    .join(" | ")
    .slice(0, 400);
}

// Choose the best subtitle filename from a non-empty list:
//   1. English tracks first (".en." / ".en-XX." in the filename)
//   2. within the same group, names without "auto"/"orig" first
//   3. otherwise alphabetical, so the choice is deterministic
// A non-English track is still returned when nothing else exists —
// a translation is better than nothing for the analyze step.
function pickPreferredVtt(vttFiles) {
  const isEnglish = (name) => /\.en[.-]/i.test(name);
  const isAuto = (name) => /auto|orig/i.test(name);
  const ranked = [...vttFiles].sort((a, b) => {
    if (isEnglish(a) !== isEnglish(b)) return isEnglish(a) ? -1 : 1;
    if (isAuto(a) !== isAuto(b)) return isAuto(a) ? 1 : -1;
    return a.localeCompare(b);
  });
  return ranked[0];
}

// Invoke the `yt-dlp` binary with `args` and resolve to its captured
// output as { stdout, stderr }, with missing streams normalised to "".
// The underlying execFile rejection (non-zero exit, timeout after
// CAPTIONS_TIMEOUT_MS, or output over the 4 MiB buffer) propagates
// unchanged, so callers can read `err.stderr` for diagnostics.
async function runYtDlp(args) {
  const execOptions = {
    timeout: CAPTIONS_TIMEOUT_MS,
    maxBuffer: 4 * 1024 * 1024,
  };
  const { stdout, stderr } = await execFileAsync("yt-dlp", args, execOptions);
  return { stdout: stdout || "", stderr: stderr || "" };
}

// Extract the duration (seconds) that `--print duration` writes to
// yt-dlp's stdout. Only the final whitespace-separated token is
// examined; it is read with parseFloat, so a token such as "NA" gives
// null. Returns null for empty/missing stdout or a non-finite result.
function parseDuration(stdout) {
  if (!stdout) {
    return null;
  }
  const tokens = stdout.trim().split(/\s+/);
  const seconds = parseFloat(tokens[tokens.length - 1]);
  if (Number.isFinite(seconds)) {
    return seconds;
  }
  return null;
}

// Parse a WebVTT (.vtt) document into a bracketed transcript, one
// "[M:SS] text" (or "[H:MM:SS] text" from one hour on) line per kept
// caption block, joined with "\n". Returns "" when no cue has text;
// null/undefined input is treated as an empty document.
//
// VTT format:
//   WEBVTT
//
//   00:00:01.000 --> 00:00:03.500
//   <c>First caption line.</c>
//
//   00:00:03.500 --> 00:00:06.000
//   Second caption line.
//
// Handling details:
//   * Timestamps may omit the hours field ("MM:SS.mmm"), as WebVTT
//     allows; cue settings after the end time are ignored.
//   * A cue's text runs until a truly empty line or the next timing
//     line. Whitespace-only lines do NOT end the cue: YouTube
//     auto-captions put a " " line before the first text line, and
//     ending the cue there would drop that cue's text.
//   * Inline tags (<c>, <c.colorXXX>, <00:00:01.234>, ...) are stripped
//     and the cue's lines are joined with single spaces.
//   * Progressive duplicates are collapsed: when a cue's text starts
//     with the previously kept text, the kept entry's text is replaced
//     by the longer version but keeps its ORIGINAL (earliest) start
//     time; when the previously kept text already starts with the new
//     text, the new cue is skipped.
export function vttToBracketedTranscript(vtt) {
  // Groups: 1 = hours (optional), 2 = minutes, 3 = whole seconds.
  // Not global/sticky, so exec()/test() carry no lastIndex state.
  const timingRe =
    /^(?:(\d{2,}):)?(\d{2}):(\d{2})\.\d{3}\s+-->\s+(?:\d{2,}:)?\d{2}:\d{2}\.\d{3}/;
  const pad = (n) => String(n).padStart(2, "0");

  const lines = String(vtt ?? "").split(/\r?\n/);
  const entries = []; // kept cues, in order: { stamp, text }
  let i = 0;
  while (i < lines.length) {
    const timing = timingRe.exec(lines[i]);
    i++;
    if (!timing) continue; // header, NOTE/STYLE block, cue id, blank line

    // Collect this cue's text lines.
    const textLines = [];
    while (i < lines.length && lines[i] !== "" && !timingRe.test(lines[i])) {
      const cleaned = lines[i]
        .replace(/<\d{2}:\d{2}:\d{2}\.\d{3}>/g, "")
        .replace(/<\/?[^>]+>/g, "")
        .trim();
      if (cleaned) textLines.push(cleaned);
      i++;
    }
    const text = textLines.join(" ");
    if (!text) continue;

    const previous = entries[entries.length - 1];
    if (previous) {
      if (text.startsWith(previous.text)) {
        // Same caption re-emitted (possibly extended): keep the more
        // complete text, anchored at the time it first appeared.
        previous.text = text;
        continue;
      }
      if (previous.text.startsWith(text)) {
        // A more complete version is already kept.
        continue;
      }
    }

    const hours = timing[1] ? Number.parseInt(timing[1], 10) : 0;
    const mins = Number.parseInt(timing[2], 10);
    const secs = Number.parseInt(timing[3], 10);
    const stamp =
      hours > 0
        ? `[${hours}:${pad(mins)}:${pad(secs)}]`
        : `[${mins}:${pad(secs)}]`;
    entries.push({ stamp, text });
  }
  return entries.map(({ stamp, text }) => `${stamp} ${text}`).join("\n");
}