// YouTube captions fast-path. Uses yt-dlp to fetch auto-generated
// (or manual) subtitle tracks WITHOUT downloading the video audio.
// For YouTube videos that have captions, this is dramatically
// faster than running audio through Gemini transcribe — captions
// download is typically 2-5 seconds vs 60-300+ seconds for audio.
//
// We then transform the .vtt subtitle file into a bracketed [MM:SS]
// transcript matching the shape Gemini's transcribe path produces,
// so the analyze step receives the same input format regardless of
// which source provided the transcript.
//
// Public entry point: fetchYouTubeCaptions({ url, tmpDir }) → {
// text: bracketed transcript string
// segments: [] (parity with transcribe; analyze doesn't use)
// duration_seconds: video duration in seconds as printed by yt-dlp (null when yt-dlp reports none)
// captions_source: "manual" | "auto" (which track yt-dlp picked)
// }
import fs from "fs/promises";
import path from "path";
import { execFile } from "child_process";
import { promisify } from "util";
// Promise-returning form of child_process.execFile so yt-dlp runs can
// be awaited; resolves with { stdout, stderr }.
const execFileAsync = promisify(execFile);
// Hard cap on the captions fetch — yt-dlp can occasionally hang on
// YouTube rate-limit pages; better to fail fast than wait forever.
// Passed as the `timeout` option (milliseconds) of every yt-dlp call
// in this file, i.e. 90 seconds per invocation.
const CAPTIONS_TIMEOUT_MS = 90_000;
/**
 * Fetch a video's subtitle track with yt-dlp (no media download) and
 * convert it into a bracketed transcript.
 *
 * Two-pass strategy:
 *
 *   Pass 1: `--sub-langs "en.*"` (covers en, en-US, en-GB, en-orig,
 *           etc.). Handles the common case fast. A failure of this
 *           yt-dlp run is fatal and is rethrown with its stderr.
 *
 *   Pass 2: only if pass 1 left no .vtt in `tmpDir`, retry with
 *           `--sub-langs all`. A failure here is NOT fatal; we fall
 *           through to the diagnostic below.
 *
 *   If there is still no .vtt, `yt-dlp --list-subs` is run and a short
 *   summary of its output is embedded in the thrown error so the
 *   operator can tell "video has no captions" apart from "extractor is
 *   blocked" (rate limit, geo, sign-in wall).
 *
 * The earlier single-pass implementation failed silently on videos
 * whose captions were tagged in ways that didn't match "en.*".
 *
 * @param {object} params
 * @param {string} params.url - Video URL handed to yt-dlp.
 * @param {string} params.tmpDir - Directory yt-dlp writes
 *   `captions.<lang>.vtt` into. Every `.vtt` found in it afterwards is
 *   treated as a candidate track.
 * @returns {Promise<{text: string, segments: Array, duration_seconds: (number|null), captions_source: ("manual"|"auto")}>}
 *   `text` is the bracketed transcript, `segments` is always empty,
 *   `duration_seconds` is what yt-dlp printed (null if nothing numeric),
 *   `captions_source` is "auto" when the chosen file name contains
 *   "auto" or "orig", otherwise "manual".
 * @throws {Error} When the pass-1 yt-dlp run fails (original error is
 *   attached as `cause`), or when no .vtt exists after both passes.
 *   Errors from reading `tmpDir` or the chosen file propagate as-is.
 */
export async function fetchYouTubeCaptions({ url, tmpDir }) {
  const outTemplate = path.join(tmpDir, "captions.%(ext)s");
  const baseArgs = [
    "--write-subs",
    "--write-auto-subs",
    "--convert-subs",
    "vtt",
    "--skip-download",
    // Per yt-dlp's option docs, `--print` implies `--simulate` unless
    // `--no-simulate` is given, and a simulated run writes no files.
    // Without this flag the subtitle file would never reach disk;
    // `--skip-download` still prevents the media download itself.
    "--no-simulate",
    "--no-warnings",
    "-o",
    outTemplate,
    "--print",
    "duration",
  ];
  // "--" ends option parsing, so a url that happens to start with "-"
  // is treated as a URL and can never be read as a yt-dlp flag.
  const argsFor = (subLangs) => [...baseArgs, "--sub-langs", subLangs, "--", url];
  const listVttFiles = async () =>
    (await fs.readdir(tmpDir)).filter((f) => f.endsWith(".vtt"));

  let lastStderr = "";

  // Pass 1: english-only. A failed run here aborts the whole fetch.
  let firstRun;
  try {
    firstRun = await runYtDlp(argsFor("en.*"));
  } catch (err) {
    lastStderr = (err?.stderr || "").toString();
    throw new Error(
      `yt-dlp captions fetch failed: ${lastStderr.slice(0, 200) || err?.message}`,
      { cause: err }
    );
  }
  lastStderr = firstRun.stderr;
  let durationSec = parseDuration(firstRun.stdout);

  let vttFiles = await listVttFiles();

  // Pass 2: fall back to "all" if english-only produced nothing, for
  // tracks whose language tag does not match the "en.*" pattern.
  if (vttFiles.length === 0) {
    console.warn(
      `[captions] pass 1 (en.*) produced no .vtt for ${url} — retrying with --sub-langs all`
    );
    try {
      const retry = await runYtDlp(argsFor("all"));
      lastStderr = retry.stderr;
      // `??` (not `||`) so a reported duration of 0 is kept rather than
      // being replaced by the pass-1 value.
      durationSec = parseDuration(retry.stdout) ?? durationSec;
    } catch (err) {
      lastStderr = (err?.stderr || "").toString();
      // Don't throw; fall through to the "still no .vtt" diagnostic.
    }
    vttFiles = await listVttFiles();
  }

  if (vttFiles.length === 0) {
    // Last-resort diagnostic: ask yt-dlp what subtitles it CAN see.
    // Best-effort: if this call fails too, its stderr/message is used
    // as the diagnostic text instead.
    let listOut = "";
    try {
      const listing = await runYtDlp([
        "--list-subs",
        "--skip-download",
        "--no-warnings",
        "--",
        url,
      ]);
      listOut = listing.stdout;
    } catch (lsErr) {
      listOut = (lsErr?.stderr || lsErr?.message || "").toString();
    }
    // Keep the message short: drop blank lines and "[extractor] ..."
    // progress lines, cap at 12 lines / 400 characters.
    const summary = listOut
      .split("\n")
      .filter((l) => l.trim() && !/^\[/.test(l))
      .slice(0, 12)
      .join(" | ")
      .slice(0, 400);
    throw new Error(
      `yt-dlp produced no .vtt subtitle file. ` +
        `yt-dlp --list-subs output: ${summary || "(empty)"}. ` +
        `Last stderr: ${lastStderr.slice(0, 200)}`
    );
  }

  // Preference order for picking among multiple .vtt files:
  //   1. English, not auto ("…​.en.vtt", "….en-US.vtt")
  //   2. Any other english (file name contains "auto" or "orig")
  //   3. Any other language (translation is better than nothing for
  //      benchmarking; analyze just needs text)
  // Ties are broken alphabetically so the choice is deterministic.
  const isEnglish = (name) => /\.en[.-]/i.test(name);
  const isAuto = (name) => /auto|orig/i.test(name);
  vttFiles.sort((a, b) => {
    const aEn = isEnglish(a);
    const bEn = isEnglish(b);
    if (aEn !== bEn) return aEn ? -1 : 1;
    const aAuto = isAuto(a);
    const bAuto = isAuto(b);
    if (aAuto !== bAuto) return aAuto ? 1 : -1;
    return a.localeCompare(b);
  });

  const chosenName = vttFiles[0];
  const captionsSource = isAuto(chosenName) ? "auto" : "manual";
  const vtt = await fs.readFile(path.join(tmpDir, chosenName), "utf8");
  const text = vttToBracketedTranscript(vtt);

  return {
    text,
    segments: [],
    duration_seconds: durationSec,
    captions_source: captionsSource,
  };
}
// Invoke the `yt-dlp` binary with `args` and resolve with its captured
// output as { stdout, stderr } (each defaulting to "" when absent).
// The underlying execFile rejection (non-zero exit, timeout after
// CAPTIONS_TIMEOUT_MS, or output beyond the 4 MiB buffer) propagates
// unchanged, so callers can read `err.stderr` for diagnostics.
async function runYtDlp(args) {
  const execOptions = {
    timeout: CAPTIONS_TIMEOUT_MS,
    maxBuffer: 4 * 1024 * 1024,
  };
  const { stdout, stderr } = await execFileAsync("yt-dlp", args, execOptions);
  return { stdout: stdout || "", stderr: stderr || "" };
}
// Extract the duration (seconds) that yt-dlp writes to stdout when
// `--print duration` is set. Only the LAST whitespace-separated token
// of stdout is examined; it is read with parseFloat, so a numeric
// prefix such as "300s" still yields 300. Returns null when stdout is
// empty/absent or that token is not a finite number (e.g. "NA").
function parseDuration(stdout) {
  if (!stdout) {
    return null;
  }
  const tokens = stdout.trim().split(/\s+/);
  const seconds = Number.parseFloat(tokens[tokens.length - 1]);
  if (!Number.isFinite(seconds)) {
    return null;
  }
  return seconds;
}
// Parse a .vtt subtitle file and produce a "[M:SS] line\n[M:SS] line"
// transcript ("[H:MM:SS]" once the hour is non-zero). Returns "" for
// empty/absent input or when no cue carries text.
//
// VTT format:
//   WEBVTT
//
//   00:00:01.000 --> 00:00:03.500
//   First caption line.
//
//   00:00:03.500 --> 00:00:06.000
//   Second caption line.
//
// Per cue we strip inline tags (<c>, <00:00:01.234>, …), join the text
// lines with a space, and prefix the result with the cue's start time.
//
// De-duplication (YouTube auto-captions emit each line progressively
// and then again in final form):
//   - a cue whose text equals or extends the previous entry replaces
//     that entry's TEXT but keeps its ORIGINAL start time, since that
//     is when the line actually began;
//   - a cue whose text is a prefix of the previous entry is dropped.
//
// @param {string} vtt  Contents of a WebVTT file.
// @returns {string}    Newline-joined bracketed transcript.
export function vttToBracketedTranscript(vtt) {
  if (!vtt) return "";

  // Cue timing line. WebVTT makes the hours component optional
  // ("MM:SS.mmm") and allows it to have more than two digits.
  const cueTiming =
    /^((?:\d{2,}:)?\d{2}:\d{2}\.\d{3})\s+-->\s+(?:\d{2,}:)?\d{2}:\d{2}\.\d{3}/;

  const pad = (n) => n.toString().padStart(2, "0");

  // "HH:MM:SS.mmm" or "MM:SS.mmm" → "[H:MM:SS]" / "[M:SS]".
  const toStamp = (ts) => {
    const parts = ts.split(":");
    const secs = Math.floor(Number.parseFloat(parts.pop()));
    const mins = Number.parseInt(parts.pop(), 10);
    const hours = parts.length > 0 ? Number.parseInt(parts[0], 10) : 0;
    return hours > 0
      ? `[${hours}:${pad(mins)}:${pad(secs)}]`
      : `[${mins}:${pad(secs)}]`;
  };

  const lines = vtt.split(/\r?\n/);
  const entries = []; // { stamp, text } in emission order
  let i = 0;

  while (i < lines.length) {
    const timing = cueTiming.exec(lines[i]);
    i++;
    if (!timing) continue;

    // Collect the cue payload. Only a truly EMPTY line ends a cue:
    // YouTube auto-captions put a line holding a single space inside
    // the cue, and stopping there would drop the cue's text and start
    // time. Whitespace-only lines are simply skipped. Stopping at the
    // next timing line as well keeps files that use whitespace-only
    // separators from bleeding one cue into the next.
    const textLines = [];
    while (i < lines.length && lines[i] !== "" && !cueTiming.test(lines[i])) {
      // One pass removes every <...> tag, inline timestamps included.
      const cleaned = lines[i].replace(/<[^>]+>/g, "").trim();
      if (cleaned) textLines.push(cleaned);
      i++;
    }

    const blockText = textLines.join(" ").trim();
    if (!blockText) continue;

    const prev = entries[entries.length - 1];
    if (prev) {
      if (blockText.startsWith(prev.text)) {
        // Same or extended text: keep the earlier stamp, take the
        // more-complete wording.
        prev.text = blockText;
        continue;
      }
      if (prev.text.startsWith(blockText)) {
        // Already emitted a more-complete version; skip this one.
        continue;
      }
    }

    entries.push({ stamp: toStamp(timing[1]), text: blockText });
  }

  return entries.map((e) => `${e.stamp} ${e.text}`).join("\n");
}