// YouTube captions fast-path. Uses yt-dlp to fetch auto-generated
// (or manual) subtitle tracks WITHOUT downloading the video audio.
// For YouTube videos that have captions, this is dramatically
// faster than running audio through Gemini transcribe — captions
// download is typically 2-5 seconds vs 60-300+ seconds for audio.
//
// We then transform the .vtt subtitle file into a bracketed [MM:SS]
// transcript matching the shape Gemini's transcribe path produces,
// so the analyze step receives the same input format regardless of
// which source provided the transcript.
//
// Public entry point: fetchYouTubeCaptions({ url, tmpDir }) → {
//   text: bracketed transcript string
//   segments: [] (parity with transcribe; analyze doesn't use)
//   duration_seconds: audio duration in seconds (null when unknown)
//   captions_source: "manual" | "auto" (which track yt-dlp picked)
// }

import fs from "fs/promises";
import path from "path";
import { execFile } from "child_process";
import { promisify } from "util";

const execFileAsync = promisify(execFile);

// Hard cap on the captions fetch — yt-dlp can occasionally hang on
// YouTube rate-limit pages; better to fail fast than wait forever.
const CAPTIONS_TIMEOUT_MS = 90_000;

// Upper bound on captured stdout/stderr for a single yt-dlp invocation.
const YT_DLP_MAX_BUFFER = 4 * 1024 * 1024;

// Basename used in the yt-dlp output template; subtitle files are
// written as "<basename>.<lang>.vtt".
const CAPTIONS_BASENAME = "captions";

// Cue timing line: "[HH:]MM:SS.mmm --> [HH:]MM:SS.mmm". WebVTT makes
// the hours component optional, so both forms must be accepted. Cue
// settings after the end timestamp are allowed (regex is not end-anchored).
const VTT_CUE_TIMING_RE =
  /^((?:\d{2,}:)?\d{2}:\d{2}\.\d{3})\s+-->\s+((?:\d{2,}:)?\d{2}:\d{2}\.\d{3})/;

// Character escapes defined by WebVTT that are worth decoding for a
// plain-text transcript.
const VTT_ENTITIES = Object.freeze({
  "&amp;": "&",
  "&lt;": "<",
  "&gt;": ">",
  "&nbsp;": " ",
});

/**
 * Fetch captions for a video with yt-dlp and convert them into a
 * bracketed-timestamp transcript.
 *
 * Two-pass strategy:
 *
 *   Pass 1: --sub-langs "en.*" (covers en, en-US, en-GB, en-orig, etc.).
 *           Handles the common case fast.
 *
 *   Pass 2: If pass 1 produced no .vtt, retry with every subtitle
 *           language (live chat excluded). If that still yields
 *           nothing, run --list-subs as a diagnostic so the operator
 *           sees the actual lang codes in the error message.
 *
 * @param {{ url: string, tmpDir: string }} params
 *   url    - video URL handed to yt-dlp.
 *   tmpDir - existing directory the subtitle files are written into.
 * @returns {Promise<{ text: string, segments: Array, duration_seconds: (number|null), captions_source: ("manual"|"auto") }>}
 * @throws {Error} when pass 1 exits non-zero, or when no .vtt file was
 *   produced after both passes.
 */
export async function fetchYouTubeCaptions({ url, tmpDir }) {
  const outTemplate = path.join(tmpDir, `${CAPTIONS_BASENAME}.%(ext)s`);
  const baseArgs = [
    "--write-subs",
    "--write-auto-subs",
    "--convert-subs", "vtt",
    "--skip-download",
    // --print implies --simulate, and in simulate mode yt-dlp writes no
    // files at all (subtitles included). --no-simulate keeps the
    // subtitle download while --skip-download still skips the media.
    "--no-simulate",
    "--no-warnings",
    "-o", outTemplate,
    "--print", "duration",
  ];

  let durationSec = null;
  let lastStderr = "";

  // Pass 1: english-only. "--" ends option parsing so a URL that starts
  // with "-" can never be interpreted as a yt-dlp option.
  try {
    const r = await runYtDlp([...baseArgs, "--sub-langs", "en.*", "--", url]);
    lastStderr = r.stderr;
    durationSec = parseDuration(r.stdout);
  } catch (err) {
    lastStderr = (err?.stderr || "").toString();
    throw new Error(
      `yt-dlp captions fetch failed: ${lastStderr.slice(0, 200) || err?.message}`,
      { cause: err }
    );
  }

  let vttFiles = await listVttFiles(tmpDir);

  // Pass 2: fall back to every language if english-only produced
  // nothing — some tracks carry language codes that "en.*" does not
  // match. live_chat is excluded: it is a chat replay, not captions.
  if (vttFiles.length === 0) {
    console.warn(
      `[captions] pass 1 (en.*) produced no .vtt for ${url} — retrying with --sub-langs all`
    );
    try {
      const r = await runYtDlp([
        ...baseArgs,
        "--sub-langs", "all,-live_chat",
        "--", url,
      ]);
      lastStderr = r.stderr;
      durationSec = parseDuration(r.stdout) ?? durationSec;
    } catch (err) {
      lastStderr = (err?.stderr || "").toString();
      // Deliberately not rethrown: fall through to the "still no .vtt"
      // diagnostic below, which reports lastStderr.
    }
    vttFiles = await listVttFiles(tmpDir);
  }

  if (vttFiles.length === 0) {
    // Last-resort diagnostic: ask yt-dlp what subtitles it CAN see.
    // Surfaces in the error message so the operator can tell whether
    // the video genuinely lacks captions or whether yt-dlp's
    // extractor is being blocked (rate limit, geo, sign-in wall).
    const summary = await describeAvailableSubs(url);
    throw new Error(
      `yt-dlp produced no .vtt subtitle file. ` +
        `yt-dlp --list-subs output: ${summary || "(empty)"}. ` +
        `Last stderr: ${lastStderr.slice(0, 200)}`
    );
  }

  const [chosenName] = rankVttFiles(vttFiles);
  // NOTE(review): this is a filename heuristic ("auto"/"orig" in the
  // name). Confirm that yt-dlp's subtitle filenames actually
  // distinguish auto-generated tracks before relying on this field.
  const captionsSource = /auto|orig/i.test(chosenName) ? "auto" : "manual";

  const vtt = await fs.readFile(path.join(tmpDir, chosenName), "utf8");

  return {
    text: vttToBracketedTranscript(vtt),
    segments: [],
    duration_seconds: durationSec,
    captions_source: captionsSource,
  };
}

// List the caption files written by this module's output template
// ("captions.<lang>.vtt") so unrelated .vtt files that happen to be in
// tmpDir are never picked up.
async function listVttFiles(tmpDir) {
  const entries = await fs.readdir(tmpDir);
  return entries.filter(
    (f) => f.startsWith(`${CAPTIONS_BASENAME}.`) && f.endsWith(".vtt")
  );
}

// Order candidate .vtt filenames by preference (returns a new array):
//   1. English tracks not flagged auto/orig
//   2. Any other English track ("en-orig", etc.)
//   3. Any other language (translation is better than nothing for
//      benchmarking; analyze just needs text)
// Ties are broken alphabetically so the choice is deterministic.
function rankVttFiles(vttFiles) {
  const isEnglish = (name) => /\.en[.-]/i.test(name);
  const isAuto = (name) => /auto|orig/i.test(name);
  return [...vttFiles].sort((a, b) => {
    const aEn = isEnglish(a);
    const bEn = isEnglish(b);
    if (aEn !== bEn) return aEn ? -1 : 1;
    const aAuto = isAuto(a);
    const bAuto = isAuto(b);
    if (aAuto !== bAuto) return aAuto ? 1 : -1;
    return a.localeCompare(b);
  });
}

// Best-effort one-line summary (max 400 chars) of `yt-dlp --list-subs`
// for error messages. Never throws: if the call fails, the summary is
// built from the failure's stderr/message instead.
async function describeAvailableSubs(url) {
  let listOut;
  try {
    const { stdout } = await runYtDlp([
      "--list-subs",
      "--skip-download",
      "--no-warnings",
      "--", url,
    ]);
    listOut = stdout;
  } catch (err) {
    listOut = (err?.stderr || err?.message || "").toString();
  }
  return listOut
    .split("\n")
    // Drop blank lines and yt-dlp's "[extractor] ..." progress lines.
    .filter((l) => l.trim() && !/^\[/.test(l))
    .slice(0, 12)
    .join(" | ")
    .slice(0, 400);
}

// Run yt-dlp with the given args, returning { stdout, stderr }.
// Rejects on non-zero exit or timeout; the rejection carries stderr so
// the caller can include it in diagnostic error messages.
async function runYtDlp(args) {
  const result = await execFileAsync("yt-dlp", args, {
    timeout: CAPTIONS_TIMEOUT_MS,
    maxBuffer: YT_DLP_MAX_BUFFER,
  });
  return {
    stdout: result.stdout || "",
    stderr: result.stderr || "",
  };
}

// yt-dlp prints duration (seconds) on stdout when --print duration is
// set. Other lines may interleave, so take the LAST purely numeric
// whitespace-separated token. Returns null when there is none (e.g.
// empty output or "NA").
function parseDuration(stdout) {
  if (!stdout) return null;
  const tokens = stdout.trim().split(/\s+/);
  for (let i = tokens.length - 1; i >= 0; i--) {
    if (/^\d+(?:\.\d+)?$/.test(tokens[i])) {
      return Number.parseFloat(tokens[i]);
    }
  }
  return null;
}

// Convert a cue start timestamp ("HH:MM:SS.mmm" or "MM:SS.mmm") into
// "[M:SS]", or "[H:MM:SS]" once the hour component is non-zero.
function formatCueStamp(ts) {
  const parts = ts.split(":");
  const secs = Math.floor(Number.parseFloat(parts.pop()));
  const mins = Number.parseInt(parts.pop(), 10);
  const hours = parts.length > 0 ? Number.parseInt(parts[0], 10) : 0;
  const pad = (n) => n.toString().padStart(2, "0");
  return hours > 0
    ? `[${hours}:${pad(mins)}:${pad(secs)}]`
    : `[${mins}:${pad(secs)}]`;
}

// Strip inline cue markup from one line of cue text: timestamp tags
// (<00:00:01.234>) and span tags (<c>, </c>, <v Name>, ...). Escapes
// are decoded only AFTER tag stripping so that an escaped "&lt;b&gt;"
// survives as literal text instead of being removed as a tag.
function cleanCueLine(raw) {
  return raw
    .replace(/<(?:\d{2,}:)?\d{2}:\d{2}\.\d{3}>/g, "")
    .replace(/<\/?[^>]+>/g, "")
    .replace(/&(?:amp|lt|gt|nbsp);/g, (entity) => VTT_ENTITIES[entity])
    .trim();
}

/**
 * Parse a .vtt subtitle file and produce a "[MM:SS] line\n[MM:SS] line"
 * transcript.
 *
 * VTT format:
 *   WEBVTT
 *
 *   00:00:01.000 --> 00:00:03.500
 *   First caption line.
 *
 *   00:00:03.500 --> 00:00:06.000
 *   Second caption line.
 *
 * For each cue we strip the timing arrow and inline cue tags, join the
 * cue's text lines with a space, and prefix the block with its start
 * time. Consecutive blocks that merely extend (or are a prefix of) the
 * previously emitted text are collapsed, because YouTube auto-captions
 * emit progressive + final versions of the same line.
 *
 * @param {string} vtt - raw WebVTT file content.
 * @returns {string} newline-joined "[M:SS] text" lines ("" if no cues).
 */
export function vttToBracketedTranscript(vtt) {
  const lines = vtt.split(/\r?\n/);
  const out = [];
  let lastEmitted = "";
  let i = 0;

  while (i < lines.length) {
    const timing = lines[i].match(VTT_CUE_TIMING_RE);
    if (!timing) {
      i++;
      continue;
    }
    const startTs = timing[1];

    // Collect subsequent text lines until a blank line / end of file.
    i++;
    const textLines = [];
    while (i < lines.length && lines[i].trim() !== "") {
      const cleaned = cleanCueLine(lines[i]);
      if (cleaned) textLines.push(cleaned);
      i++;
    }

    const blockText = textLines.join(" ").trim();
    if (!blockText) continue;

    if (lastEmitted && blockText.startsWith(lastEmitted)) {
      // Same text or an extension of it: replace the previous emission
      // with this more-complete version.
      out.pop();
    } else if (lastEmitted && lastEmitted.startsWith(blockText)) {
      // Already emitted a more-complete version; skip this one.
      continue;
    }

    out.push(`${formatCueStamp(startTs)} ${blockText}`);
    lastEmitted = blockText;
  }

  return out.join("\n");
}