// YouTube captions fast-path. Uses yt-dlp to fetch auto-generated
// (or manual) subtitle tracks WITHOUT downloading the video audio.
// For YouTube videos that have captions, this is dramatically
// faster than running audio through Gemini transcribe — a captions
// download typically takes 2-5 seconds vs 60-300+ seconds for audio.
//
// We then transform the .vtt subtitle file into a bracketed [MM:SS]
// transcript matching the shape Gemini's transcribe path produces,
// so the analyze step receives the same input format regardless of
// which source provided the transcript.
//
// Public entry point: fetchYouTubeCaptions({ url, tmpDir }) → {
//   text: bracketed transcript string
//   segments: [] (parity with transcribe; analyze doesn't use)
//   duration_seconds: duration in seconds as reported by yt-dlp
//                     (null when yt-dlp did not print one)
//   captions_source: "manual" | "auto" (which track yt-dlp picked)
// }
|
|
|
|
import fs from "fs/promises";
|
|
import path from "path";
|
|
import { execFile } from "child_process";
|
|
import { promisify } from "util";
|
|
|
|
// Promise-returning flavour of child_process.execFile, so yt-dlp
// invocations can simply be awaited.
const execFileAsync = promisify(execFile);

// Upper bound (in milliseconds) for a single yt-dlp invocation.
// yt-dlp can occasionally hang on YouTube rate-limit pages, and
// failing fast beats waiting forever.
const CAPTIONS_TIMEOUT_MS = 90 * 1000;
|
|
|
|
/**
 * Fetch a video's subtitle track with yt-dlp (no media download) and
 * convert it into a bracketed "[M:SS] text" transcript.
 *
 * Two-pass strategy:
 *
 *   Pass 1: --sub-langs "en.*" (covers en, en-US, en-GB, en-orig,
 *           etc.). Handles the common case fast.
 *
 *   Pass 2: If pass 1 produced no .vtt, retry with --sub-langs all.
 *           A failure here is not fatal by itself; we fall through to
 *           the diagnostic below.
 *
 *   Diagnostic: if there is still no .vtt, run --list-subs and put a
 *           summary of its output in the thrown error so the operator
 *           can tell "video has no captions" apart from "yt-dlp is
 *           being blocked".
 *
 * @param {{ url: string, tmpDir: string }} params
 *   url    - video URL, passed to yt-dlp as-is.
 *   tmpDir - directory yt-dlp writes "captions.<lang>.vtt" files into.
 * @returns {Promise<{
 *   text: string,
 *   segments: Array,
 *   duration_seconds: number|null,
 *   captions_source: "manual"|"auto"
 * }>}
 * @throws {Error} if the pass-1 yt-dlp call rejects (the original error
 *   is attached as `cause`), or if no .vtt file exists after both passes.
 */
export async function fetchYouTubeCaptions({ url, tmpDir }) {
  const outTemplate = path.join(tmpDir, "captions.%(ext)s");
  const baseArgs = [
    "--write-subs",
    "--write-auto-subs",
    "--convert-subs",
    "vtt",
    "--skip-download",
    "--no-warnings",
    "-o",
    outTemplate,
    "--print",
    "duration",
  ];

  // Only count files produced by our own output template
  // ("captions.<lang>.vtt"), so an unrelated .vtt that happens to live
  // in tmpDir is never mistaken for this video's captions.
  const listCaptionFiles = async () => {
    const entries = await fs.readdir(tmpDir);
    return entries.filter(
      (f) => f.startsWith("captions.") && f.endsWith(".vtt")
    );
  };

  let durationSec = null;
  let lastStderr = "";

  // Pass 1: english-only. A failure here is fatal.
  let pass1;
  try {
    pass1 = await runYtDlp([...baseArgs, "--sub-langs", "en.*", url]);
  } catch (err) {
    lastStderr = (err?.stderr || "").toString();
    throw new Error(
      `yt-dlp captions fetch failed: ${lastStderr.slice(0, 200) || err?.message}`,
      { cause: err }
    );
  }
  lastStderr = pass1.stderr;
  durationSec = parseDuration(pass1.stdout);

  let vttFiles = await listCaptionFiles();

  // Pass 2: fall back to "all" if english-only produced nothing.
  // Some videos carry captions whose language tag does not match
  // the "en.*" pattern even though the text is English.
  if (vttFiles.length === 0) {
    console.warn(
      `[captions] pass 1 (en.*) produced no .vtt for ${url} — retrying with --sub-langs all`
    );
    try {
      const r = await runYtDlp([...baseArgs, "--sub-langs", "all", url]);
      lastStderr = r.stderr;
      // `??` rather than `||`: a parsed duration of 0 is still a value.
      durationSec = parseDuration(r.stdout) ?? durationSec;
    } catch (err) {
      // Don't throw; fall through to the "still no .vtt" diagnostic.
      // Fall back to the message so timeouts (empty stderr) still
      // leave a trace in the final error.
      lastStderr = (err?.stderr || err?.message || "").toString();
    }
    vttFiles = await listCaptionFiles();
  }

  if (vttFiles.length === 0) {
    // Last-resort diagnostic: ask yt-dlp what subtitles it CAN see.
    // Surfaces in the error message so the operator can tell whether
    // the video genuinely lacks captions or whether yt-dlp's
    // extractor is being blocked (rate limit, geo, sign-in wall).
    let listOut = "";
    try {
      const listing = await runYtDlp([
        "--list-subs",
        "--skip-download",
        "--no-warnings",
        url,
      ]);
      listOut = listing.stdout;
    } catch (lsErr) {
      listOut = (lsErr?.stderr || lsErr?.message || "").toString();
    }
    // Keep the summary short: drop "[tag] ..." progress lines, cap at
    // 12 lines and 400 characters.
    const summary = listOut
      .split("\n")
      .filter((l) => l.trim() && !/^\[/.test(l))
      .slice(0, 12)
      .join(" | ")
      .slice(0, 400);
    throw new Error(
      `yt-dlp produced no .vtt subtitle file. ` +
        `yt-dlp --list-subs output: ${summary || "(empty)"}. ` +
        `Last stderr: ${lastStderr.slice(0, 200)}`
    );
  }

  // Preference order for picking among multiple .vtt files:
  //   1. English manual subs (lang code starts with "en", not auto)
  //   2. Any english (auto-generated, "en-orig", etc.)
  //   3. Any other language (translation is better than nothing for
  //      benchmarking; analyze just needs text)
  const isEnglish = (f) => /\.en[.-]/i.test(f);
  const isAuto = (f) => /auto|orig/i.test(f);
  vttFiles.sort((a, b) => {
    const aEn = isEnglish(a);
    const bEn = isEnglish(b);
    if (aEn !== bEn) return aEn ? -1 : 1;
    const aAuto = isAuto(a);
    const bAuto = isAuto(b);
    if (aAuto !== bAuto) return aAuto ? 1 : -1;
    return a.localeCompare(b);
  });

  const chosenName = vttFiles[0];
  const captionsSource = isAuto(chosenName) ? "auto" : "manual";

  const vtt = await fs.readFile(path.join(tmpDir, chosenName), "utf8");
  const text = vttToBracketedTranscript(vtt);
  return {
    text,
    segments: [],
    duration_seconds: durationSec,
    captions_source: captionsSource,
  };
}
|
|
|
|
/**
 * Invoke the `yt-dlp` binary with the supplied argument list.
 *
 * The call is bounded by CAPTIONS_TIMEOUT_MS and a 4 MiB output
 * buffer. The returned promise rejects when the process fails (for
 * example on a non-zero exit); stderr is handed back on success too so
 * callers can include it in diagnostic error messages.
 *
 * @param {string[]} args - command-line arguments for yt-dlp.
 * @returns {Promise<{ stdout: string, stderr: string }>} captured
 *   output, with missing streams normalised to "".
 */
async function runYtDlp(args) {
  const { stdout, stderr } = await execFileAsync("yt-dlp", args, {
    maxBuffer: 4 * 1024 * 1024,
    timeout: CAPTIONS_TIMEOUT_MS,
  });
  return { stdout: stdout || "", stderr: stderr || "" };
}
|
|
|
|
/**
 * Extract the duration (in seconds) that yt-dlp prints on stdout when
 * `--print duration` is set.
 *
 * Other output may interleave with the duration, so we scan the
 * whitespace-separated tokens from the end and take the last one that
 * is strictly numeric ("212", "212.5"). Tokens that merely start with
 * digits ("1080p") or are not numbers at all ("NA") are ignored rather
 * than leniently parsed.
 *
 * @param {string} stdout - raw stdout captured from yt-dlp.
 * @returns {number|null} the duration in seconds, or null when stdout
 *   is empty or contains no numeric token.
 */
function parseDuration(stdout) {
  if (!stdout) return null;
  const tokens = String(stdout).trim().split(/\s+/);
  for (let i = tokens.length - 1; i >= 0; i--) {
    if (/^\d+(?:\.\d+)?$/.test(tokens[i])) {
      return Number(tokens[i]);
    }
  }
  return null;
}
|
|
|
|
/**
 * Convert the contents of a WebVTT subtitle file into a
 * "[M:SS] line\n[M:SS] line" transcript ("[H:MM:SS]" once the start
 * time reaches one hour).
 *
 * VTT format:
 *   WEBVTT
 *
 *   00:00:01.000 --> 00:00:03.500
 *   <c>First caption line.</c>
 *
 *   00:00:03.500 --> 00:00:06.000
 *   Second caption line.
 *
 * For every cue we strip inline tags, join its payload lines with a
 * space, and prefix the text with the cue's start time. Consecutive
 * cues that repeat or progressively extend the previous text
 * (YouTube auto-captions emit each line more than once) are merged
 * into a single entry that keeps the EARLIEST start time, since that
 * is when the text actually began.
 *
 * @param {string} vtt - full text of a .vtt file; null/undefined is
 *   treated as empty.
 * @returns {string} newline-joined transcript; "" when no cue has text.
 */
export function vttToBracketedTranscript(vtt) {
  // WebVTT timestamps may omit the hours component ("MM:SS.mmm") and
  // hours may have more than two digits, so accept both forms.
  const TIMING_RE =
    /^((?:\d{2,}:)?\d{2}:\d{2}\.\d{3})\s+-->\s+((?:\d{2,}:)?\d{2}:\d{2}\.\d{3})/;
  const pad = (n) => n.toString().padStart(2, "0");

  // "HH:MM:SS.mmm" or "MM:SS.mmm" → "[M:SS]" / "[H:MM:SS]".
  const formatStamp = (ts) => {
    const parts = ts.split(":");
    const secs = Math.floor(parseFloat(parts.pop()));
    const mins = parseInt(parts.pop(), 10);
    const hours = parts.length > 0 ? parseInt(parts.pop(), 10) : 0;
    return hours > 0
      ? `[${hours}:${pad(mins)}:${pad(secs)}]`
      : `[${mins}:${pad(secs)}]`;
  };

  const lines = String(vtt ?? "").split(/\r?\n/);
  const entries = []; // { stamp, text } in emission order
  let i = 0;

  while (i < lines.length) {
    const m = TIMING_RE.exec(lines[i]);
    i++;
    if (!m) continue;

    // Collect payload lines until a truly empty line, the next timing
    // line, or end of file. A whitespace-only line does NOT end the
    // cue: YouTube auto-captions put " " lines inside cue payloads.
    const textLines = [];
    while (i < lines.length && lines[i] !== "" && !TIMING_RE.test(lines[i])) {
      // Strip inline tags <c.colorXXX>...</c>, <00:00:01.234>, etc.
      const cleaned = lines[i]
        .replace(/<\d{2}:\d{2}:\d{2}\.\d{3}>/g, "")
        .replace(/<\/?[^>]+>/g, "")
        .trim();
      if (cleaned) textLines.push(cleaned);
      i++;
    }

    const blockText = textLines.join(" ").trim();
    if (!blockText) continue;

    const prev = entries[entries.length - 1];
    if (prev) {
      if (blockText.startsWith(prev.text)) {
        // Same text again, or the previous text with an extension:
        // keep the more complete text under the original start time.
        prev.text = blockText;
        continue;
      }
      if (prev.text.startsWith(blockText)) {
        // Already emitted a more-complete version; skip this one.
        continue;
      }
    }

    entries.push({ stamp: formatStamp(m[1]), text: blockText });
  }

  return entries.map((e) => `${e.stamp} ${e.text}`).join("\n");
}
|