// Files
//
// 852 lines
// 36 KiB
// JavaScript

// Gemini backend forwarder. Receives a transcribe or analyze request
// from a route handler, calls the corresponding Gemini API, and
// returns a normalized result the route can wrap in the standard
// envelope.
//
// v0.1 implements:
// - transcribeAudio({ audio: Buffer, mimeType, title?, channel?,
// description?, chapters?, offsetSeconds? }) → { text, segments,
// duration_seconds }
// - analyzeText({ prompt }) → { text }
//
// Both go through @google/genai with similar prompts to Recap's
// gemini.js provider, so output shapes line up with what Recap's
// orchestration layer expects.
import { GoogleGenAI } from "@google/genai";
import fs from "fs/promises";
import os from "os";
import path from "path";
import { splitAudioFile, getAudioDurationSeconds } from "../audio-meta.js";
// Chunking knobs are passed into createGeminiBackend() from
// /data/config/relay-config.json (live-reloaded). The previous
// hardcoded constants (30 min / 6 concurrency) were removed in
// v0.2.32 so all chunking decisions flow from one canonical config
// source — edited via the dashboard's Settings tab.
// ffmpeg infers the output container/codec from the FILE EXTENSION,
// not the input format. Writing the master audio as "audio.bin" and
// asking ffmpeg to produce "chunk_0.bin" makes it choke with "Unable
// to find a suitable output format for ...bin". So we derive a sane
// extension from the mimeType the caller hands us; the audio bytes
// themselves are unchanged. mp3 is the safe default for unknown
// audio/* types — yt-dlp emits mp3 for YouTube and most podcast
// enclosures are mp3 too.
/**
 * Map a MIME type to the file extension used for on-disk audio files.
 *
 * Matching is case-insensitive substring matching, evaluated in the
 * order the rules are listed; the first rule with a hit wins. Anything
 * unmatched (including a missing/empty MIME type) yields "mp3".
 *
 * @param {string} [mimeType] - MIME type supplied by the caller.
 * @returns {string} Extension without a leading dot.
 */
function extForMime(mimeType) {
  const normalized = (mimeType || "").toLowerCase();
  // [substrings to look for, extension to return] — order matters.
  const rules = [
    [["mp4", "m4a"], "m4a"],
    [["ogg"], "ogg"],
    [["opus"], "opus"],
    [["wav"], "wav"],
    [["webm"], "webm"],
    [["flac"], "flac"],
    [["aac"], "aac"],
  ];
  for (const [needles, extension] of rules) {
    if (needles.some((needle) => normalized.includes(needle))) {
      return extension;
    }
  }
  return "mp3";
}
// Defaults used only when the caller doesn't supply explicit model
// names. Production callers should pass models pulled from
// relay_gemini_transcription_model / relay_gemini_analysis_model in
// the relay config so the operator can swap SKUs (e.g. flash for
// analysis) without rebuilding the relay.
// Default for createGeminiBackend()'s `transcriptionModel` option.
const DEFAULT_TRANSCRIPTION_MODEL = "gemini-3-flash-preview";
// Default for createGeminiBackend()'s `analysisModel` option.
const DEFAULT_ANALYSIS_MODEL = "gemini-3.1-pro-preview";
// Upper bound on generateContent calls made against ONE model while a
// transcription response keeps coming back with no text.
const EMPTY_RETRIES = 3;
// Per-pipeline fallback chains. All entries are verified valid as of
// 2026-05 against ai.google.dev/gemini-api/docs/models. Retired model
// IDs (gemini-3-pro-preview shut down 2026-03-09, gemini-2.0-flash
// deprecated, gemini-3.1-flash-* never existed) are NOT in the chain.
//
// Ordering rationale per pipeline:
// transcribe — Flash first (audio is Flash's natural fit), Pro
// only as last-resort because Pro on audio is wildly expensive
// analyze — Pro first (structured JSON benefits from reasoning),
// then Flash for the cheap+fast path
// Transcription cascade in preference order. Passed to fallbackChain()
// together with the configured transcription model to produce the
// per-backend attempt order.
const TRANSCRIPTION_FALLBACK_CHAIN = [
"gemini-3-flash-preview",
"gemini-2.5-flash",
"gemini-3.1-flash-lite",
"gemini-2.5-pro",
"gemini-3.1-pro-preview",
];
// Analysis cascade in preference order. Passed to fallbackChain()
// together with the configured analysis model.
const ANALYSIS_FALLBACK_CHAIN = [
"gemini-3.1-pro-preview",
"gemini-2.5-pro",
"gemini-3-flash-preview",
"gemini-2.5-flash",
"gemini-3.1-flash-lite",
];
// Build the fallback chain starting at the operator-selected primary.
// Walks the primary first, then everything below it (cheaper / older
// fallbacks — preserves the "operator picked their price point"
// intent), then everything above it as a last-resort layer. The
// last-resort layer matters when the primary sits at the BOTTOM of
// the chain (e.g. operator picked flash-lite for cost): if Google
// returns 503 capacity on flash-lite there's nothing strictly below
// to try, so the call would fail. Walking upward as a final tier
// lets the user's job complete on a pricier model rather than fail
// entirely — operators see the cost in the dashboard and can adjust
// their primary if the fallback-to-up is too costly in practice.
/**
 * Rotate a fallback chain so it starts at the selected primary model.
 *
 * The result is: the primary, then every entry listed after it, then
 * (as the last-resort tier) every entry listed before it. A primary
 * that is not a member of the chain yields a one-element list, i.e.
 * no fallback at all.
 *
 * @param {string[]} chain - Ordered list of model IDs.
 * @param {string} primary - Operator-selected model ID.
 * @returns {string[]} Attempt order, primary first.
 */
function fallbackChain(chain, primary) {
  const start = chain.indexOf(primary);
  if (start === -1) {
    return [primary];
  }
  // Walk the chain circularly beginning at the primary's position.
  return chain.map((_unused, step) => chain[(start + step) % chain.length]);
}
// Detect errors that warrant trying the next model in the chain.
// Capacity / rate-limit / network blips → yes. Auth failures / 400s
// → no, those would just keep failing with the same root cause.
/**
 * Decide whether a failed Gemini call should be retried on the next
 * model in the chain.
 *
 * @param {*} err - Thrown value; `status` / `httpStatusCode` and
 *   `message` are read when present, otherwise the value is stringified.
 * @returns {boolean} True for capacity / rate-limit / network failures,
 *   retired-model 404s and the "thinking level" 400; false otherwise.
 */
function isFallbackEligibleError(err) {
  const status = err?.status || err?.httpStatusCode || 0;
  const msg = err?.message || String(err);
  switch (status) {
    // Capacity and rate-limit statuses: always worth another model.
    case 503:
    case 429:
    case 529:
      return true;
    // 404 only counts when the body says the MODEL is missing or
    // unsupported — a plain 404 (e.g. a URL typo in config) must not be
    // masked by silently walking the chain.
    case 404:
      if (/models?\/[^ ]+ is not found|not supported for gen|not supported for this method/i.test(msg)) {
        return true;
      }
      break;
    // 400 only counts when the model rejected the thinking level; a
    // different model may accept the request.
    case 400:
      if (/thinking level is not supported/i.test(msg)) {
        return true;
      }
      break;
    default:
      break;
  }
  // No decisive status: fall back on message keywords for overload and
  // transport-level failures.
  return /overloaded|unavailable|capacity|high demand|rate limit|fetch failed|ECONNRESET|ETIMEDOUT|socket hang up|network/i.test(msg);
}
// Strip Google's URL-padded suffix and trim so cascade messages stay
// readable in the activity log. Keeps just enough signal to tell
// "high demand" from "rate limit" from "auth failure".
/**
 * Produce a compact, single-line error summary for cascade messages.
 *
 * Drops any "Please refer to <url>" suffix, collapses whitespace runs
 * to a single space, trims, and caps the result at 120 characters.
 *
 * @param {*} err - Thrown value; `message` is used when present,
 *   otherwise the value is stringified.
 * @returns {string} Shortened message.
 */
function shortenError(err) {
  const raw = err?.message || String(err);
  const withoutDocLink = raw.replace(/Please refer to https?:\/\/[^\s]+/g, "");
  const singleSpaced = withoutDocLink.replace(/\s+/g, " ").trim();
  return singleSpaced.slice(0, 120);
}
// Shift every [MM:SS] / [H:MM:SS] timestamp in a transcript by a
// number of seconds. Used by the chunked transcribe path so each
// chunk's local timestamps (model emits [0:00] at the start of its
// audio slice) translate to global timestamps relative to the start
// of the full audio. No-op when offsetSec is 0.
//
// Regex captures: hours? : minutes : seconds. Hours is optional;
// minutes and seconds are required. We rewrite the matched token
// with the offset applied, picking [H:MM:SS] format when the result
// crosses 60 minutes and [MM:SS] otherwise — same shape Gemini
// emits, so downstream parsers don't have to change.
/**
 * Shift every [MM:SS] / [H:MM:SS] token in a transcript by a number of
 * seconds.
 *
 * The offset is coerced to a number and rounded to a whole second
 * before use. Without that, a fractional offset (e.g. 1800.4) leaks
 * into the seconds field and produces tokens like "[30:5.4]" that no
 * longer match the two-digit timestamp parsers used elsewhere in this
 * file, and a numeric string would be concatenated instead of added.
 *
 * @param {string} text - Transcript text; returned untouched when empty.
 * @param {number} offsetSec - Seconds to add (may be negative).
 * @returns {string} Text with shifted timestamps. Tokens whose shifted
 *   value would be negative or non-finite are left as they were. The
 *   call is a no-op when the rounded offset is 0 or not finite.
 */
function shiftTimestamps(text, offsetSec) {
  const shift = Math.round(Number(offsetSec));
  if (!text || !Number.isFinite(shift) || shift === 0) return text;
  const pad = (n) => n.toString().padStart(2, "0");
  return text.replace(
    /\[(\d+):(\d{2})(?::(\d{2}))?\]/g,
    (token, first, second, third) => {
      // Three groups → H:MM:SS; two groups → MM:SS.
      const local =
        third !== undefined
          ? parseInt(first, 10) * 3600 + parseInt(second, 10) * 60 + parseInt(third, 10)
          : parseInt(first, 10) * 60 + parseInt(second, 10);
      const total = local + shift;
      if (!Number.isFinite(total) || total < 0) return token;
      const h = Math.floor(total / 3600);
      const m = Math.floor((total % 3600) / 60);
      const s = total % 60;
      // Emit [H:MM:SS] once the result reaches an hour, [M:SS] otherwise.
      return h > 0 ? `[${h}:${pad(m)}:${pad(s)}]` : `[${m}:${pad(s)}]`;
    }
  );
}
// Render the attempts array as a "primary (503: high demand) →
// fallback (503: high demand) → final ✓" string for activity-log
// surfacing. Caller-agnostic so transcribe + analyze share format.
/**
 * Render an attempts array as a single arrow-separated line, e.g.
 * "primary (503: high demand) → fallback".
 *
 * @param {Array<{model: string, status: (string|number), error?: string}>} attempts
 * @returns {string} One label per attempt joined by " → "; successful
 *   attempts show only the model name, failures add "(status: error)"
 *   with "?" standing in for a missing error text.
 */
function formatCascade(attempts) {
  const labels = [];
  for (const attempt of attempts) {
    if (attempt.status === "success") {
      labels.push(`${attempt.model}`);
      continue;
    }
    labels.push(`${attempt.model} (${attempt.status}: ${attempt.error || "?"})`);
  }
  return labels.join(" → ");
}
// Safety settings sent with every transcription generateContent call.
// Each listed harm category is given the "BLOCK_NONE" threshold.
// NOTE(review): presumably so a verbatim transcript is not refused
// because of what the speakers say — confirm against the Gemini
// safety-settings documentation.
const TRANSCRIPTION_SAFETY = [
{ category: "HARM_CATEGORY_HARASSMENT", threshold: "BLOCK_NONE" },
{ category: "HARM_CATEGORY_HATE_SPEECH", threshold: "BLOCK_NONE" },
{ category: "HARM_CATEGORY_SEXUALLY_EXPLICIT", threshold: "BLOCK_NONE" },
{ category: "HARM_CATEGORY_DANGEROUS_CONTENT", threshold: "BLOCK_NONE" },
];
/**
 * Create a Gemini backend bound to one API key and one set of operator
 * knobs.
 *
 * @param {object} [options]
 * @param {string} options.apiKey - Gemini API key (required).
 * @param {string} [options.transcriptionModel] - Primary transcription model.
 * @param {string} [options.analysisModel] - Primary analysis model.
 * @param {number} [options.timeoutMs] - HTTP timeout handed to the SDK.
 * @param {number} [options.txChunkSeconds] - Audio longer than this is
 *   split into chunks of at most this many seconds.
 * @param {number} [options.txConcurrency] - Maximum chunks transcribed in
 *   parallel; values below 1 (or non-numeric) are treated as 1.
 * @param {string} [options.transcribePromptOverride] - Operator prompt
 *   body; empty means "use the default body".
 * @param {number} [options.txMaxOutputTokens] - Output cap for transcription.
 * @param {number} [options.anMaxOutputTokens] - Output cap for analysis.
 * @returns {{ transcribeAudio: Function, analyzeText: Function }}
 * @throws {Error} When `apiKey` is missing.
 */
export function createGeminiBackend({
  apiKey,
  transcriptionModel = DEFAULT_TRANSCRIPTION_MODEL,
  analysisModel = DEFAULT_ANALYSIS_MODEL,
  timeoutMs = 900_000,
  // Chunking knobs — caller MUST source from relay-config.json. We
  // accept defaults here ONLY for unit-test ergonomics. Production
  // callers (admin-test-run.js, transcribe-url.js) always pass the
  // values explicitly so the operator's Settings-tab edits flow
  // through.
  txChunkSeconds = 30 * 60,
  txConcurrency = 12,
  // Operator-editable prompt override (Settings tab). Empty string
  // falls back to DEFAULT_TRANSCRIBE_PROMPT_BODY at request time.
  transcribePromptOverride = "",
  // Output-token caps. Defaults are the same values that used to be
  // hardcoded in this file (65536 for TX, 8192 for AN — was implicit
  // Google default for AN). Operator overrides via Settings tab so
  // they can trade robustness for cost. Test-run / unit-test callers
  // can keep the defaults.
  txMaxOutputTokens = 65536,
  anMaxOutputTokens = 8192,
} = {}) {
  if (!apiKey) {
    throw new Error("createGeminiBackend: apiKey is required");
  }
  const ai = new GoogleGenAI({
    apiKey,
    httpOptions: { timeout: timeoutMs, headersTimeout: timeoutMs },
  });

  // Build the per-call fallback chains. fallbackChain() puts the
  // operator-selected primary first, then the entries listed after it,
  // and finally — as a last-resort tier — the entries listed before it.
  // The loops below move to the next entry only when
  // isFallbackEligibleError() accepts the failure.
  const txChain = fallbackChain(TRANSCRIPTION_FALLBACK_CHAIN, transcriptionModel);
  const anChain = fallbackChain(ANALYSIS_FALLBACK_CHAIN, analysisModel);

  // Attach the attempt cascade to an error and rewrite its message so
  // the caller sees every model that was tried. A thrown primitive is
  // wrapped first: assigning properties to a primitive throws in strict
  // mode and would replace the real failure with a TypeError.
  function cascadeError(pipeline, err, attempts) {
    const e =
      err !== null && typeof err === "object" ? err : new Error(String(err));
    e.attempts = attempts;
    e.message = `${pipeline}: all attempts failed — ${formatCascade(attempts)}`;
    return e;
  }

  // Best-effort removal of an uploaded file; a cleanup failure must
  // never mask the transcription result or error.
  async function deleteUploadedFile(name) {
    try {
      await ai.files.delete({ name });
    } catch {
      // Deliberately ignored.
    }
  }

  // Single-file transcribe: upload one audio file to the Gemini File
  // API, poll until PROCESSING completes, run generateContent through
  // the fallback chain. Used both for the short-content single-call
  // path and as the per-chunk primitive for long content.
  // timestampOffsetSec is applied to all [MM:SS] / [H:MM:SS] tokens in
  // the result so chunks after the first stitch into a global timeline.
  // The uploaded file is deleted on EVERY exit path, including a
  // processing timeout or a FAILED state.
  async function transcribeFromFile({
    filePath,
    mimeType,
    prompt,
    timestampOffsetSec = 0,
    chunkLabel = "",
  }) {
    const tag = chunkLabel ? `[gemini ${chunkLabel}] ` : "[gemini] ";
    const uploaded = await ai.files.upload({
      file: filePath,
      config: { mimeType },
    });
    let f = uploaded;
    try {
      const pStart = Date.now();
      while (f.state === "PROCESSING") {
        if (Date.now() - pStart > 5 * 60 * 1000) {
          throw new Error(`${tag}Gemini file processing exceeded 5 min`);
        }
        await new Promise((r) => setTimeout(r, 3000));
        f = await ai.files.get({ name: f.name });
      }
      if (f.state === "FAILED") {
        throw new Error(`${tag}Gemini failed to process audio file`);
      }
      const attempts = [];
      let lastErr;
      for (let modelIdx = 0; modelIdx < txChain.length; modelIdx++) {
        const model = txChain[modelIdx];
        // Both Gemini 3.x AND 2.5 families support thinking, but with
        // DIFFERENT parameter shapes:
        // - Gemini 3.x → thinkingConfig: { thinkingLevel }
        // - Gemini 2.5 → thinkingConfig: { thinkingBudget }
        const is3x = /^gemini-3(\.\d+)?-(?:pro|flash)/i.test(model);
        const is25Flash = /^gemini-2\.5-flash/i.test(model);
        const is25Pro = /^gemini-2\.5-pro/i.test(model);
        const thinkingConfig = is3x
          ? { thinkingLevel: "minimal" }
          : is25Flash
            ? { thinkingBudget: 0 }
            : is25Pro
              ? { thinkingBudget: 128 }
              : null;
        try {
          let result;
          // Empty-response retries: the SDK occasionally returns 200
          // with no text for audio inputs, so the same model is asked
          // up to EMPTY_RETRIES times. If the response is still empty
          // afterwards it is returned as a successful empty transcript
          // — an empty response does NOT advance the fallback chain.
          for (let attempt = 0; attempt < EMPTY_RETRIES; attempt++) {
            result = await ai.models.generateContent({
              model,
              config: {
                ...(thinkingConfig ? { thinkingConfig } : {}),
                safetySettings: TRANSCRIPTION_SAFETY,
                // Audio transcripts are output-token-bound. Gemini's
                // default (~8192) silently truncates long-chunk
                // transcripts mid-stream — observed on a 45-min chunk
                // that returned 31:05 worth of speech and stopped. Set
                // high so the model can emit the full transcript.
                // Configurable via Settings → `relay_gemini_tx_max_output_tokens`.
                maxOutputTokens: txMaxOutputTokens,
              },
              contents: [
                {
                  role: "user",
                  parts: [
                    { fileData: { fileUri: f.uri, mimeType } },
                    { text: prompt },
                  ],
                },
              ],
            });
            if (safeText(result)) break;
          }
          const rawText = safeText(result) || "";
          const text = shiftTimestamps(rawText, timestampOffsetSec);
          attempts.push({ model, status: "success" });
          return {
            text,
            usage: result?.usageMetadata || null,
            model,
            attempts,
          };
        } catch (err) {
          lastErr = err;
          const status = err?.status || err?.httpStatusCode || 0;
          attempts.push({
            model,
            status: status || "error",
            error: shortenError(err),
          });
          const canFallback =
            isFallbackEligibleError(err) && modelIdx < txChain.length - 1;
          console.warn(
            `${tag}transcribe with ${model} failed (${status || "?"}): ${err?.message || err}${canFallback ? ` — falling back to ${txChain[modelIdx + 1]}` : ""}`
          );
          if (!canFallback) {
            throw cascadeError("transcribe", err, attempts);
          }
        }
      }
      // Defensive: only reachable if the chain were empty.
      throw cascadeError(
        "transcribe",
        lastErr || new Error("transcribe: all models in fallback chain failed"),
        attempts
      );
    } finally {
      await deleteUploadedFile(f.name);
    }
  }

  // Transcribe an in-memory audio buffer. Short audio goes through one
  // transcribeFromFile call; audio longer than txChunkSeconds is split,
  // transcribed in parallel, and stitched back together. The temp
  // directory is removed on every exit path.
  async function transcribeAudio({
    audio,
    mimeType,
    title = "",
    channel = "",
    description = "",
    chapters = [],
    offsetSeconds = 0,
  }) {
    const tmpDir = await fs.mkdtemp(path.join(os.tmpdir(), "relay-tx-"));
    try {
      // Written INSIDE the try so a failed write still cleans tmpDir.
      const masterPath = path.join(tmpDir, `audio.${extForMime(mimeType)}`);
      await fs.writeFile(masterPath, audio);
      const prompt = buildTranscriptionPrompt({
        title, channel, description, chapters,
        promptOverride: transcribePromptOverride,
      });
      const duration = await getAudioDurationSeconds(masterPath);
      // Short content: single-call path. Also taken when the duration
      // could not be determined (falsy).
      if (!duration || duration <= txChunkSeconds) {
        const singleShotStart = Date.now();
        const r = await transcribeFromFile({
          filePath: masterPath,
          mimeType,
          prompt,
          timestampOffsetSec: offsetSeconds,
        });
        return {
          // Sort even single-shot output: Gemini sometimes emits
          // entries within a single call in non-chronological order
          // (rare but observed). Also coalesces too-short adjacent
          // entries into more readable chunks.
          text: mergeShortEntries(sortAndDedupeTranscript(r.text)),
          segments: [],
          duration_seconds: duration || 0,
          usage: r.usage,
          model: r.model,
          attempts: r.attempts,
          chunk_count: 1,
          chunk_durations_ms: [Date.now() - singleShotStart],
        };
      }
      // Long content: split with ffmpeg, fire chunks in parallel,
      // stitch the transcripts with timestamp offsets applied per
      // chunk. txChunkSeconds and txConcurrency come from the
      // operator's relay-config.json (Settings tab).
      const chunks = await splitAudioFile({
        inputPath: masterPath,
        outputDir: tmpDir,
        chunkSeconds: txChunkSeconds,
      });
      // Always run at least one worker. A configured concurrency of 0,
      // a negative number, or NaN would otherwise start no workers and
      // fail the whole job with "First error: unknown".
      const requestedWorkers = Math.floor(Number(txConcurrency));
      const parallelism = Math.max(
        1,
        Math.min(
          chunks.length,
          Number.isFinite(requestedWorkers) ? requestedWorkers : 1
        )
      );
      console.log(
        `[gemini] chunked transcribe: ${Math.round(duration)}s audio → ${chunks.length} chunks of up to ${txChunkSeconds}s, ${parallelism} in parallel`
      );
      // ffmpeg preserves the input extension; use the input mimeType
      // for chunk uploads. Falls back to mp3 if mimeType is empty.
      const chunkMime = mimeType || "audio/mpeg";
      const results = new Array(chunks.length);
      // Per-chunk wall-time in ms, from the start of the chunk's API
      // work to its completion (success or failure).
      const chunkDurationsMs = new Array(chunks.length).fill(null);
      let nextIdx = 0;
      // Each worker pulls the next unclaimed chunk index until none
      // remain; a chunk failure is recorded, never thrown.
      const worker = async () => {
        while (true) {
          const i = nextIdx++;
          if (i >= chunks.length) return;
          const c = chunks[i];
          const chunkStart = Date.now();
          try {
            const r = await transcribeFromFile({
              filePath: c.filePath,
              mimeType: chunkMime,
              prompt,
              timestampOffsetSec: offsetSeconds + (c.startSeconds || 0),
              chunkLabel: `chunk ${i + 1}/${chunks.length}`,
            });
            chunkDurationsMs[i] = Date.now() - chunkStart;
            results[i] = { ok: true, ...r };
          } catch (err) {
            chunkDurationsMs[i] = Date.now() - chunkStart;
            console.warn(
              `[gemini] chunk ${i + 1}/${chunks.length} failed: ${err?.message || err}`
            );
            results[i] = { ok: false, error: err };
          }
        }
      };
      await Promise.all(Array.from({ length: parallelism }, () => worker()));
      const succeeded = results.filter((r) => r && r.ok);
      const failed = results.filter((r) => r && !r.ok);
      if (succeeded.length === 0) {
        const first = failed[0]?.error;
        const e = new Error(
          `transcribe: all ${results.length} chunks failed. First error: ${first?.message || "unknown"}`
        );
        e.status = first?.status || 502;
        throw e;
      }
      if (failed.length > 0) {
        console.warn(
          `[gemini] chunked transcribe: ${failed.length}/${results.length} chunks failed — proceeding with ${succeeded.length} successful chunks`
        );
      }
      // Stitch in chunk order. Each chunk's text already has its
      // timestamps shifted to the global timeline, but Gemini
      // (especially flash variants) sometimes emits entries within a
      // single chunk in non-chronological order, so the joined text is
      // re-sorted and de-duplicated before short entries are merged.
      const naiveStitched = succeeded.map((r) => r.text).join("\n");
      const stitchedText = mergeShortEntries(sortAndDedupeTranscript(naiveStitched));
      // Truncation detection: compare the FURTHEST timestamp emitted
      // in each chunk with that chunk's expected end. Some models still
      // hit output-token caps on long dense chunks and silently emit a
      // partial transcript that looks like a success.
      //
      // Two details matter here:
      // - The chunk text was shifted by offsetSeconds + startSeconds,
      //   so the same base must be subtracted; ignoring offsetSeconds
      //   would inflate coverage and hide truncation.
      // - The maximum timestamp is used rather than the textually last
      //   one, so out-of-order output or a trailing bracketed
      //   annotation (e.g. "[music]") cannot skew or disable the check.
      const truncatedChunks = [];
      const TS_RE = /\[(\d+):(\d{2})(?::(\d{2}))?\]/g;
      for (let i = 0; i < results.length; i++) {
        const r = results[i];
        if (!r || !r.ok || !r.text) continue;
        const chunk = chunks[i];
        const expectedSec = chunk.durationSec || chunk.durationSeconds || 0;
        if (expectedSec < 60) continue; // too short to meaningfully truncate
        let lastAbs = -Infinity;
        for (const m of r.text.matchAll(TS_RE)) {
          const sec =
            m[3] !== undefined
              ? parseInt(m[1], 10) * 3600 + parseInt(m[2], 10) * 60 + parseInt(m[3], 10)
              : parseInt(m[1], 10) * 60 + parseInt(m[2], 10);
          if (sec > lastAbs) lastAbs = sec;
        }
        if (lastAbs === -Infinity) continue; // no timestamps to judge by
        const chunkStartAbs = offsetSeconds + (chunk.startSeconds || 0);
        const chunkEndAbs = chunkStartAbs + expectedSec;
        const coveredSec = lastAbs - chunkStartAbs;
        const coverage = coveredSec / expectedSec;
        if (coverage < 0.8) {
          truncatedChunks.push({
            index: i,
            coverage: Math.round(coverage * 100),
            lastTs: lastAbs,
            expectedEnd: chunkEndAbs,
            missingSec: expectedSec - coveredSec,
          });
          console.warn(
            `[gemini] chunk ${i + 1} appears TRUNCATED — coverage ${Math.round(coverage * 100)}% (last ts ${lastAbs}s, expected ~${chunkEndAbs}s). Likely hit maxOutputTokens.`
          );
        }
      }
      // Aggregate metadata: pick the most-used successful model,
      // sum token usage across chunks, collect every attempt.
      const modelCounts = new Map();
      let totalIn = 0;
      let totalOut = 0;
      let totalThink = 0;
      let totalAll = 0;
      const allAttempts = [];
      for (const r of succeeded) {
        modelCounts.set(r.model, (modelCounts.get(r.model) || 0) + 1);
        const u = r.usage || {};
        totalIn += u.promptTokenCount || 0;
        totalOut += u.candidatesTokenCount || 0;
        totalThink += u.thoughtsTokenCount || 0;
        totalAll += u.totalTokenCount || 0;
        if (Array.isArray(r.attempts)) allAttempts.push(...r.attempts);
      }
      const dominantModel =
        [...modelCounts.entries()].sort((a, b) => b[1] - a[1])[0]?.[0] || null;
      return {
        text: stitchedText,
        segments: [],
        duration_seconds: duration,
        usage: {
          promptTokenCount: totalIn,
          candidatesTokenCount: totalOut,
          thoughtsTokenCount: totalThink,
          totalTokenCount:
            totalAll || totalIn + totalOut + totalThink,
        },
        model: dominantModel,
        attempts: allAttempts,
        chunk_count: chunks.length,
        // Per-chunk wall-time (ms); null for a chunk that never ran.
        chunk_durations_ms: chunkDurationsMs,
        // Empty when every judged chunk reached ≥80% of its expected
        // duration; otherwise one entry per suspect chunk.
        truncated_chunks: truncatedChunks,
      };
    } finally {
      // Best-effort temp cleanup; never mask the real outcome.
      try { await fs.rm(tmpDir, { recursive: true, force: true }); } catch {}
    }
  }

  // Run a text prompt through the analysis chain in JSON mode and
  // return the raw response text plus usage and the attempt cascade.
  async function analyzeText({ prompt }) {
    // Accumulate every attempt so the final error / success can
    // surface the cascade. Without this, the operator only sees the
    // last model's failure and doesn't know whether the system even
    // tried the other models in the chain.
    const attempts = [];
    let lastErr;
    for (let modelIdx = 0; modelIdx < anChain.length; modelIdx++) {
      const model = anChain[modelIdx];
      try {
        const result = await ai.models.generateContent({
          model,
          config: {
            // Explicit cap so an over-thinking model can't emit an
            // unbounded reasoning preamble that crowds out the JSON
            // sections array. Configurable via Settings →
            // `relay_gemini_an_max_output_tokens`.
            maxOutputTokens: anMaxOutputTokens,
            // JSON mode: the prompt already asks for JSON; this makes
            // it a server-side constraint so the model cannot wrap the
            // payload in prose or a markdown fence. responseSchema is
            // intentionally not used — the post-parse stitcher already
            // clamps and dedupes out-of-range indices, so a structural
            // deviation doesn't crash anything.
            responseMimeType: "application/json",
          },
          contents: [
            {
              role: "user",
              parts: [{ text: prompt }],
            },
          ],
        });
        attempts.push({ model, status: "success" });
        return {
          text: safeText(result) || "",
          usage: result?.usageMetadata || null,
          model,
          attempts,
        };
      } catch (err) {
        lastErr = err;
        const status = err?.status || err?.httpStatusCode || 0;
        attempts.push({
          model,
          status: status || "error",
          error: shortenError(err),
        });
        const canFallback =
          isFallbackEligibleError(err) && modelIdx < anChain.length - 1;
        console.warn(
          `[gemini] analyze with ${model} failed (${status || "?"}): ${err?.message || err}${canFallback ? ` — falling back to ${anChain[modelIdx + 1]}` : ""}`
        );
        if (!canFallback) {
          throw cascadeError("analyze", err, attempts);
        }
      }
    }
    // Defensive: only reachable if the chain were empty.
    throw cascadeError(
      "analyze",
      lastErr || new Error("analyze: all models in fallback chain failed"),
      attempts
    );
  }

  return { transcribeAudio, analyzeText };
}
/**
 * Pull the text out of a generateContent response without ever
 * throwing.
 *
 * Tries `r.text` first, then the concatenated `text` of the first
 * candidate's parts. Any exception raised while reading either source
 * is swallowed and the next source is tried.
 *
 * @param {*} r - Response object (may be null/undefined).
 * @returns {string} The extracted text, or "" when none is available.
 */
function safeText(r) {
  const readers = [
    () => (r?.text ? r.text : ""),
    () => {
      const parts = r?.candidates?.[0]?.content?.parts;
      return parts ? parts.map((part) => part.text || "").join("") : "";
    },
  ];
  for (const read of readers) {
    try {
      const value = read();
      if (value) return value;
    } catch {
      // Unreadable source — fall through to the next one.
    }
  }
  return "";
}
// Default transcribe-prompt INSTRUCTION body. Exported so the
// dashboard's Settings tab can show it as the "current default"
// alongside any operator override. The auto-generated metadata
// block (title / channel / description / chapters) is prepended
// at request time and is NOT part of the editable prompt — the
// operator only edits the instruction portion below.
// Used by buildTranscriptionPrompt() whenever the operator override is
// missing or blank. The text is sent to the model verbatim, so any
// edit here (including whitespace) changes runtime behavior.
export const DEFAULT_TRANSCRIBE_PROMPT_BODY = `Transcribe this audio completely and verbatim. Group the transcript into substantive thoughts, each spanning roughly 30-60 seconds (or a complete idea, whichever is natural).
Format each line as:
[MM:SS] The spoken text here, spanning a complete thought or several connected sentences...
Rules:
- Transcribe EVERY word spoken, do not skip or summarize anything.
- Use [MM:SS] or [H:MM:SS] timestamp format at the start of each line.
- AIM FOR ~30-60 SECONDS between timestamps. Do NOT emit one entry per breath or one entry per sentence — that makes the transcript unreadable. Each entry should contain a complete thought, typically 2-5 sentences, ~50-200 words. Short interjections ("Yeah.", "Right.") only get their own line when they're a meaningful exchange between speakers.
- TIMESTAMPS MUST INCREASE MONOTONICALLY across the entire output. Never reset to a smaller time and never go backwards. If the audio length is over 60 minutes, use [H:MM:SS] format consistently.
- Include filler words (um, uh, you know) for accuracy.
- Speaker identification: FIRST consult the metadata above — descriptions and chapter titles usually name the host(s) and guest(s) explicitly, and the channel name is often the host's name. Match those names to the voices in the audio (introductions, "I'm Dax", "this is Will", first-person references) and use them as speaker labels. Format as: [MM:SS] Name: text. Only fall back to "Host"/"Guest" if no names appear in the metadata AND nobody is introduced by name in the audio.
Return ONLY the timestamped transcript, nothing else.`;
// Builds the auto-prepended metadata context (title / channel /
// description / chapters). Returns "" when nothing is present —
// avoids leading whitespace in that case.
/**
 * Build the metadata preamble that precedes the transcription
 * instructions.
 *
 * @param {object} [meta]
 * @param {string} [meta.title] - Emitted as `Video title: "…"`.
 * @param {string} [meta.channel] - Emitted as `Channel: …`.
 * @param {string} [meta.description] - Clipped to 1500 characters plus
 *   an ellipsis when longer.
 * @param {Array<{start_time?: number, title?: string}>} [meta.chapters] -
 *   Only the first 30 are used; a non-numeric start_time counts as 0.
 * @returns {string} The preamble followed by one blank line, or "" when
 *   no field produced any output.
 */
function buildTranscriptionContextBlock({ title, channel, description, chapters } = {}) {
  const pieces = [];
  if (title) {
    pieces.push(`Video title: "${title}"\n`);
  }
  if (channel) {
    pieces.push(`Channel: ${channel}\n`);
  }
  if (description) {
    const clipped =
      description.length > 1500 ? `${description.slice(0, 1500)}…` : description;
    pieces.push(`Video description (use to identify speakers by name):\n${clipped}\n`);
  }
  if (Array.isArray(chapters) && chapters.length > 0) {
    const rows = [];
    for (const chapter of chapters.slice(0, 30)) {
      const startSec = typeof chapter.start_time === "number" ? chapter.start_time : 0;
      const minutes = Math.floor(startSec / 60);
      const seconds = String(Math.floor(startSec % 60)).padStart(2, "0");
      rows.push(` [${minutes}:${seconds}] ${chapter.title || ""}`);
    }
    pieces.push(`Chapter markers:\n${rows.join("\n")}\n`);
  }
  // A trailing blank line separates the preamble from the prompt body;
  // with nothing to say, return an empty string instead.
  return pieces.length > 0 ? `${pieces.join("")}\n` : "";
}
// Resolve the transcribe prompt: operator override (from config)
// when present + non-empty, else the hardcoded default body.
// Always prepends the auto-generated context block.
/**
 * Assemble the full transcription prompt: metadata preamble followed
 * by the instruction body.
 *
 * @param {object} [args]
 * @param {string} [args.title]
 * @param {string} [args.channel]
 * @param {string} [args.description]
 * @param {Array} [args.chapters]
 * @param {string} [args.promptOverride] - Used verbatim (untrimmed) when
 *   it is a string with non-whitespace content; otherwise the default
 *   body is used.
 * @returns {string} Preamble + instruction body.
 */
function buildTranscriptionPrompt({ title, channel, description, chapters, promptOverride } = {}) {
  const hasOverride =
    typeof promptOverride === "string" && promptOverride.trim() !== "";
  const instructions = hasOverride ? promptOverride : DEFAULT_TRANSCRIBE_PROMPT_BODY;
  const preamble = buildTranscriptionContextBlock({ title, channel, description, chapters });
  return preamble + instructions;
}
// Merge adjacent entries that are too short (just a few words) into
// a single readable entry. Models occasionally emit one entry per
// breath/word ("► 4:05 um,", "► 4:07 that is", "► 4:09 usually based")
// which is unreadable. We coalesce entries that are <60 chars AND
// within ~10s of the previous entry's timestamp, until each entry is
// either ≥60 chars or sits at a real speech-pause boundary. The
// merge preserves the FIRST entry's timestamp (the start of the
// merged thought).
//
// Conservative thresholds: 60-char floor keeps very short utterances
// (like "Yeah." or "Right.") from being aggressively swept up, but
// merges fragmentary breath-by-breath entries the model sometimes
// produces. 10s gap ceiling avoids merging across an actual silence.
/**
 * Coalesce fragmentary transcript entries into readable ones.
 *
 * An entry is appended to the entry before it when that earlier entry
 * is still shorter than 60 characters and the new entry starts no more
 * than 10 seconds after the earlier entry's (first) timestamp. The
 * merged entry keeps the earlier timestamp. Lines without a leading
 * timestamp are treated as continuations of the preceding entry.
 *
 * @param {string} text - Transcript with one "[MM:SS] …" or
 *   "[H:MM:SS] …" entry per line.
 * @returns {string} Re-emitted transcript with canonical timestamps.
 *   The input is returned untouched when it is empty, not a string, or
 *   contains fewer than two timestamped entries.
 */
export function mergeShortEntries(text) {
  if (typeof text !== "string" || text === "") return text;

  const STAMPED_LINE = /^\[(\d{1,2}):(\d{2})(?::(\d{2}))?\]\s*(.*)$/;
  const MIN_CHARS = 60;
  const MAX_GAP_SEC = 10;
  const twoDigits = (n) => n.toString().padStart(2, "0");

  // Pass 1: parse lines into { offset, text } records.
  const parsed = [];
  text.split(/\r?\n/).forEach((rawLine) => {
    const hit = rawLine.match(STAMPED_LINE);
    if (hit) {
      const [, a, b, c, body] = hit;
      const offset =
        c === undefined
          ? parseInt(a, 10) * 60 + parseInt(b, 10)
          : parseInt(a, 10) * 3600 + parseInt(b, 10) * 60 + parseInt(c, 10);
      parsed.push({ offset, text: (body || "").trim() });
      return;
    }
    const extra = rawLine.trim();
    if (parsed.length > 0 && extra) {
      // Continuation line: glue onto the entry above.
      parsed[parsed.length - 1].text += " " + extra;
    }
  });
  if (parsed.length < 2) return text;

  // Pass 2: fold short entries into their predecessor.
  const combined = parsed.reduce((acc, entry) => {
    const tail = acc[acc.length - 1];
    const mergeable =
      tail !== undefined &&
      tail.text.length < MIN_CHARS &&
      entry.offset - tail.offset <= MAX_GAP_SEC;
    if (mergeable) {
      tail.text = (tail.text + " " + entry.text).trim();
    } else {
      acc.push({ offset: entry.offset, text: entry.text });
    }
    return acc;
  }, []);

  // Pass 3: re-stamp in canonical [H:MM:SS] / [M:SS] form.
  const rendered = [];
  for (const { offset, text: body } of combined) {
    const hours = Math.floor(offset / 3600);
    const minutes = Math.floor((offset % 3600) / 60);
    const seconds = offset % 60;
    const stamp =
      hours > 0
        ? `${hours}:${twoDigits(minutes)}:${twoDigits(seconds)}`
        : `${minutes}:${twoDigits(seconds)}`;
    rendered.push(`[${stamp}] ${body}`);
  }
  return rendered.join("\n");
}
// Parse a bracketed-timestamp transcript, sort entries by absolute
// offset, drop near-duplicates, and re-emit. Used as a defensive
// post-stitch step in the chunked transcribe path — Gemini flash
// variants occasionally emit entries within a chunk in non-
// chronological order. Without this sort the downstream analyzer
// sees out-of-order entries and the user-facing transcript renders
// with timestamps jumping backward mid-section.
//
// Dedup rule: entries with offsets within 1 second AND identical
// leading 40 chars are treated as duplicates (keeps the first).
// Conservative — won't merge entries that share an offset but
// have different content (e.g., two speakers at the same moment).
/**
 * Sort a bracketed-timestamp transcript by absolute offset and drop
 * near-duplicate entries.
 *
 * Dedup rule: an entry is dropped when its offset is within 1 second of
 * the previously KEPT entry and both share the same leading 40
 * characters. The comparison is made against the kept entry record;
 * comparing against the already-formatted output string (which has no
 * `offset` / `text` fields) makes the check always false and silently
 * disables deduplication.
 *
 * @param {string} text - Transcript with one "[MM:SS] …" or
 *   "[H:MM:SS] …" entry per line; lines without a leading timestamp
 *   are folded into the entry above them.
 * @returns {string} The input untouched when it is empty, not a string,
 *   has fewer than two entries, or is already in non-decreasing order;
 *   otherwise the sorted, de-duplicated transcript re-emitted with
 *   canonical timestamps.
 */
export function sortAndDedupeTranscript(text) {
  if (!text || typeof text !== "string") return text;
  const tsRe = /^\[(\d{1,2}):(\d{2})(?::(\d{2}))?\]\s*(.*)$/;
  const entries = [];
  for (const line of text.split(/\r?\n/)) {
    const m = line.match(tsRe);
    if (m) {
      const offset =
        m[3] !== undefined
          ? parseInt(m[1], 10) * 3600 + parseInt(m[2], 10) * 60 + parseInt(m[3], 10)
          : parseInt(m[1], 10) * 60 + parseInt(m[2], 10);
      entries.push({ offset, text: (m[4] || "").trim() });
    } else if (entries.length > 0 && line.trim()) {
      // Continuation line — append to the previous entry's text so
      // multi-line entries don't get lost in the sort.
      entries[entries.length - 1].text += " " + line.trim();
    }
  }
  if (entries.length < 2) return text;
  // Most input is already ordered; skip the rebuild when it is.
  const isSorted = entries.every(
    (entry, i) => i === 0 || entry.offset >= entries[i - 1].offset
  );
  if (isSorted) return text;
  console.warn(
    `[gemini] post-stitch sort: ${entries.length} entries were out of order — re-sorting by absolute offset`
  );
  // Array.prototype.sort is stable, so entries sharing an offset keep
  // their original relative order.
  entries.sort((a, b) => a.offset - b.offset);
  const kept = [];
  for (const entry of entries) {
    const prev = kept[kept.length - 1];
    const isDuplicate =
      prev !== undefined &&
      Math.abs(entry.offset - prev.offset) <= 1 &&
      entry.text.slice(0, 40) === prev.text.slice(0, 40);
    if (!isDuplicate) kept.push(entry);
  }
  // Re-emit in canonical [H:MM:SS] / [M:SS] form.
  const pad = (n) => n.toString().padStart(2, "0");
  return kept
    .map((entry) => {
      const h = Math.floor(entry.offset / 3600);
      const mins = Math.floor((entry.offset % 3600) / 60);
      const s = entry.offset % 60;
      const stamp = h > 0 ? `${h}:${pad(mins)}:${pad(s)}` : `${mins}:${pad(s)}`;
      return `[${stamp}] ${entry.text}`;
    })
    .join("\n");
}