initial relay scaffold
This commit is contained in:
@@ -0,0 +1,176 @@
|
||||
// Gemini backend forwarder. Receives a transcribe or analyze request
// from a route handler, calls the corresponding Gemini API, and
// returns a normalized result the route can wrap in the standard
// envelope.
//
// v0.1 implements:
//   - transcribeAudio({ audio: Buffer, mimeType, title?, channel?,
//     description?, chapters?, offsetSeconds? }) → { text, segments,
//     duration_seconds }
//   - analyzeText({ prompt }) → { text }
//
// Both go through @google/genai with similar prompts to Recap's
// gemini.js provider, so output shapes line up with what Recap's
// orchestration layer expects.
|
||||
|
||||
import { GoogleGenAI } from "@google/genai";
|
||||
import fs from "fs/promises";
|
||||
import os from "os";
|
||||
import path from "path";
|
||||
|
||||
// Model used for audio transcription requests.
const TRANSCRIPTION_MODEL = "gemini-3-flash-preview";

// Model used for text analysis requests.
const ANALYSIS_MODEL = "gemini-3.1-pro-preview";

// Maximum number of generateContent attempts when the model returns no text.
const EMPTY_RETRIES = 3;

// Safety settings sent with every transcription request: each listed harm
// category is set to BLOCK_NONE.
// NOTE(review): presumably so verbatim transcripts are not filtered — confirm.
const TRANSCRIPTION_SAFETY = [
  { category: "HARM_CATEGORY_HARASSMENT", threshold: "BLOCK_NONE" },
  { category: "HARM_CATEGORY_HATE_SPEECH", threshold: "BLOCK_NONE" },
  { category: "HARM_CATEGORY_SEXUALLY_EXPLICIT", threshold: "BLOCK_NONE" },
  { category: "HARM_CATEGORY_DANGEROUS_CONTENT", threshold: "BLOCK_NONE" },
];
|
||||
|
||||
/**
 * Creates a Gemini backend bound to a single API key.
 *
 * @param {object} [options]
 * @param {string} options.apiKey - Gemini API key (required).
 * @param {number} [options.timeoutMs=900000] - Passed to the SDK client as
 *   both `timeout` and `headersTimeout` in `httpOptions`.
 * @returns {{ transcribeAudio: Function, analyzeText: Function }}
 * @throws {Error} If `apiKey` is missing.
 */
export function createGeminiBackend({ apiKey, timeoutMs = 900_000 } = {}) {
  if (!apiKey) {
    throw new Error("createGeminiBackend: apiKey is required");
  }
  const ai = new GoogleGenAI({
    apiKey,
    httpOptions: { timeout: timeoutMs, headersTimeout: timeoutMs },
  });

  // Upper bound on how long an uploaded file may stay in PROCESSING,
  // and the delay between state polls.
  const PROCESSING_LIMIT_MS = 5 * 60 * 1000;
  const POLL_INTERVAL_MS = 3000;

  /**
   * Uploads an audio buffer to the Files API, waits for it to finish
   * processing, and asks the transcription model for a timestamped
   * transcript.
   *
   * @param {object} params
   * @param {Buffer} params.audio - Raw audio bytes.
   * @param {string} params.mimeType - MIME type sent with the upload and the request.
   * @param {string} [params.title] - Optional video title for prompt context.
   * @param {string} [params.channel] - Optional channel name for prompt context.
   * @param {string} [params.description] - Optional description for prompt context.
   * @param {Array} [params.chapters] - Optional chapter markers for prompt context.
   * @param {number} [params.offsetSeconds] - Accepted for interface
   *   compatibility; not used by this implementation.
   * @returns {Promise<{ text: string, segments: Array, duration_seconds: number }>}
   *   `segments` is always empty and `duration_seconds` always 0; the
   *   caller is expected to parse the timestamped `text`.
   * @throws {Error} If file processing exceeds five minutes or ends in FAILED,
   *   or if any SDK call rejects.
   */
  async function transcribeAudio({
    audio,
    mimeType,
    title = "",
    channel = "",
    description = "",
    chapters = [],
    offsetSeconds = 0,
  }) {
    // The Files API requires a path on disk; write to a temp file.
    const tmpDir = await fs.mkdtemp(path.join(os.tmpdir(), "relay-tx-"));
    // Name of the remote file once uploaded, so `finally` can delete it
    // on every exit path (success, timeout, FAILED state, or SDK error).
    let uploadedName = null;
    try {
      const tmpPath = path.join(tmpDir, "audio.bin");
      await fs.writeFile(tmpPath, audio);

      const uploaded = await ai.files.upload({
        file: tmpPath,
        config: { mimeType },
      });
      uploadedName = uploaded.name;

      // Poll until the file leaves the PROCESSING state.
      let f = uploaded;
      const pStart = Date.now();
      while (f.state === "PROCESSING") {
        if (Date.now() - pStart > PROCESSING_LIMIT_MS) {
          throw new Error("Gemini file processing exceeded 5 min");
        }
        await new Promise((r) => setTimeout(r, POLL_INTERVAL_MS));
        f = await ai.files.get({ name: f.name });
      }
      if (f.state === "FAILED") {
        throw new Error("Gemini failed to process audio file");
      }

      const prompt = buildTranscriptionPrompt({ title, channel, description, chapters });

      // Retry only when the model answers with no text; a rejected
      // request propagates immediately.
      let result;
      for (let attempt = 0; attempt < EMPTY_RETRIES; attempt++) {
        result = await ai.models.generateContent({
          model: TRANSCRIPTION_MODEL,
          config: {
            thinkingConfig: { thinkingLevel: "minimal" },
            safetySettings: TRANSCRIPTION_SAFETY,
          },
          contents: [
            {
              role: "user",
              parts: [
                { fileData: { fileUri: f.uri, mimeType } },
                { text: prompt },
              ],
            },
          ],
        });
        if (safeText(result)) break;
      }

      const text = safeText(result) || "";
      return {
        text,
        // Gemini returns a single timestamped blob — segments are
        // parsed client-side by the orchestration layer. We could
        // pre-parse here but Recap already has parseTimestampedTranscript
        // that handles this exact shape.
        segments: [],
        duration_seconds: 0,
      };
    } finally {
      // Best-effort cleanup of the uploaded File API artifact. Failures
      // are deliberately ignored so they never mask the real outcome.
      if (uploadedName) {
        try {
          await ai.files.delete({ name: uploadedName });
        } catch {
          // ignore: cleanup only
        }
      }
      // Best-effort cleanup of the local temp directory.
      try {
        await fs.rm(tmpDir, { recursive: true, force: true });
      } catch {
        // ignore: cleanup only
      }
    }
  }

  /**
   * Sends a single text prompt to the analysis model.
   *
   * @param {object} params
   * @param {string} params.prompt - Prompt text sent as the only user part.
   * @returns {Promise<{ text: string }>} Extracted response text, or "" if none.
   */
  async function analyzeText({ prompt }) {
    const result = await ai.models.generateContent({
      model: ANALYSIS_MODEL,
      contents: [
        {
          role: "user",
          parts: [{ text: prompt }],
        },
      ],
    });
    return {
      text: safeText(result) || "",
    };
  }

  return { transcribeAudio, analyzeText };
}
|
||||
|
||||
/**
 * Extracts response text from a generateContent result without throwing.
 *
 * Tries the result's `text` property first; if that is empty or reading
 * it throws, concatenates the `text` of every part in the first
 * candidate. Returns "" when neither source yields text.
 *
 * @param {object|null|undefined} r - A generateContent result (or anything).
 * @returns {string} The extracted text, or "" if none is available.
 */
function safeText(r) {
  try {
    // Read the accessor once: it may be a getter with side effects.
    const direct = r?.text;
    if (direct) return direct;
  } catch {
    // Deliberate: fall through to the candidates-based fallback.
  }
  try {
    const parts = r?.candidates?.[0]?.content?.parts;
    if (Array.isArray(parts)) {
      // Tolerate null/odd entries so one bad part doesn't discard the rest.
      return parts.map((part) => part?.text || "").join("");
    }
  } catch {
    // Deliberate: any unexpected shape is treated as "no text".
  }
  return "";
}
|
||||
|
||||
/**
 * Builds the transcription prompt, optionally prefixed with video
 * metadata (title, channel, description, chapter markers).
 *
 * @param {object} [meta]
 * @param {string} [meta.title] - Video title; omitted from the prompt when falsy.
 * @param {string} [meta.channel] - Channel name; omitted when falsy.
 * @param {string} [meta.description] - Description; truncated to 1500
 *   characters followed by "…" when longer.
 * @param {Array<{ start_time?: number, title?: string }>} [meta.chapters] -
 *   Chapter markers; only the first 30 object entries are used. A
 *   `start_time` that is not a finite, non-negative number is rendered as 0:00.
 * @returns {string} The full prompt text.
 */
function buildTranscriptionPrompt({ title, channel, description, chapters } = {}) {
  let ctx = "";
  if (title) ctx += `Video title: "${title}"\n`;
  if (channel) ctx += `Channel: ${channel}\n`;
  if (description) {
    const d = description.length > 1500 ? description.slice(0, 1500) + "…" : description;
    ctx += `Video description (use to identify speakers by name):\n${d}\n`;
  }
  if (Array.isArray(chapters)) {
    // Skip null / non-object entries instead of throwing on them.
    const usable = chapters
      .filter((c) => c !== null && typeof c === "object")
      .slice(0, 30);
    if (usable.length > 0) {
      const lines = usable
        .map((c) => {
          // typeof NaN === "number", so check finiteness explicitly and
          // clamp negatives to keep the marker a valid M:SS value.
          const start = Number.isFinite(c.start_time) && c.start_time > 0 ? c.start_time : 0;
          const mm = Math.floor(start / 60);
          const ss = Math.floor(start % 60).toString().padStart(2, "0");
          return ` [${mm}:${ss}] ${c.title || ""}`;
        })
        .join("\n");
      ctx += `Chapter markers:\n${lines}\n`;
    }
  }
  // Blank line separating the metadata block from the instructions.
  if (ctx) ctx += "\n";

  return `${ctx}Transcribe this audio completely and verbatim. Include timestamps at regular intervals (every 15-30 seconds or at natural pauses).

Format each line as:
[MM:SS] The spoken text here...

Rules:
- Transcribe EVERY word spoken, do not skip or summarize anything.
- Use [MM:SS] or [H:MM:SS] timestamp format at the start of each line.
- Start a new timestamped line every 15-30 seconds or at natural speech pauses.
- Include filler words (um, uh, you know) for accuracy.
- Speaker identification: FIRST consult the metadata above — descriptions and chapter titles usually name the host(s) and guest(s) explicitly. Format as: [MM:SS] Name: text. Only fall back to "Host"/"Guest" if no names appear.

Return ONLY the timestamped transcript, nothing else.`;
}
|
||||
Reference in New Issue
Block a user