v0.2.7 configurable Gemini models + per-pipeline backend preference
This commit is contained in:
@@ -18,8 +18,13 @@ import fs from "fs/promises";
|
||||
import os from "os";
|
||||
import path from "path";
|
||||
|
||||
const TRANSCRIPTION_MODEL = "gemini-3-flash-preview";
|
||||
const ANALYSIS_MODEL = "gemini-3.1-pro-preview";
|
||||
// Defaults used only when the caller doesn't supply explicit model
|
||||
// names. Production callers should pass models pulled from
|
||||
// relay_gemini_transcription_model / relay_gemini_analysis_model in
|
||||
// the relay config so the operator can swap SKUs (e.g. flash for
|
||||
// analysis) without rebuilding the relay.
|
||||
const DEFAULT_TRANSCRIPTION_MODEL = "gemini-3-flash-preview";
|
||||
const DEFAULT_ANALYSIS_MODEL = "gemini-3.1-pro-preview";
|
||||
const EMPTY_RETRIES = 3;
|
||||
|
||||
const TRANSCRIPTION_SAFETY = [
|
||||
@@ -29,7 +34,12 @@ const TRANSCRIPTION_SAFETY = [
|
||||
{ category: "HARM_CATEGORY_DANGEROUS_CONTENT", threshold: "BLOCK_NONE" },
|
||||
];
|
||||
|
||||
export function createGeminiBackend({ apiKey, timeoutMs = 900_000 } = {}) {
|
||||
export function createGeminiBackend({
|
||||
apiKey,
|
||||
transcriptionModel = DEFAULT_TRANSCRIPTION_MODEL,
|
||||
analysisModel = DEFAULT_ANALYSIS_MODEL,
|
||||
timeoutMs = 900_000,
|
||||
} = {}) {
|
||||
if (!apiKey) {
|
||||
throw new Error("createGeminiBackend: apiKey is required");
|
||||
}
|
||||
@@ -37,6 +47,10 @@ export function createGeminiBackend({ apiKey, timeoutMs = 900_000 } = {}) {
|
||||
apiKey,
|
||||
httpOptions: { timeout: timeoutMs, headersTimeout: timeoutMs },
|
||||
});
|
||||
// Flash models accept `thinkingLevel: "minimal"`; Pro models reject
|
||||
// it. Detect from the model id so the operator can flip flash <-> pro
|
||||
// via the StartOS action without breaking the request.
|
||||
const txIsFlash = /flash/i.test(transcriptionModel);
|
||||
|
||||
async function transcribeAudio({
|
||||
audio,
|
||||
@@ -73,9 +87,12 @@ export function createGeminiBackend({ apiKey, timeoutMs = 900_000 } = {}) {
|
||||
let result;
|
||||
for (let attempt = 0; attempt < EMPTY_RETRIES; attempt++) {
|
||||
result = await ai.models.generateContent({
|
||||
model: TRANSCRIPTION_MODEL,
|
||||
model: transcriptionModel,
|
||||
config: {
|
||||
thinkingConfig: { thinkingLevel: "minimal" },
|
||||
// thinkingLevel: "minimal" is only valid for Flash. Pro
|
||||
// models reject it. Skip when the operator picks a Pro
|
||||
// model for transcription (slower but valid).
|
||||
...(txIsFlash ? { thinkingConfig: { thinkingLevel: "minimal" } } : {}),
|
||||
safetySettings: TRANSCRIPTION_SAFETY,
|
||||
},
|
||||
contents: [
|
||||
@@ -111,7 +128,7 @@ export function createGeminiBackend({ apiKey, timeoutMs = 900_000 } = {}) {
|
||||
|
||||
async function analyzeText({ prompt }) {
|
||||
const result = await ai.models.generateContent({
|
||||
model: ANALYSIS_MODEL,
|
||||
model: analysisModel,
|
||||
contents: [
|
||||
{
|
||||
role: "user",
|
||||
|
||||
@@ -18,6 +18,10 @@ function defaultConfig() {
|
||||
relay_gemma_base_url: "",
|
||||
relay_parakeet_model: "parakeet-tdt-0.6b-v3",
|
||||
relay_gemma_model: "gemma3:27b",
|
||||
relay_gemini_transcription_model: "gemini-3-flash-preview",
|
||||
relay_gemini_analysis_model: "gemini-3.1-pro-preview",
|
||||
relay_transcribe_backend_preference: "gemini_first",
|
||||
relay_analyze_backend_preference: "gemini_first",
|
||||
relay_keysat_base_url: "https://keysat.xyz",
|
||||
relay_admin_username: "",
|
||||
relay_admin_password_hash: "",
|
||||
|
||||
+68
-9
@@ -250,7 +250,24 @@ export function computeRemaining(row, quota) {
|
||||
// served at all. Returns { allowed, backend: "gemini"|"hardware"|null,
|
||||
// reason }. Does NOT debit — that's a separate commit step after the
|
||||
// backend call succeeds.
|
||||
export function planBackend(row, quota, { hasHardware }) {
|
||||
//
|
||||
// `preference` is the operator-configured routing strategy for the
|
||||
// current pipeline step (transcribe or analyze), one of:
|
||||
// - "gemini_first" try Gemini until cap is exceeded, then hardware
|
||||
// (default — best quality routing on operator's
|
||||
// Gemini budget, hardware as overflow)
|
||||
// - "hardware_first" try hardware first, fall back to Gemini when
|
||||
// hardware isn't configured (lets the operator
|
||||
// conserve Gemini budget for premium use cases)
|
||||
// - "gemini_only" Gemini only, fail when cap exceeded (caps the
|
||||
// operator's spend at the per-tier limit)
|
||||
// - "hardware_only" Hardware only, fail when not configured (good
|
||||
// for fully local / offline deployments)
|
||||
//
|
||||
// The Gemini cap (geminiCapMonthly / geminiCapLifetime on the tier
|
||||
// quota) still gates every Gemini selection regardless of preference —
|
||||
// preference only controls which backends are eligible and in what order.
|
||||
export function planBackend(row, quota, { hasHardware, preference = "gemini_first" }) {
|
||||
const balance = computeRemaining(row, quota);
|
||||
|
||||
// Out of credits entirely?
|
||||
@@ -258,15 +275,57 @@ export function planBackend(row, quota, { hasHardware }) {
|
||||
return { allowed: false, backend: null, reason: "out_of_credits" };
|
||||
}
|
||||
|
||||
// Pick backend: Gemini if there's room under the Gemini cap; else
|
||||
// fall back to hardware if configured; else 402.
|
||||
if (balance.gemini_remaining === null || balance.gemini_remaining > 0) {
|
||||
return { allowed: true, backend: "gemini", reason: null };
|
||||
const geminiAvailable =
|
||||
balance.gemini_remaining === null || balance.gemini_remaining > 0;
|
||||
|
||||
switch (preference) {
|
||||
case "hardware_only":
|
||||
if (hasHardware) {
|
||||
return { allowed: true, backend: "hardware", reason: null };
|
||||
}
|
||||
return {
|
||||
allowed: false,
|
||||
backend: null,
|
||||
reason: "hardware_only_not_configured",
|
||||
};
|
||||
|
||||
case "gemini_only":
|
||||
if (geminiAvailable) {
|
||||
return { allowed: true, backend: "gemini", reason: null };
|
||||
}
|
||||
return {
|
||||
allowed: false,
|
||||
backend: null,
|
||||
reason: "gemini_cap_exceeded_no_fallback",
|
||||
};
|
||||
|
||||
case "hardware_first":
|
||||
if (hasHardware) {
|
||||
return { allowed: true, backend: "hardware", reason: null };
|
||||
}
|
||||
if (geminiAvailable) {
|
||||
return { allowed: true, backend: "gemini", reason: null };
|
||||
}
|
||||
return {
|
||||
allowed: false,
|
||||
backend: null,
|
||||
reason: "no_backend_available",
|
||||
};
|
||||
|
||||
case "gemini_first":
|
||||
default:
|
||||
if (geminiAvailable) {
|
||||
return { allowed: true, backend: "gemini", reason: null };
|
||||
}
|
||||
if (hasHardware) {
|
||||
return { allowed: true, backend: "hardware", reason: null };
|
||||
}
|
||||
return {
|
||||
allowed: false,
|
||||
backend: null,
|
||||
reason: "gemini_cap_exceeded_no_hardware",
|
||||
};
|
||||
}
|
||||
if (hasHardware) {
|
||||
return { allowed: true, backend: "hardware", reason: null };
|
||||
}
|
||||
return { allowed: false, backend: null, reason: "gemini_cap_exceeded_no_hardware" };
|
||||
}
|
||||
|
||||
// Debit one credit on a successful call. Persists immediately.
|
||||
|
||||
+1
-1
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "recap-relay-server",
|
||||
"version": "0.2.6",
|
||||
"version": "0.2.7",
|
||||
"type": "module",
|
||||
"private": true,
|
||||
"dependencies": {
|
||||
|
||||
@@ -61,7 +61,9 @@ export function analyzeRouter() {
|
||||
const cfg = await getConfigSnapshot();
|
||||
const hasHardware = !!cfg.relay_gemma_base_url;
|
||||
const quota = await getTierQuotas();
|
||||
const plan = planBackend(row, quota, { hasHardware });
|
||||
const preference =
|
||||
cfg.relay_analyze_backend_preference || "gemini_first";
|
||||
const plan = planBackend(row, quota, { hasHardware, preference });
|
||||
if (!plan.allowed) {
|
||||
const e = await errorEnvelope({
|
||||
error: plan.reason,
|
||||
@@ -78,7 +80,11 @@ export function analyzeRouter() {
|
||||
let result;
|
||||
try {
|
||||
if (chosenBackend === "gemini") {
|
||||
const backend = createGeminiBackend({ apiKey: cfg.relay_gemini_api_key });
|
||||
const backend = createGeminiBackend({
|
||||
apiKey: cfg.relay_gemini_api_key,
|
||||
transcriptionModel: cfg.relay_gemini_transcription_model,
|
||||
analysisModel: cfg.relay_gemini_analysis_model,
|
||||
});
|
||||
result = await backend.analyzeText({ prompt });
|
||||
} else {
|
||||
const backend = createHardwareBackend({
|
||||
|
||||
@@ -78,7 +78,9 @@ export function transcribeRouter() {
|
||||
const cfg = await getConfigSnapshot();
|
||||
const hasHardware = !!cfg.relay_parakeet_base_url;
|
||||
const quota = await getTierQuotas();
|
||||
const plan = planBackend(row, quota, { hasHardware });
|
||||
const preference =
|
||||
cfg.relay_transcribe_backend_preference || "gemini_first";
|
||||
const plan = planBackend(row, quota, { hasHardware, preference });
|
||||
if (!plan.allowed) {
|
||||
const e = await errorEnvelope({
|
||||
error: plan.reason,
|
||||
@@ -96,7 +98,11 @@ export function transcribeRouter() {
|
||||
let result;
|
||||
try {
|
||||
if (chosenBackend === "gemini") {
|
||||
const backend = createGeminiBackend({ apiKey: cfg.relay_gemini_api_key });
|
||||
const backend = createGeminiBackend({
|
||||
apiKey: cfg.relay_gemini_api_key,
|
||||
transcriptionModel: cfg.relay_gemini_transcription_model,
|
||||
analysisModel: cfg.relay_gemini_analysis_model,
|
||||
});
|
||||
result = await backend.transcribeAudio({
|
||||
audio: req.file.buffer,
|
||||
mimeType: req.body?.mime_type || req.file.mimetype || "application/octet-stream",
|
||||
|
||||
Reference in New Issue
Block a user