Files
recap-relay/server/jobs.js
T

220 lines
8.1 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
// In-memory background-job tracker. Used by /relay/transcribe-url
// (and any future long-running endpoint) so the request that kicks
// off the work returns immediately with a job_id, and the client
// polls /relay/jobs/{id} to find out when it's done.
//
// Rationale: synchronous HTTP responses for multi-minute transcribes
// are fragile. Any intermediate proxy / load balancer / NAT in the
// path will drop the connection after some idle/total timeout (often
// 100s10min), failing the whole job mid-flight even though the
// relay backend is working fine. Async jobs sidestep all of that:
// the long-running work happens off the request path and the client
// polls short, cheap requests until done.
//
// Storage is in-process memory. Restart-survivability is a known
// gap — a relay restart mid-job loses that job's state, and the
// client will re-poll forever until it gives up. Acceptable for v1
// at small relay scale; the audit log already captures every
// completed call so the operator has a paper trail either way.
// Migrate to SQLite if/when restart-resilience becomes important.
//
// Each job is { id, kind, install_id, status, started_at, updated_at,
// completed_at?, progress?, result?, error? }
// status: "queued" | "running" | "complete" | "failed"
import { randomUUID } from "crypto";
import { sanitizeErrorForClient } from "./sanitize-error.js";
// All in-memory; lost on restart.
const jobs = new Map();
// Cap how long completed jobs hang around so the map doesn't grow
// unbounded. Once a client has polled and seen "complete", it'll
// stop polling — keeping the record 24h gives slow / retried clients
// a generous window without exhausting memory.
const RETENTION_MS = 24 * 60 * 60 * 1000;
export function createJob({ kind, installId, metadata = {} }) {
pruneExpired();
const id = randomUUID();
const now = Date.now();
const job = {
id,
kind,
install_id: installId,
status: "queued",
started_at: now,
updated_at: now,
completed_at: null,
progress: null,
result: null,
error: null,
metadata,
// Event log + live subscriber list. Used by jobs that stream
// incremental results via SSE (e.g., /relay/summarize-url
// dispatches transcribe_progress, transcribe_complete,
// window_complete, done, error events). Each event is
// { type, data, ts } and gets BOTH appended to the log (so a
// late SSE-connecting client can replay missed events) and
// pushed to any currently-subscribed callbacks. `subscribers`
// is intentionally non-enumerable / non-serialized so it never
// leaks into snapshotJobs() or HTTP responses.
events: [],
};
Object.defineProperty(job, "subscribers", {
value: new Set(),
enumerable: false,
writable: false,
});
jobs.set(id, job);
return job;
}
// Append an event to a job's log AND notify any live SSE
// subscribers. Used by /relay/summarize-url's background worker to
// emit per-window progress as it streams in from runChunkedAnalysis.
// Event shape:
// { type: "window_complete"|"transcribe_complete"|"done"|"error"|"progress",
// data: <event payload>,
// ts: ms-epoch }
// Subscriber callbacks receive ONLY the new event (not the full log);
// new subscribers should replay the log themselves on connect.
export function appendEvent(jobId, type, data) {
const job = jobs.get(jobId);
if (!job) return;
const event = { type, data, ts: Date.now() };
job.events.push(event);
job.updated_at = event.ts;
// Cap the log so a runaway job doesn't blow memory. 1000 events
// is far beyond any plausible window count (typical: 10-20).
if (job.events.length > 1000) job.events.shift();
for (const cb of job.subscribers) {
try {
cb(event);
} catch (err) {
console.warn(`[jobs] subscriber callback failed: ${err?.message || err}`);
}
}
}
// Subscribe to live events from a job. Returns an unsubscribe
// function the caller MUST call (e.g., on SSE connection close)
// or the job state will leak the callback closure forever.
// Returns null when the job no longer exists.
export function subscribeToJob(jobId, callback) {
const job = jobs.get(jobId);
if (!job) return null;
job.subscribers.add(callback);
return () => {
job.subscribers.delete(callback);
};
}
export function getJob(jobId) {
pruneExpired();
return jobs.get(jobId) || null;
}
export function markRunning(jobId) {
const job = jobs.get(jobId);
if (!job) return;
job.status = "running";
job.updated_at = Date.now();
}
export function setProgress(jobId, message) {
const job = jobs.get(jobId);
if (!job) return;
job.progress = String(message).slice(0, 200);
job.updated_at = Date.now();
}
export function markComplete(jobId, envelope) {
const job = jobs.get(jobId);
if (!job) return;
job.status = "complete";
// Keep the full envelope shape on the job (caller decides what to
// pass — typically { result: {...inner...}, credit_charged, tier }).
// Internal consumers that read job.result directly still see the
// wrapped form.
job.result = envelope;
job.completed_at = Date.now();
job.updated_at = job.completed_at;
// SSE "done" event: emit the INNER result directly so subscribers
// can read fields off `data.result.title` (or `.transcript`,
// `.analysis`, etc.) instead of a confusing `data.result.result.title`.
// The wrapped form (envelope.result) is unwrapped here; if the
// caller passed a flat result without an inner `.result` key we
// just pass it through unchanged. credit_charged + tier travel
// alongside as siblings so the SSE consumer can update its
// balance display without digging into the result body.
//
// Why this matters: Recap-app's SSE handler does
// `finalResult = data.result`, then reads `finalResult.title`.
// Before this fix, that landed on the wrapping envelope and every
// title came back undefined — library entries persisted as
// "Untitled" despite the relay correctly extracting the real title
// via yt-dlp. The audit log was unaffected (it reads the local
// `title` variable directly) which made the bug look like a
// Recap-side issue. It wasn't.
const inner = envelope && typeof envelope === "object" && "result" in envelope
? envelope.result
: envelope;
appendEvent(jobId, "done", {
result: inner,
credit_charged: envelope?.credit_charged,
tier: envelope?.tier,
});
}
export function markFailed(jobId, errorMessage) {
const job = jobs.get(jobId);
if (!job) return;
job.status = "failed";
// Sanitize at the source so EVERY downstream surface that reads
// job.error (SSE error event, the per-job GET endpoints, etc.)
// gets the client-safe wording, without having to remember to
// sanitize at every call site. The raw operator-internal message
// stays available on job.error_internal for the admin dashboard +
// audit log (snapshotJobs exposes both fields).
const raw = String(errorMessage || "unknown error").slice(0, 500);
job.error_internal = raw;
job.error = sanitizeErrorForClient(raw).slice(0, 500);
job.completed_at = Date.now();
job.updated_at = job.completed_at;
// Same terminal event for failures — SSE clients close on this
// and surface the error to the user.
appendEvent(jobId, "error", { error: job.error });
}
export function snapshotJobs() {
pruneExpired();
return Array.from(jobs.values()).map((j) => ({
id: j.id,
kind: j.kind,
install_id: j.install_id,
status: j.status,
started_at: j.started_at,
updated_at: j.updated_at,
completed_at: j.completed_at,
progress: j.progress,
has_result: j.result != null,
// Both error variants exposed — the admin dashboard consumes
// snapshotJobs and can prefer error_internal for operator
// diagnosis (full backend / spark-control wording intact).
// External callers should always read `error` (sanitized).
error: j.error,
error_internal: j.error_internal || j.error,
}));
}
function pruneExpired() {
const cutoff = Date.now() - RETENTION_MS;
for (const [id, job] of jobs) {
const ref = job.completed_at || job.updated_at || job.started_at;
if (ref && ref < cutoff) {
jobs.delete(id);
}
}
}