Add internal-meetings pipeline and post-hoc speaker tools
This commit is contained in:
@@ -0,0 +1,330 @@
|
||||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<meta charset="UTF-8" />
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
|
||||
<title>Job output — Recap Relay</title>
|
||||
<!--
|
||||
Stand-alone render of a stored job's transcript + analysis JSON.
|
||||
Loaded by the operator's dashboard "View" link on a Jobs row.
|
||||
|
||||
Visual style: mirror of Recap's results panel (two-pane —
|
||||
topic list on the left, transcript on the right, click a topic
|
||||
to jump to its timestamp range in the transcript). Sourced from
|
||||
Recap's index.html .chunk + .transcript-line styling so changes
|
||||
there stay aesthetically aligned here.
|
||||
|
||||
Data source: GET /admin/job-output/:id returns
|
||||
{
|
||||
job_id, batch_id, source, saved_at,
|
||||
transcript: "[MM:SS] line\n[MM:SS] line...",
|
||||
analysis: { sections: [{ title, summary, startIndex, endIndex }] } | null
|
||||
analysis_raw_text: string | null // when JSON-parse failed
|
||||
meta: { title, media_url, audio_seconds, ... }
|
||||
}
|
||||
-->
|
||||
<style>
|
||||
:root {
|
||||
--bg: #0a0e1a;
|
||||
--panel: #111827;
|
||||
--panel-2: #1e293b;
|
||||
--line: #1e293b;
|
||||
--line-2: #334155;
|
||||
--fg: #e2e8f0;
|
||||
--fg-dim: #94a3b8;
|
||||
--fg-faint: #64748b;
|
||||
--accent: #818cf8;
|
||||
--accent-soft: #a5b4fc;
|
||||
--good: #4ade80;
|
||||
--bad: #fca5a5;
|
||||
}
|
||||
* { box-sizing: border-box; }
|
||||
body {
|
||||
margin: 0; padding: 0;
|
||||
background: var(--bg); color: var(--fg);
|
||||
font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Helvetica, Arial, sans-serif;
|
||||
font-size: 13px; line-height: 1.55;
|
||||
min-height: 100vh;
|
||||
}
|
||||
a { color: var(--accent-soft); text-decoration: none; }
|
||||
a:hover { text-decoration: underline; }
|
||||
|
||||
.header {
|
||||
padding: 14px 24px;
|
||||
background: var(--panel);
|
||||
border-bottom: 1px solid var(--line);
|
||||
display: flex; align-items: center; gap: 16px; flex-wrap: wrap;
|
||||
}
|
||||
.header h1 {
|
||||
margin: 0; font-size: 16px; font-weight: 700;
|
||||
color: var(--fg); max-width: 800px;
|
||||
overflow: hidden; text-overflow: ellipsis; white-space: nowrap;
|
||||
}
|
||||
.header .meta { font-size: 11px; color: var(--fg-faint); }
|
||||
.header .meta strong { color: var(--fg-dim); }
|
||||
.header .pill {
|
||||
display: inline-block; padding: 2px 8px; border-radius: 999px;
|
||||
font-size: 10px; font-weight: 700; text-transform: uppercase;
|
||||
letter-spacing: 0.04em;
|
||||
background: rgba(129,140,248,0.18); color: var(--accent-soft);
|
||||
}
|
||||
|
||||
.split { display: flex; min-height: calc(100vh - 60px); }
|
||||
.left {
|
||||
flex: 0 0 42%; max-width: 42%;
|
||||
border-right: 1px solid var(--line);
|
||||
overflow-y: auto;
|
||||
padding: 16px;
|
||||
background: var(--bg);
|
||||
}
|
||||
.right {
|
||||
flex: 1; min-width: 0;
|
||||
overflow-y: auto;
|
||||
padding: 16px;
|
||||
background: var(--panel);
|
||||
}
|
||||
@media (max-width: 900px) {
|
||||
.split { flex-direction: column; }
|
||||
.left, .right { flex: none; max-width: 100%; border-right: none; }
|
||||
.left { border-bottom: 1px solid var(--line); max-height: 50vh; }
|
||||
}
|
||||
|
||||
/* Topic / chunk card */
|
||||
.chunk {
|
||||
padding: 12px 14px; margin-bottom: 8px;
|
||||
background: var(--panel); border: 1px solid var(--line);
|
||||
border-radius: 10px; cursor: pointer;
|
||||
transition: border-color 0.15s, background 0.15s;
|
||||
}
|
||||
.chunk:hover { border-color: var(--accent); }
|
||||
.chunk.active {
|
||||
border-color: var(--accent);
|
||||
background: rgba(129,140,248,0.06);
|
||||
box-shadow: 0 2px 16px rgba(129,140,248,0.10);
|
||||
}
|
||||
.chunk-title {
|
||||
font-size: 13px; font-weight: 700; color: var(--fg);
|
||||
margin-bottom: 4px;
|
||||
}
|
||||
.chunk-time {
|
||||
font-size: 10px; color: var(--fg-faint);
|
||||
font-variant-numeric: tabular-nums; margin-left: 6px;
|
||||
font-weight: 500;
|
||||
}
|
||||
.chunk-summary {
|
||||
font-size: 12px; color: var(--fg-dim); line-height: 1.5;
|
||||
}
|
||||
|
||||
/* Transcript pane */
|
||||
.transcript-line {
|
||||
display: flex; gap: 10px; padding: 4px 8px;
|
||||
border-radius: 6px; line-height: 1.6;
|
||||
scroll-margin-top: 80px;
|
||||
}
|
||||
.transcript-line.hl { background: rgba(129,140,248,0.10); }
|
||||
.ts-badge {
|
||||
flex: 0 0 auto;
|
||||
font-family: "SF Mono", Menlo, monospace;
|
||||
font-size: 11px; color: var(--accent-soft);
|
||||
min-width: 56px;
|
||||
}
|
||||
.ts-text { flex: 1; font-size: 13px; color: var(--fg); }
|
||||
|
||||
.empty {
|
||||
padding: 40px 20px; text-align: center;
|
||||
color: var(--fg-faint); font-size: 13px;
|
||||
}
|
||||
.error {
|
||||
padding: 20px; background: var(--panel);
|
||||
border: 1px solid var(--bad); border-radius: 10px;
|
||||
color: var(--bad); margin: 20px;
|
||||
}
|
||||
pre.raw {
|
||||
background: var(--panel); padding: 12px;
|
||||
border: 1px solid var(--line); border-radius: 8px;
|
||||
overflow: auto; font-size: 11px; color: var(--fg-dim);
|
||||
max-height: 300px; white-space: pre-wrap;
|
||||
}
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
<div id="root">
|
||||
<div class="empty">Loading job output…</div>
|
||||
</div>
|
||||
|
||||
<script>
|
||||
// Escape a value for safe interpolation into HTML markup — both
// element content and quoted attribute values. Transcript + analysis
// text comes from Gemini / Parakeet so it's unlikely to contain
// malicious HTML, but escape it anyway — we're rendering server data
// in an admin context.
//
// Returns "" for null/undefined; every other value is stringified
// first. `&` is replaced before the other characters so the entities
// produced here are not themselves re-escaped.
function esc(s) {
  if (s == null) return "";
  return String(s)
    .replace(/&/g, "&amp;")
    .replace(/</g, "&lt;")
    .replace(/>/g, "&gt;")
    .replace(/"/g, "&quot;")
    .replace(/'/g, "&#39;");
}
|
||||
|
||||
// Convert a transcript made of "[MM:SS] text" or "[H:MM:SS] text"
// lines into an array of { offset, text } entries, where `offset` is
// the timestamp in seconds and `text` is the trimmed remainder of the
// line. Lines that don't start with a bracketed timestamp are skipped,
// so entry indices count timestamped lines only. Mirrors Recap's
// parser so the analysis startIndex/endIndex map onto the same entry
// indices.
function parseTimestampedTranscript(text) {
  if (!text) return [];
  const linePattern = /^\s*\[(\d+):(\d{2})(?::(\d{2}))?\]\s*(.*)$/;
  const toInt = (digits) => parseInt(digits, 10);
  return String(text)
    .split(/\r?\n/)
    .map((rawLine) => rawLine.match(linePattern))
    .filter((match) => match !== null)
    .map(([, first, second, third, rest]) => {
      // Two fields → minutes:seconds; three → hours:minutes:seconds.
      const offset = third === undefined
        ? toInt(first) * 60 + toInt(second)
        : toInt(first) * 3600 + toInt(second) * 60 + toInt(third);
      return { offset, text: rest.trim() };
    });
}
|
||||
|
||||
// Format a number of seconds as "M:SS", or "H:MM:SS" once the value
// reaches an hour. The input is rounded to the nearest second;
// missing, NaN and negative values are displayed as "0:00".
function formatTime(sec) {
  const total = Math.max(0, Math.round(sec || 0));
  const hours = Math.floor(total / 3600);
  const minutes = Math.floor((total % 3600) / 60);
  const twoDigit = (n) => String(n).padStart(2, "0");
  const tail = twoDigit(total % 60);
  if (hours > 0) {
    return `${hours}:${twoDigit(minutes)}:${tail}`;
  }
  return `${minutes}:${tail}`;
}
|
||||
|
||||
// Read the job id from the `id` query parameter of the current page
// URL. Returns "" when the parameter is absent or empty.
function getJobIdFromURL() {
  const id = new URL(location.href).searchParams.get("id");
  return id || "";
}
|
||||
|
||||
// Page entry point: read the job id from the URL, fetch the stored
// output from GET /admin/job-output/:id, and hand the parsed JSON to
// render(). Every failure path replaces #root with an error box so
// the page never stays stuck on the "Loading…" placeholder.
async function load() {
  const root = document.getElementById("root");
  const jobId = getJobIdFromURL();
  if (!jobId) {
    // The placeholder is written with entities: a literal <job_id>
    // would be parsed as an element and disappear from the message.
    root.innerHTML = '<div class="error">Missing ?id=&lt;job_id&gt; in URL.</div>';
    return;
  }
  let data;
  try {
    const r = await fetch("/admin/job-output/" + encodeURIComponent(jobId));
    if (r.status === 404) {
      root.innerHTML = '<div class="error">No stored output for job <code>' + esc(jobId) + '</code>. The output may have been deleted, or this job ran before output-storage was enabled.</div>';
      return;
    }
    if (!r.ok) throw new Error("HTTP " + r.status);
    data = await r.json();
  } catch (err) {
    root.innerHTML = '<div class="error">Failed to load: ' + esc(err?.message || err) + '</div>';
    return;
  }
  try {
    render(data);
  } catch (err) {
    // A malformed payload (e.g. JSON `null`) makes render() throw;
    // report it instead of leaving an unhandled rejection.
    root.innerHTML = '<div class="error">Failed to render: ' + esc(err?.message || err) + '</div>';
  }
}
|
||||
|
||||
// Build the whole page from a job-output payload and write it into
// #root: a header (title, source pill, metadata, dashboard link) and,
// when the transcript has timestamped lines, a two-pane split with
// the topic list on the left and the transcript on the right.
//
// `data` is the JSON returned by GET /admin/job-output/:id. Every
// value taken from it is passed through esc() before being placed in
// markup. The parsed entries and the sections array are also stored
// on window._entries / window._sections for onChunkClick().
function render(data) {
  const root = document.getElementById("root");
  const meta = data.meta || {};
  const entries = parseTimestampedTranscript(data.transcript || "");
  const sections = (data.analysis && Array.isArray(data.analysis.sections))
    ? data.analysis.sections
    : null;

  // Header block — title, source, models, link back to dashboard.
  const sourceLabel = data.source === "admin-test"
    ? '<span class="pill">Test run</span>'
    : (data.source === "admin-test-shared-tx" ? '<span class="pill">Shared TX</span>' : '');

  // Only http(s) URLs become a clickable link. HTML-escaping alone
  // does not neutralise a `javascript:` (or other scheme) URL placed
  // in an href, so anything else is simply not linked.
  const mediaUrl = typeof meta.media_url === "string" ? meta.media_url.trim() : "";
  const mediaHref = /^https?:\/\//i.test(mediaUrl) ? mediaUrl : "";

  const headerHTML =
    '<div class="header">' +
    '<h1>' + esc(meta.title || meta.media_url || data.job_id) + '</h1>' +
    sourceLabel +
    '<div class="meta">' +
    (mediaHref ? '<a href="' + esc(mediaHref) + '" target="_blank" rel="noopener">source ↗</a> · ' : '') +
    (meta.audio_seconds ? '<strong>' + formatTime(meta.audio_seconds) + '</strong> audio · ' : '') +
    (meta.transcribe_backend ? 'TX: <strong>' + esc(meta.transcribe_model || meta.transcribe_backend) + '</strong> · ' : '') +
    (meta.analyze_backend ? 'AN: <strong>' + esc(meta.analyze_model || meta.analyze_backend) + '</strong>' : '') +
    '</div>' +
    '<div style="margin-left:auto;"><a href="/" title="Back to dashboard">← Dashboard</a></div>' +
    '</div>';

  // Empty state: nothing parseable in the transcript.
  if (entries.length === 0) {
    root.innerHTML = headerHTML +
      '<div class="empty">No transcript text was saved for this job.</div>';
    return;
  }

  // Left pane: topic list, or a notice (plus the first 4000 chars of
  // the raw analysis text, if any) when no sections are available.
  let leftHTML = '<div class="left" id="topics-pane">';
  if (!sections || sections.length === 0) {
    leftHTML += '<div class="empty">No analysis sections were saved for this job.';
    if (data.analysis_raw_text) {
      leftHTML += '<pre class="raw" style="text-align:left; margin-top: 12px;">' + esc(data.analysis_raw_text.slice(0, 4000)) + '</pre>';
    }
    leftHTML += '</div>';
  } else {
    sections.forEach((s, i) => {
      // Clamp both indices into the entries array so a section that
      // points past the transcript still gets a valid time range.
      const startIdx = Math.max(0, Math.min(s.startIndex || 0, entries.length - 1));
      const startTs = entries[startIdx]?.offset || 0;
      const endIdx = Math.max(startIdx, Math.min(s.endIndex || 0, entries.length - 1));
      const endTs = entries[endIdx]?.offset || 0;
      leftHTML +=
        '<div class="chunk" data-section-idx="' + i + '" data-start="' + startIdx + '" onclick="onChunkClick(' + i + ')">' +
        '<div class="chunk-title">' +
        esc(s.title || "(untitled)") +
        '<span class="chunk-time">' + formatTime(startTs) + ' — ' + formatTime(endTs) + '</span>' +
        '</div>' +
        '<div class="chunk-summary">' + esc(s.summary || "") + '</div>' +
        '</div>';
    });
  }
  leftHTML += '</div>';

  // Right pane: one row per transcript entry, addressable as
  // #entry-<index> so topic clicks can scroll to it.
  let rightHTML = '<div class="right" id="transcript-pane">';
  entries.forEach((e, i) => {
    rightHTML +=
      '<div class="transcript-line" id="entry-' + i + '">' +
      '<span class="ts-badge">' + formatTime(e.offset) + '</span>' +
      '<span class="ts-text">' + esc(e.text) + '</span>' +
      '</div>';
  });
  rightHTML += '</div>';

  root.innerHTML = headerHTML + '<div class="split">' + leftHTML + rightHTML + '</div>';

  // Expose for click handlers.
  window._entries = entries;
  window._sections = sections;
}
|
||||
|
||||
// Click a topic in the left pane: mark its card active, scroll the
// section's first transcript entry into view on the right pane, and
// apply a highlight band over the section's entry range. The
// highlight stays until another topic is clicked.
//
// Indices are clamped to the rendered entries the same way render()
// clamps them, so a section whose startIndex/endIndex point past the
// transcript still scrolls to the last entry, and the highlight loop
// never walks beyond the rows that exist.
function onChunkClick(sectionIdx) {
  const sections = window._sections;
  if (!sections || !sections[sectionIdx]) return;
  // Mark active chunk visually.
  document.querySelectorAll(".chunk.active").forEach((el) => el.classList.remove("active"));
  const chunkEl = document.querySelector('.chunk[data-section-idx="' + sectionIdx + '"]');
  if (chunkEl) chunkEl.classList.add("active");

  const entries = Array.isArray(window._entries) ? window._entries : [];
  const lastIdx = entries.length - 1;
  if (lastIdx < 0) return;

  const s = sections[sectionIdx];
  const start = Math.max(0, Math.min(s.startIndex || 0, lastIdx));
  const end = Math.max(start, Math.min(s.endIndex || 0, lastIdx));
  // Scroll the start entry into view in the transcript pane.
  const target = document.getElementById("entry-" + start);
  if (target) target.scrollIntoView({ behavior: "smooth", block: "start" });
  // Highlight the section's range; clear prior highlights first.
  document.querySelectorAll(".transcript-line.hl").forEach((el) => el.classList.remove("hl"));
  for (let i = start; i <= end; i++) {
    const el = document.getElementById("entry-" + i);
    if (el) el.classList.add("hl");
  }
}
|
||||
|
||||
// Start fetching and rendering the job output as soon as this script runs.
load();
|
||||
</script>
|
||||
</body>
|
||||
</html>
|
||||
@@ -0,0 +1,171 @@
|
||||
// Wrapper around ffprobe for getting the playable duration of an
|
||||
// audio file. Used by the transcribe routes to record audio_seconds
|
||||
// alongside each audit entry, so the dashboard can normalize wall-
|
||||
// clock duration to "ms per minute of audio" — a backend-agnostic
|
||||
// speed benchmark.
|
||||
//
|
||||
// Returns the duration in seconds (float), or null if ffprobe fails
|
||||
// or the file isn't probeable. Never throws — best-effort metadata
|
||||
// shouldn't break the request that needs it.
|
||||
|
||||
import { execFile } from "child_process";
|
||||
import { promisify } from "util";
|
||||
import fs from "fs/promises";
|
||||
import os from "os";
|
||||
import path from "path";
|
||||
|
||||
const execFileAsync = promisify(execFile);
|
||||
|
||||
// NOTE: there is intentionally NO default chunk size export here.
|
||||
// The canonical default lives in server/config.js
|
||||
// (`relay_hardware_tx_chunk_minutes` and `relay_gemini_tx_chunk_minutes`)
|
||||
// and flows down through createHardwareBackend / createGeminiBackend
|
||||
// to splitAudioFile. Removed in v0.2.32 so there's exactly one place
|
||||
// to change the default — the Settings tab in the dashboard.
|
||||
|
||||
// Ask ffprobe for the container-level duration of the file at
// `filePath`. Resolves to the duration in seconds (a positive float),
// or null when the path is empty, ffprobe fails or exceeds the 10 s
// timeout, or the output isn't a positive finite number.
export async function getAudioDurationSeconds(filePath) {
  if (!filePath) return null;
  const probeArgs = [
    // Silence everything except hard errors.
    "-v",
    "error",
    // Emit only the duration field of the format section.
    "-show_entries",
    "format=duration",
    // Bare number on stdout: no wrappers, no key label.
    "-of",
    "default=noprint_wrappers=1:nokey=1",
    filePath,
  ];
  try {
    const result = await execFileAsync("ffprobe", probeArgs, { timeout: 10_000 });
    const seconds = parseFloat(result.stdout.trim());
    return Number.isFinite(seconds) && seconds > 0 ? seconds : null;
  } catch {
    return null;
  }
}
|
||||
|
||||
// Split an audio file into fixed-length chunks via ffmpeg. Returns
// an array of
//   { filePath, startSeconds, durationSeconds, overlapBoundarySec, index }
// ordered by startSeconds. Tries -acodec copy first so it's lossless
// and fast (no re-encoding pass), and falls back to re-encoding when
// the copy fails. Returns an empty array if the duration can't be
// probed or the audio is no longer than chunkSeconds — caller should
// just send the original file in that case.
//
// Parameters:
//   inputPath      — file to split.
//   outputDir      — directory the chunk_<i>.<ext> files are written to.
//   chunkSeconds   — maximum chunk length; required, must be a
//                    positive finite number (throws otherwise).
//   overlapSeconds — seconds shared by consecutive chunks at their
//                    boundary. Invalid values (negative, non-finite,
//                    or >= chunkSeconds) are treated as 0.
//
// Rejects if ffmpeg fails on a chunk even after the re-encode retry.
//
// Used by the hardware backend to keep Parakeet calls within memory
// limits on long audio. The relay's audit log later records audio_seconds
// for the WHOLE file (not per-chunk) so the dashboard's
// "ms per minute of audio" benchmark stays meaningful.
export async function splitAudioFile({
  inputPath,
  outputDir,
  chunkSeconds,
  overlapSeconds = 0,
}) {
  if (!Number.isFinite(chunkSeconds) || chunkSeconds <= 0) {
    throw new Error("splitAudioFile: chunkSeconds is required (no default — pass an explicit value from config)");
  }
  // Overlap must be smaller than chunk size or the loop never
  // advances. 0 is fine (no overlap, original behavior).
  const overlapIsValid =
    Number.isFinite(overlapSeconds) &&
    overlapSeconds >= 0 &&
    overlapSeconds < chunkSeconds;
  const overlap = overlapIsValid ? overlapSeconds : 0;

  const duration = await getAudioDurationSeconds(inputPath);
  if (!duration || duration <= chunkSeconds) return [];

  const chunks = [];
  const ext = path.extname(inputPath).replace(/^\./, "") || "mp3";
  // Advance step = chunkSeconds - overlap. Each chunk still has
  // length up to chunkSeconds; consecutive chunks share `overlap`
  // seconds at their boundary.
  const advanceStep = chunkSeconds - overlap;
  let startSec = 0;
  let i = 0;
  while (startSec < duration) {
    const chunkPath = path.join(outputDir, `chunk_${i}.${ext}`);
    const segLen = Math.min(chunkSeconds, duration - startSec);
    const cutArgs = ["-y", "-i", inputPath, "-ss", String(startSec), "-t", String(segLen)];
    try {
      await execFileAsync(
        "ffmpeg",
        [...cutArgs, "-acodec", "copy", chunkPath],
        { timeout: 120_000 }
      );
    } catch {
      // `-acodec copy` fails on some containers/streams that don't
      // start on a keyframe at the cut point. Retry with re-encoding,
      // at the cost of CPU time. A failure here propagates to the caller.
      await execFileAsync(
        "ffmpeg",
        [...cutArgs, chunkPath],
        { timeout: 180_000 }
      );
    }
    chunks.push({
      filePath: chunkPath,
      startSeconds: startSec,
      durationSeconds: segLen,
      // Boundary marker: timestamps strictly less than this value
      // are duplicates of the prior chunk's tail (overlap region).
      // Caller dedupes by dropping output before this boundary.
      // For chunk 0 this equals startSec (no prior chunk), so the
      // boundary check is a no-op.
      overlapBoundarySec: i === 0 ? startSec : startSec + overlap,
      index: i,
    });
    // This chunk already runs to the end of the file. With overlap > 0
    // the next start (startSec + advanceStep) could still be < duration,
    // which would emit a trailing chunk lying entirely inside the range
    // this one covers — pure duplicate work for the transcriber.
    if (startSec + chunkSeconds >= duration) break;
    startSec += advanceStep;
    i++;
  }
  return chunks;
}
|
||||
|
||||
// Duration probe for callers holding the audio in memory (the
// /relay/transcribe route receives multipart uploads as buffers).
// Writes the buffer to a uniquely named file in the OS temp dir,
// probes that file, and removes it afterwards. Cheaper than
// re-streaming through ffprobe's stdin which doesn't always handle
// every format reliably.
//
// Resolves to the duration in seconds, or null for an empty/missing
// buffer or on any write/probe failure. Never rejects.
export async function getAudioDurationSecondsFromBuffer(buffer) {
  const hasBytes = Boolean(buffer) && Boolean(buffer.length);
  if (!hasBytes) return null;

  const tmpName = `relay-probe-${Date.now()}-${Math.random().toString(36).slice(2, 8)}`;
  const tmpFile = path.join(os.tmpdir(), tmpName);
  try {
    await fs.writeFile(tmpFile, buffer);
    const seconds = await getAudioDurationSeconds(tmpFile);
    return seconds;
  } catch {
    return null;
  } finally {
    // Best-effort cleanup; not awaited, and a failed unlink is ignored.
    fs.unlink(tmpFile).catch(() => {});
  }
}
|
||||
@@ -0,0 +1,142 @@
|
||||
// Chunk-buffer state used by the pipelined-analyze path in
|
||||
// routes/summarize-url.js. The hardware backend fires
|
||||
// onChunkComplete(chunkData) as each transcribe chunk finishes;
|
||||
// this buffer:
|
||||
// - drains chunks in INDEX ORDER (chunks may arrive out of order
|
||||
// when concurrency > 1; we hold them in `pending` until the
|
||||
// next-expected index lands so dedup against the prior chunk's
|
||||
// overlap boundary is deterministic)
|
||||
// - dedupes each new chunk's segments against the prior chunk's
|
||||
// overlapBoundarySec — same logic that runs at end-of-transcribe
|
||||
// in hardware.js, but applied incrementally so analyze can read
|
||||
// a clean, no-duplicates segment view per window
|
||||
// - tracks coveredEndSec (the maximum global timestamp the deduped
|
||||
// buffer extends to, considering ONLY in-order chunks)
|
||||
// - lets the analyze workers await `waitForTime(targetSec)` and
|
||||
// query `getSegments(startSec, endSec)` to build per-window
|
||||
// analyze inputs as soon as the required chunks are in
|
||||
//
|
||||
// Failure modes:
|
||||
// - A chunk fails entirely → its segments are empty / undefined.
|
||||
// The buffer still advances nextExpected past it so later chunks
|
||||
// aren't stuck behind. The window covering that chunk's range
|
||||
// gets a shorter transcript and may yield no sections (or fewer
|
||||
// than expected). Downstream stitcher tolerates gaps.
|
||||
// - waitForTime can wait forever if the relevant chunk index
|
||||
// never arrives. Caller is responsible for racing this against
|
||||
// the transcribe Promise so a transcribe failure unblocks all
|
||||
// pending waiters via reject.
|
||||
|
||||
// Create an empty chunk buffer. The returned object holds the state
// fields below plus the methods add / checkWaiters / waitForTime /
// getSegments / fail. All timestamps are treated as GLOBAL seconds.
export function createChunkBuffer() {
  return {
    // Sparse staging area for chunks that arrived out of index order.
    pending: new Map(),
    // Drained, deduped segments in drain (= chunk index) order. Append-only.
    segments: [],
    // Index of the next chunk we're waiting on to drain.
    nextExpected: 0,
    // Total chunk count, populated from the first add() call that
    // carries an integer totalChunks.
    totalChunks: null,
    // Greatest global end-time covered by drained chunks. NOT just
    // max(pending) — out-of-order pending chunks don't count until
    // their predecessors land, so dedup is consistent.
    coveredEndSec: 0,
    // Overlap boundary (global seconds) that was applied to the most
    // recently drained chunk. Kept for inspection/debugging.
    prevOverlapBoundary: 0,
    // Async waiters: { targetSec, resolve, reject }
    waiters: [],
    // Set true on terminal failure so future waiters reject immediately
    // instead of hanging.
    failed: false,
    failedReason: null,

    // Accept one finished chunk and drain every consecutive chunk that
    // is now available, starting at nextExpected. Reads chunkIndex,
    // totalChunks, segments, startSeconds, durationSeconds and
    // overlapBoundarySec from chunkData. Ignored after fail(), for
    // null input, and for chunks whose index is not an integer or was
    // already drained (those could never drain and would only leak in
    // `pending`).
    add(chunkData) {
      if (this.failed) return;
      if (chunkData == null) return;
      if (this.totalChunks == null && Number.isInteger(chunkData.totalChunks)) {
        this.totalChunks = chunkData.totalChunks;
      }
      const idx = chunkData.chunkIndex;
      if (!Number.isInteger(idx) || idx < this.nextExpected) return;
      this.pending.set(idx, chunkData);
      // Drain consecutive chunks starting from nextExpected.
      while (this.pending.has(this.nextExpected)) {
        const c = this.pending.get(this.nextExpected);
        this.pending.delete(this.nextExpected);
        const segs = Array.isArray(c.segments) ? c.segments : [];
        // A chunk's overlapBoundarySec is a global timestamp equal to
        // ITS OWN start plus the overlap: everything in this chunk
        // before it repeats the prior chunk's tail. So each chunk is
        // filtered by its own boundary. Filtering by the PREVIOUS
        // chunk's boundary never drops anything, because that value is
        // always below this chunk's start. For chunk 0 the boundary
        // equals its start, so the filter is a no-op.
        const boundary = Number(c.overlapBoundarySec) || 0;
        for (const s of segs) {
          if ((s.start || 0) >= boundary) {
            this.segments.push(s);
          }
        }
        this.prevOverlapBoundary = boundary;
        const endHere = (c.startSeconds || 0) + (c.durationSeconds || 0);
        if (endHere > this.coveredEndSec) this.coveredEndSec = endHere;
        this.nextExpected += 1;
      }
      this.checkWaiters();
    },

    // Resolve every waiter whose target is now covered. Once all
    // chunks have drained no further coverage can arrive, so any
    // remaining waiter is resolved too rather than left hanging (e.g.
    // when the summed chunk durations fall just short of a target
    // derived from the probed file duration).
    checkWaiters() {
      const allDrained =
        this.totalChunks != null && this.nextExpected >= this.totalChunks;
      const stillWaiting = [];
      for (const w of this.waiters) {
        if (allDrained || this.coveredEndSec >= w.targetSec) {
          w.resolve();
        } else {
          stillWaiting.push(w);
        }
      }
      this.waiters = stillWaiting;
    },

    // Returns a promise that resolves once coveredEndSec reaches
    // targetSec, or once every chunk has drained. Rejects with the
    // failedReason if the buffer is poisoned by a transcribe failure.
    waitForTime(targetSec) {
      if (this.failed) return Promise.reject(this.failedReason);
      if (this.coveredEndSec >= targetSec) return Promise.resolve();
      if (this.totalChunks != null && this.nextExpected >= this.totalChunks) {
        return Promise.resolve();
      }
      return new Promise((resolve, reject) =>
        this.waiters.push({ targetSec, resolve, reject })
      );
    },

    // Snapshot the segments whose start falls in [startSec, endSec).
    // Caller gets a fresh array (the segment objects themselves are
    // shared with the buffer).
    getSegments(startSec, endSec) {
      const out = [];
      for (const s of this.segments) {
        const t = s.start || 0;
        if (t >= startSec && t < endSec) out.push(s);
      }
      return out;
    },

    // Mark the buffer dead so all current + future waiters reject.
    // Called when transcribe throws — without this, runPipelinedAnalysis
    // workers would hang forever waiting for a window that'll never
    // become ready. Non-Error reasons are wrapped in an Error.
    fail(reason) {
      this.failed = true;
      this.failedReason = reason instanceof Error
        ? reason
        : new Error(String(reason || "transcribe failed"));
      for (const w of this.waiters) {
        try { w.reject(this.failedReason); } catch {}
      }
      this.waiters = [];
    },
  };
}
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,323 @@
|
||||
// Per-job aggregation over the relay's audit log. The audit log
|
||||
// records ONE row per relay call (transcribe or analyze); a single
|
||||
// summary job typically produces 1 transcribe row + N analyze rows
|
||||
// (one per chunked-analyze window). This module groups those rows by
|
||||
// X-Recap-Job-Id and computes per-video stats the dashboard renders
|
||||
// as a sortable / filterable table.
|
||||
//
|
||||
// The aggregation is computed on the fly from the in-memory entries
|
||||
// array — no separate persistence. A typical 30-day window has a few
|
||||
// thousand audit rows; grouping is O(n) and well under 10 ms.
|
||||
//
|
||||
// Output row shape (one per job_id, plus a synthetic row for
|
||||
// orphaned entries with no job_id):
|
||||
// {
|
||||
// job_id: string | null
|
||||
// started_at: ms-epoch (earliest ts across the job's rows)
|
||||
// completed_at: ms-epoch (latest ts)
|
||||
// install_id: short string
|
||||
// tier: "core" | "pro" | "max" | null
|
||||
// media_url: string | null // from the transcribe row
|
||||
// title: string | null // ditto
|
||||
// audio_seconds: number | null // from transcribe row
|
||||
// audio_bytes: number | null // ditto (bytes downloaded
|
||||
// by the relay for transcribe-url)
|
||||
// transcribe_status: "success" | "error" | "refused" | "missing"
|
||||
// transcribe_backend: "gemini" | "hardware" | null
|
||||
// transcribe_model: string | null
|
||||
// transcribe_ms: number | null
|
||||
// download_ms: number | null
|
||||
// chunk_count: number | null // transcribe-side audio chunks
|
||||
// analyze_windows_total: number // count of analyze rows
|
||||
// analyze_windows_success: number
|
||||
// analyze_windows_failed: number
|
||||
// analyze_backend: string | null // dominant backend across analyze rows
|
||||
// analyze_model: string | null // dominant model across analyze rows
|
||||
// analyze_ms: number // sum of analyze duration_ms
|
||||
// overall_status: "success" | "partial" | "failed"
|
||||
// wall_time_ms: completed_at - started_at
|
||||
// cost_usd: number (sum across all rows)
|
||||
// errors: string[] // concatenated short error strings
|
||||
// // Derived metrics — pre-computed so the UI can sort by them:
|
||||
// transcribe_ms_per_min: number | null // transcribe_ms / (audio_seconds/60)
|
||||
// transcribe_ms_per_mb: number | null // transcribe_ms / (audio_bytes / 1MB)
|
||||
// download_ms_per_mb: number | null
|
||||
// analyze_ms_per_min: number | null
|
||||
// analyze_ms_per_mb: number | null
|
||||
// }
|
||||
|
||||
const MB = 1024 * 1024;
|
||||
|
||||
// Group audit entries by job and return one aggregated row per group,
// newest first by started_at.
//
//   entries          — iterable of audit rows; each is read for
//                      `job_id` and `ts` here and then handed to
//                      aggregateOne() with the rest of its group.
//   opts.outputIdSet — optional Set of job_ids that have stored output
//                      JSONs, passed in from the route layer so the
//                      aggregator doesn't have to hit the filesystem
//                      itself. Drives each row's `has_output` flag
//                      (always false when no Set is supplied), which
//                      the dashboard reads to show/hide the "View" link.
export function aggregateJobs(entries, opts = {}) {
  // Entries without job_id are keyed by their ts so they still appear
  // in the table (helpful for debugging orphan calls).
  const groups = new Map();
  for (const entry of entries) {
    const key = entry.job_id || `_orphan_${entry.ts}`;
    const bucket = groups.get(key);
    if (bucket) {
      bucket.push(entry);
    } else {
      groups.set(key, [entry]);
    }
  }

  const outputIdSet = opts.outputIdSet instanceof Set ? opts.outputIdSet : null;

  const out = Array.from(groups, ([key, rows]) => {
    const row = aggregateOne(key, rows);
    row.has_output = outputIdSet ? outputIdSet.has(row.job_id) : false;
    return row;
  });
  // Newest first by started_at.
  out.sort((a, b) => b.started_at - a.started_at);
  return out;
}
|
||||
|
||||
function aggregateOne(key, rows) {
|
||||
rows.sort((a, b) => a.ts - b.ts);
|
||||
// ts in each audit row is when recordCall() fired — i.e., when the
|
||||
// work for that row COMPLETED, not when it started. To recover the
|
||||
// user-POV "job start" timestamp we work backwards from the first
|
||||
// row's end-time using its duration_ms AND download_ms fields.
|
||||
//
|
||||
// TX row layout:
|
||||
// ts = download_end + tx_work_end
|
||||
// duration_ms = tx_work_duration (NOT including download)
|
||||
// download_ms = download_duration
|
||||
//
|
||||
// So: job_start = ts - duration_ms - download_ms.
|
||||
//
|
||||
// Without including download_ms here, wall_time misses the
|
||||
// download phase (which can be 30-60s on a long YouTube fetch).
|
||||
// Including it makes wall_time match the operator's intuitive
|
||||
// formula: WALL ≈ DL + TX + AN_wall.
|
||||
const firstRowDur = Number(rows[0].duration_ms) || 0;
|
||||
const firstRowDownload = Number(rows[0].download_ms) || 0;
|
||||
const startedAt = rows[0].ts - firstRowDur - firstRowDownload;
|
||||
const completedAt = rows[rows.length - 1].ts;
|
||||
|
||||
const tx = rows.find((r) => r.pipeline === "transcribe");
|
||||
const analyzeRows = rows.filter((r) => r.pipeline === "analyze");
|
||||
|
||||
const analyzeSuccess = analyzeRows.filter((r) => r.status === "success");
|
||||
const analyzeFailed = analyzeRows.filter((r) => r.status !== "success");
|
||||
const analyzeMs = analyzeRows.reduce(
|
||||
(s, r) => s + (Number(r.duration_ms) || 0),
|
||||
0
|
||||
);
|
||||
// Analyze wall time: elapsed clock time from when the FIRST window
|
||||
// started to when the LAST window finished. For a 1-batch parallel
|
||||
// analyze (all N windows fire concurrently), this ≈ the slowest
|
||||
// single window's duration. For multi-batch (N > concurrency, e.g.
|
||||
// 10 windows at concurrency 8 → 2 sequential batches), this spans
|
||||
// both batches including any gap. Computed from end-ts minus
|
||||
// start-ts (where start-ts = row.ts - row.duration_ms) so it's
|
||||
// an accurate measured value, not a predicted one.
|
||||
let analyzeWallMs = null;
|
||||
if (analyzeRows.length > 0) {
|
||||
let minStart = Infinity;
|
||||
let maxEnd = -Infinity;
|
||||
for (const r of analyzeRows) {
|
||||
const end = Number(r.ts) || 0;
|
||||
const dur = Number(r.duration_ms) || 0;
|
||||
const start = end - dur;
|
||||
if (start < minStart) minStart = start;
|
||||
if (end > maxEnd) maxEnd = end;
|
||||
}
|
||||
analyzeWallMs = maxEnd - minStart;
|
||||
}
|
||||
const analyzeDominantBackend = dominant(
|
||||
analyzeSuccess.map((r) => r.backend)
|
||||
);
|
||||
const analyzeDominantModel = dominant(analyzeSuccess.map((r) => r.model));
|
||||
|
||||
const errors = rows
|
||||
.filter((r) => r.error)
|
||||
.map((r) => `${r.pipeline}: ${String(r.error).slice(0, 160)}`);
|
||||
|
||||
const txStatus = tx ? tx.status : "missing";
|
||||
let overall;
|
||||
if (txStatus === "error" || txStatus === "refused" || txStatus === "missing") {
|
||||
overall = "failed";
|
||||
} else if (txStatus === "partial") {
|
||||
// TX produced a truncated transcript (chunks hit the output-token
|
||||
// cap). Mark the whole job partial regardless of analyze status —
|
||||
// the analysis was performed against incomplete input, so even
|
||||
// a "success" on analyze rows is misleading.
|
||||
overall = "partial";
|
||||
} else if (analyzeRows.length === 0) {
|
||||
// Transcribe succeeded but no analyze rows — could be in flight,
|
||||
// or the client never called /relay/analyze (uses local model).
|
||||
overall = "success";
|
||||
} else if (analyzeSuccess.length === analyzeRows.length) {
|
||||
overall = "success";
|
||||
} else if (analyzeSuccess.length > 0) {
|
||||
overall = "partial";
|
||||
} else {
|
||||
overall = "failed";
|
||||
}
|
||||
|
||||
const cost = rows.reduce((s, r) => s + (Number(r.cost_usd) || 0), 0);
|
||||
|
||||
// Use `??` (nullish-coalesce) — NOT `||` — so a legitimate 0 isn't
|
||||
// treated as missing data. The test-run worker writes duration_ms=0
|
||||
// historically (pre-fix) on cache-hit siblings; even though the new
|
||||
// worker writes a non-zero shared wall-time, old audit rows from
|
||||
// earlier benchmark batches still live in the NDJSON and we want
|
||||
// those rendered correctly rather than collapsed to "—".
|
||||
const audioSec = tx?.audio_seconds ?? null;
|
||||
const audioBytes = tx?.audio_bytes ?? null;
|
||||
const txMs = tx?.duration_ms ?? null;
|
||||
const downloadMs = tx?.download_ms ?? null;
|
||||
// TX backend compute time = sum of per-chunk wall-times. Distinct
|
||||
// from txMs which is the outer parallel-fan-out wall-time.
|
||||
// single-chunk: txMsSum ≈ txMs (one chunk, one duration)
|
||||
// N-chunks at concurrency C: txMsSum ≈ N × per-chunk-duration
|
||||
// txMs ≈ ⌈N/C⌉ × per-chunk-duration
|
||||
// Falls back to txMs (the wall-time) when chunk_durations_ms is
|
||||
// absent — old audit rows from before v0.2.41 don't have it.
|
||||
const chunkDurationsArr = Array.isArray(tx?.chunk_durations_ms) ? tx.chunk_durations_ms : null;
|
||||
const txMsSum = chunkDurationsArr
|
||||
? chunkDurationsArr.reduce((s, d) => s + (Number(d) || 0), 0)
|
||||
: txMs;
|
||||
|
||||
const audioMinutes = audioSec ? audioSec / 60 : null;
|
||||
const audioMb = audioBytes ? audioBytes / MB : null;
|
||||
|
||||
// batch_id and source are stamped per audit row by the test-run
|
||||
// path; use the first non-null we see so dashboard filters work
|
||||
// regardless of which row gets read first in a multi-row job.
|
||||
const batchId = rows.find((r) => r.batch_id)?.batch_id || null;
|
||||
const source = rows.find((r) => r.source)?.source || null;
|
||||
|
||||
return {
|
||||
job_id: key.startsWith("_orphan_") ? null : key,
|
||||
started_at: startedAt,
|
||||
completed_at: completedAt,
|
||||
install_id: tx?.install_id || rows[0].install_id || null,
|
||||
tier: tx?.tier || rows[0].tier || null,
|
||||
media_url: tx?.media_url || null,
|
||||
title: tx?.title || null,
|
||||
batch_id: batchId,
|
||||
source: source,
|
||||
audio_seconds: audioSec,
|
||||
audio_bytes: audioBytes,
|
||||
transcribe_status: txStatus,
|
||||
transcribe_backend: tx?.backend || null,
|
||||
transcribe_model: tx?.model || null,
|
||||
// transcribe_ms = outer wall-time of the whole TX phase (the
|
||||
// value the operator perceives as "how long did transcribe
|
||||
// take"). transcribe_ms_sum = total backend compute across all
|
||||
// chunks (drives cost; equals N × wall when N chunks run truly
|
||||
// sequentially, equals wall when single-chunk). For Gemini at
|
||||
// concurrency 12 over 3 chunks: wall ≈ 60s, sum ≈ 180s.
|
||||
transcribe_ms: txMs,
|
||||
transcribe_ms_sum: txMsSum,
|
||||
download_ms: downloadMs,
|
||||
chunk_count: tx?.chunk_count ?? null,
|
||||
analyze_windows_total: analyzeRows.length,
|
||||
analyze_windows_success: analyzeSuccess.length,
|
||||
analyze_windows_failed: analyzeFailed.length,
|
||||
analyze_backend: analyzeDominantBackend,
|
||||
analyze_model: analyzeDominantModel,
|
||||
// analyze_ms = SUM of per-window durations (total backend compute,
|
||||
// useful for cost). analyze_wall_time_ms = ELAPSED time from
|
||||
// first window start to last window end (the time a user actually
|
||||
// waits for the analyze phase). The two diverge when N windows
|
||||
// run in parallel: a 10-window 100s-per-window job has analyze_ms
|
||||
// = 1000s but analyze_wall_time_ms ≈ 100s (single batch) or
|
||||
// ≈ 200s (two sequential batches at concurrency 5).
|
||||
analyze_ms: analyzeMs,
|
||||
analyze_wall_time_ms: analyzeWallMs,
|
||||
overall_status: overall,
|
||||
wall_time_ms: completedAt - startedAt,
|
||||
cost_usd: cost,
|
||||
errors,
|
||||
// Derived rate metrics:
|
||||
transcribe_ms_per_min: audioMinutes && txMs ? txMs / audioMinutes : null,
|
||||
transcribe_ms_per_mb: audioMb && txMs ? txMs / audioMb : null,
|
||||
download_ms_per_mb: audioMb && downloadMs ? downloadMs / audioMb : null,
|
||||
analyze_ms_per_min: audioMinutes && analyzeMs ? analyzeMs / audioMinutes : null,
|
||||
analyze_wall_ms_per_min: audioMinutes && analyzeWallMs ? analyzeWallMs / audioMinutes : null,
|
||||
analyze_ms_per_mb: audioMb && analyzeMs ? analyzeMs / audioMb : null,
|
||||
};
|
||||
}
|
||||
|
||||
// Return the value that appears most often in `values`, or null when
// nothing countable is present. Falsy entries (null, undefined, "")
// are ignored. On a tie the value that was seen first wins, because
// the tally Map preserves insertion order and a later candidate must
// be strictly more frequent to take over. Used to attribute a single
// backend/model to a job whose rows may disagree.
function dominant(values) {
  const tally = new Map();
  for (const candidate of values) {
    if (candidate) {
      tally.set(candidate, (tally.get(candidate) ?? 0) + 1);
    }
  }

  let winner = null;
  let winnerHits = 0;
  tally.forEach((hits, candidate) => {
    if (hits > winnerHits) {
      winner = candidate;
      winnerHits = hits;
    }
  });
  return winner;
}
|
||||
|
||||
// Roll a list of aggregated jobs up into the headline numbers shown on
// the dashboard's top-of-page cards: per-status counts, success rate,
// median timings, total cost and total audio hours.
//
// Notes on the contract:
//   - An empty list reports success_rate = 1 and null medians.
//   - success_rate counts "partial" jobs as successes.
//   - Medians only consider finite numeric values; jobs missing a
//     metric are left out of that metric's median.
export function summarizeJobs(jobs) {
  const total = jobs.length;
  if (total === 0) {
    return {
      total: 0,
      success: 0,
      partial: 0,
      failed: 0,
      success_rate: 1,
      median_wall_time_ms: null,
      median_transcribe_ms_per_min: null,
      median_analyze_ms_per_min: null,
      total_cost_usd: 0,
      total_audio_hours: 0,
    };
  }

  // Number of jobs whose overall_status equals `status`.
  const countWithStatus = (status) =>
    jobs.reduce((n, job) => (job.overall_status === status ? n + 1 : n), 0);
  // Finite values of one metric across all jobs, ready for median().
  const finiteValuesOf = (field) =>
    jobs.map((job) => job[field]).filter((value) => Number.isFinite(value));

  const success = countWithStatus("success");
  const partial = countWithStatus("partial");
  const failed = countWithStatus("failed");

  let totalCost = 0;
  let totalAudioSec = 0;
  for (const job of jobs) {
    totalCost = totalCost + (job.cost_usd || 0);
    totalAudioSec = totalAudioSec + (job.audio_seconds || 0);
  }

  return {
    total,
    success,
    partial,
    failed,
    success_rate: (success + partial) / total,
    median_wall_time_ms: median(finiteValuesOf("wall_time_ms")),
    median_transcribe_ms_per_min: median(finiteValuesOf("transcribe_ms_per_min")),
    median_analyze_ms_per_min: median(finiteValuesOf("analyze_ms_per_min")),
    total_cost_usd: totalCost,
    total_audio_hours: totalAudioSec / 3600,
  };
}
|
||||
|
||||
// Median of a list of numbers; null for an empty list. The input is
// copied before sorting so the caller's array is never reordered.
// For an even count the result is the mean of the two middle values.
function median(arr) {
  const count = arr.length;
  if (!count) return null;

  const ascending = Array.from(arr);
  ascending.sort((x, y) => x - y);

  const upper = Math.floor(count / 2);
  if (count % 2 !== 0) {
    return ascending[upper];
  }
  return (ascending[upper - 1] + ascending[upper]) / 2;
}
|
||||
+219
@@ -0,0 +1,219 @@
|
||||
// In-memory background-job tracker. Used by /relay/transcribe-url
|
||||
// (and any future long-running endpoint) so the request that kicks
|
||||
// off the work returns immediately with a job_id, and the client
|
||||
// polls /relay/jobs/{id} to find out when it's done.
|
||||
//
|
||||
// Rationale: synchronous HTTP responses for multi-minute transcribes
|
||||
// are fragile. Any intermediate proxy / load balancer / NAT in the
|
||||
// path will drop the connection after some idle/total timeout (often
|
||||
// 100s–10min), failing the whole job mid-flight even though the
|
||||
// relay backend is working fine. Async jobs sidestep all of that:
|
||||
// the long-running work happens off the request path and the client
|
||||
// polls short, cheap requests until done.
|
||||
//
|
||||
// Storage is in-process memory. Restart-survivability is a known
|
||||
// gap — a relay restart mid-job loses that job's state, and the
|
||||
// client will re-poll forever until it gives up. Acceptable for v1
|
||||
// at small relay scale; the audit log already captures every
|
||||
// completed call so the operator has a paper trail either way.
|
||||
// Migrate to SQLite if/when restart-resilience becomes important.
|
||||
//
|
||||
// Each job is { id, kind, install_id, status, started_at, updated_at,
//               completed_at, progress, result, error, error_internal?,
//               metadata, events } plus a non-enumerable `subscribers`
//               Set of live event callbacks.
//   status: "queued" | "running" | "complete" | "failed"
|
||||
|
||||
import { randomUUID } from "crypto";
|
||||
import { sanitizeErrorForClient } from "./sanitize-error.js";
|
||||
|
||||
// All in-memory; lost on restart.
|
||||
const jobs = new Map();
|
||||
|
||||
// Cap how long completed jobs hang around so the map doesn't grow
|
||||
// unbounded. Once a client has polled and seen "complete", it'll
|
||||
// stop polling — keeping the record 24h gives slow / retried clients
|
||||
// a generous window without exhausting memory.
|
||||
const RETENTION_MS = 24 * 60 * 60 * 1000;
|
||||
|
||||
// Register a new job in the in-memory table and return the live record.
//
//   kind       - caller-chosen label for the type of work
//   installId  - stored on the record as install_id
//   metadata   - free-form object kept on the record (defaults to {})
//
// The record starts as "queued" with started_at === updated_at and all
// outcome fields (completed_at, progress, result, error) set to null.
// Expired jobs are pruned first so the table cannot grow without bound.
export function createJob({ kind, installId, metadata = {} }) {
  pruneExpired();

  const id = randomUUID();
  const createdAt = Date.now();

  // `events` is the replayable log: each entry is { type, data, ts }
  // so a client that connects late can catch up on what it missed.
  const record = {
    id,
    kind,
    install_id: installId,
    status: "queued",
    started_at: createdAt,
    updated_at: createdAt,
    completed_at: null,
    progress: null,
    result: null,
    error: null,
    metadata,
    events: [],
  };

  // Live subscriber callbacks sit on a non-enumerable, non-writable
  // property so they are skipped by JSON serialization and by any
  // code that enumerates the job's own fields.
  Object.defineProperty(record, "subscribers", {
    value: new Set(),
    enumerable: false,
    writable: false,
  });

  jobs.set(id, record);
  return record;
}
|
||||
|
||||
// Record an event on a job and fan it out to every live subscriber.
// Unknown job ids are ignored.
//
// The stored/broadcast event is { type, data, ts } where ts is the
// ms-epoch time of the append; the job's updated_at is bumped to the
// same instant. Subscribers receive only the new event — anyone who
// needs history must replay job.events themselves.
export function appendEvent(jobId, type, data) {
  const job = jobs.get(jobId);
  if (!job) return;

  const stamp = Date.now();
  const event = { type, data, ts: stamp };
  job.events.push(event);
  job.updated_at = stamp;

  // Keep at most 1000 entries so a runaway job cannot exhaust memory;
  // the oldest entry is dropped once the cap is exceeded.
  if (job.events.length > 1000) {
    job.events.shift();
  }

  // One misbehaving subscriber must not stop delivery to the rest.
  job.subscribers.forEach((notify) => {
    try {
      notify(event);
    } catch (err) {
      console.warn(`[jobs] subscriber callback failed: ${err?.message || err}`);
    }
  });
}
|
||||
|
||||
// Attach `callback` to a job's live event stream.
//
// Returns an unsubscribe function, or null when the job id is not in
// the table. The caller MUST invoke the unsubscribe function when it
// is done (e.g. when the SSE connection closes); otherwise the
// callback closure stays referenced by the job for as long as the job
// record lives.
export function subscribeToJob(jobId, callback) {
  const target = jobs.get(jobId);
  if (target) {
    target.subscribers.add(callback);
    return function unsubscribe() {
      target.subscribers.delete(callback);
    };
  }
  return null;
}
|
||||
|
||||
// Look up a job by id, pruning expired records first. Returns the live
// job record, or null when no such job is stored.
export function getJob(jobId) {
  pruneExpired();
  const found = jobs.get(jobId);
  return found ? found : null;
}
|
||||
|
||||
// Flip a job to "running" and refresh its updated_at timestamp.
// Unknown job ids are ignored.
export function markRunning(jobId) {
  const job = jobs.get(jobId);
  if (job) {
    Object.assign(job, { status: "running", updated_at: Date.now() });
  }
}
|
||||
|
||||
// Store a human-readable progress message on a job and refresh its
// updated_at timestamp. The message is stringified and truncated to
// 200 characters. Unknown job ids are ignored.
export function setProgress(jobId, message) {
  const job = jobs.get(jobId);
  if (job) {
    const note = String(message);
    job.progress = note.slice(0, 200);
    job.updated_at = Date.now();
  }
}
|
||||
|
||||
// Mark a job as complete, store its result, and emit the terminal
// "done" event. Unknown job ids are ignored.
//
// `envelope` is stored on job.result exactly as passed (typically
// { result: {...inner...}, credit_charged, tier }), so internal
// consumers that read job.result keep seeing the wrapped form.
//
// The "done" event, however, carries the INNER result: when the
// envelope is an object with a `result` key that value is unwrapped,
// otherwise the envelope is passed through unchanged. This lets an SSE
// consumer read `data.result.title` rather than
// `data.result.result.title`. credit_charged and tier ride along as
// siblings of `result` so the consumer can refresh its balance display
// without digging into the result body.
export function markComplete(jobId, envelope) {
  const job = jobs.get(jobId);
  if (!job) return;

  job.status = "complete";
  job.result = envelope;
  const finishedAt = Date.now();
  job.completed_at = finishedAt;
  job.updated_at = finishedAt;

  let payload = envelope;
  if (envelope && typeof envelope === "object" && "result" in envelope) {
    payload = envelope.result;
  }

  appendEvent(jobId, "done", {
    result: payload,
    credit_charged: envelope?.credit_charged,
    tier: envelope?.tier,
  });
}
|
||||
|
||||
// Mark a job as failed and emit the terminal "error" event. Unknown
// job ids are ignored.
//
// Two versions of the message are kept, each capped at 500 characters:
//   job.error_internal - the raw message (falls back to
//                        "unknown error" when none is given)
//   job.error          - the same message passed through
//                        sanitizeErrorForClient
// Sanitizing here, at the single point where failures are recorded,
// means every reader of job.error gets the client-safe wording without
// each call site having to remember to sanitize. Only the sanitized
// text is sent in the "error" event.
export function markFailed(jobId, errorMessage) {
  const job = jobs.get(jobId);
  if (!job) return;

  job.status = "failed";

  const internalText = String(errorMessage || "unknown error").slice(0, 500);
  job.error_internal = internalText;
  job.error = sanitizeErrorForClient(internalText).slice(0, 500);

  const failedAt = Date.now();
  job.completed_at = failedAt;
  job.updated_at = failedAt;

  appendEvent(jobId, "error", { error: job.error });
}
|
||||
|
||||
// Produce a summary of every stored job (after pruning expired ones).
//
// Each entry is a fresh plain object holding only the listed fields;
// the result payload itself is replaced by a has_result boolean, and
// the event log and subscriber set are not included. Both error
// variants are present: `error` is the sanitized text, while
// `error_internal` is the raw message, falling back to `error` when no
// raw message was recorded.
export function snapshotJobs() {
  pruneExpired();

  const rows = [];
  for (const job of jobs.values()) {
    const {
      id,
      kind,
      install_id,
      status,
      started_at,
      updated_at,
      completed_at,
      progress,
      result,
      error,
      error_internal,
    } = job;
    rows.push({
      id,
      kind,
      install_id,
      status,
      started_at,
      updated_at,
      completed_at,
      progress,
      has_result: result != null,
      error,
      error_internal: error_internal || error,
    });
  }
  return rows;
}
|
||||
|
||||
// Drop every job whose most recent timestamp is older than
// RETENTION_MS. The reference time is completed_at when set, otherwise
// updated_at, otherwise started_at — so a job that never finished is
// also removed once it has gone untouched for the retention window.
function pruneExpired() {
  const cutoff = Date.now() - RETENTION_MS;
  jobs.forEach((job, id) => {
    const lastTouched = job.completed_at || job.updated_at || job.started_at;
    if (lastTouched && lastTouched < cutoff) {
      jobs.delete(id);
    }
  });
}
|
||||
@@ -0,0 +1,376 @@
|
||||
// Phase 2 of Path 2A — meeting extras analysis.
|
||||
//
|
||||
// Runs a single LLM pass AFTER transcribe → diarize → cluster →
// analyze → name-inference → summary-polish complete. Pulls out five
// categories of structured information that operators consistently
// want at the top of an internal meeting recap:
//
// - tldr : 2-4 sentence executive summary plus the primary speakers
// - decisions : what was agreed on (with the offset where it was settled)
// - action_items : who owes what, by when (best-effort due_hint)
// - open_questions : questions raised that didn't get resolved
// - key_quotes : notable statements worth surfacing verbatim
|
||||
//
|
||||
// Each item carries a `supporting_offset` (or `offset`) in seconds so
|
||||
// the dashboard can render the timestamp as a clickable jump to the
|
||||
// corresponding transcript line. Each item also carries speaker IDs
|
||||
// (cluster ids like Speaker_A) so the renderer can show the speaker's
|
||||
// colored chip + display name, and so an operator-rename or per-line
|
||||
// override propagates here too.
|
||||
//
|
||||
// Returns:
// {
// tldr: { summary, primary_speakers[] } | null,
// decisions: [{ statement, agreed_by[], supporting_offset }],
// action_items: [{ description, owner, due_hint, supporting_offset }],
// open_questions: [{ question, raised_by, answered }],
// key_quotes: [{ speaker, offset, quote, why_notable }],
// }
|
||||
//
|
||||
// or null on total failure. Failure is non-fatal — the meeting still
|
||||
// saves with rec.extras = null and the dashboard just hides the
|
||||
// extras section.
|
||||
|
||||
import { recordCall } from "./audit-log.js";
|
||||
|
||||
const EXTRAS_MAX_ATTEMPTS = 3;
|
||||
|
||||
export const DEFAULT_MEETING_EXTRAS_PROMPT_TEMPLATE = `You are extracting structured information from an internal team meeting transcript. The transcript below has been pre-tagged with speaker labels like [A], [B], [C] (anonymous voice-clustering labels) and inferred real names where available.
|
||||
|
||||
MEETING METADATA:
|
||||
- Title: {{title}}
|
||||
- Duration: {{duration}}
|
||||
|
||||
{{operatorContext}}SPEAKERS (from voice clustering, with operator-confirmed names where present):
|
||||
{{speakerRoster}}
|
||||
|
||||
TOPIC SUMMARIES (already produced — for context only, do not duplicate):
|
||||
{{topics}}
|
||||
|
||||
TRANSCRIPT (each line is "[<letter> <MM:SS>] text"):
|
||||
{{transcript}}
|
||||
|
||||
INSTRUCTIONS:
|
||||
Extract FIVE categories of information from the meeting. Return EMPTY ARRAYS for categories that don't apply — do NOT invent items.
|
||||
|
||||
1. TLDR — A 2-4 sentence executive summary of the entire meeting: what it was about, the key discussion arc, and the bottom-line outcome. Write in past tense, third person. Keep it dense — every clause should carry information. Skip pleasantries and procedural opening/closing chatter. If a meeting was genuinely substanceless (a 3-minute check-in, audio test, etc.), write one factual descriptor sentence instead of padding. This is the only required category — even the most trivial meeting gets a one-sentence TLDR.
|
||||
- summary: the 2-4 sentence executive summary
|
||||
- primary_speakers: array of Speaker_X ids who drove the conversation (the 1-3 people most central to the discussion, in rough order of contribution). Empty array if unclear.
|
||||
|
||||
2. DECISIONS — Things explicitly decided / agreed during the meeting. Include only clear commitments ("we will do X", "let's go with Y"), not casual mentions. For each:
|
||||
- statement: the decision in one sentence
|
||||
- agreed_by: array of Speaker_X ids who explicitly agreed (use the chip-letter notation, e.g. ["Speaker_A", "Speaker_C"]). Empty array if unclear.
|
||||
- supporting_offset: integer SECONDS where this decision was made (use the [<letter> <MM:SS>] timestamp from the most relevant transcript line — convert MM:SS to total seconds)
|
||||
|
||||
3. ACTION_ITEMS — Specific commitments where someone said they would do something. Include only explicit ownership ("I'll send the doc", "Matt will follow up"), not vague "someone should...". For each:
|
||||
- description: the action in imperative form
|
||||
- owner: the Speaker_X id of the person taking it on (e.g. "Speaker_A"), or null if unclear
|
||||
- due_hint: the deadline as a string if mentioned ("by Friday", "end of week", "before next call"), or null
|
||||
- supporting_offset: integer seconds where the commitment was made
|
||||
|
||||
4. OPEN_QUESTIONS — Questions raised that were NOT clearly answered during the meeting. Skip rhetorical questions and questions that got direct answers. For each:
|
||||
- question: the question, rephrased to be self-contained
|
||||
- raised_by: the Speaker_X id who asked (or null if unclear)
|
||||
- answered: false (always — if it was answered, don't include it)
|
||||
|
||||
5. KEY_QUOTES — Statements worth surfacing verbatim because they are pivotal, particularly insightful, or capture a strong opinion. Limit to 3-6 quotes max. Skip filler and conversational text. For each:
|
||||
- speaker: the Speaker_X id of the speaker
|
||||
- offset: integer seconds where the quote occurs
|
||||
- quote: the verbatim quote (trim to the substantive sentence, 4-30 words)
|
||||
- why_notable: one short clause on why this is worth surfacing
|
||||
|
||||
Be conservative across all five. Better to return an empty array (or for TLDR, a single factual sentence) than to fabricate. A 5-minute small-talk call may legitimately have 0 decisions, 0 action items, 0 open questions, 0 key quotes — but it still gets a TLDR.
|
||||
|
||||
Respond with ONLY valid JSON in this exact shape, no other text:
|
||||
{
|
||||
"tldr": {"summary": "...", "primary_speakers": ["Speaker_A", "Speaker_B"]},
|
||||
"decisions": [{"statement": "...", "agreed_by": ["Speaker_A"], "supporting_offset": 123}],
|
||||
"action_items": [{"description": "...", "owner": "Speaker_B", "due_hint": "by Friday", "supporting_offset": 234}],
|
||||
"open_questions": [{"question": "...", "raised_by": "Speaker_C", "answered": false}],
|
||||
"key_quotes": [{"speaker": "Speaker_A", "offset": 345, "quote": "...", "why_notable": "..."}]
|
||||
}`;
|
||||
|
||||
// Substitute {{name}} placeholders (optional whitespace inside the
// braces) in `template` with the matching entries of `vars`.
//
//   template - prompt text; null/undefined/"" yields ""
//   vars     - plain object of replacement values; each value is
//              stringified. A missing `vars` is treated as {}.
//
// Placeholders with no matching entry are left in the output as
// {{name}} so a typo in a template stays visible instead of vanishing.
//
// Only OWN properties of `vars` count as matches. Templates can come
// from an operator-supplied prompt override, and a plain `in` check
// would also match inherited names such as {{toString}} or
// {{constructor}} and splice native-function source into the prompt.
function fillTemplate(template, vars) {
  const values = vars ?? {};
  const hasOwn = (key) => Object.prototype.hasOwnProperty.call(values, key);
  return String(template || "").replace(/\{\{\s*(\w+)\s*\}\}/g, (_match, key) => {
    return hasOwn(key) ? String(values[key]) : `{{${key}}}`;
  });
}
|
||||
|
||||
// Render a duration in seconds as "Hh Mm Ss", "Mm Ss" or "Ss",
// omitting leading units that are zero. Fractional seconds are floored
// and negative or missing input is treated as 0.
function formatDuration(seconds) {
  const whole = Math.max(0, Math.floor(seconds || 0));
  const secPart = whole % 60;
  const minPart = Math.floor((whole % 3600) / 60);
  const hourPart = Math.floor(whole / 3600);

  // Build right-to-left: seconds always, minutes when minutes or hours
  // are non-zero, hours only when non-zero.
  const pieces = [`${secPart}s`];
  if (hourPart > 0 || minPart > 0) {
    pieces.unshift(`${minPart}m`);
  }
  if (hourPart > 0) {
    pieces.unshift(`${hourPart}h`);
  }
  return pieces.join(" ");
}
|
||||
|
||||
// Render transcript segments as one line each, in the form
// "[<letter> <M:SS>] text", joined with newlines.
//
//   segments - array of { speaker, start, text }. `speaker` is expected
//              to look like "Speaker_A"; anything else is shown as "?".
//              `start` is a time in seconds; minutes are not wrapped at
//              60, so 3725s renders as "62:05".
//
// Returns "" when `segments` is not a non-empty array. Segments whose
// text is empty after trimming are skipped.
//
// Hardening over the naive version: a null/non-object entry is skipped
// and non-string text is stringified, so one malformed segment cannot
// throw and lose the whole transcript; a negative or non-finite start
// is clamped to 0:00 instead of producing output like "-1:-5".
function formatLabeledTranscript(segments) {
  if (!Array.isArray(segments) || segments.length === 0) return "";
  const lines = [];
  for (const seg of segments) {
    if (!seg || typeof seg !== "object") continue;
    const text = String(seg.text || "").trim();
    if (!text) continue;

    const startNum = Number(seg.start);
    const secInt = Number.isFinite(startNum) && startNum > 0 ? Math.floor(startNum) : 0;
    const mm = Math.floor(secInt / 60);
    const ss = secInt % 60;

    const m = String(seg.speaker || "").match(/^Speaker_([A-Z]+)$/);
    const letter = m ? m[1] : "?";

    lines.push(`[${letter} ${mm}:${String(ss).padStart(2, "0")}] ${text}`);
  }
  return lines.join("\n");
}
|
||||
|
||||
// Shorten an over-long transcript to at most `maxChars` characters by
// keeping its head and tail and replacing the middle with a marker.
// The opening (introductions, agenda) and closing (wrap-up, next
// steps) are kept because that is where most extras-worthy content
// lives.
//
//   text     - transcript text (non-strings are stringified)
//   maxChars - character budget; when it is not a finite number no cap
//              is applied and the text is returned unchanged
//
// Each kept half is floor(maxChars / 2) - 50 characters, which leaves
// room for the marker inside the budget. When the budget is too small
// for that (half would be <= 0) the text is simply cut to its first
// `maxChars` characters. Without this guard, slice(-0) returns the
// whole text and a negative half inverts both slices, so the "capped"
// result comes out longer than the input.
function capTranscript(text, maxChars) {
  const source = typeof text === "string" ? text : String(text ?? "");
  if (!Number.isFinite(maxChars)) return source;
  if (source.length <= maxChars) return source;

  const half = Math.floor(maxChars / 2) - 50;
  if (half <= 0) {
    return source.slice(0, Math.max(0, Math.floor(maxChars)));
  }
  return (
    source.slice(0, half) +
    "\n\n…[middle truncated for prompt length]…\n\n" +
    source.slice(-half)
  );
}
|
||||
|
||||
// Parse and sanitize the LLM's meeting-extras response.
//
//   text - raw model output: JSON, optionally wrapped in a ``` or
//          ```json fence (the first fenced block is used when present)
//
// Returns null when `text` is empty / not a string, is not valid JSON,
// or does not parse to an object. Otherwise returns
//   { tldr, decisions, action_items, open_questions, key_quotes }
// where each list is validated entry by entry: malformed entries are
// dropped rather than failing the whole pass, text fields are trimmed
// and length-capped, speaker ids must match Speaker_<LETTERS>, offsets
// are floored to non-negative integers (null when not a finite
// number), and each list is capped in size. `tldr` is a single object,
// or null when the model omitted it or its summary is blank.
function safeParseExtras(text) {
  if (!text || typeof text !== "string") return null;

  const trimmedInput = text.trim();
  const fenced = trimmedInput.match(/```(?:json)?\s*([\s\S]*?)```/);
  const candidate = fenced ? fenced[1].trim() : trimmedInput;

  let parsed;
  try {
    parsed = JSON.parse(candidate);
  } catch {
    return null;
  }
  if (!parsed || typeof parsed !== "object") return null;

  // --- field-level validators -------------------------------------
  const isSpeakerId = (v) => typeof v === "string" && /^Speaker_[A-Z]+$/.test(v);
  const speakerOrNull = (v) => (isSpeakerId(v) ? v : null);
  const speakerList = (v, cap) =>
    Array.isArray(v) ? v.filter((x) => isSpeakerId(x)).slice(0, cap) : [];
  const wholeSeconds = (v) => (Number.isFinite(v) ? Math.max(0, Math.floor(v)) : null);
  const trimmedText = (v) => (typeof v === "string" ? v.trim() : "");

  // Run `build` over every object entry of `raw` (non-arrays count as
  // empty), keep the non-null results, and cap the list at `cap`.
  const collect = (raw, cap, build) => {
    const kept = [];
    for (const entry of Array.isArray(raw) ? raw : []) {
      if (!entry || typeof entry !== "object") continue;
      const item = build(entry);
      if (item) kept.push(item);
    }
    return kept.slice(0, cap);
  };

  // --- tldr: exactly one object, never an array ----------------------
  let tldr = null;
  const rawTldr = parsed.tldr;
  if (rawTldr && typeof rawTldr === "object" && !Array.isArray(rawTldr)) {
    const summary = trimmedText(rawTldr.summary);
    if (summary) {
      tldr = {
        summary: summary.slice(0, 800),
        primary_speakers: speakerList(rawTldr.primary_speakers, 5),
      };
    }
  }

  const decisions = collect(parsed.decisions, 20, (d) => {
    const statement = trimmedText(d.statement);
    if (!statement) return null;
    return {
      statement: statement.slice(0, 400),
      agreed_by: speakerList(d.agreed_by, 10),
      supporting_offset: wholeSeconds(d.supporting_offset),
    };
  });

  const action_items = collect(parsed.action_items, 30, (a) => {
    const description = trimmedText(a.description);
    if (!description) return null;
    return {
      description: description.slice(0, 400),
      owner: speakerOrNull(a.owner),
      // A blank or non-string hint collapses to null.
      due_hint: trimmedText(a.due_hint).slice(0, 80) || null,
      supporting_offset: wholeSeconds(a.supporting_offset),
    };
  });

  const open_questions = collect(parsed.open_questions, 20, (q) => {
    const question = trimmedText(q.question);
    if (!question) return null;
    return {
      question: question.slice(0, 400),
      raised_by: speakerOrNull(q.raised_by),
      // Only a literal boolean true counts as answered.
      answered: q.answered === true,
    };
  });

  const key_quotes = collect(parsed.key_quotes, 10, (q) => {
    const quote = trimmedText(q.quote);
    if (!quote) return null;
    return {
      speaker: speakerOrNull(q.speaker),
      offset: wholeSeconds(q.offset),
      quote: quote.slice(0, 400),
      why_notable: trimmedText(q.why_notable).slice(0, 200),
    };
  });

  return { tldr, decisions, action_items, open_questions, key_quotes };
}
|
||||
|
||||
// Run the "meeting extras" LLM pass for one job.
//
// Renders a prompt from promptOverride (when it is a non-blank string)
// or DEFAULT_MEETING_EXTRAS_PROMPT_TEMPLATE, filled with the title,
// duration, speaker roster, topic list, the transcript capped at 25000
// and an optional OPERATOR HINTS block. The backend call is retried up
// to EXTRAS_MAX_ATTEMPTS times until safeParseExtras() accepts the
// response. One audit row is written through recordCall() whether the
// pass succeeded or not.
//
// Returns the parsed extras object, or null when there is no backend,
// no transcript segments, or every attempt failed.
export async function runMeetingExtras({
  title,
  audioSec,
  speakers,
  speakerNames,
  transcriptSegments,
  topics, // array of { title, summary, startTime } from analyze-then-polish
  promptOverride = "",
  // Operator-supplied hints (internal meetings only). participantHints
  // is a CSV-ish string of expected attendees; operatorNotes is free-
  // form prose describing who-said-what. Both are framed as soft hints
  // in the rendered prompt. When both are blank no OPERATOR HINTS block
  // is emitted.
  participantHints = "",
  operatorNotes = "",
  backend,
  pipelineBackend,
  jobId,
  installId,
  licenseFingerprint = null,
  source,
  computeCostDetails,
}) {
  if (!backend) return null;
  const hasSegments = Array.isArray(transcriptSegments) && transcriptSegments.length > 0;
  if (!hasSegments) return null;

  // Speaker roster: one line per Speaker_X key, sorted, e.g.
  //   - Speaker_A (chip [A], 12m 34s speaking, 40 turns): "Matt Hill"
  const rosterLines = [];
  for (const key of Object.keys(speakers || {}).sort()) {
    if (!/^Speaker_[A-Z]+$/.test(key)) continue;
    const stats = speakers[key] || {};
    const totalSec = Math.round(stats.total_speaking_seconds || 0);
    const wholeMin = Math.floor(totalSec / 60);
    const leftoverSec = totalSec % 60;
    const spoken = wholeMin > 0 ? `${wholeMin}m ${leftoverSec}s` : `${leftoverSec}s`;
    const chip = key.replace("Speaker_", "");
    const displayName =
      speakerNames && speakerNames[key] ? `"${speakerNames[key]}"` : "(unknown)";
    rosterLines.push(
      `- ${key} (chip [${chip}], ${spoken} speaking, ${stats.turns || 0} turns): ${displayName}`
    );
  }
  const speakerRoster = rosterLines.join("\n");

  // Numbered topic list with an M:SS start stamp per topic.
  let topicsBlock = "(no topics)";
  if (Array.isArray(topics) && topics.length) {
    topicsBlock = topics
      .map((topic, idx) => {
        const startSec = topic.startTime || 0;
        const minutes = Math.floor(startSec / 60);
        const seconds = Math.floor(startSec % 60);
        const stamp = `${minutes}:${String(seconds).padStart(2, "0")}`;
        return `${idx + 1}. [${stamp}] ${topic.title || "(untitled)"} — ${topic.summary || ""}`;
      })
      .join("\n");
  }

  const cappedTranscript = capTranscript(formatLabeledTranscript(transcriptSegments), 25000);

  // OPERATOR HINTS block; stays an empty string when neither hint is set.
  const hintsParts = [];
  const participantsText = participantHints ? String(participantHints).trim() : "";
  if (participantsText) {
    hintsParts.push(
      `Possible participants in this meeting (operator-supplied — may be incomplete):\n${participantsText}`,
    );
  }
  const notesText = operatorNotes ? String(operatorNotes).trim() : "";
  if (notesText) {
    // Notes are capped at 4000 characters before going into the prompt.
    const trimmed = notesText.slice(0, 4000);
    hintsParts.push(
      `Operator notes (may describe who said what — use as soft context, verify against the transcript before extracting decisions / action items / quotes):\n${trimmed}`,
    );
  }
  let operatorContextBlock = "";
  if (hintsParts.length) {
    operatorContextBlock = `OPERATOR HINTS (treat as suggestions only — verify against the transcript):\n\n${hintsParts.join("\n\n")}\n\n`;
  }

  const hasOverride = typeof promptOverride === "string" && promptOverride.trim();
  const prompt = fillTemplate(
    hasOverride ? promptOverride : DEFAULT_MEETING_EXTRAS_PROMPT_TEMPLATE,
    {
      title: title || "(untitled)",
      duration: formatDuration(audioSec),
      operatorContext: operatorContextBlock,
      speakerRoster: speakerRoster || "(no speakers identified)",
      topics: topicsBlock,
      transcript: cappedTranscript || "(empty)",
    }
  );

  const startedAt = Date.now();
  let r = null;
  let parsed = null;
  let lastErr = null;
  for (let attemptNo = 1; attemptNo <= EXTRAS_MAX_ATTEMPTS; attemptNo += 1) {
    try {
      r = await backend.analyzeText({ prompt });
      parsed = safeParseExtras(r.text);
      lastErr = parsed ? null : "invalid JSON in extras response";
    } catch (err) {
      lastErr = (err?.message || String(err)).slice(0, 280);
      // A thrown attempt leaves no response to attribute model/usage to.
      r = null;
    }
    if (parsed) break;
    if (attemptNo < EXTRAS_MAX_ATTEMPTS) {
      console.warn(
        `[meeting-extras] attempt ${attemptNo} failed (${lastErr}) — retrying`
      );
    }
  }
  const dur = Date.now() - startedAt;

  // Cost is only computed for a successful pass; failures record zeros.
  let cost = { input_tokens: 0, output_tokens: 0, thinking_tokens: 0, cost_usd: 0 };
  if (parsed && r) {
    cost = computeCostDetails(r.model, r.usage);
  }
  await recordCall({
    install_id: installId,
    license_fingerprint: licenseFingerprint,
    tier: "core",
    pipeline: "meeting_extras",
    backend: pipelineBackend,
    model: r?.model || null,
    status: parsed ? "success" : "error",
    duration_ms: dur,
    audio_seconds: 0,
    job_id: jobId,
    batch_id: null,
    source,
    media_url: null,
    error: parsed ? null : lastErr || "extras analysis failed",
    ...cost,
  });

  if (!parsed) {
    console.warn(
      `[meeting-extras] all ${EXTRAS_MAX_ATTEMPTS} attempts failed (${lastErr}) — extras unavailable`
    );
    return null;
  }
  console.log(
    `[meeting-extras] extracted ${parsed.tldr ? "tldr + " : "(no tldr) + "}${parsed.decisions.length} decision(s), ${parsed.action_items.length} action(s), ${parsed.open_questions.length} question(s), ${parsed.key_quotes.length} quote(s) in ${(dur / 1000).toFixed(1)}s`
  );
  return parsed;
}
|
||||
@@ -0,0 +1,359 @@
|
||||
// Post-hoc speaker edits for saved internal meetings.
|
||||
//
|
||||
// Two operator tools that mutate a saved meeting record in place,
|
||||
// without re-uploading audio or hitting Spark Control:
|
||||
//
|
||||
// mergeSpeakersInRecord — fold one or more clusters that diarization
|
||||
// mistakenly split apart into a single speaker.
|
||||
// reclusterMeetingRecord — re-run the cross-chunk voice clustering at a
|
||||
// new strictness threshold to separate two
|
||||
// people who were over-merged into one cluster.
|
||||
// Pure offline re-clustering off the persisted
|
||||
// per-chunk fingerprints (rec.diarization).
|
||||
//
|
||||
// Both must keep the FOUR places a speaker label lives in sync:
|
||||
// 1. rec.transcript_segments[].speaker
|
||||
// 2. rec.chunks[].entries[].speaker (+ .speaker_override)
|
||||
// 3. rec.speakers (per-cluster stats map)
|
||||
// 4. rec.extras (tldr.primary_speakers, decisions.agreed_by,
|
||||
// action_items.owner, key_quotes.speaker)
|
||||
// plus rec.speaker_names (display-name map).
|
||||
|
||||
import {
|
||||
clusterSpeakers,
|
||||
assignSpeakersToSegments,
|
||||
} from "./speaker-clustering.js";
|
||||
|
||||
// ─── Entry speaker backfill ─────────────────────────────────────────
// Re-derive each chunk entry's speaker from rec.transcript_segments by
// timestamp. Used (a) on load to repair pre-diarization records and
// (b) after a re-cluster re-stamps the segments. By default it only
// fills entries that LACK a speaker (the load-path use); pass
// { force: true } to re-stamp every entry (the re-cluster use, after
// the old labels have been cleared).
//
// Matching order for an entry at offset t:
//   1. the first segment whose floored start equals t,
//   2. a labeled segment containing t within ±0.5s,
//   3. the nearest labeled segment starting at or before t, within 5s.
// Entries with no match are left as they are.
//
// Null / malformed chunks, entries and segments are skipped instead of
// being dereferenced, so one bad row in a saved record cannot abort the
// whole backfill. Mutates rec in place; returns nothing.
export function backfillEntrySpeakers(rec, { force = false } = {}) {
  if (!rec || !Array.isArray(rec.chunks) || !Array.isArray(rec.transcript_segments)) {
    return;
  }

  // Only chunks that actually carry an entries array take part.
  const entryLists = rec.chunks
    .filter((chunk) => chunk && Array.isArray(chunk.entries))
    .map((chunk) => chunk.entries);

  if (!force) {
    // Cheap early exit: nothing to do when every real entry is labeled.
    const needsBackfill = entryLists.some((entries) =>
      entries.some((e) => e && !e.speaker)
    );
    if (!needsBackfill) return;
  }

  // filter() already returns a copy, so sorting does not reorder the
  // record's own segment array.
  const segs = rec.transcript_segments
    .filter(Boolean)
    .sort((a, b) => (a.start || 0) - (b.start || 0));

  // First segment seen for each whole-second start.
  const byFlooredStart = new Map();
  for (const seg of segs) {
    const k = Math.floor(seg.start || 0);
    if (!byFlooredStart.has(k)) byFlooredStart.set(k, seg);
  }

  const pickSpeaker = (t) => {
    const exact = byFlooredStart.get(t);
    if (exact && exact.speaker) return exact;

    // Containing segment, with half a second of slack on both edges.
    for (const seg of segs) {
      const start = seg.start || 0;
      if (start > t + 5) break;
      if (start - 0.5 <= t && t <= (seg.end || 0) + 0.5 && seg.speaker) {
        return seg;
      }
    }

    // Nearest labeled segment that starts at or before t.
    let bestPrev = null;
    let bestDist = Infinity;
    for (const seg of segs) {
      const start = seg.start || 0;
      if (start > t) break;
      const dist = t - start;
      if (dist < bestDist && seg.speaker) {
        bestDist = dist;
        bestPrev = seg;
      }
    }
    return bestPrev && bestDist <= 5 ? bestPrev : null;
  };

  for (const entries of entryLists) {
    for (const entry of entries) {
      if (!entry) continue;
      if (!force && entry.speaker) continue;
      const found = pickSpeaker(entry.offset || 0);
      if (found && found.speaker) {
        entry.speaker = found.speaker;
        entry.speaker_confidence = found.speaker_confidence ?? null;
        entry.speaker_uncertain = !!found.speaker_uncertain;
      }
    }
  }
}
|
||||
|
||||
// ─── Merge speakers ─────────────────────────────────────────────────
// Fold each cluster in `absorbed` into `survivor`. Rewrites every label
// reference, sums the stats, inherits the absorbed display name only
// when the survivor has none, and rewrites extras attributions.
// Remaining letters are intentionally NOT renumbered — that would
// cascade through speaker_names + per-line overrides for no real gain.
//
// Speaker ids are validated as OWN keys of rec.speakers. A plain
// `speakers[id]` lookup also resolves inherited names such as
// "constructor" or "__proto__", which would let a bogus id through and
// copy an inherited function into speaker_names.
//
// Returns { changed, speakers, speaker_names }, where `changed` counts
// rewritten segment / entry / override labels. Throws a BAD_REQUEST
// error (code on err) on invalid input. Mutates rec in place.
export function mergeSpeakersInRecord(rec, survivor, absorbed) {
  if (!rec || typeof rec !== "object") {
    throw badRequest("record required");
  }
  const speakers = rec.speakers && typeof rec.speakers === "object" ? rec.speakers : {};
  // Duplicates in the request collapse to a single merge each.
  const absorbList = Array.isArray(absorbed) ? [...new Set(absorbed)] : [];
  const isSpeaker = (id) =>
    Object.prototype.hasOwnProperty.call(speakers, id) && Boolean(speakers[id]);

  if (typeof survivor !== "string" || !isSpeaker(survivor)) {
    throw badRequest("survivor must be an existing speaker id");
  }
  if (absorbList.length === 0) {
    throw badRequest("absorbed must list at least one speaker id");
  }
  for (const x of absorbList) {
    if (x === survivor) throw badRequest("cannot merge a speaker into itself");
    if (!isSpeaker(x)) throw badRequest(`unknown speaker id: ${x}`);
  }
  // From here on the survivor is a real roster entry and is guaranteed
  // not to be in the absorbed set, so it always remains after the merge.

  const absorbedSet = new Set(absorbList);
  let changed = 0;

  // 1. transcript_segments
  for (const seg of rec.transcript_segments || []) {
    if (seg && absorbedSet.has(seg.speaker)) {
      seg.speaker = survivor;
      changed += 1;
    }
  }

  // 2. chunk entries (+ per-line overrides)
  for (const chunk of rec.chunks || []) {
    if (!chunk) continue;
    for (const entry of chunk.entries || []) {
      if (!entry) continue;
      if (absorbedSet.has(entry.speaker)) {
        entry.speaker = survivor;
        changed += 1;
      }
      if (absorbedSet.has(entry.speaker_override)) {
        entry.speaker_override = survivor;
        changed += 1;
      }
    }
  }

  // 3. stats + display name
  rec.speaker_names = rec.speaker_names && typeof rec.speaker_names === "object"
    ? rec.speaker_names
    : {};
  for (const x of absorbList) {
    mergeStats(speakers[survivor], speakers[x]);
    delete speakers[x];
    // Survivor inherits the absorbed name only if it has none of its own.
    if (!rec.speaker_names[survivor] && rec.speaker_names[x]) {
      rec.speaker_names[survivor] = rec.speaker_names[x];
    }
    if (x in rec.speaker_names) delete rec.speaker_names[x];
  }

  // 4. extras attributions
  remapExtrasSpeakers(rec.extras, (id) => (absorbedSet.has(id) ? survivor : id));

  rec.meta = rec.meta || {};
  rec.meta.speakers_merged_at = Date.now();

  return { changed, speakers: rec.speakers, speaker_names: rec.speaker_names };
}
|
||||
|
||||
// ─── Re-cluster (re-run diarization) ────────────────────────────────
// Re-runs cross-chunk clustering from the per-chunk fingerprints stored
// on rec.diarization, using opts.threshold plus the optional knobs
// anchorMinSpeakingSec / smallClusterMaxSpeakingSec / uncertainMarginPct.
// Every segment and chunk entry is then re-stamped, and the attribution
// data that no longer matches the new clusters (display names, per-line
// overrides, extras speaker tags) is reset so the operator can re-label
// from scratch.
//
// Returns { speakers, clusterCount, threshold }. Throws BAD_REQUEST when
// rec is not an object, and NO_FINGERPRINTS (code on err) when no
// diarization row with `ok` set carries any fingerprints.
export function reclusterMeetingRecord(rec, opts = {}) {
  if (!rec || typeof rec !== "object") throw badRequest("record required");

  const diar = Array.isArray(rec.diarization) ? rec.diarization : [];
  let fingerprintTotal = 0;
  for (const row of diar) {
    if (row && row.ok) {
      fingerprintTotal += Object.keys(row.fingerprints || {}).length;
    }
  }
  if (fingerprintTotal === 0) {
    throw Object.assign(
      new Error(
        "this meeting has no saved voice fingerprints — it predates fingerprint capture or was processed with diarization off, so it can't be re-clustered"
      ),
      { code: "NO_FINGERPRINTS" }
    );
  }

  const threshold = opts.threshold;
  const tuning = {
    anchorMinSpeakingSec: opts.anchorMinSpeakingSec,
    smallClusterMaxSpeakingSec: opts.smallClusterMaxSpeakingSec,
    uncertainMarginPct: opts.uncertainMarginPct,
  };
  const clustered = clusterSpeakers(diar, threshold, tuning);
  const { globalMap, uncertaintyMap, speakers, clusterCount } = clustered;

  // Segments first: they are the source the entries are re-derived from.
  if (Array.isArray(rec.transcript_segments)) {
    assignSpeakersToSegments(rec.transcript_segments, diar, globalMap, uncertaintyMap);
  }

  // Wipe every entry's old label and manual override, then re-stamp all
  // of them from the freshly labeled segments.
  for (const chunk of rec.chunks || []) {
    for (const entry of chunk.entries || []) {
      if (!entry) continue;
      entry.speaker = null;
      entry.speaker_confidence = null;
      entry.speaker_uncertain = false;
      if ("speaker_override" in entry) delete entry.speaker_override;
    }
  }
  backfillEntrySpeakers(rec, { force: true });

  // Install the new roster and drop names / attributions tied to the old one.
  rec.speakers = speakers;
  rec.speaker_names = {};
  resetExtrasSpeakers(rec.extras);

  const meta = rec.meta || {};
  rec.meta = meta;
  meta.reclustered_at = Date.now();
  // NOTE(review): the value recorded here is clamped, while clusterSpeakers
  // above receives the raw opts.threshold — confirm clusterSpeakers applies
  // the same clamping, otherwise the two can disagree.
  meta.recluster_threshold = clampPct(threshold);
  meta.polish_done = false;

  return { speakers, clusterCount, threshold: meta.recluster_threshold };
}
|
||||
|
||||
// ─── Apply re-polished summaries ────────────────────────────────────
// Writes the output of a re-polish pass back into a saved record:
//   - rec.analysis.sections is replaced by polishedSections (an
//     analysis object is created when the record has none)
//   - rec.chunks[].summary receives the matching section summary
// Chunks are paired with sections BY TITLE. Sections sharing a title
// are handed out in order; once they run out, the last one is reused.
// Empty summaries never overwrite a chunk. Chunk entries are not
// touched. Returns how many chunk summaries actually changed.
export function applyPolishedSummaries(rec, polishedSections) {
  const usable = rec && typeof rec === "object" && Array.isArray(polishedSections);
  if (!usable) return 0;

  const hasAnalysis = rec.analysis && typeof rec.analysis === "object";
  if (hasAnalysis) {
    rec.analysis.sections = polishedSections;
  } else {
    rec.analysis = { sections: polishedSections };
  }

  // title → summaries in section order (non-string fields become "").
  const summariesByTitle = new Map();
  for (const section of polishedSections) {
    const title = section && typeof section.title === "string" ? section.title : "";
    const text = section && typeof section.summary === "string" ? section.summary : "";
    const bucket = summariesByTitle.get(title);
    if (bucket) {
      bucket.push(text);
    } else {
      summariesByTitle.set(title, [text]);
    }
  }

  // title → how many chunks with that title have been served so far.
  const served = new Map();
  let changed = 0;
  for (const chunk of rec.chunks || []) {
    if (!chunk) continue;
    const title = typeof chunk.title === "string" ? chunk.title : "";
    const bucket = summariesByTitle.get(title);
    if (!bucket || bucket.length === 0) continue;

    const position = served.get(title) || 0;
    served.set(title, position + 1);
    const next = bucket[Math.min(position, bucket.length - 1)];
    if (next !== "" && chunk.summary !== next) {
      chunk.summary = next;
      changed += 1;
    }
  }
  return changed;
}
|
||||
|
||||
// ─── helpers ────────────────────────────────────────────────────────
|
||||
|
||||
// Build (not throw) an Error tagged with code "BAD_REQUEST"; callers
// throw the returned value themselves.
function badRequest(message) {
  return Object.assign(new Error(message), { code: "BAD_REQUEST" });
}
|
||||
|
||||
// Coerce v to a whole-number percentage in the range 50..95. Anything
// that does not convert to a finite number yields the fallback 70.
function clampPct(v) {
  const pct = Number(v);
  if (!Number.isFinite(pct)) return 70;
  const rounded = Math.round(pct);
  if (rounded < 50) return 50;
  if (rounded > 95) return 95;
  return rounded;
}
|
||||
|
||||
// Fold the stats of `from` into `into`, in place. No-op when either
// argument is missing.
//   turns, total_speaking_seconds, fingerprint_count — summed (speaking
//     time is rounded to one decimal)
//   chunks_appeared_in — the larger of the two, as an approximation
//     (the raw per-cluster chunk sets aren't retained)
//   mean_confidence — weighted by turns when both sides have a numeric
//     value (plain average if neither has any turns); otherwise
//     whichever side has one, else null
function mergeStats(into, from) {
  if (!into || !from) return;

  const intoTurns = into.turns || 0;
  const fromTurns = from.turns || 0;
  const intoHasConf = typeof into.mean_confidence === "number";
  const fromHasConf = typeof from.mean_confidence === "number";

  let blended = null;
  if (intoHasConf && fromHasConf) {
    const a = into.mean_confidence;
    const b = from.mean_confidence;
    const turnSum = intoTurns + fromTurns;
    blended = turnSum > 0 ? (a * intoTurns + b * fromTurns) / turnSum : (a + b) / 2;
  } else if (intoHasConf) {
    blended = into.mean_confidence;
  } else if (fromHasConf) {
    blended = from.mean_confidence;
  }

  const speakingSum = (into.total_speaking_seconds || 0) + (from.total_speaking_seconds || 0);
  Object.assign(into, {
    turns: intoTurns + fromTurns,
    total_speaking_seconds: Math.round(speakingSum * 10) / 10,
    fingerprint_count: (into.fingerprint_count || 0) + (from.fingerprint_count || 0),
    chunks_appeared_in: Math.max(into.chunks_appeared_in || 0, from.chunks_appeared_in || 0),
    mean_confidence: blended,
  });
}
|
||||
|
||||
// Pass every speaker id stored in the extras block through `map`, in
// place: tldr.primary_speakers and decisions[].agreed_by (both
// de-duplicated after mapping), action_items[].owner and
// key_quotes[].speaker. Missing or non-array sections are skipped.
function remapExtrasSpeakers(extras, map) {
  if (!extras || typeof extras !== "object") return;

  const listOf = (value) => (Array.isArray(value) ? value : []);
  const remapUnique = (ids) => Array.from(new Set(ids.map(map)));

  const { tldr } = extras;
  if (tldr && Array.isArray(tldr.primary_speakers)) {
    tldr.primary_speakers = remapUnique(tldr.primary_speakers);
  }
  for (const decision of listOf(extras.decisions)) {
    if (Array.isArray(decision.agreed_by)) {
      decision.agreed_by = remapUnique(decision.agreed_by);
    }
  }
  for (const item of listOf(extras.action_items)) {
    if (item.owner) item.owner = map(item.owner);
  }
  for (const quote of listOf(extras.key_quotes)) {
    if (quote.speaker) quote.speaker = map(quote.speaker);
  }
}
|
||||
|
||||
// Blank out every speaker attribution in the extras block while keeping
// the surrounding text. Used by re-cluster, where cluster identities
// change and the old ids would be meaningless.
function resetExtrasSpeakers(extras) {
  if (!extras || typeof extras !== "object") return;

  const listOf = (value) => (Array.isArray(value) ? value : []);

  if (extras.tldr) {
    extras.tldr.primary_speakers = [];
  }
  for (const decision of listOf(extras.decisions)) {
    decision.agreed_by = [];
  }
  for (const item of listOf(extras.action_items)) {
    item.owner = null;
  }
  for (const quote of listOf(extras.key_quotes)) {
    quote.speaker = null;
  }
}
|
||||
|
||||
// Return v unchanged when it is an array, otherwise a fresh empty
// array, so callers can iterate without a separate type check.
function arr(v) {
  if (Array.isArray(v)) return v;
  return [];
}
|
||||
|
||||
// Return a new array holding the distinct values of `list`, in
// first-seen order.
function dedupe(list) {
  return Array.from(new Set(list));
}
|
||||
@@ -0,0 +1,163 @@
|
||||
// Per-job output storage. After a transcribe + analyze cycle
|
||||
// completes, the worker calls saveJobOutput() to persist the
|
||||
// transcript + analysis JSON to /data/relay-outputs/<job_id>.json.
|
||||
// The operator dashboard surfaces these as a "View" link per job
|
||||
// that opens a Recap-style two-pane render in a new tab.
|
||||
//
|
||||
// Storage policy:
|
||||
// - Test-run jobs (source = "admin-test") are ALWAYS saved
|
||||
// - Real-user jobs are saved only when relay_save_user_outputs
|
||||
// is true in the operator config (default false for privacy)
|
||||
//
|
||||
// Storage format (per file):
|
||||
// {
|
||||
// job_id: string
|
||||
// batch_id: string | null
|
||||
// source: "admin-test" | null
|
||||
// saved_at: ms-epoch
|
||||
// transcript: string ("[MM:SS] line\n[MM:SS] line...")
|
||||
// analysis: { sections: [{title, summary, startIndex, endIndex}, ...] }
|
||||
// meta: {
|
||||
// title, media_url, audio_seconds, audio_bytes,
|
||||
// transcribe_backend, transcribe_model,
|
||||
// analyze_backend, analyze_model,
|
||||
// transcribe_ms, analyze_ms, wall_time_ms,
|
||||
// captions_mode
|
||||
// }
|
||||
// }
|
||||
//
|
||||
// Files are simple JSON — no index, no DB. Listing scans the dir;
|
||||
// deletion just rm's the file. Cheap up to thousands of entries; if
|
||||
// the operator hits scale, swap in a SQLite index without changing
|
||||
// the on-disk format.
|
||||
|
||||
import fs from "fs/promises";
|
||||
import path from "path";
|
||||
|
||||
// Directory the per-job JSON files live in. Holds this default until
// initOutputStore() points it at the configured data dir.
let outputDir = "/data/relay-outputs";

// Point the store at <dataDir>/relay-outputs and create that directory
// (recursively, mode 0700). A failed mkdir is only logged, so this call
// does not reject because of it.
export async function initOutputStore({ dataDir }) {
  outputDir = path.join(dataDir, "relay-outputs");
  await fs.mkdir(outputDir, { recursive: true, mode: 0o700 }).catch((err) => {
    console.warn(`[output-store] mkdir failed: ${err?.message || err}`);
  });
  console.log(`[output-store] writing to ${outputDir}`);
}
|
||||
|
||||
// Map a job id to its JSON file inside outputDir. Every character
// outside [a-zA-Z0-9_-] is stripped first, so the id cannot introduce
// path separators or dots. Throws "invalid job_id" when nothing is left
// after stripping (including null / undefined / empty ids).
function pathFor(jobId) {
  const rawId = String(jobId || "");
  const safeId = rawId.replace(/[^a-zA-Z0-9_-]/g, "");
  if (safeId.length === 0) {
    throw new Error("invalid job_id");
  }
  const fileName = `${safeId}.json`;
  return path.join(outputDir, fileName);
}
|
||||
|
||||
// Persist one job's output as pretty-printed JSON (file mode 0600).
// The stored object starts with job_id and saved_at, followed by the
// payload's own fields. Best-effort: any failure (invalid id,
// serialization, write error) is logged and swallowed, so this never
// rejects — the audit log remains the source of truth for whether the
// job ran.
export async function saveJobOutput(jobId, payload) {
  try {
    const target = pathFor(jobId);
    const record = { job_id: jobId, saved_at: Date.now(), ...payload };
    const serialized = JSON.stringify(record, null, 2);
    await fs.writeFile(target, serialized, { mode: 0o600 });
  } catch (err) {
    const reason = err?.message || err;
    console.warn(
      `[output-store] save failed for ${jobId}: ${reason}`
    );
  }
}
|
||||
|
||||
// Load and parse one job's stored output. Returns null when the file
// does not exist — the route layer should turn that into a 404. Any
// other failure (invalid id, unreadable file, malformed JSON) is logged
// and also yields null.
export async function getJobOutput(jobId) {
  try {
    const contents = await fs.readFile(pathFor(jobId), "utf8");
    return JSON.parse(contents);
  } catch (err) {
    if (err.code !== "ENOENT") {
      console.warn(
        `[output-store] read failed for ${jobId}: ${err?.message || err}`
      );
    }
    return null;
  }
}
|
||||
|
||||
// List the ids of all stored outputs using a single directory listing;
// no file is opened or read. The Jobs table only needs a has_output
// boolean per row, not the full payload. An id is the file name minus
// its ".json" suffix. Returns [] when the directory is missing, and
// also (after logging) on any other listing error.
export async function listJobOutputIds() {
  let names;
  try {
    names = await fs.readdir(outputDir);
  } catch (err) {
    if (err.code !== "ENOENT") {
      console.warn(
        `[output-store] list failed: ${err?.message || err}`
      );
    }
    return [];
  }
  const ids = [];
  for (const name of names) {
    if (name.endsWith(".json")) {
      ids.push(name.replace(/\.json$/, ""));
    }
  }
  return ids;
}
|
||||
|
||||
// Remove one job's stored output. Resolves true when a file was
// deleted and false when there was no such file. Every other failure,
// including an invalid job id, is rethrown to the caller.
export async function deleteJobOutput(jobId) {
  try {
    const target = pathFor(jobId);
    await fs.unlink(target);
  } catch (err) {
    if (err.code === "ENOENT") {
      return false;
    }
    throw err;
  }
  return true;
}
|
||||
|
||||
// Bulk delete. Pass { all: true } to remove every stored output, or
// { jobIds: [...] } to remove specific ones. Files are removed one at a
// time. Returns { deleted, missing } for caller reporting.
//
// A delete that throws is treated the same as a file that was not
// there: with jobIds it counts toward `missing`; with all it is simply
// not counted (`missing` stays 0 in that mode). A non-array jobIds
// yields { deleted: 0, missing: 0 }.
export async function bulkDeleteOutputs({ jobIds, all }) {
  const tally = { deleted: 0, missing: 0 };
  const removeQuietly = (id) => deleteJobOutput(id).catch(() => false);

  if (all) {
    const storedIds = await listJobOutputIds();
    for (const id of storedIds) {
      if (await removeQuietly(id)) tally.deleted += 1;
    }
    return tally;
  }

  if (!Array.isArray(jobIds)) return tally;

  for (const id of jobIds) {
    if (await removeQuietly(id)) {
      tally.deleted += 1;
    } else {
      tally.missing += 1;
    }
  }
  return tally;
}
|
||||
|
||||
// Totals for the dashboard "Stored outputs" mini-panel: how many .json
// files the output dir holds and their combined size in bytes. A file
// that cannot be stat'ed still counts toward `count` but adds no bytes.
// A missing directory yields zeros; any other listing error yields
// zeros plus an `error` message.
export async function getStoredOutputsSummary() {
  try {
    const names = await fs.readdir(outputDir);
    const jsonNames = names.filter((name) => name.endsWith(".json"));
    let byteTotal = 0;
    for (const name of jsonNames) {
      try {
        const info = await fs.stat(path.join(outputDir, name));
        byteTotal += info.size;
      } catch {}
    }
    return { count: jsonNames.length, total_bytes: byteTotal };
  } catch (err) {
    const empty = { count: 0, total_bytes: 0 };
    if (err.code === "ENOENT") return empty;
    return { ...empty, error: err?.message };
  }
}
|
||||
@@ -0,0 +1,655 @@
|
||||
// Post-cluster polish pass: after transcribe + diarize + clustering
|
||||
// have produced a speaker-labeled transcript, AND after pipelined
|
||||
// analyze has produced section objects (titles + summaries), run a
|
||||
// two-stage LLM pass that:
|
||||
//
|
||||
// Stage 1 — Global name inference. One LLM call with the
|
||||
// speaker-labeled transcript + episode metadata (channel name,
|
||||
// title, description) → JSON map { Speaker_A: "Matt Hill",
|
||||
// Speaker_B: "Sarah Jones", Speaker_C: null }. The "_C: null"
|
||||
// case is essential: when the LLM can't confidently identify a
|
||||
// speaker, it must return null instead of guessing.
|
||||
//
|
||||
// Stage 2 — Per-window summary polish. N parallel LLM calls, one
|
||||
// per analyze window. Each call sees that window's sections
|
||||
// (original summaries) + that window's transcript with speaker
|
||||
// labels + the global name map from Stage 1, and rewrites each
|
||||
// section's SUMMARY to attribute statements to specific
|
||||
// speakers ("Matt Hill explains..." vs "the discussion
|
||||
// centers..."). Section TITLES and start/end indices are kept
|
||||
// unchanged — polish only touches summary text.
|
||||
//
|
||||
// Why two stages: name inference benefits from the FULL transcript
|
||||
// view (name introductions like "welcome Matt" tend to appear in
|
||||
// window 1 but Matt keeps speaking throughout); per-window polish
|
||||
// benefits from parallelism (matches the existing analyze pattern).
|
||||
// Running them as one batched call would either lose parallelism
|
||||
// or send the full transcript N times.
|
||||
//
|
||||
// Failure modes:
|
||||
// - Stage 1 returns invalid JSON → all names default to null;
|
||||
// Stage 2 still runs and produces "Speaker A explains..." etc.
|
||||
// - Stage 2 fails for a particular window → keep the original
|
||||
// analyze summary for that window's sections. Per-window
|
||||
// failure shouldn't kill the whole polish.
|
||||
// - Both stages fail → fall back to the unpolished analyzeResult.
|
||||
// The caller sees the same output as a polish-disabled run.
|
||||
//
|
||||
// Cost: Stage 1 ~5-10s; Stage 2 ~10-15s (parallel); total ~15-25s
|
||||
// added to end of pipeline. On a 200s pipelined pipeline that's a
|
||||
// ~10% slowdown for the speaker-attribution UX win.
|
||||
|
||||
import { recordCall } from "./audit-log.js";
|
||||
|
||||
const STAGE_1_MAX_ATTEMPTS = 3;
|
||||
const STAGE_2_MAX_ATTEMPTS = 3;
|
||||
|
||||
// ─── Default prompts (operator-editable via Settings tab) ───────────
|
||||
//
|
||||
// Same three-layer override pattern as the analyze + transcribe
|
||||
// prompts: per-session operator override → operator-promoted default
|
||||
// → these hardcoded defaults. Both are validated on save —
|
||||
// `DEFAULT_NAME_INFERENCE_PROMPT_TEMPLATE` must contain {{transcript}}
|
||||
// and JSON output instructions; `DEFAULT_SUMMARY_POLISH_PROMPT_TEMPLATE`
|
||||
// must contain {{sections}} and JSON output instructions. Template
|
||||
// variables (interpolated at request time):
|
||||
//
|
||||
// Name inference prompt:
|
||||
// {{channel}} — operator-supplied or yt-dlp-extracted channel name
|
||||
// {{title}} — episode/video title
|
||||
// {{description}} — episode description (capped at 800 chars)
|
||||
// {{speakerStats}} — pre-formatted block listing each speaker's
|
||||
// chip letter, total speaking time, turn count
|
||||
// {{transcript}} — speaker-labeled bracketed transcript, capped
|
||||
// at 25k chars (middle truncated when over)
|
||||
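//     {{speakerKeys}}    — JSON-schema-friendly key list for the
//                          response shape (one line per Speaker_X)
//     {{operatorContext}} — optional OPERATOR HINTS block, rendered
//                          immediately before SPEAKER STATISTICS in the
//                          default template (presumably empty when the
//                          operator supplied no hints)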
|
||||
//
|
||||
// Summary polish prompt:
|
||||
// {{speakerRoster}} — pre-formatted block listing each speaker
|
||||
// with their inferred name (or "(unknown)")
|
||||
// and stats
|
||||
// {{transcript}} — this window's slice of the labeled transcript
|
||||
// {{sections}} — pre-formatted block listing each section
|
||||
// with title + original summary + time range
|
||||
export const DEFAULT_NAME_INFERENCE_PROMPT_TEMPLATE = `You are identifying real-world speaker names in an interview/podcast/meeting transcript. The transcript below has been pre-tagged with speaker labels like [A], [B], [C] — these are anonymous labels assigned by voice clustering. Your job: infer the real names of each speaker from contextual clues in the transcript.
|
||||
|
||||
EPISODE METADATA:
|
||||
- Channel/show: {{channel}}
|
||||
- Episode title: {{title}}
|
||||
- Description: {{description}}
|
||||
|
||||
{{operatorContext}}SPEAKER STATISTICS (cluster output):
|
||||
{{speakerStats}}
|
||||
|
||||
TRANSCRIPT (each line is "[<letter> <MM:SS>] text"):
|
||||
{{transcript}}
|
||||
|
||||
INSTRUCTIONS:
|
||||
1. For each Speaker_X in the speaker statistics, infer the real name from contextual clues:
|
||||
- Direct introductions ("welcome Matt", "I'm joined by Sarah")
|
||||
- Self-introductions ("my name is", "I'm Sarah, founder of...")
|
||||
- References between speakers ("what do you think Matt?", "as Sarah was saying")
|
||||
- Channel name or episode title hints
|
||||
- Operator hints in the OPERATOR HINTS section above, IF PRESENT — but see rule 6 for how to weight those.
|
||||
2. Use the speaker statistics to help — the host typically speaks more turns; guests speak less.
|
||||
3. Use first + last name if confidently identifiable. Use first name only if that's all you have.
|
||||
4. RETURN null IF YOU CANNOT CONFIDENTLY IDENTIFY THE SPEAKER. Do not guess. A null is better than a wrong name.
|
||||
5. For brief speakers (under 30s of speaking time, e.g. an intro music VO or a passing comment) it's expected that you'll often return null.
|
||||
6. WEIGHTING OPERATOR HINTS: When an OPERATOR HINTS section appears above, treat it as informed suggestion, NOT authoritative truth. The operator may have listed people who turned out not to speak, omitted people who did, or guessed wrong on who matches which voice. ALWAYS verify hints against the transcript. Specifically:
|
||||
- A name in the hints is only a candidate; if the transcript provides no signal that THIS Speaker_X is that person, return null instead of guessing.
|
||||
- If the transcript clearly identifies a speaker as someone NOT in the hints, use the transcript's name.
|
||||
- If the hints describe what each named person did ("Steve gave the update, John asked questions"), use that as a soft signal for mapping names to chip letters, but still verify with the transcript before committing.
|
||||
- It is better to leave a speaker as null than to confidently map a hint to the wrong chip letter.
|
||||
|
||||
Respond with ONLY valid JSON in this exact format, no other text:
|
||||
{
|
||||
"speakers": {
|
||||
{{speakerKeys}}
|
||||
}
|
||||
}`;
|
||||
|
||||
// Default prompt template for the summary-polish pass (Stage 2).
// Rendered through fillTemplate with three placeholders:
//   {{speakerRoster}} — one line per clustered speaker
//   {{transcript}}    — the window's speaker-labeled transcript slice
//   {{sections}}      — the window's sections, numbered from [0]
// runSummaryPolish falls back to this text whenever no non-blank
// promptOverride is supplied. The JSON shape requested at the bottom
// ({ sections: [{ index, summary }] }) is the shape that
// safeParsePolishedSections validates — keep the two in sync when
// editing either one.
export const DEFAULT_SUMMARY_POLISH_PROMPT_TEMPLATE = `You wrote section summaries for a podcast/interview transcript window. We've now identified the speakers via voice clustering and (where possible) inferred their real names. Your job: rewrite each section's SUMMARY to attribute statements to specific speakers where it improves clarity, naturalness, and information density.
|
||||
|
||||
SPEAKERS (from voice clustering across the full episode):
|
||||
{{speakerRoster}}
|
||||
|
||||
WINDOW TRANSCRIPT (this window's slice; each line is labeled with the speaker's name, or a chip letter when their name is unknown):
|
||||
{{transcript}}
|
||||
|
||||
ORIGINAL SECTIONS IN THIS WINDOW (re-write the summary of each):
|
||||
{{sections}}
|
||||
|
||||
INSTRUCTIONS:
|
||||
1. The SPEAKERS roster and the WINDOW TRANSCRIPT are the AUTHORITATIVE source of who said what. The ORIGINAL summaries were written in an earlier pass and may attribute statements to OUTDATED or WRONG speaker names — your job includes CORRECTING those.
|
||||
2. Rewrite each section's SUMMARY so every speaker attribution matches the transcript + roster. If an original summary credits a statement to a person who, per the transcript, was actually said by someone else, REPLACE the name with the correct one. Never keep a name that does not appear in the roster.
|
||||
3. Use real names when available ("Matt Hill explains..."); fall back to a chip letter only for a speaker who has no name ("Speaker A explains...").
|
||||
4. Keep summaries 1-3 sentences — same length range as the original.
|
||||
5. KEEP THE TITLE EXACTLY AS GIVEN. Do not rewrite titles.
|
||||
6. Return the sections in the SAME ORDER as given, with the SAME INDEX numbers in the array.
|
||||
7. If a section is primarily one speaker, lean into their name ("Matt explains..."). If it is back-and-forth, name both ("Matt and Sarah debate...").
|
||||
8. If the transcript for a section genuinely has no speaker signal, keep the original summary's wording — but still fix or drop any name in it that conflicts with the roster. DO NOT invent attribution that the transcript does not support.
|
||||
|
||||
Respond with ONLY valid JSON in this exact format:
|
||||
{
|
||||
"sections": [
|
||||
{ "index": 0, "summary": "Polished summary text..." },
|
||||
{ "index": 1, "summary": "..." }
|
||||
]
|
||||
}
|
||||
|
||||
Return only the sections in this window. Use the same indices as the input ([0], [1], ...). Only the summary field — title and indices stay as given.`;
|
||||
|
||||
// Substitute {{key}} placeholders in a template.
//
//   template — prompt text; null/undefined/"" is treated as "".
//   vars     — plain object of placeholder values; each value is
//              stringified with String(). A missing `vars` is treated
//              as an empty object.
//
// Returns the rendered string. Placeholders may carry inner whitespace
// ("{{ key }}"). Keys that are not OWN properties of `vars` are left as
// the literal {{key}} so an operator's edit that drops or misspells a
// variable doesn't crash the run — the model just sees the placeholder.
//
// Only own properties count: a bare `in` check would also match
// inherited names, so a template containing {{constructor}} or
// {{toString}} would render a function's source into the prompt.
function fillTemplate(template, vars) {
  const values = vars ?? {};
  return String(template || "").replace(/\{\{\s*(\w+)\s*\}\}/g, (_match, key) => {
    return Object.prototype.hasOwnProperty.call(values, key)
      ? String(values[key])
      : `{{${key}}}`;
  });
}
|
||||
|
||||
// Build a transcript representation with speaker labels prefixed.
// Each line: `[A 0:08] So Matt, tell us how you got started…`
// - The bracketed prefix is `[<label> <M:SS>]` (minutes are not wrapped
//   into hours, so 75 minutes renders as `75:00`).
// - <label> is the speaker chip letter (Speaker_A → A) by default;
//   when the segment has no speaker (gap, unmatched), uses "?" — the
//   model is instructed to ignore those lines for name inference.
// - When opts.speakerNames is provided (the SUMMARY-POLISH pass), a
//   named speaker is labeled with their RESOLVED name instead of the
//   letter (`[Matt 0:08] …`). This matters for the re-polish flow: the
//   operator's corrected names land directly on every transcript line,
//   so the polish model attributes to the right person without having
//   to resolve chip letters through the roster — and without trusting
//   any stale names already baked into the original summaries. The
//   name-INFERENCE pass deliberately omits speakerNames (it's inferring
//   them) so it still sees plain chip letters.
//
// Params:
//   segments          — array of { start, text, speaker }; anything
//                       that is not a non-empty array yields "".
//   opts.startSec     — inclusive lower bound on segment start (default -Infinity)
//   opts.endSec       — exclusive upper bound on segment start (default Infinity)
//   opts.speakerNames — optional map Speaker_X → display name
//
// Returns the labeled lines joined with "\n". Segments with blank text
// and null/undefined entries in `segments` are skipped.
export function formatSpeakerLabeledTranscript(segments, opts = {}) {
  if (!Array.isArray(segments) || segments.length === 0) return "";
  const options = opts ?? {};
  const startSec = options.startSec != null ? options.startSec : -Infinity;
  const endSec = options.endSec != null ? options.endSec : Infinity;
  const speakerNames =
    options.speakerNames && typeof options.speakerNames === "object"
      ? options.speakerNames
      : null;

  const lines = [];
  for (const s of segments) {
    // Tolerate holes / null entries instead of throwing on `s.start`.
    if (!s) continue;
    const t = s.start || 0;
    if (t < startSec || t >= endSec) continue;
    const text = (s.text || "").trim();
    if (!text) continue;

    let label = "?";
    const m = String(s.speaker || "").match(/^Speaker_([A-Z]+)$/);
    if (m) {
      label = m[1];
      // Prefer the operator-corrected name when we have one.
      const nm = speakerNames ? speakerNames[s.speaker] : null;
      if (typeof nm === "string") {
        // Brackets would break the `[label time]` framing, so strip
        // them; trim AFTER stripping so "[ Matt ]" becomes "Matt". If
        // nothing is left (e.g. the name was "[]"), keep the chip
        // letter rather than emitting a blank label.
        const cleaned = nm.replace(/[\[\]]/g, "").trim();
        if (cleaned) label = cleaned;
      }
    }

    const sec = Math.floor(t);
    const mm = Math.floor(sec / 60);
    const ss = sec % 60;
    lines.push(`[${label} ${mm}:${String(ss).padStart(2, "0")}] ${text}`);
  }
  return lines.join("\n");
}
|
||||
|
||||
// ─── Stage 1: global speaker name inference ─────────────────────────
//
// Asks the LLM backend to map each clustered speaker (Speaker_A, …) to
// a real name using the whole speaker-labeled transcript plus channel /
// title / description hints and optional operator hints.
//
// Returns: { Speaker_A: "Matt Hill" | null, Speaker_B: ..., ... }
// On total failure returns an object with all values null so Stage 2
// can still run and produce generic speaker-attributed summaries
// ("Speaker A explains..."). When `speakers` has no Speaker_X keys the
// (empty) null map is returned without calling the backend.
//
// One audit row is written through recordCall per invocation (not per
// attempt); cost details are computed only when a response parsed.
export async function runNameInference({
  speakers, // map from clusterSpeakers (Speaker_A → stats)
  transcriptSegments, // array of { start, end, text, speaker } with speakers attached
  channelHint = "",
  titleHint = "",
  descriptionHint = "",
  // Operator-editable prompt override (Settings → LLM prompts).
  // Three-layer resolution at the caller: per-session override →
  // operator-promoted default → DEFAULT_NAME_INFERENCE_PROMPT_TEMPLATE.
  // Empty string means "use the hardcoded default" inside this fn.
  promptOverride = "",
  // Free-form operator context — internal meetings path uses this to
  // pass participant hints + notes that the LLM should treat as
  // suggestions, not authoritative truth. Empty string → no
  // OPERATOR HINTS block appears in the rendered prompt. The block
  // is composed here (not by the caller) so the warning language
  // stays consistent across pipelines.
  participantHints = "",
  operatorNotes = "",
  backend,
  // audit params
  pipelineBackend,
  jobId,
  batchId,
  mediaUrl,
  installId,
  licenseFingerprint = null,
  source,
  computeCostDetails,
}) {
  const speakerIds = Object.keys(speakers || {})
    .filter((id) => /^Speaker_[A-Z]+$/.test(id))
    .sort();
  // The all-null result is built first so every early exit hands back
  // the same shape as a successful run.
  const allNull = Object.fromEntries(speakerIds.map((id) => [id, null]));
  if (speakerIds.length === 0) return allNull;

  // Name inference sees the WHOLE labeled transcript (no time window).
  // Past ~25k chars we keep the first and last 12.5k and drop the
  // middle, to stay well inside the model's context window (the
  // original note names Qwen3.6).
  const labeled = formatSpeakerLabeledTranscript(transcriptSegments);
  let promptTranscript = labeled;
  if (labeled.length > 25000) {
    promptTranscript =
      labeled.slice(0, 12500) +
      "\n\n…[middle truncated for prompt length]…\n\n" +
      labeled.slice(-12500);
  }

  // Per-speaker stats — tells the model who speaks how much, which
  // helps it map names confidently.
  const statsLines = [];
  for (const id of speakerIds) {
    const stats = speakers[id] || {};
    const total = Math.round(stats.total_speaking_seconds || 0);
    const minutes = Math.floor(total / 60);
    const seconds = total % 60;
    const spoken = minutes > 0 ? `${minutes}m ${seconds}s` : `${seconds}s`;
    const chip = id.replace("Speaker_", "");
    statsLines.push(
      `- ${id} (chip "${chip}"): ${spoken} of speaking time, ${stats.turns || 0} turns`
    );
  }
  const speakerStatsBlock = statsLines.join("\n");

  // JSON skeleton the model is asked to fill: one entry per speaker,
  // comma-separated, no trailing comma.
  const speakerKeysBlock = speakerIds
    .map((id) => ` "${id}": "Real Name or null"`)
    .join(",\n");

  // OPERATOR HINTS block — present only when at least one of
  // participantHints / operatorNotes is non-blank. The heading and the
  // "treat as suggestions" wording are composed here rather than in
  // the template so an operator-edited override can't accidentally
  // drop that framing whenever the block is rendered.
  const hintSections = [];
  const participants = participantHints ? String(participantHints).trim() : "";
  if (participants) {
    hintSections.push(
      `Possible participants in this conversation (operator-supplied — may be incomplete or include people who don't actually speak):\n${participants}`
    );
  }
  const notes = operatorNotes ? String(operatorNotes).trim() : "";
  if (notes) {
    // Keep only the first 4000 chars so the transcript still fits in
    // the prompt window; notes are usually short.
    const cappedNotes = notes.slice(0, 4000);
    hintSections.push(
      `Operator notes on the conversation (may describe what each named participant did — use as a soft signal for mapping names to chip letters):\n${cappedNotes}`
    );
  }
  let operatorContextBlock = "";
  if (hintSections.length) {
    operatorContextBlock = `OPERATOR HINTS (treat as suggestions only — verify against the transcript before assigning names):\n\n${hintSections.join("\n\n")}\n\n`;
  }

  const useOverride = typeof promptOverride === "string" && promptOverride.trim();
  const templateSource = useOverride
    ? promptOverride
    : DEFAULT_NAME_INFERENCE_PROMPT_TEMPLATE;
  const prompt = fillTemplate(templateSource, {
    channel: channelHint || "(unknown)",
    title: titleHint || "(unknown)",
    description: (descriptionHint || "").slice(0, 800) || "(none)",
    operatorContext: operatorContextBlock,
    speakerStats: speakerStatsBlock,
    transcript: promptTranscript,
    speakerKeys: speakerKeysBlock,
  });

  // Retry loop: a thrown backend error or an unparseable response both
  // count as a failed attempt.
  const startedAt = Date.now();
  let lastErr = null;
  let response = null;
  let parsed = null;
  for (let attemptNo = 1; attemptNo <= STAGE_1_MAX_ATTEMPTS; attemptNo += 1) {
    try {
      response = await backend.analyzeText({ prompt });
      parsed = safeParseSpeakers(response.text, speakerIds);
      if (parsed) {
        lastErr = null;
        break;
      }
      lastErr = "invalid JSON in name-inference response";
    } catch (err) {
      lastErr = (err?.message || String(err)).slice(0, 280);
      response = null;
    }
    if (attemptNo < STAGE_1_MAX_ATTEMPTS) {
      console.warn(
        `[polish/names] attempt ${attemptNo} failed (${lastErr}) — retrying`
      );
    }
  }

  const dur = Date.now() - startedAt;
  const succeeded = Boolean(parsed);
  const cost =
    parsed && response
      ? computeCostDetails(response.model, response.usage)
      : { input_tokens: 0, output_tokens: 0, thinking_tokens: 0, cost_usd: 0 };
  await recordCall({
    install_id: installId,
    license_fingerprint: licenseFingerprint,
    tier: "core",
    pipeline: "polish_names",
    backend: pipelineBackend,
    model: response?.model || null,
    status: succeeded ? "success" : "error",
    duration_ms: dur,
    audio_seconds: 0,
    job_id: jobId,
    batch_id: batchId,
    source,
    media_url: mediaUrl,
    error: succeeded ? null : (lastErr || "name inference failed"),
    ...cost,
  });

  if (!parsed) {
    console.warn(
      `[polish/names] all ${STAGE_1_MAX_ATTEMPTS} attempts failed (${lastErr}) — falling back to null names`
    );
    return allNull;
  }

  // Accept only non-blank string values; the literal string "null"
  // (any casing) is treated the same as a JSON null.
  const names = { ...allNull };
  for (const id of speakerIds) {
    const candidate = parsed[id];
    if (typeof candidate !== "string") continue;
    const clean = candidate.trim();
    if (clean && clean.toLowerCase() !== "null") {
      names[id] = clean;
    }
  }
  const namedCount = Object.values(names).filter(Boolean).length;
  console.log(
    `[polish/names] inferred ${namedCount}/${speakerIds.length} speakers in ${(dur / 1000).toFixed(1)}s`
  );
  return names;
}
|
||||
|
||||
// Parse the name-inference response into its `speakers` map.
//
//   text         — raw model output; may be wrapped in a ``` / ```json
//                  code fence, in which case only the first fenced
//                  block is parsed.
//   expectedKeys — Speaker_X keys that must all be present.
//
// Returns the `speakers` object as parsed (values are not validated —
// null is fine, the caller filters them) or null when the text is
// missing, is not valid JSON, has no plain-object `speakers` field, or
// lacks any expected key.
function safeParseSpeakers(text, expectedKeys) {
  if (!text || typeof text !== "string") return null;
  // Strip optional code fence wrapping.
  let s = text.trim();
  const fence = s.match(/```(?:json)?\s*([\s\S]*?)```/);
  if (fence) s = fence[1].trim();
  let parsed;
  try {
    parsed = JSON.parse(s);
  } catch {
    return null;
  }
  const speakers = parsed?.speakers;
  // Arrays are typeof "object" too — reject them explicitly so a
  // `"speakers": []` response is never mistaken for a name map.
  if (!speakers || typeof speakers !== "object" || Array.isArray(speakers)) return null;
  // Require all expected keys present as own properties (null values OK).
  for (const k of expectedKeys) {
    if (!Object.prototype.hasOwnProperty.call(speakers, k)) return null;
  }
  return speakers;
}
|
||||
|
||||
// ─── Stage 2: per-window summary polish ─────────────────────────────
//
// `sections` is the FULL stitched section list with global entry
// indices (output of stitchAnalysisResults). We need per-window
// groupings to fire N parallel LLM calls — each call sees only the
// sections within its window's time range so the prompt stays
// bounded.
//
// `windows` is the planned-windows array (each w.bodyStartSec /
// w.bodyEndSec) — used to assign sections to windows and to slice
// the transcript for the prompt.
//
// `canonicalEntries` is the parsed transcript (the same array
// stitcher uses for indexing). We need it to convert section
// startIndex/endIndex into time ranges for grouping.
//
// `speakerNames` / `speakerStats` are keyed by Speaker_X and feed the
// roster block; `speakerNames` is also used to label transcript lines.
//
// `concurrency` is the maximum number of windows polished at once.
// Missing / 0 / NaN falls back to 4; any other value is floored and
// raised to at least 1, so a negative or fractional setting can no
// longer produce an empty worker pool that silently polishes nothing.
//
// Returns: a new array holding the same sections, where each
// successfully polished section is replaced by a shallow copy with
// the rewritten summary. Sections whose window's polish failed keep
// their original summary. Section start/end indices and titles are
// NEVER modified. If `sections`, `windows` or `canonicalEntries` is
// not a non-empty array, `sections` is returned untouched.
//
// One audit row is written through recordCall per window that has
// sections.
export async function runSummaryPolish({
  sections,
  canonicalEntries,
  windows,
  transcriptSegments,
  speakerNames,
  speakerStats,
  // Operator-editable polish prompt override. Same three-layer
  // resolution at caller as the name-inference override.
  promptOverride = "",
  backend,
  concurrency,
  // audit params
  pipelineBackend,
  jobId,
  batchId,
  mediaUrl,
  installId,
  licenseFingerprint = null,
  source,
  computeCostDetails,
}) {
  if (!Array.isArray(sections) || sections.length === 0) return sections;
  if (!Array.isArray(windows) || windows.length === 0) return sections;
  if (!Array.isArray(canonicalEntries) || canonicalEntries.length === 0) return sections;

  // Section object → its position in `sections`, built once so workers
  // don't rescan the array for every polished summary. First
  // occurrence wins, matching what indexOf would return.
  const positionOf = new Map();
  sections.forEach((sec, i) => {
    if (!positionOf.has(sec)) positionOf.set(sec, i);
  });

  // Group sections by window — assign each section to the window
  // whose body contains its start time (from this window's
  // bodyStartSec up to the next window's bodyStartSec). Sections whose
  // start time falls before any window's body (shouldn't happen in
  // practice) get assigned to window 0. Sections whose startIndex has
  // no canonical entry are left out and keep their original summary.
  const sectionsByWindow = windows.map(() => []);
  for (const sec of sections) {
    const startEntry = canonicalEntries[sec.startIndex];
    if (!startEntry) continue;
    const startSec = startEntry.offset || 0;
    let assigned = -1;
    for (let i = 0; i < windows.length; i++) {
      const w = windows[i];
      const nextW = windows[i + 1];
      const upper = nextW ? nextW.bodyStartSec : Infinity;
      if (startSec >= (w.bodyStartSec || 0) && startSec < upper) {
        assigned = i;
        break;
      }
    }
    if (assigned < 0) assigned = 0;
    sectionsByWindow[assigned].push(sec);
  }

  // Build a speaker-roster block reused across all window prompts.
  const speakerRoster = Object.entries(speakerNames || {})
    .filter(([k]) => /^Speaker_[A-Z]+$/.test(k))
    .sort()
    .map(([k, name]) => {
      const letter = k.replace("Speaker_", "");
      const stats = speakerStats?.[k] || {};
      const secs = Math.round(stats.total_speaking_seconds || 0);
      const mins = Math.floor(secs / 60);
      const rem = secs % 60;
      const timeStr = mins > 0 ? `${mins}m ${rem}s` : `${rem}s`;
      const nameLabel = name ? `"${name}"` : "(unknown name)";
      return `- ${k} (chip [${letter}], ${timeStr} speaking, ${stats.turns || 0} turns): ${nameLabel}`;
    })
    .join("\n");

  // Per-window worker — runs one LLM call (with retries) to polish all
  // sections assigned to that window. Returns a Map<position in
  // `sections`, newSummary> covering only the sections it
  // successfully rewrote.
  const worker = async (windowIdx) => {
    const w = windows[windowIdx];
    const winSections = sectionsByWindow[windowIdx];
    if (winSections.length === 0) return new Map();

    const winStartSec = w.startSec || 0;
    const winEndSec = w.windowEndSec || (w.bodyEndSec || 0);
    const windowTranscript = formatSpeakerLabeledTranscript(transcriptSegments, {
      startSec: winStartSec,
      endSec: winEndSec,
      // Label lines with the corrected names so attributions in the
      // rewritten summaries follow the operator's roster, not whatever
      // (possibly stale) names the original summaries were written with.
      speakerNames,
    });

    // Format each section for the prompt. The bracketed number is the
    // section's WINDOW-LOCAL index; the model echoes it back and it is
    // translated to a position in `sections` further down.
    const sectionsBlock = winSections
      .map((sec, i) => {
        const startEntry = canonicalEntries[sec.startIndex];
        const endEntry = canonicalEntries[sec.endIndex];
        const tStart = startEntry ? (startEntry.offset || 0) : 0;
        const tEnd = endEntry ? (endEntry.offset || 0) : 0;
        const tStartStr = fmtMmSs(tStart);
        const tEndStr = fmtMmSs(tEnd);
        return `[${i}] Title: "${sec.title}" (${tStartStr}-${tEndStr})\n Original summary: ${sec.summary}`;
      })
      .join("\n\n");

    const templateSource =
      typeof promptOverride === "string" && promptOverride.trim()
        ? promptOverride
        : DEFAULT_SUMMARY_POLISH_PROMPT_TEMPLATE;
    const prompt = fillTemplate(templateSource, {
      speakerRoster: speakerRoster || "(no speakers identified)",
      transcript: windowTranscript || "(empty)",
      sections: sectionsBlock,
    });

    // Retry loop: a thrown backend error or an unparseable response
    // both count as a failed attempt.
    const t0 = Date.now();
    let r = null;
    let parsed = null;
    let lastErr = null;
    for (let attempt = 0; attempt < STAGE_2_MAX_ATTEMPTS; attempt++) {
      try {
        r = await backend.analyzeText({ prompt });
        parsed = safeParsePolishedSections(r.text, winSections.length);
        if (parsed) {
          lastErr = null;
          break;
        }
        lastErr = "invalid JSON in polish response";
      } catch (err) {
        lastErr = (err?.message || String(err)).slice(0, 280);
        r = null;
      }
      if (attempt < STAGE_2_MAX_ATTEMPTS - 1) {
        console.warn(
          `[polish/window ${windowIdx + 1}/${windows.length}] attempt ${attempt + 1} failed (${lastErr}) — retrying`
        );
      }
    }
    const dur = Date.now() - t0;
    const cost = parsed && r
      ? computeCostDetails(r.model, r.usage)
      : { input_tokens: 0, output_tokens: 0, thinking_tokens: 0, cost_usd: 0 };
    await recordCall({
      install_id: installId,
      license_fingerprint: licenseFingerprint,
      tier: "core",
      pipeline: "polish_summaries",
      backend: pipelineBackend,
      model: r?.model || null,
      status: parsed ? "success" : "error",
      duration_ms: dur,
      audio_seconds: 0,
      job_id: jobId,
      batch_id: batchId,
      source,
      media_url: mediaUrl,
      error: parsed ? null : (lastErr || "polish failed"),
      window_idx: windowIdx,
      window_count: windows.length,
      ...cost,
    });
    if (!parsed) {
      console.warn(
        `[polish/window ${windowIdx + 1}/${windows.length}] all attempts failed (${lastErr}) — keeping original summaries`
      );
      return new Map();
    }

    // Map polish output back to the original sections: window-local
    // index → section object → position in `sections`. If the model
    // repeats an index, the later entry wins.
    const out = new Map();
    for (const p of parsed) {
      const localIdx = p.index;
      if (!Number.isInteger(localIdx) || localIdx < 0 || localIdx >= winSections.length) continue;
      const newSummary = (p.summary || "").trim();
      if (!newSummary) continue;
      const origIdx = positionOf.get(winSections[localIdx]);
      if (origIdx !== undefined) out.set(origIdx, newSummary);
    }
    console.log(
      `[polish/window ${windowIdx + 1}/${windows.length}] polished ${out.size}/${winSections.length} sections in ${(dur / 1000).toFixed(1)}s`
    );
    return out;
  };

  // Concurrent worker pool (same shape as runPipelinedAnalysis): each
  // pool worker keeps claiming the next unprocessed window index until
  // none are left.
  let nextIdx = 0;
  const updates = new Map();
  async function poolWorker() {
    while (true) {
      const my = nextIdx++;
      if (my >= windows.length) return;
      const result = await worker(my);
      for (const [k, v] of result) updates.set(k, v);
    }
  }
  // Missing / 0 / NaN → default of 4. Anything else is floored and
  // clamped to >= 1 so the pool is never empty.
  const requestedConcurrency = Number(concurrency) || 4;
  const poolSize = Math.min(
    Math.max(1, Math.floor(requestedConcurrency)),
    windows.length
  );
  const pool = Array.from({ length: poolSize }, () => poolWorker());
  await Promise.all(pool);

  // Apply polished summaries onto shallow copies so the caller's
  // section objects are not mutated.
  const polished = sections.map((sec, i) => {
    const newSummary = updates.get(i);
    return newSummary ? { ...sec, summary: newSummary } : sec;
  });
  console.log(
    `[polish] applied ${updates.size}/${sections.length} polished summaries`
  );
  return polished;
}
|
||||
|
||||
// Parse a polish response into its list of usable section rewrites.
//
//   text     — raw model output; may be wrapped in a ``` / ```json code
//              fence, in which case only the first fenced block is parsed.
//   maxIndex — number of sections in the window; despite the name it is
//              an EXCLUSIVE upper bound on `index`.
//
// Returns the entries of `sections` that are objects with an integer
// `index` in [0, maxIndex) and a non-blank string `summary`, in response
// order. Returns null when the text is missing, is not valid JSON, has
// no `sections` array, or no entry survives validation.
function safeParsePolishedSections(text, maxIndex) {
  if (typeof text !== "string" || text === "") return null;

  const trimmed = text.trim();
  const fenced = /```(?:json)?\s*([\s\S]*?)```/.exec(trimmed);
  const payload = fenced ? fenced[1].trim() : trimmed;

  let decoded;
  try {
    decoded = JSON.parse(payload);
  } catch {
    return null;
  }

  const entries = decoded?.sections;
  if (!Array.isArray(entries)) return null;

  const accepted = entries.filter((entry) => {
    if (!entry || typeof entry !== "object") return false;
    if (!Number.isInteger(entry.index)) return false;
    const outOfRange = entry.index < 0 || entry.index >= maxIndex;
    if (outOfRange) return false;
    return typeof entry.summary === "string" && entry.summary.trim() !== "";
  });
  return accepted.length > 0 ? accepted : null;
}
|
||||
|
||||
// Format a duration in seconds as `M:SS`, or `H:MM:SS` once it reaches
// an hour. Fractions are floored; negative, missing or NaN input
// renders as "0:00".
function fmtMmSs(seconds) {
  const whole = Math.max(0, Math.floor(seconds || 0));
  const twoDigits = (n) => n.toString().padStart(2, "0");

  const hours = Math.floor(whole / 3600);
  const minutes = Math.floor((whole % 3600) / 60);
  const secondsPart = twoDigits(whole % 60);

  if (hours > 0) {
    return `${hours}:${twoDigits(minutes)}:${secondsPart}`;
  }
  return `${minutes}:${secondsPart}`;
}
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,624 @@
|
||||
// Cross-chunk speaker reconciliation for Phase 1D of the diarization
|
||||
// pipeline. Spark Control's /api/audio/diarize-chunk hands back
|
||||
// per-chunk speaker labels ("Speaker_0", "Speaker_1") that are local
|
||||
// to one chunk — Sortformer has no memory across calls, so Speaker_0
|
||||
// in chunk 5 might or might not be the same person as Speaker_0 in
|
||||
// chunk 6. The per-speaker 192-dim TitaNet voice embedding it also
|
||||
// returns IS persistent though, so we cluster fingerprints across
|
||||
// chunks via cosine similarity to recover the global speaker identity.
|
||||
//
|
||||
// Algorithm: average-linkage agglomerative clustering. Start with N
|
||||
// singleton clusters (one per fingerprint), repeatedly merge the
|
||||
// closest pair until no pair is closer than the operator-configured
|
||||
// threshold. Average-linkage was the choice over single/complete
|
||||
// because it's robust to outlier embeddings (one bad embedding from
|
||||
// a noisy chunk doesn't anchor or repel an entire cluster).
|
||||
//
|
||||
// Complexity: O(N³) where N = total fingerprints across all chunks.
|
||||
// Typical: 2 speakers × 21 chunks = 42 → ~74k ops, sub-millisecond.
|
||||
// Worst case for a 4-hour all-talk-show video: 6 speakers × 48 chunks
|
||||
// = ~288 → ~24M ops, still milliseconds in Node.
|
||||
//
|
||||
// Threshold convention: configured as INTEGER percentage 50-95
|
||||
// representing cosine similarity. 70 (= 0.70 sim) is NeMo's
|
||||
// recommended default for TitaNet embeddings. Internally we work
|
||||
// in cosine DISTANCE (= 1 - similarity) for the merge condition.
|
||||
|
||||
// ─── Cosine similarity ──────────────────────────────────────────────
// dot(a, b) / (||a|| * ||b||) for two equal-length numeric vectors.
// Returns 0 — instead of NaN or throwing — when either input is
// missing, when the lengths differ, or when either squared magnitude
// comes out as 0 or NaN.
export function cosineSimilarity(a, b) {
  if (!a || !b) return 0;
  const size = a.length;
  if (size !== b.length) return 0;

  let dot = 0;
  let sqA = 0;
  let sqB = 0;
  for (let idx = 0; idx < size; idx += 1) {
    const x = a[idx];
    const y = b[idx];
    dot += x * y;
    sqA += x * x;
    sqB += y * y;
  }

  const degenerate = !sqA || !sqB;
  if (degenerate) return 0;
  return dot / (Math.sqrt(sqA) * Math.sqrt(sqB));
}
|
||||
|
||||
// ─── Cluster ID naming ──────────────────────────────────────────────
// Maps a zero-based cluster index to a spreadsheet-column style label:
// Speaker_A, Speaker_B, ..., Speaker_Z, Speaker_AA, Speaker_AB...
// The capital-letter suffix is intentionally distinct from the chunk-
// local "Speaker_0" / "Speaker_1" naming that Sortformer uses, so
// the source of a label is obvious at a glance.
function globalSpeakerLabel(index) {
  const letters = [];
  let remaining = index;
  for (;;) {
    // Least-significant letter first; unshift keeps reading order.
    letters.unshift(String.fromCharCode(65 + (remaining % 26)));
    // The extra -1 is what makes "AA" follow "Z" (there is no zero
    // digit in this numbering).
    remaining = Math.floor(remaining / 26) - 1;
    if (!(remaining >= 0)) break;
  }
  return "Speaker_" + letters.join("");
}
|
||||
|
||||
// ─── Agglomerative clustering (average linkage) ─────────────────────
// Input: array of { key, vector, firstOrder } items. `key` is opaque to
// the algorithm — it is only propagated into the returned cluster's
// `members` list for the caller to map back to (chunkIdx, localLabel).
// `firstOrder` is the item's position in first-appearance order.
//
// Each pass scans every cluster pair, picks the pair with the smallest
// average-linkage distance (the first such pair on ties), and merges
// it; the loop ends as soon as that smallest distance exceeds
// `distanceThreshold` or a single cluster remains. Distances are
// recomputed from the member vectors on every pass — nothing is cached.
//
// Output: array of clusters, each { members: [keys], vectors: [vecs],
// firstOrder }, sorted by the smallest firstOrder among their members —
// this keeps "Speaker_A" assigned to whoever spoke first across the
// audio, which is the natural user expectation.
function agglomerativeCluster(items, distanceThreshold) {
  if (items.length === 0) return [];

  // Start from one singleton cluster per input item.
  const clusters = items.map(({ key, vector, firstOrder }) => ({
    members: [key],
    vectors: [vector],
    firstOrder,
  }));

  while (clusters.length > 1) {
    let closest = Infinity;
    let keepIdx = -1;
    let dropIdx = -1;
    for (let a = 0; a < clusters.length; a += 1) {
      for (let b = a + 1; b < clusters.length; b += 1) {
        const gap = avgLinkageDistance(clusters[a], clusters[b]);
        if (gap < closest) {
          closest = gap;
          keepIdx = a;
          dropIdx = b;
        }
      }
    }
    if (closest > distanceThreshold) break;

    // Fold the later cluster into the earlier one, then drop it.
    const survivor = clusters[keepIdx];
    const absorbed = clusters[dropIdx];
    survivor.members.push(...absorbed.members);
    survivor.vectors.push(...absorbed.vectors);
    // The merged cluster sorts at the position of its
    // earliest-appearing member.
    if (absorbed.firstOrder < survivor.firstOrder) {
      survivor.firstOrder = absorbed.firstOrder;
    }
    clusters.splice(dropIdx, 1);
  }

  // First-appearance order, so Speaker_A = whoever spoke first.
  clusters.sort((left, right) => left.firstOrder - right.firstOrder);
  return clusters;
}
|
||||
|
||||
// Average-linkage distance between two clusters: the mean cosine
// distance (1 - cosineSimilarity) over every cross pair of vectors,
// one taken from each cluster. Returns Infinity when there are no
// pairs, i.e. either cluster has no vectors.
function avgLinkageDistance(c1, c2) {
  const left = c1.vectors;
  const right = c2.vectors;
  let total = 0;
  let pairs = 0;
  for (let i = 0; i < left.length; i += 1) {
    for (let j = 0; j < right.length; j += 1) {
      total += 1 - cosineSimilarity(left[i], right[j]);
      pairs += 1;
    }
  }
  if (pairs === 0) return Infinity;
  return total / pairs;
}
|
||||
|
||||
// ─── Public: cluster per-chunk diarization → global speaker map ─────
|
||||
//
|
||||
// Input shape (one entry per chunk; failed/skipped chunks are
|
||||
// silently filtered):
|
||||
// {
|
||||
// ok: true,
|
||||
// chunkIndex: 5,
|
||||
// startSeconds: 1080,
|
||||
// segments: [{ start, end, speaker_local, confidence }],
|
||||
// speakers_local: ["Speaker_0", "Speaker_1"],
|
||||
// fingerprints: { "Speaker_0": [192 floats], "Speaker_1": [192 floats] }
|
||||
// }
|
||||
//
|
||||
// Returns:
|
||||
// {
|
||||
// globalMap: Map<"chunkIdx:localLabel", "Speaker_A">,
|
||||
// speakers: {
|
||||
// Speaker_A: { turns, total_speaking_seconds, mean_confidence,
|
||||
// chunks_appeared_in, fingerprint_count },
|
||||
// ...
|
||||
// },
|
||||
// clusterCount: 2,
|
||||
// thresholdSimilarity: 0.70
|
||||
// }
|
||||
//
|
||||
// When fingerprintCount === 0 (diarization off or all chunks failed)
|
||||
// returns an empty result: { globalMap: empty, speakers: {}, ... }.
|
||||
//
// ─── Post-cluster suppression tunables ──────────────────────────────
// (Separate from the clusterSpeakers contract above.) After the initial agglomerative
|
||||
// cluster pass, walk the resulting clusters and re-categorize the
|
||||
// small ones to fix the "14 speakers detected when really only 2"
|
||||
// case Grant flagged on a 2h53m podcast. The clustering algorithm
|
||||
// itself stays strict (no false-positive merges); suppression is a
|
||||
// second pass that operates on cluster size + cross-cluster
|
||||
// similarity to catch the noise-induced spurious clusters.
|
||||
//
|
||||
// anchor_min_speaking_sec — a cluster needs at least this much
|
||||
// total speaking time to be considered an "anchor" (= a real
|
||||
// speaker). Anchors keep their own global ID + colored chip.
|
||||
//
|
||||
// small_cluster_max_speaking_sec — clusters with LESS than this
|
||||
// are suppression candidates. Brief utterances are common false
|
||||
// positives (background noise, crosstalk fragments, brief
|
||||
// intros).
|
||||
//
|
||||
// uncertain_margin_pct — a small cluster whose best similarity
|
||||
// to any anchor is within this many percentage points of the
|
||||
// main threshold gets REASSIGNED to that anchor and marked
|
||||
// uncertain (chip shows "?"). Far-from-anchor small clusters
|
||||
// become Speaker_Unknown.
|
||||
//
|
||||
// All three are operator-editable via Settings → Operator hardware.
|
||||
// Defaults are conservative — no false-positive merges into
|
||||
// anchors, just reassignment of small clusters that are PROBABLY
|
||||
// the anchor in noisy conditions. A real 30+ second second speaker
|
||||
// still gets their own chip; only brief flecks of similar voice get
|
||||
// pulled in.
|
||||
// Default anchor_min_speaking_sec: total speaking seconds a cluster
// needs before it is treated as an anchor (compared with >=).
const DEFAULT_ANCHOR_MIN_SPEAKING_SEC = 30;
|
||||
// Default small_cluster_max_speaking_sec: far-from-anchor clusters with
// fewer speaking seconds than this (compared with <) become
// Speaker_Unknown.
const DEFAULT_SMALL_CLUSTER_MAX_SPEAKING_SEC = 15;
|
||||
// Default uncertain_margin_pct: percentage points subtracted from the
// main similarity threshold to get the "reassign to anchor" threshold.
const DEFAULT_UNCERTAIN_MARGIN_PCT = 10;
|
||||
|
||||
// Average-linkage similarity between two clusters' vector sets.
// This is just the complement (1 - distance) of avgLinkageDistance,
// so the suppression pass can compare against similarity thresholds
// without flipping every inequality.
function avgLinkageSimilarity(c1, c2) {
  const distance = avgLinkageDistance(c1, c2);
  return 1 - distance;
}
|
||||
|
||||
// Clamp an option value to an integer in [lo, hi], falling back to
// `fallback` when the value is missing or not numeric. Used to make
// out-of-range or absent operator settings safe.
//
//   v        — raw setting; a number or a numeric string is accepted.
//   fallback — returned as-is when `v` carries no usable number.
//   lo, hi   — inclusive bounds applied after rounding to the nearest
//              integer.
function clampInt(v, fallback, lo, hi) {
  // Only numbers and non-blank strings can carry a setting. Number()
  // coerces null, "", [] and false to 0, which would otherwise be
  // treated as a real value and silently clamped up to `lo` instead
  // of yielding the default.
  const isNumericInput =
    typeof v === "number" || (typeof v === "string" && v.trim() !== "");
  if (!isNumericInput) return fallback;

  const n = Number(v);
  if (!Number.isFinite(n)) return fallback;

  return Math.min(hi, Math.max(lo, Math.round(n)));
}
|
||||
|
||||
export function clusterSpeakers(
  chunkDiarization,
  clusterThresholdPct = 70,
  options = {}
) {
  // Groups per-chunk voice fingerprints into global speakers, runs the
  // post-cluster suppression pass, and builds per-speaker stats.
  //
  //   chunkDiarization    — per-chunk diarization records. Only records
  //       with `ok` set are read: `chunkIndex`, `fingerprints`
  //       ({ localLabel: number[] }), `segments`
  //       ([{ start, end, speaker_local, confidence }]) and the
  //       optional `chunkOverlapBoundarySec`.
  //   clusterThresholdPct — similarity threshold in percent, clamped to
  //       [50, 95]. Missing / blank / non-numeric values use 70.
  //   options             — optional suppression tunables:
  //       anchorMinSpeakingSec, smallClusterMaxSpeakingSec,
  //       uncertainMarginPct.
  //
  // Returns { globalMap, uncertaintyMap, speakers, clusterCount,
  // thresholdSimilarity }. `uncertaintyMap` holds `true` for every
  // member key whose cluster was reassigned to an anchor.
  const UNKNOWN_LABEL = "Speaker_Unknown";
  const newSpeakerStats = () => ({
    turns: 0,
    total_speaking_seconds: 0,
    mean_confidence: null,
    chunks_appeared_in: 0,
    fingerprint_count: 0,
  });
  const segmentSeconds = (seg) =>
    Math.max(0, (seg.end || 0) - (seg.start || 0));

  // Number.isFinite-guarded fallback rather than the `|| 70` idiom —
  // the latter substitutes 70 for ANY falsy value including 0 (a valid
  // input we want to clamp to 50, not silently bump up). Values that
  // carry no number at all (null, "", booleans, objects) are treated
  // as absent: Number() would coerce them to 0 and clamp to 50.
  const thresholdIsNumeric =
    typeof clusterThresholdPct === "number" ||
    (typeof clusterThresholdPct === "string" && clusterThresholdPct.trim() !== "");
  const raw = thresholdIsNumeric ? Number(clusterThresholdPct) : NaN;
  const pct = Math.max(50, Math.min(95, Number.isFinite(raw) ? raw : 70));
  const similarityThreshold = pct / 100;
  const distanceThreshold = 1 - similarityThreshold;

  // Operator-tunable suppression thresholds. Clamped to the same
  // ranges the admin.js SETTINGS_RANGES enforces on save, so a
  // hand-edited relay-config.json with an out-of-range value still
  // produces sane behavior. `options` may be null (the default
  // parameter only covers undefined).
  const opts = options || {};
  const anchorMinSec = clampInt(
    opts.anchorMinSpeakingSec,
    DEFAULT_ANCHOR_MIN_SPEAKING_SEC,
    5,
    120
  );
  const smallMaxSec = clampInt(
    opts.smallClusterMaxSpeakingSec,
    DEFAULT_SMALL_CLUSTER_MAX_SPEAKING_SEC,
    1,
    60
  );
  const uncertainMarginPct = clampInt(
    opts.uncertainMarginPct,
    DEFAULT_UNCERTAIN_MARGIN_PCT,
    0,
    30
  );
  const uncertainSimThreshold = Math.max(
    0,
    similarityThreshold - uncertainMarginPct / 100
  );

  // Flatten fingerprints into the clustering input. Preserve insertion
  // order so the first-appearance speaker gets Speaker_A.
  const items = [];
  let order = 0;
  for (const d of chunkDiarization || []) {
    if (!d || !d.ok || !d.fingerprints) continue;
    for (const [localLabel, vector] of Object.entries(d.fingerprints)) {
      if (!Array.isArray(vector) || vector.length === 0) continue;
      items.push({
        key: `${d.chunkIndex}:${localLabel}`,
        vector,
        firstOrder: order++,
      });
    }
  }

  if (items.length === 0) {
    return {
      globalMap: new Map(),
      uncertaintyMap: new Map(),
      speakers: {},
      clusterCount: 0,
      thresholdSimilarity: similarityThreshold,
    };
  }

  const clusters = agglomerativeCluster(items, distanceThreshold);

  // ─── First pass: compute speaking time per cluster ───────────────
  // We need cluster sizes BEFORE building the global map so the
  // suppression pass can identify anchors. Walk all diar segments,
  // map each (chunkIdx, speaker_local) to its cluster index, and
  // accumulate seg duration.
  // NOTE(review): unlike the per-speaker summary further down, this
  // pass does not apply the chunk-overlap dedup, so speech inside an
  // overlap zone counts twice toward anchor sizing — confirm whether
  // that is intended.
  const clusterIdxByMember = new Map();
  clusters.forEach((cluster, i) => {
    for (const memberKey of cluster.members) {
      clusterIdxByMember.set(memberKey, i);
    }
  });
  const totalSecsByCluster = new Array(clusters.length).fill(0);
  for (const d of chunkDiarization || []) {
    if (!d || !d.ok || !Array.isArray(d.segments)) continue;
    for (const seg of d.segments) {
      const ci = clusterIdxByMember.get(`${d.chunkIndex}:${seg.speaker_local}`);
      if (ci === undefined) continue;
      totalSecsByCluster[ci] += segmentSeconds(seg);
    }
  }

  // ─── Second pass: identify anchors + plan suppression ────────────
  const isAnchorIdx = new Array(clusters.length).fill(false);
  const anchorIdxs = [];
  for (let i = 0; i < clusters.length; i++) {
    if (totalSecsByCluster[i] >= anchorMinSec) {
      isAnchorIdx[i] = true;
      anchorIdxs.push(i);
    }
  }

  // reassignTo: cluster idx → anchor cluster idx that absorbs it; its
  //   members are flagged uncertain.
  // unknownClusters: cluster idxs whose members map to Speaker_Unknown.
  // Non-anchor clusters in neither collection stay as their own
  // speaker (large + low-sim — plausibly a real third+ speaker even if
  // rare). With no anchors at all, nothing is suppressed.
  const reassignTo = new Map();
  const unknownClusters = new Set();
  if (anchorIdxs.length >= 1) {
    for (let i = 0; i < clusters.length; i++) {
      if (isAnchorIdx[i]) continue;
      // Find best anchor by average cosine similarity
      let bestAnchorIdx = -1;
      let bestSim = -Infinity;
      for (const ai of anchorIdxs) {
        const sim = avgLinkageSimilarity(clusters[i], clusters[ai]);
        if (sim > bestSim) {
          bestSim = sim;
          bestAnchorIdx = ai;
        }
      }
      if (bestAnchorIdx >= 0 && bestSim >= uncertainSimThreshold) {
        // Close-to-anchor (within uncertain_margin_pct of main
        // threshold) → reassign to anchor with uncertainty. Chip will
        // show e.g. "MH?" so the user knows attribution is best-guess.
        reassignTo.set(i, bestAnchorIdx);
      } else if (totalSecsByCluster[i] < smallMaxSec) {
        // Small + far-from-anchor → Unknown. Brief noise / crosstalk /
        // background voices that don't confidently match either main
        // speaker. Merged into a single Speaker_Unknown pseudo-speaker
        // so the legend doesn't fill with N "unidentified brief
        // speaker" entries.
        unknownClusters.add(i);
      }
      // else: large (>= smallMaxSec) + far-from-anchor → keep as own
      // speaker. Plausibly a real third+ person who's distinct from
      // the main anchors. Rare but possible.
    }
  }

  // ─── Build the final cluster-label ordering ──────────────────────
  // Order by first-appearance: whoever spoke first in the audio gets
  // Speaker_A. Anchors + kept-as-own clusters get labels; reassigned
  // + unknown clusters don't.
  const ordered = [];
  for (let i = 0; i < clusters.length; i++) {
    if (reassignTo.has(i) || unknownClusters.has(i)) continue;
    ordered.push({ idx: i, firstOrder: clusters[i].firstOrder });
  }
  ordered.sort((a, b) => a.firstOrder - b.firstOrder);
  const labelByOrigIdx = new Map();
  ordered.forEach((entry, j) => {
    labelByOrigIdx.set(entry.idx, globalSpeakerLabel(j));
  });
  // Reassigned clusters inherit their anchor's label
  for (const [i, ai] of reassignTo) {
    const anchorLabel = labelByOrigIdx.get(ai);
    if (anchorLabel) labelByOrigIdx.set(i, anchorLabel);
  }

  // ─── Build globalMap + uncertaintyMap ────────────────────────────
  const globalMap = new Map();
  const uncertaintyMap = new Map();
  for (let i = 0; i < clusters.length; i++) {
    if (unknownClusters.has(i)) {
      for (const memberKey of clusters[i].members) {
        globalMap.set(memberKey, UNKNOWN_LABEL);
      }
      continue;
    }
    const label = labelByOrigIdx.get(i);
    if (!label) continue;
    const isReassigned = reassignTo.has(i);
    for (const memberKey of clusters[i].members) {
      globalMap.set(memberKey, label);
      if (isReassigned) uncertaintyMap.set(memberKey, true);
    }
  }

  // ─── Build the per-speaker summary ───────────────────────────────
  // Speaker_Unknown is always created as a destination bucket (for
  // suppressed clusters and for unmapped segments below) and dropped
  // again at the end if nothing landed in it.
  const speakers = {};
  for (const label of new Set(labelByOrigIdx.values())) {
    speakers[label] = newSpeakerStats();
  }
  speakers[UNKNOWN_LABEL] = newSpeakerStats();

  // Accumulate fingerprint counts from clusters that contributed to
  // each label. Reassigned clusters' fingerprints count toward their
  // anchor's total.
  for (let i = 0; i < clusters.length; i++) {
    const targetLabel = unknownClusters.has(i)
      ? UNKNOWN_LABEL
      : labelByOrigIdx.get(i);
    if (!targetLabel || !speakers[targetLabel]) continue;
    speakers[targetLabel].fingerprint_count += clusters[i].members.length;
  }

  // Accumulate turns / speaking time / confidence by walking
  // diarization segments through the globalMap.
  //
  // Two earlier bugs fixed here:
  //
  //   1. UNFINGERPRINTED SEGMENTS WERE SILENTLY DROPPED. The cluster-
  //      index lookup only contains entries that have fingerprints —
  //      but Sortformer routinely emits diar segments for speakers
  //      whose voice TitaNet didn't aggregate a fingerprint for (very
  //      brief utterances, soft speech, overlapped speech). Those
  //      segments produced globalId === undefined and were dropped,
  //      so the "total speech detected" totals understated reality
  //      dramatically. A 1.5-hour call could show as "34% speech
  //      detected" when in reality 70%+ of the audio was speech that
  //      Sortformer found but TitaNet couldn't fingerprint. Now: an
  //      unmapped segment falls through to Speaker_Unknown so the
  //      time still gets accounted for. (The chip on the per-line
  //      transcript still shows "?" for those segments — they just
  //      aren't claimed by a wrong cluster.)
  //
  //   2. CHUNK-OVERLAP DOUBLE-COUNTING. Transcribe segments are
  //      deduped at the chunk overlap boundary (handled in
  //      hardware.js), but diar segments are not. Until this fix the
  //      same speech in a 30s overlap zone got counted toward TWO
  //      chunks, inflating speaker totals. Dedup here using the
  //      chunk's chunkOverlapBoundarySec when present.
  const confidenceSum = new Map();
  const confidenceCount = new Map();
  const chunksByLabel = new Map();

  for (const d of chunkDiarization || []) {
    if (!d || !d.ok || !Array.isArray(d.segments)) continue;
    // Chunk-overlap dedup: skip any segment whose GLOBAL start time
    // sits in the prior chunk's tail (which this chunk overlapped).
    // chunkOverlapBoundary is the global timestamp BEFORE which
    // segments in this chunk are duplicates of the prior chunk's
    // tail. Comes from the chunk planner (audio-meta.js) and is 0 for
    // chunk 0 (no prior chunk → no dedup).
    const chunkOverlapBoundary =
      typeof d.chunkOverlapBoundarySec === "number"
        ? d.chunkOverlapBoundarySec
        : 0;
    const labelsInThisChunk = new Set();
    for (const seg of d.segments) {
      if ((seg.start || 0) < chunkOverlapBoundary) continue;
      let globalId = globalMap.get(`${d.chunkIndex}:${seg.speaker_local}`);
      // Unmapped (no fingerprint produced for this speaker_local in
      // this chunk) → bucket into Speaker_Unknown rather than drop.
      if (!globalId || !speakers[globalId]) {
        globalId = UNKNOWN_LABEL;
      }
      const stats = speakers[globalId];
      stats.turns += 1;
      stats.total_speaking_seconds += segmentSeconds(seg);
      if (typeof seg.confidence === "number" && Number.isFinite(seg.confidence)) {
        confidenceSum.set(globalId, (confidenceSum.get(globalId) || 0) + seg.confidence);
        confidenceCount.set(globalId, (confidenceCount.get(globalId) || 0) + 1);
      }
      labelsInThisChunk.add(globalId);
    }
    for (const label of labelsInThisChunk) {
      if (!chunksByLabel.has(label)) chunksByLabel.set(label, new Set());
      chunksByLabel.get(label).add(d.chunkIndex);
    }
  }

  // If Speaker_Unknown ended up with zero turns (no unmapped + no
  // suppressed clusters contributed), drop it from the legend so we
  // don't show "? Unknown 0:00" by default.
  if (speakers[UNKNOWN_LABEL].turns === 0) {
    delete speakers[UNKNOWN_LABEL];
  }

  for (const label of Object.keys(speakers)) {
    const stats = speakers[label];
    if (confidenceCount.get(label)) {
      stats.mean_confidence =
        confidenceSum.get(label) / confidenceCount.get(label);
    }
    stats.chunks_appeared_in = (chunksByLabel.get(label) || new Set()).size;
    // Report speaking time to one decimal place.
    stats.total_speaking_seconds =
      Math.round(stats.total_speaking_seconds * 10) / 10;
  }

  // Logging: surface the suppression summary so operators can see
  // what happened ("14 clusters → 2 anchors + 12 small/uncertain
  // suppressed"). "primary" counts every legend entry except
  // Speaker_Unknown.
  const reassignedCount = reassignTo.size;
  const unknownClusterCount = unknownClusters.size;
  const finalCount = Object.keys(speakers).filter(
    (label) => label !== UNKNOWN_LABEL
  ).length;
  console.log(
    `[clustering] ${clusters.length} raw clusters → ${finalCount} primary + ` +
      `${reassignedCount} reassigned (uncertain) + ${unknownClusterCount} unknown ` +
      `(anchors >= ${anchorMinSec}s, uncertain margin ${uncertainMarginPct}%, ` +
      `unknown < ${smallMaxSec}s)`
  );

  return {
    globalMap,
    uncertaintyMap,
    speakers,
    clusterCount: clusters.length,
    thresholdSimilarity: similarityThreshold,
  };
}
|
||||
|
||||
// ─── Public: stamp global speaker labels onto transcript segments ───
//
// Walks the merged transcript segments and assigns each one a
// `speaker` (global ID, e.g. "Speaker_A"), `speaker_confidence` and
// `speaker_uncertain` based on which diarization segment its midpoint
// falls inside. When several diar segments contain the midpoint, the
// one overlapping the transcript line the most wins. When no diar
// segment covers the midpoint, we fall back to nearest-midpoint
// matching with a 5-second window — beyond that, leave the speaker
// null so the frontend can render as "(speaker unknown)" or just drop
// the label.
//
//   segments         — transcript segments ({ start, end, ... }).
//   chunkDiarization — per-chunk diarization records; only records
//                      with `ok` set and a `segments` array are read.
//   globalMap        — Map<"chunkIdx:localLabel", globalId>.
//   uncertaintyMap   — optional Map<"chunkIdx:localLabel", true>.
//
// Mutates the segments in-place (and also returns the array) so
// callers don't have to remember which they got. Returns the input
// untouched when `segments` is not a non-empty array or `globalMap`
// is missing/empty.
export function assignSpeakersToSegments(segments, chunkDiarization, globalMap, uncertaintyMap = null) {
  if (!Array.isArray(segments) || segments.length === 0) return segments;
  if (!globalMap || globalMap.size === 0) {
    // Diarization didn't run / produced nothing — leave segments
    // alone. Caller can detect this state via speakers === {}.
    return segments;
  }

  // Flatten all per-chunk diar segments into one timeline annotated
  // with the global speaker label + the suppression-uncertainty flag
  // (set when a small cluster was reassigned to an anchor — chip will
  // show "?" so the user knows attribution is best-guess). Diar
  // segments with no globalMap entry are left out of the timeline.
  const flatDiar = [];
  for (const d of chunkDiarization || []) {
    if (!d || !d.ok || !Array.isArray(d.segments)) continue;
    for (const seg of d.segments) {
      const memberKey = `${d.chunkIndex}:${seg.speaker_local}`;
      const globalId = globalMap.get(memberKey);
      if (!globalId) continue;
      flatDiar.push({
        start: seg.start || 0,
        end: seg.end || 0,
        speaker: globalId,
        confidence: typeof seg.confidence === "number" ? seg.confidence : null,
        uncertain: uncertaintyMap ? !!uncertaintyMap.get(memberKey) : false,
      });
    }
  }
  flatDiar.sort((a, b) => a.start - b.start);

  const NEAREST_FALLBACK_WINDOW_SEC = 5;

  for (const e of segments) {
    const segStart = e.start || 0;
    // A missing / non-finite / earlier-than-start end is treated as a
    // zero-length line at `start`. Defaulting it to 0 would drag the
    // midpoint halfway back toward the beginning of the recording.
    const segEnd = Number.isFinite(e.end) && e.end >= segStart ? e.end : segStart;
    const mid = (segStart + segEnd) / 2;

    // Find the diar segment containing the midpoint. Score by overlap
    // with the entry to handle the rare case of multiple diar
    // segments straddling one transcript line (chunk overlap zones,
    // choppy speaker turns). The running best starts at -Infinity so a
    // zero-length line (overlap === 0) still matches the turn it sits
    // inside; ties keep the earliest-starting diar segment.
    let containing = null;
    let containingMostOverlap = -Infinity;
    for (const d of flatDiar) {
      if (d.start > mid || mid > d.end) continue;
      const overlap = Math.min(d.end, segEnd) - Math.max(d.start, segStart);
      if (overlap > containingMostOverlap) {
        containing = d;
        containingMostOverlap = overlap;
      }
    }
    if (containing) {
      e.speaker = containing.speaker;
      e.speaker_confidence = containing.confidence;
      e.speaker_uncertain = !!containing.uncertain;
      continue;
    }

    // Fall back to nearest by midpoint distance (within window)
    let nearest = null;
    let nearestDist = Infinity;
    for (const d of flatDiar) {
      const dist = Math.abs((d.start + d.end) / 2 - mid);
      if (dist < nearestDist) {
        nearestDist = dist;
        nearest = d;
      }
    }
    if (nearest && nearestDist <= NEAREST_FALLBACK_WINDOW_SEC) {
      e.speaker = nearest.speaker;
      e.speaker_confidence = nearest.confidence;
      e.speaker_uncertain = !!nearest.uncertain;
    } else {
      e.speaker = null;
      e.speaker_confidence = null;
      e.speaker_uncertain = false;
    }
  }

  return segments;
}
|
||||
@@ -0,0 +1,269 @@
|
||||
// Unit tests for post-hoc speaker edits (merge + re-cluster) on saved
|
||||
// internal-meeting records.
|
||||
// Run via: node --test server/test/meeting-speaker-edits.test.js
|
||||
|
||||
import { test } from "node:test";
|
||||
import assert from "node:assert/strict";
|
||||
import {
|
||||
mergeSpeakersInRecord,
|
||||
reclusterMeetingRecord,
|
||||
backfillEntrySpeakers,
|
||||
applyPolishedSummaries,
|
||||
} from "../meeting-speaker-edits.js";
|
||||
|
||||
// Two synthetic voice fingerprints pointing along different axes
// (mirrors speaker-clustering.test.js). `j` nudges the vector a little
// so repeated calls give near-identical but not equal fingerprints.
function FP_A(j = 0) {
  const major = 1.0 + j * 0.01;
  const minor = 0.05 * j;
  return [major, minor, 0];
}
function FP_B(j = 0) {
  const minor = 0.05 * j;
  const major = 1.0 + j * 0.01;
  return [minor, major, 0];
}
|
||||
|
||||
// Builds a fresh 3-speaker meeting record whose speaker labels appear
// in all four places a merge has to keep in sync: transcript segments,
// chunk entries (including per-line overrides), the speakers/names
// tables, and the extras.
function makeMergeRecord() {
  const stats = (turns, seconds, confidence, chunkCount, fingerprints) => ({
    turns,
    total_speaking_seconds: seconds,
    mean_confidence: confidence,
    chunks_appeared_in: chunkCount,
    fingerprint_count: fingerprints,
  });

  const transcriptSegments = [
    { start: 0, end: 9, text: "a", speaker: "Speaker_A" },
    { start: 10, end: 19, text: "b", speaker: "Speaker_B" },
    { start: 20, end: 29, text: "c", speaker: "Speaker_C" },
    { start: 30, end: 39, text: "c2", speaker: "Speaker_C" },
  ];

  const entries = [
    { offset: 0, text: "a", speaker: "Speaker_A" },
    { offset: 10, text: "b", speaker: "Speaker_B" },
    { offset: 20, text: "c", speaker: "Speaker_C", speaker_override: "Speaker_C" },
    { offset: 30, text: "c2", speaker: "Speaker_A", speaker_override: "Speaker_C" },
  ];

  const extras = {
    tldr: { summary: "x", primary_speakers: ["Speaker_A", "Speaker_C"] },
    decisions: [{ statement: "d", agreed_by: ["Speaker_C", "Speaker_A"], supporting_offset: 5 }],
    action_items: [{ description: "do", owner: "Speaker_C", supporting_offset: 6 }],
    key_quotes: [{ speaker: "Speaker_C", offset: 7, quote: "q" }],
  };

  return {
    id: "m1",
    transcript_segments: transcriptSegments,
    chunks: [{ title: "t", summary: "s", startTime: 0, entries }],
    speakers: {
      Speaker_A: stats(4, 40, 0.8, 2, 2),
      Speaker_B: stats(2, 20, 0.9, 1, 1),
      Speaker_C: stats(6, 18, 0.6, 3, 3),
    },
    speaker_names: { Speaker_A: "Matt", Speaker_B: "John" },
    extras,
    meta: {},
  };
}
|
||||
|
||||
test("merge: collapses absorbed speaker across all four label locations", () => {
  const record = makeMergeRecord();
  const result = mergeSpeakersInRecord(record, "Speaker_A", ["Speaker_C"]);

  // Every line that was Speaker_C should now read Speaker_A.
  const expectedLabels = ["Speaker_A", "Speaker_B", "Speaker_A", "Speaker_A"];

  // 1. transcript_segments
  assert.deepEqual(
    record.transcript_segments.map(({ speaker }) => speaker),
    expectedLabels
  );

  // 2. chunk entries, including the per-line overrides
  const [chunk] = record.chunks;
  assert.deepEqual(
    chunk.entries.map(({ speaker }) => speaker),
    expectedLabels
  );
  for (const idx of [2, 3]) {
    assert.equal(chunk.entries[idx].speaker_override, "Speaker_A");
  }

  // 3. stats folded into the survivor; the absorbed speaker is removed
  assert.ok(!("Speaker_C" in record.speakers));
  const survivor = record.speakers.Speaker_A;
  assert.equal(survivor.turns, 10); // 4 + 6
  assert.equal(survivor.total_speaking_seconds, 58); // 40 + 18
  assert.equal(survivor.fingerprint_count, 5); // 2 + 3
  // mean confidence is weighted by turns: (0.8*4 + 0.6*6) / 10 = 0.68
  assert.ok(Math.abs(survivor.mean_confidence - 0.68) < 1e-9);

  // names: the survivor keeps its own, the absorbed one is dropped
  assert.equal(record.speaker_names.Speaker_A, "Matt");
  assert.ok(!("Speaker_C" in record.speaker_names));

  // 4. extras are remapped and de-duplicated
  const { tldr, decisions, action_items: actionItems, key_quotes: keyQuotes } = record.extras;
  assert.deepEqual(tldr.primary_speakers, ["Speaker_A"]);
  assert.deepEqual(decisions[0].agreed_by, ["Speaker_A"]);
  assert.equal(actionItems[0].owner, "Speaker_A");
  assert.equal(keyQuotes[0].speaker, "Speaker_A");

  assert.ok(record.meta.speakers_merged_at > 0);
  assert.equal(result.changed > 0, true);
});
|
||||
|
||||
test("merge: survivor with no name inherits the absorbed name", () => {
  const record = makeMergeRecord();
  const names = record.speaker_names;

  // The fixture names Speaker_B; remove that so the survivor is
  // nameless, and give the absorbed speaker a name to hand over.
  delete names.Speaker_B;
  names.Speaker_C = "Carol";

  mergeSpeakersInRecord(record, "Speaker_B", ["Speaker_C"]);

  assert.equal(record.speaker_names.Speaker_B, "Carol");
  assert.ok(!("Speaker_C" in record.speaker_names));
});
|
||||
|
||||
test("merge: rejects invalid input", () => {
  const record = makeMergeRecord();

  // [survivor, absorbed labels, expected error-message pattern]
  const invalidCalls = [
    ["Speaker_Z", ["Speaker_A"], /survivor/],
    ["Speaker_A", ["Speaker_A"], /itself/],
    ["Speaker_A", ["Speaker_Z"], /unknown/],
    ["Speaker_A", [], /at least one/],
  ];

  for (const [survivor, absorbed, messagePattern] of invalidCalls) {
    assert.throws(
      () => mergeSpeakersInRecord(record, survivor, absorbed),
      messagePattern
    );
  }
});
|
||||
|
||||
// Builds a fresh record that carries per-chunk fingerprints, so
// re-clustering can run fully offline. Every stored label is the
// placeholder "STALE". The diarization holds two distinct voices
// (FP_A first, FP_B second), which should come out as Speaker_A /
// Speaker_B by first-appearance order.
function makeReclusterRecord() {
  const STALE = "STALE";

  const transcriptSegments = [
    { start: 0, end: 9, text: "a", speaker: STALE },
    { start: 10, end: 19, text: "b", speaker: STALE },
    { start: 20, end: 29, text: "c", speaker: STALE },
  ];

  const entries = [
    { offset: 0, text: "a", speaker: STALE, speaker_override: STALE },
    { offset: 10, text: "b", speaker: STALE },
    { offset: 20, text: "c", speaker: STALE },
  ];

  const firstChunkDiar = {
    ok: true,
    chunkIndex: 0,
    segments: [
      { start: 0, end: 10, speaker_local: "Speaker_0", confidence: 0.9 },
      { start: 10, end: 20, speaker_local: "Speaker_1", confidence: 0.9 },
    ],
    fingerprints: { Speaker_0: FP_A(1), Speaker_1: FP_B(1) },
  };
  const secondChunkDiar = {
    ok: true,
    chunkIndex: 1,
    segments: [{ start: 20, end: 30, speaker_local: "Speaker_0", confidence: 0.8 }],
    fingerprints: { Speaker_0: FP_A(2) },
  };

  return {
    id: "r1",
    transcript_segments: transcriptSegments,
    chunks: [{ title: "t", summary: "s", startTime: 0, entries }],
    speakers: {
      STALE: {
        turns: 3,
        total_speaking_seconds: 30,
        mean_confidence: 0.5,
        chunks_appeared_in: 2,
        fingerprint_count: 3,
      },
    },
    speaker_names: { STALE: "Wrong" },
    extras: {
      tldr: { summary: "x", primary_speakers: [STALE] },
      decisions: [{ statement: "d", agreed_by: [STALE], supporting_offset: 5 }],
      action_items: [{ description: "do", owner: STALE, supporting_offset: 6 }],
      key_quotes: [{ speaker: STALE, offset: 7, quote: "q" }],
    },
    diarization: [firstChunkDiar, secondChunkDiar],
    meta: { polish_done: true },
  };
}
|
||||
|
||||
test("recluster: re-stamps segments + entries and resets stale data", () => {
  const record = makeReclusterRecord();
  const result = reclusterMeetingRecord(record, { threshold: 70 });

  // Two distinct voices recovered (Speaker_Unknown is not counted).
  const realSpeakerCount = result.speakers
    ? Object.keys(result.speakers).filter((label) => label !== "Speaker_Unknown").length
    : 0;
  assert.equal(realSpeakerCount, 2);

  // The FP_A voice appears first → Speaker_A; the FP_B voice → Speaker_B.
  const expectedLabels = ["Speaker_A", "Speaker_B", "Speaker_A"];
  const [chunk] = record.chunks;

  // Segments re-stamped.
  assert.deepEqual(
    record.transcript_segments.map(({ speaker }) => speaker),
    expectedLabels
  );
  // Entries re-derived to match.
  assert.deepEqual(
    chunk.entries.map(({ speaker }) => speaker),
    expectedLabels
  );
  // Per-line override cleared.
  assert.ok(!("speaker_override" in chunk.entries[0]));

  // Stale attribution data reset.
  const { extras } = record;
  assert.deepEqual(record.speaker_names, {});
  assert.deepEqual(extras.tldr.primary_speakers, []);
  assert.deepEqual(extras.decisions[0].agreed_by, []);
  assert.equal(extras.action_items[0].owner, null);
  assert.equal(extras.key_quotes[0].speaker, null);
  // Decision text preserved.
  assert.equal(extras.decisions[0].statement, "d");

  // Bookkeeping recorded on meta.
  assert.ok(record.meta.reclustered_at > 0);
  assert.equal(record.meta.recluster_threshold, 70);
  assert.equal(record.meta.polish_done, false);
});
|
||||
|
||||
test("recluster: throws NO_FINGERPRINTS when none are saved", () => {
  const hasNoFingerprintsCode = (err) => err.code === "NO_FINGERPRINTS";

  // Case 1: no diarization stored at all.
  const withoutDiarization = makeReclusterRecord();
  withoutDiarization.diarization = null;
  assert.throws(
    () => reclusterMeetingRecord(withoutDiarization, { threshold: 70 }),
    hasNoFingerprintsCode
  );

  // Case 2: diarization present but its fingerprint table is empty.
  const withEmptyFingerprints = makeReclusterRecord();
  withEmptyFingerprints.diarization = [
    { ok: true, chunkIndex: 0, segments: [], fingerprints: {} },
  ];
  assert.throws(
    () => reclusterMeetingRecord(withEmptyFingerprints, { threshold: 70 }),
    hasNoFingerprintsCode
  );
});
|
||||
|
||||
test("applyPolishedSummaries: writes summaries to analysis + chunks, leaves entries", () => {
  const introEntry = { offset: 0, speaker: "Speaker_A", speaker_override: "Speaker_B" };
  const planEntry = { offset: 20, speaker: "Speaker_B" };
  const record = {
    analysis: {
      sections: [
        { title: "Intro", summary: "OLD intro", startIndex: 0, endIndex: 1 },
        { title: "Plan", summary: "OLD plan", startIndex: 2, endIndex: 3 },
      ],
    },
    chunks: [
      { title: "Intro", summary: "OLD intro", entries: [introEntry] },
      { title: "Plan", summary: "OLD plan", entries: [planEntry] },
    ],
    meta: {},
  };
  const polishedSections = [
    { title: "Intro", summary: "Matt opens the standup", startIndex: 0, endIndex: 1 },
    { title: "Plan", summary: "John lays out the Q3 plan", startIndex: 2, endIndex: 3 },
  ];

  const changedCount = applyPolishedSummaries(record, polishedSections);
  assert.equal(changedCount, 2);

  // The analysis store picks up the new summary.
  assert.equal(record.analysis.sections[0].summary, "Matt opens the standup");

  // Chunk cards are updated by title.
  const [introChunk, planChunk] = record.chunks;
  assert.equal(introChunk.summary, "Matt opens the standup");
  assert.equal(planChunk.summary, "John lays out the Q3 plan");

  // Entries and the per-line override are left untouched.
  assert.equal(introChunk.entries[0].speaker, "Speaker_A");
  assert.equal(introChunk.entries[0].speaker_override, "Speaker_B");
});
|
||||
|
||||
test("applyPolishedSummaries: duplicate titles map in order", () => {
  // Two chunks share a title; the polished summaries should land on
  // them in the order given rather than both hitting the first chunk.
  const sharedTitle = "Discussion";
  const record = {
    analysis: { sections: [] },
    chunks: [
      { title: sharedTitle, summary: "old1", entries: [] },
      { title: sharedTitle, summary: "old2", entries: [] },
    ],
  };
  const polishedSections = [
    { title: sharedTitle, summary: "new1" },
    { title: sharedTitle, summary: "new2" },
  ];

  applyPolishedSummaries(record, polishedSections);

  const [firstChunk, secondChunk] = record.chunks;
  assert.equal(firstChunk.summary, "new1");
  assert.equal(secondChunk.summary, "new2");
});
|
||||
|
||||
test("backfillEntrySpeakers force re-stamps already-labeled entries", () => {
  const record = {
    transcript_segments: [
      { start: 0, end: 9, text: "a", speaker: "Speaker_A" },
      { start: 10, end: 19, text: "b", speaker: "Speaker_B" },
    ],
    chunks: [
      {
        entries: [
          { offset: 0, speaker: "OLD" },
          { offset: 10, speaker: "OLD" },
        ],
      },
    ],
  };
  const entrySpeakers = () => record.chunks[0].entries.map(({ speaker }) => speaker);

  // Default call: entries that already have a speaker keep it.
  backfillEntrySpeakers(record);
  assert.deepEqual(entrySpeakers(), ["OLD", "OLD"]);

  // force: true re-derives every entry's speaker from the segments.
  backfillEntrySpeakers(record, { force: true });
  assert.deepEqual(entrySpeakers(), ["Speaker_A", "Speaker_B"]);
});
|
||||
@@ -0,0 +1,60 @@
|
||||
// Re-polish bug fix: the summary-polish pass must label each transcript
|
||||
// line with the operator's CORRECTED speaker name, so a re-polish after a
|
||||
// legend rename actually re-attributes statements to the new name (rather
|
||||
// than echoing the stale name baked into the original summaries).
|
||||
|
||||
import { test, describe } from "node:test";
|
||||
import assert from "node:assert/strict";
|
||||
|
||||
import { formatSpeakerLabeledTranscript } from "../post-cluster-polish.js";
|
||||
|
||||
// Shared fixture: two speakers that get named in some tests, one that
// never does, and one line with no speaker at all.
const SEGMENTS = [
  { start: 5, speaker: "Speaker_A", text: "Let's get started." },
  { start: 12, speaker: "Speaker_B", text: "Sounds good." },
  { start: 20, speaker: "Speaker_C", text: "One more thing." }, // unnamed
  { start: 30, speaker: "", text: "(crosstalk)" }, // no speaker
];

describe("formatSpeakerLabeledTranscript", () => {
  test("without speakerNames: labels by chip letter (name-inference pass)", () => {
    const rendered = formatSpeakerLabeledTranscript(SEGMENTS);

    const letterLabelled = [
      /\[A 0:05\] Let's get started\./,
      /\[B 0:12\] Sounds good\./,
      /\[C 0:20\] One more thing\./,
    ];
    for (const pattern of letterLabelled) {
      assert.match(rendered, pattern);
    }
    // A segment with no speaker gets the "?" label.
    assert.match(rendered, /\[\? 0:30\] \(crosstalk\)/);
  });

  test("with speakerNames: named speakers labeled by NAME, unnamed fall back to letter", () => {
    const speakerNames = { Speaker_A: "Matt", Speaker_B: "Grant" };
    const rendered = formatSpeakerLabeledTranscript(SEGMENTS, { speakerNames });

    assert.match(rendered, /\[Matt 0:05\] Let's get started\./);
    assert.match(rendered, /\[Grant 0:12\] Sounds good\./);
    // Speaker_C was given no name, so it keeps its letter.
    assert.match(rendered, /\[C 0:20\] One more thing\./);
    // The point of the fix: the old letter labels must not survive
    // for speakers that now have names.
    assert.doesNotMatch(rendered, /\[A 0:05\]/);
    assert.doesNotMatch(rendered, /\[B 0:12\]/);
  });

  test("respects the time window (startSec/endSec)", () => {
    const windowOptions = {
      startSec: 10,
      endSec: 25,
      speakerNames: { Speaker_A: "Matt" },
    };
    const rendered = formatSpeakerLabeledTranscript(SEGMENTS, windowOptions);

    assert.doesNotMatch(rendered, /Let's get started/); // 0:05, before window
    assert.match(rendered, /Sounds good/); // 0:12, in window
    assert.match(rendered, /One more thing/); // 0:20, in window
    assert.doesNotMatch(rendered, /crosstalk/); // 0:30, after window
  });

  test("strips brackets from a name so the [label] frame can't break", () => {
    const singleLine = [{ start: 0, speaker: "Speaker_A", text: "hi" }];
    const bracketedName = { Speaker_A: "Ma[t]t" };

    const rendered = formatSpeakerLabeledTranscript(singleLine, {
      speakerNames: bracketedName,
    });

    assert.match(rendered, /\[Matt 0:00\] hi/);
  });
});
|
||||
@@ -0,0 +1,203 @@
|
||||
// Unit tests for the Phase 1D speaker-clustering module.
|
||||
// Run via: node --test server/test/speaker-clustering.test.js
|
||||
|
||||
import { test } from "node:test";
|
||||
import assert from "node:assert/strict";
|
||||
import {
|
||||
cosineSimilarity,
|
||||
clusterSpeakers,
|
||||
assignSpeakersToSegments,
|
||||
} from "../speaker-clustering.js";
|
||||
|
||||
// Synthetic fingerprints — easier to reason about than 192-dim vectors.
//   "Person A" embeddings all point roughly toward [+1, 0, 0]
//   "Person B" embeddings all point roughly toward [0, +1, 0]
//   "Person C" embeddings (when present) point toward [0, 0, +1]
//
// Each factory returns a fresh 3-element array. `jitter` (default 0) slightly
// lengthens the dominant component and, for A and B, adds a small
// off-axis component, so repeated samples of one "voice" are not identical.
const FP_A = (jitter = 0) => [1.0 + jitter * 0.01, 0.05 * jitter, 0];
const FP_B = (jitter = 0) => [0.05 * jitter, 1.0 + jitter * 0.01, 0];
const FP_C = (jitter = 0) => [0, 0, 1.0 + jitter * 0.01];
|
||||
|
||||
// A vector compared with itself must score exactly 1.
test("cosineSimilarity: identical vectors = 1", () => {
  assert.equal(cosineSimilarity([1, 0, 0], [1, 0, 0]), 1);
});
|
||||
|
||||
// Two unit vectors on different axes must score exactly 0.
test("cosineSimilarity: orthogonal vectors = 0", () => {
  assert.equal(cosineSimilarity([1, 0, 0], [0, 1, 0]), 0);
});
|
||||
|
||||
// An all-zero vector has no direction; the expected result is 0, not NaN.
test("cosineSimilarity: zero-magnitude input returns 0 (no NaN)", () => {
  assert.equal(cosineSimilarity([0, 0, 0], [1, 1, 1]), 0);
});
|
||||
|
||||
// Three chunks carry the same two synthetic voices (A and B). The third chunk
// swaps which local label holds which voice, so the global map has to follow
// the fingerprints rather than the per-chunk label names.
test("clusterSpeakers: two distinct speakers across 3 chunks → 2 clusters", () => {
  const chunkDiar = [
    {
      ok: true,
      chunkIndex: 0,
      segments: [],
      fingerprints: { Speaker_0: FP_A(1), Speaker_1: FP_B(1) },
    },
    {
      ok: true,
      chunkIndex: 1,
      segments: [],
      fingerprints: { Speaker_0: FP_A(2), Speaker_1: FP_B(2) },
    },
    {
      ok: true,
      chunkIndex: 2,
      segments: [],
      fingerprints: { Speaker_0: FP_B(3), Speaker_1: FP_A(3) }, // labels flipped this chunk
    },
  ];
  const { clusterCount, globalMap, speakers } = clusterSpeakers(chunkDiar, 70);
  assert.equal(clusterCount, 2, "should identify 2 distinct speakers");
  // First speaker seen (chunk 0, Speaker_0 = FP_A) becomes Speaker_A
  assert.equal(globalMap.get("0:Speaker_0"), "Speaker_A");
  assert.equal(globalMap.get("0:Speaker_1"), "Speaker_B");
  // Chunk 1 (same physical voices, same label assignment by SC)
  assert.equal(globalMap.get("1:Speaker_0"), "Speaker_A");
  assert.equal(globalMap.get("1:Speaker_1"), "Speaker_B");
  // Chunk 2 has labels flipped — clustering should recover the truth
  assert.equal(globalMap.get("2:Speaker_0"), "Speaker_B");
  assert.equal(globalMap.get("2:Speaker_1"), "Speaker_A");
  // Summary should report each speaker appearing in 3 chunks
  assert.equal(speakers.Speaker_A.fingerprint_count, 3);
  assert.equal(speakers.Speaker_B.fingerprint_count, 3);
});
|
||||
|
||||
// Chunk 0 holds voices A and B; chunk 1 holds C and B. Three distinct voices
// in total, so three clusters are expected.
test("clusterSpeakers: three distinct speakers → 3 clusters", () => {
  const chunkDiar = [
    {
      ok: true,
      chunkIndex: 0,
      segments: [],
      fingerprints: { Speaker_0: FP_A(1), Speaker_1: FP_B(1) },
    },
    {
      ok: true,
      chunkIndex: 1,
      segments: [],
      fingerprints: { Speaker_0: FP_C(2), Speaker_1: FP_B(2) },
    },
  ];
  const { clusterCount } = clusterSpeakers(chunkDiar, 70);
  assert.equal(clusterCount, 3);
});
|
||||
|
||||
// No chunks at all: expect zero clusters, an empty global map and an empty
// speakers summary.
test("clusterSpeakers: empty input returns empty result", () => {
  const out = clusterSpeakers([], 70);
  assert.equal(out.clusterCount, 0);
  assert.equal(out.globalMap.size, 0);
  assert.deepEqual(out.speakers, {});
});
|
||||
|
||||
// Chunks flagged ok: false carry no fingerprints here; the cluster count is
// expected to be 0.
test("clusterSpeakers: all-failed-chunks input returns empty result", () => {
  const out = clusterSpeakers([{ ok: false }, { ok: false }], 70);
  assert.equal(out.clusterCount, 0);
});
|
||||
|
||||
// The second argument is a percentage. Out-of-range values are expected to be
// clamped to 50..95 and reported back as a 0..1 fraction in
// `thresholdSimilarity`.
test("clusterSpeakers: threshold clamped to 50..95", () => {
  const chunkDiar = [
    {
      ok: true,
      chunkIndex: 0,
      segments: [],
      fingerprints: { Speaker_0: FP_A(1), Speaker_1: FP_B(1) },
    },
  ];
  const lo = clusterSpeakers(chunkDiar, 0); // clamps to 50
  assert.equal(lo.thresholdSimilarity, 0.5);
  const hi = clusterSpeakers(chunkDiar, 200); // clamps to 95
  assert.equal(hi.thresholdSimilarity, 0.95);
});
|
||||
|
||||
// Two hand-written vectors pointing in the same general direction; their
// cosine similarity is about 0.93. That is above the 70% threshold (one
// cluster expected) and below the 95% threshold (two clusters expected).
test("clusterSpeakers: very strict threshold (95%) splits tightly-grouped voices", () => {
  const chunkDiar = [
    {
      ok: true,
      chunkIndex: 0,
      segments: [],
      fingerprints: {
        Speaker_0: [1.0, 0.0, 0.0],
        // Same general direction but ~0.93 similarity — borderline.
        Speaker_1: [0.93, 0.36, 0.06],
      },
    },
  ];
  const lenient = clusterSpeakers(chunkDiar, 70);
  const strict = clusterSpeakers(chunkDiar, 95);
  assert.equal(lenient.clusterCount, 1, "lenient should merge");
  assert.equal(strict.clusterCount, 2, "strict should split");
});
|
||||
|
||||
// One chunk with three diarization turns. Expected per-speaker summary:
//   Speaker_A: turns at 0-10 and 25-30 → 2 turns, 15s, mean confidence 0.925
//   Speaker_B: turn at 10-25           → 1 turn, 15s
test("clusterSpeakers: summary stats aggregate turns + speaking time", () => {
  const chunkDiar = [
    {
      ok: true,
      chunkIndex: 0,
      segments: [
        { start: 0, end: 10, speaker_local: "Speaker_0", confidence: 0.9 },
        { start: 10, end: 25, speaker_local: "Speaker_1", confidence: 0.8 },
        { start: 25, end: 30, speaker_local: "Speaker_0", confidence: 0.95 },
      ],
      fingerprints: { Speaker_0: FP_A(1), Speaker_1: FP_B(1) },
    },
  ];
  const { speakers } = clusterSpeakers(chunkDiar, 70);
  assert.equal(speakers.Speaker_A.turns, 2);
  assert.equal(speakers.Speaker_A.total_speaking_seconds, 15);
  assert.equal(speakers.Speaker_B.turns, 1);
  assert.equal(speakers.Speaker_B.total_speaking_seconds, 15);
  // Tolerance compare: the mean of 0.9 and 0.95 is not exactly representable.
  assert.ok(Math.abs(speakers.Speaker_A.mean_confidence - 0.925) < 0.001);
});
|
||||
|
||||
// Each transcript segment's midpoint (2.5s and 7.5s) falls inside exactly one
// diarization segment, so each is expected to take that segment's global
// speaker. The first one is also checked for the copied confidence.
// assignSpeakersToSegments mutates `segments` in place.
test("assignSpeakersToSegments: midpoint inside diar segment wins", () => {
  const segments = [
    { start: 0, end: 5, text: "hello" },
    { start: 5, end: 10, text: "world" },
  ];
  const chunkDiar = [
    {
      ok: true,
      chunkIndex: 0,
      segments: [
        { start: 0, end: 5, speaker_local: "Speaker_0", confidence: 0.9 },
        { start: 5, end: 10, speaker_local: "Speaker_1", confidence: 0.85 },
      ],
      fingerprints: { Speaker_0: FP_A(1), Speaker_1: FP_B(1) },
    },
  ];
  const { globalMap } = clusterSpeakers(chunkDiar, 70);
  assignSpeakersToSegments(segments, chunkDiar, globalMap);
  assert.equal(segments[0].speaker, "Speaker_A");
  assert.equal(segments[1].speaker, "Speaker_B");
  assert.equal(segments[0].speaker_confidence, 0.9);
});
|
||||
|
||||
// Covers the "too far away" side of the nearest-segment fallback: no
// diarization segment covers the transcript segment, and the only candidate
// is outside the 5s window, so the speaker is expected to be null.
test("assignSpeakersToSegments: nearest-fallback within 5s window", () => {
  const segments = [
    { start: 8, end: 12, text: "in between" }, // gap with no covering diar seg
  ];
  const chunkDiar = [
    {
      ok: true,
      chunkIndex: 0,
      segments: [
        { start: 0, end: 5, speaker_local: "Speaker_0", confidence: 0.9 },
      ],
      fingerprints: { Speaker_0: FP_A(1) },
    },
  ];
  const { globalMap } = clusterSpeakers(chunkDiar, 70);
  assignSpeakersToSegments(segments, chunkDiar, globalMap);
  // Diar segment spans 0-5 (midpoint 2.5), transcript midpoint is 10 →
  // distance 7.5 > 5s → speaker stays null.
  // NOTE(review): 7.5 assumes midpoint-to-midpoint distance; measured from the
  // diar segment's end the gap would be exactly 5 — confirm against the
  // implementation.
  assert.equal(segments[0].speaker, null);
});
|
||||
|
||||
// With no diarization chunks and an empty global map, the segment is expected
// to stay as it was: no `speaker` property is added (undefined, not null).
test("assignSpeakersToSegments: no diar data leaves segments unchanged", () => {
  const segments = [{ start: 0, end: 5, text: "hello" }];
  assignSpeakersToSegments(segments, [], new Map());
  assert.equal(segments[0].speaker, undefined);
});
|
||||
Reference in New Issue
Block a user