14 lines
1.7 KiB
TypeScript
14 lines
1.7 KiB
TypeScript
import { VersionInfo } from '@start9labs/start-sdk'
|
||
|
||
/**
 * Version descriptor for release 0.2.88 (version string '0.2.88:0'),
 * built with `VersionInfo.of`.
 *
 * Carries the en_US release notes for this release plus a pair of migration
 * hooks. Both hooks have empty bodies, so this module itself performs no
 * work when moving to or from this version.
 */
export const v_0_2_88 = VersionInfo.of({
  version: '0.2.88:0',
  releaseNotes: {
    // Kept as a single string literal; the text is emitted verbatim.
    en_US:
      "Phase 1D of the diarization migration: cross-chunk speaker clustering. The Phase 1C output (per-chunk segments + per-speaker 192-dim TitaNet fingerprints) is now reconciled into global speaker IDs (Speaker_A, Speaker_B, ...) via average-linkage agglomerative clustering on cosine similarity. New module server/speaker-clustering.js. Threshold is the operator-tunable 'Voice clustering threshold' slider (default 0.70 cosine similarity, NeMo's recommended TitaNet default; range 0.50–0.95). The merged transcript segments each gain `speaker` + `speaker_confidence` fields based on which diarization segment overlaps the transcript line's midpoint (5s nearest-fallback when no segment covers the midpoint). The relay's summarize-url response envelope gains two new top-level fields: `speakers` (per-speaker summary with turns, total speaking seconds, mean confidence, fingerprint count, chunks_appeared_in) and `transcript_segments` (per-segment array with start, end, text, speaker, speaker_confidence). Both are null when diarization is off or produced no fingerprints. The Recap frontend ignores these for now — Phase 1E will hook up color-coded speaker rendering. New log line: '[hardware] diarization: 21/21 chunks succeeded, 42 fingerprints → 2 distinct speaker(s) at 70% cosine-sim threshold (clustering took 3ms)'. 13 unit tests cover the algorithm (cosine sim, cluster merge, threshold clamping, label-flip recovery, summary aggregation, midpoint vs nearest assignment).",
  },
  migrations: {
    // No-op: the body is empty, nothing is changed on upgrade.
    up: async ({ effects }) => {},
    // No-op: the body is empty, nothing is changed on downgrade.
    down: async ({ effects }) => {},
  },
})
|