Files
recap-relay/startos/versions/v0.2.114.ts
T

14 lines
1.8 KiB
TypeScript

import { VersionInfo } from '@start9labs/start-sdk'
/**
 * Version descriptor for release `0.2.114:0`.
 *
 * Carries the English release notes for the Spark Control HTTP 503
 * ("CUDA wedge") retry work. Both migration hooks are intentionally empty:
 * nothing is transformed when moving up to or down from this version.
 */
export const v_0_2_114 = VersionInfo.of({
  version: '0.2.114:0',
  releaseNotes: {
    // One array element per sentence, joined with a single space, so the
    // resulting text is identical to a single-line literal but stays
    // reviewable in diffs.
    en_US: [
      "Spark Control 503 'CUDA wedge' retry — proper multi-attempt loop on both transcribe AND diarize paths.",
      "The Parakeet wrapper and the Sortformer/TitaNet wrapper both return HTTP 503 with a body of {detail: 'Parakeet returned a transient error (likely CUDA wedge). Auto-restart triggered; retry in ~60s.'} when the GPU container wedges and Spark Control kicks an automatic container restart.",
      "Before this ship: the diarize path retried ONCE on 503 (then failed), and the transcribe path didn't have a 503-aware retry at all — its 'retrying bare' fallback was a PARAMETER fix (drop verbose_json) that ran immediately and just hit the same wedge again, then permanently failed the chunk.",
      'Symptom: a multi-chunk job that starts two chunks in parallel can hit a single wedge at job start and lose both chunks, breaking the whole pipeline.',
      "Now: each path waits and retries up to 4 attempts total per chunk, honoring the Retry-After header when supplied (clamped to [5s, 120s], default 60s if absent), with ±5s jitter so parallel chunks don't synchronize their wake-ups and pile back onto the freshly-restarted container at the exact same instant (would just re-wedge).",
      "The transcribe path's existing rich → bare parameter fallback still runs for non-503 4xx/5xx (a different failure mode).",
      "Error messages now distinguish 503-after-N-retries ('operator container may need manual restart') from one-shot failures.",
      'Net effect for an operator on Spark Control: a fresh container restart that takes 60-120s now mostly gets absorbed by the retry loop instead of killing the pipeline.',
    ].join(' '),
  },
  migrations: {
    // No data or config changes are needed in either direction.
    up: async ({ effects }) => {},
    down: async ({ effects }) => {},
  },
})