recap-relay/startos/versions/v0.2.114.ts

import { VersionInfo } from '@start9labs/start-sdk'

/**
 * Release notes for 0.2.114:0, keyed by locale. Only `en_US` is provided.
 * The text is shipped verbatim to the operator, so it is kept as one literal.
 */
const releaseNotes = {
  en_US:
    "Spark Control 503 'CUDA wedge' retry — proper multi-attempt loop on both transcribe AND diarize paths. The Parakeet wrapper and the Sortformer/TitaNet wrapper both return HTTP 503 with a body of {detail: 'Parakeet returned a transient error (likely CUDA wedge). Auto-restart triggered; retry in ~60s.'} when the GPU container wedges and Spark Control kicks an automatic container restart. Before this ship: the diarize path retried ONCE on 503 (then failed), and the transcribe path didn't have a 503-aware retry at all — its 'retrying bare' fallback was a PARAMETER fix (drop verbose_json) that ran immediately and just hit the same wedge again, then permanently failed the chunk. Symptom: a multi-chunk job that starts two chunks in parallel can hit a single wedge at job start and lose both chunks, breaking the whole pipeline. Now: each path waits and retries up to 4 attempts total per chunk, honoring the Retry-After header when supplied (clamped to [5s, 120s], default 60s if absent), with ±5s jitter so parallel chunks don't synchronize their wake-ups and pile back onto the freshly-restarted container at the exact same instant (would just re-wedge). The transcribe path's existing rich → bare parameter fallback still runs for non-503 4xx/5xx (a different failure mode). Error messages now distinguish 503-after-N-retries ('operator container may need manual restart') from one-shot failures. Net effect for an operator on Spark Control: a fresh container restart that takes 60-120s now mostly gets absorbed by the retry loop instead of killing the pipeline.",
}

/**
 * Version descriptor for release 0.2.114:0.
 *
 * Both migration hooks are intentionally empty: `up` and `down` resolve
 * without doing any work, so nothing is migrated in either direction.
 */
export const v_0_2_114 = VersionInfo.of({
  version: '0.2.114:0',
  releaseNotes,
  migrations: {
    // No-op upgrade hook.
    up: async ({ effects }) => {},
    // No-op downgrade hook.
    down: async ({ effects }) => {},
  },
})