def _join_words(parts: list[str]) -> str:
    """Join STT word tokens into a single, normally spaced string.

    STT backends disagree on token shape: some emit words with a leading
    space (``' morning'``), others emit them bare (``'morning'``). Every
    token is therefore whitespace-normalized first, then tokens are
    separated by exactly one space. Tokens made up solely of punctuation
    that begin with a closing mark (``.``, ``,``, ``?!``, ``)`` ...) are
    attached to the preceding word with no space.

    Args:
        parts: Word tokens in spoken order. Falsy entries (``None``, ``""``)
            and whitespace-only entries are ignored.

    Returns:
        The joined text, or ``""`` when no usable token remains.
    """
    # Characters that, when they start a punctuation-only token, mean the
    # token belongs tight against the previous word.
    closing_marks = ".,;:!?)]}'\""

    # split() + join strips both ends AND collapses internal whitespace runs,
    # so a token such as "  New   York " becomes "New York".
    cleaned = [" ".join(p.split()) for p in parts if p and p.strip()]

    pieces: list[str] = []
    for token in cleaned:
        # Only punctuation-only tokens are glued on. Looking at the first
        # character alone would also glue real words that merely start with
        # an apostrophe or quote ("'em", '"hello"') onto the previous word.
        hugs_previous = token[0] in closing_marks and not any(
            ch.isalnum() for ch in token
        )
        # The very first token never gets a leading separator.
        if pieces and not hugs_previous:
            pieces.append(" ")
        pieces.append(token)

    # Single join instead of repeated string concatenation.
    return "".join(pieces)
- 'v0.10.0 — Speaker diarization. Spark Control now offers a merged transcription + diarization endpoint at POST /api/audio/transcribe-with-speakers. Returns the spoken text broken into blocks with anonymous speaker labels (Speaker_0, Speaker_1, ...) and millisecond timestamps — designed as input for downstream apps (recap-relay, custom UIs) that handle speaker→name mapping and LLM analysis with their own configurable prompts. Diarization runs via NVIDIA NeMo Sortformer (nvidia/diar_sortformer_4spk-v1), loaded alongside Parakeet ASR inside the existing parakeet-asr container on Spark 2 — no new infrastructure, ~150 MB model addition. A new /v1/audio/diarize endpoint is also exposed on Parakeet directly for clients that just want speaker turns. Apply Sortformer patches via image/parakeet_patches/apply.sh after install. v0.11 will add a Speech Models dashboard panel for in-UI model swap/update.', + 'v0.10.0:1 — fix: in 0.10.0:0 the /api/audio/transcribe-with-speakers merge function joined word tokens without spaces (e.g. "I\'mrecordingrightnow") because it assumed Parakeet returned words with leading whitespace. Spark Parakeet returns them without. Rewrote the joiner to strip each token, separate with a single space, and keep punctuation tight (no space before period/comma/colon/etc.). No other changes — Parakeet container patches and the endpoint shape stay the same.', }, migrations: { up: async ({ effects }) => {},