From fda23088fe970990776872c9bf9c2ebcf725c665 Mon Sep 17 00:00:00 2001
From: Keysat <licensing@keysat.xyz>
Date: Mon, 18 May 2026 15:42:04 -0500
Subject: [PATCH] v0.10.0:1 - hotfix: merge function now joins words with
 proper spacing
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Smoke testing v0.10.0:0 against a real anarlog audio.mp3 showed the
output running words together: "I'mrecordingrightnow", "don'tyoutry".

Root cause: _merge_words_with_speakers was doing "".join(cur_words),
assuming Parakeet returns words with leading whitespace (which the
hyprnote local Parakeet does, but the Spark-hosted Parakeet does not).

Rewrote the join with a small helper that:
  - Strips each token (handles both leading-space and no-leading-space
    word formats)
  - Joins with a single space
  - Keeps punctuation tight — no space before a token that starts with
    closing punctuation or a quote (period/comma/colon/bracket/etc.)

Verified post-install with the same test audio:
  [00:06] Speaker_0: I'm I'm recording right now.
  [00:18] Speaker_1: you're you're on your computer and your phone, right?

No other changes — Parakeet container patches and the endpoint shape
stay identical.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 image/app/audio_proxy.py           | 22 ++++++++++++++++++++--
 package/startos/versions/v0_1_0.ts |  4 ++--
 2 files changed, 22 insertions(+), 4 deletions(-)

diff --git a/image/app/audio_proxy.py b/image/app/audio_proxy.py
index 0d03f24..cfffdb4 100644
--- a/image/app/audio_proxy.py
+++ b/image/app/audio_proxy.py
@@ -344,6 +344,24 @@ def _merge_words_with_speakers(words: list[dict], diar_turns: list[dict]) -> lis
         return []
     SILENCE_BREAK_S = 1.5
 
+    def _join_words(parts: list[str]) -> str:
+        """Return the tokens in ``parts`` joined into one readable string.
+        STT backends disagree on whether a word carries its own leading
+        space (' morning' vs 'morning'), so every token is stripped and
+        blank ones are dropped. Tokens are separated by a single space,
+        except that a token whose first character is closing punctuation
+        or a quote is appended directly to the text before it."""
+        tokens = [t for t in (p.strip() for p in parts if p) if t]
+        if not tokens:
+            return ""
+        tight = ".,;:!?)]}'\""
+        pieces = [tokens[0]]
+        for tok in tokens[1:]:
+            # First-character test only: "'em" attaches just like "." does.
+            glue = "" if tok[0] in tight else " "
+            pieces.append(glue + tok)
+        return "".join(pieces)
+
     blocks: list[dict] = []
     cur_words: list[str] = []
     cur_speaker: Optional[str] = None
@@ -367,7 +385,7 @@ def _merge_words_with_speakers(words: list[dict], diar_turns: list[dict]) -> lis
                     "start_ms": int(cur_start_s * 1000),
                     "end_ms": int(cur_end_s * 1000),
                     "speaker": cur_speaker,
-                    "text": "".join(cur_words).strip(),
+                    "text": _join_words(cur_words),
                 })
             cur_words = [wt]
             cur_speaker = spk
@@ -382,7 +400,7 @@ def _merge_words_with_speakers(words: list[dict], diar_turns: list[dict]) -> lis
             "start_ms": int(cur_start_s * 1000),
             "end_ms": int(cur_end_s * 1000),
             "speaker": cur_speaker,
-            "text": "".join(cur_words).strip(),
+            "text": _join_words(cur_words),
         })
 
     return blocks
diff --git a/package/startos/versions/v0_1_0.ts b/package/startos/versions/v0_1_0.ts
index 1d4d2cf..fa05889 100644
--- a/package/startos/versions/v0_1_0.ts
+++ b/package/startos/versions/v0_1_0.ts
@@ -1,10 +1,10 @@
 import { VersionInfo, IMPOSSIBLE } from '@start9labs/start-sdk'
 
 export const v0_1_0 = VersionInfo.of({
-  version: '0.10.0:0',
+  version: '0.10.0:1',
   releaseNotes: {
     en_US:
-      'v0.10.0 — Speaker diarization. Spark Control now offers a merged transcription + diarization endpoint at POST /api/audio/transcribe-with-speakers. Returns the spoken text broken into blocks with anonymous speaker labels (Speaker_0, Speaker_1, ...) and millisecond timestamps — designed as input for downstream apps (recap-relay, custom UIs) that handle speaker→name mapping and LLM analysis with their own configurable prompts. Diarization runs via NVIDIA NeMo Sortformer (nvidia/diar_sortformer_4spk-v1), loaded alongside Parakeet ASR inside the existing parakeet-asr container on Spark 2 — no new infrastructure, ~150 MB model addition. A new /v1/audio/diarize endpoint is also exposed on Parakeet directly for clients that just want speaker turns. Apply Sortformer patches via image/parakeet_patches/apply.sh after install. v0.11 will add a Speech Models dashboard panel for in-UI model swap/update.',
+      'v0.10.0:1 — fix: in 0.10.0:0 the /api/audio/transcribe-with-speakers merge function joined word tokens without spaces (e.g. "I\'mrecordingrightnow") because it assumed Parakeet returned words with leading whitespace. Spark Parakeet returns them without. Rewrote the joiner to strip each token, separate with a single space, and keep punctuation tight (no space before period/comma/colon/etc.). No other changes — Parakeet container patches and the endpoint shape stay the same.',
   },
   migrations: {
     up: async ({ effects }) => {},