From 6238ac88f7f6e7e84015ff4812eb290537e06756 Mon Sep 17 00:00:00 2001
From: Keysat <licensing@keysat.xyz>
Date: Mon, 15 Jun 2026 17:24:49 -0500
Subject: [PATCH] test: add offline pytest harness (build_launch_command
 injection, label-merge)

---
 AGENTS.md                          |  1 +
 docs/guides/fastapi-image.md       |  5 ++-
 image/pyproject.toml               |  6 +++
 image/tests/conftest.py            | 17 ++++++++
 image/tests/test_label_merge.py    | 69 ++++++++++++++++++++++++++++++
 image/tests/test_launch_command.py | 67 +++++++++++++++++++++++++++++
 6 files changed, 164 insertions(+), 1 deletion(-)
 create mode 100644 image/tests/conftest.py
 create mode 100644 image/tests/test_label_merge.py
 create mode 100644 image/tests/test_launch_command.py
diff --git a/AGENTS.md b/AGENTS.md
index 175fc9d..a7a88a3 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -23,6 +23,7 @@ Subsystem guidance lives in `docs/guides/` and loads when matching files are tou
 ```bash
 (cd package && make x86)                                  # build the s9pk; make install sideloads (restarts live service — ask first)
 (cd image && uvicorn app.server:app --port 9999)          # local dev — needs env vars, see fastapi-image rule
+(cd image && .venv/bin/python -m pytest)                          # offline unit suite (launch-cmd injection, label-merge)
 (cd image && .venv/bin/python -m app.redaction.test_gateway)      # offline redaction suite 1
 (cd image && .venv/bin/python app/redaction/test_scrub_leak.py)   # offline redaction suite 2
 ./scripts/test-audio-with-speakers.sh <audio-file>        # e2e audio — hits the LIVE cluster
diff --git a/docs/guides/fastapi-image.md b/docs/guides/fastapi-image.md
index 1df6ad1..ed123ed 100644
--- a/docs/guides/fastapi-image.md
+++ b/docs/guides/fastapi-image.md
@@ -24,7 +24,10 @@ Other env vars: `BIND_PORT`, `MODELS_YAML`, `SSH_DIR`, `SSH_KNOWN_HOSTS`, `MODEL
 
 ## Tests
 
-No pytest harness — each suite is a standalone script run with the `image/.venv` interpreter (system python3 has no deps). See the redaction and audio rules for the suites themselves.
+Two kinds, both run with the `image/.venv` interpreter (system python3 has no deps):
+
+- **pytest unit suite** — offline, pure functions, no cluster. Run `.venv/bin/python -m pytest` from `image/`. Lives in `image/tests/`; currently covers `build_launch_command` (incl. the shell-injection / `shlex` round-trip invariant) and the transcript↔diarizer label-merge (`_merge_words_with_speakers`, `_assign_speaker_to_word`). Install the test dep once into the same venv with `.venv/bin/python -m pip install -e '.[dev]'` (also from `image/`). Add new pure-function coverage here.
+- **Standalone scripts** — the redaction suites and the live-cluster audio e2e are run directly (not via pytest). See the redaction and audio rules.
 
 ## Conventions
 
diff --git a/image/pyproject.toml b/image/pyproject.toml
index 0917ea7..bf9dcd0 100644
--- a/image/pyproject.toml
+++ b/image/pyproject.toml
@@ -12,6 +12,12 @@ dependencies = [
     "python-multipart>=0.0.9",
 ]
 
+[project.optional-dependencies]
+dev = ["pytest>=8"]
+
+[tool.pytest.ini_options]
+testpaths = ["tests"]
+
 [build-system]
 requires = ["setuptools>=68"]
 build-backend = "setuptools.build_meta"
diff --git a/image/tests/conftest.py b/image/tests/conftest.py
new file mode 100644
index 0000000..17809fc
--- /dev/null
+++ b/image/tests/conftest.py
@@ -0,0 +1,17 @@
+"""Shared pytest setup.
+
+These suites are pure/offline — they exercise pure functions and never touch the
+Sparks, /data, or the network. We still default the env vars the app modules expect
+(documented in docs/guides/fastapi-image.md) to /tmp paths so importing them does not
+write to the container-only /data path; values already set in the environment win.
+"""
+import os
+import sys
+from pathlib import Path
+
+# Prepend image/ so `import app...` resolves whether or not the package is pip-installed.
+sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
+
+os.environ.setdefault("REDACTION_MAP_DB", "/tmp/spark_control_test_maps.db")
+os.environ.setdefault("CONNECTIVITY_LOG", "/tmp/spark_control_test_connectivity.json")
+os.environ.setdefault("MODELS_OVERRIDES", "/tmp/spark_control_test_overrides.yaml")
diff --git a/image/tests/test_label_merge.py b/image/tests/test_label_merge.py
new file mode 100644
index 0000000..6e42577
--- /dev/null
+++ b/image/tests/test_label_merge.py
@@ -0,0 +1,69 @@
+"""_merge_words_with_speakers + _assign_speaker_to_word: the transcript/diarizer
+merge that turns Parakeet words + Sortformer turns into speaker-labelled blocks.
+Pure functions, no cluster — this is the core of transcribe-with-speakers.
+"""
+from app.audio_proxy import _assign_speaker_to_word, _merge_words_with_speakers
+
+
+def _w(start, end, text):  # word fixture: a start/end/text dict
+    return dict(start=start, end=end, text=text)
+
+
+def _t(start, end, speaker):  # speaker-turn fixture: a start_s/end_s/speaker dict
+    return dict(start_s=start, end_s=end, speaker=speaker)
+
+
+# ---- _assign_speaker_to_word ----
+
+def test_assign_by_midpoint_containment():  # word midpoint 2.6 lies in the second turn
+    first, second = _t(0.0, 2.0, "Speaker_0"), _t(2.0, 4.0, "Speaker_1")
+    assert _assign_speaker_to_word(2.4, 2.8, [first, second]) == "Speaker_1"
+
+
+def test_assign_falls_back_to_max_overlap_when_midpoint_outside():
+    # Midpoint 5.0 falls in the 4.9-6.0 gap; the word overlaps only Speaker_0 (4.0-4.9).
+    early, late = _t(0.0, 4.9, "Speaker_0"), _t(6.0, 8.0, "Speaker_1")
+    assert _assign_speaker_to_word(4.0, 6.0, [early, late]) == "Speaker_0"
+
+
+def test_assign_unknown_when_no_overlap():  # word at 10-11 s, only turn ends at 1 s
+    only_turn = _t(0.0, 1.0, "Speaker_0")
+    assert _assign_speaker_to_word(10.0, 11.0, [only_turn]) == "Speaker_unknown"
+
+
+# ---- _merge_words_with_speakers ----
+
+def test_empty_words_returns_empty():  # no words in -> no blocks out, even with a turn present
+    assert _merge_words_with_speakers([], [_t(0, 1, "Speaker_0")]) == []
+
+
+def test_consecutive_same_speaker_words_join_into_one_block():
+    expected = {"start_ms": 0, "end_ms": 1000, "speaker": "Speaker_0", "text": "good morning"}
+    merged = _merge_words_with_speakers(
+        [_w(0.0, 0.5, "good"), _w(0.5, 1.0, "morning")],
+        [_t(0.0, 2.0, "Speaker_0")],
+    )
+    assert merged == [expected]
+
+
+def test_speaker_change_splits_blocks():
+    # One word per turn, so the merge must yield one block per speaker, in order.
+    turns = [_t(0.0, 2.0, "Speaker_0"), _t(2.0, 4.0, "Speaker_1")]
+    merged = _merge_words_with_speakers([_w(0.0, 1.0, "hi"), _w(2.1, 3.0, "hello")], turns)
+    labelled = [(blk["speaker"], blk["text"]) for blk in merged]
+    assert labelled == [("Speaker_0", "hi"), ("Speaker_1", "hello")]
+
+
+def test_long_silence_breaks_block_for_same_speaker():
+    # A >1.5s gap between two words of the same speaker forces a new block (here: 2.5s).
+    before, after = _w(0.0, 0.5, "one"), _w(3.0, 3.5, "two")
+    merged = _merge_words_with_speakers([before, after], [_t(0.0, 4.0, "Speaker_0")])
+    assert len(merged) == 2
+    spoken = [blk["text"] for blk in merged]
+    assert spoken == ["one", "two"]
+
+
+def test_punctuation_token_joins_without_leading_space():
+    tokens = [_w(0.0, 0.5, "hello"), _w(0.5, 0.7, ".")]
+    first_block = _merge_words_with_speakers(tokens, [_t(0.0, 2.0, "Speaker_0")])[0]
+    assert first_block["text"] == "hello."  # not "hello ."
diff --git a/image/tests/test_launch_command.py b/image/tests/test_launch_command.py
new file mode 100644
index 0000000..8d879bf
--- /dev/null
+++ b/image/tests/test_launch_command.py
@@ -0,0 +1,67 @@
+"""build_launch_command: argument assembly + the shell-injection invariant.
+
+The security-critical property is that every user-controllable value (repo,
+vllm_args, knobs) is shlex-quoted at the sink, so `shlex.split` cleanly reverses
+the command back into the exact token list. The vLLM pre-flight validator
+(validate.py) depends on this round-trip — these tests lock it in.
+"""
+import shlex
+
+from app.models import Defaults, ModelDef, build_launch_command
+
+DEFAULTS = Defaults(port=8888, host="0.0.0.0")  # shared by every test; source of --port/--host
+
+
+def _model(**kw) -> ModelDef:
+    # Minimal solo-mode ModelDef; keyword arguments override the baseline fields.
+    fields = {"display_name": "X", "repo": "org/name", "size_gb": 1.0, "mode": "solo", **kw}
+    return ModelDef(**fields)
+
+
+def test_solo_model_emits_solo_flag_and_ordered_args():
+    solo = _model(vllm_args=["--max-model-len=1000"])  # mode stays at the "solo" baseline
+    assert build_launch_command("k", solo, DEFAULTS) == (
+        "./launch-cluster.sh --solo -d exec vllm serve org/name "
+        "--port=8888 --host=0.0.0.0 --max-model-len=1000"
+    )
+
+
+def test_cluster_model_omits_solo_flag():
+    prefix = "./launch-cluster.sh -d exec vllm serve org/name"
+    cmd = build_launch_command("k", _model(mode="cluster", vllm_args=["-tp=2"]), DEFAULTS)
+    assert " --solo " not in cmd and cmd.startswith(prefix)
+
+
+def test_knob_overrides_matching_bundled_flag():
+    # The bundled arg sets max-model-len; the knob must win and the bundled value must vanish.
+    overridden = _model(vllm_args=["--max-model-len=1000"], knobs={"max_model_len": 65536})
+    cmd = build_launch_command("k", overridden, DEFAULTS)
+    assert "--max-model-len=65536" in cmd
+    assert "--max-model-len=1000" not in cmd
+
+
+def test_repo_with_shell_metacharacters_is_quoted_not_executed():
+    # Defence in depth: validate_repo guards the API boundary, but the sink
+    # (build_launch_command) must still quote a hostile repo on its own.
+    hostile = "org/name; rm -rf ~ #"
+    cmd = build_launch_command("k", _model(repo=hostile), DEFAULTS)
+    # Once the quoted form is removed, no raw metacharacters may remain...
+    residue = cmd.replace(shlex.quote(hostile), "")
+    assert "; rm -rf" not in residue
+    # ...and shlex.split must hand the repo back as a single literal token.
+    assert hostile in shlex.split(cmd)
+
+
+def test_command_string_round_trips_through_shlex_split():
+    # validate.py relies on this invariant: every arg survives quote -> split intact,
+    # including "--note=a b c", whose spaces need quoting to stay one token.
+    args = ["--max-model-len=32768", "--load-format=fastsafetensors", "--note=a b c"]
+    tokens = shlex.split(build_launch_command("k", _model(vllm_args=args), DEFAULTS))
+    missing = [arg for arg in args if arg not in tokens]
+    assert not missing
+
+
+def test_injection_via_vllm_arg_stays_literal():
+    hostile_arg = "--foo=$(touch /tmp/pwned)"  # $(...) command substitution
+    tokens = shlex.split(build_launch_command("k", _model(vllm_args=[hostile_arg]), DEFAULTS))
+    assert hostile_arg in tokens  # preserved as one inert token