test: add offline pytest harness (build_launch_command injection, label-merge)

2026-06-15 17:24:49 -05:00
parent 17a9973ba2
commit 6238ac88f7
6 changed files with 164 additions and 1 deletions
@@ -0,0 +1,17 @@
+"""Shared pytest setup.
+
+These suites are pure/offline — they exercise pure functions and never touch the
+Sparks, /data, or the network. We still default the env vars the app modules
+expect (documented in docs/guides/fastapi-image.md) to tmp paths, so that
+importing them does not write to the container-only /data path. Values already
+set in the environment are left as-is (`os.environ.setdefault`).
+"""
+import os
+import sys
+from pathlib import Path
+
+# Let `import app...` resolve whether or not the package is pip-installed:
+# the parent of this file's directory goes to the front of sys.path.
+sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
+
+# Give each path-valued env var a throwaway /tmp default. `setdefault` only
+# fills in variables that are not already set, so a value exported by the
+# caller's environment is left untouched.
+os.environ.setdefault("REDACTION_MAP_DB", "/tmp/spark_control_test_maps.db")
+os.environ.setdefault("CONNECTIVITY_LOG", "/tmp/spark_control_test_connectivity.json")
+os.environ.setdefault("MODELS_OVERRIDES", "/tmp/spark_control_test_overrides.yaml")
@@ -0,0 +1,69 @@
+"""_merge_words_with_speakers + _assign_speaker_to_word: the transcript/diarizer
+merge that turns Parakeet words + Sortformer turns into speaker-labelled blocks.
+Pure functions, no cluster — this is the core of transcribe-with-speakers.
+"""
+from app.audio_proxy import _assign_speaker_to_word, _merge_words_with_speakers
+
+
+def _w(start, end, text):
+    """Build a word dict with "start", "end" and "text" keys."""
+    return dict(start=start, end=end, text=text)
+
+
+def _t(start, end, speaker):
+    """Build a speaker-turn dict with "start_s", "end_s" and "speaker" keys."""
+    return dict(start_s=start, end_s=end, speaker=speaker)
+
+
+# ---- _assign_speaker_to_word ----
+
+def test_assign_by_midpoint_containment():
+    """A word whose midpoint (2.6) lies inside a turn gets that turn's speaker."""
+    first_turn = _t(0.0, 2.0, "Speaker_0")
+    second_turn = _t(2.0, 4.0, "Speaker_1")
+    speaker = _assign_speaker_to_word(2.4, 2.8, [first_turn, second_turn])
+    assert speaker == "Speaker_1"
+
+
+def test_assign_falls_back_to_max_overlap_when_midpoint_outside():
+    """With the midpoint in no turn, the overlapping turn's speaker is expected."""
+    # Word 4.0-6.0 has midpoint 5.0, which falls in the 4.9-6.0 gap between turns.
+    # The span shares 0.9s with Speaker_0's turn and none with Speaker_1's,
+    # which starts exactly where the word ends.
+    diarization = [_t(0.0, 4.9, "Speaker_0"), _t(6.0, 8.0, "Speaker_1")]
+    assigned = _assign_speaker_to_word(4.0, 6.0, diarization)
+    assert assigned == "Speaker_0"
+
+
+def test_assign_unknown_when_no_overlap():
+    """A word far outside every turn is expected to map to "Speaker_unknown"."""
+    only_turn = _t(0.0, 1.0, "Speaker_0")
+    label = _assign_speaker_to_word(10.0, 11.0, [only_turn])
+    assert label == "Speaker_unknown"
+
+
+# ---- _merge_words_with_speakers ----
+
+def test_empty_words_returns_empty():
+    """With no words to label, the merge is expected to yield an empty list."""
+    diarization = [_t(0, 1, "Speaker_0")]
+    merged = _merge_words_with_speakers([], diarization)
+    assert merged == []
+
+
+def test_consecutive_same_speaker_words_join_into_one_block():
+    """Back-to-back words inside one turn collapse into a single block."""
+    expected_block = {
+        "start_ms": 0,
+        "end_ms": 1000,
+        "speaker": "Speaker_0",
+        "text": "good morning",
+    }
+    merged = _merge_words_with_speakers(
+        [_w(0.0, 0.5, "good"), _w(0.5, 1.0, "morning")],
+        [_t(0.0, 2.0, "Speaker_0")],
+    )
+    assert merged == [expected_block]
+
+
+def test_speaker_change_splits_blocks():
+    """Words landing in different speakers' turns end up in separate blocks."""
+    transcript = [_w(0.0, 1.0, "hi"), _w(2.1, 3.0, "hello")]
+    diarization = [_t(0.0, 2.0, "Speaker_0"), _t(2.0, 4.0, "Speaker_1")]
+    merged = _merge_words_with_speakers(transcript, diarization)
+    speaker_text_pairs = [(blk["speaker"], blk["text"]) for blk in merged]
+    assert speaker_text_pairs == [("Speaker_0", "hi"), ("Speaker_1", "hello")]
+
+
+def test_long_silence_breaks_block_for_same_speaker():
+    """A long pause inside one speaker's turn is expected to start a new block."""
+    # The words sit 2.5s apart; a >1.5s gap between two words of the same
+    # speaker forces a new block.
+    diarization = [_t(0.0, 4.0, "Speaker_0")]
+    transcript = [_w(0.0, 0.5, "one"), _w(3.0, 3.5, "two")]
+    merged = _merge_words_with_speakers(transcript, diarization)
+    assert len(merged) == 2
+    texts = [blk["text"] for blk in merged]
+    assert texts == ["one", "two"]
+
+
+def test_punctuation_token_joins_without_leading_space():
+    """A bare "." token is expected to attach directly to the preceding word."""
+    transcript = [_w(0.0, 0.5, "hello"), _w(0.5, 0.7, ".")]
+    diarization = [_t(0.0, 2.0, "Speaker_0")]
+    first_block = _merge_words_with_speakers(transcript, diarization)[0]
+    assert first_block["text"] == "hello."
@@ -0,0 +1,67 @@
+"""build_launch_command: argument assembly + the shell-injection invariant.
+
+The security-critical property is that every user-controllable value (repo,
+vllm_args, knobs) is shlex-quoted at the sink, so `shlex.split` cleanly reverses
+the command back into the exact token list. The vLLM pre-flight validator
+(validate.py) depends on this round-trip — these tests lock it in.
+"""
+import shlex
+
+from app.models import Defaults, ModelDef, build_launch_command
+
+# Launch defaults passed to every build_launch_command call in this module;
+# the expected command strings below are written against port 8888 / host 0.0.0.0.
+DEFAULTS = Defaults(port=8888, host="0.0.0.0")
+
+
+def _model(**kw) -> ModelDef:
+    """Build a ModelDef from baseline solo-model fields; **kw overrides or extends them."""
+    fields = {
+        "display_name": "X",
+        "repo": "org/name",
+        "size_gb": 1.0,
+        "mode": "solo",
+        **kw,
+    }
+    return ModelDef(**fields)
+
+
+def test_solo_model_emits_solo_flag_and_ordered_args():
+    """A solo model yields the full command: --solo, defaults, then bundled args."""
+    solo = _model(vllm_args=["--max-model-len=1000"])
+    expected = (
+        "./launch-cluster.sh --solo -d exec vllm serve org/name "
+        "--port=8888 --host=0.0.0.0 --max-model-len=1000"
+    )
+    assert build_launch_command("k", solo, DEFAULTS) == expected
+
+
+def test_cluster_model_omits_solo_flag():
+    """A cluster-mode model must launch without the --solo flag."""
+    cmd = build_launch_command("k", _model(mode="cluster", vllm_args=["-tp=2"]), DEFAULTS)
+    # Check tokens rather than a space-padded substring, so a --solo at the very
+    # end of the command (no trailing space) cannot slip through.
+    assert "--solo" not in shlex.split(cmd)
+    assert cmd.startswith("./launch-cluster.sh -d exec vllm serve org/name")
+
+
+def test_knob_overrides_matching_bundled_flag():
+    """A knob must replace the bundled flag of the same name, not add a second one."""
+    # bundled arg sets max-model-len; the knob must win (single occurrence).
+    m = _model(vllm_args=["--max-model-len=1000"], knobs={"max_model_len": 65536})
+    tokens = shlex.split(build_launch_command("k", m, DEFAULTS))
+    # Collect every max-model-len token so a duplicate is caught, not just the
+    # absence of the bundled value.
+    max_len_flags = [tok for tok in tokens if tok.startswith("--max-model-len")]
+    assert max_len_flags == ["--max-model-len=65536"]
+
+
+def test_repo_with_shell_metacharacters_is_quoted_not_executed():
+    """A hostile repo string must come out of the builder as one quoted token."""
+    # build_launch_command quotes even a hostile repo (validate_repo guards the
+    # API boundary; this proves the sink itself is safe in depth).
+    hostile_repo = "org/name; rm -rf ~ #"
+    command = build_launch_command("k", _model(repo=hostile_repo), DEFAULTS)
+    # With the quoted form removed, no raw metacharacters may remain...
+    remainder = command.replace(shlex.quote(hostile_repo), "")
+    assert "; rm -rf" not in remainder
+    # ...and shlex.split must hand the repo back as a single literal token.
+    assert hostile_repo in shlex.split(command)
+
+
+def test_command_string_round_trips_through_shlex_split():
+    """Every bundled arg, including one with spaces, must reappear as a whole token."""
+    # The invariant validate.py relies on: every arg survives quote -> split intact.
+    bundled = ["--max-model-len=32768", "--load-format=fastsafetensors", "--note=a b c"]
+    command = build_launch_command("k", _model(vllm_args=bundled), DEFAULTS)
+    recovered = shlex.split(command)
+    lost = [arg for arg in bundled if arg not in recovered]
+    assert lost == []
+
+
+def test_injection_via_vllm_arg_stays_literal():
+    """A $(...) command substitution in a vLLM arg must survive as plain text."""
+    substitution = "--foo=$(touch /tmp/pwned)"
+    command = build_launch_command("k", _model(vllm_args=[substitution]), DEFAULTS)
+    tokens = shlex.split(command)
+    assert substitution in tokens  # preserved as one inert token