test: add offline pytest harness (build_launch_command injection, label-merge)

This commit is contained in:
Keysat
2026-06-15 17:24:49 -05:00
parent 17a9973ba2
commit 6238ac88f7
6 changed files with 164 additions and 1 deletions
+17
View File
@@ -0,0 +1,17 @@
"""Shared pytest setup.
These suites are pure/offline — they exercise pure functions and never touch the
Sparks, /data, or the network. We still default the env vars the app modules
expect (documented in docs/guides/fastapi-image.md) to tmp paths, so importing
them does not write to the container-only /data path. Values already present in
the environment are left untouched (os.environ.setdefault).
"""
import os
import sys
from pathlib import Path
# Put the directory two levels above this file at the front of sys.path so that
# `import app...` resolves whether or not the package is pip-installed.
_REPO_ROOT = Path(__file__).resolve().parents[1]
sys.path.insert(0, str(_REPO_ROOT))

# Default (never override) the storage locations the app modules read from the
# environment, pointing each of them at a throwaway file under /tmp.
for _var, _tmp_path in (
    ("REDACTION_MAP_DB", "/tmp/spark_control_test_maps.db"),
    ("CONNECTIVITY_LOG", "/tmp/spark_control_test_connectivity.json"),
    ("MODELS_OVERRIDES", "/tmp/spark_control_test_overrides.yaml"),
):
    os.environ.setdefault(_var, _tmp_path)
+69
View File
@@ -0,0 +1,69 @@
"""_merge_words_with_speakers + _assign_speaker_to_word: the transcript/diarizer
merge that turns Parakeet words + Sortformer turns into speaker-labelled blocks.
Pure functions, no cluster — this is the core of transcribe-with-speakers.
"""
from app.audio_proxy import _assign_speaker_to_word, _merge_words_with_speakers
def _w(start, end, text):
    """Build a word record with ``start``, ``end`` and ``text`` keys.

    The timestamps are presumably seconds (the merged blocks report
    milliseconds) — confirm against app.audio_proxy.
    """
    word = dict(start=start, end=end, text=text)
    return word
def _t(start, end, speaker):
    """Build a diarizer turn record with ``start_s``, ``end_s`` and ``speaker`` keys."""
    turn = dict(start_s=start, end_s=end, speaker=speaker)
    return turn
# ---- _assign_speaker_to_word ----
def test_assign_by_midpoint_containment():
    """A word whose midpoint (2.6) lies inside a turn gets that turn's speaker."""
    first_turn = _t(0.0, 2.0, "Speaker_0")
    second_turn = _t(2.0, 4.0, "Speaker_1")
    speaker = _assign_speaker_to_word(2.4, 2.8, [first_turn, second_turn])
    assert speaker == "Speaker_1"
def test_assign_falls_back_to_max_overlap_when_midpoint_outside():
    """When the word midpoint is in no turn, the turn with the largest overlap wins.

    Two scenarios are checked: one where only a single turn overlaps the word,
    and one where both turns overlap so that the *maximum* is actually decisive.
    """
    # Midpoint 5.0 is in no turn; only Speaker_0 overlaps the word span (0.9s),
    # Speaker_1 starts exactly where the word ends.
    turns = [_t(0.0, 4.9, "Speaker_0"), _t(6.0, 8.0, "Speaker_1")]
    assert _assign_speaker_to_word(4.0, 6.0, turns) == "Speaker_0"

    # Midpoint 5.0 still falls in the gap (4.5..5.1), but now both turns overlap
    # the word: Speaker_0 by 0.5s, Speaker_1 by 0.9s. A "first overlapping turn"
    # strategy would pick Speaker_0; max-overlap must pick Speaker_1.
    competing = [_t(0.0, 4.5, "Speaker_0"), _t(5.1, 8.0, "Speaker_1")]
    assert _assign_speaker_to_word(4.0, 6.0, competing) == "Speaker_1"
def test_assign_unknown_when_no_overlap():
    """A word that overlaps no turn at all is labelled ``Speaker_unknown``."""
    only_turn = _t(0.0, 1.0, "Speaker_0")
    label = _assign_speaker_to_word(10.0, 11.0, [only_turn])
    assert label == "Speaker_unknown"
# ---- _merge_words_with_speakers ----
def test_empty_words_returns_empty():
    """With no words to merge, the result is an empty list even if turns exist."""
    turns = [_t(0, 1, "Speaker_0")]
    merged = _merge_words_with_speakers([], turns)
    assert merged == []
def test_consecutive_same_speaker_words_join_into_one_block():
    """Adjacent words inside one speaker turn collapse into a single block."""
    turns = [_t(0.0, 2.0, "Speaker_0")]
    words = [_w(0.0, 0.5, "good"), _w(0.5, 1.0, "morning")]

    # The one expected block spans both words and joins their text with a space.
    expected_block = {
        "start_ms": 0,
        "end_ms": 1000,
        "speaker": "Speaker_0",
        "text": "good morning",
    }
    assert _merge_words_with_speakers(words, turns) == [expected_block]
def test_speaker_change_splits_blocks():
    """Words attributed to different speakers end up in separate blocks."""
    turns = [_t(0.0, 2.0, "Speaker_0"), _t(2.0, 4.0, "Speaker_1")]
    words = [_w(0.0, 1.0, "hi"), _w(2.1, 3.0, "hello")]

    blocks = _merge_words_with_speakers(words, turns)

    speakers = [block["speaker"] for block in blocks]
    assert speakers == ["Speaker_0", "Speaker_1"]
    texts = [block["text"] for block in blocks]
    assert texts == ["hi", "hello"]
def test_long_silence_breaks_block_for_same_speaker():
    """A long pause splits a single speaker's words into two blocks."""
    # The two words sit 2.5s apart, beyond the >1.5s gap that forces a new
    # block even though the speaker does not change.
    turns = [_t(0.0, 4.0, "Speaker_0")]
    words = [_w(0.0, 0.5, "one"), _w(3.0, 3.5, "two")]

    blocks = _merge_words_with_speakers(words, turns)

    assert len(blocks) == 2
    texts = [block["text"] for block in blocks]
    assert texts == ["one", "two"]
def test_punctuation_token_joins_without_leading_space():
    """A punctuation-only token is glued to the preceding word with no space."""
    turns = [_t(0.0, 2.0, "Speaker_0")]
    words = [_w(0.0, 0.5, "hello"), _w(0.5, 0.7, ".")]
    first_block = _merge_words_with_speakers(words, turns)[0]
    assert first_block["text"] == "hello."
+67
View File
@@ -0,0 +1,67 @@
"""build_launch_command: argument assembly + the shell-injection invariant.
The security-critical property is that every user-controllable value (repo,
vllm_args, knobs) is shlex-quoted at the sink, so `shlex.split` cleanly reverses
the command back into the exact token list. The vLLM pre-flight validator
(validate.py) depends on this round-trip — these tests lock it in.
"""
import shlex
from app.models import Defaults, ModelDef, build_launch_command
# Launch defaults shared by every test below; the expected command strings
# contain the resulting --port=8888 and --host=0.0.0.0 flags.
DEFAULTS = Defaults(host="0.0.0.0", port=8888)
def _model(**kw) -> ModelDef:
    """Return a minimal solo ``ModelDef``; keyword arguments override or extend its fields."""
    fields = {
        "display_name": "X",
        "repo": "org/name",
        "size_gb": 1.0,
        "mode": "solo",
        **kw,
    }
    return ModelDef(**fields)
def test_solo_model_emits_solo_flag_and_ordered_args():
    """A solo model yields the exact command: --solo, then defaults, then bundled args."""
    model = _model(vllm_args=["--max-model-len=1000"])
    expected = (
        "./launch-cluster.sh --solo -d exec vllm serve org/name"
        " --port=8888 --host=0.0.0.0 --max-model-len=1000"
    )
    assert build_launch_command("k", model, DEFAULTS) == expected
def test_cluster_model_omits_solo_flag():
    """A cluster-mode model is launched without the --solo flag."""
    model = _model(mode="cluster", vllm_args=["-tp=2"])
    cmd = build_launch_command("k", model, DEFAULTS)
    assert " --solo " not in cmd
    expected_prefix = "./launch-cluster.sh -d exec vllm serve org/name"
    assert cmd.startswith(expected_prefix)
def test_knob_overrides_matching_bundled_flag():
    """A knob replaces the bundled flag of the same name instead of adding a duplicate."""
    # The bundled arg sets max-model-len; the knob must win (single occurrence).
    m = _model(vllm_args=["--max-model-len=1000"], knobs={"max_model_len": 65536})
    cmd = build_launch_command("k", m, DEFAULTS)
    assert "--max-model-len=65536" in cmd
    assert "--max-model-len=1000" not in cmd
    # Enforce the "single occurrence" promise at token level: exactly one
    # max-model-len flag survives, and it carries the knob's value.
    flags = [tok for tok in shlex.split(cmd) if tok.startswith("--max-model-len")]
    assert flags == ["--max-model-len=65536"]
def test_repo_with_shell_metacharacters_is_quoted_not_executed():
    """A hostile repo string is emitted as a single shell-quoted token."""
    # build_launch_command quotes even a hostile repo (validate_repo guards the
    # API boundary; this proves the sink itself is safe in depth).
    evil = "org/name; rm -rf ~ #"
    cmd = build_launch_command("k", _model(repo=evil), DEFAULTS)

    # Once the properly quoted form is removed, no raw metacharacters may remain.
    without_quoted_repo = cmd.replace(shlex.quote(evil), "")
    assert "; rm -rf" not in without_quoted_repo

    # Splitting the command must hand the repo back as one literal token.
    assert evil in shlex.split(cmd)
def test_command_string_round_trips_through_shlex_split():
    """Every vllm arg, including one with spaces, survives quote -> split intact."""
    # This is the invariant validate.py relies on.
    args = ["--max-model-len=32768", "--load-format=fastsafetensors", "--note=a b c"]
    model = _model(vllm_args=args)
    tokens = shlex.split(build_launch_command("k", model, DEFAULTS))
    lost = [arg for arg in args if arg not in tokens]
    assert not lost
def test_injection_via_vllm_arg_stays_literal():
    """A command-substitution payload in vllm_args comes back as one inert token."""
    payload = "--foo=$(touch /tmp/pwned)"
    model = _model(vllm_args=[payload])
    tokens = shlex.split(build_launch_command("k", model, DEFAULTS))
    assert payload in tokens  # preserved as one inert token