test: add offline pytest harness (build_launch_command injection, label-merge)
This commit is contained in:
@@ -0,0 +1,17 @@
|
||||
"""Shared pytest setup.
|
||||
|
||||
These suites are pure/offline — they exercise pure functions and never touch the
|
||||
Sparks, /data, or the network. We still pin the env vars the app modules expect
|
||||
(documented in docs/guides/fastapi-image.md) to tmp paths so importing them can
|
||||
never write to the container-only /data path.
|
||||
"""
|
||||
import os
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# Put the directory one level above this file's folder at the front of
# sys.path, so `import app...` resolves whether or not the package is
# pip-installed.
sys.path.insert(0, str(Path(__file__).resolve().parents[1]))

# setdefault leaves any value the caller already exported untouched; otherwise
# each variable is pointed at a throwaway file under /tmp.
for _env_name, _tmp_target in (
    ("REDACTION_MAP_DB", "/tmp/spark_control_test_maps.db"),
    ("CONNECTIVITY_LOG", "/tmp/spark_control_test_connectivity.json"),
    ("MODELS_OVERRIDES", "/tmp/spark_control_test_overrides.yaml"),
):
    os.environ.setdefault(_env_name, _tmp_target)
|
||||
@@ -0,0 +1,69 @@
|
||||
"""_merge_words_with_speakers + _assign_speaker_to_word: the transcript/diarizer
|
||||
merge that turns Parakeet words + Sortformer turns into speaker-labelled blocks.
|
||||
Pure functions, no cluster — this is the core of transcribe-with-speakers.
|
||||
"""
|
||||
from app.audio_proxy import _assign_speaker_to_word, _merge_words_with_speakers
|
||||
|
||||
|
||||
def _w(start, end, text):
|
||||
return {"start": start, "end": end, "text": text}
|
||||
|
||||
|
||||
def _t(start, end, speaker):
|
||||
return {"start_s": start, "end_s": end, "speaker": speaker}
|
||||
|
||||
|
||||
# ---- _assign_speaker_to_word ----
|
||||
|
||||
def test_assign_by_midpoint_containment():
    """A word whose midpoint (2.6) lies inside the second turn gets that turn's speaker."""
    diarizer_turns = [
        _t(0.0, 2.0, "Speaker_0"),
        _t(2.0, 4.0, "Speaker_1"),
    ]
    assigned = _assign_speaker_to_word(2.4, 2.8, diarizer_turns)
    assert assigned == "Speaker_1"
|
||||
|
||||
|
||||
def test_assign_falls_back_to_max_overlap_when_midpoint_outside():
    """With the midpoint in no turn, the turn overlapping the word most wins.

    The word spans 4.0-6.0, so its midpoint (5.0) falls in the gap between the
    two turns. Speaker_0 covers 0.9s of the word; Speaker_1 only begins at the
    instant the word ends.
    """
    diarizer_turns = [
        _t(0.0, 4.9, "Speaker_0"),
        _t(6.0, 8.0, "Speaker_1"),
    ]
    assigned = _assign_speaker_to_word(4.0, 6.0, diarizer_turns)
    assert assigned == "Speaker_0"
|
||||
|
||||
|
||||
def test_assign_unknown_when_no_overlap():
    """A word far outside every turn is labelled ``Speaker_unknown``."""
    only_turn = _t(0.0, 1.0, "Speaker_0")
    assigned = _assign_speaker_to_word(10.0, 11.0, [only_turn])
    assert assigned == "Speaker_unknown"
|
||||
|
||||
|
||||
# ---- _merge_words_with_speakers ----
|
||||
|
||||
def test_empty_words_returns_empty():
    """No words in means no blocks out, even when turns are present."""
    diarizer_turns = [_t(0, 1, "Speaker_0")]
    merged = _merge_words_with_speakers([], diarizer_turns)
    assert merged == []
|
||||
|
||||
|
||||
def test_consecutive_same_speaker_words_join_into_one_block():
    """Two back-to-back words inside one turn collapse into a single block.

    The expected block carries millisecond bounds (0-1000) and the words
    joined with one space.
    """
    expected_block = {
        "start_ms": 0,
        "end_ms": 1000,
        "speaker": "Speaker_0",
        "text": "good morning",
    }
    merged = _merge_words_with_speakers(
        [_w(0.0, 0.5, "good"), _w(0.5, 1.0, "morning")],
        [_t(0.0, 2.0, "Speaker_0")],
    )
    assert merged == [expected_block]
|
||||
|
||||
|
||||
def test_speaker_change_splits_blocks():
    """Words that land in different speakers' turns end up in separate blocks."""
    spoken = [_w(0.0, 1.0, "hi"), _w(2.1, 3.0, "hello")]
    diarizer_turns = [
        _t(0.0, 2.0, "Speaker_0"),
        _t(2.0, 4.0, "Speaker_1"),
    ]
    merged = _merge_words_with_speakers(spoken, diarizer_turns)
    # Compare speaker and text pairwise so the block order is pinned as well.
    labelled = [(blk["speaker"], blk["text"]) for blk in merged]
    assert labelled == [("Speaker_0", "hi"), ("Speaker_1", "hello")]
|
||||
|
||||
|
||||
def test_long_silence_breaks_block_for_same_speaker():
    """A long pause splits one speaker's words into two blocks.

    The words are 2.5s apart, which exceeds the 1.5s gap that forces a new
    block, even though a single turn covers both.
    """
    spoken = [_w(0.0, 0.5, "one"), _w(3.0, 3.5, "two")]
    single_turn = [_t(0.0, 4.0, "Speaker_0")]
    merged = _merge_words_with_speakers(spoken, single_turn)
    texts = [blk["text"] for blk in merged]
    assert len(merged) == 2
    assert texts == ["one", "two"]
|
||||
|
||||
|
||||
def test_punctuation_token_joins_without_leading_space():
    """A standalone "." token attaches directly to the preceding word."""
    spoken = [_w(0.0, 0.5, "hello"), _w(0.5, 0.7, ".")]
    single_turn = [_t(0.0, 2.0, "Speaker_0")]
    first_block = _merge_words_with_speakers(spoken, single_turn)[0]
    assert first_block["text"] == "hello."
|
||||
@@ -0,0 +1,67 @@
|
||||
"""build_launch_command: argument assembly + the shell-injection invariant.
|
||||
|
||||
The security-critical property is that every user-controllable value (repo,
|
||||
vllm_args, knobs) is shlex-quoted at the sink, so `shlex.split` cleanly reverses
|
||||
the command back into the exact token list. The vLLM pre-flight validator
|
||||
(validate.py) depends on this round-trip — these tests lock it in.
|
||||
"""
|
||||
import shlex
|
||||
|
||||
from app.models import Defaults, ModelDef, build_launch_command
|
||||
|
||||
# Launch defaults shared by every test in this module. The expected command in
# test_solo_model_emits_solo_flag_and_ordered_args shows them rendered as
# `--port=8888 --host=0.0.0.0`.
DEFAULTS = Defaults(port=8888, host="0.0.0.0")
|
||||
|
||||
|
||||
def _model(**kw) -> ModelDef:
    """Build a ModelDef from a minimal solo baseline; ``kw`` overrides or adds fields."""
    fields = {
        "display_name": "X",
        "repo": "org/name",
        "size_gb": 1.0,
        "mode": "solo",
        **kw,
    }
    return ModelDef(**fields)
|
||||
|
||||
|
||||
def test_solo_model_emits_solo_flag_and_ordered_args():
    """A solo model yields the exact command: --solo, then port/host, then vllm args."""
    expected = (
        "./launch-cluster.sh --solo -d exec vllm serve org/name "
        "--port=8888 --host=0.0.0.0 --max-model-len=1000"
    )
    solo_model = _model(vllm_args=["--max-model-len=1000"])
    assert build_launch_command("k", solo_model, DEFAULTS) == expected
|
||||
|
||||
|
||||
def test_cluster_model_omits_solo_flag():
    """A ``mode="cluster"`` model must launch without the ``--solo`` flag.

    The flag is looked up in the parsed token list rather than as a
    space-delimited substring, so a ``--solo`` emitted as the final token
    (with no trailing space) is caught as well.
    """
    cmd = build_launch_command("k", _model(mode="cluster", vllm_args=["-tp=2"]), DEFAULTS)
    assert "--solo" not in shlex.split(cmd)
    assert cmd.startswith("./launch-cluster.sh -d exec vllm serve org/name")
|
||||
|
||||
|
||||
def test_knob_overrides_matching_bundled_flag():
    """A knob replaces the bundled flag of the same name rather than adding to it.

    The bundled arg sets ``--max-model-len=1000`` and the knob sets 65536. The
    knob value must appear, the bundled value must not, and the flag must
    occur exactly once in the tokenized command.
    """
    m = _model(vllm_args=["--max-model-len=1000"], knobs={"max_model_len": 65536})
    cmd = build_launch_command("k", m, DEFAULTS)
    assert "--max-model-len=65536" in cmd
    assert "--max-model-len=1000" not in cmd
    # Enforce the "single occurrence" requirement: a duplicated knob flag
    # would satisfy the two substring checks above.
    occurrences = [tok for tok in shlex.split(cmd) if tok.startswith("--max-model-len")]
    assert occurrences == ["--max-model-len=65536"]
|
||||
|
||||
|
||||
def test_repo_with_shell_metacharacters_is_quoted_not_executed():
    """A hostile repo string is emitted quoted and parses back as one token.

    validate_repo guards the API boundary; this test shows the command builder
    itself also quotes the value (defence in depth).
    """
    hostile_repo = "org/name; rm -rf ~ #"
    cmd = build_launch_command("k", _model(repo=hostile_repo), DEFAULTS)

    # Strip the properly quoted form; nothing resembling the payload may remain.
    remainder = cmd.replace(shlex.quote(hostile_repo), "")
    assert "; rm -rf" not in remainder

    # shlex.split must hand the repo back as a single literal token.
    assert hostile_repo in shlex.split(cmd)
|
||||
|
||||
|
||||
def test_command_string_round_trips_through_shlex_split():
    """Every vllm arg, including one containing spaces, survives quote -> split.

    This is the invariant validate.py relies on.
    """
    extra_args = [
        "--max-model-len=32768",
        "--load-format=fastsafetensors",
        "--note=a b c",
    ]
    recovered = shlex.split(build_launch_command("k", _model(vllm_args=extra_args), DEFAULTS))
    lost = [arg for arg in extra_args if arg not in recovered]
    assert not lost
|
||||
|
||||
|
||||
def test_injection_via_vllm_arg_stays_literal():
    """A ``$(...)`` command substitution in a vllm arg comes back as one inert token."""
    payload = "--foo=$(touch /tmp/pwned)"
    model_with_payload = _model(vllm_args=[payload])
    recovered = shlex.split(build_launch_command("k", model_with_payload, DEFAULTS))
    assert payload in recovered
|
||||
Reference in New Issue
Block a user