Files
spark-control/image/tests/test_label_merge.py
T

70 lines
2.5 KiB
Python

"""_merge_words_with_speakers + _assign_speaker_to_word: the transcript/diarizer
merge that turns Parakeet words + Sortformer turns into speaker-labelled blocks.
Pure functions, no cluster — this is the core of transcribe-with-speakers.
"""
from app.audio_proxy import _assign_speaker_to_word, _merge_words_with_speakers
def _w(start, end, text):
return {"start": start, "end": end, "text": text}
def _t(start, end, speaker):
return {"start_s": start, "end_s": end, "speaker": speaker}
# ---- _assign_speaker_to_word ----
def test_assign_by_midpoint_containment():
    """A word whose midpoint lies inside a turn is given that turn's speaker."""
    first_turn = _t(0.0, 2.0, "Speaker_0")
    second_turn = _t(2.0, 4.0, "Speaker_1")
    # Word 2.4-2.8 has its midpoint (2.6) inside the second turn.
    speaker = _assign_speaker_to_word(2.4, 2.8, [first_turn, second_turn])
    assert speaker == "Speaker_1"
def test_assign_falls_back_to_max_overlap_when_midpoint_outside():
    """With the midpoint in a gap between turns, the larger overlap decides."""
    # Word 4.0-6.0 has midpoint 5.0, which sits in the gap between the turns.
    # Its span overlaps Speaker_0 by 0.9s and only touches Speaker_1 at 6.0.
    diarized = [
        _t(0.0, 4.9, "Speaker_0"),
        _t(6.0, 8.0, "Speaker_1"),
    ]
    picked = _assign_speaker_to_word(4.0, 6.0, diarized)
    assert picked == "Speaker_0"
def test_assign_unknown_when_no_overlap():
    """A word that overlaps no turn at all is labelled ``Speaker_unknown``."""
    only_turn = _t(0.0, 1.0, "Speaker_0")
    # Word 10.0-11.0 lies well past the end of the only turn.
    label = _assign_speaker_to_word(10.0, 11.0, [only_turn])
    assert label == "Speaker_unknown"
# ---- _merge_words_with_speakers ----
def test_empty_words_returns_empty():
    """An empty word list yields an empty block list, even with turns given."""
    available_turns = [_t(0, 1, "Speaker_0")]
    merged = _merge_words_with_speakers([], available_turns)
    assert merged == []
def test_consecutive_same_speaker_words_join_into_one_block():
    """Back-to-back words from one speaker collapse into a single block."""
    merged = _merge_words_with_speakers(
        [_w(0.0, 0.5, "good"), _w(0.5, 1.0, "morning")],
        [_t(0.0, 2.0, "Speaker_0")],
    )
    # The block spans both words, with times reported in milliseconds and the
    # word texts joined by a single space.
    expected_block = {
        "start_ms": 0,
        "end_ms": 1000,
        "speaker": "Speaker_0",
        "text": "good morning",
    }
    assert merged == [expected_block]
def test_speaker_change_splits_blocks():
    """Words that fall in different speakers' turns end up in separate blocks."""
    turns = [_t(0.0, 2.0, "Speaker_0"), _t(2.0, 4.0, "Speaker_1")]
    words = [_w(0.0, 1.0, "hi"), _w(2.1, 3.0, "hello")]
    result = _merge_words_with_speakers(words, turns)

    speakers = [entry["speaker"] for entry in result]
    assert speakers == ["Speaker_0", "Speaker_1"]

    texts = [entry["text"] for entry in result]
    assert texts == ["hi", "hello"]
def test_long_silence_breaks_block_for_same_speaker():
    """A long pause splits one speaker's words into separate blocks."""
    # A gap of more than 1.5s between two words of the same speaker forces a
    # new block; here "one" ends at 0.5 and "two" starts at 3.0.
    single_turn = [_t(0.0, 4.0, "Speaker_0")]
    spaced_words = [_w(0.0, 0.5, "one"), _w(3.0, 3.5, "two")]
    result = _merge_words_with_speakers(spaced_words, single_turn)
    assert len(result) == 2
    assert [entry["text"] for entry in result] == ["one", "two"]
def test_punctuation_token_joins_without_leading_space():
    """A punctuation-only token attaches directly to the preceding word."""
    tokens = [_w(0.0, 0.5, "hello"), _w(0.5, 0.7, ".")]
    single_turn = [_t(0.0, 2.0, "Speaker_0")]
    first_block = _merge_words_with_speakers(tokens, single_turn)[0]
    assert first_block["text"] == "hello."