# File viewer metadata: 70 lines, 2.5 KiB, Python
"""_merge_words_with_speakers + _assign_speaker_to_word: the transcript/diarizer
merge that turns Parakeet words + Sortformer turns into speaker-labelled blocks.
Pure functions, no cluster — this is the core of transcribe-with-speakers.
"""
|
|
from app.audio_proxy import _assign_speaker_to_word, _merge_words_with_speakers
|
|
|
|
|
|
def _w(start, end, text):
|
|
return {"start": start, "end": end, "text": text}
|
|
|
|
|
|
def _t(start, end, speaker):
|
|
return {"start_s": start, "end_s": end, "speaker": speaker}
|
|
|
|
|
|
# ---- _assign_speaker_to_word ----
|
|
|
|
def test_assign_by_midpoint_containment() -> None:
    """A word whose midpoint lies inside a turn gets that turn's speaker."""
    turns = [_t(0.0, 2.0, "Speaker_0"), _t(2.0, 4.0, "Speaker_1")]
    # Word 2.4-2.8 has midpoint 2.6, which falls inside the second turn.
    assert _assign_speaker_to_word(2.4, 2.8, turns) == "Speaker_1"
|
|
|
|
|
|
def test_assign_falls_back_to_max_overlap_when_midpoint_outside() -> None:
    """With the midpoint in a gap, the turn with the largest overlap wins."""
    # midpoint 5.0 is in no turn; word span overlaps Speaker_0 more than Speaker_1.
    turns = [_t(0.0, 4.9, "Speaker_0"), _t(6.0, 8.0, "Speaker_1")]
    assert _assign_speaker_to_word(4.0, 6.0, turns) == "Speaker_0"
|
|
|
|
|
|
def test_assign_unknown_when_no_overlap() -> None:
    """A word that touches no turn at all is labelled ``Speaker_unknown``."""
    turns = [_t(0.0, 1.0, "Speaker_0")]
    # The word sits far outside the only turn, so nothing can claim it.
    assert _assign_speaker_to_word(10.0, 11.0, turns) == "Speaker_unknown"
|
|
|
|
|
|
# ---- _merge_words_with_speakers ----
|
|
|
|
def test_empty_words_returns_empty() -> None:
    """No words in means no blocks out, even when turns are present."""
    assert _merge_words_with_speakers([], [_t(0, 1, "Speaker_0")]) == []
|
|
|
|
|
|
def test_consecutive_same_speaker_words_join_into_one_block() -> None:
    """Back-to-back words from one speaker collapse into a single block.

    Also pins the block shape: ``start_ms``/``end_ms`` in milliseconds,
    the speaker label, and the words joined with a single space.
    """
    words = [_w(0.0, 0.5, "good"), _w(0.5, 1.0, "morning")]
    turns = [_t(0.0, 2.0, "Speaker_0")]
    blocks = _merge_words_with_speakers(words, turns)
    assert blocks == [
        {"start_ms": 0, "end_ms": 1000, "speaker": "Speaker_0", "text": "good morning"}
    ]
|
|
|
|
|
|
def test_speaker_change_splits_blocks() -> None:
    """A change of speaker between words starts a new block."""
    words = [_w(0.0, 1.0, "hi"), _w(2.1, 3.0, "hello")]
    turns = [_t(0.0, 2.0, "Speaker_0"), _t(2.0, 4.0, "Speaker_1")]
    blocks = _merge_words_with_speakers(words, turns)
    assert [b["speaker"] for b in blocks] == ["Speaker_0", "Speaker_1"]
    assert [b["text"] for b in blocks] == ["hi", "hello"]
|
|
|
|
|
|
def test_long_silence_breaks_block_for_same_speaker() -> None:
    """A long pause splits a block even when the speaker does not change."""
    # >1.5s gap between two words of the same speaker forces a new block.
    words = [_w(0.0, 0.5, "one"), _w(3.0, 3.5, "two")]
    turns = [_t(0.0, 4.0, "Speaker_0")]
    blocks = _merge_words_with_speakers(words, turns)
    assert len(blocks) == 2
    assert [b["text"] for b in blocks] == ["one", "two"]
|
|
|
|
|
|
def test_punctuation_token_joins_without_leading_space() -> None:
    """A punctuation-only token attaches to the previous word with no space."""
    words = [_w(0.0, 0.5, "hello"), _w(0.5, 0.7, ".")]
    turns = [_t(0.0, 2.0, "Speaker_0")]
    assert _merge_words_with_speakers(words, turns)[0]["text"] == "hello."
|