"""_merge_words_with_speakers + _assign_speaker_to_word: the transcript/diarizer merge that turns Parakeet words + Sortformer turns into speaker-labelled blocks. Pure functions, no cluster — this is the core of transcribe-with-speakers. """ from app.audio_proxy import _assign_speaker_to_word, _merge_words_with_speakers def _w(start, end, text): return {"start": start, "end": end, "text": text} def _t(start, end, speaker): return {"start_s": start, "end_s": end, "speaker": speaker} # ---- _assign_speaker_to_word ---- def test_assign_by_midpoint_containment(): turns = [_t(0.0, 2.0, "Speaker_0"), _t(2.0, 4.0, "Speaker_1")] assert _assign_speaker_to_word(2.4, 2.8, turns) == "Speaker_1" def test_assign_falls_back_to_max_overlap_when_midpoint_outside(): # midpoint 5.0 is in no turn; word span overlaps Speaker_0 more than Speaker_1. turns = [_t(0.0, 4.9, "Speaker_0"), _t(6.0, 8.0, "Speaker_1")] assert _assign_speaker_to_word(4.0, 6.0, turns) == "Speaker_0" def test_assign_unknown_when_no_overlap(): turns = [_t(0.0, 1.0, "Speaker_0")] assert _assign_speaker_to_word(10.0, 11.0, turns) == "Speaker_unknown" # ---- _merge_words_with_speakers ---- def test_empty_words_returns_empty(): assert _merge_words_with_speakers([], [_t(0, 1, "Speaker_0")]) == [] def test_consecutive_same_speaker_words_join_into_one_block(): words = [_w(0.0, 0.5, "good"), _w(0.5, 1.0, "morning")] turns = [_t(0.0, 2.0, "Speaker_0")] blocks = _merge_words_with_speakers(words, turns) assert blocks == [ {"start_ms": 0, "end_ms": 1000, "speaker": "Speaker_0", "text": "good morning"} ] def test_speaker_change_splits_blocks(): words = [_w(0.0, 1.0, "hi"), _w(2.1, 3.0, "hello")] turns = [_t(0.0, 2.0, "Speaker_0"), _t(2.0, 4.0, "Speaker_1")] blocks = _merge_words_with_speakers(words, turns) assert [b["speaker"] for b in blocks] == ["Speaker_0", "Speaker_1"] assert [b["text"] for b in blocks] == ["hi", "hello"] def test_long_silence_breaks_block_for_same_speaker(): # >1.5s gap between two words of the same speaker forces a new block. words = [_w(0.0, 0.5, "one"), _w(3.0, 3.5, "two")] turns = [_t(0.0, 4.0, "Speaker_0")] blocks = _merge_words_with_speakers(words, turns) assert len(blocks) == 2 assert [b["text"] for b in blocks] == ["one", "two"] def test_punctuation_token_joins_without_leading_space(): words = [_w(0.0, 0.5, "hello"), _w(0.5, 0.7, ".")] turns = [_t(0.0, 2.0, "Speaker_0")] assert _merge_words_with_speakers(words, turns)[0]["text"] == "hello."