# ten31-signal-engine/signal_engine/ingest/identify.py

"""Speaker-name identification (§4.5 enhancement).

In a 1-on-1 interview the host introduces the guest by name at the top. Reading the transcript head
with the LLM, we attach a real NAME to each diarized speaker → voiceprints.person_label. This gives
the independence graph a SECOND, orthogonal overlap signal: the same NAMED guest across two shows is
a shared_guest edge even when the voiceprints don't cluster (different mic/codec/room). It complements
voiceprint cosine matching and is robust to fingerprint drift — exactly the case the operator flagged.
"""
from __future__ import annotations

import json
import logging

# Module-level logger, named after this module.
log = logging.getLogger(__name__)

# System prompt for the speaker-identification request. It tells the model that every transcript
# line has the form "LABEL: text", asks for each label's real full name and role, instructs it to
# answer null rather than guess, and pins the reply to one JSON object: a "speakers" mapping keyed
# by label whose values carry "name", "role" and "confidence".
_SYS = (
    'You identify the speakers in a podcast/interview transcript. Each line is "LABEL: text". '
    "Using the introduction and context, determine each LABEL's real full name and role. In an "
    "interview the host normally introduces themselves and the guest within the first minute. Only "
    "assert a name you can actually support from the text — if you cannot tell, use null. "
    'Return ONLY JSON: {"speakers": {"<LABEL>": {"name": "Full Name" or null, '
    '"role": "host"|"guest"|"panelist"|"unknown", "confidence": "low"|"med"|"high"}}}.'
)


def _decode_reply(raw):
    """Best-effort decode of a backend reply; return the parsed object, or None if nothing parses.

    An already-parsed dict is returned as-is. Text (str/bytes/bytearray) is parsed as JSON; if that
    fails, the span from the first "{" to the last "}" is tried, which recovers JSON wrapped in
    extra prose or a Markdown code fence. Any other input type yields None.
    """
    if isinstance(raw, dict):
        return raw
    if not isinstance(raw, (str, bytes, bytearray)):
        return None
    try:
        return json.loads(raw)
    except (ValueError, RecursionError):
        # JSONDecodeError and UnicodeDecodeError are both ValueError subclasses.
        pass  # not pure JSON — fall through to the brace-slice salvage below
    if not isinstance(raw, str):
        # str.find/rfind with str needles need text, so decode bytes before slicing.
        raw = bytes(raw).decode("utf-8", errors="replace")
    start, end = raw.find("{"), raw.rfind("}")
    if start < 0 or end < start:
        return None
    try:
        return json.loads(raw[start:end + 1])
    except (ValueError, RecursionError):
        return None


def identify_speakers(backend, transcript_head: str, *, source_name: str, host_hint: str | None = None) -> dict:
    """Ask the LLM to attach a real name and role to each diarized speaker label.

    Args:
        backend: Object exposing ``complete_json(messages, max_tokens=...)`` (any extract.backends
            backend). Its reply may be JSON text, JSON text with surrounding noise, or a dict.
        transcript_head: Beginning of the transcript; it is appended verbatim to the user message.
        source_name: Show name, included in the user message as context.
        host_hint: Optional name of the show's usual host; mentioned to the model when truthy.

    Returns:
        The reply's ``"speakers"`` mapping, expected to look like ``{label: {name, role,
        confidence}}``. The per-label entries are passed through unvalidated. Returns ``{}`` when
        the reply cannot be decoded, is not a JSON object, or has no dict under ``"speakers"``.

    Exceptions raised by ``backend.complete_json`` itself are not caught here.
    """
    ctx = f"Show: {source_name}."
    if host_hint:
        ctx += f" The show's usual host is {host_hint}."
    ctx += "\n\nTRANSCRIPT (beginning):\n" + transcript_head
    messages = [{"role": "system", "content": _SYS}, {"role": "user", "content": ctx}]
    raw = backend.complete_json(messages, max_tokens=600)

    parsed = _decode_reply(raw)
    if not isinstance(parsed, dict):
        # Stay best-effort (empty result) but leave a trace instead of failing silently.
        log.warning("identify_speakers: no usable JSON object in LLM reply for %r", source_name)
        return {}
    speakers = parsed.get("speakers", {})
    return speakers if isinstance(speakers, dict) else {}