Initial commit: Ten31 Signal Engine (ingest, scoring brain, corpus seeds)
This commit is contained in:
@@ -0,0 +1,45 @@
|
||||
"""Speaker-name identification (§4.5 enhancement).
|
||||
|
||||
In a 1-on-1 interview the host introduces the guest by name at the top. Reading the transcript head
with the LLM, we attach a real NAME to each diarized speaker → voiceprints.person_label. This gives
the independence graph a SECOND, orthogonal overlap signal: the same NAMED guest across two shows is
a shared_guest edge even when the voiceprints don't cluster (different mic/codec/room). It complements
voiceprint cosine matching and is robust to fingerprint drift — exactly the case the operator flagged.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import logging
|
||||
|
||||
log = logging.getLogger(__name__)


# System prompt for the speaker-identification call. It pins the reply to a single JSON object
# keyed by diarization LABEL and tells the model to answer null rather than guess a name, so a
# wrong name is never asserted on weak evidence.
_SYS = (
    'You identify the speakers in a podcast/interview transcript. Each line is "LABEL: text". '
    "Using the introduction and context, determine each LABEL's real full name and role. In an "
    "interview the host normally introduces themselves and the guest within the first minute. Only "
    "assert a name you can actually support from the text — if you cannot tell, use null. "
    'Return ONLY JSON: {"speakers": {"<LABEL>": {"name": "Full Name" or null, '
    '"role": "host"|"guest"|"panelist"|"unknown", "confidence": "low"|"med"|"high"}}}.'
)
|
||||
|
||||
|
||||
def identify_speakers(
    backend,
    transcript_head: str,
    *,
    source_name: str,
    host_hint: str | None = None,
) -> dict:
    """Ask the LLM backend to name each diarized speaker in the transcript head.

    Args:
        backend: Any extract.backends backend; only ``complete_json(messages, max_tokens=...)``
            is called on it.
        transcript_head: Beginning of the transcript (the system prompt expects one
            "LABEL: text" line per utterance).
        source_name: Show name, passed to the model as context.
        host_hint: Optional name of the show's usual host, passed as extra context.

    Returns:
        The ``speakers`` mapping from the model's reply, ``{label: {name, role, confidence}}``,
        or ``{}`` when the reply cannot be parsed or does not have the expected shape. This is
        best-effort: parse failures are logged, never raised.
    """
    ctx = f"Show: {source_name}."
    if host_hint:
        ctx += f" The show's usual host is {host_hint}."
    ctx += "\n\nTRANSCRIPT (beginning):\n" + transcript_head
    messages = [{"role": "system", "content": _SYS}, {"role": "user", "content": ctx}]
    raw = backend.complete_json(messages, max_tokens=600)

    # json.loads raises TypeError for non-text input, ValueError (JSONDecodeError /
    # UnicodeDecodeError) for malformed text, and RecursionError for absurdly deep nesting.
    parse_errors = (TypeError, ValueError, RecursionError)

    if isinstance(raw, dict):
        # Tolerate a backend that hands back an already-decoded object.
        obj = raw
    else:
        try:
            obj = json.loads(raw)
        except parse_errors:
            if not isinstance(raw, str):
                # Nothing to salvage from None / non-text replies.
                log.warning(
                    "speaker-id: backend returned unusable %s for %s",
                    type(raw).__name__, source_name,
                )
                return {}
            # Models often wrap the JSON in prose or code fences; retry on the outermost
            # {...} span before giving up.
            start, end = raw.find("{"), raw.rfind("}")
            if start < 0 or end < start:
                log.warning("speaker-id: no JSON object in reply for %s", source_name)
                return {}
            try:
                obj = json.loads(raw[start:end + 1])
            except parse_errors:
                log.warning("speaker-id: unparseable JSON in reply for %s", source_name)
                return {}

    speakers = obj.get("speakers") if isinstance(obj, dict) else None
    return speakers if isinstance(speakers, dict) else {}
|
||||
Reference in New Issue
Block a user