Initial commit: Ten31 Signal Engine (ingest, scoring brain, corpus seeds)

This commit is contained in:
Keysat
2026-06-15 09:24:29 -05:00
commit a6aec77506
77 changed files with 6263 additions and 0 deletions
+45
View File
@@ -0,0 +1,45 @@
"""Speaker-name identification (§4.5 enhancement).
In a 1-on-1 interview the host introduces the guest by name at the top. Reading the transcript head
with the LLM, we attach a real NAME to each diarized speaker → voiceprints.person_label. This gives
the independence graph a SECOND, orthogonal overlap signal: the same NAMED guest across two shows is
a shared_guest edge even when the voiceprints don't cluster (different mic/codec/room). It complements
voiceprint cosine matching and is robust to fingerprint drift — exactly the case the operator flagged.
"""
from __future__ import annotations
import json
import logging
log = logging.getLogger(__name__)

# System prompt for the speaker-identification call. It pins the exact JSON shape the
# parser below expects: a top-level "speakers" object keyed by diarization label.
_SYS = (
    'You identify the speakers in a podcast/interview transcript. Each line is "LABEL: text". '
    "Using the introduction and context, determine each LABEL's real full name and role. In an "
    "interview the host normally introduces themselves and the guest within the first minute. Only "
    "assert a name you can actually support from the text — if you cannot tell, use null. "
    'Return ONLY JSON: {"speakers": {"<LABEL>": {"name": "Full Name" or null, '
    '"role": "host"|"guest"|"panelist"|"unknown", "confidence": "low"|"med"|"high"}}}.'
)


def _loads_lenient(raw: object) -> object:
    """Best-effort decode of a backend reply; return the parsed value or None.

    Accepts an already-parsed dict, a JSON string, or UTF-8 bytes. When the text as a
    whole is not valid JSON (e.g. the model wrapped it in prose or a code fence), the
    span from the first "{" to the last "}" is tried instead.
    """
    if isinstance(raw, dict):
        return raw
    if isinstance(raw, (bytes, bytearray)):
        raw = bytes(raw).decode("utf-8", errors="replace")
    if not isinstance(raw, str):
        return None
    try:
        return json.loads(raw)
    except (ValueError, RecursionError):
        pass  # fall through to the brace-slice retry
    start, end = raw.find("{"), raw.rfind("}")
    if start < 0 or end <= start:
        return None
    try:
        return json.loads(raw[start:end + 1])
    except (ValueError, RecursionError):
        return None


def identify_speakers(backend, transcript_head: str, *, source_name: str, host_hint: str | None = None) -> dict:
    """Ask the LLM to name each diarized speaker from the start of a transcript.

    Args:
        backend: Any extract.backends backend; must provide
            ``complete_json(messages, max_tokens=...)``. The reply is normally a JSON
            string; an already-parsed dict or bytes are tolerated as well.
        transcript_head: Beginning of the transcript, appended verbatim to the prompt.
        source_name: Show name, given to the model as context.
        host_hint: Optional name of the show's usual host, added to the context.

    Returns:
        ``{label: {name, role, confidence}}`` as produced by the model, or ``{}`` when
        the reply cannot be parsed or carries no "speakers" object. Parsing failures
        never raise; exceptions raised by the backend call itself propagate.
    """
    ctx = f"Show: {source_name}."
    if host_hint:
        ctx += f" The show's usual host is {host_hint}."
    ctx += "\n\nTRANSCRIPT (beginning):\n" + transcript_head
    messages = [{"role": "system", "content": _SYS}, {"role": "user", "content": ctx}]
    raw = backend.complete_json(messages, max_tokens=600)

    parsed = _loads_lenient(raw)
    if not isinstance(parsed, dict):
        # Best-effort feature: record the miss and degrade to "no names" rather than fail.
        log.warning("identify_speakers: no JSON object in backend output for %r", source_name)
        return {}
    speakers = parsed.get("speakers", {})
    return speakers if isinstance(speakers, dict) else {}