Initial commit: Ten31 Signal Engine (ingest, scoring brain, corpus seeds)
This commit is contained in:
@@ -0,0 +1,308 @@
|
||||
"""Audio → speaker-attributed transcript + voiceprint library (§4.1, §4.5).
|
||||
|
||||
Per chunk (sequential — audio lock): diarize-chunk (192-d TitaNet fingerprints + timed speaker
|
||||
segments) + transcribe (word timestamps). Align words to speakers by time, stitch speakers ACROSS
|
||||
chunks by fingerprint cosine, then match the persisted voiceprint library so the SAME guest is
|
||||
recognized ACROSS shows by voice — the highest-leverage input to the source-independence graph.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import time
|
||||
import uuid
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
|
||||
from ..backfill import queue
|
||||
from .chunker import chunk_audio
|
||||
from .download import download_enclosure, download_youtube_audio, to_wav_16k_mono
|
||||
from .speaker_stitch import DISTANCE_THRESHOLD, match_library, stitch_chunks
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# ---------- alignment ----------
|
||||
def _speaker_at(segments: list[dict], t: float) -> str:
|
||||
for s in segments:
|
||||
if s["start_s"] <= t <= s["end_s"]:
|
||||
return s["speaker"]
|
||||
if not segments:
|
||||
return "Speaker_0"
|
||||
return min(segments, key=lambda s: min(abs(s["start_s"] - t), abs(s["end_s"] - t)))["speaker"]
|
||||
|
||||
|
||||
def align_words(words: list[dict], segments: list[dict]) -> list[dict]:
|
||||
"""Group word-level transcription into speaker turns using the diarization segments."""
|
||||
turns: list[dict] = []
|
||||
cur: dict | None = None
|
||||
for w in words:
|
||||
mid = (w["start"] + w["end"]) / 2
|
||||
spk = _speaker_at(segments, mid)
|
||||
if cur and cur["speaker"] == spk:
|
||||
cur["text"] += " " + w["text"]
|
||||
cur["end"] = w["end"]
|
||||
else:
|
||||
if cur:
|
||||
turns.append(cur)
|
||||
cur = {"speaker": spk, "start": w["start"], "end": w["end"], "text": w["text"]}
|
||||
if cur:
|
||||
turns.append(cur)
|
||||
return turns
|
||||
|
||||
|
||||
# ---------- per-document audio processing ----------
|
||||
def diarize_transcribe_chunks(sc, chunks: list[Path], *, concurrency: int = 2):
|
||||
"""Returns (chunk_turns, chunk_speakers): turns per chunk + (chunk_idx, local_spk, fingerprint).
|
||||
|
||||
Drives up to `concurrency` chunks in flight — the client's global audio SEMAPHORE is the hard cap
|
||||
across both parakeet endpoints (sit at 2: keeps the single serial GPU continuously fed = full
|
||||
throughput, no idle gap). A single chunk's failure is non-fatal (skip; the client already busy-
|
||||
retries transient blips), but if a MAJORITY of chunks fail the whole job raises so it retries later
|
||||
(rather than emitting a half-empty transcript). Results are reassembled in chunk order."""
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
|
||||
def _one(idx: int, ch: Path):
|
||||
dia = sc.diarize_chunk(str(ch))
|
||||
tr = sc.transcribe(str(ch))
|
||||
turns = align_words(tr.get("words", []), dia.get("segments", []))
|
||||
spks = [(idx, spk, np.asarray(vec, dtype=np.float32))
|
||||
for spk, vec in (dia.get("fingerprints") or {}).items()]
|
||||
return idx, turns, spks
|
||||
|
||||
results: dict[int, tuple] = {}
|
||||
failed = 0
|
||||
with ThreadPoolExecutor(max_workers=max(1, concurrency)) as pool:
|
||||
futs = {pool.submit(_one, i, ch): i for i, ch in enumerate(chunks)}
|
||||
for fut in as_completed(futs):
|
||||
try:
|
||||
idx, turns, spks = fut.result()
|
||||
results[idx] = (turns, spks)
|
||||
except Exception as e: # noqa: BLE001 — one contended chunk shouldn't kill the episode
|
||||
failed += 1
|
||||
log.warning("chunk %d/%d failed (%s) — skipping", futs[fut], len(chunks), str(e)[:90])
|
||||
if chunks and failed >= max(3, len(chunks) // 2):
|
||||
raise RuntimeError(f"{failed}/{len(chunks)} chunks failed — backend contended; will retry later")
|
||||
chunk_turns = [(idx, results[idx][0]) for idx in sorted(results)]
|
||||
chunk_speakers = [s for idx in sorted(results) for s in results[idx][1]]
|
||||
return chunk_turns, chunk_speakers
|
||||
|
||||
|
||||
def stitch_and_centroids(chunk_speakers, *, threshold: float = DISTANCE_THRESHOLD):
|
||||
"""Cluster all (chunk,speaker) fingerprints into within-episode global speakers."""
|
||||
if not chunk_speakers:
|
||||
return {}, {}
|
||||
vecs = [v for (_, _, v) in chunk_speakers]
|
||||
labels = stitch_chunks(vecs, threshold=threshold)
|
||||
keymap: dict[tuple[int, str], int] = {}
|
||||
groups: dict[int, list[np.ndarray]] = {}
|
||||
for (idx, spk, vec), lab in zip(chunk_speakers, labels):
|
||||
keymap[(idx, spk)] = lab
|
||||
groups.setdefault(lab, []).append(vec)
|
||||
centroids = {lab: np.mean(v, axis=0) for lab, v in groups.items()}
|
||||
return keymap, centroids
|
||||
|
||||
|
||||
def _load_library(conn) -> list[tuple[str, np.ndarray]]:
|
||||
rows = conn.execute("SELECT voiceprint_id, vector, person_label FROM voiceprints").fetchall()
|
||||
return [(r["voiceprint_id"], np.frombuffer(r["vector"], dtype=np.float32)) for r in rows]
|
||||
|
||||
|
||||
def _label_for(conn, vpid: str) -> str:
|
||||
r = conn.execute("SELECT person_label FROM voiceprints WHERE voiceprint_id=?", (vpid,)).fetchone()
|
||||
return (r["person_label"] if r and r["person_label"] else f"SPK:{vpid[:8]}")
|
||||
|
||||
|
||||
def resolve_voiceprints(conn, doc, centroids: dict[int, np.ndarray], *, threshold: float = DISTANCE_THRESHOLD):
|
||||
"""Match each within-episode speaker to the persisted library (cross-show identity) or mint a new
|
||||
one; record observations; add shared_guest edges when the voice also appears in ANOTHER source."""
|
||||
library = _load_library(conn)
|
||||
cluster_to_vpid: dict[int, str] = {}
|
||||
for lab, cen in centroids.items():
|
||||
vpid = match_library(cen, library, threshold=threshold)
|
||||
if vpid is None:
|
||||
vpid = "vp_" + uuid.uuid4().hex[:16]
|
||||
conn.execute(
|
||||
"INSERT INTO voiceprints (voiceprint_id, vector, first_doc_id) VALUES (?,?,?)",
|
||||
(vpid, cen.astype(np.float32).tobytes(), doc["doc_id"]),
|
||||
)
|
||||
library.append((vpid, cen))
|
||||
conn.execute(
|
||||
"INSERT INTO voiceprint_observations (voiceprint_id, doc_id, chunk_idx) VALUES (?,?,?)",
|
||||
(vpid, doc["doc_id"], None),
|
||||
)
|
||||
cluster_to_vpid[lab] = vpid
|
||||
conn.commit()
|
||||
# independence graph (§4.5): if this voice appears in a DIFFERENT source, that's a shared guest.
|
||||
for vpid in set(cluster_to_vpid.values()):
|
||||
others = conn.execute(
|
||||
"""SELECT DISTINCT d.source_id FROM voiceprint_observations o
|
||||
JOIN documents d ON d.doc_id = o.doc_id
|
||||
WHERE o.voiceprint_id=? AND d.source_id != ?""",
|
||||
(vpid, doc["source_id"]),
|
||||
).fetchall()
|
||||
for o in others:
|
||||
a, b = sorted([doc["source_id"], o["source_id"]])
|
||||
conn.execute(
|
||||
"""INSERT INTO source_edges (src_a, src_b, edge_type, weight, evidence)
|
||||
VALUES (?,?,'shared_guest',1.0,?)
|
||||
ON CONFLICT(src_a, src_b, edge_type)
|
||||
DO UPDATE SET weight = weight + 1.0, evidence = excluded.evidence""",
|
||||
(a, b, vpid),
|
||||
)
|
||||
conn.commit()
|
||||
return cluster_to_vpid
|
||||
|
||||
|
||||
def _labeled(chunk_turns, keymap, label_by_cluster: dict) -> str:
|
||||
lines: list[str] = []
|
||||
for idx, turns in chunk_turns:
|
||||
for t in turns:
|
||||
lab = keymap.get((idx, t["speaker"]))
|
||||
label = label_by_cluster.get(lab, t["speaker"])
|
||||
lines.append(f"{label}: {t['text']}")
|
||||
return "\n".join(lines)
|
||||
|
||||
|
||||
def build_transcript(conn, chunk_turns, keymap, cluster_to_vpid) -> str:
|
||||
labels = {lab: _label_for(conn, vpid) for lab, vpid in cluster_to_vpid.items()}
|
||||
return _labeled(chunk_turns, keymap, labels)
|
||||
|
||||
|
||||
def apply_names(conn, cluster_to_vpid: dict, idmap: dict) -> dict:
|
||||
"""Attach confident names to the voiceprint library (person_label). Returns {cluster: name}."""
|
||||
named: dict[int, str] = {}
|
||||
for lab, vpid in cluster_to_vpid.items():
|
||||
info = idmap.get(f"Speaker {lab + 1}") or idmap.get(str(lab + 1)) or {}
|
||||
name = (info.get("name") or "").strip() if isinstance(info, dict) else ""
|
||||
if name and info.get("confidence") in ("med", "high"):
|
||||
conn.execute("UPDATE voiceprints SET person_label=? WHERE voiceprint_id=?", (name, vpid))
|
||||
named[lab] = name
|
||||
conn.commit()
|
||||
return named
|
||||
|
||||
|
||||
def add_name_edges(conn, doc, cluster_to_vpid: dict) -> int:
|
||||
"""Name-based shared_guest edges: same person_label seen in a DIFFERENT source → independence edge,
|
||||
even if the voiceprints didn't cluster (drift-robust complement to voiceprint matching, §4.5)."""
|
||||
n = 0
|
||||
for vpid in set(cluster_to_vpid.values()):
|
||||
r = conn.execute("SELECT person_label FROM voiceprints WHERE voiceprint_id=?", (vpid,)).fetchone()
|
||||
name = r["person_label"] if r else None
|
||||
if not name:
|
||||
continue
|
||||
others = conn.execute(
|
||||
"""SELECT DISTINCT d.source_id FROM voiceprints v
|
||||
JOIN voiceprint_observations o ON o.voiceprint_id = v.voiceprint_id
|
||||
JOIN documents d ON d.doc_id = o.doc_id
|
||||
WHERE v.person_label = ? AND d.source_id != ?""",
|
||||
(name, doc["source_id"]),
|
||||
).fetchall()
|
||||
for o in others:
|
||||
a, b = sorted([doc["source_id"], o["source_id"]])
|
||||
conn.execute(
|
||||
"""INSERT INTO source_edges (src_a, src_b, edge_type, weight, evidence)
|
||||
VALUES (?,?,'shared_guest',1.0,?)
|
||||
ON CONFLICT(src_a, src_b, edge_type)
|
||||
DO UPDATE SET weight = weight + 1.0, evidence = excluded.evidence""",
|
||||
(a, b, f"name:{name}"),
|
||||
)
|
||||
n += 1
|
||||
conn.commit()
|
||||
return n
|
||||
|
||||
|
||||
def _download_audio(doc, cfg) -> Path:
|
||||
cache = Path(cfg.audio_cache_dir)
|
||||
cache.mkdir(parents=True, exist_ok=True)
|
||||
wav = cache / f"{doc['doc_id'].replace(':', '_')}.wav"
|
||||
if wav.exists():
|
||||
return wav
|
||||
url = doc["url"]
|
||||
if doc["kind"] == "youtube" or (url and ("youtube.com" in url or "youtu.be" in url)):
|
||||
return download_youtube_audio(url, cache, archive_file=cache / "yt-archive.txt")
|
||||
raw = download_enclosure(url, cache / f"{doc['doc_id'].replace(':', '_')}.src")
|
||||
return to_wav_16k_mono(raw, wav)
|
||||
|
||||
|
||||
def process_document(conn, sc, cfg, doc, *, max_chunks: int, chunk_seconds: int = 150,
|
||||
keep_audio: bool = False) -> int:
|
||||
audio = _download_audio(doc, cfg)
|
||||
chunkdir = Path(cfg.audio_cache_dir) / f"chunks_{doc['doc_id'].replace(':', '_')}"
|
||||
chunks = chunk_audio(audio, chunkdir, chunk_seconds=chunk_seconds)[:max_chunks]
|
||||
chunk_turns, chunk_speakers = diarize_transcribe_chunks(
|
||||
sc, chunks, concurrency=getattr(cfg, "audio_concurrency", 2))
|
||||
keymap, centroids = stitch_and_centroids(chunk_speakers)
|
||||
cluster_to_vpid = resolve_voiceprints(conn, doc, centroids)
|
||||
|
||||
# Name the speakers (§4.5): host introduces guest in 1-on-1 → attach person_label, then a
|
||||
# name-based shared_guest edge that survives voiceprint drift across shows.
|
||||
src = conn.execute("SELECT name FROM sources WHERE source_id=?", (doc["source_id"],)).fetchone()
|
||||
try:
|
||||
from ..extract.backends import from_config as backend_from_config
|
||||
from .identify import identify_speakers
|
||||
backend = backend_from_config(cfg, sc)
|
||||
draft = _labeled(chunk_turns, keymap, {lab: f"Speaker {lab + 1}" for lab in cluster_to_vpid})
|
||||
idmap = identify_speakers(backend, draft[:6000], source_name=src["name"] if src else "")
|
||||
named = apply_names(conn, cluster_to_vpid, idmap)
|
||||
if named:
|
||||
log.info("named speakers in %s: %s", doc["doc_id"], ", ".join(named.values()))
|
||||
except Exception as e: # noqa: BLE001 — naming is best-effort enrichment
|
||||
log.warning("speaker identification failed for %s: %s", doc["doc_id"], e)
|
||||
add_name_edges(conn, doc, cluster_to_vpid)
|
||||
|
||||
transcript = build_transcript(conn, chunk_turns, keymap, cluster_to_vpid)
|
||||
tpath = Path(cfg.data_dir) / "transcripts" / f"{doc['doc_id'].replace(':', '_')}.txt"
|
||||
tpath.parent.mkdir(parents=True, exist_ok=True)
|
||||
tpath.write_text(transcript)
|
||||
import hashlib
|
||||
content_hash = hashlib.sha256(transcript.encode()).hexdigest()
|
||||
conn.execute(
|
||||
"UPDATE documents SET transcript_path=?, duration_sec=?, content_hash=?, processed_at=datetime('now') WHERE doc_id=?",
|
||||
(str(tpath), len(chunks) * chunk_seconds, content_hash, doc["doc_id"]),
|
||||
)
|
||||
conn.commit()
|
||||
h = hashlib.sha256(f"{doc['doc_id']}|extract-v0".encode()).hexdigest()
|
||||
queue.enqueue(conn, job_type="extract", target_id=doc["doc_id"], input_hash=h,
|
||||
parent_doc_id=doc["doc_id"], priority=100)
|
||||
if not keep_audio:
|
||||
_cleanup_audio(audio, chunkdir)
|
||||
return len(chunk_turns)
|
||||
|
||||
|
||||
def _cleanup_audio(audio: Path, chunkdir: Path) -> None:
|
||||
"""Audio files are large and disposable once transcribed — reclaim the disk (the transcript +
|
||||
voiceprints are what we keep). Backfilling hundreds of 1-3 hr episodes would otherwise be tens of GB."""
|
||||
import shutil
|
||||
try:
|
||||
if audio.exists():
|
||||
audio.unlink()
|
||||
src = audio.with_suffix(".src")
|
||||
if src.exists():
|
||||
src.unlink()
|
||||
if chunkdir.exists():
|
||||
shutil.rmtree(chunkdir, ignore_errors=True)
|
||||
except Exception as e: # noqa: BLE001
|
||||
log.warning("audio cleanup failed for %s: %s", audio, e)
|
||||
|
||||
|
||||
def run_transcribe(conn, sc, cfg, *, limit: int = 5, max_chunks: int = 999,
|
||||
lease_seconds: int = 3600, worker_id: str = "transcribe-1") -> dict:
|
||||
processed = 0
|
||||
while processed < limit:
|
||||
job = queue.lease_next(conn, worker_id=worker_id, job_types=["transcribe"], lease_seconds=lease_seconds)
|
||||
if job is None:
|
||||
break
|
||||
processed += 1
|
||||
doc = conn.execute("SELECT * FROM documents WHERE doc_id=?", (job["target_id"],)).fetchone()
|
||||
if doc is None:
|
||||
queue.skip(conn, job["job_id"], "document missing")
|
||||
continue
|
||||
try:
|
||||
n = process_document(conn, sc, cfg, doc, max_chunks=max_chunks)
|
||||
queue.complete(conn, job["job_id"], output_ref=f"{n} chunks")
|
||||
log.info("transcribed %s (%d chunks)", doc["doc_id"], n)
|
||||
except Exception as e: # noqa: BLE001
|
||||
state = queue.fail(conn, job["job_id"], e)
|
||||
log.warning("transcribe failed for %s: %s (→ %s)", job["target_id"], e, state)
|
||||
return {"jobs_processed": processed}
|
||||
Reference in New Issue
Block a user