Initial commit: Ten31 Signal Engine (ingest, scoring brain, corpus seeds)

This commit is contained in:
Keysat
2026-06-15 09:24:29 -05:00
commit a6aec77506
77 changed files with 6263 additions and 0 deletions
+5
View File
@@ -0,0 +1,5 @@
"""Ingestion layer (§4.1) — the biggest greenfield piece.
Spark Control transcribes audio you hand it; it does NOT fetch. Everything here is fetch/schedule:
RSS + YouTube + EDGAR + FMP earnings, long-audio chunking, and cross-chunk speaker stitching.
"""
+36
View File
@@ -0,0 +1,36 @@
"""Long-audio chunking (§4.1, §13.4).
Podcasts run 13 h; the diarizer caps at 4 speakers/chunk and Spark 2 is a single GPU, so we cut
long audio into ~23 min pieces sent SEQUENTIALLY (parallel audio → 503 FFT race). Each chunk is
diarized independently and re-stitched across chunks by voiceprint (see speaker_stitch.py).
Requires ffmpeg/ffprobe.
"""
from __future__ import annotations
import subprocess
from pathlib import Path
CHUNK_SECONDS_DEFAULT = 150 # 2.5 min, within the ~23 min guidance
def duration_seconds(src: str | Path) -> float:
out = subprocess.run(
["ffprobe", "-v", "error", "-show_entries", "format=duration",
"-of", "default=noprint_wrappers=1:nokey=1", str(src)],
check=True, capture_output=True, text=True,
)
return float(out.stdout.strip())
def chunk_audio(src: str | Path, out_dir: str | Path, *, chunk_seconds: int = CHUNK_SECONDS_DEFAULT) -> list[Path]:
"""Split into fixed-length WAV chunks using ffmpeg's segment muxer (no re-encode of timing).
Returns chunk paths in order. Order matters: the queue sends them sequentially."""
out_dir = Path(out_dir)
out_dir.mkdir(parents=True, exist_ok=True)
pattern = str(out_dir / "chunk_%04d.wav")
subprocess.run(
["ffmpeg", "-y", "-i", str(src), "-f", "segment", "-segment_time", str(chunk_seconds),
"-ar", "16000", "-ac", "1", "-reset_timestamps", "1", pattern],
check=True, capture_output=True,
)
return sorted(out_dir.glob("chunk_*.wav"))
+159
View File
@@ -0,0 +1,159 @@
"""Text-document fetcher for the Battery (bitcoin-collateralized lending) corpus and any non-filing,
non-audio source: policy primaries (SEC SABs, OCC/FDIC/Fed), lender/issuer blogs, credit-market data.
Unlike EDGAR (CIK-driven) and the podcast path (audio→transcribe), these are dated HTML pages, PDFs, or
article RSS feeds. We fetch ONCE, extract clean text (HTML via html_to_text, PDF via pypdf), save it, and
point documents.transcript_path at the saved text so the extract worker reads it directly (it already
supports transcript_path) — this also lets PDFs work, which the worker's on-demand html_to_text fetch can't.
A source row must exist first (FK). Lineage/axis live on the source's cluster/notes (set in the seed);
policy sources are axis=context and must NOT feed the supply resolver (weight 0) — enforced downstream.
"""
from __future__ import annotations
import hashlib
import io
import logging
import sqlite3
from pathlib import Path
import requests
from ..backfill import queue
from ..extract.html_text import html_to_text
from .feeds import fetch_feed
log = logging.getLogger(__name__)
DEFAULT_UA = "ten31-signal-engine/1.0 (research; contact ops@ten31.xyz)"
def _pdf_to_text(data: bytes, *, max_chars: int) -> str:
import pypdf
reader = pypdf.PdfReader(io.BytesIO(data))
parts: list[str] = []
total = 0
for page in reader.pages:
t = page.extract_text() or ""
parts.append(t)
total += len(t)
if total > max_chars:
break
return "\n".join(parts)[:max_chars]
def fetch_clean_text(url: str, *, method: str = "auto", ua: str = DEFAULT_UA,
timeout: int = 90, max_chars: int = 300_000) -> str:
"""Fetch a URL once and return clean text. Auto-detects PDF vs HTML by content-type + magic bytes."""
r = requests.get(url, headers={"User-Agent": ua}, timeout=timeout)
r.raise_for_status()
ctype = r.headers.get("Content-Type", "").lower()
is_pdf = method == "pdf" or "application/pdf" in ctype or r.content[:5] == b"%PDF-"
if is_pdf:
return _pdf_to_text(r.content, max_chars=max_chars)
return html_to_text(r.text, max_chars=max_chars)
_BLOCK_MARKERS = (
"aggressive automated scraping", "request access", "access denied", "are you a robot",
"enable javascript", "captcha", "verify you are human", "rate limit exceeded",
"403 forbidden", "unusual traffic", "checking your browser",
)
def _looks_blocked(text: str) -> bool:
"""Anti-scraping interstitials return 200 + a short access-denied body. Detect so we don't ingest
a block page as if it were the document (a real policy/blog doc is long and has no such markers)."""
low = text[:2500].lower()
return any(m in low for m in _BLOCK_MARKERS)
def _doc_id(source_id: str, url: str) -> str:
return f"doc:{source_id}:{hashlib.sha256(url.encode()).hexdigest()[:12]}"
def ingest_one(conn: sqlite3.Connection, cfg, *, source_id: str, url: str, title: str,
date: str | None, method: str = "auto", prompt_version: str = "extract-v0",
min_chars: int = 400) -> str | None:
"""Fetch+store one text document and enqueue extraction. Idempotent on (source_id, url).
Returns doc_id if newly ingested, else None (duplicate, too-short, or fetch error → logged)."""
doc_id = _doc_id(source_id, url)
if conn.execute("SELECT 1 FROM documents WHERE doc_id=?", (doc_id,)).fetchone():
return None
ua = getattr(cfg, "user_agent", None) or DEFAULT_UA
try:
text = fetch_clean_text(url, method=method, ua=ua)
except Exception as e: # noqa: BLE001
log.warning("doc fetch failed %s: %s", url, e)
return None
if not text or len(text) < min_chars:
log.warning("doc too short (%d chars), skipping %s", len(text or ""), url)
return None
if _looks_blocked(text):
log.warning("blocked/anti-scrape page detected, skipping %s", url)
return None
safe = doc_id.replace(":", "_")
tpath = Path(cfg.data_dir) / "docs" / f"{safe}.txt"
tpath.parent.mkdir(parents=True, exist_ok=True)
tpath.write_text(text)
content_hash = hashlib.sha256(text.encode()).hexdigest()
conn.execute(
"""INSERT OR IGNORE INTO documents
(doc_id, source_id, kind, external_id, url, title, date, transcript_path, content_hash, processed_at)
VALUES (?,?,?,?,?,?,?,?,?,datetime('now'))""",
(doc_id, source_id, "filing", url, url, title[:300] if title else url, date, str(tpath), content_hash),
)
conn.commit()
h = hashlib.sha256(f"{doc_id}|{prompt_version}".encode()).hexdigest()
queue.enqueue(conn, job_type="extract", target_id=doc_id, input_hash=h,
parent_doc_id=doc_id, priority=50)
conn.commit()
log.info("ingested doc %s (%d chars) for %s", doc_id, len(text), source_id)
return doc_id
def ingest_manifest(conn: sqlite3.Connection, cfg, path) -> dict:
"""Batch-ingest the docs listed in a YAML manifest ({docs:[{source,url,title,date,method}]}).
Returns {ingested, skipped, missing_source}. Each source must already exist (FK)."""
import yaml
from pathlib import Path as _Path
data = yaml.safe_load(_Path(path).read_text()) or {}
docs = data.get("docs", [])
ingested = skipped = missing = 0
for d in docs:
src = d.get("source")
if not conn.execute("SELECT 1 FROM sources WHERE source_id=?", (src,)).fetchone():
log.warning("manifest doc references missing source %r — skipping %s", src, d.get("url"))
missing += 1
continue
doc_id = ingest_one(conn, cfg, source_id=src, url=d["url"], title=d.get("title", d["url"]),
date=d.get("date"), method=d.get("method", "auto"))
if doc_id:
ingested += 1
else:
skipped += 1
return {"ingested": ingested, "skipped": skipped, "missing_source": missing}
def ingest_feed_text(conn: sqlite3.Connection, cfg, *, source_id: str, rss_url: str,
since: str | None = None, until: str | None = None, limit: int = 50) -> int:
"""Ingest the ARTICLE bodies behind a text RSS feed (blog/press feed). Each item's link is fetched
and stored as a dated text document. Returns count of newly-ingested docs."""
from .feeds import _published_iso
parsed = fetch_feed(rss_url, user_agent=getattr(cfg, "user_agent", None) or DEFAULT_UA)
n = 0
for entry in parsed.entries:
if n >= limit:
break
link = entry.get("link")
if not link:
continue
date = _published_iso(entry)
if since and date and date < since:
continue
if until and date and date > until:
continue
if ingest_one(conn, cfg, source_id=source_id, url=link,
title=entry.get("title", link), date=date):
n += 1
return n
+61
View File
@@ -0,0 +1,61 @@
"""Audio acquisition (§4.1). Spark Control transcribes audio you fetch — this fetches it.
- Podcast enclosures: a plain streaming download that follows the Podtrac/Megaphone redirects to the
final signed CDN object (download immediately; resolved URLs carry short-lived params).
- YouTube: yt-dlp (audio-only → 16 kHz mono WAV). NOTE: 2026 YouTube enforces PO Tokens broadly — run
the `bgutil-ytdlp-pot-provider` sidecar or pulls will 403. yt-dlp is treated as a LAST resort; prefer
the RSS enclosure where a show publishes both (ToS: downloading YT audio violates YouTube ToS).
"""
from __future__ import annotations
import subprocess
from pathlib import Path
import requests
DEFAULT_UA = "Ten31SignalEngine/0.1 (+https://ten31.xyz)"
def download_enclosure(url: str, dest: str | Path, *, user_agent: str = DEFAULT_UA, timeout: int = 120) -> Path:
dest = Path(dest)
dest.parent.mkdir(parents=True, exist_ok=True)
with requests.get(url, stream=True, allow_redirects=True,
headers={"User-Agent": user_agent}, timeout=timeout) as r:
r.raise_for_status()
with open(dest, "wb") as f:
for chunk in r.iter_content(chunk_size=1 << 16):
f.write(chunk)
return dest
def to_wav_16k_mono(src: str | Path, dst: str | Path) -> Path:
"""Normalize any audio to 16 kHz mono PCM WAV (what the ASR endpoint wants). Requires ffmpeg."""
dst = Path(dst)
dst.parent.mkdir(parents=True, exist_ok=True)
subprocess.run(
["ffmpeg", "-y", "-i", str(src), "-ar", "16000", "-ac", "1", "-f", "wav", str(dst)],
check=True, capture_output=True,
)
return dst
def download_youtube_audio(url: str, out_dir: str | Path, *, archive_file: str | Path | None = None) -> Path:
"""Audio-only via yt-dlp → 16 kHz mono WAV. `archive_file` (yt-dlp --download-archive) is the
canonical 'only-new' dedup for channel/playlist back-catalog pulls."""
out_dir = Path(out_dir)
out_dir.mkdir(parents=True, exist_ok=True)
cmd = [
"yt-dlp", "-f", "bestaudio/best", "-x", "--audio-format", "wav",
"--postprocessor-args", "ffmpeg:-ar 16000 -ac 1",
"-o", str(out_dir / "%(id)s.%(ext)s"),
"--no-progress",
]
if archive_file:
cmd += ["--download-archive", str(archive_file)]
cmd.append(url)
subprocess.run(cmd, check=True, capture_output=True)
# yt-dlp names the file by video id; return the newest wav
wavs = sorted(out_dir.glob("*.wav"), key=lambda p: p.stat().st_mtime)
if not wavs:
raise RuntimeError("yt-dlp produced no wav (PO-token/cookies issue? see module docstring)")
return wavs[-1]
+127
View File
@@ -0,0 +1,127 @@
"""Earnings-call transcripts via Financial Modeling Prep (§4.1, §12 — decision: FMP).
Audio isn't reliably fetchable for large-caps (no uniform feed; ~3090d replay expiry breaks
backfill), so FMP's transcript API is the backbone and EDGAR filings remain the durable core. FMP
also exposes an earnings *calendar* to trigger ingestion on the day a call drops.
Endpoint paths/params are marked TODO(contract): confirm against the FMP 'stable' docs for the
account tier at integration. Needs config.fmp_api_key.
"""
from __future__ import annotations
import hashlib
import sqlite3
from pathlib import Path
from typing import Any
import requests
FMP_BASE = "https://financialmodelingprep.com/stable"
class FMPClient:
def __init__(self, api_key: str, *, base: str = FMP_BASE, timeout: int = 30) -> None:
if not api_key:
raise ValueError("FMP_API_KEY is required for earnings-call transcripts")
self.api_key = api_key
self.base = base
self.timeout = timeout
self.s = requests.Session()
def _get(self, path: str, **params: Any) -> Any:
params["apikey"] = self.api_key
r = self.s.get(f"{self.base}/{path}", params=params, timeout=self.timeout)
r.raise_for_status()
return r.json()
# Confirmed against FMP 'stable' 2026-06-07 (v3 is legacy/403). Note singular "earning".
def transcript_dates(self, symbol: str) -> Any:
"""List available transcripts: [{quarter, fiscalYear, date}, ...]."""
return self._get("earning-call-transcript-dates", symbol=symbol)
def transcript(self, symbol: str, *, year: int, quarter: int) -> Any:
"""One transcript: [{symbol, period, year, date, content}]. Use the `date` field as the
document date — FMP's year/quarter labels are fiscal and can be offset from the call date."""
return self._get("earning-call-transcript", symbol=symbol, year=year, quarter=quarter)
def earnings_calendar(self, *, from_date: str, to_date: str) -> Any:
"""Earnings calendar (ingestion trigger): [{symbol, date, epsActual, ...}, ...]."""
return self._get("earnings-calendar", **{"from": from_date, "to": to_date})
def ingest_transcript(
conn: sqlite3.Connection,
*,
source_id: str,
symbol: str,
year: int,
quarter: int,
content: str,
date: str | None,
data_dir: Path,
prompt_version: str = "extract-v0",
) -> tuple[bool, bool]:
"""Store one transcript (content written to disk → transcript_path) and enqueue an 'extract'
job. Idempotent. Returns (new_document, new_job)."""
from ..backfill import queue
external_id = f"{symbol}-{year}Q{quarter}"
doc_id = f"earnings:{external_id}"
tdir = Path(data_dir) / "transcripts"
tdir.mkdir(parents=True, exist_ok=True)
tpath = tdir / f"{external_id}.txt"
tpath.write_text(content)
content_hash = hashlib.sha256(content.encode()).hexdigest()
cur = conn.execute(
"""INSERT OR IGNORE INTO documents
(doc_id, source_id, kind, external_id, title, date, transcript_path, content_hash, processed_at)
VALUES (?,?,?,?,?,?,?,?, datetime('now'))""",
(doc_id, source_id, "earnings_call", external_id, f"{symbol} {year} Q{quarter} call",
date, str(tpath), content_hash),
)
conn.commit()
if not cur.rowcount:
return (False, False)
# earnings-call Q&A is the highest-yield text source (§4.1) → priority 40, ahead of filings (50).
h = hashlib.sha256(f"{doc_id}|{prompt_version}".encode()).hexdigest()
new_job = queue.enqueue(conn, job_type="extract", target_id=doc_id, input_hash=h,
parent_doc_id=doc_id, priority=40) is not None
return (True, new_job)
def ingest_for_ticker(
conn: sqlite3.Connection,
fmp: FMPClient,
*,
source_id: str,
symbol: str,
data_dir: Path,
since: str | None = None,
until: str | None = None,
limit: int = 8,
) -> tuple[int, int]:
"""Enumerate available transcripts via the dates index, fetch those in [since, until], and
ingest. Uses each transcript's own `date` (FMP fiscal labels are offset). Returns (docs, jobs)."""
dates = fmp.transcript_dates(symbol)
picked = []
for d in dates if isinstance(dates, list) else []:
dt = d.get("date")
if since and dt and dt < since:
continue
if until and dt and dt > until:
continue
picked.append(d)
n_docs = n_jobs = 0
for d in picked[:limit]:
tr = fmp.transcript(symbol, year=d["fiscalYear"], quarter=d["quarter"])
item = (tr[0] if isinstance(tr, list) and tr else tr) or {}
content = item.get("content") or ""
if not content:
continue
nd, nj = ingest_transcript(
conn, source_id=source_id, symbol=symbol, year=d["fiscalYear"], quarter=d["quarter"],
content=content, date=item.get("date") or d.get("date"), data_dir=data_dir,
)
n_docs += int(nd)
n_jobs += int(nj)
return n_docs, n_jobs
+148
View File
@@ -0,0 +1,148 @@
"""SEC EDGAR ingestion (§4.1).
Hits the official data.sec.gov / www.sec.gov APIs directly (free, keyless, full history).
Two hard requirements:
- a descriptive User-Agent (SEC 403s requests without one) — from config.edgar_user_agent.
- ≤10 requests/sec aggregate — enforced by a min-interval throttle here.
Supports an explicit date range AND historical shards (filings.files[]), so the §7.1 backtest can
reach 20222023 filings, not just the most-recent ~1000.
"""
from __future__ import annotations
import hashlib
import sqlite3
import time
from typing import Iterator
import requests
_FILING_COLS = ("accessionNumber", "form", "filingDate", "primaryDocument", "primaryDocDescription")
class EdgarClient:
BASE_DATA = "https://data.sec.gov"
BASE_WWW = "https://www.sec.gov"
def __init__(self, user_agent: str, *, min_interval: float = 0.12) -> None:
if not user_agent or "@" not in user_agent:
raise ValueError("EDGAR requires a descriptive User-Agent with contact email (config.edgar_user_agent)")
self.s = requests.Session()
self.s.headers.update({"User-Agent": user_agent, "Accept-Encoding": "gzip, deflate"})
self.min_interval = min_interval
self._last = 0.0
self._tickers: dict[str, int] | None = None
def _throttle(self) -> None:
dt = time.monotonic() - self._last
if dt < self.min_interval:
time.sleep(self.min_interval - dt)
self._last = time.monotonic()
def _get(self, url: str) -> requests.Response:
self._throttle()
r = self.s.get(url, timeout=30)
r.raise_for_status()
return r
# ---- ticker → CIK ----
def ticker_map(self) -> dict[str, int]:
if self._tickers is None:
data = self._get(f"{self.BASE_WWW}/files/company_tickers.json").json()
self._tickers = {row["ticker"].upper(): int(row["cik_str"]) for row in data.values()}
return self._tickers
def cik_for(self, ticker: str) -> int | None:
return self.ticker_map().get(ticker.upper())
# ---- filings ----
def _iter_array(self, block: dict, forms, since, until) -> Iterator[dict]:
arrays = [block.get(c, []) for c in _FILING_COLS]
for acc, form, fdate, pdoc, pdesc in zip(*arrays):
if forms and form not in forms:
continue
if since and fdate < since:
continue
if until and fdate > until:
continue
yield {"accession": acc, "form": form, "filing_date": fdate,
"primary_document": pdoc, "description": pdesc}
def iter_filings(
self,
cik: int,
*,
forms: tuple[str, ...] = ("10-K", "10-Q", "8-K"),
since: str | None = None,
until: str | None = None,
) -> Iterator[dict]:
"""Yield filing descriptors. Pulls the inline 'recent' block AND any historical shards whose
date window overlaps [since, until] — required to reach the backtest era for active filers."""
sub = self._get(f"{self.BASE_DATA}/submissions/CIK{cik:010d}.json").json()
recent = sub.get("filings", {}).get("recent", {})
for f in self._iter_array(recent, forms, since, until):
yield self._with_url(cik, f)
for shard in sub.get("filings", {}).get("files", []):
# shard has filingFrom / filingTo; skip shards entirely outside the window.
if until and shard.get("filingFrom", "") > until:
continue
if since and shard.get("filingTo", "9999") < since:
continue
block = self._get(f"{self.BASE_DATA}/submissions/{shard['name']}").json()
for f in self._iter_array(block, forms, since, until):
yield self._with_url(cik, f)
def _with_url(self, cik: int, f: dict) -> dict:
acc_nodash = f["accession"].replace("-", "")
f["cik"] = cik
f["url"] = f"{self.BASE_WWW}/Archives/edgar/data/{cik}/{acc_nodash}/{f['primary_document']}"
return f
def fetch_html(self, filing: dict) -> str:
return self._get(filing["url"]).text
# Domestic annual/quarterly + foreign-private-issuer equivalents. 20-F (foreign annual, e.g. TSM/IREN),
# 40-F (Canadian annual, e.g. CCJ). 8-K/6-K (current reports) excluded by default — low claim yield.
HIGH_YIELD_FORMS = ("10-K", "10-Q", "20-F", "40-F")
def ingest_filings(
conn: sqlite3.Connection,
client: EdgarClient,
*,
source_id: str,
ticker: str,
since: str | None = None,
until: str | None = None,
forms: tuple[str, ...] = HIGH_YIELD_FORMS,
prompt_version: str = "extract-v0",
) -> tuple[int, int]:
"""Insert filing documents and enqueue 'extract' jobs. Filings are text → no transcription;
they go straight to extraction (the extract worker fetches + strips the HTML later). Default
forms cover both domestic (10-K/10-Q) and foreign-private-issuer (20-F/40-F) filers.
Returns (new_documents, new_jobs). Idempotent on (source_id, accession)."""
from ..backfill import queue
cik = client.cik_for(ticker)
if cik is None:
raise ValueError(f"No CIK found for ticker {ticker!r}")
n_docs = n_jobs = 0
for f in client.iter_filings(cik, forms=forms, since=since, until=until):
doc_id = f"edgar:{f['accession']}"
cur = conn.execute(
"""INSERT OR IGNORE INTO documents (doc_id, source_id, kind, external_id, url, title, date)
VALUES (?,?,?,?,?,?,?)""",
(doc_id, source_id, "filing", f["accession"], f["url"],
f"{ticker} {f['form']} {f['filing_date']}", f["filing_date"]),
)
conn.commit()
if not cur.rowcount:
continue
n_docs += 1
h = hashlib.sha256(f"{doc_id}|{prompt_version}".encode()).hexdigest()
# priority 50: filings are high-info-density (§4.1) → ahead of podcasts (100)
if queue.enqueue(conn, job_type="extract", target_id=doc_id, input_hash=h,
parent_doc_id=doc_id, priority=50) is not None:
n_jobs += 1
return n_docs, n_jobs
+65
View File
@@ -0,0 +1,65 @@
"""Podcast RSS ingestion (§4.1).
feedparser + conditional GET (ETag/Last-Modified) for efficient incremental polling, with a
composite (feed_url, guid) dedup discipline. Many podcast CDNs send no validators and some feeds
truncate to recent episodes — for the §7.1 backtest, older episodes may need the show's full
archive feed (some hosts expose `?limit=` / a separate archive URL) or a YouTube back-catalog.
"""
from __future__ import annotations
import hashlib
import time
from typing import Any
import feedparser
DEFAULT_UA = "Ten31SignalEngine/0.1 (+https://ten31.xyz)"
def fetch_feed(url: str, *, etag: str | None = None, modified: str | None = None,
user_agent: str = DEFAULT_UA) -> feedparser.FeedParserDict:
"""Conditional GET. On HTTP 304 the result has .status == 304 and .entries == [] → skip."""
return feedparser.parse(url, etag=etag, modified=modified, agent=user_agent)
def _published_iso(entry: Any) -> str | None:
t = entry.get("published_parsed") or entry.get("updated_parsed")
if not t:
return None
return time.strftime("%Y-%m-%d", t)
def _enclosure_audio_url(entry: Any) -> str | None:
for enc in entry.get("enclosures", []) or []:
if str(enc.get("type", "")).startswith("audio"):
return enc.get("href") or enc.get("url")
# some feeds put audio only in links rel=enclosure
for link in entry.get("links", []) or []:
if link.get("rel") == "enclosure" and str(link.get("type", "")).startswith("audio"):
return link.get("href")
return None
def _guid(entry: Any) -> str:
g = entry.get("id") or entry.get("link")
if g:
return str(g)
basis = f"{entry.get('title','')}|{entry.get('published','')}"
return "sha1:" + hashlib.sha1(basis.encode()).hexdigest()
def episode_records(parsed: feedparser.FeedParserDict) -> list[dict]:
"""Normalize feed entries to episode records. Skips entries with no audio enclosure."""
out: list[dict] = []
for e in parsed.entries:
audio = _enclosure_audio_url(e)
if not audio:
continue
out.append({
"guid": _guid(e),
"title": e.get("title"),
"audio_url": audio,
"link": e.get("link"),
"published": _published_iso(e),
})
return out
+195
View File
@@ -0,0 +1,195 @@
"""One-time backfill path: transcribe podcast episodes via the Gemini multimodal API instead of the
local Spark Parakeet+diarizer pipeline. Used to take a bulk backfill OFF the shared Spark GPU (which
contends with production) — it is NOT the steady-state transcriber (local Parakeet remains the default).
Scope/guardrail: podcast audio is PUBLIC data, so sending it to the frontier does NOT trip the
exposure/positioning-data rule (that guardrail is about Ten31's conviction/exposure data, never public
audio). Output is written in the SAME 'Speaker: text' transcript format the extractor consumes, so the
downstream extract→embed stages are agnostic to which transcriber produced the file.
Tradeoff vs local: Gemini yields speaker-LABELED text, not voiceprint fingerprints — so no voiceprint
auto-edges. We rely on the hand-seeded EISC edges + name-based attribution instead (acceptable for a
bounded backfill).
"""
from __future__ import annotations
import hashlib
import logging
import re
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path
from ..backfill import queue
from .download import download_enclosure
log = logging.getLogger(__name__)
_PROMPT = (
"You are a precise podcast transcriptionist. Transcribe this audio VERBATIM as a speaker-diarized "
"transcript.\n"
"RULES:\n"
"- One line per speaker turn, formatted exactly as `Name: spoken text` (a colon and one space).\n"
"- The host of this show is {host} — label every host turn with exactly `{host}` (the person's "
"name, never the show's name).\n"
"- When the host introduces a guest by name (e.g. 'welcome X to the show', 'I'm joined by X'), use "
"that real first name (or full name) as the guest's label for the WHOLE transcript. Only fall back "
"to `Guest` (or `Guest 2`, `Guest 3`) if a name is never stated. Do not invent names.\n"
"- Do NOT include timestamps, ad-reads markers, summaries, headings, markdown, or any commentary. "
"Only the transcript lines.\n"
"- Transcribe the entire episode from start to finish. Do not stop early or summarize.\n"
)
def _host_person(source_name: str) -> str:
"""Derive the host's PERSON name from a source/show name so claimant attribution isn't the show.
'What Bitcoin Did (Peter McCormack)' -> 'Peter McCormack'; 'Stephan Livera Podcast' -> 'Stephan
Livera'; 'The Kevin Rooke Show' -> 'Kevin Rooke'; 'The Anita Posch Show' -> 'Anita Posch'."""
m = re.search(r"\(([^)]+)\)", source_name or "")
if m:
return m.group(1).strip()
s = re.sub(r"^The\s+", "", source_name or "").strip()
s = re.sub(r"\s+(Podcast|Show)$", "", s, flags=re.I).strip()
return s
def _sniff_audio_mime(path: Path) -> str:
"""Determine audio MIME from the file header — the downloaded enclosure has a generic `.src`
extension, so the Files API can't infer it and rejects the upload without an explicit mime_type."""
with open(path, "rb") as fh:
head = fh.read(16)
if head[:3] == b"ID3" or (len(head) > 1 and head[0] == 0xFF and (head[1] & 0xE0) == 0xE0):
return "audio/mpeg"
if head[4:8] == b"ftyp":
return "audio/mp4" # m4a/aac
if head[:4] == b"OggS":
return "audio/ogg"
if head[:4] == b"RIFF":
return "audio/wav"
if head[:4] == b"fLaC":
return "audio/flac"
return "audio/mpeg" # podcast default
def _upload_and_wait(client, audio_path: Path, *, poll_s: float = 2.0, timeout_s: float = 300.0):
"""Upload to the Files API and wait until the file is ACTIVE (audio is processed server-side)."""
from google.genai import types
mime = _sniff_audio_mime(audio_path)
f = client.files.upload(file=str(audio_path), config=types.UploadFileConfig(mime_type=mime))
waited = 0.0
while getattr(f.state, "name", str(f.state)) == "PROCESSING" and waited < timeout_s:
time.sleep(poll_s)
waited += poll_s
f = client.files.get(name=f.name)
state = getattr(f.state, "name", str(f.state))
if state != "ACTIVE":
raise RuntimeError(f"Gemini file not ACTIVE (state={state}) for {audio_path.name}")
return f
def transcribe_one(client, model: str, audio_path: Path, host_name: str, *,
max_output_tokens: int = 65536) -> tuple[str, dict]:
"""Transcribe a single audio file → (transcript_text, usage_dict). Network/CPU only; no DB."""
from google.genai import types
f = _upload_and_wait(client, audio_path)
try:
resp = client.models.generate_content(
model=model,
contents=[f, _PROMPT.format(host=host_name or "the host")],
config=types.GenerateContentConfig(temperature=0, max_output_tokens=max_output_tokens),
)
text = (resp.text or "").strip()
um = getattr(resp, "usage_metadata", None)
usage = {
"prompt_tokens": getattr(um, "prompt_token_count", 0) or 0,
"output_tokens": getattr(um, "candidates_token_count", 0) or 0,
"finish_reason": str(getattr(resp.candidates[0], "finish_reason", "")) if resp.candidates else "",
}
return text, usage
finally:
try:
client.files.delete(name=f.name)
except Exception as e: # noqa: BLE001 — best-effort cleanup
log.debug("file cleanup failed for %s: %s", f.name, e)
def _fetch_and_transcribe(client, model: str, cfg, doc, host_name: str) -> dict:
"""Worker-thread unit: download enclosure → Gemini transcribe → write transcript file. No DB writes."""
cache = Path(cfg.audio_cache_dir)
cache.mkdir(parents=True, exist_ok=True)
safe = doc["doc_id"].replace(":", "_")
src = cache / f"{safe}.src"
audio = download_enclosure(doc["url"], src)
try:
text, usage = transcribe_one(client, model, audio, host_name)
if not text or len(text) < 40:
raise RuntimeError(f"empty/short transcript ({len(text)} chars)")
tpath = Path(cfg.data_dir) / "transcripts" / f"{safe}.txt"
tpath.parent.mkdir(parents=True, exist_ok=True)
tpath.write_text(text)
return {
"doc_id": doc["doc_id"], "ok": True, "transcript_path": str(tpath),
"n_lines": text.count("\n") + 1, "content_hash": hashlib.sha256(text.encode()).hexdigest(),
"usage": usage,
}
finally:
try:
if audio.exists():
audio.unlink()
except Exception: # noqa: BLE001
pass
def run_transcribe_gemini(conn, cfg, *, limit: int = 5, concurrency: int = 4,
lease_seconds: int = 7200, worker_id: str = "gemini-transcribe") -> dict:
"""Lease pending transcribe jobs and transcribe them via Gemini in parallel. DB writes stay on the
main thread; only download+API run in the pool. Reports token usage for cost accounting."""
from google import genai
if not cfg.gemini_api_key:
raise RuntimeError("GEMINI_API_KEY not configured")
client = genai.Client(api_key=cfg.gemini_api_key)
model = cfg.gemini_model or "gemini-2.5-flash"
# Lease the batch up front (main thread); resolve docs + host names.
leased: list[tuple] = []
while len(leased) < limit:
job = queue.lease_next(conn, worker_id=worker_id, job_types=["transcribe"], lease_seconds=lease_seconds)
if job is None:
break
doc = conn.execute("SELECT * FROM documents WHERE doc_id=?", (job["target_id"],)).fetchone()
if doc is None:
queue.skip(conn, job["job_id"], "document missing")
continue
host = conn.execute("SELECT name FROM sources WHERE source_id=?", (doc["source_id"],)).fetchone()
leased.append((job, doc, _host_person(host["name"]) if host else ""))
done = failed = prompt_tok = out_tok = 0
with ThreadPoolExecutor(max_workers=concurrency) as pool:
futs = {pool.submit(_fetch_and_transcribe, client, model, cfg, doc, host): (job, doc)
for (job, doc, host) in leased}
for fut in as_completed(futs):
job, doc = futs[fut]
try:
r = fut.result()
conn.execute(
"UPDATE documents SET transcript_path=?, content_hash=?, processed_at=datetime('now') "
"WHERE doc_id=?", (r["transcript_path"], r["content_hash"], doc["doc_id"]),
)
h = hashlib.sha256(f"{doc['doc_id']}|extract-v0".encode()).hexdigest()
queue.enqueue(conn, job_type="extract", target_id=doc["doc_id"], input_hash=h,
parent_doc_id=doc["doc_id"], priority=100)
queue.complete(conn, job["job_id"], output_ref=f"gemini {r['n_lines']} lines")
conn.commit()
done += 1
prompt_tok += r["usage"]["prompt_tokens"]
out_tok += r["usage"]["output_tokens"]
fr = r["usage"]["finish_reason"]
log.info("gemini transcribed %s (%d lines, %d in/%d out tok%s)", doc["doc_id"],
r["n_lines"], r["usage"]["prompt_tokens"], r["usage"]["output_tokens"],
", TRUNCATED" if "MAX_TOKENS" in fr else "")
except Exception as e: # noqa: BLE001
state = queue.fail(conn, job["job_id"], e)
conn.commit()
failed += 1
log.warning("gemini transcribe failed for %s: %s (→ %s)", doc["doc_id"], e, state)
return {"done": done, "failed": failed, "prompt_tokens": prompt_tok, "output_tokens": out_tok}
+45
View File
@@ -0,0 +1,45 @@
"""Speaker-name identification (§4.5 enhancement).
In a 1-on-1 interview the host introduces the guest by name at the top. Reading the transcript head
with the LLM, we attach a real NAME to each diarized speaker → voiceprints.person_label. This gives
the independence graph a SECOND, orthogonal overlap signal: the same NAMED guest across two shows is
a shared_guest edge even when the voiceprints don't cluster (different mic/codec/room). It complements
voiceprint cosine matching and is robust to fingerprint drift — exactly the case the operator flagged.
"""
from __future__ import annotations
import json
import logging
log = logging.getLogger(__name__)
_SYS = (
'You identify the speakers in a podcast/interview transcript. Each line is "LABEL: text". '
"Using the introduction and context, determine each LABEL's real full name and role. In an "
"interview the host normally introduces themselves and the guest within the first minute. Only "
"assert a name you can actually support from the text — if you cannot tell, use null. "
'Return ONLY JSON: {"speakers": {"<LABEL>": {"name": "Full Name" or null, '
'"role": "host"|"guest"|"panelist"|"unknown", "confidence": "low"|"med"|"high"}}}.'
)
def identify_speakers(backend, transcript_head: str, *, source_name: str, host_hint: str | None = None) -> dict:
"""Returns {label: {name, role, confidence}}. `backend` is any extract.backends backend."""
ctx = f"Show: {source_name}."
if host_hint:
ctx += f" The show's usual host is {host_hint}."
ctx += "\n\nTRANSCRIPT (beginning):\n" + transcript_head
messages = [{"role": "system", "content": _SYS}, {"role": "user", "content": ctx}]
raw = backend.complete_json(messages, max_tokens=600)
try:
obj = json.loads(raw)
except Exception:
i, j = raw.find("{"), raw.rfind("}")
if i < 0 or j < 0:
return {}
try:
obj = json.loads(raw[i:j + 1])
except Exception:
return {}
spk = obj.get("speakers", {}) if isinstance(obj, dict) else {}
return spk if isinstance(spk, dict) else {}
+111
View File
@@ -0,0 +1,111 @@
"""Podcast ingestion → documents + 'transcribe' jobs (§4.1).
RSS path: parse the feed, take episodes in [since, until], register documents pointing at the audio
enclosure. YouTube path: enumerate a channel's videos in the date window via yt-dlp (the back-catalog
route for the ~9 shows whose RSS is a truncated rolling window — see seeds/podcast_feeds.resolved.yaml).
The transcribe worker downloads + processes either kind identically.
"""
from __future__ import annotations
import hashlib
import json
import logging
import sqlite3
import subprocess
from ..backfill import queue
from ..util import audio_dedup_key
from .feeds import episode_records, fetch_feed
log = logging.getLogger(__name__)
def _enqueue_doc(conn, *, source_id, kind, external_id, url, title, date) -> tuple[int, int]:
doc_id = f"pod:{source_id}:{hashlib.sha1(external_id.encode()).hexdigest()[:12]}"
dkey = audio_dedup_key(title, date)
# Cross-mirror dedup (pre-GPU): if this same episode was already processed (any source/feed),
# record the sighting for provenance but DON'T re-transcribe. (external_id UNIQUE already covers
# same-feed re-ingest; this covers the same episode via a different feed/YouTube mirror.)
dup = conn.execute(
"SELECT doc_id FROM documents WHERE dedup_key=? AND processed_at IS NOT NULL LIMIT 1", (dkey,)
).fetchone()
cur = conn.execute(
"""INSERT OR IGNORE INTO documents (doc_id, source_id, kind, external_id, url, title, date, dedup_key)
VALUES (?,?,?,?,?,?,?,?)""",
(doc_id, source_id, kind, external_id, url, title, date, dkey),
)
conn.commit()
if not cur.rowcount:
return (0, 0) # same (source_id, external_id) already known
if dup:
conn.execute(
"UPDATE documents SET processed_at=datetime('now'), raw_path=? WHERE doc_id=?",
(f"dup_of:{dup['doc_id']}", doc_id),
)
conn.commit()
log.info("skip transcribe for %s — duplicate content of %s", doc_id, dup["doc_id"])
return (1, 0)
h = hashlib.sha256(f"{doc_id}|audio-v0".encode()).hexdigest()
job = queue.enqueue(conn, job_type="transcribe", target_id=doc_id, input_hash=h,
parent_doc_id=doc_id, priority=100)
return (1, 1 if job is not None else 0)
def ingest_rss(conn: sqlite3.Connection, source: sqlite3.Row, *, since=None, until=None, limit=20):
if not source["rss_url"]:
raise ValueError(f"{source['source_id']} has no rss_url")
recs = episode_records(fetch_feed(source["rss_url"]))
n_docs = n_jobs = count = 0
for r in recs:
d = r["published"]
if since and d and d < since:
continue
if until and d and d > until:
continue
if count >= limit:
break
count += 1
nd, nj = _enqueue_doc(conn, source_id=source["source_id"], kind="podcast",
external_id=r["guid"], url=r["audio_url"], title=r["title"], date=d)
n_docs += nd
n_jobs += nj
return n_docs, n_jobs
def ingest_youtube(conn: sqlite3.Connection, source: sqlite3.Row, *, since=None, until=None,
limit=20, max_scan=800):
"""Enumerate channel videos in the date window via yt-dlp (NON-flat, so upload_date is populated —
flat mode returns NA). Videos come newest-first, so we use --dateafter/--datebefore to select the
window and --break-match-filters to STOP scanning once we drop below `since` (avoids walking the
entire channel history). The transcribe worker downloads audio on demand."""
if not source["channel_url"]:
raise ValueError(f"{source['source_id']} has no channel_url")
url = source["channel_url"].rstrip("/")
if "/playlist" not in url and not url.endswith("/videos"):
url = url + "/videos"
cmd = ["yt-dlp", "--no-warnings", "--ignore-errors", "--skip-download",
"--print", "%(id)s\t%(upload_date)s\t%(title)s", "--playlist-end", str(max_scan)]
if since:
s = since.replace("-", "")
cmd += ["--dateafter", s, "--break-match-filters", f"upload_date>={s}"]
if until:
cmd += ["--datebefore", until.replace("-", "")]
cmd.append(url)
out = subprocess.run(cmd, capture_output=True, text=True, timeout=900)
n_docs = n_jobs = count = 0
for line in out.stdout.splitlines():
parts = line.split("\t")
if len(parts) < 2 or not parts[0] or parts[1] in ("NA", ""):
continue
vid, upd = parts[0], parts[1]
title = parts[2] if len(parts) > 2 else vid
date = f"{upd[:4]}-{upd[4:6]}-{upd[6:8]}" if len(upd) == 8 else None
if count >= limit:
break
count += 1
nd, nj = _enqueue_doc(conn, source_id=source["source_id"], kind="youtube",
external_id=vid, url=f"https://www.youtube.com/watch?v={vid}",
title=title, date=date)
n_docs += nd
n_jobs += nj
return n_docs, n_jobs
+60
View File
@@ -0,0 +1,60 @@
"""Cross-chunk speaker stitching + the voiceprint library (§4.1, §4.5).
diarize-chunk returns a 192-d TitaNet voiceprint per speaker per chunk. Because each chunk is
diarized independently, "Speaker 1" in chunk 3 is not the same label as "Speaker 1" in chunk 7 —
we re-cluster by cosine similarity (~0.7 distance threshold) so one person gets one identity across
the whole episode. The SAME library then matches a guest ACROSS shows by voice (the independence
graph's hardest edge, §4.5).
"""
from __future__ import annotations
import numpy as np
DISTANCE_THRESHOLD = 0.7 # cosine DISTANCE (1 - cosine similarity); §4.1
def _unit(v: np.ndarray) -> np.ndarray:
n = np.linalg.norm(v)
return v / n if n else v
def cosine_distance(a: np.ndarray, b: np.ndarray) -> float:
return float(1.0 - np.dot(_unit(np.asarray(a, dtype=float)), _unit(np.asarray(b, dtype=float))))
def stitch_chunks(chunk_voiceprints: list[np.ndarray], *, threshold: float = DISTANCE_THRESHOLD) -> list[int]:
"""Greedy online clustering of per-(chunk,speaker) voiceprints into stable speaker ids.
Input: a flat list of voiceprint vectors (one per chunk-speaker, in encounter order).
Output: a parallel list of cluster ids. A vector joins the nearest existing cluster if its
distance to that cluster's centroid < threshold, else it starts a new cluster.
"""
centroids: list[np.ndarray] = []
counts: list[int] = []
labels: list[int] = []
for vp in chunk_voiceprints:
vp = np.asarray(vp, dtype=float)
if centroids:
dists = [cosine_distance(vp, c) for c in centroids]
j = int(np.argmin(dists))
if dists[j] < threshold:
centroids[j] = (centroids[j] * counts[j] + vp) / (counts[j] + 1)
counts[j] += 1
labels.append(j)
continue
centroids.append(vp.copy())
counts.append(1)
labels.append(len(centroids) - 1)
return labels
def match_library(vp: np.ndarray, library: list[tuple[str, np.ndarray]], *,
threshold: float = DISTANCE_THRESHOLD) -> str | None:
"""Return the voiceprint_id of the closest library entry within threshold, else None
(a new speaker → caller mints a new library id)."""
best_id, best_d = None, threshold
for vid, lib_vec in library:
d = cosine_distance(vp, lib_vec)
if d < best_d:
best_id, best_d = vid, d
return best_id
+308
View File
@@ -0,0 +1,308 @@
"""Audio → speaker-attributed transcript + voiceprint library (§4.1, §4.5).
Per chunk (sequential — audio lock): diarize-chunk (192-d TitaNet fingerprints + timed speaker
segments) + transcribe (word timestamps). Align words to speakers by time, stitch speakers ACROSS
chunks by fingerprint cosine, then match the persisted voiceprint library so the SAME guest is
recognized ACROSS shows by voice — the highest-leverage input to the source-independence graph.
"""
from __future__ import annotations
import logging
import time
import uuid
from pathlib import Path
import numpy as np
from ..backfill import queue
from .chunker import chunk_audio
from .download import download_enclosure, download_youtube_audio, to_wav_16k_mono
from .speaker_stitch import DISTANCE_THRESHOLD, match_library, stitch_chunks
log = logging.getLogger(__name__)
# ---------- alignment ----------
def _speaker_at(segments: list[dict], t: float) -> str:
for s in segments:
if s["start_s"] <= t <= s["end_s"]:
return s["speaker"]
if not segments:
return "Speaker_0"
return min(segments, key=lambda s: min(abs(s["start_s"] - t), abs(s["end_s"] - t)))["speaker"]
def align_words(words: list[dict], segments: list[dict]) -> list[dict]:
"""Group word-level transcription into speaker turns using the diarization segments."""
turns: list[dict] = []
cur: dict | None = None
for w in words:
mid = (w["start"] + w["end"]) / 2
spk = _speaker_at(segments, mid)
if cur and cur["speaker"] == spk:
cur["text"] += " " + w["text"]
cur["end"] = w["end"]
else:
if cur:
turns.append(cur)
cur = {"speaker": spk, "start": w["start"], "end": w["end"], "text": w["text"]}
if cur:
turns.append(cur)
return turns
# ---------- per-document audio processing ----------
def diarize_transcribe_chunks(sc, chunks: list[Path], *, concurrency: int = 2):
"""Returns (chunk_turns, chunk_speakers): turns per chunk + (chunk_idx, local_spk, fingerprint).
Drives up to `concurrency` chunks in flight — the client's global audio SEMAPHORE is the hard cap
across both parakeet endpoints (sit at 2: keeps the single serial GPU continuously fed = full
throughput, no idle gap). A single chunk's failure is non-fatal (skip; the client already busy-
retries transient blips), but if a MAJORITY of chunks fail the whole job raises so it retries later
(rather than emitting a half-empty transcript). Results are reassembled in chunk order."""
from concurrent.futures import ThreadPoolExecutor, as_completed
def _one(idx: int, ch: Path):
dia = sc.diarize_chunk(str(ch))
tr = sc.transcribe(str(ch))
turns = align_words(tr.get("words", []), dia.get("segments", []))
spks = [(idx, spk, np.asarray(vec, dtype=np.float32))
for spk, vec in (dia.get("fingerprints") or {}).items()]
return idx, turns, spks
results: dict[int, tuple] = {}
failed = 0
with ThreadPoolExecutor(max_workers=max(1, concurrency)) as pool:
futs = {pool.submit(_one, i, ch): i for i, ch in enumerate(chunks)}
for fut in as_completed(futs):
try:
idx, turns, spks = fut.result()
results[idx] = (turns, spks)
except Exception as e: # noqa: BLE001 — one contended chunk shouldn't kill the episode
failed += 1
log.warning("chunk %d/%d failed (%s) — skipping", futs[fut], len(chunks), str(e)[:90])
if chunks and failed >= max(3, len(chunks) // 2):
raise RuntimeError(f"{failed}/{len(chunks)} chunks failed — backend contended; will retry later")
chunk_turns = [(idx, results[idx][0]) for idx in sorted(results)]
chunk_speakers = [s for idx in sorted(results) for s in results[idx][1]]
return chunk_turns, chunk_speakers
def stitch_and_centroids(chunk_speakers, *, threshold: float = DISTANCE_THRESHOLD):
"""Cluster all (chunk,speaker) fingerprints into within-episode global speakers."""
if not chunk_speakers:
return {}, {}
vecs = [v for (_, _, v) in chunk_speakers]
labels = stitch_chunks(vecs, threshold=threshold)
keymap: dict[tuple[int, str], int] = {}
groups: dict[int, list[np.ndarray]] = {}
for (idx, spk, vec), lab in zip(chunk_speakers, labels):
keymap[(idx, spk)] = lab
groups.setdefault(lab, []).append(vec)
centroids = {lab: np.mean(v, axis=0) for lab, v in groups.items()}
return keymap, centroids
def _load_library(conn) -> list[tuple[str, np.ndarray]]:
rows = conn.execute("SELECT voiceprint_id, vector, person_label FROM voiceprints").fetchall()
return [(r["voiceprint_id"], np.frombuffer(r["vector"], dtype=np.float32)) for r in rows]
def _label_for(conn, vpid: str) -> str:
r = conn.execute("SELECT person_label FROM voiceprints WHERE voiceprint_id=?", (vpid,)).fetchone()
return (r["person_label"] if r and r["person_label"] else f"SPK:{vpid[:8]}")
def resolve_voiceprints(conn, doc, centroids: dict[int, np.ndarray], *, threshold: float = DISTANCE_THRESHOLD):
"""Match each within-episode speaker to the persisted library (cross-show identity) or mint a new
one; record observations; add shared_guest edges when the voice also appears in ANOTHER source."""
library = _load_library(conn)
cluster_to_vpid: dict[int, str] = {}
for lab, cen in centroids.items():
vpid = match_library(cen, library, threshold=threshold)
if vpid is None:
vpid = "vp_" + uuid.uuid4().hex[:16]
conn.execute(
"INSERT INTO voiceprints (voiceprint_id, vector, first_doc_id) VALUES (?,?,?)",
(vpid, cen.astype(np.float32).tobytes(), doc["doc_id"]),
)
library.append((vpid, cen))
conn.execute(
"INSERT INTO voiceprint_observations (voiceprint_id, doc_id, chunk_idx) VALUES (?,?,?)",
(vpid, doc["doc_id"], None),
)
cluster_to_vpid[lab] = vpid
conn.commit()
# independence graph (§4.5): if this voice appears in a DIFFERENT source, that's a shared guest.
for vpid in set(cluster_to_vpid.values()):
others = conn.execute(
"""SELECT DISTINCT d.source_id FROM voiceprint_observations o
JOIN documents d ON d.doc_id = o.doc_id
WHERE o.voiceprint_id=? AND d.source_id != ?""",
(vpid, doc["source_id"]),
).fetchall()
for o in others:
a, b = sorted([doc["source_id"], o["source_id"]])
conn.execute(
"""INSERT INTO source_edges (src_a, src_b, edge_type, weight, evidence)
VALUES (?,?,'shared_guest',1.0,?)
ON CONFLICT(src_a, src_b, edge_type)
DO UPDATE SET weight = weight + 1.0, evidence = excluded.evidence""",
(a, b, vpid),
)
conn.commit()
return cluster_to_vpid
def _labeled(chunk_turns, keymap, label_by_cluster: dict) -> str:
lines: list[str] = []
for idx, turns in chunk_turns:
for t in turns:
lab = keymap.get((idx, t["speaker"]))
label = label_by_cluster.get(lab, t["speaker"])
lines.append(f"{label}: {t['text']}")
return "\n".join(lines)
def build_transcript(conn, chunk_turns, keymap, cluster_to_vpid) -> str:
labels = {lab: _label_for(conn, vpid) for lab, vpid in cluster_to_vpid.items()}
return _labeled(chunk_turns, keymap, labels)
def apply_names(conn, cluster_to_vpid: dict, idmap: dict) -> dict:
"""Attach confident names to the voiceprint library (person_label). Returns {cluster: name}."""
named: dict[int, str] = {}
for lab, vpid in cluster_to_vpid.items():
info = idmap.get(f"Speaker {lab + 1}") or idmap.get(str(lab + 1)) or {}
name = (info.get("name") or "").strip() if isinstance(info, dict) else ""
if name and info.get("confidence") in ("med", "high"):
conn.execute("UPDATE voiceprints SET person_label=? WHERE voiceprint_id=?", (name, vpid))
named[lab] = name
conn.commit()
return named
def add_name_edges(conn, doc, cluster_to_vpid: dict) -> int:
"""Name-based shared_guest edges: same person_label seen in a DIFFERENT source → independence edge,
even if the voiceprints didn't cluster (drift-robust complement to voiceprint matching, §4.5)."""
n = 0
for vpid in set(cluster_to_vpid.values()):
r = conn.execute("SELECT person_label FROM voiceprints WHERE voiceprint_id=?", (vpid,)).fetchone()
name = r["person_label"] if r else None
if not name:
continue
others = conn.execute(
"""SELECT DISTINCT d.source_id FROM voiceprints v
JOIN voiceprint_observations o ON o.voiceprint_id = v.voiceprint_id
JOIN documents d ON d.doc_id = o.doc_id
WHERE v.person_label = ? AND d.source_id != ?""",
(name, doc["source_id"]),
).fetchall()
for o in others:
a, b = sorted([doc["source_id"], o["source_id"]])
conn.execute(
"""INSERT INTO source_edges (src_a, src_b, edge_type, weight, evidence)
VALUES (?,?,'shared_guest',1.0,?)
ON CONFLICT(src_a, src_b, edge_type)
DO UPDATE SET weight = weight + 1.0, evidence = excluded.evidence""",
(a, b, f"name:{name}"),
)
n += 1
conn.commit()
return n
def _download_audio(doc, cfg) -> Path:
cache = Path(cfg.audio_cache_dir)
cache.mkdir(parents=True, exist_ok=True)
wav = cache / f"{doc['doc_id'].replace(':', '_')}.wav"
if wav.exists():
return wav
url = doc["url"]
if doc["kind"] == "youtube" or (url and ("youtube.com" in url or "youtu.be" in url)):
return download_youtube_audio(url, cache, archive_file=cache / "yt-archive.txt")
raw = download_enclosure(url, cache / f"{doc['doc_id'].replace(':', '_')}.src")
return to_wav_16k_mono(raw, wav)
def process_document(conn, sc, cfg, doc, *, max_chunks: int, chunk_seconds: int = 150,
keep_audio: bool = False) -> int:
audio = _download_audio(doc, cfg)
chunkdir = Path(cfg.audio_cache_dir) / f"chunks_{doc['doc_id'].replace(':', '_')}"
chunks = chunk_audio(audio, chunkdir, chunk_seconds=chunk_seconds)[:max_chunks]
chunk_turns, chunk_speakers = diarize_transcribe_chunks(
sc, chunks, concurrency=getattr(cfg, "audio_concurrency", 2))
keymap, centroids = stitch_and_centroids(chunk_speakers)
cluster_to_vpid = resolve_voiceprints(conn, doc, centroids)
# Name the speakers (§4.5): host introduces guest in 1-on-1 → attach person_label, then a
# name-based shared_guest edge that survives voiceprint drift across shows.
src = conn.execute("SELECT name FROM sources WHERE source_id=?", (doc["source_id"],)).fetchone()
try:
from ..extract.backends import from_config as backend_from_config
from .identify import identify_speakers
backend = backend_from_config(cfg, sc)
draft = _labeled(chunk_turns, keymap, {lab: f"Speaker {lab + 1}" for lab in cluster_to_vpid})
idmap = identify_speakers(backend, draft[:6000], source_name=src["name"] if src else "")
named = apply_names(conn, cluster_to_vpid, idmap)
if named:
log.info("named speakers in %s: %s", doc["doc_id"], ", ".join(named.values()))
except Exception as e: # noqa: BLE001 — naming is best-effort enrichment
log.warning("speaker identification failed for %s: %s", doc["doc_id"], e)
add_name_edges(conn, doc, cluster_to_vpid)
transcript = build_transcript(conn, chunk_turns, keymap, cluster_to_vpid)
tpath = Path(cfg.data_dir) / "transcripts" / f"{doc['doc_id'].replace(':', '_')}.txt"
tpath.parent.mkdir(parents=True, exist_ok=True)
tpath.write_text(transcript)
import hashlib
content_hash = hashlib.sha256(transcript.encode()).hexdigest()
conn.execute(
"UPDATE documents SET transcript_path=?, duration_sec=?, content_hash=?, processed_at=datetime('now') WHERE doc_id=?",
(str(tpath), len(chunks) * chunk_seconds, content_hash, doc["doc_id"]),
)
conn.commit()
h = hashlib.sha256(f"{doc['doc_id']}|extract-v0".encode()).hexdigest()
queue.enqueue(conn, job_type="extract", target_id=doc["doc_id"], input_hash=h,
parent_doc_id=doc["doc_id"], priority=100)
if not keep_audio:
_cleanup_audio(audio, chunkdir)
return len(chunk_turns)
def _cleanup_audio(audio: Path, chunkdir: Path) -> None:
"""Audio files are large and disposable once transcribed — reclaim the disk (the transcript +
voiceprints are what we keep). Backfilling hundreds of 1-3 hr episodes would otherwise be tens of GB."""
import shutil
try:
if audio.exists():
audio.unlink()
src = audio.with_suffix(".src")
if src.exists():
src.unlink()
if chunkdir.exists():
shutil.rmtree(chunkdir, ignore_errors=True)
except Exception as e: # noqa: BLE001
log.warning("audio cleanup failed for %s: %s", audio, e)
def run_transcribe(conn, sc, cfg, *, limit: int = 5, max_chunks: int = 999,
lease_seconds: int = 3600, worker_id: str = "transcribe-1") -> dict:
processed = 0
while processed < limit:
job = queue.lease_next(conn, worker_id=worker_id, job_types=["transcribe"], lease_seconds=lease_seconds)
if job is None:
break
processed += 1
doc = conn.execute("SELECT * FROM documents WHERE doc_id=?", (job["target_id"],)).fetchone()
if doc is None:
queue.skip(conn, job["job_id"], "document missing")
continue
try:
n = process_document(conn, sc, cfg, doc, max_chunks=max_chunks)
queue.complete(conn, job["job_id"], output_ref=f"{n} chunks")
log.info("transcribed %s (%d chunks)", doc["doc_id"], n)
except Exception as e: # noqa: BLE001
state = queue.fail(conn, job["job_id"], e)
log.warning("transcribe failed for %s: %s (→ %s)", job["target_id"], e, state)
return {"jobs_processed": processed}