Initial commit: Ten31 Signal Engine (ingest, scoring brain, corpus seeds)

2026-06-15 09:24:29 -05:00
commit a6aec77506
77 changed files with 6263 additions and 0 deletions
@@ -0,0 +1,5 @@
+"""Ingestion layer (§4.1) — the biggest greenfield piece.
+
+Spark Control transcribes audio you hand it; it does NOT fetch. Everything here is fetch/schedule:
+RSS + YouTube + EDGAR + FMP earnings, long-audio chunking, and cross-chunk speaker stitching.
+"""
@@ -0,0 +1,36 @@
+"""Long-audio chunking (§4.1, §13.4).
+
+Podcasts run 1–3 h; the diarizer caps at 4 speakers/chunk and Spark 2 is a single GPU, so we cut
+long audio into ~2–3 min pieces sent SEQUENTIALLY (parallel audio → 503 FFT race). Each chunk is
+diarized independently and re-stitched across chunks by voiceprint (see speaker_stitch.py).
+Requires ffmpeg/ffprobe.
+"""
+from __future__ import annotations
+
+import subprocess
+from pathlib import Path
+
+# Default length, in seconds, of each piece produced by chunk_audio().
+CHUNK_SECONDS_DEFAULT = 150  # 2.5 min, within the ~2–3 min guidance
+
+
+def duration_seconds(src: str | Path) -> float:
+    out = subprocess.run(
+        ["ffprobe", "-v", "error", "-show_entries", "format=duration",
+         "-of", "default=noprint_wrappers=1:nokey=1", str(src)],
+        check=True, capture_output=True, text=True,
+    )
+    return float(out.stdout.strip())
+
+
+def chunk_audio(src: str | Path, out_dir: str | Path, *, chunk_seconds: int = CHUNK_SECONDS_DEFAULT) -> list[Path]:
+    """Split into fixed-length WAV chunks using ffmpeg's segment muxer (no re-encode of timing).
+    Returns chunk paths in order. Order matters: the queue sends them sequentially."""
+    out_dir = Path(out_dir)
+    out_dir.mkdir(parents=True, exist_ok=True)
+    pattern = str(out_dir / "chunk_%04d.wav")
+    subprocess.run(
+        ["ffmpeg", "-y", "-i", str(src), "-f", "segment", "-segment_time", str(chunk_seconds),
+         "-ar", "16000", "-ac", "1", "-reset_timestamps", "1", pattern],
+        check=True, capture_output=True,
+    )
+    return sorted(out_dir.glob("chunk_*.wav"))
@@ -0,0 +1,159 @@
+"""Text-document fetcher for the Battery (bitcoin-collateralized lending) corpus and any non-filing,
+non-audio source: policy primaries (SEC SABs, OCC/FDIC/Fed), lender/issuer blogs, credit-market data.
+
+Unlike EDGAR (CIK-driven) and the podcast path (audio→transcribe), these are dated HTML pages, PDFs, or
+article RSS feeds. We fetch ONCE, extract clean text (HTML via html_to_text, PDF via pypdf), save it, and
+point documents.transcript_path at the saved text so the extract worker reads it directly (it already
+supports transcript_path) — this also lets PDFs work, which the worker's on-demand html_to_text fetch can't.
+
+A source row must exist first (FK). Lineage/axis live on the source's cluster/notes (set in the seed);
+policy sources are axis=context and must NOT feed the supply resolver (weight 0) — enforced downstream.
+"""
+from __future__ import annotations
+
+import hashlib
+import io
+import logging
+import sqlite3
+from pathlib import Path
+
+import requests
+
+from ..backfill import queue
+from ..extract.html_text import html_to_text
+from .feeds import fetch_feed
+
+# Module-level logger, named after this module.
+log = logging.getLogger(__name__)
+
+# User-Agent header used when neither the caller nor cfg.user_agent supplies one.
+DEFAULT_UA = "ten31-signal-engine/1.0 (research; contact ops@ten31.xyz)"
+
+
+def _pdf_to_text(data: bytes, *, max_chars: int) -> str:
+    import pypdf
+    reader = pypdf.PdfReader(io.BytesIO(data))
+    parts: list[str] = []
+    total = 0
+    for page in reader.pages:
+        t = page.extract_text() or ""
+        parts.append(t)
+        total += len(t)
+        if total > max_chars:
+            break
+    return "\n".join(parts)[:max_chars]
+
+
+def fetch_clean_text(url: str, *, method: str = "auto", ua: str = DEFAULT_UA,
+                     timeout: int = 90, max_chars: int = 300_000) -> str:
+    """Fetch a URL once and return clean text. Auto-detects PDF vs HTML by content-type + magic bytes."""
+    r = requests.get(url, headers={"User-Agent": ua}, timeout=timeout)
+    r.raise_for_status()
+    ctype = r.headers.get("Content-Type", "").lower()
+    is_pdf = method == "pdf" or "application/pdf" in ctype or r.content[:5] == b"%PDF-"
+    if is_pdf:
+        return _pdf_to_text(r.content, max_chars=max_chars)
+    return html_to_text(r.text, max_chars=max_chars)
+
+
+_BLOCK_MARKERS = (
+    "aggressive automated scraping", "request access", "access denied", "are you a robot",
+    "enable javascript", "captcha", "verify you are human", "rate limit exceeded",
+    "403 forbidden", "unusual traffic", "checking your browser",
+)
+
+
+def _looks_blocked(text: str) -> bool:
+    """Anti-scraping interstitials return 200 + a short access-denied body. Detect so we don't ingest
+    a block page as if it were the document (a real policy/blog doc is long and has no such markers)."""
+    low = text[:2500].lower()
+    return any(m in low for m in _BLOCK_MARKERS)
+
+
+def _doc_id(source_id: str, url: str) -> str:
+    return f"doc:{source_id}:{hashlib.sha256(url.encode()).hexdigest()[:12]}"
+
+
+def ingest_one(conn: sqlite3.Connection, cfg, *, source_id: str, url: str, title: str,
+               date: str | None, method: str = "auto", prompt_version: str = "extract-v0",
+               min_chars: int = 400) -> str | None:
+    """Fetch+store one text document and enqueue extraction. Idempotent on (source_id, url).
+    Returns doc_id if newly ingested, else None (duplicate, too-short, or fetch error → logged)."""
+    doc_id = _doc_id(source_id, url)
+    if conn.execute("SELECT 1 FROM documents WHERE doc_id=?", (doc_id,)).fetchone():
+        return None
+    ua = getattr(cfg, "user_agent", None) or DEFAULT_UA
+    try:
+        text = fetch_clean_text(url, method=method, ua=ua)
+    except Exception as e:  # noqa: BLE001
+        log.warning("doc fetch failed %s: %s", url, e)
+        return None
+    if not text or len(text) < min_chars:
+        log.warning("doc too short (%d chars), skipping %s", len(text or ""), url)
+        return None
+    if _looks_blocked(text):
+        log.warning("blocked/anti-scrape page detected, skipping %s", url)
+        return None
+    safe = doc_id.replace(":", "_")
+    tpath = Path(cfg.data_dir) / "docs" / f"{safe}.txt"
+    tpath.parent.mkdir(parents=True, exist_ok=True)
+    tpath.write_text(text)
+    content_hash = hashlib.sha256(text.encode()).hexdigest()
+    conn.execute(
+        """INSERT OR IGNORE INTO documents
+             (doc_id, source_id, kind, external_id, url, title, date, transcript_path, content_hash, processed_at)
+           VALUES (?,?,?,?,?,?,?,?,?,datetime('now'))""",
+        (doc_id, source_id, "filing", url, url, title[:300] if title else url, date, str(tpath), content_hash),
+    )
+    conn.commit()
+    h = hashlib.sha256(f"{doc_id}|{prompt_version}".encode()).hexdigest()
+    queue.enqueue(conn, job_type="extract", target_id=doc_id, input_hash=h,
+                  parent_doc_id=doc_id, priority=50)
+    conn.commit()
+    log.info("ingested doc %s (%d chars) for %s", doc_id, len(text), source_id)
+    return doc_id
+
+
+def ingest_manifest(conn: sqlite3.Connection, cfg, path) -> dict:
+    """Batch-ingest the docs listed in a YAML manifest ({docs:[{source,url,title,date,method}]}).
+    Returns {ingested, skipped, missing_source}. Each source must already exist (FK)."""
+    import yaml
+    from pathlib import Path as _Path
+    data = yaml.safe_load(_Path(path).read_text()) or {}
+    docs = data.get("docs", [])
+    ingested = skipped = missing = 0
+    for d in docs:
+        src = d.get("source")
+        if not conn.execute("SELECT 1 FROM sources WHERE source_id=?", (src,)).fetchone():
+            log.warning("manifest doc references missing source %r — skipping %s", src, d.get("url"))
+            missing += 1
+            continue
+        doc_id = ingest_one(conn, cfg, source_id=src, url=d["url"], title=d.get("title", d["url"]),
+                            date=d.get("date"), method=d.get("method", "auto"))
+        if doc_id:
+            ingested += 1
+        else:
+            skipped += 1
+    return {"ingested": ingested, "skipped": skipped, "missing_source": missing}
+
+
+def ingest_feed_text(conn: sqlite3.Connection, cfg, *, source_id: str, rss_url: str,
+                     since: str | None = None, until: str | None = None, limit: int = 50) -> int:
+    """Ingest the ARTICLE bodies behind a text RSS feed (blog/press feed). Each item's link is fetched
+    and stored as a dated text document. Returns count of newly-ingested docs."""
+    from .feeds import _published_iso
+    parsed = fetch_feed(rss_url, user_agent=getattr(cfg, "user_agent", None) or DEFAULT_UA)
+    n = 0
+    for entry in parsed.entries:
+        if n >= limit:
+            break
+        link = entry.get("link")
+        if not link:
+            continue
+        date = _published_iso(entry)
+        if since and date and date < since:
+            continue
+        if until and date and date > until:
+            continue
+        if ingest_one(conn, cfg, source_id=source_id, url=link,
+                      title=entry.get("title", link), date=date):
+            n += 1
+    return n
@@ -0,0 +1,61 @@
+"""Audio acquisition (§4.1). Spark Control transcribes audio you fetch — this fetches it.
+
+- Podcast enclosures: a plain streaming download that follows the Podtrac/Megaphone redirects to the
+  final signed CDN object (download immediately; resolved URLs carry short-lived params).
+- YouTube: yt-dlp (audio-only → 16 kHz mono WAV). NOTE: 2026 YouTube enforces PO Tokens broadly — run
+  the `bgutil-ytdlp-pot-provider` sidecar or pulls will 403. yt-dlp is treated as a LAST resort; prefer
+  the RSS enclosure where a show publishes both (ToS: downloading YT audio violates YouTube ToS).
+"""
+from __future__ import annotations
+
+import subprocess
+from pathlib import Path
+
+import requests
+
+# User-Agent header download_enclosure sends unless the caller passes its own.
+DEFAULT_UA = "Ten31SignalEngine/0.1 (+https://ten31.xyz)"
+
+
+def download_enclosure(url: str, dest: str | Path, *, user_agent: str = DEFAULT_UA, timeout: int = 120) -> Path:
+    dest = Path(dest)
+    dest.parent.mkdir(parents=True, exist_ok=True)
+    with requests.get(url, stream=True, allow_redirects=True,
+                      headers={"User-Agent": user_agent}, timeout=timeout) as r:
+        r.raise_for_status()
+        with open(dest, "wb") as f:
+            for chunk in r.iter_content(chunk_size=1 << 16):
+                f.write(chunk)
+    return dest
+
+
+def to_wav_16k_mono(src: str | Path, dst: str | Path) -> Path:
+    """Normalize any audio to 16 kHz mono PCM WAV (what the ASR endpoint wants). Requires ffmpeg."""
+    dst = Path(dst)
+    dst.parent.mkdir(parents=True, exist_ok=True)
+    subprocess.run(
+        ["ffmpeg", "-y", "-i", str(src), "-ar", "16000", "-ac", "1", "-f", "wav", str(dst)],
+        check=True, capture_output=True,
+    )
+    return dst
+
+
+def download_youtube_audio(url: str, out_dir: str | Path, *, archive_file: str | Path | None = None) -> Path:
+    """Audio-only via yt-dlp → 16 kHz mono WAV. `archive_file` (yt-dlp --download-archive) is the
+    canonical 'only-new' dedup for channel/playlist back-catalog pulls."""
+    out_dir = Path(out_dir)
+    out_dir.mkdir(parents=True, exist_ok=True)
+    cmd = [
+        "yt-dlp", "-f", "bestaudio/best", "-x", "--audio-format", "wav",
+        "--postprocessor-args", "ffmpeg:-ar 16000 -ac 1",
+        "-o", str(out_dir / "%(id)s.%(ext)s"),
+        "--no-progress",
+    ]
+    if archive_file:
+        cmd += ["--download-archive", str(archive_file)]
+    cmd.append(url)
+    subprocess.run(cmd, check=True, capture_output=True)
+    # yt-dlp names the file by video id; return the newest wav
+    wavs = sorted(out_dir.glob("*.wav"), key=lambda p: p.stat().st_mtime)
+    if not wavs:
+        raise RuntimeError("yt-dlp produced no wav (PO-token/cookies issue? see module docstring)")
+    return wavs[-1]
@@ -0,0 +1,127 @@
+"""Earnings-call transcripts via Financial Modeling Prep (§4.1, §12 — decision: FMP).
+
+Audio isn't reliably fetchable for large-caps (no uniform feed; ~30–90d replay expiry breaks
+backfill), so FMP's transcript API is the backbone and EDGAR filings remain the durable core. FMP
+also exposes an earnings *calendar* to trigger ingestion on the day a call drops.
+
+Endpoint paths/params were checked against the FMP 'stable' docs on 2026-06-07 (see the note on
+FMPClient); re-confirm them for the account tier at integration. Needs config.fmp_api_key.
+"""
+from __future__ import annotations
+
+import hashlib
+import sqlite3
+from pathlib import Path
+from typing import Any
+
+import requests
+
+# Root of FMP's "stable" API; FMPClient appends endpoint paths to it.
+FMP_BASE = "https://financialmodelingprep.com/stable"
+
+
+class FMPClient:
+    def __init__(self, api_key: str, *, base: str = FMP_BASE, timeout: int = 30) -> None:
+        if not api_key:
+            raise ValueError("FMP_API_KEY is required for earnings-call transcripts")
+        self.api_key = api_key
+        self.base = base
+        self.timeout = timeout
+        self.s = requests.Session()
+
+    def _get(self, path: str, **params: Any) -> Any:
+        params["apikey"] = self.api_key
+        r = self.s.get(f"{self.base}/{path}", params=params, timeout=self.timeout)
+        r.raise_for_status()
+        return r.json()
+
+    # Confirmed against FMP 'stable' 2026-06-07 (v3 is legacy/403). Note singular "earning".
+    def transcript_dates(self, symbol: str) -> Any:
+        """List available transcripts: [{quarter, fiscalYear, date}, ...]."""
+        return self._get("earning-call-transcript-dates", symbol=symbol)
+
+    def transcript(self, symbol: str, *, year: int, quarter: int) -> Any:
+        """One transcript: [{symbol, period, year, date, content}]. Use the `date` field as the
+        document date — FMP's year/quarter labels are fiscal and can be offset from the call date."""
+        return self._get("earning-call-transcript", symbol=symbol, year=year, quarter=quarter)
+
+    def earnings_calendar(self, *, from_date: str, to_date: str) -> Any:
+        """Earnings calendar (ingestion trigger): [{symbol, date, epsActual, ...}, ...]."""
+        return self._get("earnings-calendar", **{"from": from_date, "to": to_date})
+
+
+def ingest_transcript(
+    conn: sqlite3.Connection,
+    *,
+    source_id: str,
+    symbol: str,
+    year: int,
+    quarter: int,
+    content: str,
+    date: str | None,
+    data_dir: Path,
+    prompt_version: str = "extract-v0",
+) -> tuple[bool, bool]:
+    """Store one transcript (content written to disk → transcript_path) and enqueue an 'extract'
+    job. Idempotent. Returns (new_document, new_job)."""
+    from ..backfill import queue
+
+    external_id = f"{symbol}-{year}Q{quarter}"
+    doc_id = f"earnings:{external_id}"
+    tdir = Path(data_dir) / "transcripts"
+    tdir.mkdir(parents=True, exist_ok=True)
+    tpath = tdir / f"{external_id}.txt"
+    tpath.write_text(content)
+    content_hash = hashlib.sha256(content.encode()).hexdigest()
+    cur = conn.execute(
+        """INSERT OR IGNORE INTO documents
+             (doc_id, source_id, kind, external_id, title, date, transcript_path, content_hash, processed_at)
+           VALUES (?,?,?,?,?,?,?,?, datetime('now'))""",
+        (doc_id, source_id, "earnings_call", external_id, f"{symbol} {year} Q{quarter} call",
+         date, str(tpath), content_hash),
+    )
+    conn.commit()
+    if not cur.rowcount:
+        return (False, False)
+    # earnings-call Q&A is the highest-yield text source (§4.1) → priority 40, ahead of filings (50).
+    h = hashlib.sha256(f"{doc_id}|{prompt_version}".encode()).hexdigest()
+    new_job = queue.enqueue(conn, job_type="extract", target_id=doc_id, input_hash=h,
+                            parent_doc_id=doc_id, priority=40) is not None
+    return (True, new_job)
+
+
+def ingest_for_ticker(
+    conn: sqlite3.Connection,
+    fmp: FMPClient,
+    *,
+    source_id: str,
+    symbol: str,
+    data_dir: Path,
+    since: str | None = None,
+    until: str | None = None,
+    limit: int = 8,
+) -> tuple[int, int]:
+    """Enumerate available transcripts via the dates index, fetch those in [since, until], and
+    ingest. Uses each transcript's own `date` (FMP fiscal labels are offset). Returns (docs, jobs)."""
+    dates = fmp.transcript_dates(symbol)
+    picked = []
+    for d in dates if isinstance(dates, list) else []:
+        dt = d.get("date")
+        if since and dt and dt < since:
+            continue
+        if until and dt and dt > until:
+            continue
+        picked.append(d)
+    n_docs = n_jobs = 0
+    for d in picked[:limit]:
+        tr = fmp.transcript(symbol, year=d["fiscalYear"], quarter=d["quarter"])
+        item = (tr[0] if isinstance(tr, list) and tr else tr) or {}
+        content = item.get("content") or ""
+        if not content:
+            continue
+        nd, nj = ingest_transcript(
+            conn, source_id=source_id, symbol=symbol, year=d["fiscalYear"], quarter=d["quarter"],
+            content=content, date=item.get("date") or d.get("date"), data_dir=data_dir,
+        )
+        n_docs += int(nd)
+        n_jobs += int(nj)
+    return n_docs, n_jobs
@@ -0,0 +1,148 @@
+"""SEC EDGAR ingestion (§4.1).
+
+Hits the official data.sec.gov / www.sec.gov APIs directly (free, keyless, full history).
+Two hard requirements:
+  - a descriptive User-Agent (SEC 403s requests without one) — from config.edgar_user_agent.
+  - ≤10 requests/sec aggregate — enforced by a min-interval throttle here.
+
+Supports an explicit date range AND historical shards (filings.files[]), so the §7.1 backtest can
+reach 2022–2023 filings, not just the most-recent ~1000.
+"""
+from __future__ import annotations
+
+import hashlib
+import sqlite3
+import time
+from typing import Iterator
+
+import requests
+
+# Parallel column arrays read from each submissions block, in the order _iter_array unpacks them.
+_FILING_COLS = ("accessionNumber", "form", "filingDate", "primaryDocument", "primaryDocDescription")
+
+
+class EdgarClient:
+    BASE_DATA = "https://data.sec.gov"
+    BASE_WWW = "https://www.sec.gov"
+
+    def __init__(self, user_agent: str, *, min_interval: float = 0.12) -> None:
+        if not user_agent or "@" not in user_agent:
+            raise ValueError("EDGAR requires a descriptive User-Agent with contact email (config.edgar_user_agent)")
+        self.s = requests.Session()
+        self.s.headers.update({"User-Agent": user_agent, "Accept-Encoding": "gzip, deflate"})
+        self.min_interval = min_interval
+        self._last = 0.0
+        self._tickers: dict[str, int] | None = None
+
+    def _throttle(self) -> None:
+        dt = time.monotonic() - self._last
+        if dt < self.min_interval:
+            time.sleep(self.min_interval - dt)
+        self._last = time.monotonic()
+
+    def _get(self, url: str) -> requests.Response:
+        self._throttle()
+        r = self.s.get(url, timeout=30)
+        r.raise_for_status()
+        return r
+
+    # ---- ticker → CIK ----
+    def ticker_map(self) -> dict[str, int]:
+        if self._tickers is None:
+            data = self._get(f"{self.BASE_WWW}/files/company_tickers.json").json()
+            self._tickers = {row["ticker"].upper(): int(row["cik_str"]) for row in data.values()}
+        return self._tickers
+
+    def cik_for(self, ticker: str) -> int | None:
+        return self.ticker_map().get(ticker.upper())
+
+    # ---- filings ----
+    def _iter_array(self, block: dict, forms, since, until) -> Iterator[dict]:
+        arrays = [block.get(c, []) for c in _FILING_COLS]
+        for acc, form, fdate, pdoc, pdesc in zip(*arrays):
+            if forms and form not in forms:
+                continue
+            if since and fdate < since:
+                continue
+            if until and fdate > until:
+                continue
+            yield {"accession": acc, "form": form, "filing_date": fdate,
+                   "primary_document": pdoc, "description": pdesc}
+
+    def iter_filings(
+        self,
+        cik: int,
+        *,
+        forms: tuple[str, ...] = ("10-K", "10-Q", "8-K"),
+        since: str | None = None,
+        until: str | None = None,
+    ) -> Iterator[dict]:
+        """Yield filing descriptors. Pulls the inline 'recent' block AND any historical shards whose
+        date window overlaps [since, until] — required to reach the backtest era for active filers."""
+        sub = self._get(f"{self.BASE_DATA}/submissions/CIK{cik:010d}.json").json()
+        recent = sub.get("filings", {}).get("recent", {})
+        for f in self._iter_array(recent, forms, since, until):
+            yield self._with_url(cik, f)
+        for shard in sub.get("filings", {}).get("files", []):
+            # shard has filingFrom / filingTo; skip shards entirely outside the window.
+            if until and shard.get("filingFrom", "") > until:
+                continue
+            if since and shard.get("filingTo", "9999") < since:
+                continue
+            block = self._get(f"{self.BASE_DATA}/submissions/{shard['name']}").json()
+            for f in self._iter_array(block, forms, since, until):
+                yield self._with_url(cik, f)
+
+    def _with_url(self, cik: int, f: dict) -> dict:
+        acc_nodash = f["accession"].replace("-", "")
+        f["cik"] = cik
+        f["url"] = f"{self.BASE_WWW}/Archives/edgar/data/{cik}/{acc_nodash}/{f['primary_document']}"
+        return f
+
+    def fetch_html(self, filing: dict) -> str:
+        return self._get(filing["url"]).text
+
+
+# Domestic annual/quarterly + foreign-private-issuer equivalents. 20-F (foreign annual, e.g. TSM/IREN),
+# 40-F (Canadian annual, e.g. CCJ). 8-K/6-K (current reports) excluded by default — low claim yield.
+HIGH_YIELD_FORMS = ("10-K", "10-Q", "20-F", "40-F")
+
+
+def ingest_filings(
+    conn: sqlite3.Connection,
+    client: EdgarClient,
+    *,
+    source_id: str,
+    ticker: str,
+    since: str | None = None,
+    until: str | None = None,
+    forms: tuple[str, ...] = HIGH_YIELD_FORMS,
+    prompt_version: str = "extract-v0",
+) -> tuple[int, int]:
+    """Insert filing documents and enqueue 'extract' jobs. Filings are text → no transcription;
+    they go straight to extraction (the extract worker fetches + strips the HTML later). Default
+    forms cover both domestic (10-K/10-Q) and foreign-private-issuer (20-F/40-F) filers.
+    Returns (new_documents, new_jobs). Idempotent on (source_id, accession)."""
+    from ..backfill import queue
+
+    cik = client.cik_for(ticker)
+    if cik is None:
+        raise ValueError(f"No CIK found for ticker {ticker!r}")
+    n_docs = n_jobs = 0
+    for f in client.iter_filings(cik, forms=forms, since=since, until=until):
+        doc_id = f"edgar:{f['accession']}"
+        cur = conn.execute(
+            """INSERT OR IGNORE INTO documents (doc_id, source_id, kind, external_id, url, title, date)
+               VALUES (?,?,?,?,?,?,?)""",
+            (doc_id, source_id, "filing", f["accession"], f["url"],
+             f"{ticker} {f['form']} {f['filing_date']}", f["filing_date"]),
+        )
+        conn.commit()
+        if not cur.rowcount:
+            continue
+        n_docs += 1
+        h = hashlib.sha256(f"{doc_id}|{prompt_version}".encode()).hexdigest()
+        # priority 50: filings are high-info-density (§4.1) → ahead of podcasts (100)
+        if queue.enqueue(conn, job_type="extract", target_id=doc_id, input_hash=h,
+                         parent_doc_id=doc_id, priority=50) is not None:
+            n_jobs += 1
+    return n_docs, n_jobs
@@ -0,0 +1,65 @@
+"""Podcast RSS ingestion (§4.1).
+
+feedparser + conditional GET (ETag/Last-Modified) for efficient incremental polling, with a
+composite (feed_url, guid) dedup discipline. Many podcast CDNs send no validators and some feeds
+truncate to recent episodes — for the §7.1 backtest, older episodes may need the show's full
+archive feed (some hosts expose `?limit=` / a separate archive URL) or a YouTube back-catalog.
+"""
+from __future__ import annotations
+
+import hashlib
+import time
+from typing import Any
+
+import feedparser
+
+# User-Agent fetch_feed passes to feedparser unless the caller supplies its own.
+DEFAULT_UA = "Ten31SignalEngine/0.1 (+https://ten31.xyz)"
+
+
+def fetch_feed(url: str, *, etag: str | None = None, modified: str | None = None,
+               user_agent: str = DEFAULT_UA) -> feedparser.FeedParserDict:
+    """Conditional GET. On HTTP 304 the result has .status == 304 and .entries == [] → skip."""
+    return feedparser.parse(url, etag=etag, modified=modified, agent=user_agent)
+
+
+def _published_iso(entry: Any) -> str | None:
+    t = entry.get("published_parsed") or entry.get("updated_parsed")
+    if not t:
+        return None
+    return time.strftime("%Y-%m-%d", t)
+
+
+def _enclosure_audio_url(entry: Any) -> str | None:
+    for enc in entry.get("enclosures", []) or []:
+        if str(enc.get("type", "")).startswith("audio"):
+            return enc.get("href") or enc.get("url")
+    # some feeds put audio only in links rel=enclosure
+    for link in entry.get("links", []) or []:
+        if link.get("rel") == "enclosure" and str(link.get("type", "")).startswith("audio"):
+            return link.get("href")
+    return None
+
+
+def _guid(entry: Any) -> str:
+    g = entry.get("id") or entry.get("link")
+    if g:
+        return str(g)
+    basis = f"{entry.get('title','')}|{entry.get('published','')}"
+    return "sha1:" + hashlib.sha1(basis.encode()).hexdigest()
+
+
+def episode_records(parsed: feedparser.FeedParserDict) -> list[dict]:
+    """Normalize feed entries to episode records. Skips entries with no audio enclosure."""
+    out: list[dict] = []
+    for e in parsed.entries:
+        audio = _enclosure_audio_url(e)
+        if not audio:
+            continue
+        out.append({
+            "guid": _guid(e),
+            "title": e.get("title"),
+            "audio_url": audio,
+            "link": e.get("link"),
+            "published": _published_iso(e),
+        })
+    return out
@@ -0,0 +1,195 @@
+"""One-time backfill path: transcribe podcast episodes via the Gemini multimodal API instead of the
+local Spark Parakeet+diarizer pipeline. Used to take a bulk backfill OFF the shared Spark GPU (which
+contends with production) — it is NOT the steady-state transcriber (local Parakeet remains the default).
+
+Scope/guardrail: podcast audio is PUBLIC data, so sending it to the frontier does NOT trip the
+exposure/positioning-data rule (that guardrail is about Ten31's conviction/exposure data, never public
+audio). Output is written in the SAME 'Speaker: text' transcript format the extractor consumes, so the
+downstream extract→embed stages are agnostic to which transcriber produced the file.
+
+Tradeoff vs local: Gemini yields speaker-LABELED text, not voiceprint fingerprints — so no voiceprint
+auto-edges. We rely on the hand-seeded EISC edges + name-based attribution instead (acceptable for a
+bounded backfill).
+"""
+from __future__ import annotations
+
+import hashlib
+import logging
+import re
+import time
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from pathlib import Path
+
+from ..backfill import queue
+from .download import download_enclosure
+
+# Module-level logger, named after this module.
+log = logging.getLogger(__name__)
+
+# Transcription instructions sent to the model together with the uploaded audio. `{host}` is
+# filled in with str.format by transcribe_one, so the text must not contain other bare braces.
+_PROMPT = (
+    "You are a precise podcast transcriptionist. Transcribe this audio VERBATIM as a speaker-diarized "
+    "transcript.\n"
+    "RULES:\n"
+    "- One line per speaker turn, formatted exactly as `Name: spoken text` (a colon and one space).\n"
+    "- The host of this show is {host} — label every host turn with exactly `{host}` (the person's "
+    "name, never the show's name).\n"
+    "- When the host introduces a guest by name (e.g. 'welcome X to the show', 'I'm joined by X'), use "
+    "that real first name (or full name) as the guest's label for the WHOLE transcript. Only fall back "
+    "to `Guest` (or `Guest 2`, `Guest 3`) if a name is never stated. Do not invent names.\n"
+    "- Do NOT include timestamps, ad-reads markers, summaries, headings, markdown, or any commentary. "
+    "Only the transcript lines.\n"
+    "- Transcribe the entire episode from start to finish. Do not stop early or summarize.\n"
+)
+
+
+def _host_person(source_name: str) -> str:
+    """Derive the host's PERSON name from a source/show name so claimant attribution isn't the show.
+    'What Bitcoin Did (Peter McCormack)' -> 'Peter McCormack'; 'Stephan Livera Podcast' -> 'Stephan
+    Livera'; 'The Kevin Rooke Show' -> 'Kevin Rooke'; 'The Anita Posch Show' -> 'Anita Posch'."""
+    m = re.search(r"\(([^)]+)\)", source_name or "")
+    if m:
+        return m.group(1).strip()
+    s = re.sub(r"^The\s+", "", source_name or "").strip()
+    s = re.sub(r"\s+(Podcast|Show)$", "", s, flags=re.I).strip()
+    return s
+
+
+def _sniff_audio_mime(path: Path) -> str:
+    """Determine audio MIME from the file header — the downloaded enclosure has a generic `.src`
+    extension, so the Files API can't infer it and rejects the upload without an explicit mime_type."""
+    with open(path, "rb") as fh:
+        head = fh.read(16)
+    if head[:3] == b"ID3" or (len(head) > 1 and head[0] == 0xFF and (head[1] & 0xE0) == 0xE0):
+        return "audio/mpeg"
+    if head[4:8] == b"ftyp":
+        return "audio/mp4"          # m4a/aac
+    if head[:4] == b"OggS":
+        return "audio/ogg"
+    if head[:4] == b"RIFF":
+        return "audio/wav"
+    if head[:4] == b"fLaC":
+        return "audio/flac"
+    return "audio/mpeg"             # podcast default
+
+
+def _upload_and_wait(client, audio_path: Path, *, poll_s: float = 2.0, timeout_s: float = 300.0):
+    """Upload to the Files API and wait until the file is ACTIVE (audio is processed server-side)."""
+    from google.genai import types
+    mime = _sniff_audio_mime(audio_path)
+    f = client.files.upload(file=str(audio_path), config=types.UploadFileConfig(mime_type=mime))
+    waited = 0.0
+    while getattr(f.state, "name", str(f.state)) == "PROCESSING" and waited < timeout_s:
+        time.sleep(poll_s)
+        waited += poll_s
+        f = client.files.get(name=f.name)
+    state = getattr(f.state, "name", str(f.state))
+    if state != "ACTIVE":
+        raise RuntimeError(f"Gemini file not ACTIVE (state={state}) for {audio_path.name}")
+    return f
+
+
+def transcribe_one(client, model: str, audio_path: Path, host_name: str, *,
+                   max_output_tokens: int = 65536) -> tuple[str, dict]:
+    """Transcribe a single audio file → (transcript_text, usage_dict). Network/CPU only; no DB."""
+    from google.genai import types
+    f = _upload_and_wait(client, audio_path)
+    try:
+        resp = client.models.generate_content(
+            model=model,
+            contents=[f, _PROMPT.format(host=host_name or "the host")],
+            config=types.GenerateContentConfig(temperature=0, max_output_tokens=max_output_tokens),
+        )
+        text = (resp.text or "").strip()
+        um = getattr(resp, "usage_metadata", None)
+        usage = {
+            "prompt_tokens": getattr(um, "prompt_token_count", 0) or 0,
+            "output_tokens": getattr(um, "candidates_token_count", 0) or 0,
+            "finish_reason": str(getattr(resp.candidates[0], "finish_reason", "")) if resp.candidates else "",
+        }
+        return text, usage
+    finally:
+        try:
+            client.files.delete(name=f.name)
+        except Exception as e:  # noqa: BLE001 — best-effort cleanup
+            log.debug("file cleanup failed for %s: %s", f.name, e)
+
+
+def _fetch_and_transcribe(client, model: str, cfg, doc, host_name: str) -> dict:
+    """Worker-thread unit: download enclosure → Gemini transcribe → write transcript file. No DB writes."""
+    cache = Path(cfg.audio_cache_dir)
+    cache.mkdir(parents=True, exist_ok=True)
+    safe = doc["doc_id"].replace(":", "_")
+    src = cache / f"{safe}.src"
+    audio = download_enclosure(doc["url"], src)
+    try:
+        text, usage = transcribe_one(client, model, audio, host_name)
+        if not text or len(text) < 40:
+            raise RuntimeError(f"empty/short transcript ({len(text)} chars)")
+        tpath = Path(cfg.data_dir) / "transcripts" / f"{safe}.txt"
+        tpath.parent.mkdir(parents=True, exist_ok=True)
+        tpath.write_text(text)
+        return {
+            "doc_id": doc["doc_id"], "ok": True, "transcript_path": str(tpath),
+            "n_lines": text.count("\n") + 1, "content_hash": hashlib.sha256(text.encode()).hexdigest(),
+            "usage": usage,
+        }
+    finally:
+        try:
+            if audio.exists():
+                audio.unlink()
+        except Exception:  # noqa: BLE001
+            pass
+
+
+def run_transcribe_gemini(conn, cfg, *, limit: int = 5, concurrency: int = 4,
+                          lease_seconds: int = 7200, worker_id: str = "gemini-transcribe") -> dict:
+    """Lease pending transcribe jobs and transcribe them via Gemini in parallel. DB writes stay on the
+    main thread; only download+API run in the pool. Reports token usage for cost accounting."""
+    from google import genai
+    if not cfg.gemini_api_key:
+        raise RuntimeError("GEMINI_API_KEY not configured")
+    client = genai.Client(api_key=cfg.gemini_api_key)
+    model = cfg.gemini_model or "gemini-2.5-flash"
+
+    # Lease the batch up front (main thread); resolve docs + host names.
+    leased: list[tuple] = []
+    while len(leased) < limit:
+        job = queue.lease_next(conn, worker_id=worker_id, job_types=["transcribe"], lease_seconds=lease_seconds)
+        if job is None:
+            break
+        doc = conn.execute("SELECT * FROM documents WHERE doc_id=?", (job["target_id"],)).fetchone()
+        if doc is None:
+            queue.skip(conn, job["job_id"], "document missing")
+            continue
+        host = conn.execute("SELECT name FROM sources WHERE source_id=?", (doc["source_id"],)).fetchone()
+        leased.append((job, doc, _host_person(host["name"]) if host else ""))
+
+    done = failed = prompt_tok = out_tok = 0
+    with ThreadPoolExecutor(max_workers=concurrency) as pool:
+        futs = {pool.submit(_fetch_and_transcribe, client, model, cfg, doc, host): (job, doc)
+                for (job, doc, host) in leased}
+        for fut in as_completed(futs):
+            job, doc = futs[fut]
+            try:
+                r = fut.result()
+                conn.execute(
+                    "UPDATE documents SET transcript_path=?, content_hash=?, processed_at=datetime('now') "
+                    "WHERE doc_id=?", (r["transcript_path"], r["content_hash"], doc["doc_id"]),
+                )
+                h = hashlib.sha256(f"{doc['doc_id']}|extract-v0".encode()).hexdigest()
+                queue.enqueue(conn, job_type="extract", target_id=doc["doc_id"], input_hash=h,
+                              parent_doc_id=doc["doc_id"], priority=100)
+                queue.complete(conn, job["job_id"], output_ref=f"gemini {r['n_lines']} lines")
+                conn.commit()
+                done += 1
+                prompt_tok += r["usage"]["prompt_tokens"]
+                out_tok += r["usage"]["output_tokens"]
+                fr = r["usage"]["finish_reason"]
+                log.info("gemini transcribed %s (%d lines, %d in/%d out tok%s)", doc["doc_id"],
+                         r["n_lines"], r["usage"]["prompt_tokens"], r["usage"]["output_tokens"],
+                         ", TRUNCATED" if "MAX_TOKENS" in fr else "")
+            except Exception as e:  # noqa: BLE001
+                state = queue.fail(conn, job["job_id"], e)
+                conn.commit()
+                failed += 1
+                log.warning("gemini transcribe failed for %s: %s (→ %s)", doc["doc_id"], e, state)
+    return {"done": done, "failed": failed, "prompt_tokens": prompt_tok, "output_tokens": out_tok}
@@ -0,0 +1,45 @@
+"""Speaker-name identification (§4.5 enhancement).
+
+In a 1-on-1 interview the host introduces the guest by name at the top. Reading the transcript head
+with the LLM, we attach a real NAME to each diarized speaker → voiceprints.person_label. This gives
+the independence graph a SECOND, orthogonal overlap signal: the same NAMED guest across two shows is
+a shared_guest edge even when the voiceprints don't cluster (different mic/codec/room). It complements
+voiceprint cosine matching and is robust to fingerprint drift — exactly the case the operator flagged.
+"""
+from __future__ import annotations
+
+import json
+import logging
+
+log = logging.getLogger(__name__)
+
+# System prompt for the naming call: the model is told each transcript line is "LABEL: text" and must
+# answer with ONE JSON object mapping every LABEL to {name, role, confidence}, using null for a name
+# it cannot support from the text.
+_SYS = (
+    'You identify the speakers in a podcast/interview transcript. Each line is "LABEL: text". '
+    "Using the introduction and context, determine each LABEL's real full name and role. In an "
+    "interview the host normally introduces themselves and the guest within the first minute. Only "
+    "assert a name you can actually support from the text — if you cannot tell, use null. "
+    'Return ONLY JSON: {"speakers": {"<LABEL>": {"name": "Full Name" or null, '
+    '"role": "host"|"guest"|"panelist"|"unknown", "confidence": "low"|"med"|"high"}}}.'
+)
+
+
+def identify_speakers(backend, transcript_head: str, *, source_name: str, host_hint: str | None = None) -> dict:
+    """Ask the LLM to put real names on the diarized speaker labels at the top of a transcript.
+
+    Args:
+        backend: any extract.backends backend, i.e. an object exposing
+            ``complete_json(messages, max_tokens=...)``.
+        transcript_head: the beginning of the transcript, one "LABEL: text" line per turn.
+        source_name: show name, passed to the model as context.
+        host_hint: optional name of the show's usual host; added to the context when truthy.
+
+    Returns:
+        ``{label: {name, role, confidence}}`` exactly as the model produced it, or ``{}`` when the
+        reply is missing, is not parseable JSON, or carries no "speakers" object. A malformed reply
+        never raises; errors from the backend call itself still propagate.
+    """
+    ctx = f"Show: {source_name}."
+    if host_hint:
+        ctx += f" The show's usual host is {host_hint}."
+    ctx += "\n\nTRANSCRIPT (beginning):\n" + transcript_head
+    messages = [{"role": "system", "content": _SYS}, {"role": "user", "content": ctx}]
+    raw = backend.complete_json(messages, max_tokens=600)
+
+    if isinstance(raw, dict):
+        obj = raw  # the backend already decoded the reply for us
+    elif isinstance(raw, str):
+        try:
+            obj = json.loads(raw)
+        except (ValueError, RecursionError):
+            # The reply may be wrapped in prose or a code fence: retry on the outermost {...} span.
+            i, j = raw.find("{"), raw.rfind("}")
+            if i < 0 or j < i:
+                return {}
+            try:
+                obj = json.loads(raw[i:j + 1])
+            except (ValueError, RecursionError):
+                return {}
+    else:
+        # None or another unexpected type: nothing to parse (previously an AttributeError).
+        return {}
+
+    spk = obj.get("speakers", {}) if isinstance(obj, dict) else {}
+    return spk if isinstance(spk, dict) else {}
@@ -0,0 +1,111 @@
+"""Podcast ingestion → documents + 'transcribe' jobs (§4.1).
+
+RSS path: parse the feed, take episodes in [since, until], register documents pointing at the audio
+enclosure. YouTube path: enumerate a channel's videos in the date window via yt-dlp (the back-catalog
+route for the ~9 shows whose RSS is a truncated rolling window — see seeds/podcast_feeds.resolved.yaml).
+The transcribe worker downloads + processes either kind identically.
+"""
+from __future__ import annotations
+
+import hashlib
+import json
+import logging
+import sqlite3
+import subprocess
+
+from ..backfill import queue
+from ..util import audio_dedup_key
+from .feeds import episode_records, fetch_feed
+
+log = logging.getLogger(__name__)
+
+
+def _enqueue_doc(conn, *, source_id, kind, external_id, url, title, date) -> tuple[int, int]:
+    """Register one episode as a document and, unless its content is already processed, queue it.
+
+    Returns (docs_added, jobs_added):
+      (0, 0)  the (source_id, external_id) row already existed — nothing inserted;
+      (1, 0)  inserted, but an already-processed document shares the dedup key, so the new row is
+              marked processed with raw_path "dup_of:<doc_id>" and no job is queued (also returned
+              when queue.enqueue yields None);
+      (1, 1)  inserted and a 'transcribe' job was queued.
+    """
+    short_hash = hashlib.sha1(external_id.encode()).hexdigest()[:12]
+    doc_id = f"pod:{source_id}:{short_hash}"
+    dkey = audio_dedup_key(title, date)
+    # Cross-mirror dedup (pre-GPU): the same episode may arrive through another feed or a YouTube
+    # mirror. The sighting is still recorded for provenance, but it is not transcribed again.
+    # (Same-feed re-ingest is handled by INSERT OR IGNORE below.)
+    twin = conn.execute(
+        "SELECT doc_id FROM documents WHERE dedup_key=? AND processed_at IS NOT NULL LIMIT 1", (dkey,)
+    ).fetchone()
+    inserted = conn.execute(
+        """INSERT OR IGNORE INTO documents (doc_id, source_id, kind, external_id, url, title, date, dedup_key)
+           VALUES (?,?,?,?,?,?,?,?)""",
+        (doc_id, source_id, kind, external_id, url, title, date, dkey),
+    ).rowcount
+    conn.commit()
+    if not inserted:
+        return (0, 0)
+
+    if not twin:
+        job_hash = hashlib.sha256(f"{doc_id}|audio-v0".encode()).hexdigest()
+        job = queue.enqueue(conn, job_type="transcribe", target_id=doc_id, input_hash=job_hash,
+                            parent_doc_id=doc_id, priority=100)
+        return (1, 0 if job is None else 1)
+
+    # Duplicate content: close the row out immediately and point it at the original.
+    conn.execute(
+        "UPDATE documents SET processed_at=datetime('now'), raw_path=? WHERE doc_id=?",
+        (f"dup_of:{twin['doc_id']}", doc_id),
+    )
+    conn.commit()
+    log.info("skip transcribe for %s — duplicate content of %s", doc_id, twin["doc_id"])
+    return (1, 0)
+
+
+def ingest_rss(conn: sqlite3.Connection, source: sqlite3.Row, *, since=None, until=None, limit=20):
+    """Register up to `limit` feed episodes published within [since, until].
+
+    Episodes with no publication date are never filtered out by the window. Raises ValueError when
+    the source has no rss_url. Returns (n_docs, n_jobs), the summed results of _enqueue_doc.
+    """
+    feed_url = source["rss_url"]
+    if not feed_url:
+        raise ValueError(f"{source['source_id']} has no rss_url")
+
+    n_docs = n_jobs = taken = 0
+    for rec in episode_records(fetch_feed(feed_url)):
+        published = rec["published"]
+        if published and ((since and published < since) or (until and published > until)):
+            continue  # dated, and outside the requested window
+        if taken >= limit:
+            break
+        taken += 1
+        added_docs, added_jobs = _enqueue_doc(
+            conn, source_id=source["source_id"], kind="podcast",
+            external_id=rec["guid"], url=rec["audio_url"], title=rec["title"], date=published)
+        n_docs += added_docs
+        n_jobs += added_jobs
+    return n_docs, n_jobs
+
+
+def ingest_youtube(conn: sqlite3.Connection, source: sqlite3.Row, *, since=None, until=None,
+                   limit=20, max_scan=800):
+    """Enumerate channel videos in the date window via yt-dlp (NON-flat, so upload_date is populated —
+    flat mode returns NA). Videos come newest-first, so we use --dateafter/--datebefore to select the
+    window and --break-match-filters to STOP scanning once we drop below `since` (avoids walking the
+    entire channel history). The transcribe worker downloads audio on demand.
+
+    Args:
+        conn: database connection handed to _enqueue_doc.
+        source: row providing "channel_url" and "source_id".
+        since, until: optional "YYYY-MM-DD" bounds (dashes are stripped for yt-dlp).
+        limit: maximum number of videos registered by this call.
+        max_scan: passed to yt-dlp as --playlist-end.
+
+    Returns:
+        (n_docs, n_jobs), the summed results of _enqueue_doc.
+
+    Raises:
+        ValueError: the source has no channel_url.
+        subprocess.TimeoutExpired: yt-dlp ran longer than 900 s.
+    """
+    if not source["channel_url"]:
+        raise ValueError(f"{source['source_id']} has no channel_url")
+    url = source["channel_url"].rstrip("/")
+    if "/playlist" not in url and not url.endswith("/videos"):
+        url = url + "/videos"
+    cmd = ["yt-dlp", "--no-warnings", "--ignore-errors", "--skip-download",
+           "--print", "%(id)s\t%(upload_date)s\t%(title)s", "--playlist-end", str(max_scan)]
+    if since:
+        s = since.replace("-", "")
+        cmd += ["--dateafter", s, "--break-match-filters", f"upload_date>={s}"]
+    if until:
+        cmd += ["--datebefore", until.replace("-", "")]
+    cmd.append(url)
+    out = subprocess.run(cmd, capture_output=True, text=True, timeout=900)
+    # The exit status is deliberately not fatal: --ignore-errors tolerates per-video failures and
+    # yt-dlp reports the --break-match-filters stop as status 101. Anything else is logged, because
+    # an empty result from a failed run would otherwise be indistinguishable from "no new videos".
+    if out.returncode not in (0, 101):
+        log.warning("yt-dlp exited %d for %s: %s", out.returncode, url, (out.stderr or "").strip()[-300:])
+
+    n_docs = n_jobs = count = 0
+    for line in out.stdout.splitlines():
+        if count >= limit:
+            break
+        # maxsplit=2: the title is the last field and may itself contain tab characters.
+        parts = line.split("\t", 2)
+        if len(parts) < 2 or not parts[0] or parts[1] in ("NA", ""):
+            continue
+        vid, upd = parts[0], parts[1]
+        title = parts[2] if len(parts) > 2 else vid
+        date = f"{upd[:4]}-{upd[4:6]}-{upd[6:8]}" if len(upd) == 8 else None
+        count += 1
+        nd, nj = _enqueue_doc(conn, source_id=source["source_id"], kind="youtube",
+                              external_id=vid, url=f"https://www.youtube.com/watch?v={vid}",
+                              title=title, date=date)
+        n_docs += nd
+        n_jobs += nj
+    return n_docs, n_jobs
@@ -0,0 +1,60 @@
+"""Cross-chunk speaker stitching + the voiceprint library (§4.1, §4.5).
+
+diarize-chunk returns a 192-d TitaNet voiceprint per speaker per chunk. Because each chunk is
+diarized independently, "Speaker 1" in chunk 3 is not the same label as "Speaker 1" in chunk 7 —
+we re-cluster by cosine similarity (~0.7 distance threshold) so one person gets one identity across
+the whole episode. The SAME library then matches a guest ACROSS shows by voice (the independence
+graph's hardest edge, §4.5).
+"""
+from __future__ import annotations
+
+import numpy as np
+
+# Default cut-off for stitch_chunks() and match_library(): two voiceprints count as the same speaker
+# when their cosine distance is strictly below this value.
+DISTANCE_THRESHOLD = 0.7  # cosine DISTANCE (1 - cosine similarity); §4.1
+
+
+def _unit(v: np.ndarray) -> np.ndarray:
+    """Scale `v` to unit L2 norm; a zero-norm vector is handed back unchanged."""
+    length = np.linalg.norm(v)
+    if not length:
+        return v
+    return v / length
+
+
+def cosine_distance(a: np.ndarray, b: np.ndarray) -> float:
+    """Return 1 - cosine similarity of `a` and `b`, both coerced to float arrays first."""
+    unit_a = _unit(np.asarray(a, dtype=float))
+    unit_b = _unit(np.asarray(b, dtype=float))
+    similarity = np.dot(unit_a, unit_b)
+    return float(1.0 - similarity)
+
+
+def stitch_chunks(chunk_voiceprints: list[np.ndarray], *, threshold: float = DISTANCE_THRESHOLD) -> list[int]:
+    """Assign every per-(chunk, speaker) voiceprint to a stable speaker id, greedily and in order.
+
+    Input: a flat list of voiceprint vectors, one per chunk-speaker, in encounter order.
+    Output: a list of cluster ids parallel to the input. Each vector is compared with the running
+    centroid of every existing cluster; it joins the nearest one when that cosine distance is
+    strictly below `threshold` (the centroid is then updated as a running mean), otherwise it
+    opens a new cluster.
+    """
+    centroids: list[np.ndarray] = []
+    sizes: list[int] = []
+    assigned: list[int] = []
+    for raw in chunk_voiceprints:
+        vec = np.asarray(raw, dtype=float)
+        target = -1  # -1 = no existing cluster is close enough
+        if centroids:
+            dists = [cosine_distance(vec, centre) for centre in centroids]
+            nearest = int(np.argmin(dists))
+            if dists[nearest] < threshold:
+                target = nearest
+        if target < 0:
+            centroids.append(vec.copy())
+            sizes.append(1)
+            target = len(centroids) - 1
+        else:
+            size = sizes[target]
+            centroids[target] = (centroids[target] * size + vec) / (size + 1)
+            sizes[target] = size + 1
+        assigned.append(target)
+    return assigned
+
+
+def match_library(vp: np.ndarray, library: list[tuple[str, np.ndarray]], *,
+                  threshold: float = DISTANCE_THRESHOLD) -> str | None:
+    """Find the library voiceprint closest to `vp`.
+
+    Returns the voiceprint_id of the entry with the smallest cosine distance, provided that distance
+    is strictly below `threshold`; otherwise None (a new speaker → caller mints a new library id).
+    On equal distances the earlier library entry wins.
+    """
+    winner = None
+    cutoff = threshold
+    for entry_id, entry_vec in library:
+        dist = cosine_distance(vp, entry_vec)
+        if not dist < cutoff:
+            continue
+        winner = entry_id
+        cutoff = dist  # only a strictly closer entry can displace the current winner
+    return winner
@@ -0,0 +1,308 @@
+"""Audio → speaker-attributed transcript + voiceprint library (§4.1, §4.5).
+
+Per chunk (sequential — audio lock): diarize-chunk (192-d TitaNet fingerprints + timed speaker
+segments) + transcribe (word timestamps). Align words to speakers by time, stitch speakers ACROSS
+chunks by fingerprint cosine, then match the persisted voiceprint library so the SAME guest is
+recognized ACROSS shows by voice — the highest-leverage input to the source-independence graph.
+"""
+from __future__ import annotations
+
+import logging
+import time
+import uuid
+from pathlib import Path
+
+import numpy as np
+
+from ..backfill import queue
+from .chunker import chunk_audio
+from .download import download_enclosure, download_youtube_audio, to_wav_16k_mono
+from .speaker_stitch import DISTANCE_THRESHOLD, match_library, stitch_chunks
+
+log = logging.getLogger(__name__)
+
+
+# ---------- alignment ----------
+def _speaker_at(segments: list[dict], t: float) -> str:
+    """Return the speaker active at time `t`.
+
+    The first segment whose [start_s, end_s] range (inclusive) contains `t` wins. When none does,
+    the segment with the nearest boundary is used; with no segments at all the answer is "Speaker_0".
+    """
+    covering = next((seg for seg in segments if seg["start_s"] <= t <= seg["end_s"]), None)
+    if covering is not None:
+        return covering["speaker"]
+    if not segments:
+        return "Speaker_0"
+
+    def _gap(seg: dict) -> float:
+        # distance from t to the closer edge of the segment
+        return min(abs(seg["start_s"] - t), abs(seg["end_s"] - t))
+
+    return min(segments, key=_gap)["speaker"]
+
+
+def align_words(words: list[dict], segments: list[dict]) -> list[dict]:
+    """Fold word-level transcription into speaker turns.
+
+    Each word is attributed to the speaker found at its temporal midpoint. Consecutive words by the
+    same speaker are joined with single spaces into one turn {"speaker", "start", "end", "text"}.
+    """
+    turns: list[dict] = []
+    for word in words:
+        midpoint = (word["start"] + word["end"]) / 2
+        speaker = _speaker_at(segments, midpoint)
+        open_turn = turns[-1] if turns else None
+        if open_turn is not None and open_turn["speaker"] == speaker:
+            # same voice keeps talking: extend the turn in place
+            open_turn["text"] += " " + word["text"]
+            open_turn["end"] = word["end"]
+        else:
+            turns.append({"speaker": speaker, "start": word["start"], "end": word["end"],
+                          "text": word["text"]})
+    return turns
+
+
+# ---------- per-document audio processing ----------
+def diarize_transcribe_chunks(sc, chunks: list[Path], *, concurrency: int = 2):
+    """Returns (chunk_turns, chunk_speakers): turns per chunk + (chunk_idx, local_spk, fingerprint).
+
+    Drives up to `concurrency` chunks in flight — the client's global audio SEMAPHORE is the hard cap
+    across both parakeet endpoints (sit at 2: keeps the single serial GPU continuously fed = full
+    throughput, no idle gap). Results are reassembled in chunk order.
+
+    Failure policy: a single chunk's failure is non-fatal (it is logged and skipped; the client
+    already busy-retries transient blips). The whole job raises RuntimeError, so that it is retried
+    later instead of emitting an empty or half-empty transcript, when either
+      * every chunk failed, or
+      * at least max(3, len(chunks) // 2) chunks failed.
+
+    Args:
+        sc: client exposing diarize_chunk(path) and transcribe(path).
+        chunks: chunk files, in episode order.
+        concurrency: worker-thread count (values below 1 are treated as 1).
+
+    Returns:
+        chunk_turns: [(chunk_idx, turns)] for the chunks that succeeded, ordered by index.
+        chunk_speakers: flat [(chunk_idx, local_speaker, float32 fingerprint)] in the same order.
+    """
+    from concurrent.futures import ThreadPoolExecutor, as_completed
+
+    def _one(idx: int, ch: Path):
+        dia = sc.diarize_chunk(str(ch))
+        tr = sc.transcribe(str(ch))
+        turns = align_words(tr.get("words", []), dia.get("segments", []))
+        spks = [(idx, spk, np.asarray(vec, dtype=np.float32))
+                for spk, vec in (dia.get("fingerprints") or {}).items()]
+        return idx, turns, spks
+
+    results: dict[int, tuple] = {}
+    failed = 0
+    with ThreadPoolExecutor(max_workers=max(1, concurrency)) as pool:
+        futs = {pool.submit(_one, i, ch): i for i, ch in enumerate(chunks)}
+        for fut in as_completed(futs):
+            try:
+                idx, turns, spks = fut.result()
+                results[idx] = (turns, spks)
+            except Exception as e:  # noqa: BLE001 — one contended chunk shouldn't kill the episode
+                failed += 1
+                log.warning("chunk %d/%d failed (%s) — skipping", futs[fut], len(chunks), str(e)[:90])
+
+    total = len(chunks)
+    # The floor of 3 tolerates a couple of stray failures on long episodes, but on its own it can
+    # never trip for 1-2 chunk inputs — so "nothing succeeded" is checked explicitly.
+    if total and (failed == total or failed >= max(3, total // 2)):
+        raise RuntimeError(f"{failed}/{len(chunks)} chunks failed — backend contended; will retry later")
+    chunk_turns = [(idx, results[idx][0]) for idx in sorted(results)]
+    chunk_speakers = [s for idx in sorted(results) for s in results[idx][1]]
+    return chunk_turns, chunk_speakers
+
+
+def stitch_and_centroids(chunk_speakers, *, threshold: float = DISTANCE_THRESHOLD):
+    """Merge per-chunk speakers into episode-wide speakers.
+
+    Takes the (chunk_idx, local_speaker, fingerprint) triples, clusters the fingerprints with
+    stitch_chunks, and returns (keymap, centroids): keymap maps (chunk_idx, local_speaker) to a
+    cluster id, centroids maps each cluster id to the mean of its members' fingerprints.
+    Empty input gives ({}, {}).
+    """
+    if not chunk_speakers:
+        return {}, {}
+    cluster_ids = stitch_chunks([fp for _, _, fp in chunk_speakers], threshold=threshold)
+    keymap: dict[tuple[int, str], int] = {}
+    members: dict[int, list[np.ndarray]] = {}
+    for (chunk_idx, local_spk, fp), cluster in zip(chunk_speakers, cluster_ids):
+        keymap[(chunk_idx, local_spk)] = cluster
+        if cluster not in members:
+            members[cluster] = []
+        members[cluster].append(fp)
+    centroids = {cluster: np.mean(fps, axis=0) for cluster, fps in members.items()}
+    return keymap, centroids
+
+
+def _load_library(conn) -> list[tuple[str, np.ndarray]]:
+    """Read every stored voiceprint as (voiceprint_id, float32 vector decoded from the blob)."""
+    cursor = conn.execute("SELECT voiceprint_id, vector, person_label FROM voiceprints")
+    library = []
+    for row in cursor.fetchall():
+        vector = np.frombuffer(row["vector"], dtype=np.float32)
+        library.append((row["voiceprint_id"], vector))
+    return library
+
+
+def _label_for(conn, vpid: str) -> str:
+    """Display label for a voiceprint: its person_label when one is set, else "SPK:" followed by
+    the first 8 characters of the id."""
+    row = conn.execute("SELECT person_label FROM voiceprints WHERE voiceprint_id=?", (vpid,)).fetchone()
+    if row and row["person_label"]:
+        return row["person_label"]
+    return f"SPK:{vpid[:8]}"
+
+
+def resolve_voiceprints(conn, doc, centroids: dict[int, np.ndarray], *, threshold: float = DISTANCE_THRESHOLD):
+    """Give every within-episode speaker a persistent voiceprint id.
+
+    Each cluster centroid is matched against the stored library; when nothing is within `threshold`
+    a new "vp_<hex>" entry is inserted (and appended to the in-memory library, so later clusters of
+    this episode can match it). One observation row is written per cluster. Afterwards, for every
+    voiceprint that is also observed in a document of a DIFFERENT source, a 'shared_guest' edge
+    between the two sources is inserted or has its weight bumped by 1.0. Returns {cluster: vpid}.
+    """
+    library = _load_library(conn)
+    assigned: dict[int, str] = {}
+    for cluster, centroid in centroids.items():
+        vpid = match_library(centroid, library, threshold=threshold)
+        if vpid is None:
+            # unknown voice: mint a library entry on the spot
+            vpid = "vp_" + uuid.uuid4().hex[:16]
+            conn.execute(
+                "INSERT INTO voiceprints (voiceprint_id, vector, first_doc_id) VALUES (?,?,?)",
+                (vpid, centroid.astype(np.float32).tobytes(), doc["doc_id"]),
+            )
+            library.append((vpid, centroid))
+        conn.execute(
+            "INSERT INTO voiceprint_observations (voiceprint_id, doc_id, chunk_idx) VALUES (?,?,?)",
+            (vpid, doc["doc_id"], None),
+        )
+        assigned[cluster] = vpid
+    conn.commit()
+
+    # independence graph (§4.5): the same voice heard under another source means a shared guest.
+    for vpid in set(assigned.values()):
+        elsewhere = conn.execute(
+            """SELECT DISTINCT d.source_id FROM voiceprint_observations o
+                 JOIN documents d ON d.doc_id = o.doc_id
+                WHERE o.voiceprint_id=? AND d.source_id != ?""",
+            (vpid, doc["source_id"]),
+        ).fetchall()
+        for hit in elsewhere:
+            lo, hi = sorted([doc["source_id"], hit["source_id"]])  # canonical pair order
+            conn.execute(
+                """INSERT INTO source_edges (src_a, src_b, edge_type, weight, evidence)
+                   VALUES (?,?,'shared_guest',1.0,?)
+                   ON CONFLICT(src_a, src_b, edge_type)
+                   DO UPDATE SET weight = weight + 1.0, evidence = excluded.evidence""",
+                (lo, hi, vpid),
+            )
+    conn.commit()
+    return assigned
+
+
+def _labeled(chunk_turns, keymap, label_by_cluster: dict) -> str:
+    """Render the turns as newline-joined "LABEL: text" lines.
+
+    A turn's chunk-local speaker is mapped to its cluster via `keymap`, then to a display label via
+    `label_by_cluster`; when either lookup misses, the chunk-local speaker name is used as-is.
+    """
+    def _lines():
+        for chunk_idx, turns in chunk_turns:
+            for turn in turns:
+                local = turn["speaker"]
+                shown = label_by_cluster.get(keymap.get((chunk_idx, local)), local)
+                yield f"{shown}: {turn['text']}"
+
+    return "\n".join(_lines())
+
+
+def build_transcript(conn, chunk_turns, keymap, cluster_to_vpid) -> str:
+    """Render the final transcript, labelling each cluster with its voiceprint's display label."""
+    display: dict = {}
+    for cluster, vpid in cluster_to_vpid.items():
+        display[cluster] = _label_for(conn, vpid)
+    return _labeled(chunk_turns, keymap, display)
+
+
+def apply_names(conn, cluster_to_vpid: dict, idmap: dict) -> dict:
+    """Attach confident names to the voiceprint library (person_label). Returns {cluster: name}.
+
+    Args:
+        conn: database connection; matching voiceprints rows are updated and the change committed.
+        cluster_to_vpid: {cluster id: voiceprint_id} for this episode.
+        idmap: speaker info keyed by "Speaker <cluster + 1>" (or just "<cluster + 1>"), each value
+            expected to be a dict with "name" and "confidence".
+
+    Only entries with a non-blank string name and confidence "med" or "high" are applied. Entries of
+    any other shape are skipped, so one malformed entry cannot stop the remaining speakers from
+    being named.
+    """
+    named: dict[int, str] = {}
+    for lab, vpid in cluster_to_vpid.items():
+        info = idmap.get(f"Speaker {lab + 1}") or idmap.get(str(lab + 1)) or {}
+        if not isinstance(info, dict):
+            continue
+        raw_name = info.get("name")
+        # idmap is model output: "name" may be null or a non-string value (number, list, ...).
+        name = raw_name.strip() if isinstance(raw_name, str) else ""
+        if name and info.get("confidence") in ("med", "high"):
+            conn.execute("UPDATE voiceprints SET person_label=? WHERE voiceprint_id=?", (name, vpid))
+            named[lab] = name
+    conn.commit()
+    return named
+
+
+def add_name_edges(conn, doc, cluster_to_vpid: dict) -> int:
+    """Add name-based shared_guest edges (drift-robust complement to voiceprint matching, §4.5).
+
+    For each distinct voiceprint of this document that carries a person_label, every OTHER source in
+    which a voiceprint with the same label has been observed gets a 'shared_guest' edge to this
+    document's source (inserted, or weight += 1.0 when the edge exists), with evidence
+    "name:<label>". Commits, and returns the number of edge upserts performed.
+    """
+    upserts = 0
+    for vpid in set(cluster_to_vpid.values()):
+        row = conn.execute("SELECT person_label FROM voiceprints WHERE voiceprint_id=?", (vpid,)).fetchone()
+        if not row or not row["person_label"]:
+            continue  # unknown voiceprint, or nobody has named it yet
+        name = row["person_label"]
+        elsewhere = conn.execute(
+            """SELECT DISTINCT d.source_id FROM voiceprints v
+                 JOIN voiceprint_observations o ON o.voiceprint_id = v.voiceprint_id
+                 JOIN documents d ON d.doc_id = o.doc_id
+                WHERE v.person_label = ? AND d.source_id != ?""",
+            (name, doc["source_id"]),
+        ).fetchall()
+        for hit in elsewhere:
+            lo, hi = sorted([doc["source_id"], hit["source_id"]])  # canonical pair order
+            conn.execute(
+                """INSERT INTO source_edges (src_a, src_b, edge_type, weight, evidence)
+                   VALUES (?,?,'shared_guest',1.0,?)
+                   ON CONFLICT(src_a, src_b, edge_type)
+                   DO UPDATE SET weight = weight + 1.0, evidence = excluded.evidence""",
+                (lo, hi, f"name:{name}"),
+            )
+        upserts += len(elsewhere)
+    conn.commit()
+    return upserts
+
+
+def _download_audio(doc, cfg) -> Path:
+    """Fetch the document's audio into cfg.audio_cache_dir and return the local path.
+
+    A previously converted "<doc_id with ':' → '_'>.wav" in the cache is reused. Otherwise YouTube
+    documents (kind "youtube", or a youtube.com / youtu.be URL) go through download_youtube_audio,
+    and anything else is downloaded as "<stem>.src" and converted by to_wav_16k_mono.
+    """
+    cache = Path(cfg.audio_cache_dir)
+    cache.mkdir(parents=True, exist_ok=True)
+    wav = cache / f"{doc['doc_id'].replace(':', '_')}.wav"
+    if wav.exists():
+        return wav
+
+    url = doc["url"]
+    is_youtube = doc["kind"] == "youtube" or (url and ("youtube.com" in url or "youtu.be" in url))
+    if is_youtube:
+        return download_youtube_audio(url, cache, archive_file=cache / "yt-archive.txt")
+
+    enclosure = download_enclosure(url, cache / f"{doc['doc_id'].replace(':', '_')}.src")
+    return to_wav_16k_mono(enclosure, wav)
+
+
+def process_document(conn, sc, cfg, doc, *, max_chunks: int, chunk_seconds: int = 150,
+                     keep_audio: bool = False) -> int:
+    """Turn one document's audio into a speaker-labelled transcript and queue its extraction.
+
+    Steps: download → chunk (at most `max_chunks` chunks are processed) → diarize + transcribe →
+    stitch speakers across chunks → resolve against the voiceprint library → best-effort speaker
+    naming → write the transcript file → update the documents row → enqueue an 'extract' job →
+    delete the audio unless `keep_audio`.
+
+    Args:
+        conn: database connection.
+        sc: audio client passed to diarize_transcribe_chunks (and to the naming backend factory).
+        cfg: config providing audio_cache_dir, data_dir and, optionally, audio_concurrency.
+        doc: documents row (doc_id, source_id, url, kind are read).
+        max_chunks: cap on the number of chunks processed.
+        chunk_seconds: nominal chunk length handed to the chunker.
+        keep_audio: keep the downloaded audio and chunk files instead of deleting them.
+
+    Returns:
+        The number of chunks that produced turns.
+
+    Raises:
+        Whatever download, chunking or diarize_transcribe_chunks raise; a failure of the naming
+        step is only logged.
+    """
+    import hashlib
+
+    audio = _download_audio(doc, cfg)
+    chunkdir = Path(cfg.audio_cache_dir) / f"chunks_{doc['doc_id'].replace(':', '_')}"
+    chunks = chunk_audio(audio, chunkdir, chunk_seconds=chunk_seconds)[:max_chunks]
+    chunk_turns, chunk_speakers = diarize_transcribe_chunks(
+        sc, chunks, concurrency=getattr(cfg, "audio_concurrency", 2))
+    keymap, centroids = stitch_and_centroids(chunk_speakers)
+    cluster_to_vpid = resolve_voiceprints(conn, doc, centroids)
+
+    # Name the speakers (§4.5): host introduces guest in 1-on-1 → attach person_label, then a
+    # name-based shared_guest edge that survives voiceprint drift across shows.
+    src = conn.execute("SELECT name FROM sources WHERE source_id=?", (doc["source_id"],)).fetchone()
+    try:
+        from ..extract.backends import from_config as backend_from_config
+        from .identify import identify_speakers
+        backend = backend_from_config(cfg, sc)
+        # Draft labels are "Speaker <cluster + 1>" — the keys apply_names looks up in the result.
+        draft = _labeled(chunk_turns, keymap, {lab: f"Speaker {lab + 1}" for lab in cluster_to_vpid})
+        idmap = identify_speakers(backend, draft[:6000], source_name=src["name"] if src else "")
+        named = apply_names(conn, cluster_to_vpid, idmap)
+        if named:
+            log.info("named speakers in %s: %s", doc["doc_id"], ", ".join(named.values()))
+    except Exception as e:  # noqa: BLE001 — naming is best-effort enrichment
+        log.warning("speaker identification failed for %s: %s", doc["doc_id"], e)
+    add_name_edges(conn, doc, cluster_to_vpid)
+
+    transcript = build_transcript(conn, chunk_turns, keymap, cluster_to_vpid)
+    tpath = Path(cfg.data_dir) / "transcripts" / f"{doc['doc_id'].replace(':', '_')}.txt"
+    tpath.parent.mkdir(parents=True, exist_ok=True)
+    # Explicit UTF-8: the default is the locale encoding, which can fail on non-ASCII names and
+    # would make the stored bytes disagree with content_hash (hashed over UTF-8 below).
+    tpath.write_text(transcript, encoding="utf-8")
+    content_hash = hashlib.sha256(transcript.encode()).hexdigest()
+    # duration_sec is recorded as chunk count × nominal chunk length, not a measured duration.
+    conn.execute(
+        "UPDATE documents SET transcript_path=?, duration_sec=?, content_hash=?, processed_at=datetime('now') WHERE doc_id=?",
+        (str(tpath), len(chunks) * chunk_seconds, content_hash, doc["doc_id"]),
+    )
+    conn.commit()
+    h = hashlib.sha256(f"{doc['doc_id']}|extract-v0".encode()).hexdigest()
+    queue.enqueue(conn, job_type="extract", target_id=doc["doc_id"], input_hash=h,
+                  parent_doc_id=doc["doc_id"], priority=100)
+    if not keep_audio:
+        _cleanup_audio(audio, chunkdir)
+    return len(chunk_turns)
+
+
+def _cleanup_audio(audio: Path, chunkdir: Path) -> None:
+    """Reclaim disk once an episode is transcribed: audio is large and disposable (the transcript +
+    voiceprints are what we keep), and a backfill of hundreds of 1-3 hr episodes would otherwise
+    hold tens of GB. Removes the audio file, its ".src" sibling and the chunk directory, each only
+    if present. Never raises — a failure is logged as a warning."""
+    import shutil
+
+    def _drop(path: Path) -> None:
+        if path.exists():
+            path.unlink()
+
+    try:
+        _drop(audio)
+        _drop(audio.with_suffix(".src"))
+        if chunkdir.exists():
+            shutil.rmtree(chunkdir, ignore_errors=True)
+    except Exception as e:  # noqa: BLE001
+        log.warning("audio cleanup failed for %s: %s", audio, e)
+
+
+def run_transcribe(conn, sc, cfg, *, limit: int = 5, max_chunks: int = 999,
+                   lease_seconds: int = 3600, worker_id: str = "transcribe-1") -> dict:
+    """Lease and process up to `limit` 'transcribe' jobs, one at a time.
+
+    Each leased job is resolved to its documents row and run through process_document. The job is
+    then marked complete, failed (any exception — the loop carries on with the next job), or
+    skipped when the document row no longer exists. Stops early when the queue has nothing to
+    lease.
+
+    Returns:
+        {"jobs_processed": n}, where n counts every leased job, including skipped and failed ones.
+    """
+    processed = 0
+    while processed < limit:
+        job = queue.lease_next(conn, worker_id=worker_id, job_types=["transcribe"], lease_seconds=lease_seconds)
+        if job is None:
+            break
+        processed += 1
+        doc = conn.execute("SELECT * FROM documents WHERE doc_id=?", (job["target_id"],)).fetchone()
+        # Every job-state transition below is committed explicitly. Whether the queue helpers commit
+        # on their own is not visible from here; if they do, the extra commit is a no-op, and if
+        # they don't, the transition would otherwise be lost when the connection closes.
+        if doc is None:
+            queue.skip(conn, job["job_id"], "document missing")
+            conn.commit()
+            continue
+        try:
+            n = process_document(conn, sc, cfg, doc, max_chunks=max_chunks)
+            queue.complete(conn, job["job_id"], output_ref=f"{n} chunks")
+            conn.commit()
+            log.info("transcribed %s (%d chunks)", doc["doc_id"], n)
+        except Exception as e:  # noqa: BLE001
+            state = queue.fail(conn, job["job_id"], e)
+            conn.commit()
+            log.warning("transcribe failed for %s: %s (→ %s)", job["target_id"], e, state)
+    return {"jobs_processed": processed}