Initial commit: Ten31 Signal Engine (ingest, scoring brain, corpus seeds)
This commit is contained in:
@@ -0,0 +1,5 @@
|
||||
"""Ingestion layer (§4.1) — the biggest greenfield piece.
|
||||
|
||||
Spark Control transcribes audio you hand it; it does NOT fetch. Everything here is fetch/schedule:
|
||||
RSS + YouTube + EDGAR + FMP earnings, long-audio chunking, and cross-chunk speaker stitching.
|
||||
"""
|
||||
@@ -0,0 +1,36 @@
|
||||
"""Long-audio chunking (§4.1, §13.4).
|
||||
|
||||
Podcasts run 1–3 h; the diarizer caps at 4 speakers/chunk and Spark 2 is a single GPU, so we cut
|
||||
long audio into ~2–3 min pieces sent SEQUENTIALLY (parallel audio → 503 FFT race). Each chunk is
|
||||
diarized independently and re-stitched across chunks by voiceprint (see speaker_stitch.py).
|
||||
Requires ffmpeg/ffprobe.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
|
||||
CHUNK_SECONDS_DEFAULT = 150 # 2.5 min, within the ~2–3 min guidance
|
||||
|
||||
|
||||
def duration_seconds(src: str | Path) -> float:
|
||||
out = subprocess.run(
|
||||
["ffprobe", "-v", "error", "-show_entries", "format=duration",
|
||||
"-of", "default=noprint_wrappers=1:nokey=1", str(src)],
|
||||
check=True, capture_output=True, text=True,
|
||||
)
|
||||
return float(out.stdout.strip())
|
||||
|
||||
|
||||
def chunk_audio(src: str | Path, out_dir: str | Path, *, chunk_seconds: int = CHUNK_SECONDS_DEFAULT) -> list[Path]:
|
||||
"""Split into fixed-length WAV chunks using ffmpeg's segment muxer (no re-encode of timing).
|
||||
Returns chunk paths in order. Order matters: the queue sends them sequentially."""
|
||||
out_dir = Path(out_dir)
|
||||
out_dir.mkdir(parents=True, exist_ok=True)
|
||||
pattern = str(out_dir / "chunk_%04d.wav")
|
||||
subprocess.run(
|
||||
["ffmpeg", "-y", "-i", str(src), "-f", "segment", "-segment_time", str(chunk_seconds),
|
||||
"-ar", "16000", "-ac", "1", "-reset_timestamps", "1", pattern],
|
||||
check=True, capture_output=True,
|
||||
)
|
||||
return sorted(out_dir.glob("chunk_*.wav"))
|
||||
@@ -0,0 +1,159 @@
|
||||
"""Text-document fetcher for the Battery (bitcoin-collateralized lending) corpus and any non-filing,
|
||||
non-audio source: policy primaries (SEC SABs, OCC/FDIC/Fed), lender/issuer blogs, credit-market data.
|
||||
|
||||
Unlike EDGAR (CIK-driven) and the podcast path (audio→transcribe), these are dated HTML pages, PDFs, or
|
||||
article RSS feeds. We fetch ONCE, extract clean text (HTML via html_to_text, PDF via pypdf), save it, and
|
||||
point documents.transcript_path at the saved text so the extract worker reads it directly (it already
|
||||
supports transcript_path) — this also lets PDFs work, which the worker's on-demand html_to_text fetch can't.
|
||||
|
||||
A source row must exist first (FK). Lineage/axis live on the source's cluster/notes (set in the seed);
|
||||
policy sources are axis=context and must NOT feed the supply resolver (weight 0) — enforced downstream.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import hashlib
|
||||
import io
|
||||
import logging
|
||||
import sqlite3
|
||||
from pathlib import Path
|
||||
|
||||
import requests
|
||||
|
||||
from ..backfill import queue
|
||||
from ..extract.html_text import html_to_text
|
||||
from .feeds import fetch_feed
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
DEFAULT_UA = "ten31-signal-engine/1.0 (research; contact ops@ten31.xyz)"
|
||||
|
||||
|
||||
def _pdf_to_text(data: bytes, *, max_chars: int) -> str:
|
||||
import pypdf
|
||||
reader = pypdf.PdfReader(io.BytesIO(data))
|
||||
parts: list[str] = []
|
||||
total = 0
|
||||
for page in reader.pages:
|
||||
t = page.extract_text() or ""
|
||||
parts.append(t)
|
||||
total += len(t)
|
||||
if total > max_chars:
|
||||
break
|
||||
return "\n".join(parts)[:max_chars]
|
||||
|
||||
|
||||
def fetch_clean_text(url: str, *, method: str = "auto", ua: str = DEFAULT_UA,
|
||||
timeout: int = 90, max_chars: int = 300_000) -> str:
|
||||
"""Fetch a URL once and return clean text. Auto-detects PDF vs HTML by content-type + magic bytes."""
|
||||
r = requests.get(url, headers={"User-Agent": ua}, timeout=timeout)
|
||||
r.raise_for_status()
|
||||
ctype = r.headers.get("Content-Type", "").lower()
|
||||
is_pdf = method == "pdf" or "application/pdf" in ctype or r.content[:5] == b"%PDF-"
|
||||
if is_pdf:
|
||||
return _pdf_to_text(r.content, max_chars=max_chars)
|
||||
return html_to_text(r.text, max_chars=max_chars)
|
||||
|
||||
|
||||
_BLOCK_MARKERS = (
|
||||
"aggressive automated scraping", "request access", "access denied", "are you a robot",
|
||||
"enable javascript", "captcha", "verify you are human", "rate limit exceeded",
|
||||
"403 forbidden", "unusual traffic", "checking your browser",
|
||||
)
|
||||
|
||||
|
||||
def _looks_blocked(text: str) -> bool:
|
||||
"""Anti-scraping interstitials return 200 + a short access-denied body. Detect so we don't ingest
|
||||
a block page as if it were the document (a real policy/blog doc is long and has no such markers)."""
|
||||
low = text[:2500].lower()
|
||||
return any(m in low for m in _BLOCK_MARKERS)
|
||||
|
||||
|
||||
def _doc_id(source_id: str, url: str) -> str:
|
||||
return f"doc:{source_id}:{hashlib.sha256(url.encode()).hexdigest()[:12]}"
|
||||
|
||||
|
||||
def ingest_one(conn: sqlite3.Connection, cfg, *, source_id: str, url: str, title: str,
|
||||
date: str | None, method: str = "auto", prompt_version: str = "extract-v0",
|
||||
min_chars: int = 400) -> str | None:
|
||||
"""Fetch+store one text document and enqueue extraction. Idempotent on (source_id, url).
|
||||
Returns doc_id if newly ingested, else None (duplicate, too-short, or fetch error → logged)."""
|
||||
doc_id = _doc_id(source_id, url)
|
||||
if conn.execute("SELECT 1 FROM documents WHERE doc_id=?", (doc_id,)).fetchone():
|
||||
return None
|
||||
ua = getattr(cfg, "user_agent", None) or DEFAULT_UA
|
||||
try:
|
||||
text = fetch_clean_text(url, method=method, ua=ua)
|
||||
except Exception as e: # noqa: BLE001
|
||||
log.warning("doc fetch failed %s: %s", url, e)
|
||||
return None
|
||||
if not text or len(text) < min_chars:
|
||||
log.warning("doc too short (%d chars), skipping %s", len(text or ""), url)
|
||||
return None
|
||||
if _looks_blocked(text):
|
||||
log.warning("blocked/anti-scrape page detected, skipping %s", url)
|
||||
return None
|
||||
safe = doc_id.replace(":", "_")
|
||||
tpath = Path(cfg.data_dir) / "docs" / f"{safe}.txt"
|
||||
tpath.parent.mkdir(parents=True, exist_ok=True)
|
||||
tpath.write_text(text)
|
||||
content_hash = hashlib.sha256(text.encode()).hexdigest()
|
||||
conn.execute(
|
||||
"""INSERT OR IGNORE INTO documents
|
||||
(doc_id, source_id, kind, external_id, url, title, date, transcript_path, content_hash, processed_at)
|
||||
VALUES (?,?,?,?,?,?,?,?,?,datetime('now'))""",
|
||||
(doc_id, source_id, "filing", url, url, title[:300] if title else url, date, str(tpath), content_hash),
|
||||
)
|
||||
conn.commit()
|
||||
h = hashlib.sha256(f"{doc_id}|{prompt_version}".encode()).hexdigest()
|
||||
queue.enqueue(conn, job_type="extract", target_id=doc_id, input_hash=h,
|
||||
parent_doc_id=doc_id, priority=50)
|
||||
conn.commit()
|
||||
log.info("ingested doc %s (%d chars) for %s", doc_id, len(text), source_id)
|
||||
return doc_id
|
||||
|
||||
|
||||
def ingest_manifest(conn: sqlite3.Connection, cfg, path) -> dict:
|
||||
"""Batch-ingest the docs listed in a YAML manifest ({docs:[{source,url,title,date,method}]}).
|
||||
Returns {ingested, skipped, missing_source}. Each source must already exist (FK)."""
|
||||
import yaml
|
||||
from pathlib import Path as _Path
|
||||
data = yaml.safe_load(_Path(path).read_text()) or {}
|
||||
docs = data.get("docs", [])
|
||||
ingested = skipped = missing = 0
|
||||
for d in docs:
|
||||
src = d.get("source")
|
||||
if not conn.execute("SELECT 1 FROM sources WHERE source_id=?", (src,)).fetchone():
|
||||
log.warning("manifest doc references missing source %r — skipping %s", src, d.get("url"))
|
||||
missing += 1
|
||||
continue
|
||||
doc_id = ingest_one(conn, cfg, source_id=src, url=d["url"], title=d.get("title", d["url"]),
|
||||
date=d.get("date"), method=d.get("method", "auto"))
|
||||
if doc_id:
|
||||
ingested += 1
|
||||
else:
|
||||
skipped += 1
|
||||
return {"ingested": ingested, "skipped": skipped, "missing_source": missing}
|
||||
|
||||
|
||||
def ingest_feed_text(conn: sqlite3.Connection, cfg, *, source_id: str, rss_url: str,
|
||||
since: str | None = None, until: str | None = None, limit: int = 50) -> int:
|
||||
"""Ingest the ARTICLE bodies behind a text RSS feed (blog/press feed). Each item's link is fetched
|
||||
and stored as a dated text document. Returns count of newly-ingested docs."""
|
||||
from .feeds import _published_iso
|
||||
parsed = fetch_feed(rss_url, user_agent=getattr(cfg, "user_agent", None) or DEFAULT_UA)
|
||||
n = 0
|
||||
for entry in parsed.entries:
|
||||
if n >= limit:
|
||||
break
|
||||
link = entry.get("link")
|
||||
if not link:
|
||||
continue
|
||||
date = _published_iso(entry)
|
||||
if since and date and date < since:
|
||||
continue
|
||||
if until and date and date > until:
|
||||
continue
|
||||
if ingest_one(conn, cfg, source_id=source_id, url=link,
|
||||
title=entry.get("title", link), date=date):
|
||||
n += 1
|
||||
return n
|
||||
@@ -0,0 +1,61 @@
|
||||
"""Audio acquisition (§4.1). Spark Control transcribes audio you fetch — this fetches it.
|
||||
|
||||
- Podcast enclosures: a plain streaming download that follows the Podtrac/Megaphone redirects to the
|
||||
final signed CDN object (download immediately; resolved URLs carry short-lived params).
|
||||
- YouTube: yt-dlp (audio-only → 16 kHz mono WAV). NOTE: 2026 YouTube enforces PO Tokens broadly — run
|
||||
the `bgutil-ytdlp-pot-provider` sidecar or pulls will 403. yt-dlp is treated as a LAST resort; prefer
|
||||
the RSS enclosure where a show publishes both (ToS: downloading YT audio violates YouTube ToS).
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
|
||||
import requests
|
||||
|
||||
DEFAULT_UA = "Ten31SignalEngine/0.1 (+https://ten31.xyz)"
|
||||
|
||||
|
||||
def download_enclosure(url: str, dest: str | Path, *, user_agent: str = DEFAULT_UA, timeout: int = 120) -> Path:
|
||||
dest = Path(dest)
|
||||
dest.parent.mkdir(parents=True, exist_ok=True)
|
||||
with requests.get(url, stream=True, allow_redirects=True,
|
||||
headers={"User-Agent": user_agent}, timeout=timeout) as r:
|
||||
r.raise_for_status()
|
||||
with open(dest, "wb") as f:
|
||||
for chunk in r.iter_content(chunk_size=1 << 16):
|
||||
f.write(chunk)
|
||||
return dest
|
||||
|
||||
|
||||
def to_wav_16k_mono(src: str | Path, dst: str | Path) -> Path:
|
||||
"""Normalize any audio to 16 kHz mono PCM WAV (what the ASR endpoint wants). Requires ffmpeg."""
|
||||
dst = Path(dst)
|
||||
dst.parent.mkdir(parents=True, exist_ok=True)
|
||||
subprocess.run(
|
||||
["ffmpeg", "-y", "-i", str(src), "-ar", "16000", "-ac", "1", "-f", "wav", str(dst)],
|
||||
check=True, capture_output=True,
|
||||
)
|
||||
return dst
|
||||
|
||||
|
||||
def download_youtube_audio(url: str, out_dir: str | Path, *, archive_file: str | Path | None = None) -> Path:
|
||||
"""Audio-only via yt-dlp → 16 kHz mono WAV. `archive_file` (yt-dlp --download-archive) is the
|
||||
canonical 'only-new' dedup for channel/playlist back-catalog pulls."""
|
||||
out_dir = Path(out_dir)
|
||||
out_dir.mkdir(parents=True, exist_ok=True)
|
||||
cmd = [
|
||||
"yt-dlp", "-f", "bestaudio/best", "-x", "--audio-format", "wav",
|
||||
"--postprocessor-args", "ffmpeg:-ar 16000 -ac 1",
|
||||
"-o", str(out_dir / "%(id)s.%(ext)s"),
|
||||
"--no-progress",
|
||||
]
|
||||
if archive_file:
|
||||
cmd += ["--download-archive", str(archive_file)]
|
||||
cmd.append(url)
|
||||
subprocess.run(cmd, check=True, capture_output=True)
|
||||
# yt-dlp names the file by video id; return the newest wav
|
||||
wavs = sorted(out_dir.glob("*.wav"), key=lambda p: p.stat().st_mtime)
|
||||
if not wavs:
|
||||
raise RuntimeError("yt-dlp produced no wav (PO-token/cookies issue? see module docstring)")
|
||||
return wavs[-1]
|
||||
@@ -0,0 +1,127 @@
|
||||
"""Earnings-call transcripts via Financial Modeling Prep (§4.1, §12 — decision: FMP).
|
||||
|
||||
Audio isn't reliably fetchable for large-caps (no uniform feed; ~30–90d replay expiry breaks
|
||||
backfill), so FMP's transcript API is the backbone and EDGAR filings remain the durable core. FMP
|
||||
also exposes an earnings *calendar* to trigger ingestion on the day a call drops.
|
||||
|
||||
Endpoint paths/params are marked TODO(contract): confirm against the FMP 'stable' docs for the
|
||||
account tier at integration. Needs config.fmp_api_key.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import hashlib
|
||||
import sqlite3
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import requests
|
||||
|
||||
FMP_BASE = "https://financialmodelingprep.com/stable"
|
||||
|
||||
|
||||
class FMPClient:
|
||||
def __init__(self, api_key: str, *, base: str = FMP_BASE, timeout: int = 30) -> None:
|
||||
if not api_key:
|
||||
raise ValueError("FMP_API_KEY is required for earnings-call transcripts")
|
||||
self.api_key = api_key
|
||||
self.base = base
|
||||
self.timeout = timeout
|
||||
self.s = requests.Session()
|
||||
|
||||
def _get(self, path: str, **params: Any) -> Any:
|
||||
params["apikey"] = self.api_key
|
||||
r = self.s.get(f"{self.base}/{path}", params=params, timeout=self.timeout)
|
||||
r.raise_for_status()
|
||||
return r.json()
|
||||
|
||||
# Confirmed against FMP 'stable' 2026-06-07 (v3 is legacy/403). Note singular "earning".
|
||||
def transcript_dates(self, symbol: str) -> Any:
|
||||
"""List available transcripts: [{quarter, fiscalYear, date}, ...]."""
|
||||
return self._get("earning-call-transcript-dates", symbol=symbol)
|
||||
|
||||
def transcript(self, symbol: str, *, year: int, quarter: int) -> Any:
|
||||
"""One transcript: [{symbol, period, year, date, content}]. Use the `date` field as the
|
||||
document date — FMP's year/quarter labels are fiscal and can be offset from the call date."""
|
||||
return self._get("earning-call-transcript", symbol=symbol, year=year, quarter=quarter)
|
||||
|
||||
def earnings_calendar(self, *, from_date: str, to_date: str) -> Any:
|
||||
"""Earnings calendar (ingestion trigger): [{symbol, date, epsActual, ...}, ...]."""
|
||||
return self._get("earnings-calendar", **{"from": from_date, "to": to_date})
|
||||
|
||||
|
||||
def ingest_transcript(
|
||||
conn: sqlite3.Connection,
|
||||
*,
|
||||
source_id: str,
|
||||
symbol: str,
|
||||
year: int,
|
||||
quarter: int,
|
||||
content: str,
|
||||
date: str | None,
|
||||
data_dir: Path,
|
||||
prompt_version: str = "extract-v0",
|
||||
) -> tuple[bool, bool]:
|
||||
"""Store one transcript (content written to disk → transcript_path) and enqueue an 'extract'
|
||||
job. Idempotent. Returns (new_document, new_job)."""
|
||||
from ..backfill import queue
|
||||
|
||||
external_id = f"{symbol}-{year}Q{quarter}"
|
||||
doc_id = f"earnings:{external_id}"
|
||||
tdir = Path(data_dir) / "transcripts"
|
||||
tdir.mkdir(parents=True, exist_ok=True)
|
||||
tpath = tdir / f"{external_id}.txt"
|
||||
tpath.write_text(content)
|
||||
content_hash = hashlib.sha256(content.encode()).hexdigest()
|
||||
cur = conn.execute(
|
||||
"""INSERT OR IGNORE INTO documents
|
||||
(doc_id, source_id, kind, external_id, title, date, transcript_path, content_hash, processed_at)
|
||||
VALUES (?,?,?,?,?,?,?,?, datetime('now'))""",
|
||||
(doc_id, source_id, "earnings_call", external_id, f"{symbol} {year} Q{quarter} call",
|
||||
date, str(tpath), content_hash),
|
||||
)
|
||||
conn.commit()
|
||||
if not cur.rowcount:
|
||||
return (False, False)
|
||||
# earnings-call Q&A is the highest-yield text source (§4.1) → priority 40, ahead of filings (50).
|
||||
h = hashlib.sha256(f"{doc_id}|{prompt_version}".encode()).hexdigest()
|
||||
new_job = queue.enqueue(conn, job_type="extract", target_id=doc_id, input_hash=h,
|
||||
parent_doc_id=doc_id, priority=40) is not None
|
||||
return (True, new_job)
|
||||
|
||||
|
||||
def ingest_for_ticker(
|
||||
conn: sqlite3.Connection,
|
||||
fmp: FMPClient,
|
||||
*,
|
||||
source_id: str,
|
||||
symbol: str,
|
||||
data_dir: Path,
|
||||
since: str | None = None,
|
||||
until: str | None = None,
|
||||
limit: int = 8,
|
||||
) -> tuple[int, int]:
|
||||
"""Enumerate available transcripts via the dates index, fetch those in [since, until], and
|
||||
ingest. Uses each transcript's own `date` (FMP fiscal labels are offset). Returns (docs, jobs)."""
|
||||
dates = fmp.transcript_dates(symbol)
|
||||
picked = []
|
||||
for d in dates if isinstance(dates, list) else []:
|
||||
dt = d.get("date")
|
||||
if since and dt and dt < since:
|
||||
continue
|
||||
if until and dt and dt > until:
|
||||
continue
|
||||
picked.append(d)
|
||||
n_docs = n_jobs = 0
|
||||
for d in picked[:limit]:
|
||||
tr = fmp.transcript(symbol, year=d["fiscalYear"], quarter=d["quarter"])
|
||||
item = (tr[0] if isinstance(tr, list) and tr else tr) or {}
|
||||
content = item.get("content") or ""
|
||||
if not content:
|
||||
continue
|
||||
nd, nj = ingest_transcript(
|
||||
conn, source_id=source_id, symbol=symbol, year=d["fiscalYear"], quarter=d["quarter"],
|
||||
content=content, date=item.get("date") or d.get("date"), data_dir=data_dir,
|
||||
)
|
||||
n_docs += int(nd)
|
||||
n_jobs += int(nj)
|
||||
return n_docs, n_jobs
|
||||
@@ -0,0 +1,148 @@
|
||||
"""SEC EDGAR ingestion (§4.1).
|
||||
|
||||
Hits the official data.sec.gov / www.sec.gov APIs directly (free, keyless, full history).
|
||||
Two hard requirements:
|
||||
- a descriptive User-Agent (SEC 403s requests without one) — from config.edgar_user_agent.
|
||||
- ≤10 requests/sec aggregate — enforced by a min-interval throttle here.
|
||||
|
||||
Supports an explicit date range AND historical shards (filings.files[]), so the §7.1 backtest can
|
||||
reach 2022–2023 filings, not just the most-recent ~1000.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import hashlib
|
||||
import sqlite3
|
||||
import time
|
||||
from typing import Iterator
|
||||
|
||||
import requests
|
||||
|
||||
_FILING_COLS = ("accessionNumber", "form", "filingDate", "primaryDocument", "primaryDocDescription")
|
||||
|
||||
|
||||
class EdgarClient:
|
||||
BASE_DATA = "https://data.sec.gov"
|
||||
BASE_WWW = "https://www.sec.gov"
|
||||
|
||||
def __init__(self, user_agent: str, *, min_interval: float = 0.12) -> None:
|
||||
if not user_agent or "@" not in user_agent:
|
||||
raise ValueError("EDGAR requires a descriptive User-Agent with contact email (config.edgar_user_agent)")
|
||||
self.s = requests.Session()
|
||||
self.s.headers.update({"User-Agent": user_agent, "Accept-Encoding": "gzip, deflate"})
|
||||
self.min_interval = min_interval
|
||||
self._last = 0.0
|
||||
self._tickers: dict[str, int] | None = None
|
||||
|
||||
def _throttle(self) -> None:
|
||||
dt = time.monotonic() - self._last
|
||||
if dt < self.min_interval:
|
||||
time.sleep(self.min_interval - dt)
|
||||
self._last = time.monotonic()
|
||||
|
||||
def _get(self, url: str) -> requests.Response:
|
||||
self._throttle()
|
||||
r = self.s.get(url, timeout=30)
|
||||
r.raise_for_status()
|
||||
return r
|
||||
|
||||
# ---- ticker → CIK ----
|
||||
def ticker_map(self) -> dict[str, int]:
|
||||
if self._tickers is None:
|
||||
data = self._get(f"{self.BASE_WWW}/files/company_tickers.json").json()
|
||||
self._tickers = {row["ticker"].upper(): int(row["cik_str"]) for row in data.values()}
|
||||
return self._tickers
|
||||
|
||||
def cik_for(self, ticker: str) -> int | None:
|
||||
return self.ticker_map().get(ticker.upper())
|
||||
|
||||
# ---- filings ----
|
||||
def _iter_array(self, block: dict, forms, since, until) -> Iterator[dict]:
|
||||
arrays = [block.get(c, []) for c in _FILING_COLS]
|
||||
for acc, form, fdate, pdoc, pdesc in zip(*arrays):
|
||||
if forms and form not in forms:
|
||||
continue
|
||||
if since and fdate < since:
|
||||
continue
|
||||
if until and fdate > until:
|
||||
continue
|
||||
yield {"accession": acc, "form": form, "filing_date": fdate,
|
||||
"primary_document": pdoc, "description": pdesc}
|
||||
|
||||
def iter_filings(
|
||||
self,
|
||||
cik: int,
|
||||
*,
|
||||
forms: tuple[str, ...] = ("10-K", "10-Q", "8-K"),
|
||||
since: str | None = None,
|
||||
until: str | None = None,
|
||||
) -> Iterator[dict]:
|
||||
"""Yield filing descriptors. Pulls the inline 'recent' block AND any historical shards whose
|
||||
date window overlaps [since, until] — required to reach the backtest era for active filers."""
|
||||
sub = self._get(f"{self.BASE_DATA}/submissions/CIK{cik:010d}.json").json()
|
||||
recent = sub.get("filings", {}).get("recent", {})
|
||||
for f in self._iter_array(recent, forms, since, until):
|
||||
yield self._with_url(cik, f)
|
||||
for shard in sub.get("filings", {}).get("files", []):
|
||||
# shard has filingFrom / filingTo; skip shards entirely outside the window.
|
||||
if until and shard.get("filingFrom", "") > until:
|
||||
continue
|
||||
if since and shard.get("filingTo", "9999") < since:
|
||||
continue
|
||||
block = self._get(f"{self.BASE_DATA}/submissions/{shard['name']}").json()
|
||||
for f in self._iter_array(block, forms, since, until):
|
||||
yield self._with_url(cik, f)
|
||||
|
||||
def _with_url(self, cik: int, f: dict) -> dict:
|
||||
acc_nodash = f["accession"].replace("-", "")
|
||||
f["cik"] = cik
|
||||
f["url"] = f"{self.BASE_WWW}/Archives/edgar/data/{cik}/{acc_nodash}/{f['primary_document']}"
|
||||
return f
|
||||
|
||||
def fetch_html(self, filing: dict) -> str:
|
||||
return self._get(filing["url"]).text
|
||||
|
||||
|
||||
# Domestic annual/quarterly + foreign-private-issuer equivalents. 20-F (foreign annual, e.g. TSM/IREN),
|
||||
# 40-F (Canadian annual, e.g. CCJ). 8-K/6-K (current reports) excluded by default — low claim yield.
|
||||
HIGH_YIELD_FORMS = ("10-K", "10-Q", "20-F", "40-F")
|
||||
|
||||
|
||||
def ingest_filings(
|
||||
conn: sqlite3.Connection,
|
||||
client: EdgarClient,
|
||||
*,
|
||||
source_id: str,
|
||||
ticker: str,
|
||||
since: str | None = None,
|
||||
until: str | None = None,
|
||||
forms: tuple[str, ...] = HIGH_YIELD_FORMS,
|
||||
prompt_version: str = "extract-v0",
|
||||
) -> tuple[int, int]:
|
||||
"""Insert filing documents and enqueue 'extract' jobs. Filings are text → no transcription;
|
||||
they go straight to extraction (the extract worker fetches + strips the HTML later). Default
|
||||
forms cover both domestic (10-K/10-Q) and foreign-private-issuer (20-F/40-F) filers.
|
||||
Returns (new_documents, new_jobs). Idempotent on (source_id, accession)."""
|
||||
from ..backfill import queue
|
||||
|
||||
cik = client.cik_for(ticker)
|
||||
if cik is None:
|
||||
raise ValueError(f"No CIK found for ticker {ticker!r}")
|
||||
n_docs = n_jobs = 0
|
||||
for f in client.iter_filings(cik, forms=forms, since=since, until=until):
|
||||
doc_id = f"edgar:{f['accession']}"
|
||||
cur = conn.execute(
|
||||
"""INSERT OR IGNORE INTO documents (doc_id, source_id, kind, external_id, url, title, date)
|
||||
VALUES (?,?,?,?,?,?,?)""",
|
||||
(doc_id, source_id, "filing", f["accession"], f["url"],
|
||||
f"{ticker} {f['form']} {f['filing_date']}", f["filing_date"]),
|
||||
)
|
||||
conn.commit()
|
||||
if not cur.rowcount:
|
||||
continue
|
||||
n_docs += 1
|
||||
h = hashlib.sha256(f"{doc_id}|{prompt_version}".encode()).hexdigest()
|
||||
# priority 50: filings are high-info-density (§4.1) → ahead of podcasts (100)
|
||||
if queue.enqueue(conn, job_type="extract", target_id=doc_id, input_hash=h,
|
||||
parent_doc_id=doc_id, priority=50) is not None:
|
||||
n_jobs += 1
|
||||
return n_docs, n_jobs
|
||||
@@ -0,0 +1,65 @@
|
||||
"""Podcast RSS ingestion (§4.1).
|
||||
|
||||
feedparser + conditional GET (ETag/Last-Modified) for efficient incremental polling, with a
|
||||
composite (feed_url, guid) dedup discipline. Many podcast CDNs send no validators and some feeds
|
||||
truncate to recent episodes — for the §7.1 backtest, older episodes may need the show's full
|
||||
archive feed (some hosts expose `?limit=` / a separate archive URL) or a YouTube back-catalog.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import hashlib
|
||||
import time
|
||||
from typing import Any
|
||||
|
||||
import feedparser
|
||||
|
||||
DEFAULT_UA = "Ten31SignalEngine/0.1 (+https://ten31.xyz)"
|
||||
|
||||
|
||||
def fetch_feed(url: str, *, etag: str | None = None, modified: str | None = None,
|
||||
user_agent: str = DEFAULT_UA) -> feedparser.FeedParserDict:
|
||||
"""Conditional GET. On HTTP 304 the result has .status == 304 and .entries == [] → skip."""
|
||||
return feedparser.parse(url, etag=etag, modified=modified, agent=user_agent)
|
||||
|
||||
|
||||
def _published_iso(entry: Any) -> str | None:
|
||||
t = entry.get("published_parsed") or entry.get("updated_parsed")
|
||||
if not t:
|
||||
return None
|
||||
return time.strftime("%Y-%m-%d", t)
|
||||
|
||||
|
||||
def _enclosure_audio_url(entry: Any) -> str | None:
|
||||
for enc in entry.get("enclosures", []) or []:
|
||||
if str(enc.get("type", "")).startswith("audio"):
|
||||
return enc.get("href") or enc.get("url")
|
||||
# some feeds put audio only in links rel=enclosure
|
||||
for link in entry.get("links", []) or []:
|
||||
if link.get("rel") == "enclosure" and str(link.get("type", "")).startswith("audio"):
|
||||
return link.get("href")
|
||||
return None
|
||||
|
||||
|
||||
def _guid(entry: Any) -> str:
|
||||
g = entry.get("id") or entry.get("link")
|
||||
if g:
|
||||
return str(g)
|
||||
basis = f"{entry.get('title','')}|{entry.get('published','')}"
|
||||
return "sha1:" + hashlib.sha1(basis.encode()).hexdigest()
|
||||
|
||||
|
||||
def episode_records(parsed: feedparser.FeedParserDict) -> list[dict]:
|
||||
"""Normalize feed entries to episode records. Skips entries with no audio enclosure."""
|
||||
out: list[dict] = []
|
||||
for e in parsed.entries:
|
||||
audio = _enclosure_audio_url(e)
|
||||
if not audio:
|
||||
continue
|
||||
out.append({
|
||||
"guid": _guid(e),
|
||||
"title": e.get("title"),
|
||||
"audio_url": audio,
|
||||
"link": e.get("link"),
|
||||
"published": _published_iso(e),
|
||||
})
|
||||
return out
|
||||
@@ -0,0 +1,195 @@
|
||||
"""One-time backfill path: transcribe podcast episodes via the Gemini multimodal API instead of the
|
||||
local Spark Parakeet+diarizer pipeline. Used to take a bulk backfill OFF the shared Spark GPU (which
|
||||
contends with production) — it is NOT the steady-state transcriber (local Parakeet remains the default).
|
||||
|
||||
Scope/guardrail: podcast audio is PUBLIC data, so sending it to the frontier does NOT trip the
|
||||
exposure/positioning-data rule (that guardrail is about Ten31's conviction/exposure data, never public
|
||||
audio). Output is written in the SAME 'Speaker: text' transcript format the extractor consumes, so the
|
||||
downstream extract→embed stages are agnostic to which transcriber produced the file.
|
||||
|
||||
Tradeoff vs local: Gemini yields speaker-LABELED text, not voiceprint fingerprints — so no voiceprint
|
||||
auto-edges. We rely on the hand-seeded EISC edges + name-based attribution instead (acceptable for a
|
||||
bounded backfill).
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import hashlib
|
||||
import logging
|
||||
import re
|
||||
import time
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
from pathlib import Path
|
||||
|
||||
from ..backfill import queue
|
||||
from .download import download_enclosure
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
_PROMPT = (
|
||||
"You are a precise podcast transcriptionist. Transcribe this audio VERBATIM as a speaker-diarized "
|
||||
"transcript.\n"
|
||||
"RULES:\n"
|
||||
"- One line per speaker turn, formatted exactly as `Name: spoken text` (a colon and one space).\n"
|
||||
"- The host of this show is {host} — label every host turn with exactly `{host}` (the person's "
|
||||
"name, never the show's name).\n"
|
||||
"- When the host introduces a guest by name (e.g. 'welcome X to the show', 'I'm joined by X'), use "
|
||||
"that real first name (or full name) as the guest's label for the WHOLE transcript. Only fall back "
|
||||
"to `Guest` (or `Guest 2`, `Guest 3`) if a name is never stated. Do not invent names.\n"
|
||||
"- Do NOT include timestamps, ad-reads markers, summaries, headings, markdown, or any commentary. "
|
||||
"Only the transcript lines.\n"
|
||||
"- Transcribe the entire episode from start to finish. Do not stop early or summarize.\n"
|
||||
)
|
||||
|
||||
|
||||
def _host_person(source_name: str) -> str:
|
||||
"""Derive the host's PERSON name from a source/show name so claimant attribution isn't the show.
|
||||
'What Bitcoin Did (Peter McCormack)' -> 'Peter McCormack'; 'Stephan Livera Podcast' -> 'Stephan
|
||||
Livera'; 'The Kevin Rooke Show' -> 'Kevin Rooke'; 'The Anita Posch Show' -> 'Anita Posch'."""
|
||||
m = re.search(r"\(([^)]+)\)", source_name or "")
|
||||
if m:
|
||||
return m.group(1).strip()
|
||||
s = re.sub(r"^The\s+", "", source_name or "").strip()
|
||||
s = re.sub(r"\s+(Podcast|Show)$", "", s, flags=re.I).strip()
|
||||
return s
|
||||
|
||||
|
||||
def _sniff_audio_mime(path: Path) -> str:
|
||||
"""Determine audio MIME from the file header — the downloaded enclosure has a generic `.src`
|
||||
extension, so the Files API can't infer it and rejects the upload without an explicit mime_type."""
|
||||
with open(path, "rb") as fh:
|
||||
head = fh.read(16)
|
||||
if head[:3] == b"ID3" or (len(head) > 1 and head[0] == 0xFF and (head[1] & 0xE0) == 0xE0):
|
||||
return "audio/mpeg"
|
||||
if head[4:8] == b"ftyp":
|
||||
return "audio/mp4" # m4a/aac
|
||||
if head[:4] == b"OggS":
|
||||
return "audio/ogg"
|
||||
if head[:4] == b"RIFF":
|
||||
return "audio/wav"
|
||||
if head[:4] == b"fLaC":
|
||||
return "audio/flac"
|
||||
return "audio/mpeg" # podcast default
|
||||
|
||||
|
||||
def _upload_and_wait(client, audio_path: Path, *, poll_s: float = 2.0, timeout_s: float = 300.0):
|
||||
"""Upload to the Files API and wait until the file is ACTIVE (audio is processed server-side)."""
|
||||
from google.genai import types
|
||||
mime = _sniff_audio_mime(audio_path)
|
||||
f = client.files.upload(file=str(audio_path), config=types.UploadFileConfig(mime_type=mime))
|
||||
waited = 0.0
|
||||
while getattr(f.state, "name", str(f.state)) == "PROCESSING" and waited < timeout_s:
|
||||
time.sleep(poll_s)
|
||||
waited += poll_s
|
||||
f = client.files.get(name=f.name)
|
||||
state = getattr(f.state, "name", str(f.state))
|
||||
if state != "ACTIVE":
|
||||
raise RuntimeError(f"Gemini file not ACTIVE (state={state}) for {audio_path.name}")
|
||||
return f
|
||||
|
||||
|
||||
def transcribe_one(client, model: str, audio_path: Path, host_name: str, *,
|
||||
max_output_tokens: int = 65536) -> tuple[str, dict]:
|
||||
"""Transcribe a single audio file → (transcript_text, usage_dict). Network/CPU only; no DB."""
|
||||
from google.genai import types
|
||||
f = _upload_and_wait(client, audio_path)
|
||||
try:
|
||||
resp = client.models.generate_content(
|
||||
model=model,
|
||||
contents=[f, _PROMPT.format(host=host_name or "the host")],
|
||||
config=types.GenerateContentConfig(temperature=0, max_output_tokens=max_output_tokens),
|
||||
)
|
||||
text = (resp.text or "").strip()
|
||||
um = getattr(resp, "usage_metadata", None)
|
||||
usage = {
|
||||
"prompt_tokens": getattr(um, "prompt_token_count", 0) or 0,
|
||||
"output_tokens": getattr(um, "candidates_token_count", 0) or 0,
|
||||
"finish_reason": str(getattr(resp.candidates[0], "finish_reason", "")) if resp.candidates else "",
|
||||
}
|
||||
return text, usage
|
||||
finally:
|
||||
try:
|
||||
client.files.delete(name=f.name)
|
||||
except Exception as e: # noqa: BLE001 — best-effort cleanup
|
||||
log.debug("file cleanup failed for %s: %s", f.name, e)
|
||||
|
||||
|
||||
def _fetch_and_transcribe(client, model: str, cfg, doc, host_name: str) -> dict:
|
||||
"""Worker-thread unit: download enclosure → Gemini transcribe → write transcript file. No DB writes."""
|
||||
cache = Path(cfg.audio_cache_dir)
|
||||
cache.mkdir(parents=True, exist_ok=True)
|
||||
safe = doc["doc_id"].replace(":", "_")
|
||||
src = cache / f"{safe}.src"
|
||||
audio = download_enclosure(doc["url"], src)
|
||||
try:
|
||||
text, usage = transcribe_one(client, model, audio, host_name)
|
||||
if not text or len(text) < 40:
|
||||
raise RuntimeError(f"empty/short transcript ({len(text)} chars)")
|
||||
tpath = Path(cfg.data_dir) / "transcripts" / f"{safe}.txt"
|
||||
tpath.parent.mkdir(parents=True, exist_ok=True)
|
||||
tpath.write_text(text)
|
||||
return {
|
||||
"doc_id": doc["doc_id"], "ok": True, "transcript_path": str(tpath),
|
||||
"n_lines": text.count("\n") + 1, "content_hash": hashlib.sha256(text.encode()).hexdigest(),
|
||||
"usage": usage,
|
||||
}
|
||||
finally:
|
||||
try:
|
||||
if audio.exists():
|
||||
audio.unlink()
|
||||
except Exception: # noqa: BLE001
|
||||
pass
|
||||
|
||||
|
||||
def run_transcribe_gemini(conn, cfg, *, limit: int = 5, concurrency: int = 4,
|
||||
lease_seconds: int = 7200, worker_id: str = "gemini-transcribe") -> dict:
|
||||
"""Lease pending transcribe jobs and transcribe them via Gemini in parallel. DB writes stay on the
|
||||
main thread; only download+API run in the pool. Reports token usage for cost accounting."""
|
||||
from google import genai
|
||||
if not cfg.gemini_api_key:
|
||||
raise RuntimeError("GEMINI_API_KEY not configured")
|
||||
client = genai.Client(api_key=cfg.gemini_api_key)
|
||||
model = cfg.gemini_model or "gemini-2.5-flash"
|
||||
|
||||
# Lease the batch up front (main thread); resolve docs + host names.
|
||||
leased: list[tuple] = []
|
||||
while len(leased) < limit:
|
||||
job = queue.lease_next(conn, worker_id=worker_id, job_types=["transcribe"], lease_seconds=lease_seconds)
|
||||
if job is None:
|
||||
break
|
||||
doc = conn.execute("SELECT * FROM documents WHERE doc_id=?", (job["target_id"],)).fetchone()
|
||||
if doc is None:
|
||||
queue.skip(conn, job["job_id"], "document missing")
|
||||
continue
|
||||
host = conn.execute("SELECT name FROM sources WHERE source_id=?", (doc["source_id"],)).fetchone()
|
||||
leased.append((job, doc, _host_person(host["name"]) if host else ""))
|
||||
|
||||
done = failed = prompt_tok = out_tok = 0
|
||||
with ThreadPoolExecutor(max_workers=concurrency) as pool:
|
||||
futs = {pool.submit(_fetch_and_transcribe, client, model, cfg, doc, host): (job, doc)
|
||||
for (job, doc, host) in leased}
|
||||
for fut in as_completed(futs):
|
||||
job, doc = futs[fut]
|
||||
try:
|
||||
r = fut.result()
|
||||
conn.execute(
|
||||
"UPDATE documents SET transcript_path=?, content_hash=?, processed_at=datetime('now') "
|
||||
"WHERE doc_id=?", (r["transcript_path"], r["content_hash"], doc["doc_id"]),
|
||||
)
|
||||
h = hashlib.sha256(f"{doc['doc_id']}|extract-v0".encode()).hexdigest()
|
||||
queue.enqueue(conn, job_type="extract", target_id=doc["doc_id"], input_hash=h,
|
||||
parent_doc_id=doc["doc_id"], priority=100)
|
||||
queue.complete(conn, job["job_id"], output_ref=f"gemini {r['n_lines']} lines")
|
||||
conn.commit()
|
||||
done += 1
|
||||
prompt_tok += r["usage"]["prompt_tokens"]
|
||||
out_tok += r["usage"]["output_tokens"]
|
||||
fr = r["usage"]["finish_reason"]
|
||||
log.info("gemini transcribed %s (%d lines, %d in/%d out tok%s)", doc["doc_id"],
|
||||
r["n_lines"], r["usage"]["prompt_tokens"], r["usage"]["output_tokens"],
|
||||
", TRUNCATED" if "MAX_TOKENS" in fr else "")
|
||||
except Exception as e: # noqa: BLE001
|
||||
state = queue.fail(conn, job["job_id"], e)
|
||||
conn.commit()
|
||||
failed += 1
|
||||
log.warning("gemini transcribe failed for %s: %s (→ %s)", doc["doc_id"], e, state)
|
||||
return {"done": done, "failed": failed, "prompt_tokens": prompt_tok, "output_tokens": out_tok}
|
||||
@@ -0,0 +1,45 @@
|
||||
"""Speaker-name identification (§4.5 enhancement).
|
||||
|
||||
In a 1-on-1 interview the host introduces the guest by name at the top. Reading the transcript head
|
||||
with the LLM, we attach a real NAME to each diarized speaker → voiceprints.person_label. This gives
|
||||
the independence graph a SECOND, orthogonal overlap signal: the same NAMED guest across two shows is
|
||||
a shared_guest edge even when the voiceprints don't cluster (different mic/codec/room). It complements
|
||||
voiceprint cosine matching and is robust to fingerprint drift — exactly the case the operator flagged.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import logging
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
_SYS = (
|
||||
'You identify the speakers in a podcast/interview transcript. Each line is "LABEL: text". '
|
||||
"Using the introduction and context, determine each LABEL's real full name and role. In an "
|
||||
"interview the host normally introduces themselves and the guest within the first minute. Only "
|
||||
"assert a name you can actually support from the text — if you cannot tell, use null. "
|
||||
'Return ONLY JSON: {"speakers": {"<LABEL>": {"name": "Full Name" or null, '
|
||||
'"role": "host"|"guest"|"panelist"|"unknown", "confidence": "low"|"med"|"high"}}}.'
|
||||
)
|
||||
|
||||
|
||||
def identify_speakers(backend, transcript_head: str, *, source_name: str, host_hint: str | None = None) -> dict:
|
||||
"""Returns {label: {name, role, confidence}}. `backend` is any extract.backends backend."""
|
||||
ctx = f"Show: {source_name}."
|
||||
if host_hint:
|
||||
ctx += f" The show's usual host is {host_hint}."
|
||||
ctx += "\n\nTRANSCRIPT (beginning):\n" + transcript_head
|
||||
messages = [{"role": "system", "content": _SYS}, {"role": "user", "content": ctx}]
|
||||
raw = backend.complete_json(messages, max_tokens=600)
|
||||
try:
|
||||
obj = json.loads(raw)
|
||||
except Exception:
|
||||
i, j = raw.find("{"), raw.rfind("}")
|
||||
if i < 0 or j < 0:
|
||||
return {}
|
||||
try:
|
||||
obj = json.loads(raw[i:j + 1])
|
||||
except Exception:
|
||||
return {}
|
||||
spk = obj.get("speakers", {}) if isinstance(obj, dict) else {}
|
||||
return spk if isinstance(spk, dict) else {}
|
||||
@@ -0,0 +1,111 @@
|
||||
"""Podcast ingestion → documents + 'transcribe' jobs (§4.1).
|
||||
|
||||
RSS path: parse the feed, take episodes in [since, until], register documents pointing at the audio
|
||||
enclosure. YouTube path: enumerate a channel's videos in the date window via yt-dlp (the back-catalog
|
||||
route for the ~9 shows whose RSS is a truncated rolling window — see seeds/podcast_feeds.resolved.yaml).
|
||||
The transcribe worker downloads + processes either kind identically.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import hashlib
|
||||
import json
|
||||
import logging
|
||||
import sqlite3
|
||||
import subprocess
|
||||
|
||||
from ..backfill import queue
|
||||
from ..util import audio_dedup_key
|
||||
from .feeds import episode_records, fetch_feed
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _enqueue_doc(conn, *, source_id, kind, external_id, url, title, date) -> tuple[int, int]:
|
||||
doc_id = f"pod:{source_id}:{hashlib.sha1(external_id.encode()).hexdigest()[:12]}"
|
||||
dkey = audio_dedup_key(title, date)
|
||||
# Cross-mirror dedup (pre-GPU): if this same episode was already processed (any source/feed),
|
||||
# record the sighting for provenance but DON'T re-transcribe. (external_id UNIQUE already covers
|
||||
# same-feed re-ingest; this covers the same episode via a different feed/YouTube mirror.)
|
||||
dup = conn.execute(
|
||||
"SELECT doc_id FROM documents WHERE dedup_key=? AND processed_at IS NOT NULL LIMIT 1", (dkey,)
|
||||
).fetchone()
|
||||
cur = conn.execute(
|
||||
"""INSERT OR IGNORE INTO documents (doc_id, source_id, kind, external_id, url, title, date, dedup_key)
|
||||
VALUES (?,?,?,?,?,?,?,?)""",
|
||||
(doc_id, source_id, kind, external_id, url, title, date, dkey),
|
||||
)
|
||||
conn.commit()
|
||||
if not cur.rowcount:
|
||||
return (0, 0) # same (source_id, external_id) already known
|
||||
if dup:
|
||||
conn.execute(
|
||||
"UPDATE documents SET processed_at=datetime('now'), raw_path=? WHERE doc_id=?",
|
||||
(f"dup_of:{dup['doc_id']}", doc_id),
|
||||
)
|
||||
conn.commit()
|
||||
log.info("skip transcribe for %s — duplicate content of %s", doc_id, dup["doc_id"])
|
||||
return (1, 0)
|
||||
h = hashlib.sha256(f"{doc_id}|audio-v0".encode()).hexdigest()
|
||||
job = queue.enqueue(conn, job_type="transcribe", target_id=doc_id, input_hash=h,
|
||||
parent_doc_id=doc_id, priority=100)
|
||||
return (1, 1 if job is not None else 0)
|
||||
|
||||
|
||||
def ingest_rss(conn: sqlite3.Connection, source: sqlite3.Row, *, since=None, until=None, limit=20):
|
||||
if not source["rss_url"]:
|
||||
raise ValueError(f"{source['source_id']} has no rss_url")
|
||||
recs = episode_records(fetch_feed(source["rss_url"]))
|
||||
n_docs = n_jobs = count = 0
|
||||
for r in recs:
|
||||
d = r["published"]
|
||||
if since and d and d < since:
|
||||
continue
|
||||
if until and d and d > until:
|
||||
continue
|
||||
if count >= limit:
|
||||
break
|
||||
count += 1
|
||||
nd, nj = _enqueue_doc(conn, source_id=source["source_id"], kind="podcast",
|
||||
external_id=r["guid"], url=r["audio_url"], title=r["title"], date=d)
|
||||
n_docs += nd
|
||||
n_jobs += nj
|
||||
return n_docs, n_jobs
|
||||
|
||||
|
||||
def ingest_youtube(conn: sqlite3.Connection, source: sqlite3.Row, *, since=None, until=None,
|
||||
limit=20, max_scan=800):
|
||||
"""Enumerate channel videos in the date window via yt-dlp (NON-flat, so upload_date is populated —
|
||||
flat mode returns NA). Videos come newest-first, so we use --dateafter/--datebefore to select the
|
||||
window and --break-match-filters to STOP scanning once we drop below `since` (avoids walking the
|
||||
entire channel history). The transcribe worker downloads audio on demand."""
|
||||
if not source["channel_url"]:
|
||||
raise ValueError(f"{source['source_id']} has no channel_url")
|
||||
url = source["channel_url"].rstrip("/")
|
||||
if "/playlist" not in url and not url.endswith("/videos"):
|
||||
url = url + "/videos"
|
||||
cmd = ["yt-dlp", "--no-warnings", "--ignore-errors", "--skip-download",
|
||||
"--print", "%(id)s\t%(upload_date)s\t%(title)s", "--playlist-end", str(max_scan)]
|
||||
if since:
|
||||
s = since.replace("-", "")
|
||||
cmd += ["--dateafter", s, "--break-match-filters", f"upload_date>={s}"]
|
||||
if until:
|
||||
cmd += ["--datebefore", until.replace("-", "")]
|
||||
cmd.append(url)
|
||||
out = subprocess.run(cmd, capture_output=True, text=True, timeout=900)
|
||||
n_docs = n_jobs = count = 0
|
||||
for line in out.stdout.splitlines():
|
||||
parts = line.split("\t")
|
||||
if len(parts) < 2 or not parts[0] or parts[1] in ("NA", ""):
|
||||
continue
|
||||
vid, upd = parts[0], parts[1]
|
||||
title = parts[2] if len(parts) > 2 else vid
|
||||
date = f"{upd[:4]}-{upd[4:6]}-{upd[6:8]}" if len(upd) == 8 else None
|
||||
if count >= limit:
|
||||
break
|
||||
count += 1
|
||||
nd, nj = _enqueue_doc(conn, source_id=source["source_id"], kind="youtube",
|
||||
external_id=vid, url=f"https://www.youtube.com/watch?v={vid}",
|
||||
title=title, date=date)
|
||||
n_docs += nd
|
||||
n_jobs += nj
|
||||
return n_docs, n_jobs
|
||||
@@ -0,0 +1,60 @@
|
||||
"""Cross-chunk speaker stitching + the voiceprint library (§4.1, §4.5).
|
||||
|
||||
diarize-chunk returns a 192-d TitaNet voiceprint per speaker per chunk. Because each chunk is
|
||||
diarized independently, "Speaker 1" in chunk 3 is not the same label as "Speaker 1" in chunk 7 —
|
||||
we re-cluster by cosine similarity (~0.7 distance threshold) so one person gets one identity across
|
||||
the whole episode. The SAME library then matches a guest ACROSS shows by voice (the independence
|
||||
graph's hardest edge, §4.5).
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import numpy as np
|
||||
|
||||
DISTANCE_THRESHOLD = 0.7 # cosine DISTANCE (1 - cosine similarity); §4.1
|
||||
|
||||
|
||||
def _unit(v: np.ndarray) -> np.ndarray:
|
||||
n = np.linalg.norm(v)
|
||||
return v / n if n else v
|
||||
|
||||
|
||||
def cosine_distance(a: np.ndarray, b: np.ndarray) -> float:
|
||||
return float(1.0 - np.dot(_unit(np.asarray(a, dtype=float)), _unit(np.asarray(b, dtype=float))))
|
||||
|
||||
|
||||
def stitch_chunks(chunk_voiceprints: list[np.ndarray], *, threshold: float = DISTANCE_THRESHOLD) -> list[int]:
|
||||
"""Greedy online clustering of per-(chunk,speaker) voiceprints into stable speaker ids.
|
||||
|
||||
Input: a flat list of voiceprint vectors (one per chunk-speaker, in encounter order).
|
||||
Output: a parallel list of cluster ids. A vector joins the nearest existing cluster if its
|
||||
distance to that cluster's centroid < threshold, else it starts a new cluster.
|
||||
"""
|
||||
centroids: list[np.ndarray] = []
|
||||
counts: list[int] = []
|
||||
labels: list[int] = []
|
||||
for vp in chunk_voiceprints:
|
||||
vp = np.asarray(vp, dtype=float)
|
||||
if centroids:
|
||||
dists = [cosine_distance(vp, c) for c in centroids]
|
||||
j = int(np.argmin(dists))
|
||||
if dists[j] < threshold:
|
||||
centroids[j] = (centroids[j] * counts[j] + vp) / (counts[j] + 1)
|
||||
counts[j] += 1
|
||||
labels.append(j)
|
||||
continue
|
||||
centroids.append(vp.copy())
|
||||
counts.append(1)
|
||||
labels.append(len(centroids) - 1)
|
||||
return labels
|
||||
|
||||
|
||||
def match_library(vp: np.ndarray, library: list[tuple[str, np.ndarray]], *,
|
||||
threshold: float = DISTANCE_THRESHOLD) -> str | None:
|
||||
"""Return the voiceprint_id of the closest library entry within threshold, else None
|
||||
(a new speaker → caller mints a new library id)."""
|
||||
best_id, best_d = None, threshold
|
||||
for vid, lib_vec in library:
|
||||
d = cosine_distance(vp, lib_vec)
|
||||
if d < best_d:
|
||||
best_id, best_d = vid, d
|
||||
return best_id
|
||||
@@ -0,0 +1,308 @@
|
||||
"""Audio → speaker-attributed transcript + voiceprint library (§4.1, §4.5).
|
||||
|
||||
Per chunk (sequential — audio lock): diarize-chunk (192-d TitaNet fingerprints + timed speaker
|
||||
segments) + transcribe (word timestamps). Align words to speakers by time, stitch speakers ACROSS
|
||||
chunks by fingerprint cosine, then match the persisted voiceprint library so the SAME guest is
|
||||
recognized ACROSS shows by voice — the highest-leverage input to the source-independence graph.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import time
|
||||
import uuid
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
|
||||
from ..backfill import queue
|
||||
from .chunker import chunk_audio
|
||||
from .download import download_enclosure, download_youtube_audio, to_wav_16k_mono
|
||||
from .speaker_stitch import DISTANCE_THRESHOLD, match_library, stitch_chunks
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# ---------- alignment ----------
|
||||
def _speaker_at(segments: list[dict], t: float) -> str:
|
||||
for s in segments:
|
||||
if s["start_s"] <= t <= s["end_s"]:
|
||||
return s["speaker"]
|
||||
if not segments:
|
||||
return "Speaker_0"
|
||||
return min(segments, key=lambda s: min(abs(s["start_s"] - t), abs(s["end_s"] - t)))["speaker"]
|
||||
|
||||
|
||||
def align_words(words: list[dict], segments: list[dict]) -> list[dict]:
|
||||
"""Group word-level transcription into speaker turns using the diarization segments."""
|
||||
turns: list[dict] = []
|
||||
cur: dict | None = None
|
||||
for w in words:
|
||||
mid = (w["start"] + w["end"]) / 2
|
||||
spk = _speaker_at(segments, mid)
|
||||
if cur and cur["speaker"] == spk:
|
||||
cur["text"] += " " + w["text"]
|
||||
cur["end"] = w["end"]
|
||||
else:
|
||||
if cur:
|
||||
turns.append(cur)
|
||||
cur = {"speaker": spk, "start": w["start"], "end": w["end"], "text": w["text"]}
|
||||
if cur:
|
||||
turns.append(cur)
|
||||
return turns
|
||||
|
||||
|
||||
# ---------- per-document audio processing ----------
|
||||
def diarize_transcribe_chunks(sc, chunks: list[Path], *, concurrency: int = 2):
|
||||
"""Returns (chunk_turns, chunk_speakers): turns per chunk + (chunk_idx, local_spk, fingerprint).
|
||||
|
||||
Drives up to `concurrency` chunks in flight — the client's global audio SEMAPHORE is the hard cap
|
||||
across both parakeet endpoints (sit at 2: keeps the single serial GPU continuously fed = full
|
||||
throughput, no idle gap). A single chunk's failure is non-fatal (skip; the client already busy-
|
||||
retries transient blips), but if a MAJORITY of chunks fail the whole job raises so it retries later
|
||||
(rather than emitting a half-empty transcript). Results are reassembled in chunk order."""
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
|
||||
def _one(idx: int, ch: Path):
|
||||
dia = sc.diarize_chunk(str(ch))
|
||||
tr = sc.transcribe(str(ch))
|
||||
turns = align_words(tr.get("words", []), dia.get("segments", []))
|
||||
spks = [(idx, spk, np.asarray(vec, dtype=np.float32))
|
||||
for spk, vec in (dia.get("fingerprints") or {}).items()]
|
||||
return idx, turns, spks
|
||||
|
||||
results: dict[int, tuple] = {}
|
||||
failed = 0
|
||||
with ThreadPoolExecutor(max_workers=max(1, concurrency)) as pool:
|
||||
futs = {pool.submit(_one, i, ch): i for i, ch in enumerate(chunks)}
|
||||
for fut in as_completed(futs):
|
||||
try:
|
||||
idx, turns, spks = fut.result()
|
||||
results[idx] = (turns, spks)
|
||||
except Exception as e: # noqa: BLE001 — one contended chunk shouldn't kill the episode
|
||||
failed += 1
|
||||
log.warning("chunk %d/%d failed (%s) — skipping", futs[fut], len(chunks), str(e)[:90])
|
||||
if chunks and failed >= max(3, len(chunks) // 2):
|
||||
raise RuntimeError(f"{failed}/{len(chunks)} chunks failed — backend contended; will retry later")
|
||||
chunk_turns = [(idx, results[idx][0]) for idx in sorted(results)]
|
||||
chunk_speakers = [s for idx in sorted(results) for s in results[idx][1]]
|
||||
return chunk_turns, chunk_speakers
|
||||
|
||||
|
||||
def stitch_and_centroids(chunk_speakers, *, threshold: float = DISTANCE_THRESHOLD):
|
||||
"""Cluster all (chunk,speaker) fingerprints into within-episode global speakers."""
|
||||
if not chunk_speakers:
|
||||
return {}, {}
|
||||
vecs = [v for (_, _, v) in chunk_speakers]
|
||||
labels = stitch_chunks(vecs, threshold=threshold)
|
||||
keymap: dict[tuple[int, str], int] = {}
|
||||
groups: dict[int, list[np.ndarray]] = {}
|
||||
for (idx, spk, vec), lab in zip(chunk_speakers, labels):
|
||||
keymap[(idx, spk)] = lab
|
||||
groups.setdefault(lab, []).append(vec)
|
||||
centroids = {lab: np.mean(v, axis=0) for lab, v in groups.items()}
|
||||
return keymap, centroids
|
||||
|
||||
|
||||
def _load_library(conn) -> list[tuple[str, np.ndarray]]:
|
||||
rows = conn.execute("SELECT voiceprint_id, vector, person_label FROM voiceprints").fetchall()
|
||||
return [(r["voiceprint_id"], np.frombuffer(r["vector"], dtype=np.float32)) for r in rows]
|
||||
|
||||
|
||||
def _label_for(conn, vpid: str) -> str:
|
||||
r = conn.execute("SELECT person_label FROM voiceprints WHERE voiceprint_id=?", (vpid,)).fetchone()
|
||||
return (r["person_label"] if r and r["person_label"] else f"SPK:{vpid[:8]}")
|
||||
|
||||
|
||||
def resolve_voiceprints(conn, doc, centroids: dict[int, np.ndarray], *, threshold: float = DISTANCE_THRESHOLD):
|
||||
"""Match each within-episode speaker to the persisted library (cross-show identity) or mint a new
|
||||
one; record observations; add shared_guest edges when the voice also appears in ANOTHER source."""
|
||||
library = _load_library(conn)
|
||||
cluster_to_vpid: dict[int, str] = {}
|
||||
for lab, cen in centroids.items():
|
||||
vpid = match_library(cen, library, threshold=threshold)
|
||||
if vpid is None:
|
||||
vpid = "vp_" + uuid.uuid4().hex[:16]
|
||||
conn.execute(
|
||||
"INSERT INTO voiceprints (voiceprint_id, vector, first_doc_id) VALUES (?,?,?)",
|
||||
(vpid, cen.astype(np.float32).tobytes(), doc["doc_id"]),
|
||||
)
|
||||
library.append((vpid, cen))
|
||||
conn.execute(
|
||||
"INSERT INTO voiceprint_observations (voiceprint_id, doc_id, chunk_idx) VALUES (?,?,?)",
|
||||
(vpid, doc["doc_id"], None),
|
||||
)
|
||||
cluster_to_vpid[lab] = vpid
|
||||
conn.commit()
|
||||
# independence graph (§4.5): if this voice appears in a DIFFERENT source, that's a shared guest.
|
||||
for vpid in set(cluster_to_vpid.values()):
|
||||
others = conn.execute(
|
||||
"""SELECT DISTINCT d.source_id FROM voiceprint_observations o
|
||||
JOIN documents d ON d.doc_id = o.doc_id
|
||||
WHERE o.voiceprint_id=? AND d.source_id != ?""",
|
||||
(vpid, doc["source_id"]),
|
||||
).fetchall()
|
||||
for o in others:
|
||||
a, b = sorted([doc["source_id"], o["source_id"]])
|
||||
conn.execute(
|
||||
"""INSERT INTO source_edges (src_a, src_b, edge_type, weight, evidence)
|
||||
VALUES (?,?,'shared_guest',1.0,?)
|
||||
ON CONFLICT(src_a, src_b, edge_type)
|
||||
DO UPDATE SET weight = weight + 1.0, evidence = excluded.evidence""",
|
||||
(a, b, vpid),
|
||||
)
|
||||
conn.commit()
|
||||
return cluster_to_vpid
|
||||
|
||||
|
||||
def _labeled(chunk_turns, keymap, label_by_cluster: dict) -> str:
|
||||
lines: list[str] = []
|
||||
for idx, turns in chunk_turns:
|
||||
for t in turns:
|
||||
lab = keymap.get((idx, t["speaker"]))
|
||||
label = label_by_cluster.get(lab, t["speaker"])
|
||||
lines.append(f"{label}: {t['text']}")
|
||||
return "\n".join(lines)
|
||||
|
||||
|
||||
def build_transcript(conn, chunk_turns, keymap, cluster_to_vpid) -> str:
|
||||
labels = {lab: _label_for(conn, vpid) for lab, vpid in cluster_to_vpid.items()}
|
||||
return _labeled(chunk_turns, keymap, labels)
|
||||
|
||||
|
||||
def apply_names(conn, cluster_to_vpid: dict, idmap: dict) -> dict:
|
||||
"""Attach confident names to the voiceprint library (person_label). Returns {cluster: name}."""
|
||||
named: dict[int, str] = {}
|
||||
for lab, vpid in cluster_to_vpid.items():
|
||||
info = idmap.get(f"Speaker {lab + 1}") or idmap.get(str(lab + 1)) or {}
|
||||
name = (info.get("name") or "").strip() if isinstance(info, dict) else ""
|
||||
if name and info.get("confidence") in ("med", "high"):
|
||||
conn.execute("UPDATE voiceprints SET person_label=? WHERE voiceprint_id=?", (name, vpid))
|
||||
named[lab] = name
|
||||
conn.commit()
|
||||
return named
|
||||
|
||||
|
||||
def add_name_edges(conn, doc, cluster_to_vpid: dict) -> int:
|
||||
"""Name-based shared_guest edges: same person_label seen in a DIFFERENT source → independence edge,
|
||||
even if the voiceprints didn't cluster (drift-robust complement to voiceprint matching, §4.5)."""
|
||||
n = 0
|
||||
for vpid in set(cluster_to_vpid.values()):
|
||||
r = conn.execute("SELECT person_label FROM voiceprints WHERE voiceprint_id=?", (vpid,)).fetchone()
|
||||
name = r["person_label"] if r else None
|
||||
if not name:
|
||||
continue
|
||||
others = conn.execute(
|
||||
"""SELECT DISTINCT d.source_id FROM voiceprints v
|
||||
JOIN voiceprint_observations o ON o.voiceprint_id = v.voiceprint_id
|
||||
JOIN documents d ON d.doc_id = o.doc_id
|
||||
WHERE v.person_label = ? AND d.source_id != ?""",
|
||||
(name, doc["source_id"]),
|
||||
).fetchall()
|
||||
for o in others:
|
||||
a, b = sorted([doc["source_id"], o["source_id"]])
|
||||
conn.execute(
|
||||
"""INSERT INTO source_edges (src_a, src_b, edge_type, weight, evidence)
|
||||
VALUES (?,?,'shared_guest',1.0,?)
|
||||
ON CONFLICT(src_a, src_b, edge_type)
|
||||
DO UPDATE SET weight = weight + 1.0, evidence = excluded.evidence""",
|
||||
(a, b, f"name:{name}"),
|
||||
)
|
||||
n += 1
|
||||
conn.commit()
|
||||
return n
|
||||
|
||||
|
||||
def _download_audio(doc, cfg) -> Path:
|
||||
cache = Path(cfg.audio_cache_dir)
|
||||
cache.mkdir(parents=True, exist_ok=True)
|
||||
wav = cache / f"{doc['doc_id'].replace(':', '_')}.wav"
|
||||
if wav.exists():
|
||||
return wav
|
||||
url = doc["url"]
|
||||
if doc["kind"] == "youtube" or (url and ("youtube.com" in url or "youtu.be" in url)):
|
||||
return download_youtube_audio(url, cache, archive_file=cache / "yt-archive.txt")
|
||||
raw = download_enclosure(url, cache / f"{doc['doc_id'].replace(':', '_')}.src")
|
||||
return to_wav_16k_mono(raw, wav)
|
||||
|
||||
|
||||
def process_document(conn, sc, cfg, doc, *, max_chunks: int, chunk_seconds: int = 150,
|
||||
keep_audio: bool = False) -> int:
|
||||
audio = _download_audio(doc, cfg)
|
||||
chunkdir = Path(cfg.audio_cache_dir) / f"chunks_{doc['doc_id'].replace(':', '_')}"
|
||||
chunks = chunk_audio(audio, chunkdir, chunk_seconds=chunk_seconds)[:max_chunks]
|
||||
chunk_turns, chunk_speakers = diarize_transcribe_chunks(
|
||||
sc, chunks, concurrency=getattr(cfg, "audio_concurrency", 2))
|
||||
keymap, centroids = stitch_and_centroids(chunk_speakers)
|
||||
cluster_to_vpid = resolve_voiceprints(conn, doc, centroids)
|
||||
|
||||
# Name the speakers (§4.5): host introduces guest in 1-on-1 → attach person_label, then a
|
||||
# name-based shared_guest edge that survives voiceprint drift across shows.
|
||||
src = conn.execute("SELECT name FROM sources WHERE source_id=?", (doc["source_id"],)).fetchone()
|
||||
try:
|
||||
from ..extract.backends import from_config as backend_from_config
|
||||
from .identify import identify_speakers
|
||||
backend = backend_from_config(cfg, sc)
|
||||
draft = _labeled(chunk_turns, keymap, {lab: f"Speaker {lab + 1}" for lab in cluster_to_vpid})
|
||||
idmap = identify_speakers(backend, draft[:6000], source_name=src["name"] if src else "")
|
||||
named = apply_names(conn, cluster_to_vpid, idmap)
|
||||
if named:
|
||||
log.info("named speakers in %s: %s", doc["doc_id"], ", ".join(named.values()))
|
||||
except Exception as e: # noqa: BLE001 — naming is best-effort enrichment
|
||||
log.warning("speaker identification failed for %s: %s", doc["doc_id"], e)
|
||||
add_name_edges(conn, doc, cluster_to_vpid)
|
||||
|
||||
transcript = build_transcript(conn, chunk_turns, keymap, cluster_to_vpid)
|
||||
tpath = Path(cfg.data_dir) / "transcripts" / f"{doc['doc_id'].replace(':', '_')}.txt"
|
||||
tpath.parent.mkdir(parents=True, exist_ok=True)
|
||||
tpath.write_text(transcript)
|
||||
import hashlib
|
||||
content_hash = hashlib.sha256(transcript.encode()).hexdigest()
|
||||
conn.execute(
|
||||
"UPDATE documents SET transcript_path=?, duration_sec=?, content_hash=?, processed_at=datetime('now') WHERE doc_id=?",
|
||||
(str(tpath), len(chunks) * chunk_seconds, content_hash, doc["doc_id"]),
|
||||
)
|
||||
conn.commit()
|
||||
h = hashlib.sha256(f"{doc['doc_id']}|extract-v0".encode()).hexdigest()
|
||||
queue.enqueue(conn, job_type="extract", target_id=doc["doc_id"], input_hash=h,
|
||||
parent_doc_id=doc["doc_id"], priority=100)
|
||||
if not keep_audio:
|
||||
_cleanup_audio(audio, chunkdir)
|
||||
return len(chunk_turns)
|
||||
|
||||
|
||||
def _cleanup_audio(audio: Path, chunkdir: Path) -> None:
|
||||
"""Audio files are large and disposable once transcribed — reclaim the disk (the transcript +
|
||||
voiceprints are what we keep). Backfilling hundreds of 1-3 hr episodes would otherwise be tens of GB."""
|
||||
import shutil
|
||||
try:
|
||||
if audio.exists():
|
||||
audio.unlink()
|
||||
src = audio.with_suffix(".src")
|
||||
if src.exists():
|
||||
src.unlink()
|
||||
if chunkdir.exists():
|
||||
shutil.rmtree(chunkdir, ignore_errors=True)
|
||||
except Exception as e: # noqa: BLE001
|
||||
log.warning("audio cleanup failed for %s: %s", audio, e)
|
||||
|
||||
|
||||
def run_transcribe(conn, sc, cfg, *, limit: int = 5, max_chunks: int = 999,
|
||||
lease_seconds: int = 3600, worker_id: str = "transcribe-1") -> dict:
|
||||
processed = 0
|
||||
while processed < limit:
|
||||
job = queue.lease_next(conn, worker_id=worker_id, job_types=["transcribe"], lease_seconds=lease_seconds)
|
||||
if job is None:
|
||||
break
|
||||
processed += 1
|
||||
doc = conn.execute("SELECT * FROM documents WHERE doc_id=?", (job["target_id"],)).fetchone()
|
||||
if doc is None:
|
||||
queue.skip(conn, job["job_id"], "document missing")
|
||||
continue
|
||||
try:
|
||||
n = process_document(conn, sc, cfg, doc, max_chunks=max_chunks)
|
||||
queue.complete(conn, job["job_id"], output_ref=f"{n} chunks")
|
||||
log.info("transcribed %s (%d chunks)", doc["doc_id"], n)
|
||||
except Exception as e: # noqa: BLE001
|
||||
state = queue.fail(conn, job["job_id"], e)
|
||||
log.warning("transcribe failed for %s: %s (→ %s)", job["target_id"], e, state)
|
||||
return {"jobs_processed": processed}
|
||||
Reference in New Issue
Block a user