Initial commit: Ten31 Signal Engine (ingest, scoring brain, corpus seeds)
This commit is contained in:
@@ -0,0 +1,195 @@
|
||||
"""One-time backfill path: transcribe podcast episodes via the Gemini multimodal API instead of the
|
||||
local Spark Parakeet+diarizer pipeline. Used to take a bulk backfill OFF the shared Spark GPU (which
|
||||
contends with production) — it is NOT the steady-state transcriber (local Parakeet remains the default).
|
||||
|
||||
Scope/guardrail: podcast audio is PUBLIC data, so sending it to the frontier does NOT trip the
|
||||
exposure/positioning-data rule (that guardrail is about Ten31's conviction/exposure data, never public
|
||||
audio). Output is written in the SAME 'Speaker: text' transcript format the extractor consumes, so the
|
||||
downstream extract→embed stages are agnostic to which transcriber produced the file.
|
||||
|
||||
Tradeoff vs local: Gemini yields speaker-LABELED text, not voiceprint fingerprints — so no voiceprint
|
||||
auto-edges. We rely on the hand-seeded EISC edges + name-based attribution instead (acceptable for a
|
||||
bounded backfill).
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import hashlib
|
||||
import logging
|
||||
import re
|
||||
import time
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
from pathlib import Path
|
||||
|
||||
from ..backfill import queue
|
||||
from .download import download_enclosure
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
_PROMPT = (
|
||||
"You are a precise podcast transcriptionist. Transcribe this audio VERBATIM as a speaker-diarized "
|
||||
"transcript.\n"
|
||||
"RULES:\n"
|
||||
"- One line per speaker turn, formatted exactly as `Name: spoken text` (a colon and one space).\n"
|
||||
"- The host of this show is {host} — label every host turn with exactly `{host}` (the person's "
|
||||
"name, never the show's name).\n"
|
||||
"- When the host introduces a guest by name (e.g. 'welcome X to the show', 'I'm joined by X'), use "
|
||||
"that real first name (or full name) as the guest's label for the WHOLE transcript. Only fall back "
|
||||
"to `Guest` (or `Guest 2`, `Guest 3`) if a name is never stated. Do not invent names.\n"
|
||||
"- Do NOT include timestamps, ad-reads markers, summaries, headings, markdown, or any commentary. "
|
||||
"Only the transcript lines.\n"
|
||||
"- Transcribe the entire episode from start to finish. Do not stop early or summarize.\n"
|
||||
)
|
||||
|
||||
|
||||
def _host_person(source_name: str) -> str:
|
||||
"""Derive the host's PERSON name from a source/show name so claimant attribution isn't the show.
|
||||
'What Bitcoin Did (Peter McCormack)' -> 'Peter McCormack'; 'Stephan Livera Podcast' -> 'Stephan
|
||||
Livera'; 'The Kevin Rooke Show' -> 'Kevin Rooke'; 'The Anita Posch Show' -> 'Anita Posch'."""
|
||||
m = re.search(r"\(([^)]+)\)", source_name or "")
|
||||
if m:
|
||||
return m.group(1).strip()
|
||||
s = re.sub(r"^The\s+", "", source_name or "").strip()
|
||||
s = re.sub(r"\s+(Podcast|Show)$", "", s, flags=re.I).strip()
|
||||
return s
|
||||
|
||||
|
||||
def _sniff_audio_mime(path: Path) -> str:
|
||||
"""Determine audio MIME from the file header — the downloaded enclosure has a generic `.src`
|
||||
extension, so the Files API can't infer it and rejects the upload without an explicit mime_type."""
|
||||
with open(path, "rb") as fh:
|
||||
head = fh.read(16)
|
||||
if head[:3] == b"ID3" or (len(head) > 1 and head[0] == 0xFF and (head[1] & 0xE0) == 0xE0):
|
||||
return "audio/mpeg"
|
||||
if head[4:8] == b"ftyp":
|
||||
return "audio/mp4" # m4a/aac
|
||||
if head[:4] == b"OggS":
|
||||
return "audio/ogg"
|
||||
if head[:4] == b"RIFF":
|
||||
return "audio/wav"
|
||||
if head[:4] == b"fLaC":
|
||||
return "audio/flac"
|
||||
return "audio/mpeg" # podcast default
|
||||
|
||||
|
||||
def _upload_and_wait(client, audio_path: Path, *, poll_s: float = 2.0, timeout_s: float = 300.0):
|
||||
"""Upload to the Files API and wait until the file is ACTIVE (audio is processed server-side)."""
|
||||
from google.genai import types
|
||||
mime = _sniff_audio_mime(audio_path)
|
||||
f = client.files.upload(file=str(audio_path), config=types.UploadFileConfig(mime_type=mime))
|
||||
waited = 0.0
|
||||
while getattr(f.state, "name", str(f.state)) == "PROCESSING" and waited < timeout_s:
|
||||
time.sleep(poll_s)
|
||||
waited += poll_s
|
||||
f = client.files.get(name=f.name)
|
||||
state = getattr(f.state, "name", str(f.state))
|
||||
if state != "ACTIVE":
|
||||
raise RuntimeError(f"Gemini file not ACTIVE (state={state}) for {audio_path.name}")
|
||||
return f
|
||||
|
||||
|
||||
def transcribe_one(client, model: str, audio_path: Path, host_name: str, *,
|
||||
max_output_tokens: int = 65536) -> tuple[str, dict]:
|
||||
"""Transcribe a single audio file → (transcript_text, usage_dict). Network/CPU only; no DB."""
|
||||
from google.genai import types
|
||||
f = _upload_and_wait(client, audio_path)
|
||||
try:
|
||||
resp = client.models.generate_content(
|
||||
model=model,
|
||||
contents=[f, _PROMPT.format(host=host_name or "the host")],
|
||||
config=types.GenerateContentConfig(temperature=0, max_output_tokens=max_output_tokens),
|
||||
)
|
||||
text = (resp.text or "").strip()
|
||||
um = getattr(resp, "usage_metadata", None)
|
||||
usage = {
|
||||
"prompt_tokens": getattr(um, "prompt_token_count", 0) or 0,
|
||||
"output_tokens": getattr(um, "candidates_token_count", 0) or 0,
|
||||
"finish_reason": str(getattr(resp.candidates[0], "finish_reason", "")) if resp.candidates else "",
|
||||
}
|
||||
return text, usage
|
||||
finally:
|
||||
try:
|
||||
client.files.delete(name=f.name)
|
||||
except Exception as e: # noqa: BLE001 — best-effort cleanup
|
||||
log.debug("file cleanup failed for %s: %s", f.name, e)
|
||||
|
||||
|
||||
def _fetch_and_transcribe(client, model: str, cfg, doc, host_name: str) -> dict:
|
||||
"""Worker-thread unit: download enclosure → Gemini transcribe → write transcript file. No DB writes."""
|
||||
cache = Path(cfg.audio_cache_dir)
|
||||
cache.mkdir(parents=True, exist_ok=True)
|
||||
safe = doc["doc_id"].replace(":", "_")
|
||||
src = cache / f"{safe}.src"
|
||||
audio = download_enclosure(doc["url"], src)
|
||||
try:
|
||||
text, usage = transcribe_one(client, model, audio, host_name)
|
||||
if not text or len(text) < 40:
|
||||
raise RuntimeError(f"empty/short transcript ({len(text)} chars)")
|
||||
tpath = Path(cfg.data_dir) / "transcripts" / f"{safe}.txt"
|
||||
tpath.parent.mkdir(parents=True, exist_ok=True)
|
||||
tpath.write_text(text)
|
||||
return {
|
||||
"doc_id": doc["doc_id"], "ok": True, "transcript_path": str(tpath),
|
||||
"n_lines": text.count("\n") + 1, "content_hash": hashlib.sha256(text.encode()).hexdigest(),
|
||||
"usage": usage,
|
||||
}
|
||||
finally:
|
||||
try:
|
||||
if audio.exists():
|
||||
audio.unlink()
|
||||
except Exception: # noqa: BLE001
|
||||
pass
|
||||
|
||||
|
||||
def run_transcribe_gemini(conn, cfg, *, limit: int = 5, concurrency: int = 4,
|
||||
lease_seconds: int = 7200, worker_id: str = "gemini-transcribe") -> dict:
|
||||
"""Lease pending transcribe jobs and transcribe them via Gemini in parallel. DB writes stay on the
|
||||
main thread; only download+API run in the pool. Reports token usage for cost accounting."""
|
||||
from google import genai
|
||||
if not cfg.gemini_api_key:
|
||||
raise RuntimeError("GEMINI_API_KEY not configured")
|
||||
client = genai.Client(api_key=cfg.gemini_api_key)
|
||||
model = cfg.gemini_model or "gemini-2.5-flash"
|
||||
|
||||
# Lease the batch up front (main thread); resolve docs + host names.
|
||||
leased: list[tuple] = []
|
||||
while len(leased) < limit:
|
||||
job = queue.lease_next(conn, worker_id=worker_id, job_types=["transcribe"], lease_seconds=lease_seconds)
|
||||
if job is None:
|
||||
break
|
||||
doc = conn.execute("SELECT * FROM documents WHERE doc_id=?", (job["target_id"],)).fetchone()
|
||||
if doc is None:
|
||||
queue.skip(conn, job["job_id"], "document missing")
|
||||
continue
|
||||
host = conn.execute("SELECT name FROM sources WHERE source_id=?", (doc["source_id"],)).fetchone()
|
||||
leased.append((job, doc, _host_person(host["name"]) if host else ""))
|
||||
|
||||
done = failed = prompt_tok = out_tok = 0
|
||||
with ThreadPoolExecutor(max_workers=concurrency) as pool:
|
||||
futs = {pool.submit(_fetch_and_transcribe, client, model, cfg, doc, host): (job, doc)
|
||||
for (job, doc, host) in leased}
|
||||
for fut in as_completed(futs):
|
||||
job, doc = futs[fut]
|
||||
try:
|
||||
r = fut.result()
|
||||
conn.execute(
|
||||
"UPDATE documents SET transcript_path=?, content_hash=?, processed_at=datetime('now') "
|
||||
"WHERE doc_id=?", (r["transcript_path"], r["content_hash"], doc["doc_id"]),
|
||||
)
|
||||
h = hashlib.sha256(f"{doc['doc_id']}|extract-v0".encode()).hexdigest()
|
||||
queue.enqueue(conn, job_type="extract", target_id=doc["doc_id"], input_hash=h,
|
||||
parent_doc_id=doc["doc_id"], priority=100)
|
||||
queue.complete(conn, job["job_id"], output_ref=f"gemini {r['n_lines']} lines")
|
||||
conn.commit()
|
||||
done += 1
|
||||
prompt_tok += r["usage"]["prompt_tokens"]
|
||||
out_tok += r["usage"]["output_tokens"]
|
||||
fr = r["usage"]["finish_reason"]
|
||||
log.info("gemini transcribed %s (%d lines, %d in/%d out tok%s)", doc["doc_id"],
|
||||
r["n_lines"], r["usage"]["prompt_tokens"], r["usage"]["output_tokens"],
|
||||
", TRUNCATED" if "MAX_TOKENS" in fr else "")
|
||||
except Exception as e: # noqa: BLE001
|
||||
state = queue.fail(conn, job["job_id"], e)
|
||||
conn.commit()
|
||||
failed += 1
|
||||
log.warning("gemini transcribe failed for %s: %s (→ %s)", doc["doc_id"], e, state)
|
||||
return {"done": done, "failed": failed, "prompt_tokens": prompt_tok, "output_tokens": out_tok}
|
||||
Reference in New Issue
Block a user