Initial commit: Ten31 Signal Engine (ingest, scoring brain, corpus seeds)

This commit is contained in:
Keysat
2026-06-15 09:24:29 -05:00
commit a6aec77506
77 changed files with 6263 additions and 0 deletions
+127
View File
@@ -0,0 +1,127 @@
"""Earnings-call transcripts via Financial Modeling Prep (§4.1, §12 — decision: FMP).
Audio isn't reliably fetchable for large-caps (no uniform feed; ~30-90d replay expiry breaks
backfill), so FMP's transcript API is the backbone and EDGAR filings remain the durable core. FMP
also exposes an earnings *calendar* to trigger ingestion on the day a call drops.
Endpoint paths/params are marked TODO(contract): confirm against the FMP 'stable' docs for the
account tier at integration. Needs config.fmp_api_key.
"""
from __future__ import annotations
import hashlib
import sqlite3
from pathlib import Path
from typing import Any
import requests
# Root URL of FMP's 'stable' API; FMPClient._get appends the endpoint path to it.
FMP_BASE = "https://financialmodelingprep.com/stable"
class FMPClient:
    """Thin client for the Financial Modeling Prep 'stable' HTTP API.

    One ``requests.Session`` is held for the lifetime of the client so that
    connections are reused across calls. Release it with :meth:`close`, or use
    the client as a context manager (``with FMPClient(key) as fmp: ...``).

    Every request method returns the decoded JSON body unchanged and raises
    ``requests.HTTPError`` on a 4xx/5xx response (via ``raise_for_status``).
    """

    def __init__(self, api_key: str, *, base: str = FMP_BASE, timeout: int = 30) -> None:
        """Create a client.

        Args:
            api_key: FMP API key; sent as the ``apikey`` query parameter.
            base: API root URL; a trailing slash is tolerated.
            timeout: Timeout passed to each request.

        Raises:
            ValueError: If ``api_key`` is empty.
        """
        if not api_key:
            raise ValueError("FMP_API_KEY is required for earnings-call transcripts")
        self.api_key = api_key
        # Normalise so "<base>/" and "<base>" both yield a single slash in _get.
        self.base = base.rstrip("/")
        self.timeout = timeout
        self.s = requests.Session()

    def close(self) -> None:
        """Release the underlying HTTP session and its pooled connections."""
        self.s.close()

    def __enter__(self) -> "FMPClient":
        return self

    def __exit__(self, *exc_info: object) -> None:
        self.close()

    def _get(self, path: str, **params: Any) -> Any:
        """GET ``<base>/<path>`` with ``params`` plus the API key; return parsed JSON."""
        # NOTE(review): the key travels in the query string, so it is part of the
        # request URL that requests embeds in HTTPError messages — avoid logging
        # those exceptions verbatim.
        params["apikey"] = self.api_key
        r = self.s.get(
            f"{self.base}/{path.lstrip('/')}", params=params, timeout=self.timeout
        )
        r.raise_for_status()
        return r.json()

    # Confirmed against FMP 'stable' 2026-06-07 (v3 is legacy/403). Note singular "earning".
    def transcript_dates(self, symbol: str) -> Any:
        """List available transcripts: [{quarter, fiscalYear, date}, ...]."""
        return self._get("earning-call-transcript-dates", symbol=symbol)

    def transcript(self, symbol: str, *, year: int, quarter: int) -> Any:
        """One transcript: [{symbol, period, year, date, content}].

        Use the `date` field as the document date — FMP's year/quarter labels
        are fiscal and can be offset from the call date.
        """
        return self._get("earning-call-transcript", symbol=symbol, year=year, quarter=quarter)

    def earnings_calendar(self, *, from_date: str, to_date: str) -> Any:
        """Earnings calendar (ingestion trigger): [{symbol, date, epsActual, ...}, ...]."""
        # "from" is a Python keyword, so the query parameters are passed as a dict.
        return self._get("earnings-calendar", **{"from": from_date, "to": to_date})
def ingest_transcript(
    conn: sqlite3.Connection,
    *,
    source_id: str,
    symbol: str,
    year: int,
    quarter: int,
    content: str,
    date: str | None,
    data_dir: Path,
    prompt_version: str = "extract-v0",
) -> tuple[bool, bool]:
    """Store one transcript and enqueue an 'extract' job for it.

    The text is written to ``<data_dir>/transcripts/<symbol>-<year>Q<quarter>.txt``
    and a row is inserted into ``documents`` with ``INSERT OR IGNORE``, so a
    repeated call for the same document does not add a second row.

    Args:
        conn: Open SQLite connection; committed after the insert.
        source_id: Value stored in ``documents.source_id``.
        symbol: Ticker; part of the document id and the file name.
        year: Year label used in the document id and title.
        quarter: Quarter label used in the document id and title.
        content: Transcript text.
        date: Document date stored in ``documents.date`` (may be ``None``).
        data_dir: Directory under which ``transcripts/`` is created.
        prompt_version: Mixed into the job's input hash.

    Returns:
        ``(new_document, new_job)``. ``(False, False)`` when the insert added no
        row; otherwise ``new_job`` reports whether ``queue.enqueue`` returned a
        non-``None`` value.
    """
    from ..backfill import queue

    external_id = f"{symbol}-{year}Q{quarter}"
    doc_id = f"earnings:{external_id}"

    tdir = Path(data_dir) / "transcripts"
    tdir.mkdir(parents=True, exist_ok=True)
    tpath = tdir / f"{external_id}.txt"
    # Pin UTF-8: the hash below is computed over UTF-8 bytes, and the platform
    # default encoding may be unable to represent the transcript text at all.
    # NOTE(review): the file is written before the insert, so a repeat call
    # rewrites the file even when the existing row (and its stored
    # content_hash) is left untouched.
    tpath.write_text(content, encoding="utf-8")
    content_hash = hashlib.sha256(content.encode("utf-8")).hexdigest()

    cur = conn.execute(
        """INSERT OR IGNORE INTO documents
        (doc_id, source_id, kind, external_id, title, date, transcript_path, content_hash, processed_at)
        VALUES (?,?,?,?,?,?,?,?, datetime('now'))""",
        (doc_id, source_id, "earnings_call", external_id, f"{symbol} {year} Q{quarter} call",
         date, str(tpath), content_hash),
    )
    conn.commit()
    if not cur.rowcount:
        # Row already present: nothing new was stored, and no job is enqueued.
        # NOTE(review): this also means a run that died between the commit above
        # and the enqueue below is not repaired by calling again — confirm that
        # is acceptable.
        return (False, False)

    # earnings-call Q&A is the highest-yield text source (§4.1) → priority 40, ahead of filings (50).
    h = hashlib.sha256(f"{doc_id}|{prompt_version}".encode()).hexdigest()
    new_job = queue.enqueue(conn, job_type="extract", target_id=doc_id, input_hash=h,
                            parent_doc_id=doc_id, priority=40) is not None
    return (True, new_job)
def _within_window(day: object, since: str | None, until: str | None) -> bool:
    """Return True when *day* lies in the inclusive [since, until] day window."""
    if not isinstance(day, str) or not day:
        # Undated entries cannot be filtered; keep them.
        return True
    # Compare the leading YYYY-MM-DD only, so a timestamped date such as
    # "2024-01-31 17:00:00" still counts as inside until="2024-01-31".
    key = day[:10]
    if since and key < since[:10]:
        return False
    if until and key > until[:10]:
        return False
    return True


def ingest_for_ticker(
    conn: sqlite3.Connection,
    fmp: FMPClient,
    *,
    source_id: str,
    symbol: str,
    data_dir: Path,
    since: str | None = None,
    until: str | None = None,
    limit: int = 8,
) -> tuple[int, int]:
    """Fetch and ingest a ticker's transcripts that fall inside [since, until].

    The transcript-dates index is filtered by its ``date`` field, and the first
    ``limit`` surviving entries (in the order the index returned them) are
    fetched. Each fetched transcript is stored under its own ``date`` when it
    has one, because FMP's fiscal year/quarter labels can be offset from the
    call date; the index entry's date is the fallback.

    Malformed data is skipped, not raised: a non-list index, non-dict entries,
    entries without ``fiscalYear``/``quarter``, and transcripts with no
    ``content``.

    Args:
        conn: SQLite connection handed to ``ingest_transcript``.
        fmp: Client providing ``transcript_dates`` and ``transcript``.
        source_id: Passed through to ``ingest_transcript``.
        symbol: Ticker to ingest.
        data_dir: Passed through to ``ingest_transcript``.
        since: Inclusive lower bound as an ISO date string, or ``None``.
        until: Inclusive upper bound as an ISO date string, or ``None``.
        limit: Maximum number of index entries to fetch.

    Returns:
        ``(docs, jobs)``: how many new documents and new jobs were created.
    """
    dates = fmp.transcript_dates(symbol)
    entries = dates if isinstance(dates, list) else []
    picked = [
        d for d in entries
        if isinstance(d, dict) and _within_window(d.get("date"), since, until)
    ]

    n_docs = n_jobs = 0
    for d in picked[:limit]:
        year, quarter = d.get("fiscalYear"), d.get("quarter")
        if year is None or quarter is None:
            # Without both labels the transcript endpoint cannot be addressed.
            continue
        tr = fmp.transcript(symbol, year=year, quarter=quarter)
        # The endpoint returns a one-element list; tolerate a bare object too.
        item = tr[0] if isinstance(tr, list) and tr else tr
        if not isinstance(item, dict):
            continue
        content = item.get("content") or ""
        if not content:
            continue
        nd, nj = ingest_transcript(
            conn, source_id=source_id, symbol=symbol, year=year, quarter=quarter,
            content=content, date=item.get("date") or d.get("date"), data_dir=data_dir,
        )
        n_docs += int(nd)
        n_jobs += int(nj)
    return n_docs, n_jobs