Initial commit: Ten31 Signal Engine (ingest, scoring brain, corpus seeds)
This commit is contained in:
@@ -0,0 +1,127 @@
|
||||
"""Earnings-call transcripts via Financial Modeling Prep (§4.1, §12 — decision: FMP).
|
||||
|
||||
Audio isn't reliably fetchable for large-caps (no uniform feed; ~30–90d replay expiry breaks
|
||||
backfill), so FMP's transcript API is the backbone and EDGAR filings remain the durable core. FMP
|
||||
also exposes an earnings *calendar* to trigger ingestion on the day a call drops.
|
||||
|
||||
Endpoint paths/params are marked TODO(contract): confirm against the FMP 'stable' docs for the
|
||||
account tier at integration. Needs config.fmp_api_key.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import hashlib
|
||||
import sqlite3
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import requests
|
||||
|
||||
FMP_BASE = "https://financialmodelingprep.com/stable"


class FMPClient:
    """Thin HTTP client for the Financial Modeling Prep 'stable' API.

    Holds one ``requests.Session`` and adds the API key to every request as the ``apikey``
    query parameter. Can be used as a context manager (``with FMPClient(key) as fmp:``) so
    the session is closed when the caller is done.
    """

    def __init__(self, api_key: str, *, base: str = FMP_BASE, timeout: int = 30) -> None:
        """Create a client.

        Args:
            api_key: FMP API key; must be non-empty.
            base: API root URL. A trailing slash is tolerated.
            timeout: Timeout passed to every ``Session.get`` call.

        Raises:
            ValueError: If ``api_key`` is empty.
        """
        if not api_key:
            raise ValueError("FMP_API_KEY is required for earnings-call transcripts")
        self.api_key = api_key
        self.base = base
        self.timeout = timeout
        self.s = requests.Session()

    def close(self) -> None:
        """Close the underlying HTTP session."""
        self.s.close()

    def __enter__(self) -> "FMPClient":
        return self

    def __exit__(self, *exc_info: object) -> None:
        self.close()

    def _get(self, path: str, **params: Any) -> Any:
        """GET ``{base}/{path}`` with ``params`` plus the API key and return the decoded JSON.

        Raises:
            requests.HTTPError: On a non-2xx response. The message has the API key redacted.
        """
        params["apikey"] = self.api_key
        # Normalise the join so a base ending in "/" (or a path starting with one) cannot
        # produce a double slash in the request URL.
        url = f"{self.base.rstrip('/')}/{path.lstrip('/')}"
        try:
            r = self.s.get(url, params=params, timeout=self.timeout)
            r.raise_for_status()
        except requests.HTTPError as exc:
            # The key is sent in the query string, so the stock error message (which embeds
            # the full URL) would leak it into logs. Re-raise the same exception type with
            # the key masked; `from None` drops the unredacted original from the traceback.
            # NOTE: `exc.response.url` still carries the key — do not log it verbatim.
            raise requests.HTTPError(
                str(exc).replace(self.api_key, "***"), response=exc.response
            ) from None
        return r.json()

    # Confirmed against FMP 'stable' 2026-06-07 (v3 is legacy/403). Note singular "earning".
    def transcript_dates(self, symbol: str) -> Any:
        """List available transcripts: [{quarter, fiscalYear, date}, ...]."""
        return self._get("earning-call-transcript-dates", symbol=symbol)

    def transcript(self, symbol: str, *, year: int, quarter: int) -> Any:
        """One transcript: [{symbol, period, year, date, content}]. Use the `date` field as the
        document date — FMP's year/quarter labels are fiscal and can be offset from the call date."""
        return self._get("earning-call-transcript", symbol=symbol, year=year, quarter=quarter)

    def earnings_calendar(self, *, from_date: str, to_date: str) -> Any:
        """Earnings calendar (ingestion trigger): [{symbol, date, epsActual, ...}, ...]."""
        # "from" is a Python keyword, so the query params are passed via dict unpacking.
        return self._get("earnings-calendar", **{"from": from_date, "to": to_date})
|
||||
|
||||
|
||||
def ingest_transcript(
    conn: sqlite3.Connection,
    *,
    source_id: str,
    symbol: str,
    year: int,
    quarter: int,
    content: str,
    date: str | None,
    data_dir: Path,
    prompt_version: str = "extract-v0",
) -> tuple[bool, bool]:
    """Store one transcript and enqueue an 'extract' job for it.

    The text is written to ``<data_dir>/transcripts/<symbol>-<year>Q<quarter>.txt`` and a row
    is inserted into ``documents`` with ``INSERT OR IGNORE``, so a repeated call for the same
    document does not create a second row or job.

    Args:
        conn: Open SQLite connection; this function commits on it.
        source_id: Value stored in ``documents.source_id``.
        symbol: Ticker; used in the document id, title and file name.
        year: Year label used in the document id (callers in this file pass FMP's fiscalYear).
        quarter: Quarter label used in the document id.
        content: Full transcript text.
        date: Document date stored in ``documents.date`` (may be None).
        data_dir: Root data directory; a ``transcripts`` subdirectory is created if missing.
        prompt_version: Mixed into the job's input hash.

    Returns:
        ``(new_document, new_job)``. ``new_document`` is True when the row was inserted;
        ``new_job`` is True when ``queue.enqueue`` returned a non-None value.
    """
    external_id = f"{symbol}-{year}Q{quarter}"
    doc_id = f"earnings:{external_id}"

    tdir = Path(data_dir) / "transcripts"
    tdir.mkdir(parents=True, exist_ok=True)
    tpath = tdir / f"{external_id}.txt"
    # Write with an explicit encoding: the locale default is not UTF-8 everywhere, and
    # transcripts routinely contain typographic characters that would otherwise fail to
    # encode (or be stored inconsistently with the UTF-8 hash below).
    # NOTE(review): the file is rewritten on every call, even when the row already exists
    # and its stored content_hash is left unchanged — confirm that is intended.
    tpath.write_text(content, encoding="utf-8")
    content_hash = hashlib.sha256(content.encode("utf-8")).hexdigest()

    cur = conn.execute(
        """INSERT OR IGNORE INTO documents
           (doc_id, source_id, kind, external_id, title, date, transcript_path, content_hash, processed_at)
           VALUES (?,?,?,?,?,?,?,?, datetime('now'))""",
        (doc_id, source_id, "earnings_call", external_id, f"{symbol} {year} Q{quarter} call",
         date, str(tpath), content_hash),
    )
    conn.commit()
    if not cur.rowcount:
        # The insert was ignored, so the document already exists: nothing to enqueue.
        return (False, False)

    # Imported lazily, and only on the path that actually enqueues a job.
    from ..backfill import queue

    # earnings-call Q&A is the highest-yield text source (§4.1) → priority 40, ahead of filings (50).
    h = hashlib.sha256(f"{doc_id}|{prompt_version}".encode()).hexdigest()
    new_job = queue.enqueue(conn, job_type="extract", target_id=doc_id, input_hash=h,
                            parent_doc_id=doc_id, priority=40) is not None
    return (True, new_job)
|
||||
|
||||
|
||||
def ingest_for_ticker(
    conn: sqlite3.Connection,
    fmp: FMPClient,
    *,
    source_id: str,
    symbol: str,
    data_dir: Path,
    since: str | None = None,
    until: str | None = None,
    limit: int = 8,
) -> tuple[int, int]:
    """Enumerate a ticker's transcripts via the dates index, fetch those in range, and ingest.

    Args:
        conn: Open SQLite connection, passed through to ``ingest_transcript``.
        fmp: Client providing ``transcript_dates`` and ``transcript``.
        source_id: Passed through to ``ingest_transcript``.
        symbol: Ticker to ingest.
        data_dir: Root data directory, passed through to ``ingest_transcript``.
        since: Inclusive lower bound as an ISO date string (compared by calendar day).
        until: Inclusive upper bound as an ISO date string (compared by calendar day).
        limit: Maximum number of index entries to fetch, taken in the order the index
            returns them. Values <= 0 fetch nothing.

    Returns:
        ``(docs, jobs)``: counts of new documents and new jobs reported by
        ``ingest_transcript``.

    Uses each transcript's own `date` as the document date (FMP fiscal labels are offset),
    falling back to the index entry's date. Index entries with no date are not filtered out
    by ``since``/``until``.
    """
    dates = fmp.transcript_dates(symbol)
    entries = dates if isinstance(dates, list) else []

    picked = []
    for d in entries:
        # Skip malformed entries instead of letting one bad row abort the whole batch with
        # a KeyError/AttributeError further down.
        if not isinstance(d, dict) or d.get("fiscalYear") is None or d.get("quarter") is None:
            continue
        dt = d.get("date")
        # Compare on the YYYY-MM-DD prefix only: a value such as "2024-10-31 17:00:00" sorts
        # after the bare bound "2024-10-31" and would be wrongly excluded from an inclusive
        # `until`.
        day = dt[:10] if isinstance(dt, str) and dt else None
        if since and day and day < since[:10]:
            continue
        if until and day and day > until[:10]:
            continue
        picked.append(d)

    n_docs = n_jobs = 0
    # max(..., 0): a negative limit would otherwise slice from the end of the list.
    for d in picked[:max(limit, 0)]:
        year, quarter = d["fiscalYear"], d["quarter"]
        tr = fmp.transcript(symbol, year=year, quarter=quarter)
        # The endpoint returns a one-element list; a bare dict is tolerated as well.
        item = tr[0] if isinstance(tr, list) and tr else tr
        if not isinstance(item, dict):
            continue
        content = item.get("content") or ""
        if not content:
            continue
        nd, nj = ingest_transcript(
            conn, source_id=source_id, symbol=symbol, year=year, quarter=quarter,
            content=content, date=item.get("date") or d.get("date"), data_dir=data_dir,
        )
        n_docs += int(nd)
        n_jobs += int(nj)
    return n_docs, n_jobs
|
||||
Reference in New Issue
Block a user