Files

128 lines
5.0 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""Earnings-call transcripts via Financial Modeling Prep (§4.1, §12 — decision: FMP).
Audio isn't reliably fetchable for large-caps (no uniform feed; ~30-90d replay expiry breaks
backfill), so FMP's transcript API is the backbone and EDGAR filings remain the durable core. FMP
also exposes an earnings *calendar* to trigger ingestion on the day a call drops.
Endpoint paths/params are marked TODO(contract): confirm against the FMP 'stable' docs for the
account tier at integration. Needs config.fmp_api_key.
"""
from __future__ import annotations
import hashlib
import sqlite3
from pathlib import Path
from typing import Any
import requests
FMP_BASE = "https://financialmodelingprep.com/stable"
class FMPClient:
    """Small HTTP client for the Financial Modeling Prep 'stable' API.

    Every request goes through one shared `requests.Session` and carries the API key as the
    `apikey` query parameter.
    """

    def __init__(self, api_key: str, *, base: str = FMP_BASE, timeout: int = 30) -> None:
        """Keep the credentials and connection settings; raise ValueError on an empty key."""
        if not api_key:
            raise ValueError("FMP_API_KEY is required for earnings-call transcripts")
        self.api_key, self.base, self.timeout = api_key, base, timeout
        self.s = requests.Session()

    def _get(self, path: str, **params: Any) -> Any:
        """GET `<base>/<path>` with `params` plus the API key and return the decoded JSON body.

        Non-2xx responses raise via `raise_for_status()`.
        """
        url = f"{self.base}/{path}"
        # The key is merged in last so it always wins over a caller-supplied "apikey".
        query = {**params, "apikey": self.api_key}
        response = self.s.get(url, params=query, timeout=self.timeout)
        response.raise_for_status()
        return response.json()

    # Confirmed against FMP 'stable' 2026-06-07 (v3 is legacy/403). Note singular "earning".
    def transcript_dates(self, symbol: str) -> Any:
        """Index of transcripts available for `symbol`: [{quarter, fiscalYear, date}, ...]."""
        endpoint = "earning-call-transcript-dates"
        return self._get(endpoint, symbol=symbol)

    def transcript(self, symbol: str, *, year: int, quarter: int) -> Any:
        """Fetch a single transcript: [{symbol, period, year, date, content}].

        Treat the returned `date` as the document date; the year/quarter labels are fiscal and
        may not line up with the calendar date of the call.
        """
        endpoint = "earning-call-transcript"
        return self._get(endpoint, symbol=symbol, year=year, quarter=quarter)

    def earnings_calendar(self, *, from_date: str, to_date: str) -> Any:
        """Earnings calendar used as the ingestion trigger: [{symbol, date, epsActual, ...}, ...]."""
        # "from" is a Python keyword, so the range has to be passed as an unpacked dict.
        window = {"from": from_date, "to": to_date}
        return self._get("earnings-calendar", **window)
def ingest_transcript(
    conn: sqlite3.Connection,
    *,
    source_id: str,
    symbol: str,
    year: int,
    quarter: int,
    content: str,
    date: str | None,
    data_dir: Path,
    prompt_version: str = "extract-v0",
) -> tuple[bool, bool]:
    """Store one transcript (content written to disk → transcript_path) and enqueue an 'extract'
    job. Idempotent at the database level. Returns (new_document, new_job).

    Args:
        conn: SQLite connection holding the `documents` table; committed after the insert.
        source_id: Value stored in the row's `source_id` column.
        symbol, year, quarter: Combined into the external id `<symbol>-<year>Q<quarter>`, which
            also names the transcript file and the `earnings:`-prefixed doc_id.
        content: Transcript text; written to `<data_dir>/transcripts/<external_id>.txt`.
        date: Document date stored on the row (may be None).
        data_dir: Root directory under which the `transcripts/` folder is created.
        prompt_version: Mixed into the job's input hash together with the doc_id.

    Returns:
        (False, False) when the INSERT OR IGNORE added no row; otherwise (True, new_job), where
        new_job is True when `queue.enqueue` returned something other than None.
    """
    from ..backfill import queue

    external_id = f"{symbol}-{year}Q{quarter}"
    doc_id = f"earnings:{external_id}"

    tdir = Path(data_dir) / "transcripts"
    tdir.mkdir(parents=True, exist_ok=True)
    tpath = tdir / f"{external_id}.txt"
    # Write with an explicit encoding: the hash below is taken over UTF-8 bytes, and the
    # platform default encoding may differ or be unable to represent the text.
    # NOTE(review): the file is (re)written before the duplicate check, so re-ingesting an
    # existing doc_id with different content replaces the file while the stored content_hash
    # keeps its old value — confirm whether that is intended.
    tpath.write_text(content, encoding="utf-8")
    content_hash = hashlib.sha256(content.encode("utf-8")).hexdigest()

    cur = conn.execute(
        """INSERT OR IGNORE INTO documents
        (doc_id, source_id, kind, external_id, title, date, transcript_path, content_hash, processed_at)
        VALUES (?,?,?,?,?,?,?,?, datetime('now'))""",
        (doc_id, source_id, "earnings_call", external_id, f"{symbol} {year} Q{quarter} call",
         date, str(tpath), content_hash),
    )
    conn.commit()
    # rowcount of 0 means the insert was ignored (row already present): nothing new to extract.
    if not cur.rowcount:
        return (False, False)

    # earnings-call Q&A is the highest-yield text source (§4.1) → priority 40, ahead of filings (50).
    h = hashlib.sha256(f"{doc_id}|{prompt_version}".encode()).hexdigest()
    new_job = queue.enqueue(conn, job_type="extract", target_id=doc_id, input_hash=h,
                            parent_doc_id=doc_id, priority=40) is not None
    return (True, new_job)
def ingest_for_ticker(
    conn: sqlite3.Connection,
    fmp: FMPClient,
    *,
    source_id: str,
    symbol: str,
    data_dir: Path,
    since: str | None = None,
    until: str | None = None,
    limit: int = 8,
) -> tuple[int, int]:
    """Ingest the transcripts listed for `symbol` whose index date lies within [since, until].

    The dates index is filtered first (entries without a date always pass), then at most
    `limit` of the surviving entries are fetched, in index order. Transcripts with empty
    content are skipped. The stored document date is the transcript's own `date`, falling back
    to the index date (FMP fiscal labels are offset). Returns (docs, jobs): how many new
    documents and new jobs were created.
    """

    def in_window(entry: Any) -> bool:
        # Bounds are compared as plain strings; a missing date is never filtered out.
        when = entry.get("date")
        if not when:
            return True
        if since and when < since:
            return False
        if until and when > until:
            return False
        return True

    listing = fmp.transcript_dates(symbol)
    # Anything other than a list (e.g. an error payload) is treated as "no transcripts".
    entries = listing if isinstance(listing, list) else []
    selected = [entry for entry in entries if in_window(entry)]

    docs_added = 0
    jobs_added = 0
    for entry in selected[:limit]:
        fiscal_year, fiscal_quarter = entry["fiscalYear"], entry["quarter"]
        payload = fmp.transcript(symbol, year=fiscal_year, quarter=fiscal_quarter)

        # The endpoint normally returns a one-element list; a bare object is accepted as well.
        if isinstance(payload, list) and payload:
            record = payload[0]
        else:
            record = payload
        record = record or {}

        text = record.get("content")
        if not text:
            continue

        is_new_doc, is_new_job = ingest_transcript(
            conn,
            source_id=source_id,
            symbol=symbol,
            year=fiscal_year,
            quarter=fiscal_quarter,
            content=text,
            date=record.get("date") or entry.get("date"),
            data_dir=data_dir,
        )
        docs_added += int(is_new_doc)
        jobs_added += int(is_new_job)

    return docs_added, jobs_added