Initial commit: Ten31 Signal Engine (ingest, scoring brain, corpus seeds)
This commit is contained in:
@@ -0,0 +1,159 @@
|
||||
"""Text-document fetcher for the Battery (bitcoin-collateralized lending) corpus and any non-filing,
|
||||
non-audio source: policy primaries (SEC SABs, OCC/FDIC/Fed), lender/issuer blogs, credit-market data.
|
||||
|
||||
Unlike EDGAR (CIK-driven) and the podcast path (audio→transcribe), these are dated HTML pages, PDFs, or
|
||||
article RSS feeds. We fetch ONCE, extract clean text (HTML via html_to_text, PDF via pypdf), save it, and
|
||||
point documents.transcript_path at the saved text so the extract worker reads it directly (it already
|
||||
supports transcript_path) — this also lets PDFs work, which the worker's on-demand html_to_text fetch can't.
|
||||
|
||||
A source row must exist first (FK). Lineage/axis live on the source's cluster/notes (set in the seed);
|
||||
policy sources are axis=context and must NOT feed the supply resolver (weight 0) — enforced downstream.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import hashlib
|
||||
import io
|
||||
import logging
|
||||
import sqlite3
|
||||
from pathlib import Path
|
||||
|
||||
import requests
|
||||
|
||||
from ..backfill import queue
|
||||
from ..extract.html_text import html_to_text
|
||||
from .feeds import fetch_feed
|
||||
|
||||
# Module-level logger, named after this module's import path.
log = logging.getLogger(__name__)

# User-Agent header value used whenever the caller/config does not supply one.
DEFAULT_UA = "ten31-signal-engine/1.0 (research; contact ops@ten31.xyz)"
|
||||
|
||||
|
||||
def _pdf_to_text(data: bytes, *, max_chars: int) -> str:
    """Extract text from in-memory PDF bytes, capped at ``max_chars`` characters.

    Pages are read in order. Reading stops after the first page that pushes the running
    character count past ``max_chars``; the page texts are then joined with newlines and the
    result is sliced to ``max_chars``.

    Args:
        data: Raw bytes of the PDF.
        max_chars: Upper bound on the length of the returned string.

    Returns:
        The newline-joined page text, at most ``max_chars`` characters long.
    """
    import pypdf  # function-scope import: only needed when a PDF is actually handled

    pages = pypdf.PdfReader(io.BytesIO(data)).pages
    chunks: list[str] = []
    seen = 0
    for pg in pages:
        # extract_text() can come back empty/None for a page; normalise to "".
        page_text = pg.extract_text() or ""
        chunks.append(page_text)
        seen += len(page_text)
        if seen > max_chars:
            # Enough text collected; the final slice trims the overshoot.
            break
    joined = "\n".join(chunks)
    return joined[:max_chars]
|
||||
|
||||
|
||||
def fetch_clean_text(url: str, *, method: str = "auto", ua: str = DEFAULT_UA,
                     timeout: int = 90, max_chars: int = 300_000) -> str:
    """Fetch a URL once and return clean text. Auto-detects PDF vs HTML by content-type + magic bytes.

    The body is treated as a PDF when ``method == "pdf"``, when the Content-Type header mentions
    ``application/pdf``, or when the payload starts with ``%PDF-``. Everything else is passed
    through ``html_to_text``.

    Args:
        url: Address to GET.
        method: ``"pdf"`` forces PDF extraction; any other value leaves detection automatic.
        ua: Value sent as the ``User-Agent`` header.
        timeout: Timeout in seconds handed to ``requests.get``.
        max_chars: Cap on the length of the extracted text.

    Returns:
        The extracted text.

    Raises:
        requests.HTTPError: For 4xx/5xx responses (via ``raise_for_status``). Other exceptions
            raised by ``requests`` or by the extractors propagate unchanged.
    """
    r = requests.get(url, headers={"User-Agent": ua}, timeout=timeout)
    r.raise_for_status()
    ctype = r.headers.get("Content-Type", "").lower()
    is_pdf = method == "pdf" or "application/pdf" in ctype or r.content[:5] == b"%PDF-"
    if is_pdf:
        return _pdf_to_text(r.content, max_chars=max_chars)
    # requests assumes ISO-8859-1 for text/* responses whose header declares no charset, which
    # turns UTF-8 pages into mojibake. When no charset is declared, decode with the encoding
    # detected from the bytes instead (keeping the header-derived one if detection yields nothing).
    if "charset" not in ctype:
        r.encoding = r.apparent_encoding or r.encoding
    return html_to_text(r.text, max_chars=max_chars)
|
||||
|
||||
|
||||
_BLOCK_MARKERS = (
|
||||
"aggressive automated scraping", "request access", "access denied", "are you a robot",
|
||||
"enable javascript", "captcha", "verify you are human", "rate limit exceeded",
|
||||
"403 forbidden", "unusual traffic", "checking your browser",
|
||||
)
|
||||
|
||||
|
||||
def _looks_blocked(text: str) -> bool:
|
||||
"""Anti-scraping interstitials return 200 + a short access-denied body. Detect so we don't ingest
|
||||
a block page as if it were the document (a real policy/blog doc is long and has no such markers)."""
|
||||
low = text[:2500].lower()
|
||||
return any(m in low for m in _BLOCK_MARKERS)
|
||||
|
||||
|
||||
def _doc_id(source_id: str, url: str) -> str:
|
||||
return f"doc:{source_id}:{hashlib.sha256(url.encode()).hexdigest()[:12]}"
|
||||
|
||||
|
||||
def ingest_one(conn: sqlite3.Connection, cfg, *, source_id: str, url: str, title: str,
               date: str | None, method: str = "auto", prompt_version: str = "extract-v0",
               min_chars: int = 400) -> str | None:
    """Fetch+store one text document and enqueue extraction. Idempotent on (source_id, url).

    Steps: derive the doc id from (source_id, url); bail out if a ``documents`` row with that id
    already exists; fetch and clean the text; reject too-short or blocked pages; write the text
    under ``<cfg.data_dir>/docs/``; insert the ``documents`` row pointing ``transcript_path`` at
    that file; enqueue an ``extract`` job.

    Args:
        conn: Open SQLite connection holding the ``documents`` table.
        cfg: Config object; ``cfg.data_dir`` is required, ``cfg.user_agent`` is optional.
        source_id: Id of the owning source row.
        url: Document URL; also stored as ``external_id``.
        title: Title, truncated to 300 characters; the URL is used when it is empty.
        date: Document date, stored as given.
        method: Passed to ``fetch_clean_text`` (``"pdf"`` forces PDF extraction).
        prompt_version: Mixed into the extraction job's ``input_hash``.
        min_chars: Minimum extracted length for the document to be kept.

    Returns:
        doc_id if newly ingested, else None (duplicate, too-short, blocked, or fetch error → logged).
    """
    doc_id = _doc_id(source_id, url)
    if conn.execute("SELECT 1 FROM documents WHERE doc_id=?", (doc_id,)).fetchone():
        return None
    ua = getattr(cfg, "user_agent", None) or DEFAULT_UA
    try:
        text = fetch_clean_text(url, method=method, ua=ua)
    except Exception as e:  # noqa: BLE001
        # Best-effort ingest: one failing URL must not abort a batch, so log and move on.
        log.warning("doc fetch failed %s: %s", url, e)
        return None
    if not text or len(text) < min_chars:
        log.warning("doc too short (%d chars), skipping %s", len(text or ""), url)
        return None
    if _looks_blocked(text):
        log.warning("blocked/anti-scrape page detected, skipping %s", url)
        return None
    # ":" is replaced so the doc id can be used as a file name.
    safe = doc_id.replace(":", "_")
    tpath = Path(cfg.data_dir) / "docs" / f"{safe}.txt"
    tpath.parent.mkdir(parents=True, exist_ok=True)
    # Write UTF-8 explicitly: the platform default encoding may be unable to represent the text,
    # and content_hash below is computed over the UTF-8 bytes.
    # NOTE(review): confirm the extract worker reads transcript_path as UTF-8.
    tpath.write_text(text, encoding="utf-8")
    content_hash = hashlib.sha256(text.encode("utf-8")).hexdigest()
    conn.execute(
        """INSERT OR IGNORE INTO documents
           (doc_id, source_id, kind, external_id, url, title, date, transcript_path, content_hash, processed_at)
           VALUES (?,?,?,?,?,?,?,?,?,datetime('now'))""",
        (doc_id, source_id, "filing", url, url, title[:300] if title else url, date, str(tpath), content_hash),
    )
    conn.commit()
    # The job hash covers doc id + prompt version.
    h = hashlib.sha256(f"{doc_id}|{prompt_version}".encode()).hexdigest()
    queue.enqueue(conn, job_type="extract", target_id=doc_id, input_hash=h,
                  parent_doc_id=doc_id, priority=50)
    conn.commit()
    log.info("ingested doc %s (%d chars) for %s", doc_id, len(text), source_id)
    return doc_id
|
||||
|
||||
|
||||
def ingest_manifest(conn: sqlite3.Connection, cfg, path) -> dict:
    """Batch-ingest the docs listed in a YAML manifest ({docs:[{source,url,title,date,method}]}).

    Each entry's source must already exist in ``sources`` (FK); entries naming an unknown source
    are counted under ``missing_source``. Entries without a ``url``, and entries for which
    ``ingest_one`` returns None (duplicate, too short, blocked, fetch error), count as ``skipped``.

    Args:
        conn: Open SQLite connection holding the ``sources`` and ``documents`` tables.
        cfg: Config object forwarded to ``ingest_one``.
        path: Path to the YAML manifest (read as UTF-8).

    Returns:
        ``{"ingested": int, "skipped": int, "missing_source": int}``.
    """
    import yaml

    data = yaml.safe_load(Path(path).read_text(encoding="utf-8")) or {}
    # `docs:` with no value parses to None; treat that like an absent key.
    docs = data.get("docs") or []
    ingested = skipped = missing = 0
    for d in docs:
        src = d.get("source")
        url = d.get("url")
        if not conn.execute("SELECT 1 FROM sources WHERE source_id=?", (src,)).fetchone():
            log.warning("manifest doc references missing source %r — skipping %s", src, url)
            missing += 1
            continue
        if not url:
            # A malformed entry should not abort the rest of the batch.
            log.warning("manifest doc for source %r has no url — skipping", src)
            skipped += 1
            continue
        # Unquoted YAML dates load as date/datetime objects; ingest_one expects str | None.
        raw_date = d.get("date")
        date = str(raw_date) if raw_date is not None else None
        doc_id = ingest_one(conn, cfg, source_id=src, url=url, title=d.get("title") or url,
                            date=date, method=d.get("method") or "auto")
        if doc_id:
            ingested += 1
        else:
            skipped += 1
    return {"ingested": ingested, "skipped": skipped, "missing_source": missing}
|
||||
|
||||
|
||||
def ingest_feed_text(conn: sqlite3.Connection, cfg, *, source_id: str, rss_url: str,
                     since: str | None = None, until: str | None = None, limit: int = 50) -> int:
    """Ingest the ARTICLE bodies behind a text RSS feed (blog/press feed).

    Every feed entry that has a link is handed to ``ingest_one`` as a dated text document.
    Entries whose published date falls before ``since`` or after ``until`` are skipped; entries
    with no published date are not filtered. Iteration stops once ``limit`` documents have been
    newly ingested (entries that were skipped or already present do not count).

    Args:
        conn: Open SQLite connection forwarded to ``ingest_one``.
        cfg: Config object; ``cfg.user_agent`` is used for the feed fetch when set.
        source_id: Id of the source the articles belong to.
        rss_url: URL of the feed.
        since: Optional lower bound, compared as a string against the entry date.
        until: Optional upper bound, compared as a string against the entry date.
        limit: Maximum number of newly-ingested documents.

    Returns:
        Count of newly-ingested docs.
    """
    from .feeds import _published_iso

    agent = getattr(cfg, "user_agent", None) or DEFAULT_UA
    feed = fetch_feed(rss_url, user_agent=agent)
    added = 0
    for item in feed.entries:
        if added >= limit:
            break
        href = item.get("link")
        if not href:
            continue
        published = _published_iso(item)
        # NOTE(review): plain string comparison — assumes since/until use the same format
        # that _published_iso returns; confirm against .feeds.
        out_of_window = bool(published) and (
            (bool(since) and published < since) or (bool(until) and published > until)
        )
        if out_of_window:
            continue
        created = ingest_one(conn, cfg, source_id=source_id, url=href,
                             title=item.get("title", href), date=published)
        if created:
            added += 1
    return added
|
||||
Reference in New Issue
Block a user