Initial commit: Ten31 Signal Engine (ingest, scoring brain, corpus seeds)

2026-06-15 09:24:29 -05:00
commit a6aec77506
77 changed files with 6263 additions and 0 deletions
@@ -0,0 +1,159 @@
+"""Text-document fetcher for the Battery (bitcoin-collateralized lending) corpus and any non-filing,
+non-audio source: policy primaries (SEC SABs, OCC/FDIC/Fed), lender/issuer blogs, credit-market data.
+
+Unlike EDGAR (CIK-driven) and the podcast path (audio→transcribe), these are dated HTML pages, PDFs, or
+article RSS feeds. We fetch ONCE, extract clean text (HTML via html_to_text, PDF via pypdf), save it, and
+point documents.transcript_path at the saved text so the extract worker reads it directly (it already
+supports transcript_path) — this also lets PDFs work, which the worker's on-demand html_to_text fetch can't.
+
+A source row must exist first (FK). Lineage/axis live on the source's cluster/notes (set in the seed);
+policy sources are axis=context and must NOT feed the supply resolver (weight 0) — enforced downstream.
+"""
+from __future__ import annotations
+
+import hashlib
+import io
+import logging
+import sqlite3
+from pathlib import Path
+
+import requests
+
+from ..backfill import queue
+from ..extract.html_text import html_to_text
+from .feeds import fetch_feed
+
+# Module-level logger, named after this module's import path.
+log = logging.getLogger(__name__)
+
+# Fallback User-Agent header value: used as the default `ua` for fetches and whenever the config object
+# has no (or an empty) `user_agent` attribute.
+DEFAULT_UA = "ten31-signal-engine/1.0 (research; contact ops@ten31.xyz)"
+
+
+def _pdf_to_text(data: bytes, *, max_chars: int) -> str:
+    """Extract text from in-memory PDF bytes, page by page, capped at ``max_chars`` characters.
+
+    Pages are read in order and reading stops once the collected page text exceeds the cap, so the
+    remaining pages are never extracted. Page texts are joined with newlines; a page that yields no
+    text contributes an empty string.
+    """
+    import pypdf  # resolved at call time, not at module import
+    pages = pypdf.PdfReader(io.BytesIO(data)).pages
+    chunks: list[str] = []
+    remaining = max_chars
+    for pg in pages:
+        chunk = pg.extract_text() or ""
+        chunks.append(chunk)
+        remaining -= len(chunk)
+        if remaining < 0:
+            # Already over budget; the final slice below trims the overshoot.
+            break
+    return "\n".join(chunks)[:max_chars]
+
+
+def fetch_clean_text(url: str, *, method: str = "auto", ua: str = DEFAULT_UA,
+                     timeout: int = 90, max_chars: int = 300_000) -> str:
+    """Fetch a URL once and return clean text. Auto-detects PDF vs HTML by content-type + magic bytes.
+
+    method="pdf" forces the PDF path; any other value relies on detection. `ua` is sent as the
+    User-Agent header, `timeout` is passed to requests, and the returned text is capped at `max_chars`.
+    Raises whatever requests raises on connection errors, timeouts, or a non-2xx status.
+    """
+    r = requests.get(url, headers={"User-Agent": ua}, timeout=timeout)
+    r.raise_for_status()
+    ctype = r.headers.get("Content-Type", "").lower()
+    body = r.content
+    is_pdf = method == "pdf" or "application/pdf" in ctype or body[:5] == b"%PDF-"
+    if is_pdf:
+        return _pdf_to_text(body, max_chars=max_chars)
+    if "charset" not in ctype:
+        # With a text/* Content-Type and no charset, requests falls back to ISO-8859-1, which turns
+        # UTF-8 pages into mojibake. Use the encoding detected from the body instead.
+        r.encoding = r.apparent_encoding or "utf-8"
+    return html_to_text(r.text, max_chars=max_chars)
+
+
+# Lower-case phrases that typically appear on anti-scraping / access-denied interstitial pages.
+_BLOCK_MARKERS = (
+    "aggressive automated scraping",
+    "request access",
+    "access denied",
+    "are you a robot",
+    "enable javascript",
+    "captcha",
+    "verify you are human",
+    "rate limit exceeded",
+    "403 forbidden",
+    "unusual traffic",
+    "checking your browser",
+)
+
+
+def _looks_blocked(text: str) -> bool:
+    """Return True when the start of `text` looks like an anti-scraping interstitial.
+
+    Such pages come back as HTTP 200 with a short access-denied body, so they would otherwise be
+    ingested as if they were the document. Only the first 2500 characters are inspected,
+    case-insensitively, for any of the `_BLOCK_MARKERS` phrases.
+    """
+    head = text[:2500].lower()
+    for marker in _BLOCK_MARKERS:
+        if marker in head:
+            return True
+    return False
+
+
+def _doc_id(source_id: str, url: str) -> str:
+    """Build the deterministic document id: ``doc:<source_id>:<first 12 hex chars of sha256(url)>``."""
+    digest = hashlib.sha256(url.encode()).hexdigest()
+    return "doc:{}:{}".format(source_id, digest[:12])
+
+
+def ingest_one(conn: sqlite3.Connection, cfg, *, source_id: str, url: str, title: str,
+               date: str | None, method: str = "auto", prompt_version: str = "extract-v0",
+               min_chars: int = 400) -> str | None:
+    """Fetch+store one text document and enqueue extraction. Idempotent on (source_id, url).
+
+    The doc_id is derived from (source_id, url); if a documents row with that id already exists, nothing
+    is fetched. Otherwise the URL is fetched, the clean text is saved under <cfg.data_dir>/docs/, a
+    documents row whose transcript_path points at that file is inserted, and an "extract" job is
+    enqueued for it.
+
+    Returns doc_id if newly ingested, else None (duplicate; or fetch error, too-short text, or an
+    anti-scrape block page — those three are logged as warnings).
+    """
+    doc_id = _doc_id(source_id, url)
+    if conn.execute("SELECT 1 FROM documents WHERE doc_id=?", (doc_id,)).fetchone():
+        return None
+    ua = getattr(cfg, "user_agent", None) or DEFAULT_UA
+    try:
+        text = fetch_clean_text(url, method=method, ua=ua)
+    except Exception as e:  # noqa: BLE001
+        # Best-effort: one bad URL must not abort a batch, so log and report "not ingested".
+        log.warning("doc fetch failed %s: %s", url, e)
+        return None
+    if not text or len(text) < min_chars:
+        log.warning("doc too short (%d chars), skipping %s", len(text or ""), url)
+        return None
+    if _looks_blocked(text):
+        log.warning("blocked/anti-scrape page detected, skipping %s", url)
+        return None
+    safe = doc_id.replace(":", "_")  # keep ':' out of the file name
+    tpath = Path(cfg.data_dir) / "docs" / f"{safe}.txt"
+    tpath.parent.mkdir(parents=True, exist_ok=True)
+    # Write UTF-8 explicitly: the locale default can fail on non-ASCII text, and content_hash below
+    # is computed over the UTF-8 encoding of the same text.
+    tpath.write_text(text, encoding="utf-8")
+    content_hash = hashlib.sha256(text.encode()).hexdigest()
+    # NOTE(review): kind is recorded as "filing" for every text document — confirm that is intended.
+    # The URL doubles as external_id; an empty title falls back to the URL.
+    conn.execute(
+        """INSERT OR IGNORE INTO documents
+             (doc_id, source_id, kind, external_id, url, title, date, transcript_path, content_hash, processed_at)
+           VALUES (?,?,?,?,?,?,?,?,?,datetime('now'))""",
+        (doc_id, source_id, "filing", url, url, title[:300] if title else url, date, str(tpath), content_hash),
+    )
+    conn.commit()
+    # The job's input hash combines the doc id with the prompt version.
+    h = hashlib.sha256(f"{doc_id}|{prompt_version}".encode()).hexdigest()
+    queue.enqueue(conn, job_type="extract", target_id=doc_id, input_hash=h,
+                  parent_doc_id=doc_id, priority=50)
+    conn.commit()
+    log.info("ingested doc %s (%d chars) for %s", doc_id, len(text), source_id)
+    return doc_id
+
+
+def ingest_manifest(conn: sqlite3.Connection, cfg, path) -> dict:
+    """Batch-ingest the docs listed in a YAML manifest ({docs:[{source,url,title,date,method}]}).
+
+    Returns {ingested, skipped, missing_source}. Each source must already exist (FK); entries whose
+    source is not in the sources table are counted under missing_source. Entries that are not mappings
+    or have no url are logged and counted as skipped rather than aborting the rest of the batch, as are
+    entries ingest_one declines (duplicate, fetch error, too short, blocked). An empty manifest or a
+    null `docs` key yields all-zero counts.
+    """
+    import yaml
+    data = yaml.safe_load(Path(path).read_text(encoding="utf-8")) or {}
+    docs = data.get("docs") or []  # `docs:` with no value parses as None
+    ingested = skipped = missing = 0
+    for d in docs:
+        if not isinstance(d, dict):
+            log.warning("manifest entry is not a mapping — skipping %r", d)
+            skipped += 1
+            continue
+        src = d.get("source")
+        url = d.get("url")
+        if not conn.execute("SELECT 1 FROM sources WHERE source_id=?", (src,)).fetchone():
+            log.warning("manifest doc references missing source %r — skipping %s", src, url)
+            missing += 1
+            continue
+        if not url:
+            log.warning("manifest doc for source %r has no url — skipping", src)
+            skipped += 1
+            continue
+        doc_id = ingest_one(conn, cfg, source_id=src, url=url, title=d.get("title", url),
+                            date=d.get("date"), method=d.get("method", "auto"))
+        if doc_id:
+            ingested += 1
+        else:
+            skipped += 1
+    return {"ingested": ingested, "skipped": skipped, "missing_source": missing}
+
+
+def ingest_feed_text(conn: sqlite3.Connection, cfg, *, source_id: str, rss_url: str,
+                     since: str | None = None, until: str | None = None, limit: int = 50) -> int:
+    """Ingest the article bodies behind a text RSS feed (blog/press feed).
+
+    Each feed item's link is fetched and stored as a dated text document via ingest_one. Items without
+    a link are ignored. Items whose published date sorts before `since` or after `until` are skipped;
+    items with no date are never filtered out. Stops once `limit` new documents have been ingested.
+    Returns count of newly-ingested docs.
+    """
+    from .feeds import _published_iso
+    agent = getattr(cfg, "user_agent", None) or DEFAULT_UA
+    feed = fetch_feed(rss_url, user_agent=agent)
+    added = 0
+    for item in feed.entries:
+        if added >= limit:
+            break
+        href = item.get("link")
+        if not href:
+            continue
+        published = _published_iso(item)
+        if published:
+            # Date bounds are compared as plain values; each bound applies only when it is set.
+            if (since and published < since) or (until and published > until):
+                continue
+        new_id = ingest_one(conn, cfg, source_id=source_id, url=href,
+                            title=item.get("title", href), date=published)
+        if new_id:
+            added += 1
+    return added