"""Text-document fetcher for the Battery (bitcoin-collateralized lending) corpus and any non-filing, non-audio source: policy primaries (SEC SABs, OCC/FDIC/Fed), lender/issuer blogs, credit-market data. Unlike EDGAR (CIK-driven) and the podcast path (audio→transcribe), these are dated HTML pages, PDFs, or article RSS feeds. We fetch ONCE, extract clean text (HTML via html_to_text, PDF via pypdf), save it, and point documents.transcript_path at the saved text so the extract worker reads it directly (it already supports transcript_path) — this also lets PDFs work, which the worker's on-demand html_to_text fetch can't. A source row must exist first (FK). Lineage/axis live on the source's cluster/notes (set in the seed); policy sources are axis=context and must NOT feed the supply resolver (weight 0) — enforced downstream. """ from __future__ import annotations import hashlib import io import logging import sqlite3 from pathlib import Path import requests from ..backfill import queue from ..extract.html_text import html_to_text from .feeds import fetch_feed log = logging.getLogger(__name__) DEFAULT_UA = "ten31-signal-engine/1.0 (research; contact ops@ten31.xyz)" def _pdf_to_text(data: bytes, *, max_chars: int) -> str: import pypdf reader = pypdf.PdfReader(io.BytesIO(data)) parts: list[str] = [] total = 0 for page in reader.pages: t = page.extract_text() or "" parts.append(t) total += len(t) if total > max_chars: break return "\n".join(parts)[:max_chars] def fetch_clean_text(url: str, *, method: str = "auto", ua: str = DEFAULT_UA, timeout: int = 90, max_chars: int = 300_000) -> str: """Fetch a URL once and return clean text. Auto-detects PDF vs HTML by content-type + magic bytes.""" r = requests.get(url, headers={"User-Agent": ua}, timeout=timeout) r.raise_for_status() ctype = r.headers.get("Content-Type", "").lower() is_pdf = method == "pdf" or "application/pdf" in ctype or r.content[:5] == b"%PDF-" if is_pdf: return _pdf_to_text(r.content, max_chars=max_chars) return html_to_text(r.text, max_chars=max_chars) _BLOCK_MARKERS = ( "aggressive automated scraping", "request access", "access denied", "are you a robot", "enable javascript", "captcha", "verify you are human", "rate limit exceeded", "403 forbidden", "unusual traffic", "checking your browser", ) def _looks_blocked(text: str) -> bool: """Anti-scraping interstitials return 200 + a short access-denied body. Detect so we don't ingest a block page as if it were the document (a real policy/blog doc is long and has no such markers).""" low = text[:2500].lower() return any(m in low for m in _BLOCK_MARKERS) def _doc_id(source_id: str, url: str) -> str: return f"doc:{source_id}:{hashlib.sha256(url.encode()).hexdigest()[:12]}" def ingest_one(conn: sqlite3.Connection, cfg, *, source_id: str, url: str, title: str, date: str | None, method: str = "auto", prompt_version: str = "extract-v0", min_chars: int = 400) -> str | None: """Fetch+store one text document and enqueue extraction. Idempotent on (source_id, url). Returns doc_id if newly ingested, else None (duplicate, too-short, or fetch error → logged).""" doc_id = _doc_id(source_id, url) if conn.execute("SELECT 1 FROM documents WHERE doc_id=?", (doc_id,)).fetchone(): return None ua = getattr(cfg, "user_agent", None) or DEFAULT_UA try: text = fetch_clean_text(url, method=method, ua=ua) except Exception as e: # noqa: BLE001 log.warning("doc fetch failed %s: %s", url, e) return None if not text or len(text) < min_chars: log.warning("doc too short (%d chars), skipping %s", len(text or ""), url) return None if _looks_blocked(text): log.warning("blocked/anti-scrape page detected, skipping %s", url) return None safe = doc_id.replace(":", "_") tpath = Path(cfg.data_dir) / "docs" / f"{safe}.txt" tpath.parent.mkdir(parents=True, exist_ok=True) tpath.write_text(text) content_hash = hashlib.sha256(text.encode()).hexdigest() conn.execute( """INSERT OR IGNORE INTO documents (doc_id, source_id, kind, external_id, url, title, date, transcript_path, content_hash, processed_at) VALUES (?,?,?,?,?,?,?,?,?,datetime('now'))""", (doc_id, source_id, "filing", url, url, title[:300] if title else url, date, str(tpath), content_hash), ) conn.commit() h = hashlib.sha256(f"{doc_id}|{prompt_version}".encode()).hexdigest() queue.enqueue(conn, job_type="extract", target_id=doc_id, input_hash=h, parent_doc_id=doc_id, priority=50) conn.commit() log.info("ingested doc %s (%d chars) for %s", doc_id, len(text), source_id) return doc_id def ingest_manifest(conn: sqlite3.Connection, cfg, path) -> dict: """Batch-ingest the docs listed in a YAML manifest ({docs:[{source,url,title,date,method}]}). Returns {ingested, skipped, missing_source}. Each source must already exist (FK).""" import yaml from pathlib import Path as _Path data = yaml.safe_load(_Path(path).read_text()) or {} docs = data.get("docs", []) ingested = skipped = missing = 0 for d in docs: src = d.get("source") if not conn.execute("SELECT 1 FROM sources WHERE source_id=?", (src,)).fetchone(): log.warning("manifest doc references missing source %r — skipping %s", src, d.get("url")) missing += 1 continue doc_id = ingest_one(conn, cfg, source_id=src, url=d["url"], title=d.get("title", d["url"]), date=d.get("date"), method=d.get("method", "auto")) if doc_id: ingested += 1 else: skipped += 1 return {"ingested": ingested, "skipped": skipped, "missing_source": missing} def ingest_feed_text(conn: sqlite3.Connection, cfg, *, source_id: str, rss_url: str, since: str | None = None, until: str | None = None, limit: int = 50) -> int: """Ingest the ARTICLE bodies behind a text RSS feed (blog/press feed). Each item's link is fetched and stored as a dated text document. Returns count of newly-ingested docs.""" from .feeds import _published_iso parsed = fetch_feed(rss_url, user_agent=getattr(cfg, "user_agent", None) or DEFAULT_UA) n = 0 for entry in parsed.entries: if n >= limit: break link = entry.get("link") if not link: continue date = _published_iso(entry) if since and date and date < since: continue if until and date and date > until: continue if ingest_one(conn, cfg, source_id=source_id, url=link, title=entry.get("title", link), date=date): n += 1 return n