Initial commit: Ten31 Signal Engine (ingest, scoring brain, corpus seeds)

This commit is contained in:
Keysat
2026-06-15 09:24:29 -05:00
commit a6aec77506
77 changed files with 6263 additions and 0 deletions
+79
View File
@@ -0,0 +1,79 @@
"""Qdrant hybrid collection: create + upsert distilled propositions (§4.3).
Collection mgmt + upserts go DIRECT to Qdrant (§13.2 "(Qdrant direct) :6333"); retrieval goes
through the gateway's /api/search. Named dense vector `bge_m3` (1024-d cosine) + sparse `bm25`
(modifier IDF). Point id is a deterministic UUID5 of claim_id, so re-upsert is idempotent.
"""
from __future__ import annotations
import logging
import sqlite3
import uuid
from qdrant_client import QdrantClient, models
from .embedder import SparseEmbedder, dense_embed
log = logging.getLogger(__name__)

# Collection name and the two named vector slots every point may carry.
COLLECTION = "propositions"
DENSE = "bge_m3"
SPARSE = "bm25"

# Fixed namespace fed to uuid5 so a given claim_id always maps to the same point id.
_NS = uuid.UUID("5f9b7e10-0000-4000-8000-000000000001")

# Filterable payload (§4.3): stance/topic/cluster/date for stance distributions, time-windowed
# consensus, corroboration lookups. NEVER infer stance from vector distance (§2.2/§5.3).
# Each name is read as a column of the `claims` row and copied verbatim into the point payload.
_PAYLOAD_FIELDS = (
    "claim_id",
    "doc_id",
    "source_id",
    "source_cluster",
    "topic_canonical",
    "date",
    "claim_type",
    "time_horizon",
    "confidence",
    "rel_polarity",
    "engages_consensus",
    "counters_position",
    "thesis_seam",
    "salience",
    "claimant",
    "proposition",
)
def get_client(qdrant_url: str) -> QdrantClient:
    """Return a ``QdrantClient`` bound to ``qdrant_url``.

    The client is built with ``prefer_grpc=False`` and ``timeout=60``.
    """
    options = {"url": qdrant_url, "prefer_grpc": False, "timeout": 60}
    return QdrantClient(**options)
def ensure_collection(client: QdrantClient, *, dim: int = 1024) -> bool:
    """Create the hybrid ``COLLECTION`` when the server does not list it yet.

    The collection gets one named dense vector (``DENSE``, cosine distance,
    ``dim`` dimensions) and one named sparse vector (``SPARSE``, IDF modifier).

    Args:
        client: Qdrant client used for both the listing and the creation call.
        dim: Size of the dense vector.

    Returns:
        ``True`` if the collection was created by this call, ``False`` if it
        was already present (in which case nothing is changed).
    """
    existing = client.get_collections().collections
    if any(entry.name == COLLECTION for entry in existing):
        return False

    dense_config = {DENSE: models.VectorParams(size=dim, distance=models.Distance.COSINE)}
    sparse_config = {SPARSE: models.SparseVectorParams(modifier=models.Modifier.IDF)}
    client.create_collection(
        collection_name=COLLECTION,
        vectors_config=dense_config,
        sparse_vectors_config=sparse_config,
    )
    log.info("created Qdrant collection %r (dense %s %dd + sparse %s/idf)", COLLECTION, DENSE, dim, SPARSE)
    return True
def _point_id(claim_id: str) -> str:
    """Return the deterministic point id for ``claim_id``: a UUID5 under ``_NS``, as a string."""
    derived = uuid.uuid5(_NS, claim_id)
    return str(derived)
def upsert_pending(conn: sqlite3.Connection, sc, client: QdrantClient,
                   sparse: SparseEmbedder | None = None, *, batch: int = 64) -> int:
    """Embed + upsert every claim that has no qdrant_point_id yet; back-link the id into SQLite.

    Pending claims are handled in chunks of ``batch`` rows. For each chunk the
    propositions are embedded, the points are upserted into ``COLLECTION``, and
    only then are the point ids written to ``claims.qdrant_point_id`` and
    committed. Chunks completed before a failure therefore stay recorded, and
    the failing chunk stays pending for the next run.

    Args:
        conn: SQLite connection holding the ``claims`` table. Rows are read by
            column name, so the connection must use a name-addressable row
            factory (e.g. ``sqlite3.Row``).
        sc: Handle passed through unchanged to ``dense_embed``.
        client: Qdrant client that receives the upserts.
        sparse: Optional sparse embedder. When ``None``, points carry only the
            dense vector. Each sparse result is read via its ``"indices"`` and
            ``"values"`` keys.
        batch: Number of claims per chunk; must be at least 1.

    Returns:
        The number of claims upserted and back-linked by this call.

    Raises:
        ValueError: If ``batch`` is below 1, or if an embedder returns a
            different number of vectors than the number of texts it was given.
    """
    # A negative step would make range() empty and report 0 while claims are still pending.
    if batch < 1:
        raise ValueError(f"batch must be >= 1, got {batch!r}")

    rows = conn.execute("SELECT * FROM claims WHERE qdrant_point_id IS NULL").fetchall()
    if not rows:
        return 0

    total = 0
    for start in range(0, len(rows), batch):
        chunk = rows[start:start + batch]
        texts = [r["proposition"] for r in chunk]
        dvecs = list(dense_embed(sc, texts))
        svecs = list(sparse.embed(texts)) if sparse is not None else [None] * len(texts)

        # zip() would silently drop the tail on a short result, yet every row in the
        # chunk would still be back-linked below: marked indexed but absent from Qdrant.
        if len(dvecs) != len(texts) or len(svecs) != len(texts):
            raise ValueError(
                f"embedding count mismatch for {len(texts)} texts: "
                f"{len(dvecs)} dense, {len(svecs)} sparse"
            )

        point_ids = [_point_id(r["claim_id"]) for r in chunk]
        points = []
        for r, pid, dv, sv in zip(chunk, point_ids, dvecs, svecs):
            vectors: dict = {DENSE: dv}
            if sv is not None:
                vectors[SPARSE] = models.SparseVector(indices=sv["indices"], values=sv["values"])
            payload = {field: r[field] for field in _PAYLOAD_FIELDS}
            points.append(models.PointStruct(id=pid, vector=vectors, payload=payload))

        client.upsert(collection_name=COLLECTION, points=points)

        # Back-link only after the upsert call returned without raising.
        conn.executemany(
            "UPDATE claims SET qdrant_point_id=? WHERE claim_id=?",
            [(pid, r["claim_id"]) for pid, r in zip(point_ids, chunk)],
        )
        conn.commit()
        total += len(chunk)
    return total