Initial commit: Ten31 Signal Engine (ingest, scoring brain, corpus seeds)
This commit is contained in:
@@ -0,0 +1,79 @@
|
||||
"""Qdrant hybrid collection: create + upsert distilled propositions (§4.3).
|
||||
|
||||
Collection mgmt + upserts go DIRECT to Qdrant (§13.2 "(Qdrant direct) :6333"); retrieval goes
|
||||
through the gateway's /api/search. Named dense vector `bge_m3` (1024-d cosine) + sparse `bm25`
|
||||
(modifier IDF). Point id is a deterministic UUID5 of claim_id, so re-upsert is idempotent.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import sqlite3
|
||||
import uuid
|
||||
|
||||
from qdrant_client import QdrantClient, models
|
||||
|
||||
from .embedder import SparseEmbedder, dense_embed
|
||||
|
||||
log = logging.getLogger(__name__)

# Name of the Qdrant collection and of the two named vectors stored on each point.
COLLECTION = "propositions"
DENSE = "bge_m3"
SPARSE = "bm25"

# Fixed uuid5 namespace: the same claim_id always hashes to the same point id.
_NS = uuid.UUID("5f9b7e10-0000-4000-8000-000000000001")

# Filterable payload (§4.3): stance/topic/cluster/date for stance distributions, time-windowed
# consensus, corroboration lookups. NEVER infer stance from vector distance (§2.2/§5.3).
# Each name is read as a column of the claims row and copied verbatim into the point payload.
_PAYLOAD_FIELDS = (
    "claim_id",
    "doc_id",
    "source_id",
    "source_cluster",
    "topic_canonical",
    "date",
    "claim_type",
    "time_horizon",
    "confidence",
    "rel_polarity",
    "engages_consensus",
    "counters_position",
    "thesis_seam",
    "salience",
    "claimant",
    "proposition",
)
|
||||
|
||||
|
||||
def get_client(qdrant_url: str) -> QdrantClient:
    """Return a Qdrant client for ``qdrant_url`` with gRPC not preferred and a 60 s timeout."""
    client = QdrantClient(url=qdrant_url, prefer_grpc=False, timeout=60)
    return client
|
||||
|
||||
|
||||
def ensure_collection(client: QdrantClient, *, dim: int = 1024) -> bool:
    """Create the hybrid ``COLLECTION`` when the server does not list it yet.

    The collection gets one named dense vector (``DENSE``, cosine distance, ``dim`` dimensions)
    and one named sparse vector (``SPARSE``, IDF modifier).

    Returns:
        ``True`` if this call created the collection, ``False`` if a collection with that
        name already existed (its configuration is not inspected).
    """
    listed = client.get_collections().collections
    if any(entry.name == COLLECTION for entry in listed):
        return False

    dense_params = models.VectorParams(size=dim, distance=models.Distance.COSINE)
    sparse_params = models.SparseVectorParams(modifier=models.Modifier.IDF)
    client.create_collection(
        collection_name=COLLECTION,
        vectors_config={DENSE: dense_params},
        sparse_vectors_config={SPARSE: sparse_params},
    )
    log.info(
        "created Qdrant collection %r (dense %s %dd + sparse %s/idf)",
        COLLECTION, DENSE, dim, SPARSE,
    )
    return True
|
||||
|
||||
|
||||
def _point_id(claim_id: str) -> str:
    """Return the deterministic point id for ``claim_id``: its UUID5 under ``_NS``, as a string."""
    point_uuid = uuid.uuid5(_NS, claim_id)
    return str(point_uuid)
|
||||
|
||||
|
||||
def upsert_pending(conn: sqlite3.Connection, sc, client: QdrantClient,
                   sparse: SparseEmbedder | None = None, *, batch: int = 64) -> int:
    """Embed + upsert every claim that has no qdrant_point_id yet; back-link the id into SQLite.

    Pending claims are handled in chunks of ``batch`` rows. Each chunk is embedded, upserted
    into ``COLLECTION``, back-linked and committed before the next chunk starts, so a failure
    part-way through keeps earlier chunks recorded and leaves the rest pending for a later call.

    Args:
        conn: SQLite connection holding the ``claims`` table. Rows are indexed by column
            name, so the connection needs a name-addressable row factory such as
            ``sqlite3.Row``.
        sc: Passed through unchanged as the first argument of ``dense_embed``.
        client: Qdrant client that receives the upserts.
        sparse: Optional sparse embedder. When ``None``, points carry only the dense vector.
        batch: Maximum number of claims embedded and upserted per round trip.

    Returns:
        The number of claims upserted and back-linked by this call.

    Raises:
        ValueError: If ``batch`` is smaller than 1.
        RuntimeError: If an embedder returns a different number of vectors than the number
            of texts it was given.
    """
    if batch < 1:
        # A negative step would make range() empty and report 0 while rows are still pending.
        raise ValueError(f"batch must be a positive integer, got {batch!r}")

    rows = conn.execute("SELECT * FROM claims WHERE qdrant_point_id IS NULL").fetchall()
    if not rows:
        return 0

    total = 0
    for start in range(0, len(rows), batch):
        chunk = rows[start:start + batch]
        texts = [r["proposition"] for r in chunk]

        # Materialize so the counts can be checked before anything is written.
        dvecs = list(dense_embed(sc, texts))
        svecs = list(sparse.embed(texts)) if sparse is not None else [None] * len(texts)
        if len(dvecs) != len(chunk) or len(svecs) != len(chunk):
            # zip() would silently drop the unmatched rows, yet the back-link below would
            # still mark them as upserted, so they would never be indexed or retried.
            raise RuntimeError(
                f"embedding count mismatch for {len(chunk)} claims: "
                f"{len(dvecs)} dense, {len(svecs)} sparse"
            )

        # Compute each id once; it is used for both the Qdrant point and the SQLite back-link.
        point_ids = [_point_id(r["claim_id"]) for r in chunk]

        points = []
        for r, pid, dv, sv in zip(chunk, point_ids, dvecs, svecs):
            vectors: dict = {DENSE: dv}
            if sv is not None:
                vectors[SPARSE] = models.SparseVector(indices=sv["indices"], values=sv["values"])
            payload = {f: r[f] for f in _PAYLOAD_FIELDS}
            points.append(models.PointStruct(id=pid, vector=vectors, payload=payload))

        client.upsert(collection_name=COLLECTION, points=points)

        # Back-link only after the upsert call returned, then commit this chunk's progress.
        conn.executemany(
            "UPDATE claims SET qdrant_point_id=? WHERE claim_id=?",
            [(pid, r["claim_id"]) for pid, r in zip(point_ids, chunk)],
        )
        conn.commit()
        total += len(chunk)
    return total
|
||||
Reference in New Issue
Block a user