Files

37 lines
1.4 KiB
Python

"""Proposition embedding: dense (bge-m3 via gateway) + optional BM25 sparse (client-side)."""
from __future__ import annotations
import logging
log = logging.getLogger(__name__)
def dense_embed(sc, texts: list[str]) -> list[list[float]]:
    """Embed ``texts`` with dense bge-m3 (1024-d) via the gateway /v1/embeddings (§4.3).

    Args:
        sc: Client object exposing ``embed(texts)``. Its response is read as
            ``resp["data"]``: an iterable of dicts, each carrying an
            ``"embedding"`` and, optionally, an ``"index"``.
        texts: Strings to embed.

    Returns:
        The ``"embedding"`` of every response item, arranged by ascending
        ``"index"``.
    """
    items = list(sc.embed(texts)["data"])
    # Items without an "index" are keyed as 0; list.sort is stable, so items
    # with equal keys keep the order in which the response listed them.
    items.sort(key=lambda item: item.get("index", 0))
    return [item["embedding"] for item in items]
class SparseEmbedder:
    """Client-side BM25 sparse vectors produced by FastEmbed `Qdrant/bm25`.

    The operator's CRM uses this exact model, with the collection's
    `modifier: idf`. If fastembed cannot be imported or the model cannot be
    built, the embedder stays usable but yields ``None`` for every text, so
    callers can fall back to dense-only.
    """

    def __init__(self, model_name: str = "Qdrant/bm25") -> None:
        """Try to load the sparse model; on any failure log a warning and stay disabled.

        Args:
            model_name: Model identifier handed to ``SparseTextEmbedding``.
        """
        self._model = None
        self.available = False
        try:
            # Imported lazily so that a missing fastembed install only disables
            # sparse embedding instead of breaking the import of this module.
            from fastembed import SparseTextEmbedding

            loaded = SparseTextEmbedding(model_name=model_name)
        except Exception as exc:  # noqa: BLE001
            log.warning("fastembed sparse unavailable (%s) — upserting dense-only; add sparse later", exc)
        else:
            self._model = loaded
            self.available = True

    def embed(self, texts: list[str]) -> list[dict | None]:
        """Return one sparse vector per text, or ``None`` per text when disabled.

        Args:
            texts: Strings to embed.

        Returns:
            A list as long as ``texts``. Each entry is a dict with ``"indices"``
            and ``"values"`` lists when the model is loaded; otherwise every
            entry is ``None``.
        """
        if self.available and self._model is not None:
            return [
                {"indices": vec.indices.tolist(), "values": vec.values.tolist()}
                for vec in self._model.embed(texts)
            ]
        return [None] * len(texts)