"""Proposition embedding: dense (bge-m3 via gateway) + optional BM25 sparse (client-side).""" from __future__ import annotations import logging log = logging.getLogger(__name__) def dense_embed(sc, texts: list[str]) -> list[list[float]]: """Dense bge-m3 (1024-d) via the gateway /v1/embeddings (§4.3).""" resp = sc.embed(texts) data = sorted(resp["data"], key=lambda d: d.get("index", 0)) return [d["embedding"] for d in data] class SparseEmbedder: """BM25 sparse vectors via FastEmbed `Qdrant/bm25` (the operator's CRM uses this exact model, with the collection's `modifier: idf`). Degrades gracefully to dense-only if fastembed is absent.""" def __init__(self, model_name: str = "Qdrant/bm25") -> None: self.available = False self._model = None try: from fastembed import SparseTextEmbedding self._model = SparseTextEmbedding(model_name=model_name) self.available = True except Exception as e: # noqa: BLE001 log.warning("fastembed sparse unavailable (%s) — upserting dense-only; add sparse later", e) def embed(self, texts: list[str]) -> list[dict | None]: if not self.available or self._model is None: return [None] * len(texts) out: list[dict | None] = [] for emb in self._model.embed(texts): out.append({"indices": emb.indices.tolist(), "values": emb.values.tolist()}) return out