37 lines
1.4 KiB
Python
37 lines
1.4 KiB
Python
"""Proposition embedding: dense (bge-m3 via gateway) + optional BM25 sparse (client-side)."""
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
|
|
# Module-level logger, named after this module via __name__.
log = logging.getLogger(__name__)
|
|
|
|
|
|
def dense_embed(sc, texts: list[str]) -> list[list[float]]:
    """Return the dense bge-m3 (1024-d) vectors for *texts* via the gateway /v1/embeddings (§4.3).

    ``sc.embed(texts)`` is called once and its ``"data"`` list is read from the reply.
    The items are ordered by their ``"index"`` field before the vectors are extracted;
    an item without an ``"index"`` key sorts as index 0, and the sort is stable, so
    items with equal indices keep the order in which the reply listed them.
    """

    def _position(item):
        # Missing "index" is treated as 0, matching the gateway-order fallback.
        return item.get("index", 0)

    response = sc.embed(texts)
    ordered = list(response["data"])
    ordered.sort(key=_position)
    return [item["embedding"] for item in ordered]
|
|
|
|
|
|
class SparseEmbedder:
|
|
"""BM25 sparse vectors via FastEmbed `Qdrant/bm25` (the operator's CRM uses this exact model,
|
|
with the collection's `modifier: idf`). Degrades gracefully to dense-only if fastembed is absent."""
|
|
|
|
def __init__(self, model_name: str = "Qdrant/bm25") -> None:
|
|
self.available = False
|
|
self._model = None
|
|
try:
|
|
from fastembed import SparseTextEmbedding
|
|
self._model = SparseTextEmbedding(model_name=model_name)
|
|
self.available = True
|
|
except Exception as e: # noqa: BLE001
|
|
log.warning("fastembed sparse unavailable (%s) — upserting dense-only; add sparse later", e)
|
|
|
|
def embed(self, texts: list[str]) -> list[dict | None]:
|
|
if not self.available or self._model is None:
|
|
return [None] * len(texts)
|
|
out: list[dict | None] = []
|
|
for emb in self._model.embed(texts):
|
|
out.append({"indices": emb.indices.tolist(), "values": emb.values.tolist()})
|
|
return out
|