"""Client-side BM25 sparse vectors. EMBEDDINGS.md specifies FastEmbed `Qdrant/bm25` so Qdrant applies IDF (via the sparse vector's `modifier: idf`) over OUR corpus. FastEmbed pulls onnxruntime, which has no wheel for this Python (3.14) yet, so this module provides a dependency-free BM25 term-frequency encoder with the same contract: `encode(text) -> {"indices": [...], "values": [...]}`. Qdrant computes IDF server-side from the stored sparse vectors regardless of how indices are assigned, so this is a legitimate corpus-IDF BM25 leg. The ONLY hard requirement is that ingest and query use the SAME encoder — they both import this one. For production, swap `encode()` for FastEmbed `Qdrant/bm25` (and re-index, so ingest and query stay on the same tokenizer). """ import hashlib import math import re # Prefer FastEmbed Qdrant/bm25 (the EMBEDDINGS.md-specified encoder) when it is # installable — true on the Start9 box (Python 3.11). Fall back to the # dependency-free encoder below where it is not (e.g. this dev Mac on 3.14). # Whichever is active, ingest and query in the SAME environment use it, so they # stay consistent; production rebuilds the index on the box, so it uses FastEmbed # end-to-end. BACKEND reports which is live. try: from fastembed import SparseTextEmbedding # type: ignore _MODEL = None def _model(): global _MODEL if _MODEL is None: _MODEL = SparseTextEmbedding(model_name="Qdrant/bm25") return _MODEL def encode(text: str): emb = next(_model().embed([text or ""])) return {"indices": [int(i) for i in emb.indices], "values": [float(v) for v in emb.values]} BACKEND = "fastembed:Qdrant/bm25" except Exception: BACKEND = "pure-python-bm25" _TOKEN_RE = re.compile(r"[a-z0-9]+") def tokenize(text: str): return _TOKEN_RE.findall((text or "").lower()) def _index(token: str) -> int: # Stable unsigned 32-bit index for a token (Qdrant sparse indices are u32). return int.from_bytes(hashlib.md5(token.encode("utf-8")).digest()[:4], "big") def encode(text: str): """Sparse vector {indices, values}; value = 1 + ln(tf). Qdrant applies IDF.""" tf = {} for tok in tokenize(text): tf[tok] = tf.get(tok, 0) + 1 return {"indices": [_index(t) for t in tf], "values": [1.0 + math.log(c) for c in tf.values()]}