"""Client-side BM25 sparse vectors. EMBEDDINGS.md specifies FastEmbed `Qdrant/bm25` so Qdrant applies IDF (via the sparse vector's `modifier: idf`) over OUR corpus. FastEmbed pulls onnxruntime, which has no wheel for this Python (3.14) yet, so this module provides a dependency-free BM25 term-frequency encoder with the same contract: `encode(text) -> {"indices": [...], "values": [...]}`. Qdrant computes IDF server-side from the stored sparse vectors regardless of how indices are assigned, so this is a legitimate corpus-IDF BM25 leg. The ONLY hard requirement is that ingest and query use the SAME encoder — they both import this one. For production, swap `encode()` for FastEmbed `Qdrant/bm25` (and re-index, so ingest and query stay on the same tokenizer). """ import hashlib import math import re _TOKEN_RE = re.compile(r"[a-z0-9]+") def tokenize(text: str): return _TOKEN_RE.findall((text or "").lower()) def _index(token: str) -> int: # Stable unsigned 32-bit index for a token (Qdrant sparse indices are u32). return int.from_bytes(hashlib.md5(token.encode("utf-8")).digest()[:4], "big") def encode(text: str): """Return a sparse vector {indices, values}. Value is 1 + ln(tf) (sublinear term frequency); IDF is applied by Qdrant via modifier:idf.""" tf = {} for tok in tokenize(text): tf[tok] = tf.get(tok, 0) + 1 idx_val = {} for tok, count in tf.items(): idx_val[_index(tok)] = 1.0 + math.log(count) return {"indices": list(idx_val.keys()), "values": list(idx_val.values())}