# ten31-database/backend/ingest/sparse.py

"""Client-side BM25 sparse vectors.

EMBEDDINGS.md specifies FastEmbed `Qdrant/bm25` so Qdrant applies IDF (via the
sparse vector's `modifier: idf`) over OUR corpus. FastEmbed pulls onnxruntime,
which has no wheel for this Python (3.14) yet, so this module provides a
dependency-free BM25 term-frequency encoder with the same contract:
`encode(text) -> {"indices": [...], "values": [...]}`.

Qdrant computes IDF server-side from the stored sparse vectors regardless of how
indices are assigned, so this is a legitimate corpus-IDF BM25 leg. The ONLY hard
requirement is that ingest and query use the SAME encoder — they both import this
one. When FastEmbed is importable, `encode()` uses `Qdrant/bm25` automatically;
moving an existing index from one backend to the other requires a re-index, so
ingest and query stay on the same tokenizer.
"""
import hashlib
import math
import re

# Prefer FastEmbed Qdrant/bm25 (the EMBEDDINGS.md-specified encoder) when it is
# installable — true on the Start9 box (Python 3.11). Fall back to the
# dependency-free encoder below where it is not (e.g. this dev Mac on 3.14).
# Whichever is active, ingest and query in the SAME environment use it, so they
# stay consistent; production rebuilds the index on the box, so it uses FastEmbed
# end-to-end. BACKEND reports which is live.
try:
    # Only the import is guarded, so a genuine bug in the definitions below can
    # never be mistaken for "FastEmbed is unavailable".
    from fastembed import SparseTextEmbedding  # type: ignore
except Exception:
    # Deliberately broad (best-effort): any import-time failure selects the
    # dependency-free encoder. BACKEND records which encoder is live.
    BACKEND = "pure-python-bm25"
    _TOKEN_RE = re.compile(r"[a-z0-9]+")

    def tokenize(text: str) -> list[str]:
        """Lowercase *text* and return its runs of ASCII letters/digits, in order.

        A falsy *text* (``None`` or ``""``) yields an empty list.
        """
        return _TOKEN_RE.findall((text or "").lower())

    def _index(token: str) -> int:
        """Return a stable unsigned 32-bit index for *token*.

        The index is the first four bytes of the token's MD5 digest read
        big-endian (Qdrant sparse indices are u32), so it does not vary
        between processes or runs.
        """
        return int.from_bytes(hashlib.md5(token.encode("utf-8")).digest()[:4], "big")

    def encode(text: str) -> dict[str, list]:
        """Return the sparse vector ``{"indices": [...], "values": [...]}`` for *text*.

        Each value is ``1 + ln(tf)``, where ``tf`` is the term frequency
        accumulated for that index; IDF is left to Qdrant. Indices appear in
        order of first occurrence and are unique within the vector. Empty or
        falsy *text* yields two empty lists.
        """
        # Count per token first so each distinct token is hashed only once.
        term_counts: dict[str, int] = {}
        for tok in tokenize(text):
            term_counts[tok] = term_counts.get(tok, 0) + 1

        # Merge by index: two different tokens can share a 32-bit hash, and
        # Qdrant requires the indices of one sparse vector to be unique.
        index_counts: dict[int, int] = {}
        for tok, count in term_counts.items():
            idx = _index(tok)
            index_counts[idx] = index_counts.get(idx, 0) + count

        return {
            "indices": list(index_counts),
            "values": [1.0 + math.log(count) for count in index_counts.values()],
        }
else:
    BACKEND = "fastembed:Qdrant/bm25"
    _MODEL = None

    def _model():
        """Return the shared ``Qdrant/bm25`` model, creating it on first use."""
        global _MODEL
        if _MODEL is None:
            _MODEL = SparseTextEmbedding(model_name="Qdrant/bm25")
        return _MODEL

    def encode(text: str) -> dict[str, list]:
        """Return the sparse vector ``{"indices": [...], "values": [...]}`` for *text*.

        The text (``""`` when falsy) is embedded with the FastEmbed
        ``Qdrant/bm25`` model; indices and values are converted to plain
        ``int`` / ``float`` lists.
        """
        emb = next(_model().embed([text or ""]))
        return {
            "indices": [int(i) for i in emb.indices],
            "values": [float(v) for v in emb.values],
        }