# ten31-database/backend/ingest/sparse.py

"""Client-side BM25 sparse vectors.

EMBEDDINGS.md specifies FastEmbed `Qdrant/bm25` so Qdrant applies IDF (via the
sparse vector's `modifier: idf`) over OUR corpus. FastEmbed pulls onnxruntime,
which has no wheel for this Python (3.14) yet, so this module provides a
dependency-free BM25 term-frequency encoder with the same contract:
`encode(text) -> {"indices": [...], "values": [...]}`.

Qdrant computes IDF server-side from the stored sparse vectors regardless of how
indices are assigned, so this is a legitimate corpus-IDF BM25 leg. The ONLY hard
requirement is that ingest and query use the SAME encoder — they both import this
one. For production, swap `encode()` for FastEmbed `Qdrant/bm25` (and re-index, so
ingest and query stay on the same tokenizer).
"""
import hashlib
import math
import re

_TOKEN_RE = re.compile(r"[a-z0-9]+")


def tokenize(text: str):
    """Split *text* into lowercase ASCII alphanumeric tokens.

    The input is lowercased, then each maximal run of ``a-z`` / ``0-9``
    becomes one token; every other character acts as a separator. A falsy
    input (``None`` or ``""``) yields an empty list.
    """
    if not text:
        return []
    lowered = text.lower()
    return [match.group() for match in _TOKEN_RE.finditer(lowered)]


def _index(token: str) -> int:
    """Map *token* to a stable unsigned 32-bit sparse-vector index.

    The index is the first four bytes of the MD5 digest of the token's UTF-8
    encoding, read big-endian, so the same token always yields the same
    index (Qdrant sparse indices are u32).
    """
    # MD5 serves only as a stable bucketing hash here, not for security.
    # Declaring that keeps the call working on FIPS-restricted OpenSSL builds,
    # where a plain hashlib.md5() is rejected; the digest itself is unchanged.
    digest = hashlib.md5(token.encode("utf-8"), usedforsecurity=False).digest()
    return int.from_bytes(digest[:4], "big")


def encode(text: str):
    """Return a sparse vector ``{"indices": [...], "values": [...]}`` for *text*.

    Each value is ``1 + ln(tf)`` (sublinear term frequency); IDF is applied by
    Qdrant via modifier:idf. Text that produces no tokens yields two empty
    lists.

    Term frequency is accumulated per sparse index rather than per token:
    the index is the term's identity on the Qdrant side, so distinct tokens
    that hash to the same index are counted together instead of the later
    one silently overwriting the earlier one.
    """
    # First pass: occurrences per distinct token, so each token is hashed once.
    token_counts = {}
    for tok in tokenize(text):
        token_counts[tok] = token_counts.get(tok, 0) + 1

    # Second pass: fold token counts into per-index counts (merges collisions).
    index_counts = {}
    for tok, count in token_counts.items():
        idx = _index(tok)
        index_counts[idx] = index_counts.get(idx, 0) + count

    return {
        "indices": list(index_counts),
        "values": [1.0 + math.log(count) for count in index_counts.values()],
    }