ten31-database/backend/ingest/backfill.py

#!/usr/bin/env python3
"""Phase-0 Workstream B — backfill the CRM into Qdrant.

Chunk -> dense (bge-m3 via Spark Control) + sparse (BM25 client-side) -> upsert
to Qdrant `crm_chunks` with payload. Idempotent: deterministic point ids mean
re-running upserts in place. Reads the CRM by file path; never sends data to Claude.

    python3 backend/ingest/backfill.py --db data/crm_dev.db --recreate
"""
import argparse
import sqlite3

import chunking
import config
import embed
import qdrant_io
import sparse


def embed_and_upsert(chunks, batch=32, progress=True):
    """Embed (dense + sparse) and upsert a list of chunks to Qdrant.

    Shared by the full backfill and the incremental sync.

    Args:
        chunks: sequence of chunk dicts. Each must carry the keys read here:
            ``point_id``, ``text``, ``lp_id``, ``lp_name``, ``person_id``,
            ``doc_type``, ``date_ts``, ``source_model``, ``source_id`` and
            ``chunk_key``.
        batch: number of chunks embedded and upserted per round trip; must be
            a positive integer.
        progress: when true, print a running ``upserted x/y`` line per batch.

    Returns:
        The number of points written.

    Raises:
        ValueError: if ``batch`` is not positive.
        RuntimeError: if the dense embedder returns a different number of
            vectors than the number of texts it was given.
    """
    if batch <= 0:
        # A negative step would make the range below empty and silently write
        # nothing; zero would fail with an opaque range() error.
        raise ValueError(f"batch must be a positive integer, got {batch!r}")

    # Chunk fields copied verbatim into the Qdrant payload.
    payload_keys = (
        "lp_id", "lp_name", "person_id",
        "doc_type", "date_ts", "text",
        "source_model", "source_id", "chunk_key",
    )

    total = 0
    for i in range(0, len(chunks), batch):
        group = chunks[i:i + batch]
        dense = list(embed.dense_embed([c["text"] for c in group]))
        if len(dense) != len(group):
            # zip() would truncate to the shorter side and drop chunks
            # without any error, so fail loudly instead.
            raise RuntimeError(
                f"dense embedder returned {len(dense)} vectors for {len(group)} texts"
            )
        points = []
        for c, dv in zip(group, dense):
            sv = sparse.encode(c["text"])
            points.append({
                "id": c["point_id"],
                "vector": {"dense": dv, "sparse": {"indices": sv["indices"], "values": sv["values"]}},
                "payload": {key: c[key] for key in payload_keys},
            })
        qdrant_io.upsert(points)
        total += len(points)
        if progress:
            print(f"  upserted {total}/{len(chunks)}")
    return total


def run(db, recreate=False, batch=32):
    """Run the full backfill: chunk the CRM database and load it into Qdrant.

    Args:
        db: path to the SQLite CRM database.
        recreate: forwarded to ``qdrant_io.create_collection``; the CLI flag
            describes it as dropping and recreating the collection first.
        batch: number of chunks per embed/upsert round trip.
    """
    conn = sqlite3.connect(db)
    try:
        conn.row_factory = sqlite3.Row
        chunks = chunking.build_chunks(conn)
    finally:
        # Release the SQLite handle even when chunking raises.
        conn.close()
    print(f"Built {len(chunks)} chunks from {db}")

    state = qdrant_io.create_collection(recreate=recreate)
    qdrant_io.ensure_indexes()
    print(f"Collection '{config.COLLECTION}': {state}")

    embed_and_upsert(chunks, batch=batch)
    print(f"Done. Qdrant '{config.COLLECTION}' now holds {qdrant_io.count()} points.")


def main():
    """Command-line entry point: parse the flags and hand them to ``run``."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--db", default=config.DEFAULT_DB)
    parser.add_argument(
        "--recreate",
        action="store_true",
        help="drop & recreate the collection first",
    )
    parser.add_argument("--batch", type=int, default=32)
    opts = parser.parse_args()
    run(db=opts.db, recreate=opts.recreate, batch=opts.batch)


# Only run the CLI when executed as a script; importing this module (e.g. to
# reuse embed_and_upsert) must not trigger a backfill.
if __name__ == "__main__":
    main()