#!/usr/bin/env python3
"""Phase-0 Workstream B — backfill the CRM into Qdrant.

Chunk -> dense (bge-m3 via Spark Control) + sparse (BM25 client-side) -> upsert
to Qdrant `crm_chunks` with payload. Idempotent: deterministic point ids mean
re-running upserts in place. Reads the CRM by file path; never sends data to
Claude.

    python3 backend/ingest/backfill.py --db data/crm_dev.db --recreate
"""
import argparse
import contextlib
import sqlite3

import chunking
import config
import embed
import qdrant_io
import sparse


def embed_and_upsert(chunks, batch: int = 32, progress: bool = True) -> int:
    """Embed (dense + sparse) and upsert a list of chunks to Qdrant.

    Shared by the full backfill and the incremental sync.

    Args:
        chunks: Sliceable sequence of chunk mappings. Each one must provide the
            keys read below: ``point_id``, ``text``, ``lp_id``, ``lp_name``,
            ``person_id``, ``doc_type``, ``date_ts``, ``source_model``,
            ``source_id`` and ``chunk_key``.
        batch: Number of chunks embedded and upserted per round trip; must be
            at least 1.
        progress: When true, print a running count after every batch.

    Returns:
        The number of points written.

    Raises:
        ValueError: If ``batch`` is smaller than 1.
        RuntimeError: If the dense embedder returns a different number of
            vectors than the number of texts it was given.
    """
    # A zero step makes range() raise an opaque error and a negative step makes
    # it empty, which would silently write nothing; reject both up front.
    if batch < 1:
        raise ValueError(f"batch must be a positive integer, got {batch!r}")

    total = 0
    for start in range(0, len(chunks), batch):
        group = chunks[start:start + batch]
        # Materialise the result so its length can be checked whatever
        # iterable type the embedder hands back.
        dense = list(embed.dense_embed([c["text"] for c in group]))
        if len(dense) != len(group):
            # zip() would otherwise drop the unmatched chunks without a trace.
            raise RuntimeError(
                f"dense embedder returned {len(dense)} vectors "
                f"for {len(group)} texts"
            )

        points = []
        for c, dv in zip(group, dense):
            # The sparse vector is computed locally, one chunk at a time.
            sv = sparse.encode(c["text"])
            points.append({
                "id": c["point_id"],
                "vector": {
                    "dense": dv,
                    "sparse": {
                        "indices": sv["indices"],
                        "values": sv["values"],
                    },
                },
                "payload": {
                    "lp_id": c["lp_id"],
                    "lp_name": c["lp_name"],
                    "person_id": c["person_id"],
                    "doc_type": c["doc_type"],
                    "date_ts": c["date_ts"],
                    "text": c["text"],
                    "source_model": c["source_model"],
                    "source_id": c["source_id"],
                    "chunk_key": c["chunk_key"],
                },
            })

        qdrant_io.upsert(points)
        total += len(points)
        if progress:
            print(f" upserted {total}/{len(chunks)}")
    return total


def run(db, recreate: bool = False, batch: int = 32) -> None:
    """Build every chunk from the CRM database and load it into Qdrant.

    Args:
        db: Path of the SQLite database to read.
        recreate: Forwarded to ``qdrant_io.create_collection``; the CLI help
            describes it as dropping and recreating the collection first.
        batch: Batch size forwarded to ``embed_and_upsert``.
    """
    # closing() guarantees the connection is released even when chunk building
    # fails; sqlite3's own context manager only manages transactions.
    with contextlib.closing(sqlite3.connect(db)) as conn:
        conn.row_factory = sqlite3.Row
        chunks = chunking.build_chunks(conn)
    print(f"Built {len(chunks)} chunks from {db}")

    state = qdrant_io.create_collection(recreate=recreate)
    qdrant_io.ensure_indexes()
    print(f"Collection '{config.COLLECTION}': {state}")

    embed_and_upsert(chunks, batch=batch)
    print(f"Done. Qdrant '{config.COLLECTION}' now holds {qdrant_io.count()} points.")


def main() -> None:
    """Parse the command-line arguments and run the backfill."""
    ap = argparse.ArgumentParser()
    ap.add_argument("--db", default=config.DEFAULT_DB)
    ap.add_argument("--recreate", action="store_true",
                    help="drop & recreate the collection first")
    ap.add_argument("--batch", type=int, default=32)
    args = ap.parse_args()
    run(args.db, recreate=args.recreate, batch=args.batch)


if __name__ == "__main__":
    main()