Phase 0 complete: fuzzy entity tier, incremental sync, Start9 packaging
- Fuzzy tier (backend/ingest/fuzzy_resolve.py + llm.py): local Qwen adjudicates the deterministic resolver's flagged name-variant candidates; merges are durable via entity_merges (deterministic re-runs respect them), losers soft-deleted, logged. Idempotent.
- Incremental sync (backend/ingest/sync.py): re-embeds only rows changed since a watermark (ingest_sync_state); first run / --recreate = full. Tested full→0→1.
- Start9 packaging (start9/0.4): Dockerfile bundles ingest+mcp + fastembed/mcp; "Build search index" action runs the init in a subcontainer; MCP shipped as a manual stdio server (not a daemon); version 0.1.0:44. INGEST_PACKAGING.md.
- backfill.py: factored embed_and_upsert() shared with sync.

Verified end-to-end on synthetic data + live Sparks/Qwen/Qdrant.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
+19
-12
@@ -17,17 +17,9 @@ import qdrant_io
|
||||
import sparse
|
||||
|
||||
|
||||
def run(db, recreate=False, batch=32):
|
||||
conn = sqlite3.connect(db)
|
||||
conn.row_factory = sqlite3.Row
|
||||
chunks = chunking.build_chunks(conn)
|
||||
conn.close()
|
||||
print(f"Built {len(chunks)} chunks from {db}")
|
||||
|
||||
state = qdrant_io.create_collection(recreate=recreate)
|
||||
qdrant_io.ensure_indexes()
|
||||
print(f"Collection '{config.COLLECTION}': {state}")
|
||||
|
||||
def embed_and_upsert(chunks, batch=32, progress=True):
|
||||
"""Embed (dense + sparse) and upsert a list of chunks to Qdrant. Shared by the
|
||||
full backfill and the incremental sync. Returns the number of points written."""
|
||||
total = 0
|
||||
for i in range(0, len(chunks), batch):
|
||||
group = chunks[i:i + batch]
|
||||
@@ -46,8 +38,23 @@ def run(db, recreate=False, batch=32):
|
||||
})
|
||||
qdrant_io.upsert(points)
|
||||
total += len(points)
|
||||
print(f" upserted {total}/{len(chunks)}")
|
||||
if progress:
|
||||
print(f" upserted {total}/{len(chunks)}")
|
||||
return total
|
||||
|
||||
|
||||
def run(db, recreate=False, batch=32):
    """Run the full backfill: chunk the SQLite database and index it in Qdrant.

    Args:
        db: Database path handed to ``sqlite3.connect``.
        recreate: Forwarded to ``qdrant_io.create_collection``.
        batch: Batch size forwarded to ``embed_and_upsert``.

    Prints progress lines for the chunk count, the collection state and
    the final point count reported by ``qdrant_io.count()``.
    """
    conn = sqlite3.connect(db)
    try:
        conn.row_factory = sqlite3.Row
        chunks = chunking.build_chunks(conn)
    finally:
        # Release the database handle even when chunk building raises;
        # previously an exception here left the connection open.
        conn.close()
    print(f"Built {len(chunks)} chunks from {db}")

    state = qdrant_io.create_collection(recreate=recreate)
    qdrant_io.ensure_indexes()
    print(f"Collection '{config.COLLECTION}': {state}")

    embed_and_upsert(chunks, batch=batch)
    print(f"Done. Qdrant '{config.COLLECTION}' now holds {qdrant_io.count()} points.")
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user