Initial commit: Ten31 Signal Engine (ingest, scoring brain, corpus seeds)
This commit is contained in:
Executable
+47
@@ -0,0 +1,47 @@
|
||||
#!/bin/bash
|
||||
# Queued Strike pipeline: waits for the 4-show transcription to finish, then extracts → embeds →
|
||||
# runs the STRIKE2022 two-sided reflexivity test (live vs test). Robust: proceeds with whatever is
|
||||
# transcribed if the worker dies, and the 24h cap is a backstop. All work persists in the DB, so a
|
||||
# crash mid-run is resumable by re-running run-extract / embed-claims / two-sided by hand.
|
||||
# Treat any reference to an unset variable as an error.
set -u

# Every path used below (virtualenv interpreter, log file, SQLite DB) is relative to the project
# root, so a failed cd must stop the script instead of letting later steps run elsewhere.
cd /Users/macpro/Projects/ten31-signal-engine || {
  printf 'strike pipeline: cannot cd to project root, aborting\n' >&2
  exit 1
}

# Interpreter used for every `-m signal_engine` invocation.
PY=.venv/bin/python
# Progress log; say() appends every message to it.
LOG=data/strike_pipeline.log
# Parenthesised SQL list of source_ids, spliced into the `d.source_id IN ...` transcribe queries.
SHOWS="('pod-whatbitcoindid','pod-stephanlivera','pod-kevinrooke','pod-anitaposch')"
|
||||
# Log helper: joins all arguments into one message, prefixes it with a [HH:MM:SS] timestamp,
# prints the line to stdout and appends the same line to the file named by $LOG.
say() {
  local stamp
  stamp=$(date '+%H:%M:%S')
  printf '[%s] %s\n' "$stamp" "$*" | tee -a "$LOG"
}
|
||||
|
||||
say "Strike pipeline QUEUED — waiting for transcription of the 4 independent shows to finish."

# 1) Wait for transcription completion (remaining=0) OR the transcribe worker dying.
#    480 polls x 180 s sleep = 24 h backstop.
wait_reason_logged=0
for ((i = 1; i <= 480; i++)); do
  # Transcribe jobs for the target shows that are still outstanding.
  R=$(sqlite3 data/signal.db "SELECT COUNT(*) FROM backfill_jobs j JOIN documents d ON j.target_id=d.doc_id WHERE j.job_type='transcribe' AND j.state IN ('pending','running','leased') AND d.source_id IN $SHOWS;")
  # Transcribe jobs for the target shows that are done (only reported in the progress line).
  D=$(sqlite3 data/signal.db "SELECT COUNT(*) FROM backfill_jobs j JOIN documents d ON j.target_id=d.doc_id WHERE j.job_type='transcribe' AND j.state='done' AND d.source_id IN $SHOWS;")
  # Number of live worker processes; the pattern matches run-transcribe AND run-transcribe-gemini.
  # pgrep -f matches against the full command line, so a worker is never missed because `ps`
  # clipped its command column to the terminal width. $(( )) strips the padding wc may emit.
  ALIVE=$(( $(pgrep -f 'run-transcribe' | wc -l) ))
  say "transcribe: remaining=$R done=$D worker_alive=$ALIVE (poll $i)"
  if [ "$R" = "0" ]; then
    say "transcription COMPLETE."
    wait_reason_logged=1
    break
  fi
  if [ "$ALIVE" = "0" ]; then
    say "transcribe worker not alive and work remains ($R) — proceeding with partial corpus."
    wait_reason_logged=1
    break
  fi
  sleep 180
done
# Falling out of the loop without a break means the poll budget ran out; record that in the log
# so the reason the pipeline moved on is never ambiguous.
if (( ! wait_reason_logged )); then
  say "24h backstop reached with transcription unfinished — proceeding with partial corpus."
fi
|
||||
|
||||
# 2) Raise pending extract jobs of bitcoin-cluster podcast documents to priority 8
#    (the independent legs + the 19 TFTC for test-mode contrast).
# NOTE(review): this statement selects jobs by parent_doc_id, while the pending-count query in
# step 3 joins extract jobs to documents on target_id — confirm both identify the same jobs.
say "prioritizing + extracting bitcoin-podcast claims (local Qwen on the now-free Spark)..."
prioritize_sql="UPDATE backfill_jobs SET priority=8"
prioritize_sql+=" WHERE job_type='extract' AND state='pending'"
prioritize_sql+=" AND parent_doc_id IN (SELECT d.doc_id FROM documents d JOIN sources s ON d.source_id=s.source_id"
prioritize_sql+=" WHERE s.source_cluster='bitcoin' AND s.kind='podcast');"
sqlite3 data/signal.db "$prioritize_sql"
|
||||
|
||||
# 3) Extract (priority-8 podcasts drain first). Up to six batches, so a transient gateway hiccup
#    in one batch doesn't end extraction; stops early once no bitcoin-podcast extract job is pending.
pending_sql="SELECT COUNT(*) FROM backfill_jobs j"
pending_sql+=" JOIN documents d ON j.target_id=d.doc_id"
pending_sql+=" JOIN sources s ON d.source_id=s.source_id"
pending_sql+=" WHERE j.job_type='extract' AND j.state='pending'"
pending_sql+=" AND s.source_cluster='bitcoin' AND s.kind='podcast';"
for pass in {1..6}; do
  pending_jobs=$(sqlite3 data/signal.db "$pending_sql")
  say "extract pass $pass: $pending_jobs bitcoin-podcast extract jobs pending"
  if [ "$pending_jobs" = "0" ]; then
    break
  fi
  # Drop httpx log lines and keep only the last three lines of the run's output in the log.
  "$PY" -m signal_engine run-extract --limit 250 --max-chunks 4 2>&1 \
    | grep -vE "httpx" \
    | tail -3 \
    | tee -a "$LOG"
done
|
||||
|
||||
# 4) Embed all pending claims → Qdrant
say "embedding claims..."
# Output lines matching any of these alternatives are filtered out before logging.
embed_noise="httpx|HF_TOKEN|huggingface|show_warning|Fetching|files:"
# Only the last three remaining lines are shown and appended to the log.
"$PY" -m signal_engine embed-claims 2>&1 \
  | grep -vE "$embed_noise" \
  | tail -3 \
  | tee -a "$LOG"
|
||||
|
||||
# 5) STRIKE2022 two-sided: live (own_network TFTC/CD/RHR dropped) vs test (kept) — the reflexivity contrast
say "=== STRIKE2022 TWO-SIDED RESULT (live vs test) ==="
# Build the command as an array so each option sits on its own line without backslash continuations.
two_sided_cmd=(
  "$PY" -m signal_engine two-sided
  --conviction STRIKE2022
  --modes live,test
  --dates 2022-12-31,2023-06-30,2023-12-31
  --window-days 180
)
# The full filtered output (no tail) is printed and appended to the log.
"${two_sided_cmd[@]}" 2>&1 \
  | grep -vE "httpx|HF_TOKEN|huggingface|show_warning|Fetching|files:" \
  | tee -a "$LOG"
say "Strike pipeline DONE."
|
||||
Reference in New Issue
Block a user