48 lines
3.3 KiB
Bash
Executable File
48 lines
3.3 KiB
Bash
Executable File
#!/bin/bash
|
|
# Queued Strike pipeline: waits for the 4-show transcription to finish, then extracts → embeds →
# runs the STRIKE2022 two-sided reflexivity test (live vs test). Robust: if the transcribe worker
# dies it proceeds with whatever has been transcribed, and the 24h polling cap is a backstop. All
# work persists in the DB, so a crash mid-run is resumable by re-running run-extract /
# embed-claims / two-sided by hand.
|
|
# Treat unset variables as errors.
# NOTE(review): -e looks intentionally omitted — the steps below are best-effort and are
# meant to keep going when an earlier command fails; confirm before enabling it.
set -u

# Every path used below (virtualenv, log file, SQLite DB) is relative to the project root,
# so refuse to continue if we cannot enter it.
PROJECT_DIR=/Users/macpro/Projects/ten31-signal-engine
cd "$PROJECT_DIR" || {
  printf 'cannot cd to %s\n' "$PROJECT_DIR" >&2
  exit 1
}

# Interpreter used for every `-m signal_engine` invocation.
PY=.venv/bin/python
# Log file that say() and the pipeline steps append to.
LOG=data/strike_pipeline.log
# SQL tuple of the four podcast source_ids; spliced verbatim into `IN $SHOWS` clauses.
SHOWS="('pod-whatbitcoindid','pod-stephanlivera','pod-kevinrooke','pod-anitaposch')"
|
|
# Log helper: joins all arguments into one message, prefixes it with the current
# wall-clock time as "[HH:MM:SS] ", prints the line to stdout and appends it to $LOG.
say() {
  local stamp
  stamp=$(date '+%H:%M:%S')
  printf '[%s] %s\n' "$stamp" "$*" | tee -a "$LOG"
}
|
|
|
|
say "Strike pipeline QUEUED — waiting for transcription of the 4 independent shows to finish."

# 1) Wait for transcription completion (remaining=0) OR the transcribe worker dying.
#    Backstop: 480 polls x 180 s of sleep = 24 h.
max_polls=480
poll_secs=180
# Shared tail of both count queries: transcribe jobs whose target document belongs to $SHOWS.
transcribe_jobs="FROM backfill_jobs j JOIN documents d ON j.target_id=d.doc_id WHERE j.job_type='transcribe' AND d.source_id IN $SHOWS"

for (( i = 1; i <= max_polls; i++ )); do
  # R = jobs still to do, D = jobs finished. Either is empty if sqlite3 fails
  # (shown as "?" in the log); an empty R never counts as "complete".
  R=$(sqlite3 data/signal.db "SELECT COUNT(*) $transcribe_jobs AND j.state IN ('pending','running','leased');")
  D=$(sqlite3 data/signal.db "SELECT COUNT(*) $transcribe_jobs AND j.state='done';")

  # Count live workers; the pattern matches run-transcribe AND run-transcribe-gemini.
  # pgrep -f tests the full command line. `ps aux | grep` can see a COMMAND column
  # truncated to the display width, which would make a live worker look dead and end
  # the wait early. The arithmetic expansion strips the padding some wc builds emit.
  ALIVE=$(( $(pgrep -f "run-transcribe" | wc -l) ))

  say "transcribe: remaining=${R:-?} done=${D:-?} worker_alive=$ALIVE (poll $i)"

  if [[ "$R" == "0" ]]; then
    say "transcription COMPLETE."
    break
  fi
  if (( ALIVE == 0 )); then
    say "transcribe worker not alive and work remains (${R:-?}) — proceeding with partial corpus."
    break
  fi

  sleep "$poll_secs"
done

# The loop counter only runs past max_polls when no break fired, i.e. the cap was hit.
if (( i > max_polls )); then
  say "24h cap reached with transcription unfinished (remaining=${R:-?}) — proceeding with partial corpus."
fi
|
|
|
|
# 2) Prioritize bitcoin-cluster podcast extract jobs (the independent legs + the 19 TFTC for test-mode contrast)
say "prioritizing + extracting bitcoin-podcast claims (local Qwen on the now-free Spark)..."

# Raise every pending extract job of a bitcoin-cluster podcast document to priority 8.
# NOTE(review): this selects jobs by parent_doc_id, whereas the pending-count query in
# step 3 joins on target_id — confirm both refer to the same document for extract jobs.
# sqlite3 reports errors on stderr only, so a failure is also recorded in the log here.
sqlite3 data/signal.db "UPDATE backfill_jobs SET priority=8 WHERE job_type='extract' AND state='pending' AND parent_doc_id IN (SELECT d.doc_id FROM documents d JOIN sources s ON d.source_id=s.source_id WHERE s.source_cluster='bitcoin' AND s.kind='podcast');"
rc=$?
if (( rc != 0 )); then
  say "WARN: priority UPDATE failed (sqlite3 exit $rc); continuing with existing job priorities."
fi
|
|
|
|
# 3) Extract (priority-8 podcasts drain first). Loop in batches so a transient gateway hiccup doesn't end it.
max_passes=6
# Counts pending extract jobs whose target document is a bitcoin-cluster podcast.
pending_sql="SELECT COUNT(*) FROM backfill_jobs j JOIN documents d ON j.target_id=d.doc_id JOIN sources s ON d.source_id=s.source_id WHERE j.job_type='extract' AND j.state='pending' AND s.source_cluster='bitcoin' AND s.kind='podcast';"

PEND=""
for (( pass = 1; pass <= max_passes; pass++ )); do
  PEND=$(sqlite3 data/signal.db "$pending_sql")
  say "extract pass $pass: $PEND bitcoin-podcast extract jobs pending"
  [[ "$PEND" == "0" ]] && break

  # Only the last three non-httpx output lines of each batch are kept in the log.
  "$PY" -m signal_engine run-extract --limit 250 --max-chunks 4 2>&1 \
    | grep -vE "httpx" \
    | tail -3 \
    | tee -a "$LOG"
  # The pipeline's own status is tee's; PIPESTATUS[0] is run-extract's real exit code.
  rc=${PIPESTATUS[0]}
  if (( rc != 0 )); then
    say "WARN: run-extract exited with status $rc on pass $pass."
  fi
done

# If the loop ran out of passes rather than draining the queue, record what is left so
# it is visible that later steps run on a partially extracted corpus.
if [[ "$PEND" != "0" ]]; then
  PEND=$(sqlite3 data/signal.db "$pending_sql")
  say "extract passes exhausted ($max_passes): ${PEND:-?} bitcoin-podcast extract jobs pending after final pass."
fi
|
|
|
|
# 4) Embed all pending claims → Qdrant
say "embedding claims..."

# Known-noisy lines are filtered out; only the last three remaining lines reach the log.
"$PY" -m signal_engine embed-claims 2>&1 \
  | grep -vE "httpx|HF_TOKEN|huggingface|show_warning|Fetching|files:" \
  | tail -3 \
  | tee -a "$LOG"
# The pipeline's own status is tee's; PIPESTATUS[0] is embed-claims' real exit code.
rc=${PIPESTATUS[0]}
if (( rc != 0 )); then
  say "WARN: embed-claims exited with status $rc — later results may not reflect all extracted claims."
fi
|
|
|
|
# 5) STRIKE2022 two-sided: live (own_network TFTC/CD/RHR dropped) vs test (kept) — the reflexivity contrast
say "=== STRIKE2022 TWO-SIDED RESULT (live vs test) ==="

# Full (noise-filtered) output goes to both stdout and the log.
"$PY" -m signal_engine two-sided --conviction STRIKE2022 --modes live,test \
  --dates 2022-12-31,2023-06-30,2023-12-31 --window-days 180 2>&1 \
  | grep -vE "httpx|HF_TOKEN|huggingface|show_warning|Fetching|files:" \
  | tee -a "$LOG"
# The pipeline's own status is tee's; PIPESTATUS[0] is two-sided's real exit code.
rc=${PIPESTATUS[0]}

# Do not claim success when the final step failed.
if (( rc == 0 )); then
  say "Strike pipeline DONE."
else
  say "Strike pipeline FINISHED WITH ERRORS — two-sided exited with status $rc."
fi
|