#!/bin/bash
# Queued Strike pipeline: waits for the 4-show transcription to finish, then
# extracts → embeds → runs the STRIKE2022 two-sided reflexivity test (live vs test).
#
# Robust: proceeds with whatever is transcribed if the worker dies, and the 24h cap
# is a backstop. All work persists in the DB, so a crash mid-run is resumable by
# re-running run-extract / embed-claims / two-sided by hand.
#
# `set -e` is intentionally NOT enabled: `grep -c` exits 1 on a zero count and
# `grep -v` exits 1 when it filters every line, and the stages below are meant to
# keep going after a failed step rather than abort the whole queued run.
set -u

# Every path below is relative to the project root, so a failed cd must stop the run
# instead of letting sqlite3/python/tee operate in whatever directory we started in.
cd /Users/macpro/Projects/ten31-signal-engine || {
  echo "strike pipeline: cannot cd to /Users/macpro/Projects/ten31-signal-engine" >&2
  exit 1
}

PY=.venv/bin/python
LOG=data/strike_pipeline.log
DB=data/signal.db
# SQL list literal of the four independent shows; interpolated verbatim into queries.
SHOWS="('pod-whatbitcoindid','pod-stephanlivera','pod-kevinrooke','pod-anitaposch')"

# Polling budget for the transcription wait: 480 polls x 180 s = 86400 s = 24 h.
MAX_POLLS=480
POLL_SECONDS=180

# say MESSAGE...
# Prints a "[HH:MM:SS] message" line to stdout and appends the same line to $LOG.
say() {
  printf '[%s] %s\n' "$(date '+%H:%M:%S')" "$*" | tee -a "$LOG"
}

say "Strike pipeline QUEUED — waiting for transcription of the 4 independent shows to finish."

# 1) Wait for transcription completion (remaining=0) OR the transcribe worker dying. 24h backstop.
for ((i = 1; i <= MAX_POLLS; i++)); do
  # R = transcribe jobs for the four shows that are still pending/running/leased.
  R=$(sqlite3 "$DB" "SELECT COUNT(*) FROM backfill_jobs j JOIN documents d ON j.target_id=d.doc_id WHERE j.job_type='transcribe' AND j.state IN ('pending','running','leased') AND d.source_id IN $SHOWS;")
  # D = transcribe jobs for the four shows already in state 'done' (logged only).
  D=$(sqlite3 "$DB" "SELECT COUNT(*) FROM backfill_jobs j JOIN documents d ON j.target_id=d.doc_id WHERE j.job_type='transcribe' AND j.state='done' AND d.source_id IN $SHOWS;")
  # The [r] bracket keeps grep from counting its own command line.
  ALIVE=$(ps aux | grep -cE "[r]un-transcribe") # matches run-transcribe AND run-transcribe-gemini
  say "transcribe: remaining=$R done=$D worker_alive=$ALIVE (poll $i)"
  if [ "$R" = "0" ]; then
    say "transcription COMPLETE."
    break
  fi
  if [ "$ALIVE" = "0" ]; then
    say "transcribe worker not alive and work remains ($R) — proceeding with partial corpus."
    break
  fi
  sleep "$POLL_SECONDS"
done

# The counter only passes MAX_POLLS when the loop ran out without hitting a break,
# i.e. the time cap expired; record that instead of falling through silently.
if ((i > MAX_POLLS)); then
  say "transcribe wait hit the $((MAX_POLLS * POLL_SECONDS / 3600))h backstop — proceeding with partial corpus."
fi

# 2) Prioritize bitcoin-cluster podcast extract jobs (the independent legs + the 19 TFTC for test-mode contrast)
say "prioritizing + extracting bitcoin-podcast claims (local Qwen on the now-free Spark)..."
if ! sqlite3 "$DB" "UPDATE backfill_jobs SET priority=8 WHERE job_type='extract' AND state='pending' AND parent_doc_id IN (SELECT d.doc_id FROM documents d JOIN sources s ON d.source_id=s.source_id WHERE s.source_cluster='bitcoin' AND s.kind='podcast');"; then
  say "WARN: sqlite3 failed while prioritizing extract jobs — continuing without re-prioritization."
fi

# 3) Extract (priority-8 podcasts drain first). Loop in batches so a transient gateway hiccup doesn't end it.
# NOTE(review): the UPDATE above selects extract jobs via parent_doc_id, while this
# pending count joins documents via target_id. If extract jobs do not carry the
# document id in target_id, PEND is 0 on the first pass and nothing is extracted —
# confirm against the backfill_jobs schema.
for pass in 1 2 3 4 5 6; do
  PEND=$(sqlite3 "$DB" "SELECT COUNT(*) FROM backfill_jobs j JOIN documents d ON j.target_id=d.doc_id JOIN sources s ON d.source_id=s.source_id WHERE j.job_type='extract' AND j.state='pending' AND s.source_cluster='bitcoin' AND s.kind='podcast';")
  say "extract pass $pass: $PEND bitcoin-podcast extract jobs pending"
  if [ "$PEND" = "0" ]; then
    break
  fi
  # Only the last three non-httpx output lines of each batch are shown and logged.
  "$PY" -m signal_engine run-extract --limit 250 --max-chunks 4 2>&1 \
    | grep -vE "httpx" \
    | tail -3 \
    | tee -a "$LOG"
done

# 4) Embed all pending claims → Qdrant
say "embedding claims..."
"$PY" -m signal_engine embed-claims 2>&1 \
  | grep -vE "httpx|HF_TOKEN|huggingface|show_warning|Fetching|files:" \
  | tail -3 \
  | tee -a "$LOG"

# 5) STRIKE2022 two-sided: live (own_network TFTC/CD/RHR dropped) vs test (kept) — the reflexivity contrast
say "=== STRIKE2022 TWO-SIDED RESULT (live vs test) ==="
"$PY" -m signal_engine two-sided --conviction STRIKE2022 --modes live,test \
  --dates 2022-12-31,2023-06-30,2023-12-31 --window-days 180 2>&1 \
  | grep -vE "httpx|HF_TOKEN|huggingface|show_warning|Fetching|files:" \
  | tee -a "$LOG"

say "Strike pipeline DONE."