#!/bin/bash
# End-to-end test of the v0.10 + v0.11 audio pipeline:
#   audio file → spark-control /api/audio/transcribe-with-speakers
#                (Parakeet + Sortformer merged)
#              → Qwen3.6 via vLLM with long-form prompt + speaker name
#                resolution
#              → ~/Desktop/<audio-basename>-analysis.md
#
# Usage:
#   bash scripts/test-audio-with-speakers.sh AUDIO_FILE [--people "Name1, Name2"]
#
# Environment:
#   SPARK_CONTROL  Base URL of spark-control
#                  (default: https://spark.satsflows.com).
#   VLLM           Base URL of the vLLM OpenAI-compatible API, including /v1.
#   KEEP_TMP       Set to 1 to keep the intermediate files for debugging
#                  (their directory is printed on exit).
#
# Examples:
#   # No participants list (LLM will only resolve speakers it can verify from audio cues)
#   bash scripts/test-audio-with-speakers.sh ~/Library/Application\ Support/hyprnote/sessions/*/audio.mp3
#
#   # With known participants (LLM constrained to these names)
#   bash scripts/test-audio-with-speakers.sh ~/Downloads/podcast.mp3 --people "Dax, Will"
#
# Designed to mirror exactly what recap-relay's spark-control backend will do
# once the PR lands. If the output looks good here, the recap-relay version
# will look the same.
#
# Exit status: 0 on success; non-zero when an argument is missing, the audio
# file does not exist, or any pipeline stage fails (including an LLM response
# without usable content — in that case no report is written).

set -euo pipefail

# Python writes UTF-8 transcript/report text to redirected stdout; do not let
# a C/POSIX locale turn that into an encoding error.
export PYTHONIOENCODING=utf-8

# Print an error message to stderr and abort.
die() {
  printf 'ERROR: %s\n' "$*" >&2
  exit 1
}

# ───────── Arguments ─────────
AUDIO="${1:?Usage: $0 AUDIO_FILE [--people \"Name1, Name2\"]}"
PEOPLE=""
if [[ "${2:-}" == "--people" ]]; then
  if [[ -n "${3:-}" ]]; then
    PEOPLE="$3"
  else
    # Kept non-fatal so wrappers that pass an empty list keep working.
    echo "WARNING: --people given without a value; continuing without a participants list" >&2
  fi
elif (( $# > 1 )); then
  echo "WARNING: ignoring unexpected extra arguments: ${*:2}" >&2
fi

if [[ ! -f "$AUDIO" ]]; then
  die "audio file not found: $AUDIO"
fi

SPARK_CONTROL="${SPARK_CONTROL:-https://spark.satsflows.com}"
# NOTE(review): the built-in default has no host component, so it cannot work
# as-is; it is kept unchanged here and rejected below with a clear message.
VLLM="${VLLM:-http://:8888/v1}"
case "$VLLM" in
  http://:* | https://:*)
    die "VLLM has no host ($VLLM); set it explicitly, e.g. VLLM=http://HOST:8888/v1"
    ;;
esac

# ───────── Scratch space ─────────
# Private, unpredictable directory instead of fixed names in /tmp, so that
# concurrent runs do not clobber each other and nothing can be pre-planted.
WORKDIR=$(mktemp -d "${TMPDIR:-/tmp}/audio-speakers.XXXXXX")
cleanup() {
  if [[ "${KEEP_TMP:-0}" == "1" ]]; then
    echo "Intermediate files kept in: $WORKDIR" >&2
  else
    rm -rf -- "${WORKDIR:?}"
  fi
}
trap cleanup EXIT

DIARIZED="$WORKDIR/diarized.json"
FORMATTED="$WORKDIR/transcript-formatted.txt"
REQUEST="$WORKDIR/request.json"
LLM_RAW="$WORKDIR/llm-raw.json"
ANALYSIS="$WORKDIR/analysis.md"

echo "════════════════════════════════════════════════════════════════"
echo "Audio:          $AUDIO ($(du -h "$AUDIO" | cut -f1))"
echo "Spark Control:  $SPARK_CONTROL"
echo "vLLM:           $VLLM"
echo "Participants:   ${PEOPLE:-}"
echo "════════════════════════════════════════════════════════════════"
echo

# ───────── Stage 1: transcribe + diarize ─────────
echo "▶ Stage 1: transcribe + diarize (Parakeet + Sortformer in parallel)..."
START=$(date +%s)
# NOTE(review): -k disables TLS certificate verification. Kept because the
# original relied on it; drop it once the endpoint has a verifiable cert.
HTTP=$(curl -sSk -X POST "$SPARK_CONTROL/api/audio/transcribe-with-speakers" \
  -F "file=@$AUDIO" \
  -o "$DIARIZED" \
  -w "%{http_code}")
END=$(date +%s)
echo "  HTTP $HTTP, $((END - START))s wall time"
if [[ "$HTTP" != "200" ]]; then
  echo "ERROR — non-200 response. Full body:" >&2
  if [[ -f "$DIARIZED" ]]; then
    cat "$DIARIZED" >&2
  fi
  exit 1
fi

python3 - "$DIARIZED" <<'PY'
import json, sys

with open(sys.argv[1], encoding="utf-8") as fh:
    d = json.load(fh)
print(f"  Duration: {d['duration']}s  Speakers: {d['speakers_detected']}  Segments: {len(d['segments'])}")
PY

# ───────── Stage 2: format transcript ─────────
echo
echo "▶ Stage 2: format diarized transcript as [MM:SS] Speaker_N: text..."
python3 - "$DIARIZED" > "$FORMATTED" <<'PY'
import json, sys

with open(sys.argv[1], encoding="utf-8") as fh:
    d = json.load(fh)

out = []
for s in d['segments']:
    # start_ms is in milliseconds; timestamps are rendered at second precision.
    total_s = s['start_ms'] // 1000
    h, m, sec = total_s // 3600, (total_s % 3600) // 60, total_s % 60
    # Only show the hour field once the recording passes the one-hour mark.
    ts = f"{h}:{m:02d}:{sec:02d}" if h else f"{m:02d}:{sec:02d}"
    out.append(f"[{ts}] {s['speaker']}: {s['text']}")
print("\n".join(out))
PY
echo "  $(wc -l < "$FORMATTED") formatted lines"
echo "  Sample (first 3):"
head -3 "$FORMATTED" | sed 's/^/    /'

# ───────── Stage 3: discover current LLM ─────────
echo
echo "▶ Stage 3: discover current vLLM model..."
# -f makes an HTTP error status fail the pipeline (together with pipefail)
# instead of feeding an error page to the JSON parser.
MODEL=$(curl -fsS "$VLLM/models" \
  | python3 -c "import json,sys; print(json.load(sys.stdin)['data'][0]['id'])")
[[ -n "$MODEL" ]] || die "vLLM returned an empty model id from $VLLM/models"
echo "  Model: $MODEL"

# ───────── Stage 4: build LLM request ─────────
echo
echo "▶ Stage 4: build LLM request with speaker-name-resolution prompt..."
python3 - "$MODEL" "$FORMATTED" "$PEOPLE" > "$REQUEST" <<'PY'
import json, sys

model, transcript_path, people = sys.argv[1], sys.argv[2], sys.argv[3]
with open(transcript_path, encoding="utf-8") as fh:
    transcript = fh.read()

# Only added to the prompt when a participants list was supplied.
participants_block = ""
if people.strip():
    participants_block = f"""

Known participants in this conversation: {people}
Constrain your speaker→name mappings to this list. Still only assign a name when
the audio cues unambiguously identify which participant is which — do not guess
based on topic or role."""

system = (
    "You are a meeting analyst producing comprehensive long-form notes. "
    "Preserve specific quotes, numbers, dates, names, and decisions verbatim. "
    "Quote speakers directly when they said something memorable. "
    "Generate as many sections as the meeting naturally has. "
    "Do not summarize aggressively — aim for 3000-6000 words for a 60-90 min conversation."
)

user_prompt = f"""You will analyze a transcript with anonymous speaker labels (Speaker_0, Speaker_1, ...).

CRITICAL — speaker name resolution rules:
Map a speaker label to a real name ONLY when you have direct, unambiguous evidence:
- The speaker explicitly identifies themselves ("I'm X", "this is X", "my name is X")
- Another speaker addresses them by name as a vocative ("thanks X", "X, what do you think?")

If you have ANY doubt, leave the mapping as null. False mappings are worse than no mapping.
Do NOT infer names from topic context, role descriptions, or weak associations.{participants_block}

OUTPUT FORMAT — produce exactly two parts:

PART 1: A JSON block at the very top of your response with this shape:
```json
{{
  "speaker_mapping": {{
    "Speaker_0": {{"name": "Real Name", "confidence": "high", "evidence": "quoted line + [MM:SS]"}},
    "Speaker_1": {{"name": null, "confidence": null, "evidence": null}}
  }}
}}
```

PART 2: Below the JSON, a structured long-form report with these sections:

# Detailed Discussion Log
Chronological account of every topic discussed, with verbatim quotes from speakers for important points. Aim for 8+ bullets per major topic. Use sub-bullets for examples or supporting detail.

# Decisions Made
Every decision, with who proposed it, who agreed, any dissent, and rationale.

# Action Items
Every action item, with owner, deadline, and any context. Include even minor "I'll think about it" commitments.

# Open Questions
Things raised that weren't resolved, with who raised them.

# Key Quotes
Direct quotes worth preserving, with speaker attribution.

In the report body: use REAL NAMES where you mapped them, and Speaker_N where you couldn't.

---

TRANSCRIPT:

{transcript}"""

print(json.dumps({
    "model": model,
    "messages": [
        {"role": "system", "content": system},
        {"role": "user", "content": user_prompt},
    ],
    "max_tokens": 16000,
    "temperature": 0.3,
    "chat_template_kwargs": {"enable_thinking": False},
}))
PY
REQ_BYTES=$(wc -c < "$REQUEST")
echo "  Request size: $REQ_BYTES bytes"

# ───────── Stage 5: LLM call ─────────
echo
echo "▶ Stage 5: send to Qwen3.6 (this is the slow part — 30-90s typical)..."
START=$(date +%s)
# No -f here on purpose: on an HTTP error the JSON error body is still
# captured so the extraction step below can show it.
# --data-binary sends the request file byte-for-byte.
curl -sS "$VLLM/chat/completions" \
  -H "Content-Type: application/json" \
  --data-binary @"$REQUEST" \
  > "$LLM_RAW"
END=$(date +%s)
echo "  Wall time: $((END - START))s"

# Extract content. Anything other than a response carrying message content is
# reported on stderr and aborts the run, so a failed call is never saved as
# if it were a report.
python3 - "$LLM_RAW" > "$ANALYSIS" <<'PY'
import json, sys

with open(sys.argv[1], encoding="utf-8") as fh:
    raw = fh.read()

try:
    r = json.loads(raw)
except ValueError:
    print("ERROR — response is not valid JSON:", file=sys.stderr)
    print(raw[:2000], file=sys.stderr)
    sys.exit(1)

content = None
if isinstance(r, dict) and r.get('choices'):
    content = r['choices'][0].get('message', {}).get('content')

if content is None:
    print('ERROR — unexpected response:', file=sys.stderr)
    print(json.dumps(r, indent=2), file=sys.stderr)
    sys.exit(1)

print(content)
PY

# ───────── Stage 6: save + display ─────────
BASENAME=$(basename -- "$AUDIO")
BASENAME=${BASENAME%.*}   # drop the last extension only
DEST_DIR="$HOME/Desktop"
mkdir -p -- "$DEST_DIR"
DEST="$DEST_DIR/${BASENAME}-analysis.md"
cp -- "$ANALYSIS" "$DEST"

echo
echo "════════════════════════════════════════════════════════════════"
echo "✔ Saved: $DEST"
echo "  ($(wc -l < "$DEST") lines, $(wc -w < "$DEST") words)"
echo "════════════════════════════════════════════════════════════════"
echo
echo "─── Top of the report (speaker mapping JSON, if produced) ───"
head -30 "$DEST"
echo "..."
echo

# Opening the report is a convenience; the report is already saved, so a
# missing `open`/TextEdit (non-macOS host) must not fail the run.
if command -v open > /dev/null 2>&1; then
  open -a "TextEdit" "$DEST" \
    || echo "NOTE: could not open TextEdit; the report is at $DEST" >&2
fi