spark-control/scripts/test-audio-with-speakers.sh

#!/bin/bash
# End-to-end test of the v0.10 + v0.11 audio pipeline:
#   audio file  →  spark-control /api/audio/transcribe-with-speakers
#                  (Parakeet + Sortformer merged)
#               →  Qwen3.6 via vLLM with long-form prompt + speaker name
#                  resolution
#               →  ~/Desktop/<filename>-analysis.md
#
# Usage:
#   bash scripts/test-audio-with-speakers.sh <audio-file> [--people "Name1, Name2"]
#
# Env:
#   SPARK_CONTROL — base URL of a running Spark Control instance
#                   (default http://127.0.0.1:9999, i.e. a local dev server;
#                   point it at your installed package URL otherwise)
#   VLLM          — /v1 base URL used for chat/completions
#                   (default $SPARK_CONTROL/v1 — Spark Control proxies vLLM)
#
# Examples:
#   # No participants list (LLM will only resolve speakers it can verify from audio cues)
#   bash scripts/test-audio-with-speakers.sh ~/Library/Application\ Support/hyprnote/sessions/*/audio.mp3
#
#   # With known participants (LLM constrained to these names)
#   bash scripts/test-audio-with-speakers.sh ~/Downloads/podcast.mp3 --people "Dax, Will"
#
# Designed to mirror exactly what recap-relay's spark-control backend will do
# once the PR lands. If the output looks good here, the recap-relay version
# will look the same.

# Abort on the first unhandled command failure.
set -e

USAGE="Usage: $0 <audio-file> [--people \"Name1, Name2\"]"

# $1 (required): path to the audio file. Missing/empty → print usage and exit.
AUDIO="${1:?$USAGE}"

# Remaining arguments: only `--people <value>` is recognised. It may appear
# anywhere after the audio file. Positional parameters are left untouched.
#   - `--people` without a following value is a usage error (exit 2) instead
#     of being silently dropped.
#   - Anything else (e.g. extra files when a glob matched more than one) is
#     reported on stderr and ignored, so the caller knows only $1 is processed.
PEOPLE=""
WANT_PEOPLE_VALUE=0
for ARG in "${@:2}"; do
  if [ "$WANT_PEOPLE_VALUE" = 1 ]; then
    PEOPLE="$ARG"
    WANT_PEOPLE_VALUE=0
  elif [ "$ARG" = "--people" ]; then
    WANT_PEOPLE_VALUE=1
  else
    echo "WARNING: ignoring unexpected argument: $ARG" >&2
  fi
done
if [ "$WANT_PEOPLE_VALUE" = 1 ]; then
  echo "ERROR: --people requires a value" >&2
  echo "$USAGE" >&2
  exit 2
fi

# The audio path must name an existing regular file.
if [ ! -f "$AUDIO" ]; then
  echo "ERROR: audio file not found: $AUDIO" >&2
  exit 1
fi

# Endpoints, overridable from the environment. VLLM defaults to the /v1 path
# under whatever SPARK_CONTROL resolved to.
SPARK_CONTROL="${SPARK_CONTROL:-http://127.0.0.1:9999}"
VLLM="${VLLM:-$SPARK_CONTROL/v1}"

# Run banner: input file (with its `du -h` size), the two endpoints, and the
# participant list (or a placeholder when none was given), then a blank line.
cat <<BANNER
════════════════════════════════════════════════════════════════
Audio:           $AUDIO ($(du -h "$AUDIO" | cut -f1))
Spark Control:   $SPARK_CONTROL
vLLM:            $VLLM
Participants:    ${PEOPLE:-<none — LLM will only resolve speakers from audio cues>}
════════════════════════════════════════════════════════════════

BANNER

# ───────── Stage 1: transcribe + diarize ─────────
# POSTs the audio file as multipart form field "file" to
# $SPARK_CONTROL/api/audio/transcribe-with-speakers, stores the response body
# in /tmp/diarized.json, and aborts unless the HTTP status is exactly 200.
echo "▶ Stage 1: transcribe + diarize (Parakeet + Sortformer in parallel)..."

# curl's -F parser treats ',' and ';' in an unquoted file name as option
# separators, so the path is passed in curl's double-quoted form. Inside that
# form, backslashes and double quotes must themselves be backslash-escaped.
UPLOAD_PATH=${AUDIO//\\/\\\\}
UPLOAD_PATH=${UPLOAD_PATH//\"/\\\"}

START=$(date +%s)
# -o sends the body to the file; -w prints only the status code, which is
# what ends up in $HTTP. -k skips TLS certificate verification.
HTTP=$(curl -sSk -X POST "$SPARK_CONTROL/api/audio/transcribe-with-speakers" \
  -F "file=@\"$UPLOAD_PATH\"" \
  -o /tmp/diarized.json \
  -w "%{http_code}")
END=$(date +%s)
echo "  HTTP $HTTP, $((END - START))s wall time"

if [ "$HTTP" != "200" ]; then
  # Diagnostics belong on stderr so stdout stays clean for callers piping it.
  echo "ERROR — non-200 response. Full body:" >&2
  cat /tmp/diarized.json >&2 || true
  echo >&2
  exit 1
fi

# One-line summary of the diarized result (duration, speaker count, segments).
python3 <<'PY'
import json

with open('/tmp/diarized.json') as fh:
    d = json.load(fh)
print(f"  Duration: {d['duration']}s   Speakers: {d['speakers_detected']}   Segments: {len(d['segments'])}")
PY

# ───────── Stage 2: format transcript ─────────
# Turns every segment of /tmp/diarized.json into one "[timestamp] speaker: text"
# line and writes the result to /tmp/transcript-formatted.txt.
printf '\n'
printf '%s\n' "▶ Stage 2: format diarized transcript as [MM:SS] Speaker_N: text..."
python3 > /tmp/transcript-formatted.txt <<'PY'
import json


def clock(seconds):
    """Render a second count as H:MM:SS, or MM:SS when the hour part is zero."""
    hours, rest = divmod(seconds, 3600)
    minutes, secs = divmod(rest, 60)
    if hours:
        return f"{hours}:{minutes:02d}:{secs:02d}"
    return f"{minutes:02d}:{secs:02d}"


with open('/tmp/diarized.json') as fh:
    segments = json.load(fh)['segments']

# start_ms is floor-divided by 1000 to get whole seconds before formatting.
print("\n".join(
    f"[{clock(seg['start_ms'] // 1000)}] {seg['speaker']}: {seg['text']}"
    for seg in segments
))
PY
printf '  %s formatted lines\n' "$(wc -l < /tmp/transcript-formatted.txt)"
printf '%s\n' "  Sample (first 3):"
# Indent the preview by four spaces.
head -n 3 /tmp/transcript-formatted.txt | sed -e 's/^/    /'

# ───────── Stage 3: discover current LLM ─────────
echo
echo "▶ Stage 3: discover current vLLM model..."
# Note: Spark Control's /v1/models lists *audio* models (STT + TTS voices),
# not the LLM — ask /api/status for the currently loaded vLLM model instead.
#
# The fetch and the parse are separate steps so each failure gets its own
# message: a curl error is no longer hidden behind a JSON traceback, and a
# null/empty current_model stops the run instead of becoming the model "None".
STATUS_JSON=$(curl -sSk "$SPARK_CONTROL/api/status") || {
  echo "ERROR: could not fetch $SPARK_CONTROL/api/status" >&2
  exit 1
}
# sys.exit(<str>) prints the message to stderr and exits with status 1.
MODEL=$(printf '%s' "$STATUS_JSON" | python3 -c '
import json, sys
try:
    model = json.load(sys.stdin)["vllm"]["current_model"]
except (ValueError, KeyError, TypeError) as exc:
    sys.exit(f"ERROR: /api/status has no usable vllm.current_model ({exc!r})")
if not model:
    sys.exit("ERROR: /api/status reports an empty vllm.current_model")
print(model)
') || exit 1
echo "  Model: $MODEL"

# ───────── Stage 4: build LLM request ─────────
# Assembles the chat/completions payload (system prompt + user prompt holding
# the formatted transcript) and writes it as JSON to /tmp/request.json.
# The Python program receives: model name, transcript path, participant list.
printf '\n%s\n' "▶ Stage 4: build LLM request with speaker-name-resolution prompt..."
python3 - "$MODEL" /tmp/transcript-formatted.txt "$PEOPLE" > /tmp/request.json <<'PY'
import json
import sys

model_name, transcript_file, roster = sys.argv[1:4]

with open(transcript_file) as fh:
    transcript_text = fh.read()

# Extra paragraph appended to the naming rules only when a participant list
# was supplied; it starts with a blank line so it reads as its own paragraph.
roster_note = "\n".join([
    "",
    "",
    f"Known participants in this conversation: {roster}",
    "Constrain your speaker→name mappings to this list. Still only assign a",
    "name when the audio cues unambiguously identify which participant is",
    "which — do not guess based on topic or role.",
]) if roster.strip() else ""

system_prompt = " ".join([
    "You are a meeting analyst producing comprehensive long-form notes.",
    "Preserve specific quotes, numbers, dates, names, and decisions verbatim.",
    "Quote speakers directly when they said something memorable.",
    "Generate as many sections as the meeting naturally has.",
    "Do not summarize aggressively — aim for 3000-6000 words for a 60-90 min conversation.",
])

# Doubled braces below are literal braces in the rendered prompt.
user_prompt = f"""You will analyze a transcript with anonymous speaker labels (Speaker_0, Speaker_1, ...).

CRITICAL — speaker name resolution rules:
  Map a speaker label to a real name ONLY when you have direct, unambiguous evidence:
    - The speaker explicitly identifies themselves ("I'm X", "this is X", "my name is X")
    - Another speaker addresses them by name as a vocative ("thanks X", "X, what do you think?")
  If you have ANY doubt, leave the mapping as null. False mappings are worse than no mapping.
  Do NOT infer names from topic context, role descriptions, or weak associations.{roster_note}

OUTPUT FORMAT — produce exactly two parts:

PART 1: A JSON block at the very top of your response with this shape:
```json
{{
  "speaker_mapping": {{
    "Speaker_0": {{"name": "Real Name", "confidence": "high", "evidence": "quoted line + [MM:SS]"}},
    "Speaker_1": {{"name": null, "confidence": null, "evidence": null}}
  }}
}}
```

PART 2: Below the JSON, a structured long-form report with these sections:

# Detailed Discussion Log
Chronological account of every topic discussed, with verbatim quotes from speakers for important points. Aim for 8+ bullets per major topic. Use sub-bullets for examples or supporting detail.

# Decisions Made
Every decision, with who proposed it, who agreed, any dissent, and rationale.

# Action Items
Every action item, with owner, deadline, and any context. Include even minor "I'll think about it" commitments.

# Open Questions
Things raised that weren't resolved, with who raised them.

# Key Quotes
Direct quotes worth preserving, with speaker attribution.

In the report body: use REAL NAMES where you mapped them, and Speaker_N where you couldn't.

---

TRANSCRIPT:

{transcript_text}"""

payload = {
    "model": model_name,
    "messages": [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ],
    "max_tokens": 16000,
    "temperature": 0.3,
    "chat_template_kwargs": {"enable_thinking": False},
}
print(json.dumps(payload))
PY
REQ_BYTES=$(wc -c < /tmp/request.json)
printf '  Request size: %s bytes\n' "$REQ_BYTES"

# ───────── Stage 5: LLM call ─────────
# Sends /tmp/request.json to $VLLM/chat/completions, keeps the raw response in
# /tmp/llm-raw.json, and extracts the assistant message into /tmp/analysis.md.
echo
echo "▶ Stage 5: send to Qwen3.6 (this is the slow part — 30-90s typical)..."
START=$(date +%s)
# The URL is quoted so a value with spaces or glob characters stays one word.
# -k matches the curl calls in stages 1 and 3: VLLM defaults to a path under
# $SPARK_CONTROL, so it must accept the same certificate those requests accept.
curl -sSk "$VLLM/chat/completions" \
  -H "Content-Type: application/json" \
  -d @/tmp/request.json \
  > /tmp/llm-raw.json
END=$(date +%s)
echo "  Wall time: $((END - START))s"

# Extract content. An error payload, malformed JSON, or a missing/non-string
# message is reported on stderr and stops the script here, so a failed call
# is never saved and announced as a finished analysis.
if ! python3 - /tmp/llm-raw.json > /tmp/analysis.md <<'PY'
import json
import sys

with open(sys.argv[1]) as fh:
    raw = fh.read()

try:
    reply = json.loads(raw)
except ValueError:
    sys.exit("ERROR — response is not valid JSON:\n" + raw)

try:
    content = reply["choices"][0]["message"]["content"]
except (KeyError, IndexError, TypeError):
    content = None

if not isinstance(content, str):
    sys.exit("ERROR — unexpected response:\n" + json.dumps(reply, indent=2))

print(content)
PY
then
  echo "ERROR: no analysis produced; raw response kept in /tmp/llm-raw.json" >&2
  exit 1
fi

# ───────── Stage 6: save + display ─────────
# Copies /tmp/analysis.md to ~/Desktop/<audio name without extension>-analysis.md,
# prints a short summary plus the first 30 lines, then tries to open the file.

# File name without directory, then without its last ".ext" (if any).
BASENAME=${AUDIO##*/}
BASENAME=${BASENAME%.*}
DEST_DIR="$HOME/Desktop"
DEST="$DEST_DIR/${BASENAME}-analysis.md"
# Create the target directory if needed so the copy cannot fail on a machine
# without a Desktop folder; `--` guards against names starting with '-'.
mkdir -p -- "$DEST_DIR"
cp -- /tmp/analysis.md "$DEST"
echo
echo "════════════════════════════════════════════════════════════════"
echo "✔ Saved: $DEST"
echo "  ($(wc -l < "$DEST") lines, $(wc -w < "$DEST") words)"
echo "════════════════════════════════════════════════════════════════"
echo
echo "─── Top of the report (speaker mapping JSON, if produced) ───"
head -n 30 "$DEST"
echo "..."
echo
# Opening the report is a convenience only: the file is already saved, so a
# missing `open` command or a failed launch must not make the script exit
# non-zero under `set -e`.
if command -v open >/dev/null 2>&1; then
  open -a "TextEdit" "$DEST" \
    || echo "NOTE: could not open TextEdit; report is at $DEST" >&2
else
  echo "NOTE: 'open' command not available; report is at $DEST" >&2
fi