9ef9226e0a
- CLAUDE.md trimmed to whole-repo facts (58 lines); subsystem guidance
moved to .claude/rules/{startos-package,fastapi-image,redaction,
audio-speech}.md with paths: frontmatter so each loads only when
matching files are touched
- .gitignore: track .claude/rules/ while keeping the rest of .claude/
(settings.local.json) ignored
- test-audio-with-speakers.sh: require audio-file arg in docs, replace
owner-specific SPARK_CONTROL/VLLM defaults with generic ones
(localhost dev server + Spark Control vLLM proxy), discover the
loaded LLM via /api/status since /v1/models lists audio models only
- document REDACTION_MAP_DB + CONNECTIVITY_LOG as required for local
dev (/data only exists in the container)
- prettier pass over startos/actions (formatting drift)
223 lines
8.6 KiB
Bash
Executable File
223 lines
8.6 KiB
Bash
Executable File
#!/bin/bash
|
|
# End-to-end test of the v0.10 + v0.11 audio pipeline:
#   audio file → spark-control /api/audio/transcribe-with-speakers
#                (Parakeet + Sortformer merged)
#              → Qwen3.6 via vLLM with long-form prompt + speaker name
#                resolution
#              → ~/Desktop/<filename>-analysis.md
#
# Usage:
#   bash scripts/test-audio-with-speakers.sh <audio-file> [--people "Name1, Name2"]
#
# Env:
#   SPARK_CONTROL — base URL of a running Spark Control instance
#                   (default http://127.0.0.1:9999, i.e. a local dev server;
#                   point it at your installed package URL otherwise)
#   VLLM          — /v1 base URL used for chat/completions
#                   (default $SPARK_CONTROL/v1 — Spark Control proxies vLLM)
#
# Examples:
#   # No participants list (LLM will only resolve speakers it can verify from audio cues)
#   bash scripts/test-audio-with-speakers.sh ~/Library/Application\ Support/hyprnote/sessions/*/audio.mp3
#
#   # With known participants (LLM constrained to these names)
#   bash scripts/test-audio-with-speakers.sh ~/Downloads/podcast.mp3 --people "Dax, Will"
#
# Designed to mirror exactly what recap-relay's spark-control backend will do
# once the PR lands. If the output looks good here, the recap-relay version
# will look the same.
|
|
|
|
# Strict mode: stop on a failing command (-e), on use of an unset variable
# (-u), and when any stage of a pipeline fails (pipefail).
set -euo pipefail

# ───────── Arguments ─────────
#   $1            path to the audio file (required — the :? expansion aborts
#                 with the usage text when it is missing or empty)
#   --people "…"  optional participant names, forwarded verbatim to the LLM
#                 prompt in Stage 4
AUDIO="${1:?Usage: $0 <audio-file> [--people \"Name1, Name2\"]}"
PEOPLE=""
if [[ "${2:-}" == "--people" && -n "${3:-}" ]]; then
  PEOPLE="$3"
elif (( $# > 1 )); then
  # The run still proceeds without a participants list, but say so: the
  # pipeline is slow, and silently dropping a mistyped flag wastes a full run.
  echo "WARNING: ignoring unrecognized or incomplete arguments: ${*:2}" >&2
fi

if [[ ! -f "$AUDIO" ]]; then
  echo "ERROR: audio file not found: $AUDIO" >&2
  exit 1
fi

# ───────── Endpoints ─────────
# Both can be overridden from the environment. A single trailing slash is
# stripped so that "$SPARK_CONTROL/api/..." never turns into "//api/...".
SPARK_CONTROL="${SPARK_CONTROL:-http://127.0.0.1:9999}"
SPARK_CONTROL="${SPARK_CONTROL%/}"
VLLM="${VLLM:-$SPARK_CONTROL/v1}"
VLLM="${VLLM%/}"
|
|
|
|
# Startup banner: echo the resolved inputs back before any work starts.
# One printf call emits every line; the trailing empty argument produces the
# blank line that separates the banner from the Stage 1 output.
banner_rule="════════════════════════════════════════════════════════════════"
printf '%s\n' \
  "$banner_rule" \
  "Audio: $AUDIO ($(du -h "$AUDIO" | cut -f1))" \
  "Spark Control: $SPARK_CONTROL" \
  "vLLM: $VLLM" \
  "Participants: ${PEOPLE:-<none — LLM will only resolve speakers from audio cues>}" \
  "$banner_rule" \
  ""
|
|
|
|
# ───────── Stage 1: transcribe + diarize ─────────
# Uploads the audio file as multipart form data to Spark Control and stores
# the JSON reply in /tmp/diarized.json, which Stage 2 reads.
echo "▶ Stage 1: transcribe + diarize (Parakeet + Sortformer in parallel)..."

# curl's -F parser treats ',' and ';' in a bare file name as field separators,
# so the path is handed over in its double-quoted form; that form in turn
# requires every '\' and '"' inside the path to be backslash-escaped.
UPLOAD_PATH=${AUDIO//\\/\\\\}
UPLOAD_PATH=${UPLOAD_PATH//\"/\\\"}

# Drop any reply left over from an earlier run so a failed request can never
# be mistaken for a fresh result.
rm -f /tmp/diarized.json

START=$(date +%s)
# -k skips TLS certificate verification. The body goes to the -o file and -w
# prints only the status code, so HTTP ends up holding just that code.
# `|| true` keeps `set -e` from ending the script on a transport error: curl
# (-S) has already printed the reason, and the status check below reports it.
HTTP=$(curl -sSk -X POST "$SPARK_CONTROL/api/audio/transcribe-with-speakers" \
  -F "file=@\"$UPLOAD_PATH\"" \
  -o /tmp/diarized.json \
  -w "%{http_code}") || true
END=$(date +%s)
echo " HTTP $HTTP, $((END - START))s wall time"

if [[ "$HTTP" != "200" ]]; then
  echo "ERROR — non-200 response. Full body:" >&2
  if [[ -f /tmp/diarized.json ]]; then
    cat /tmp/diarized.json >&2
  fi
  exit 1
fi

# One-line summary of the reply (duration, speaker count, segment count).
python3 - <<'PY'
import json

with open('/tmp/diarized.json') as fh:
    d = json.load(fh)
print(f" Duration: {d['duration']}s Speakers: {d['speakers_detected']} Segments: {len(d['segments'])}")
PY
|
|
|
|
# ───────── Stage 2: format transcript ─────────
# Renders every diarized segment as one "[timestamp] speaker: text" line and
# writes the lines to /tmp/transcript-formatted.txt for Stage 4.
echo
echo "▶ Stage 2: format diarized transcript as [MM:SS] Speaker_N: text..."
python3 - <<'PY' > /tmp/transcript-formatted.txt
import json

with open('/tmp/diarized.json') as fh:
    segments = json.load(fh)['segments']


def stamp(start_ms):
    """Render a millisecond offset as MM:SS, or H:MM:SS once hours is non-zero."""
    minutes, seconds = divmod(start_ms // 1000, 60)
    hours, minutes = divmod(minutes, 60)
    if hours:
        return f"{hours}:{minutes:02d}:{seconds:02d}"
    return f"{minutes:02d}:{seconds:02d}"


# join() consumes the whole generator before print runs, so nothing is
# written if any segment is malformed.
print("\n".join(
    f"[{stamp(seg['start_ms'])}] {seg['speaker']}: {seg['text']}"
    for seg in segments
))
PY
printf ' %s formatted lines\n' "$(wc -l < /tmp/transcript-formatted.txt)"
printf '%s\n' " Sample (first 3):"
# Prefix each line with a space and stop after the third one.
sed -e 's/^/ /' -e '3q' /tmp/transcript-formatted.txt
|
|
|
|
# ───────── Stage 3: discover current LLM ─────────
echo
echo "▶ Stage 3: discover current vLLM model..."
# Note: Spark Control's /v1/models lists *audio* models (STT + TTS voices),
# not the LLM — ask /api/status for the currently loaded vLLM model instead.
if ! STATUS_JSON=$(curl -sSk "$SPARK_CONTROL/api/status"); then
  echo "ERROR — could not fetch $SPARK_CONTROL/api/status" >&2
  exit 1
fi

# Prints the model name, or an empty line when the body is not JSON, lacks
# vllm.current_model, or carries a non-string value there (e.g. null). That
# way a missing model is caught below instead of "None" being sent to the LLM
# endpoint as a model name.
MODEL=$(printf '%s' "$STATUS_JSON" | python3 -c '
import json, sys
try:
    model = json.load(sys.stdin)["vllm"]["current_model"]
except (ValueError, KeyError, TypeError):
    model = None
print(model if isinstance(model, str) else "")
')

if [[ -z "$MODEL" ]]; then
  echo "ERROR — /api/status did not report a loaded vLLM model. Full body:" >&2
  printf '%s\n' "$STATUS_JSON" >&2
  exit 1
fi
echo " Model: $MODEL"
|
|
|
|
# ───────── Stage 4: build LLM request ─────────
# Assembles the chat/completions payload (system prompt + user prompt with the
# formatted transcript) and writes it as JSON to /tmp/request.json.
# The Python script receives: model name, transcript path, participants string.
echo
echo "▶ Stage 4: build LLM request with speaker-name-resolution prompt..."
python3 - "$MODEL" /tmp/transcript-formatted.txt "$PEOPLE" > /tmp/request.json <<'PY'
import json
import sys

model_name, transcript_file, roster = sys.argv[1:4]
with open(transcript_file) as fh:
    transcript_text = fh.read()

# Optional paragraph appended to the name-resolution rules; stays empty when
# no participants were passed on the command line.
roster_note = ""
if roster.strip():
    roster_note = (
        "\n\n"
        f"Known participants in this conversation: {roster}\n"
        "Constrain your speaker→name mappings to this list. Still only assign a\n"
        "name when the audio cues unambiguously identify which participant is\n"
        "which — do not guess based on topic or role."
    )

system_prompt = " ".join([
    "You are a meeting analyst producing comprehensive long-form notes.",
    "Preserve specific quotes, numbers, dates, names, and decisions verbatim.",
    "Quote speakers directly when they said something memorable.",
    "Generate as many sections as the meeting naturally has.",
    "Do not summarize aggressively — aim for 3000-6000 words for a 60-90 min conversation.",
])

# Doubled braces are literal braces in the f-string (the JSON shape example).
user_message = f"""You will analyze a transcript with anonymous speaker labels (Speaker_0, Speaker_1, ...).

CRITICAL — speaker name resolution rules:
Map a speaker label to a real name ONLY when you have direct, unambiguous evidence:
- The speaker explicitly identifies themselves ("I'm X", "this is X", "my name is X")
- Another speaker addresses them by name as a vocative ("thanks X", "X, what do you think?")
If you have ANY doubt, leave the mapping as null. False mappings are worse than no mapping.
Do NOT infer names from topic context, role descriptions, or weak associations.{roster_note}

OUTPUT FORMAT — produce exactly two parts:

PART 1: A JSON block at the very top of your response with this shape:
```json
{{
"speaker_mapping": {{
"Speaker_0": {{"name": "Real Name", "confidence": "high", "evidence": "quoted line + [MM:SS]"}},
"Speaker_1": {{"name": null, "confidence": null, "evidence": null}}
}}
}}
```

PART 2: Below the JSON, a structured long-form report with these sections:

# Detailed Discussion Log
Chronological account of every topic discussed, with verbatim quotes from speakers for important points. Aim for 8+ bullets per major topic. Use sub-bullets for examples or supporting detail.

# Decisions Made
Every decision, with who proposed it, who agreed, any dissent, and rationale.

# Action Items
Every action item, with owner, deadline, and any context. Include even minor "I'll think about it" commitments.

# Open Questions
Things raised that weren't resolved, with who raised them.

# Key Quotes
Direct quotes worth preserving, with speaker attribution.

In the report body: use REAL NAMES where you mapped them, and Speaker_N where you couldn't.

---

TRANSCRIPT:

{transcript_text}"""

payload = {
    "model": model_name,
    "messages": [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_message},
    ],
    "max_tokens": 16000,
    "temperature": 0.3,
    "chat_template_kwargs": {"enable_thinking": False},
}
json.dump(payload, sys.stdout)
sys.stdout.write("\n")
PY
REQ_BYTES=$(wc -c < /tmp/request.json)
printf ' Request size: %s bytes\n' "$REQ_BYTES"
|
|
|
|
# ───────── Stage 5: LLM call ─────────
# Posts /tmp/request.json to the chat/completions endpoint, keeps the raw
# reply in /tmp/llm-raw.json and writes the assistant message text to
# /tmp/analysis.md. Any failure stops the script here, so an error body is
# never saved and reported as a finished analysis.
echo
echo "▶ Stage 5: send to Qwen3.6 (this is the slow part — 30-90s typical)..."

# Remove a reply left over from an earlier run so it cannot be picked up if
# this request fails before writing anything.
rm -f /tmp/llm-raw.json

START=$(date +%s)
# The URL is quoted so a value containing spaces or glob characters stays one
# word. -k matches the other Spark Control calls in this script: VLLM defaults
# to the same host, so it must accept the same certificate. `|| true` defers
# transport errors to the status check below (curl -S already printed them).
LLM_HTTP=$(curl -sSk "$VLLM/chat/completions" \
  -H "Content-Type: application/json" \
  -d @/tmp/request.json \
  -o /tmp/llm-raw.json \
  -w "%{http_code}") || true
END=$(date +%s)
echo " Wall time: $((END - START))s"

if [[ "$LLM_HTTP" != "200" ]]; then
  echo "ERROR — chat/completions returned HTTP $LLM_HTTP. Full body:" >&2
  if [[ -f /tmp/llm-raw.json ]]; then
    cat /tmp/llm-raw.json >&2
  fi
  exit 1
fi

# Extract content: choices[0].message.content must be a string. Anything else
# (non-JSON body, missing keys, null content) is reported on stderr and
# makes the Python process exit non-zero.
if ! python3 - > /tmp/analysis.md <<'PY'
import json
import sys

with open('/tmp/llm-raw.json') as fh:
    raw = fh.read()
try:
    content = json.loads(raw)['choices'][0]['message']['content']
except (ValueError, KeyError, IndexError, TypeError):
    content = None
if not isinstance(content, str):
    sys.stderr.write('ERROR — unexpected response:\n')
    sys.stderr.write(raw + '\n')
    sys.exit(1)
print(content)
PY
then
  echo "ERROR — no usable completion in /tmp/llm-raw.json; nothing was saved." >&2
  exit 1
fi
|
|
|
|
# ───────── Stage 6: save + display ─────────
# Copies /tmp/analysis.md to ~/Desktop/<audio name without extension>-analysis.md,
# prints a short summary plus the first 30 lines, and opens the file in
# TextEdit when running on macOS.

# File name without directory, then without its last ".ext". Parameter
# expansion is used instead of basename so a name starting with "-" is not
# taken for an option. A dotfile such as ".rec" would strip down to nothing,
# so the full name is used in that case.
AUDIO_NAME=${AUDIO##*/}
BASENAME=${AUDIO_NAME%.*}
[[ -n "$BASENAME" ]] || BASENAME=$AUDIO_NAME

DEST="$HOME/Desktop/${BASENAME}-analysis.md"
# ~/Desktop is not guaranteed to exist (e.g. headless machines).
mkdir -p -- "$HOME/Desktop"
cp /tmp/analysis.md "$DEST"
echo
echo "════════════════════════════════════════════════════════════════"
echo "✔ Saved: $DEST"
echo " ($(wc -l < "$DEST") lines, $(wc -w < "$DEST") words)"
echo "════════════════════════════════════════════════════════════════"
echo
echo "─── Top of the report (speaker mapping JSON, if produced) ───"
head -30 "$DEST"
echo "..."
echo

# Opening the report is a convenience only: `open -a TextEdit` exists on macOS
# alone, and a failure here must not turn a successful run into a non-zero exit.
if [[ "$(uname -s)" == "Darwin" ]]; then
  open -a "TextEdit" "$DEST" \
    || echo "NOTE: could not open TextEdit; the report is at $DEST" >&2
else
  echo "NOTE: not on macOS — open $DEST in an editor of your choice." >&2
fi
|