v0.13.0:4 - redaction gateway, embeddings proxy, expanded audio API
- Add redaction gateway (redaction_gateway.py, redaction/ scrub + tests)
- Add embeddings proxy and spark_embed service (Dockerfile + main.py)
- Expand audio_proxy with speaker-aware handling; deep_health/health/server updates
- Package: configureSparks action + sparkConfig model updates, manifest/main wiring
- Docs: AUDIO_API, EMBEDDINGS, REDACTION_GATEWAY; HANDOFF and runbook/known-issues refresh
This commit is contained in:
Executable
+213
@@ -0,0 +1,213 @@
|
||||
#!/bin/bash
|
||||
# End-to-end test of the v0.10 + v0.11 audio pipeline:
|
||||
# audio file → spark-control /api/audio/transcribe-with-speakers
|
||||
# (Parakeet + Sortformer merged)
|
||||
# → Qwen3.6 via vLLM with long-form prompt + speaker name
|
||||
# resolution
|
||||
# → ~/Desktop/<filename>-analysis.md
|
||||
#
|
||||
# Usage:
|
||||
# bash scripts/test-audio-with-speakers.sh <audio-file> [--people "Name1, Name2"]
|
||||
#
|
||||
# Examples:
|
||||
# # No participants list (LLM will only resolve speakers it can verify from audio cues)
|
||||
# bash scripts/test-audio-with-speakers.sh ~/Library/Application\ Support/hyprnote/sessions/*/audio.mp3
|
||||
#
|
||||
# # With known participants (LLM constrained to these names)
|
||||
# bash scripts/test-audio-with-speakers.sh ~/Downloads/podcast.mp3 --people "Dax, Will"
|
||||
#
|
||||
# Designed to mirror exactly what recap-relay's spark-control backend will do
|
||||
# once the PR lands. If the output looks good here, the recap-relay version
|
||||
# will look the same.
|
||||
|
||||
# Fail fast: -e aborts on any failing command, -u on use of an unset variable,
# and pipefail makes a pipeline fail when ANY stage fails (not just the last),
# so a failed `curl … | python3 …` can no longer slip through.
set -euo pipefail

USAGE="Usage: $0 <audio-file> [--people \"Name1, Name2\"]"

# First positional argument: the audio file to analyse (required).
AUDIO="${1:?$USAGE}"

# Optional: --people "<comma-separated names>" directly after the audio file.
# ${2:-}/${3:-} keep the test safe under `set -u` when they are absent.
PEOPLE=""
if [ "${2:-}" = "--people" ] && [ -n "${3:-}" ]; then
  PEOPLE="$3"
elif [ "$#" -gt 1 ]; then
  # A `--people` with no value, or any other stray argument, would otherwise
  # be dropped without a trace; say so, but keep running without a
  # participants list.
  echo "WARNING: ignoring unrecognised arguments: ${*:2}" >&2
  echo "$USAGE" >&2
fi

if [ ! -f "$AUDIO" ]; then
  echo "ERROR: audio file not found: $AUDIO" >&2
  exit 1
fi
|
||||
|
||||
# Service endpoints; both can be overridden from the environment.
SPARK_CONTROL="${SPARK_CONTROL:-https://spark.satsflows.com}"
VLLM="${VLLM:-http://<spark-1-ip>:8888/v1}"

# The built-in VLLM default is a placeholder, not a resolvable host. Catch it
# here instead of after the (slow) transcription stage has already run.
case "$VLLM" in
  *"<"*|*">"*)
    echo "ERROR: VLLM is still a placeholder ($VLLM)." >&2
    echo "       Set it first, e.g.: VLLM=http://HOST:8888/v1 bash $0 <audio-file>" >&2
    exit 1
    ;;
esac

# Run banner: echo the effective configuration before doing any work.
echo "════════════════════════════════════════════════════════════════"
echo "Audio: $AUDIO ($(du -h "$AUDIO" | cut -f1))"
echo "Spark Control: $SPARK_CONTROL"
echo "vLLM: $VLLM"
echo "Participants: ${PEOPLE:-<none — LLM will only resolve speakers from audio cues>}"
echo "════════════════════════════════════════════════════════════════"
echo
|
||||
|
||||
# ───────── Stage 1: transcribe + diarize ─────────
# Uploads the audio file to spark-control and stores the JSON response in
# /tmp/diarized.json; the HTTP status code is captured separately via -w.
echo "▶ Stage 1: transcribe + diarize (Parakeet + Sortformer in parallel)..."
START=$(date +%s)
# NOTE(review): -k disables TLS certificate verification. Presumably the
# endpoint uses a self-signed certificate — confirm, and drop -k if it does not.
HTTP=$(curl -sSk -X POST "$SPARK_CONTROL/api/audio/transcribe-with-speakers" \
  -F "file=@$AUDIO" \
  -o /tmp/diarized.json \
  -w "%{http_code}")
END=$(date +%s)
echo " HTTP $HTTP, $((END - START))s wall time"

if [ "$HTTP" != "200" ]; then
  # Diagnostics belong on stderr so they are not mistaken for normal output.
  echo "ERROR — non-200 response. Full body:" >&2
  cat /tmp/diarized.json >&2
  echo >&2
  exit 1
fi

# One-line summary of the diarization result. A quoted heredoc avoids the
# shell-escaping of quotes that an inline `python3 -c "…"` needs.
python3 - <<'PY'
import json

with open('/tmp/diarized.json', encoding='utf-8') as fh:
    d = json.load(fh)
print(f" Duration: {d['duration']}s Speakers: {d['speakers_detected']} Segments: {len(d['segments'])}")
PY
|
||||
|
||||
# ───────── Stage 2: format transcript ─────────
# Turns /tmp/diarized.json into one "[MM:SS] Speaker_N: text" line per segment
# (or "[H:MM:SS]" once the recording passes the one-hour mark) and writes the
# result to /tmp/transcript-formatted.txt.
echo
echo "▶ Stage 2: format diarized transcript as [MM:SS] Speaker_N: text..."
python3 > /tmp/transcript-formatted.txt <<'PY'
import json

with open('/tmp/diarized.json', encoding='utf-8') as fh:
    d = json.load(fh)

out = []
for s in d['segments']:
    # start_ms is in milliseconds; int() guards against a float value, which
    # would make the :02d format specifiers below raise ValueError.
    total_sec = int(s['start_ms']) // 1000
    h, rem = divmod(total_sec, 3600)
    m, sec = divmod(rem, 60)
    ts = f"{h}:{m:02d}:{sec:02d}" if h else f"{m:02d}:{sec:02d}"
    out.append(f"[{ts}] {s['speaker']}: {s['text']}")
print("\n".join(out))
PY
echo " $(wc -l < /tmp/transcript-formatted.txt) formatted lines"
echo " Sample (first 3):"
head -3 /tmp/transcript-formatted.txt | sed 's/^/ /'
|
||||
|
||||
# ───────── Stage 3: discover current LLM ─────────
# Asks the vLLM server which models it serves and uses the first one listed.
echo
echo "▶ Stage 3: discover current vLLM model..."
# --fail turns an HTTP error status into a curl failure; "$VLLM" is quoted so
# the URL is never word-split or glob-expanded.
MODELS_JSON=$(curl -fsS "$VLLM/models") || {
  echo "ERROR: could not query $VLLM/models" >&2
  exit 1
}
MODEL=$(printf '%s' "$MODELS_JSON" | python3 -c '
import json, sys

models = json.load(sys.stdin).get("data") or []
if not models:
    sys.exit("no models are being served")
print(models[0]["id"])
') || {
  echo "ERROR: could not determine the served model from $VLLM/models" >&2
  exit 1
}
echo " Model: $MODEL"
|
||||
|
||||
# ───────── Stage 4: build LLM request ─────────
# Builds the chat-completions request body (system prompt + user prompt with
# the formatted transcript appended) and writes it to /tmp/request.json.
# Arguments to the embedded Python: model id, transcript path, participants
# string (may be empty).
echo
echo "▶ Stage 4: build LLM request with speaker-name-resolution prompt..."
python3 - "$MODEL" /tmp/transcript-formatted.txt "$PEOPLE" > /tmp/request.json <<'PY'
import json, sys

model, transcript_path, people = sys.argv[1], sys.argv[2], sys.argv[3]
with open(transcript_path, encoding="utf-8") as fh:
    transcript = fh.read()

# An empty transcript would only make the model invent content; stop here.
if not transcript.strip():
    sys.exit("ERROR: formatted transcript is empty — nothing to analyse")

# Extra instructions, added only when a participants list was supplied.
participants_block = ""
if people.strip():
    participants_block = f"""

Known participants in this conversation: {people}
Constrain your speaker→name mappings to this list. Still only assign a
name when the audio cues unambiguously identify which participant is
which — do not guess based on topic or role."""

system = (
    "You are a meeting analyst producing comprehensive long-form notes. "
    "Preserve specific quotes, numbers, dates, names, and decisions verbatim. "
    "Quote speakers directly when they said something memorable. "
    "Generate as many sections as the meeting naturally has. "
    "Do not summarize aggressively — aim for 3000-6000 words for a 60-90 min conversation."
)

# Doubled braces ({{ }}) are literal braces inside the f-string.
user_prompt = f"""You will analyze a transcript with anonymous speaker labels (Speaker_0, Speaker_1, ...).

CRITICAL — speaker name resolution rules:
Map a speaker label to a real name ONLY when you have direct, unambiguous evidence:
- The speaker explicitly identifies themselves ("I'm X", "this is X", "my name is X")
- Another speaker addresses them by name as a vocative ("thanks X", "X, what do you think?")
If you have ANY doubt, leave the mapping as null. False mappings are worse than no mapping.
Do NOT infer names from topic context, role descriptions, or weak associations.{participants_block}

OUTPUT FORMAT — produce exactly two parts:

PART 1: A JSON block at the very top of your response with this shape:
```json
{{
  "speaker_mapping": {{
    "Speaker_0": {{"name": "Real Name", "confidence": "high", "evidence": "quoted line + [MM:SS]"}},
    "Speaker_1": {{"name": null, "confidence": null, "evidence": null}}
  }}
}}
```

PART 2: Below the JSON, a structured long-form report with these sections:

# Detailed Discussion Log
Chronological account of every topic discussed, with verbatim quotes from speakers for important points. Aim for 8+ bullets per major topic. Use sub-bullets for examples or supporting detail.

# Decisions Made
Every decision, with who proposed it, who agreed, any dissent, and rationale.

# Action Items
Every action item, with owner, deadline, and any context. Include even minor "I'll think about it" commitments.

# Open Questions
Things raised that weren't resolved, with who raised them.

# Key Quotes
Direct quotes worth preserving, with speaker attribution.

In the report body: use REAL NAMES where you mapped them, and Speaker_N where you couldn't.

---

TRANSCRIPT:

{transcript}"""

print(json.dumps({
    "model": model,
    "messages": [
        {"role": "system", "content": system},
        {"role": "user", "content": user_prompt},
    ],
    "max_tokens": 16000,
    "temperature": 0.3,
    "chat_template_kwargs": {"enable_thinking": False},
}))
PY
REQ_BYTES=$(wc -c < /tmp/request.json)
echo " Request size: $REQ_BYTES bytes"
|
||||
|
||||
# ───────── Stage 5: LLM call ─────────
# Posts /tmp/request.json to the chat-completions endpoint, keeps the raw
# response in /tmp/llm-raw.json and extracts the generated text into
# /tmp/analysis.md.
echo
echo "▶ Stage 5: send to Qwen3.6 (this is the slow part — 30-90s typical)..."
START=$(date +%s)
# Deliberately no --fail: on an HTTP error the response body is still saved,
# so the check below can show what the server actually said.
curl -sS "$VLLM/chat/completions" \
  -H "Content-Type: application/json" \
  -d @/tmp/request.json \
  > /tmp/llm-raw.json
END=$(date +%s)
echo " Wall time: $((END - START))s"

# Extract content. Anything other than a text completion is reported on
# stderr and aborts the script, so an error payload is never saved as if it
# were the analysis.
python3 - > /tmp/analysis.md <<'PY'
import json, sys

with open('/tmp/llm-raw.json', encoding='utf-8') as fh:
    raw = fh.read()

try:
    r = json.loads(raw)
except ValueError:
    sys.exit('ERROR — response is not valid JSON:\n' + raw)

choices = r.get('choices') if isinstance(r, dict) else None
content = choices[0].get('message', {}).get('content') if choices else None
if not isinstance(content, str):
    print('ERROR — unexpected response:', file=sys.stderr)
    print(json.dumps(r, indent=2), file=sys.stderr)
    sys.exit(1)

print(content)
PY
|
||||
|
||||
# ───────── Stage 6: save + display ─────────
# Copies /tmp/analysis.md to ~/Desktop/<audio-name-without-extension>-analysis.md,
# prints a short preview, and opens the file in TextEdit when possible.
BASENAME=$(basename "$AUDIO" | sed 's/\.[^.]*$//')
DEST="$HOME/Desktop/${BASENAME}-analysis.md"
# ~/Desktop does not exist on every machine; create it rather than fail in cp.
mkdir -p "$(dirname "$DEST")"
cp /tmp/analysis.md "$DEST"
echo
echo "════════════════════════════════════════════════════════════════"
echo "✔ Saved: $DEST"
echo " ($(wc -l < "$DEST") lines, $(wc -w < "$DEST") words)"
echo "════════════════════════════════════════════════════════════════"
echo
echo "─── Top of the report (speaker mapping JSON, if produced) ───"
head -30 "$DEST"
echo "..."
echo
# `open -a TextEdit` is macOS-only. The report is already saved at this
# point, so a missing or failing `open` must not fail the whole run.
if command -v open >/dev/null 2>&1; then
  open -a "TextEdit" "$DEST" || echo "NOTE: could not open TextEdit; report is at $DEST" >&2
fi
|
||||
Reference in New Issue
Block a user