v0.13.0:4 - redaction gateway, embeddings proxy, expanded audio API

- Add redaction gateway (redaction_gateway.py, redaction/ scrub + tests)
- Add embeddings proxy and spark_embed service (Dockerfile + main.py)
- Expand audio_proxy with speaker-aware handling; deep_health/health/server updates
- Package: configureSparks action + sparkConfig model updates, manifest/main wiring
- Docs: AUDIO_API, EMBEDDINGS, REDACTION_GATEWAY; HANDOFF and runbook/known-issues refresh
2026-06-11 17:45:21 -05:00
parent 4a75274db3
commit 8d839e3714
37 changed files with 3763 additions and 197 deletions
@@ -0,0 +1,213 @@
+#!/bin/bash
+# End-to-end test of the v0.10 + v0.11 audio pipeline:
+#   audio file  →  spark-control /api/audio/transcribe-with-speakers
+#                  (Parakeet + Sortformer merged)
+#               →  Qwen3.6 via vLLM with long-form prompt + speaker name
+#                  resolution
+#               →  ~/Desktop/<audio file name without its extension>-analysis.md
+#
+# Usage:
+#   bash scripts/test-audio-with-speakers.sh <audio-file> [--people "Name1, Name2"]
+#
+# Examples:
+#   # No participants list (LLM will only resolve speakers it can verify from audio cues)
+#   bash scripts/test-audio-with-speakers.sh ~/Library/Application\ Support/hyprnote/sessions/*/audio.mp3
+#
+#   # With known participants (LLM constrained to these names)
+#   bash scripts/test-audio-with-speakers.sh ~/Downloads/podcast.mp3 --people "Dax, Will"
+#
+# Designed to mirror exactly what recap-relay's spark-control backend will do
+# once the PR lands. If the output looks good here, the recap-relay version
+# will look the same.
+
+set -e
+
+AUDIO="${1:?Usage: $0 <audio-file> [--people \"Name1, Name2\"]}"
+PEOPLE=""
+if [ "$2" = "--people" ] && [ -n "$3" ]; then
+  PEOPLE="$3"
+fi
+
+if [ ! -f "$AUDIO" ]; then
+  echo "ERROR: audio file not found: $AUDIO" >&2
+  exit 1
+fi
+
+SPARK_CONTROL="${SPARK_CONTROL:-https://spark.satsflows.com}"
+VLLM="${VLLM:-http://<spark-1-ip>:8888/v1}"
+
+echo "════════════════════════════════════════════════════════════════"
+echo "Audio:           $AUDIO ($(du -h "$AUDIO" | cut -f1))"
+echo "Spark Control:   $SPARK_CONTROL"
+echo "vLLM:            $VLLM"
+echo "Participants:    ${PEOPLE:-<none — LLM will only resolve speakers from audio cues>}"
+echo "════════════════════════════════════════════════════════════════"
+echo
+
+# ───────── Stage 1: transcribe + diarize ─────────
+# Upload the audio file as multipart form field "file" to spark-control.
+# The response body is written to /tmp/diarized.json; curl's -w prints only
+# the HTTP status code, which is what ends up in $HTTP.
+# NOTE(review): -k disables TLS certificate verification — confirm this is
+# intentional for the spark-control endpoint.
+echo "▶ Stage 1: transcribe + diarize (Parakeet + Sortformer in parallel)..."
+START=$(date +%s)
+HTTP=$(curl -sSk -X POST "$SPARK_CONTROL/api/audio/transcribe-with-speakers" \
+  -F "file=@$AUDIO" \
+  -o /tmp/diarized.json \
+  -w "%{http_code}")
+END=$(date +%s)
+echo "  HTTP $HTTP, $((END - START))s wall time"
+
+# Anything other than 200 is fatal. The diagnostics go to stderr, matching the
+# script's other error path, so stdout carries progress output only.
+if [ "$HTTP" != "200" ]; then
+  {
+    echo "ERROR — non-200 response. Full body:"
+    cat /tmp/diarized.json
+    echo
+  } >&2
+  exit 1
+fi
+
+# One-line summary of the diarization result. The quoted heredoc keeps the
+# Python source literal, so no shell escaping of the f-string is needed.
+python3 - <<'PY'
+import json
+with open('/tmp/diarized.json') as fh:
+    d = json.load(fh)
+print(f"  Duration: {d['duration']}s   Speakers: {d['speakers_detected']}   Segments: {len(d['segments'])}")
+PY
+
+# ───────── Stage 2: format transcript ─────────
+# Turn each segment of /tmp/diarized.json into one "[timestamp] speaker: text"
+# line and store the result in /tmp/transcript-formatted.txt.
+printf '\n%s\n' "▶ Stage 2: format diarized transcript as [MM:SS] Speaker_N: text..."
+python3 <<'PY' > /tmp/transcript-formatted.txt
+import json
+
+
+def stamp(start_ms):
+    """Render a millisecond offset as MM:SS, or H:MM:SS once hours are non-zero."""
+    total_sec = start_ms // 1000
+    total_min, sec = divmod(total_sec, 60)
+    h, m = divmod(total_min, 60)
+    if h:
+        return f"{h}:{m:02d}:{sec:02d}"
+    return f"{m:02d}:{sec:02d}"
+
+
+with open('/tmp/diarized.json') as fh:
+    segments = json.load(fh)['segments']
+
+print("\n".join(
+    f"[{stamp(seg['start_ms'])}] {seg['speaker']}: {seg['text']}"
+    for seg in segments
+))
+PY
+
+# Report how many lines were produced and preview the first three, indented.
+printf '  %s formatted lines\n' "$(wc -l < /tmp/transcript-formatted.txt)"
+printf '%s\n' "  Sample (first 3):"
+head -n 3 /tmp/transcript-formatted.txt | sed 's/^/    /'
+
+# ───────── Stage 3: discover current LLM ─────────
+echo
+echo "▶ Stage 3: discover current vLLM model..."
+# Fetch the model list first and parse it second. The URL is quoted so a
+# $VLLM value containing spaces or glob characters is passed intact, and -f
+# makes curl exit non-zero on an HTTP error. Because the fetch is its own
+# assignment rather than the left side of a pipe, the script's `set -e` stops
+# on curl's own error message instead of a JSON traceback from empty input.
+models_json=$(curl -fsS "$VLLM/models")
+# The id of the first entry under "data" is the model used for the request.
+MODEL=$(python3 -c "import json,sys; print(json.load(sys.stdin)['data'][0]['id'])" <<<"$models_json")
+echo "  Model: $MODEL"
+
+# ───────── Stage 4: build LLM request ─────────
+echo
+echo "▶ Stage 4: build LLM request with speaker-name-resolution prompt..."
+python3 - "$MODEL" /tmp/transcript-formatted.txt "$PEOPLE" > /tmp/request.json <<'PY'
+import json, sys
+model, transcript_path, people = sys.argv[1], sys.argv[2], sys.argv[3]
+transcript = open(transcript_path).read()
+
+participants_block = ""
+if people.strip():
+    participants_block = f"""
+
+Known participants in this conversation: {people}
+Constrain your speaker→name mappings to this list. Still only assign a
+name when the audio cues unambiguously identify which participant is
+which — do not guess based on topic or role."""
+
+system = (
+    "You are a meeting analyst producing comprehensive long-form notes. "
+    "Preserve specific quotes, numbers, dates, names, and decisions verbatim. "
+    "Quote speakers directly when they said something memorable. "
+    "Generate as many sections as the meeting naturally has. "
+    "Do not summarize aggressively — aim for 3000-6000 words for a 60-90 min conversation."
+)
+
+user_prompt = f"""You will analyze a transcript with anonymous speaker labels (Speaker_0, Speaker_1, ...).
+
+CRITICAL — speaker name resolution rules:
+  Map a speaker label to a real name ONLY when you have direct, unambiguous evidence:
+    - The speaker explicitly identifies themselves ("I'm X", "this is X", "my name is X")
+    - Another speaker addresses them by name as a vocative ("thanks X", "X, what do you think?")
+  If you have ANY doubt, leave the mapping as null. False mappings are worse than no mapping.
+  Do NOT infer names from topic context, role descriptions, or weak associations.{participants_block}
+
+OUTPUT FORMAT — produce exactly two parts:
+
+PART 1: A JSON block at the very top of your response with this shape:
+```json
+{{
+  "speaker_mapping": {{
+    "Speaker_0": {{"name": "Real Name", "confidence": "high", "evidence": "quoted line + [MM:SS]"}},
+    "Speaker_1": {{"name": null, "confidence": null, "evidence": null}}
+  }}
+}}
+```
+
+PART 2: Below the JSON, a structured long-form report with these sections:
+
+# Detailed Discussion Log
+Chronological account of every topic discussed, with verbatim quotes from speakers for important points. Aim for 8+ bullets per major topic. Use sub-bullets for examples or supporting detail.
+
+# Decisions Made
+Every decision, with who proposed it, who agreed, any dissent, and rationale.
+
+# Action Items
+Every action item, with owner, deadline, and any context. Include even minor "I'll think about it" commitments.
+
+# Open Questions
+Things raised that weren't resolved, with who raised them.
+
+# Key Quotes
+Direct quotes worth preserving, with speaker attribution.
+
+In the report body: use REAL NAMES where you mapped them, and Speaker_N where you couldn't.
+
+---
+
+TRANSCRIPT:
+
+{transcript}"""
+
+print(json.dumps({
+    "model": model,
+    "messages": [
+        {"role": "system", "content": system},
+        {"role": "user", "content": user_prompt},
+    ],
+    "max_tokens": 16000,
+    "temperature": 0.3,
+    "chat_template_kwargs": {"enable_thinking": False},
+}))
+PY
+REQ_BYTES=$(wc -c < /tmp/request.json)
+echo "  Request size: $REQ_BYTES bytes"
+
+# ───────── Stage 5: LLM call ─────────
+echo
+echo "▶ Stage 5: send to Qwen3.6 (this is the slow part — 30-90s typical)..."
+START=$(date +%s)
+curl -sS $VLLM/chat/completions \
+  -H "Content-Type: application/json" \
+  -d @/tmp/request.json \
+  > /tmp/llm-raw.json
+END=$(date +%s)
+echo "  Wall time: $((END - START))s"
+
+# Extract content
+python3 -c "
+import json
+r = json.load(open('/tmp/llm-raw.json'))
+if 'choices' in r:
+    print(r['choices'][0]['message']['content'])
+else:
+    print('ERROR — unexpected response:')
+    print(json.dumps(r, indent=2))
+" > /tmp/analysis.md
+
+# ───────── Stage 6: save + display ─────────
+# Destination name: the audio file's base name with its last ".ext" removed,
+# suffixed with "-analysis.md", placed on the user's Desktop.
+BASENAME=$(basename "$AUDIO" | sed 's/\.[^.]*$//')
+DEST="$HOME/Desktop/${BASENAME}-analysis.md"
+cp /tmp/analysis.md "$DEST"
+
+# Summary box with the saved path plus line and word counts.
+rule="════════════════════════════════════════════════════════════════"
+printf '\n%s\n' "$rule"
+printf '✔ Saved: %s\n' "$DEST"
+printf '  (%s lines, %s words)\n' "$(wc -l < "$DEST")" "$(wc -w < "$DEST")"
+printf '%s\n\n' "$rule"
+
+# Preview the first 30 lines, where the speaker-mapping JSON is expected.
+printf '%s\n' "─── Top of the report (speaker mapping JSON, if produced) ───"
+head -n 30 "$DEST"
+printf '...\n\n'
+
+# Open the saved report in TextEdit.
+open -a "TextEdit" "$DEST"