Recap: readable transcript + topic sections + meeting extras (gateway LLM)

New 'Recap' phase — turns speakers.json into a human-readable recap, leveraging recap-relay's proven logic/prompts but calling the Spark gateway's OpenAI-compatible /v1/chat/completions directly (same host/TLS as label-merge; Qwen3-35B). We start from already-named speakers (label-merge), so recap-relay's speaker clustering + name-inference are skipped entirely. - GatewayLLMClient: /v1/chat/completions (JSON mode), model discovery via /api/endpoints, TLS-skip reuse, 503 retry, sequential. - RecapAnalyzer: speakers.json → numbered [N] (MM:SS) Name: text transcript → time-windowed analyze (single window for short calls, 18min/2min overlap for long) → stitch/dedup topic sections → meeting extras (TLDR/decisions/action_items/ open_questions/key_quotes). Defensive JSON parsing of LLM output. - RecapRenderer: writes transcript.md + a self-contained dark-theme recap.html (topic sections w/ collapsible transcripts, extras panels, speaker color chips, full timestamped speaker-attributed transcript, print styles). - SessionController.buildRecap: best-effort after speakers.json (gated by settings.recapEnabled); surfaces recapURL → menu 'Open recap'. Skips silently if the gateway has no LLM. Settings toggle added. Validated END-TO-END on the real Meet session against the live gateway: dual-channel transcription → 3 topic sections + accurate TLDR + key quotes; 'Go Bitcoin' correctly attributed to the remote speaker. 46/46 XCTest (10 new).
2026-06-06 14:36:18 -05:00
parent 53d7fcdac0
commit 85bfdf2b56
9 changed files with 941 additions and 1 deletions
@@ -0,0 +1,339 @@
+import Foundation
+
+/// Turns a finished `speakers.json` into topic sections + meeting extras by driving
+/// the gateway LLM — a Swift port of recap-relay's chunked-window analysis, but
+/// starting from already-named speakers (label-merge), so we skip its speaker
+/// clustering and name-inference entirely. Pure helpers are static + testable; the
+/// LLM passes are sequential (one gateway request at a time).
+final class RecapAnalyzer {
+    private let llm: GatewayLLMClient
+    private let model: String
+
+    /// Creates an analyzer that sends every completion request through `llm`,
+    /// passing `model` as the model identifier on each call.
+    init(llm: GatewayLLMClient, model: String) {
+        self.llm = llm
+        self.model = model
+    }
+
+    /// One non-empty transcript line, as built by `entries(from:)`.
+    struct Entry: Equatable {
+        let offset: Double      // seconds — start time of the line (the segment's `start`)
+        let end: Double         // seconds — end time of the line (the segment's `end`)
+        let speaker: String     // speaker label copied from the segment
+        let text: String        // spoken text, whitespace-trimmed by `entries(from:)`
+    }
+
+    /// A contiguous, inclusive index range of entries sent to the LLM in one request.
+    /// `analyze` reads only `startIdx`/`endIdx`; `bodyStartIdx` records where the
+    /// window's own (non-overlap) portion begins.
+    struct Window: Equatable {
+        let startIdx: Int       // first entry index this window analyzes (incl. overlap)
+        let endIdx: Int         // last entry index (incl. overlap)
+        let bodyStartIdx: Int   // first entry this window "owns"
+    }
+
+    // MARK: - Orchestration
+
+    /// Full recap pipeline: topic sections first, then meeting extras.
+    ///
+    /// A file with no usable transcript lines yields an empty result without any
+    /// LLM call. A failure while producing extras is swallowed (extras become
+    /// `nil`); a failure while finding topics is rethrown.
+    ///
+    /// - Parameters:
+    ///   - file: The finished speakers file to summarize.
+    ///   - progress: Optional callback invoked with a short status string before each phase.
+    /// - Returns: The stitched topic sections plus best-effort extras.
+    func recap(file: SpeakersFile, progress: ((String) async -> Void)? = nil) async throws -> RecapResult {
+        let transcript = Self.entries(from: file)
+        if transcript.isEmpty {
+            return RecapResult(sections: [], extras: nil)
+        }
+
+        await progress?("Finding topics…")
+        let topics = try await analyze(entries: transcript)
+
+        await progress?("Extracting highlights…")
+        // Best-effort: any error from the extras pass collapses to nil.
+        let highlights = try? await extras(file: file, entries: transcript, sections: topics)
+
+        return RecapResult(sections: topics, extras: highlights)
+    }
+
+    // MARK: - Analyze (chunked windows → stitched sections)
+
+    /// Splits `entries` into time windows, asks the LLM for topic sections in each
+    /// window (one request at a time), and stitches the answers into one
+    /// chronological, non-overlapping list.
+    ///
+    /// Indices returned by the model are local to the window it saw; they are
+    /// clamped into that window and shifted by the window's `startIdx`, so every
+    /// returned section indexes into `entries`.
+    ///
+    /// - Parameter entries: Transcript lines in chronological order.
+    /// - Returns: The stitched sections; an empty array when `entries` is empty; a
+    ///   single "Conversation" section covering everything when the model returned
+    ///   nothing usable.
+    /// - Throws: Any error thrown by `llm.completeJSON` for a window request.
+    func analyze(entries: [Entry]) async throws -> [TopicSection] {
+        // Nothing to analyze. Without this guard the fallback below would emit a
+        // section with endIndex == -1, which is not a valid index range.
+        guard !entries.isEmpty else { return [] }
+
+        let windows = Self.planWindows(entries)
+        var all: [TopicSection] = []
+        for w in windows {
+            let local = Array(entries[w.startIdx...w.endIdx])
+            let prompt = Self.analyzePrompt(local, totalSec: entries.last?.end ?? 0, windowCount: windows.count)
+            let content = try await llm.completeJSON(model: model, system: nil, user: prompt)
+            for s in Self.parseSections(content) {
+                // Clamp the model's window-local indices, then convert to global indices.
+                let gs = w.startIdx + max(0, min(s.startIndex, local.count - 1))
+                let ge = w.startIdx + max(0, min(s.endIndex, local.count - 1))
+                guard ge >= gs else { continue }
+                all.append(TopicSection(title: s.title, summary: s.summary, startIndex: gs, endIndex: ge))
+            }
+        }
+        let stitched = Self.stitch(all)
+        // If the model returned nothing usable, fall back to one section for the whole call.
+        if stitched.isEmpty {
+            return [TopicSection(title: "Conversation", summary: "", startIndex: 0, endIndex: entries.count - 1)]
+        }
+        return stitched
+    }
+
+    /// Splits the entries into time-based analysis windows. A call whose last entry
+    /// ends at or before `cutoffSec` gets exactly one window; a longer call gets
+    /// bodies of roughly `bodySec`, each widened by `overlapSec` on both sides so a
+    /// topic crossing a boundary is visible to two windows (the stitcher removes
+    /// the duplicates).
+    static func planWindows(_ entries: [Entry],
+                            bodySec: Double = 18 * 60, overlapSec: Double = 2 * 60,
+                            cutoffSec: Double = 25 * 60) -> [Window] {
+        if entries.isEmpty { return [] }
+        let lastIndex = entries.count - 1
+        if entries[lastIndex].end <= cutoffSec {
+            return [Window(startIdx: 0, endIdx: lastIndex, bodyStartIdx: 0)]
+        }
+
+        var planned: [Window] = []
+        var cursor = 0   // index of the first entry owned by the window being planned
+        repeat {
+            let bodyFrom = entries[cursor].offset
+            let bodyUntil = bodyFrom + bodySec
+            let windowFrom = bodyFrom - overlapSec
+            let windowUntil = bodyUntil + overlapSec
+
+            // Leading edge: first entry starting at or after the overlap start.
+            let first = entries.firstIndex(where: { $0.offset >= windowFrom }) ?? cursor
+            // Trailing edge: keep extending past the body start while entries still
+            // begin inside the window.
+            let trailing = entries[(cursor + 1)...].prefix(while: { $0.offset <= windowUntil }).count
+            planned.append(Window(startIdx: first, endIdx: cursor + trailing, bodyStartIdx: cursor))
+
+            // The next body starts at the first entry past this body; always advance
+            // by at least one entry so the loop terminates.
+            let following = entries.firstIndex(where: { $0.offset >= bodyUntil }) ?? entries.count
+            cursor = max(following, cursor + 1)
+        } while cursor < entries.count
+        return planned
+    }
+
+    /// Combines the sections gathered from all windows into a single chronological
+    /// list with no overlaps. Candidates are ordered by start index (the wider one
+    /// first when starts tie); a candidate that ends inside already-covered ground
+    /// is discarded, and one that merely begins inside it has its start moved forward.
+    static func stitch(_ sections: [TopicSection]) -> [TopicSection] {
+        let ordered = sections.sorted { lhs, rhs in
+            if lhs.startIndex == rhs.startIndex { return lhs.endIndex > rhs.endIndex }
+            return lhs.startIndex < rhs.startIndex
+        }
+
+        var merged: [TopicSection] = []
+        var coveredThrough = -1   // highest entry index already assigned to a section
+        for candidate in ordered {
+            // Adds nothing beyond what is already covered → skip it.
+            guard candidate.endIndex > coveredThrough else { continue }
+            // Start just past the covered range when the candidate reaches back into it.
+            let firstFree = max(candidate.startIndex, coveredThrough + 1)
+            guard firstFree <= candidate.endIndex else { continue }
+            merged.append(TopicSection(title: candidate.title, summary: candidate.summary,
+                                       startIndex: firstFree, endIndex: candidate.endIndex))
+            coveredThrough = candidate.endIndex
+        }
+        return merged
+    }
+
+    // MARK: - Extras
+
+    /// Runs the single "meeting extras" LLM pass and parses its reply.
+    ///
+    /// - Returns: The parsed extras, or `nil` when `parseExtras` finds nothing usable.
+    /// - Throws: Any error thrown by `llm.completeJSON`.
+    func extras(file: SpeakersFile, entries: [Entry], sections: [TopicSection]) async throws -> MeetingExtras? {
+        let request = Self.extrasPrompt(file: file, entries: entries, sections: sections)
+        let reply = try await llm.completeJSON(
+            model: model,
+            system: nil,
+            user: request,
+            maxTokens: 4096)
+        return Self.parseExtras(reply)
+    }
+
+    // MARK: - Entries
+
+    /// Converts the file's segments into transcript entries: segments whose text is
+    /// missing or blank are dropped, the remaining text is whitespace-trimmed, and
+    /// the result is ordered by start time.
+    static func entries(from file: SpeakersFile) -> [Entry] {
+        var kept: [Entry] = []
+        for segment in file.segments {
+            let spoken = (segment.text ?? "").trimmingCharacters(in: .whitespacesAndNewlines)
+            if spoken.isEmpty { continue }
+            kept.append(Entry(offset: segment.start, end: segment.end, speaker: segment.speaker, text: spoken))
+        }
+        kept.sort { $0.offset < $1.offset }
+        return kept
+    }
+
+    /// Formats a duration in seconds as `M:SS`, or `H:MM:SS` once it reaches an hour.
+    ///
+    /// The value is rounded to the nearest second and negative values are treated
+    /// as zero. Non-finite input (NaN, ±infinity) formats as `"0:00"`.
+    ///
+    /// - Parameter sec: Duration or offset in seconds.
+    /// - Returns: The formatted timestamp.
+    static func mmss(_ sec: Double) -> String {
+        // `Int(_:)` traps on NaN, infinity and values outside Int's range, so the
+        // conversion is guarded and clamped instead of done directly.
+        guard sec.isFinite else { return "0:00" }
+        let wholeSeconds = max(0, sec.rounded())
+        let t = Int(exactly: wholeSeconds) ?? Int.max
+        let h = t / 3600, m = (t % 3600) / 60, s = t % 60
+        return h > 0 ? String(format: "%d:%02d:%02d", h, m, s) : String(format: "%d:%02d", m, s)
+    }
+
+    // MARK: - Prompts
+
+    /// Builds the topic-segmentation prompt for one analysis window.
+    ///
+    /// Every entry is rendered as `[N] (timestamp) Speaker: text`, where `N` is the
+    /// entry's 0-based position inside `window` — the same window-local numbering
+    /// the model is told to use for `startIndex`/`endIndex`.
+    ///
+    /// - Parameters:
+    ///   - window: The entries this request covers.
+    ///   - totalSec: Length of the whole call in seconds; used only to choose the section target.
+    ///   - windowCount: Number of windows the call was split into; used only to choose the section target.
+    /// - Returns: The complete user prompt, ending with the required JSON shape.
+    private static func analyzePrompt(_ window: [Entry], totalSec: Double, windowCount: Int) -> String {
+        // Two different "offset"s below: `$0.offset` is the index supplied by
+        // `enumerated()`, while `$0.element.offset` is the entry's start time.
+        let lines = window.enumerated()
+            .map { "[\($0.offset)] (\(mmss($0.element.offset))) \($0.element.speaker): \($0.element.text)" }
+            .joined(separator: "\n")
+        // Window length in whole minutes, never reported as less than 1.
+        let windowSpan = (window.last?.end ?? 0) - (window.first?.offset ?? 0)
+        let windowMin = max(1, Int((windowSpan / 60).rounded()))
+        let maxIndex = window.count - 1
+        let targetSections = targetSectionsPhrase(totalSec: totalSec, windowCount: windowCount)
+        return """
+        You are analyzing a ~\(windowMin)-minute section of a longer transcript. Your job is to identify natural topic boundaries and group the transcript into discussion-based sections — aim for \(targetSections).
+
+        TRANSCRIPT (each line is numbered with a timestamp):
+        \(lines)
+
+        INSTRUCTIONS:
+        1. Read the entire transcript carefully.
+        2. Identify where the discussion naturally shifts from one topic to another.
+        3. Group consecutive transcript segments by topic. Some sections may be short (a quick aside) and some may be long (an extended deep-dive). Let the content dictate the length.
+        4. For each section, write:
+           - A short, specific topic title (3-8 words)
+           - A 1-3 sentence summary of what's discussed. Attribute points to speakers by name where it improves clarity.
+           - The start and end segment indices (inclusive), counted as the bracketed [N] number at the start of each transcript line above.
+
+        IMPORTANT:
+        - Sections must be chronological and non-overlapping.
+        - Every segment index from 0 to \(maxIndex) must belong to exactly one section.
+        - startIndex of section N+1 must equal endIndex of section N plus 1.
+        - Create as many or as few sections as the content naturally requires — but lean toward broad, substantive topics rather than minute-by-minute breakdowns. A natural topic that spans several minutes of dialogue should be one section, not several.
+        - Titles should be descriptive and specific, not generic like "Introduction" unless it truly is one.
+
+        Respond with ONLY valid JSON in this exact format, no other text:
+        {
+          "sections": [
+            {
+              "title": "Brief Topic Title",
+              "summary": "1-3 sentence summary of this discussion section.",
+              "startIndex": 0,
+              "endIndex": 15
+            }
+          ]
+        }
+        """
+    }
+
+    /// Phrase telling the model roughly how many sections to produce per window.
+    /// A whole-call target is chosen from the call length (3 for under 5 minutes,
+    /// rising to 16 for two hours or more), divided evenly across the windows, and
+    /// never allowed to drop below 2.
+    private static func targetSectionsPhrase(totalSec: Double, windowCount: Int) -> String {
+        let minutes = totalSec / 60
+        // (exclusive upper bound in minutes, whole-call section target)
+        let tiers: [(limit: Double, target: Int)] = [(5, 3), (15, 4), (30, 6), (60, 8), (120, 12)]
+        let wholeCallTarget = tiers.first(where: { minutes < $0.limit })?.target ?? 16
+        let divisor = Double(max(1, windowCount))
+        let share = Int((Double(wholeCallTarget) / divisor).rounded())
+        let perWindow = max(2, share)
+        return "around \(perWindow) sections"
+    }
+
+    /// Builds the meeting-extras prompt: meeting metadata, the speaker roster, the
+    /// numbered topic titles, and the transcript (capped at 24,000 characters),
+    /// followed by the extraction instructions and the required JSON shape.
+    ///
+    /// - Parameters:
+    ///   - file: Source of the `app` and `durationSec` metadata lines.
+    ///   - entries: Transcript lines; also the source of the speaker roster.
+    ///   - sections: Topic sections already produced; only their titles are included.
+    /// - Returns: The complete user prompt.
+    private static func extrasPrompt(file: SpeakersFile, entries: [Entry], sections: [TopicSection]) -> String {
+        let names = orderedSpeakerNames(entries)
+        // Placeholders keep the prompt well-formed when a list is empty.
+        let roster = names.isEmpty ? "(unknown)" : names.joined(separator: ", ")
+        let topics = sections.isEmpty ? "(none)" :
+            sections.enumerated().map { "\($0.offset + 1). \($0.element.title)" }.joined(separator: "\n")
+        let transcript = cappedTranscript(entries, maxChars: 24_000)
+        let durationStr = mmss(file.durationSec)
+        return """
+        You are extracting structured information from an internal team meeting transcript. The transcript below is labeled with the speakers' real names where known.
+
+        MEETING METADATA:
+        - App: \(file.app)
+        - Duration: \(durationStr)
+
+        SPEAKERS: \(roster)
+
+        TOPIC SUMMARIES (already produced — for context only, do not duplicate):
+        \(topics)
+
+        TRANSCRIPT (each line is "[<name> <MM:SS>] text"):
+        \(transcript)
+
+        INSTRUCTIONS:
+        Extract FIVE categories of information. Return EMPTY ARRAYS for categories that don't apply — do NOT invent items. Use the speakers' names exactly as shown above; use null/empty when a person is unclear.
+
+        1. TLDR — A 2-4 sentence executive summary of the entire meeting: what it was about, the key discussion arc, and the bottom-line outcome. Past tense, third person, dense. Skip pleasantries. If the meeting was genuinely substanceless, write one factual sentence. This is the only required category.
+           - summary: the 2-4 sentence executive summary
+           - primary_speakers: array of names who drove the conversation (1-3, in rough order of contribution). Empty array if unclear.
+
+        2. DECISIONS — Things explicitly decided/agreed. Only clear commitments, not casual mentions. For each:
+           - statement: the decision in one sentence
+           - agreed_by: array of names who explicitly agreed (empty if unclear)
+           - supporting_offset: integer SECONDS where it was decided (convert the [<name> <MM:SS>] timestamp to total seconds)
+
+        3. ACTION_ITEMS — Explicit ownership ("I'll send the doc", "Matt will follow up"), not vague "someone should". For each:
+           - description: the action in imperative form
+           - owner: the person's name, or null if unclear
+           - due_hint: deadline string if mentioned ("by Friday"), or null
+           - supporting_offset: integer seconds where the commitment was made
+
+        4. OPEN_QUESTIONS — Questions raised that were NOT clearly answered. Skip rhetorical/answered ones. For each:
+           - question: rephrased to be self-contained
+           - raised_by: the person's name, or null
+           - answered: false (always)
+
+        5. KEY_QUOTES — 3-6 max. Pivotal/insightful/strong-opinion statements worth surfacing verbatim. For each:
+           - speaker: the person's name (or null)
+           - offset: integer seconds where the quote occurs
+           - quote: the verbatim quote (4-30 words)
+           - why_notable: one short clause
+
+        Be conservative — better an empty array than a fabrication. Respond with ONLY valid JSON in this exact shape, no other text:
+        {
+          "tldr": {"summary": "...", "primary_speakers": []},
+          "decisions": [{"statement": "...", "agreed_by": [], "supporting_offset": 0}],
+          "action_items": [{"description": "...", "owner": null, "due_hint": null, "supporting_offset": 0}],
+          "open_questions": [{"question": "...", "raised_by": null, "answered": false}],
+          "key_quotes": [{"speaker": null, "offset": 0, "quote": "...", "why_notable": "..."}]
+        }
+        """
+    }
+
+    /// Unique, non-empty speaker names, listed in the order each first speaks.
+    static func orderedSpeakerNames(_ entries: [Entry]) -> [String] {
+        var alreadyListed: Set<String> = []
+        var names: [String] = []
+        for entry in entries {
+            let name = entry.speaker
+            if name.isEmpty { continue }
+            // `insert` reports whether the name was new, so one call both checks and records.
+            if alreadyListed.insert(name).inserted {
+                names.append(name)
+            }
+        }
+        return names
+    }
+
+    /// Renders the entries as `[name MM:SS] text` lines and, when the result is
+    /// longer than `maxChars`, keeps only the first and last `maxChars / 2`
+    /// characters with a truncation marker between them, so a long call still fits
+    /// the model context.
+    ///
+    /// The marker is added on top of the kept characters, so a truncated result is
+    /// slightly longer than `maxChars`. The cut is character-based and may fall in
+    /// the middle of a line.
+    ///
+    /// - Parameters:
+    ///   - entries: Transcript lines to render.
+    ///   - maxChars: Character budget; zero or negative keeps only the marker.
+    /// - Returns: The full transcript, or its middle-truncated form.
+    static func cappedTranscript(_ entries: [Entry], maxChars: Int) -> String {
+        let full = entries.map { "[\($0.speaker) \(mmss($0.offset))] \($0.text)" }.joined(separator: "\n")
+        guard full.count > maxChars else { return full }
+        // `prefix`/`suffix` trap on a negative length, so clamp the budget at zero.
+        let half = max(0, maxChars) / 2
+        let head = String(full.prefix(half))
+        let tail = String(full.suffix(half))
+        return head + "\n…[transcript truncated]…\n" + tail
+    }
+
+    // MARK: - Parsing (defensive — LLM output)
+
+    /// Parses LLM output as a JSON object after stripping any code fence.
+    /// Returns `nil` when the text is not valid JSON or its top level is not an object.
+    private static func jsonObject(_ content: String) -> [String: Any]? {
+        let payload = Data(GatewayLLMClient.stripCodeFence(content).utf8)
+        guard let parsed = try? JSONSerialization.jsonObject(with: payload) else {
+            return nil
+        }
+        return parsed as? [String: Any]
+    }
+
+    /// Extracts the `sections` array from the model's reply. An item is kept only
+    /// when it has a non-blank title and both indices parse as integers; a missing
+    /// summary becomes the empty string. Returns an empty array when the reply is
+    /// not a JSON object with a `sections` array of objects.
+    static func parseSections(_ content: String) -> [(title: String, summary: String, startIndex: Int, endIndex: Int)] {
+        guard let root = jsonObject(content),
+              let rawSections = root["sections"] as? [[String: Any]] else {
+            return []
+        }
+
+        var parsed: [(title: String, summary: String, startIndex: Int, endIndex: Int)] = []
+        for item in rawSections {
+            let title = (item["title"] as? String)?.trimmingCharacters(in: .whitespacesAndNewlines) ?? ""
+            if title.isEmpty { continue }
+            guard let first = intVal(item["startIndex"]),
+                  let last = intVal(item["endIndex"]) else { continue }
+            let summary = (item["summary"] as? String)?.trimmingCharacters(in: .whitespacesAndNewlines) ?? ""
+            parsed.append((title: title, summary: summary, startIndex: first, endIndex: last))
+        }
+        return parsed
+    }
+
+    /// Parses the model's meeting-extras reply into `MeetingExtras`.
+    ///
+    /// Parsing is defensive: string fields are trimmed, blank or literal `"null"`
+    /// strings count as absent, items missing their main field are skipped, and a
+    /// malformed element inside a category array is dropped without discarding
+    /// the rest of that category.
+    ///
+    /// - Parameter content: Raw LLM output (optionally wrapped in a code fence).
+    /// - Returns: The extras, or `nil` when the reply is not a JSON object or when
+    ///   every category (including the TLDR summary) came back empty.
+    static func parseExtras(_ content: String) -> MeetingExtras? {
+        guard let o = jsonObject(content) else { return nil }
+
+        // Cast element by element so one malformed item (e.g. a bare string in the
+        // array) does not throw away the whole category.
+        func objects(_ key: String) -> [[String: Any]] {
+            (o[key] as? [Any] ?? []).compactMap { $0 as? [String: Any] }
+        }
+
+        let tldrObj = o["tldr"] as? [String: Any]
+        // Normalize the summary like every other string field, so a whitespace-only
+        // or "null" summary does not count as content.
+        let tldr = MeetingExtras.TLDR(
+            summary: nonEmpty(tldrObj?["summary"]) ?? "",
+            primarySpeakers: stringArray(tldrObj?["primary_speakers"]))
+        let decisions = objects("decisions").compactMap { d -> MeetingExtras.Decision? in
+            guard let st = nonEmpty(d["statement"]) else { return nil }
+            return .init(statement: st, agreedBy: stringArray(d["agreed_by"]), supportingOffset: intVal(d["supporting_offset"]))
+        }
+        let actions = objects("action_items").compactMap { d -> MeetingExtras.ActionItem? in
+            guard let desc = nonEmpty(d["description"]) else { return nil }
+            return .init(description: desc, owner: nonEmpty(d["owner"]), dueHint: nonEmpty(d["due_hint"]),
+                         supportingOffset: intVal(d["supporting_offset"]))
+        }
+        let questions = objects("open_questions").compactMap { d -> MeetingExtras.OpenQuestion? in
+            guard let q = nonEmpty(d["question"]) else { return nil }
+            return .init(question: q, raisedBy: nonEmpty(d["raised_by"]))
+        }
+        let quotes = objects("key_quotes").compactMap { d -> MeetingExtras.KeyQuote? in
+            guard let q = nonEmpty(d["quote"]) else { return nil }
+            return .init(speaker: nonEmpty(d["speaker"]), offset: intVal(d["offset"]), quote: q,
+                         whyNotable: nonEmpty(d["why_notable"]) ?? "")
+        }
+        // Extras count as present when ANY category has content; only an entirely
+        // empty reply is reported as nil.
+        guard !tldr.summary.isEmpty || !decisions.isEmpty || !actions.isEmpty || !questions.isEmpty || !quotes.isEmpty
+        else { return nil }
+        return MeetingExtras(tldr: tldr, decisions: decisions, actionItems: actions,
+                             openQuestions: questions, keyQuotes: quotes)
+    }
+
+    /// Leniently reads an integer from a decoded JSON value.
+    ///
+    /// Accepts an integer, a floating-point number (truncated toward zero), or a
+    /// string holding either form, ignoring surrounding whitespace.
+    ///
+    /// - Parameter v: A value taken from a `JSONSerialization` dictionary.
+    /// - Returns: The integer, or `nil` when the value is absent, of another type,
+    ///   not numeric, non-finite, or outside `Int`'s range.
+    private static func intVal(_ v: Any?) -> Int? {
+        // `Int(_:)` traps on NaN, infinity and out-of-range doubles; `Int(exactly:)`
+        // on the truncated value returns nil for those instead.
+        func truncated(_ d: Double) -> Int? { Int(exactly: d.rounded(.towardZero)) }
+
+        if let i = v as? Int { return i }
+        if let d = v as? Double { return truncated(d) }
+        if let s = v as? String {
+            let trimmed = s.trimmingCharacters(in: .whitespacesAndNewlines)
+            if let i = Int(trimmed) { return i }
+            // Models sometimes quote a decimal ("90.0"); treat it like the numeric form.
+            return Double(trimmed).flatMap(truncated)
+        }
+        return nil
+    }
+
+    /// Reads a JSON array as a list of trimmed, non-empty strings. Elements that
+    /// are not strings are skipped; a value that is not an array yields `[]`.
+    private static func stringArray(_ v: Any?) -> [String] {
+        guard let items = v as? [Any] else { return [] }
+        var cleaned: [String] = []
+        for case let raw as String in items {
+            let trimmed = raw.trimmingCharacters(in: .whitespacesAndNewlines)
+            if !trimmed.isEmpty {
+                cleaned.append(trimmed)
+            }
+        }
+        return cleaned
+    }
+
+    /// Returns the trimmed string when `v` is a string with real content; `nil`
+    /// when it is not a string, is blank, or is the word "null" in any casing.
+    private static func nonEmpty(_ v: Any?) -> String? {
+        guard let text = v as? String else { return nil }
+        let trimmed = text.trimmingCharacters(in: .whitespacesAndNewlines)
+        if trimmed.isEmpty || trimmed.lowercased() == "null" {
+            return nil
+        }
+        return trimmed
+    }
+}