import Foundation

/// Turns noisy per-frame `SpeakerObservation`s into clean
/// `(start, end, name, confidence)` segments.
///
/// - Hysteresis: open a segment after `openFrames` consecutive speaking frames,
///   close after `closeFrames` quiet frames — rides out UI-cue lag/flicker.
/// - Overlaps allowed: each name is tracked independently (crosstalk).
/// - mic-VAD "self" spans are merged in as high-confidence segments.
/// - OCR name variants are normalized via an alias table.
///
/// Pure logic, no UI/capture deps — fully unit-testable offline.
final class TimelineBuilder {
    /// Consecutive speaking frames required before a segment opens (at least 1).
    private let openFrames: Int
    /// Consecutive quiet frames required before an open segment closes (at least 1).
    private let closeFrames: Int
    private var aliases: [String: String] = [:]   // normalized variant -> canonical
    /// Per-name hysteresis state, keyed by canonical name.
    private var states: [String: NameState] = [:]
    /// Timestamp of the most recently ingested frame (recorded by `ingest`).
    private var lastFrameT: Double = 0
    /// Segments produced so far; sorted by start time only after `finish()`.
    private(set) var segments: [VisualTimeline.Segment] = []

    /// - Parameters:
    ///   - openFrames: Speaking frames in a row needed to open a segment; clamped to >= 1.
    ///   - closeFrames: Quiet frames in a row needed to close a segment; clamped to >= 1.
    init(openFrames: Int = 2, closeFrames: Int = 2) {
        self.openFrames = max(1, openFrames)
        self.closeFrames = max(1, closeFrames)
    }

    /// Register that `variant` (e.g. "Sarah J") should map to `canonical`
    /// (e.g. "Sarah Jones"). The variant is matched case-insensitively and
    /// ignoring surrounding whitespace.
    func addAlias(_ variant: String, canonical: String) {
        aliases[Self.normalize(variant)] = canonical
    }

    /// Ingest one frame's observations (all sharing time `t`). Names not present
    /// (or present but not speaking) count as a quiet frame for any open segment.
    ///
    /// Observations whose name is empty — or becomes empty once canonicalized
    /// (e.g. whitespace-only OCR output) — are ignored.
    func ingest(_ observations: [SpeakerObservation], at t: TimeInterval) {
        lastFrameT = t

        // Best confidence per canonical name that is speaking this frame.
        var speaking: [String: Double] = [:]
        for obs in observations where obs.speaking && !obs.name.isEmpty {
            let name = canonical(obs.name)
            // Trimming can reduce a whitespace-only name to ""; never track that.
            guard !name.isEmpty else { continue }
            speaking[name] = max(speaking[name] ?? 0, obs.confidence)
        }

        // Every known name gets updated each frame: either voiced or quiet.
        for name in Set(states.keys).union(speaking.keys) {
            var st = states[name] ?? NameState()
            if let conf = speaking[name] {
                if st.voiced == 0 { st.runStart = t }   // first frame of a new voiced run
                st.voiced += 1
                st.silent = 0
                st.lastVoicedT = t
                if !st.open && st.voiced >= openFrames {
                    // Open retroactively at the start of the voiced run.
                    st.open = true
                    st.segStart = st.runStart
                    st.confSum = 0
                    st.confN = 0
                }
                if st.open {
                    st.confSum += conf
                    st.confN += 1
                }
            } else {
                st.silent += 1
                st.voiced = 0
                if st.open && st.silent >= closeFrames {
                    closeSegment(name: name, state: st)
                    st.open = false
                }
            }
            states[name] = st
        }
    }

    /// Merge mic-VAD self spans (the user) as high-confidence segments.
    /// Spans with `end <= start` are skipped.
    func mergeSelfSpans(_ spans: [VADSpan], selfName: String) {
        for span in spans where span.end > span.start {
            segments.append(.init(start: span.start, end: span.end, name: selfName,
                                  confidence: span.confidence, source: "mic_vad"))
        }
    }

    /// Force-close any open segments (used when a visual gap begins, so a
    /// segment isn't carried across the gap).
    ///
    /// Each closed segment ends at its last voiced frame; `t` is not used to
    /// stamp the end time. Hysteresis counters are reset so the next voiced
    /// run starts from scratch.
    func closeOpenSegments(at t: TimeInterval) {
        // `states` is a value type, so iterating a snapshot while mutating is safe.
        for (name, st) in states where st.open {
            closeSegment(name: name, state: st)
            states[name]?.open = false
            states[name]?.voiced = 0
            states[name]?.silent = 0
        }
    }

    /// Close any still-open segments at end of capture, fold rare name
    /// variants into their dominant spelling, and sort segments by start time.
    func finish() {
        for (name, st) in states where st.open {
            closeSegment(name: name, state: st)
            states[name]?.open = false
        }
        segments = Self.canonicalizeByFrequency(segments)
        segments.sort { $0.start < $1.start }
    }

    /// Fold rare OCR misspellings into the dominant name they're a typo of: a name with
    /// little total time is remapped to a much longer-running name with the same initial
    /// within a small edit distance (e.g. "Matt Odel"/"MattOdell"/"Mare" → "Matt Odell"/
    /// "Mark"). Conservative by design — it won't merge two well-attested speakers, only
    /// a transient variant into its clearly-dominant canonical. Pure/testable.
    ///
    /// Candidates are examined in sorted name order so that ties between equally
    /// long-running majors resolve the same way on every run. If the chosen major is
    /// itself remapped, the chain is followed so no segment is left carrying a name
    /// that was folded away.
    ///
    /// - Parameters:
    ///   - segs: Segments to relabel; order and timing are preserved.
    ///   - minorMaxSec: Maximum total duration for a name to be considered a rare variant.
    ///   - dominanceRatio: How many times longer the major's total duration must be.
    ///   - maxEdits: Maximum case-insensitive Levenshtein distance between the two names.
    /// - Returns: `segs` with rare variants renamed; returned as-is when nothing is remapped.
    static func canonicalizeByFrequency(_ segs: [VisualTimeline.Segment],
                                        minorMaxSec: Double = 5,
                                        dominanceRatio: Double = 8,
                                        maxEdits: Int = 2) -> [VisualTimeline.Segment] {
        // Total time attributed to each name.
        var dur: [String: Double] = [:]
        for s in segs { dur[s.name, default: 0] += s.end - s.start }

        // Dictionary iteration order is unspecified; sort for reproducible tie-breaking.
        let ranked = dur.sorted { $0.key < $1.key }

        var remap: [String: String] = [:]
        for (minor, minorDur) in ranked {
            guard minorDur <= minorMaxSec, let initial = minor.first else { continue }
            let minorKey = minor.lowercased()
            var best: String?
            var bestDur = 0.0
            for (major, majorDur) in ranked where major != minor {
                // Cheap filters first; the edit distance is only computed for survivors.
                guard majorDur >= minorDur * dominanceRatio,
                      majorDur > bestDur,
                      major.first == initial else { continue }
                if levenshtein(minorKey, major.lowercased()) <= maxEdits {
                    best = major
                    bestDur = majorDur
                }
            }
            if let target = best { remap[minor] = target }
        }
        guard !remap.isEmpty else { return segs }

        // Follow minor -> major -> dominant chains; the visited set stops on any cycle.
        func resolve(_ name: String) -> String {
            var current = name
            var seen: Set<String> = [name]
            while let next = remap[current], seen.insert(next).inserted {
                current = next
            }
            return current
        }

        return segs.map { s in
            guard remap[s.name] != nil else { return s }
            return VisualTimeline.Segment(start: s.start, end: s.end, name: resolve(s.name),
                                          confidence: s.confidence, source: s.source)
        }
    }

    /// Levenshtein edit distance (small strings — names), computed over
    /// `Character`s with a two-row dynamic-programming table.
    static func levenshtein(_ a: String, _ b: String) -> Int {
        let x = Array(a), y = Array(b)
        if x.isEmpty { return y.count }
        if y.isEmpty { return x.count }

        var prev = Array(0...y.count)                        // distances for the previous row
        var cur = [Int](repeating: 0, count: y.count + 1)    // distances for the row being filled
        for i in 1...x.count {
            cur[0] = i
            for j in 1...y.count {
                cur[j] = x[i - 1] == y[j - 1]
                    ? prev[j - 1]
                    : Swift.min(prev[j - 1], prev[j], cur[j - 1]) + 1
            }
            swap(&prev, &cur)
        }
        // After the final swap the completed row lives in `prev`.
        return prev[y.count]
    }

    // MARK: - Internal

    /// Hysteresis bookkeeping for a single canonical name.
    private struct NameState {
        var voiced = 0                  // consecutive speaking frames in the current run
        var silent = 0                  // consecutive quiet frames
        var open = false                // whether a segment is currently open
        var runStart: Double = 0        // time of the first frame of the current voiced run
        var segStart: Double = 0        // start time of the open segment
        var lastVoicedT: Double = 0     // time of the most recent speaking frame
        var confSum: Double = 0         // sum of confidences since the segment opened
        var confN = 0                   // number of confidences in `confSum`
    }

    /// Append the segment described by `st` for `name`, ending at its last
    /// voiced frame. Zero- or negative-length segments are dropped. Confidence
    /// is the mean of the accumulated samples, or 0.8 when there are none.
    private func closeSegment(name: String, state st: NameState) {
        guard st.lastVoicedT > st.segStart else { return }
        let confidence = st.confN > 0 ? st.confSum / Double(st.confN) : 0.8
        segments.append(.init(start: st.segStart, end: st.lastVoicedT, name: name,
                              confidence: confidence, source: "vision"))
    }

    /// Map a raw observed name to its canonical form: the registered alias if
    /// one exists, otherwise the raw name with surrounding whitespace trimmed.
    private func canonical(_ raw: String) -> String {
        let key = Self.normalize(raw)
        return aliases[key] ?? raw.trimmingCharacters(in: .whitespacesAndNewlines)
    }

    /// Alias-table key: lowercased with surrounding whitespace removed.
    private static func normalize(_ s: String) -> String {
        s.lowercased().trimmingCharacters(in: .whitespacesAndNewlines)
    }
}