diff --git a/Ten31Transcripts/Adapters/AdapterRegistry.swift b/Ten31Transcripts/Adapters/AdapterRegistry.swift
new file mode 100644
index 0000000..1baebf9
--- /dev/null
+++ b/Ten31Transcripts/Adapters/AdapterRegistry.swift
@@ -0,0 +1,17 @@
+import Foundation
+
+/// Maps a detected call app to its screen-reading adapter. One place to wire new
+/// adapters; `VisualObserver` is constructed from whatever this returns when the
+/// live visual capture is integrated into the session.
+enum AdapterRegistry {
+    /// The adapter for a detected app, or nil if that app has no visual adapter yet (the
+    /// session then runs audio-only — graceful degradation). Every case below currently returns one.
+    static func adapter(for app: CallDetector.DetectedApp) -> (any AppAdapter)? {
+        switch app { // exhaustive, no `default`: a new DetectedApp case will not compile until it is wired here
+        case .signal: return SignalAdapter()
+        case .meet: return MeetAdapter()
+        case .zoom: return ZoomAdapter()
+        case .teams: return TeamsAdapter()
+        }
+    }
+}
diff --git a/Ten31Transcripts/Adapters/MeetAdapter.swift b/Ten31Transcripts/Adapters/MeetAdapter.swift
new file mode 100644
index 0000000..f6cfbcc
--- /dev/null
+++ b/Ten31Transcripts/Adapters/MeetAdapter.swift
@@ -0,0 +1,45 @@
+import Foundation
+import CoreVideo
+
+/// Google Meet adapter (browser tab — capture is at the browser-window level).
+///
+/// Meet's active-speaker cue is a **coloured (Google-blue) ring/glow** around the
+/// speaking participant's tile, plus animated audio bars in the tile's mic chip.
+/// The participant **name sits in the tile's bottom-LEFT corner**, so the tile is
+/// estimated extending up and to the right of the name.
+///
+/// Detection *logic* is validated on synthetic frames; the geometry constants are a
+/// first pass and will be calibrated against real Meet screenshots. Meet runs in a
+/// browser, so there's no Accessibility name source we rely on — OCR only.
+struct MeetAdapter: AppAdapter {
+    // Browsers that can host a Meet tab. The window, not the app, is what we capture;
+    // CallDetector decides a browser window is a Meet call by its title.
+    static let bundleIDs = [
+        "com.google.Chrome", "org.mozilla.firefox", "com.apple.Safari",
+        "company.thebrowser.Browser", "com.brave.Browser", "com.microsoft.edgemac",
+        "com.google.Chrome.canary", "org.chromium.Chromium",
+    ]
+    let adapterVersion = "meet-0.1.0"
+    let preferredFPS = 3 // NOTE(review): presumably frames analysed per second — confirm against AppAdapter
+
+    private let analyzer: GridCallAnalyzer
+
+    init() {
+        var config = GridCallAnalyzer.Config()
+        config.nameAnchor = .bottomLeft // tile is estimated UP and to the RIGHT of the name
+        config.detectColoredBorder = true // Google-blue speaking ring
+        config.detectWhiteBorder = false // the white border is the cue SignalAdapter enables, not Meet's
+        config.tileExpandX = 3.0 // tile width ≈ name width × 3 (first pass)
+        config.tileExpandY = 5.0 // tile height ≈ name height × 5 (first pass)
+        self.analyzer = GridCallAnalyzer(config: config)
+    }
+
+    func analyze(frame: CVPixelBuffer, at t: TimeInterval) -> [SpeakerObservation] {
+        analyzer.analyze(pixelBuffer: frame, at: t)
+    }
+
+    // Exposed for fixture/synthetic tests (GridCallAnalyzerTests feeds it rendered CGImages).
+    func analyze(cgImage: CGImage, at t: TimeInterval) -> [SpeakerObservation] {
+        analyzer.analyze(cgImage: cgImage, at: t)
+    }
+}
diff --git a/Ten31Transcripts/Adapters/SignalAdapter.swift b/Ten31Transcripts/Adapters/SignalAdapter.swift
index f3d428d..96b45c1 100644
--- a/Ten31Transcripts/Adapters/SignalAdapter.swift
+++ b/Ten31Transcripts/Adapters/SignalAdapter.swift
@@ -19,7 +19,7 @@ struct SignalAdapter: AppAdapter {
         // with real fixtures. (Gotchas, per Signal source: NO border in 1:1 calls —
         // fall back to mic-VAD/audio pill — and in Speaker view the large tile is
         // the speaker; both handled at a higher level later.)
-        config.nameAtBottom = true
+        config.nameAnchor = .bottomCenter // same tile geometry the old `nameAtBottom = true` produced
         config.detectWhiteBorder = true
         config.detectColoredBorder = false
         config.tileExpandX = 2.4
diff --git a/Ten31Transcripts/Adapters/TeamsAdapter.swift b/Ten31Transcripts/Adapters/TeamsAdapter.swift
new file mode 100644
index 0000000..ca5d807
--- /dev/null
+++ b/Ten31Transcripts/Adapters/TeamsAdapter.swift
@@ -0,0 +1,38 @@
+import Foundation
+import CoreVideo
+
+/// Microsoft Teams adapter (native app: com.microsoft.teams2 / .teams).
+///
+/// Teams' active-speaker cue is a **coloured ring/border** (Teams violet) around
+/// the speaking participant's tile; the **name sits in the tile's bottom-left**.
+/// Not a launch priority — included so the four detected platforms all have a
+/// visual adapter and degrade no worse than colored-border detection.
+///
+/// Detection *logic* is validated on synthetic frames; geometry constants are a
+/// first pass pending real Teams screenshots.
+struct TeamsAdapter: AppAdapter {
+    static let bundleIDs = ["com.microsoft.teams2", "com.microsoft.teams"]
+    let adapterVersion = "teams-0.1.0"
+    let preferredFPS = 3 // NOTE(review): presumably frames analysed per second — confirm against AppAdapter
+
+    private let analyzer: GridCallAnalyzer
+
+    init() {
+        var config = GridCallAnalyzer.Config()
+        config.nameAnchor = .bottomLeft // tile is estimated UP and to the RIGHT of the name
+        config.detectColoredBorder = true // Teams-violet speaking ring
+        config.detectWhiteBorder = false // the white border is the cue SignalAdapter enables, not Teams'
+        config.tileExpandX = 3.0 // tile width ≈ name width × 3 (first pass)
+        config.tileExpandY = 5.0 // tile height ≈ name height × 5 (first pass)
+        self.analyzer = GridCallAnalyzer(config: config)
+    }
+
+    func analyze(frame: CVPixelBuffer, at t: TimeInterval) -> [SpeakerObservation] {
+        analyzer.analyze(pixelBuffer: frame, at: t)
+    }
+
+    // Exposed for fixture/synthetic tests; forwards a CGImage straight to the analyzer.
+    func analyze(cgImage: CGImage, at t: TimeInterval) -> [SpeakerObservation] {
+        analyzer.analyze(cgImage: cgImage, at: t)
+    }
+}
diff --git a/Ten31Transcripts/Adapters/ZoomAdapter.swift b/Ten31Transcripts/Adapters/ZoomAdapter.swift
new file mode 100644
index 0000000..757e507
--- /dev/null
+++ b/Ten31Transcripts/Adapters/ZoomAdapter.swift
@@ -0,0 +1,43 @@
+import Foundation
+import CoreVideo
+
+/// Zoom adapter (native app: us.zoom.xos).
+///
+/// Zoom's active-speaker cue is a **coloured border** around the speaking tile —
+/// by default a green/yellow outline (configurable in Zoom settings). The
+/// participant **name sits in the tile's bottom-LEFT corner**, so the tile is
+/// estimated extending up and to the right of the name.
+///
+/// Gotchas to calibrate against real fixtures later:
+/// - **Speaker view** shows one big tile; the active speaker fills it (no useful
+///   per-tile border) — handle by attributing speech to the large tile.
+/// - The self-view tile and screen-share change the layout.
+///
+/// Detection *logic* is validated on synthetic frames; geometry constants are a
+/// first pass pending real Zoom screenshots.
+struct ZoomAdapter: AppAdapter {
+    static let bundleIDs = ["us.zoom.xos"]
+    let adapterVersion = "zoom-0.1.0"
+    let preferredFPS = 3 // NOTE(review): presumably frames analysed per second — confirm against AppAdapter
+
+    private let analyzer: GridCallAnalyzer
+
+    init() {
+        var config = GridCallAnalyzer.Config()
+        config.nameAnchor = .bottomLeft // tile is estimated UP and to the RIGHT of the name
+        config.detectColoredBorder = true // green/yellow speaking border
+        config.detectWhiteBorder = false // the white border is the cue SignalAdapter enables, not Zoom's
+        config.tileExpandX = 3.0 // tile width ≈ name width × 3 (first pass)
+        config.tileExpandY = 5.0 // tile height ≈ name height × 5 (first pass)
+        self.analyzer = GridCallAnalyzer(config: config)
+    }
+
+    func analyze(frame: CVPixelBuffer, at t: TimeInterval) -> [SpeakerObservation] {
+        analyzer.analyze(pixelBuffer: frame, at: t)
+    }
+
+    // Exposed for fixture/synthetic tests (GridCallAnalyzerTests feeds it rendered CGImages).
+    func analyze(cgImage: CGImage, at t: TimeInterval) -> [SpeakerObservation] {
+        analyzer.analyze(cgImage: cgImage, at: t)
+    }
+}
diff --git a/Ten31Transcripts/Visual/GridCallAnalyzer.swift b/Ten31Transcripts/Visual/GridCallAnalyzer.swift
index eed54b7..006628e 100644
--- a/Ten31Transcripts/Visual/GridCallAnalyzer.swift
+++ b/Ten31Transcripts/Visual/GridCallAnalyzer.swift
@@ -13,10 +13,18 @@ import CoreImage
 /// Geometry (`Config`) is a first pass; tile expansion calibrates per app against
 /// real screenshot fixtures. Detection *logic* is validated on synthetic frames.
 struct GridCallAnalyzer {
+    /// Where the name label sits relative to its participant tile — drives how the
+    /// tile rect is estimated from the OCR'd name box.
+    enum NameAnchor {
+        case bottomCenter // Signal: centered footer; tile extends UP, centered on the name
+        case bottomLeft // Meet/Zoom/Teams: name in the bottom-left corner; tile extends UP and RIGHT
+        case center // name centered inside the tile (not selected by any adapter in this change)
+    }
+
     struct Config {
         var tileExpandX = 2.4 // tile width ≈ name width × this
         var tileExpandY = 4.8 // tile height ≈ name height × this
-        var nameAtBottom = true // Signal/most: name footer sits at the tile bottom
+        var nameAnchor: NameAnchor = .bottomCenter // default equals the old `nameAtBottom = true` geometry
         var detectColoredBorder = true
         var detectWhiteBorder = true
         var minTextConfidence: Float = 0.3
@@ -91,20 +99,27 @@ struct GridCallAnalyzer {
         return CGRect(x: box.minX * W, y: (1 - box.maxY) * H, width: box.width * W, height: box.height * H)
     }
 
-    /// Estimate the participant tile from the name label. With `nameAtBottom`, the
-    /// tile extends UP from the footer (Signal); otherwise it's centred on the name.
+    /// Estimate the participant tile from the name label, per the app's `nameAnchor`:
+    /// - `.bottomCenter` (Signal): tile extends UP from a centered footer.
+    /// - `.bottomLeft` (Meet/Zoom/Teams): name hugs the tile's bottom-left corner; the
+    ///   tile extends UP and to the RIGHT of it.
+    /// - `.center`: tile centered on the name. The result is clipped to the frame bounds.
     private func tileRect(_ box: CGRect, _ w: Int, _ h: Int) -> CGRect {
         let W = Double(w), H = Double(h)
         let name = pixelRect(box, w, h)
         let nw = name.width * config.tileExpandX
         let nh = name.height * config.tileExpandY
-        let cx = name.midX
         let rect: CGRect
-        if config.nameAtBottom {
+        switch config.nameAnchor { // exhaustive over NameAnchor, so `rect` is assigned exactly once
+        case .bottomCenter:
             let bottom = name.maxY + name.height * 0.3
-            rect = CGRect(x: cx - nw / 2, y: bottom - nh, width: nw, height: nh)
-        } else {
-            rect = CGRect(x: cx - nw / 2, y: name.midY - nh / 2, width: nw, height: nh)
+            rect = CGRect(x: name.midX - nw / 2, y: bottom - nh, width: nw, height: nh) // horizontally centered on the name
+        case .bottomLeft:
+            let bottom = name.maxY + name.height * 0.3 // same below-the-name margin as .bottomCenter
+            let left = name.minX - name.height * 0.4 // small left padding ≈ the corner gutter
+            rect = CGRect(x: left, y: bottom - nh, width: nw, height: nh) // grows rightwards from `left`, upwards from `bottom`
+        case .center:
+            rect = CGRect(x: name.midX - nw / 2, y: name.midY - nh / 2, width: nw, height: nh) // centered on the name in both axes
         }
         return rect.intersection(CGRect(x: 0, y: 0, width: W, height: H))
     }
diff --git a/Ten31TranscriptsTests/GridCallAnalyzerTests.swift b/Ten31TranscriptsTests/GridCallAnalyzerTests.swift
index 4d708bf..5207042 100644
--- a/Ten31TranscriptsTests/GridCallAnalyzerTests.swift
+++ b/Ten31TranscriptsTests/GridCallAnalyzerTests.swift
@@ -61,4 +61,67 @@ final class GridCallAnalyzerTests: XCTestCase {
         XCTAssertEqual(speaking.first?.bbox.midX ?? 0, 210, accuracy: 160)
         XCTAssertEqual(speaking.first?.bbox.midY ?? 0, 435, accuracy: 160)
     }
+
+    // MARK: - Coloured-border apps (Meet / Zoom / Teams): name in bottom-LEFT corner.
+
+    private func leftText(_ s: String, _ ctx: CGContext, leftBaseline: CGPoint, size: CGFloat) {
+        let font = CTFontCreateWithName("Helvetica-Bold" as CFString, size, nil)
+        let attrs = [kCTFontAttributeName: font,
+                     kCTForegroundColorAttributeName: CGColor(red: 1, green: 1, blue: 1, alpha: 1)] as CFDictionary
+        let line = CTLineCreateWithAttributedString(CFAttributedStringCreate(nil, s as CFString, attrs)!)
+        ctx.textPosition = leftBaseline // text starts here (left-aligned), in the context's bottom-left-origin space
+        CTLineDraw(line, ctx)
+    }
+
+    private func coloredFrame(speakingIndex: Int, border: CGColor) -> CGImage {
+        let W = 900, H = 640 // synthetic frame size in pixels
+        let ctx = CGContext(data: nil, width: W, height: H, bitsPerComponent: 8, bytesPerRow: 0,
+                            space: CGColorSpaceCreateDeviceRGB(),
+                            bitmapInfo: CGImageAlphaInfo.premultipliedLast.rawValue)!
+        ctx.setFillColor(CGColor(red: 0.1, green: 0.1, blue: 0.12, alpha: 1))
+        ctx.fill(CGRect(x: 0, y: 0, width: W, height: H))
+        let rects: [(String, CGRect)] = [ // 2×2 grid of 380×250 tiles; y = 340 is the upper row (bottom-left origin)
+            ("GRANT", CGRect(x: 40, y: 340, width: 380, height: 250)),
+            ("SARAH", CGRect(x: 480, y: 340, width: 380, height: 250)),
+            ("DMITRI", CGRect(x: 40, y: 50, width: 380, height: 250)),
+            ("ALEX", CGRect(x: 480, y: 50, width: 380, height: 250)),
+        ]
+        for (i, (name, rect)) in rects.enumerated() {
+            ctx.setFillColor(CGColor(red: 0.18, green: 0.18, blue: 0.2, alpha: 1)); ctx.fill(rect)
+            if i == speakingIndex {
+                ctx.setStrokeColor(border); ctx.setLineWidth(8)
+                ctx.stroke(rect.insetBy(dx: 4, dy: 4)) // inset by half the line width so the stroke stays inside the tile
+            }
+            // Name in the bottom-LEFT corner (bottom-left origin: near minX/minY).
+            leftText(name, ctx, leftBaseline: CGPoint(x: rect.minX + 16, y: rect.minY + 16), size: 40)
+        }
+        return ctx.makeImage()!
+    }
+
+    func testMeetPicksBlueBorderedSpeaker() {
+        let blue = CGColor(red: 0.16, green: 0.45, blue: 0.95, alpha: 1)
+        for (idx, name) in ["GRANT", "SARAH", "DMITRI", "ALEX"].enumerated() {
+            let obs = MeetAdapter().analyze(cgImage: coloredFrame(speakingIndex: idx, border: blue), at: 0)
+            let speaking = Set(obs.filter { $0.speaking }.map { $0.name })
+            XCTAssertEqual(speaking, [name], "Meet: only \(name) should be speaking")
+        }
+        // No border → no speaker (speakingIndex -1 matches no tile, so nothing is stroked).
+        let none = MeetAdapter().analyze(cgImage: coloredFrame(speakingIndex: -1, border: blue), at: 0)
+        XCTAssertTrue(none.filter { $0.speaking }.isEmpty)
+    }
+
+    func testZoomPicksGreenBorderedSpeaker() {
+        let green = CGColor(red: 0.2, green: 0.85, blue: 0.3, alpha: 1)
+        let obs = ZoomAdapter().analyze(cgImage: coloredFrame(speakingIndex: 3, border: green), at: 0) // ALEX
+        let speaking = Set(obs.filter { $0.speaking }.map { $0.name })
+        XCTAssertEqual(speaking, ["ALEX"])
+    }
+
+    func testWhiteBorderDetectorIgnoresColouredBorder() {
+        // Signal looks only for the white border, so a coloured (Meet) border must
+        // not register as a Signal speaker.
+        let blue = CGColor(red: 0.16, green: 0.45, blue: 0.95, alpha: 1)
+        let obs = SignalAdapter().analyze(cgImage: coloredFrame(speakingIndex: 0, border: blue), at: 0)
+        XCTAssertTrue(obs.filter { $0.speaking }.isEmpty)
+    }
 }