Upgrade to Pro — share decks privately, control downloads, hide ads and more …

iOSDC2023:聴いて話すiOS 現実世界の「音」との連携

たまねぎ
September 02, 2023

iOSDC2023:聴いて話すiOS 現実世界の「音」との連携

たまねぎ

September 02, 2023
Tweet

More Decks by たまねぎ

Other Decks in Programming

Transcript

  1. • Siri • FaceTime • Shazam • ϊΠζݕ஌ • Voice

    Control • VoiceOver • Ի੠ೖྗ • Ի੠ಡΈ্͛ • Ի੠ϝϞ ࣮͸ଟ༷ͳඪ४ػೳ • AVFoundation • CallKit • Core Audio • ShazamKit • SiriKit • SoundAnalysis • Speech ਐԽ͢ΔFramework
  2. ࣄલ४උ̍ɿAudioEngineͷϔϧύʔ struct AudioEngine { private let audioEngine = AVAudioEngine() func

    start( bufferSize: AVAudioFrameCount, handler: @escaping (AVAudioPCMBuffer, AVAudioTime) -> Void ) throws { // Ի੠ೝࣝͷ৔߹ͷΦʔσΟΦઃఆ let audioSession = AVAudioSession.sharedInstance() try audioSession.setCategory(.record, mode: .measurement, options: []) try audioSession.setActive(true) // Ի੠ೖྗͷ४උ audioEngine.inputNode.installTap( onBus: 0, bufferSize: bufferSize, format: audioEngine.inputNode.outputFormat(forBus: 0), block: handler ) audioEngine.prepare() try audioEngine.start() } }
  3. Ի੠ೝࣝؔ࿈ͷΠϯελϯεͱϦΫΤετΛ࡞੒ class Transcriptor: ObservableObject { // Ի੠ೝࣝΛ࣮ߦ͢ΔΠϯελϯε private let speechRecognizer

    = SFSpeechRecognizer(locale: Locale(identifier: "ja-JP"))! // ϦΞϧλΠϜೖྗͰ͸ͳ͘ϑΝΠϧೖྗͷ৔߹͸ɺSFSpeechURLRecognitionRequestΛ࢖͏ private let request = SFSpeechAudioBufferRecognitionRequest() private let audioEngine = AudioEngine() // ೝࣝ݁ՌΛར༻ଆʹ௨஌ @Published private(set) var bestTranscription: SFTranscription? init() { // ϦΫΤετͷηοτΞοϓΛߦ͏ if speechRecognizer.supportsOnDeviceRecognition { // ΦϯσόΠε࣮ߦɿར༻੍ݶ͕ͳ͘ɺϓϥΠόγʔ΋อͨΕΔ // αʔόʔ࣮ߦɿར༻੍ݶ͕͋ΓɺσόΠε֎ʹσʔλ͕ૹ৴͞ΕΔ͕ɺਫ਼౓͕ߴ͍ request.requiresOnDeviceRecognition = true } request.shouldReportPartialResults = true } }
  4. ϨίʔσΟϯάͷ։࢝ func startRecording() async throws { // ར༻Մ൱ͷνΣοΫ guard case

    .authorized = await withCheckedContinuation({ SFSpeechRecognizer.requestAuthorization($0.resume(returning:)) }), speechRecognizer.isAvailable else { throw TranscriptorError.unavailable } // ΦʔσΟΦΤϯδϯͷىಈ try audioEngine.start(bufferSize: 2048) { buffer, _ in // όοϑΝαΠζ͝ͱͷσʔλΛೝࣝϦΫΤετʹ௥Ճ͍ͯ͘͠ request.append(buffer) } // ೝࣝॲཧͷ։࢝ speechRecognizer.recognitionTask(with: request) { result, _ in DispatchQueue.main.async { self.bestTranscription = result?.bestTranscription } } }
  5. DEMOɿೝࣝͨ͠Ի੠Λදࣔ struct ContentView: View { @StateObject private var transcriptor =

    Transcriptor() private var recognizedText: String? { // Ի੠ೝࣝ݁Ռ͔ΒϑΥʔϚοτࡁΈจࣈྻΛऔಘ transcriptor.bestTranscription?.formattedString } var body: some View { VStack { if let recognizedText { Text(recognizedText) } RecordingButton { try await transcriptor.startRecording() } } } }
  6. • iOSͷςΩετԻ੠ม׵API • ϓϨʔϯςΩετ΍SSMLܗࣜͷσʔλΛೖྗʹͱΔ 
 ※ Speech Synthesis Markup LanguageʢԻ੠߹੒ϚʔΫΞοϓݴޠʣ

    • ൃ࿩ݴޠ΍εϐʔυͳͲΛίϯτϩʔϧՄೳ • iOS17͔Β͸ύʔιφϧϘΠεʹରԠʢӳޠͷΈʣ AVSpeechSynthesizer
  7. import AVFoundation // PlaneText͔ΒUtteranceͷ࡞੒ let utterance = AVSpeechUtterance(string: text) utterance.prefersAssistiveTechnologySettings

    = true // ΞγετઃఆҾܧ utterance.rate = 0.5 // εϐʔυ (0 ~ 1) utterance.pitchMultiplier = 1 // ϐον (0.5 ~ 2) utterance.volume = 1 // Իྔ (0 ~ 1) // SSML͔ΒUtteranceͷ࡞੒ let ssml = """ <speak> <prosody rate="fast" pitch="+2st" volume="loud"> ͜Μʹͪ͸ɺͨ·Ͷ͗Ͱ͢ʂ </prosody> </speak> """ let utterance = AVSpeechUtterance(ssmlRepresentation: ssml) // Voiceͷઃఆ utterance.voice = .init(language: "ja-JP") utterance.voice = .init(identifier: AVSpeechSynthesisVoiceIdentifierAlex) utterance.voice = AVSpeechSynthesisVoice.speechVoices().randomElement() • AVSpeechUtteranceʹ ൃ࿩σʔλΛηοτ • SSMLΛ࢖Θͳͯ͘ ΋ɺ֤छϓϩύςΟ͸ ઃఆՄೳ • AVSpeechSynthesisVoi ceΛηοτͯ͠ɺϏϧ τΠϯԻ੠ͷར༻΋Ͱ ͖Δ AVSpeechSynthesizerͰͷൃ࿩
  8. // AVSpeechSynthesizerͷ࡞੒ let synthesizer = AVSpeechSynthesizer() // ࠶ੜ synthesizer.speak(utterance) //

    Ұ࣌ఀࢭ synthesizer.pauseSpeaking(at: .immediate) synthesizer.pauseSpeaking(at: .word) // ࠶։ synthesizer.continueSpeaking() // ఀࢭ synthesizer.stopSpeaking(at: .immediate) synthesizer.stopSpeaking(at: .word) • AVSpeechSynthesizer ʹAVSpeechUtterance Λ౉ͯ͠ൃ࿩͢Δ • ίϯτϩʔϧ༻ͷAPI Λ׆༻ͯ͠ࡉ੍͔͍ޚ ΋Ͱ͖Δ AVSpeechSynthesizerͰͷൃ࿩
  9. DEMOɿೖྗͨ͠จࣈྻΛൃ࿩ class SpeechSynthesizer: ObservableObject { @Published var text = "͜Μʹͪ͸ɺͨ·Ͷ͗Ͱ͢ʂ"

    @Published var selectedVoice = AVSpeechSynthesisVoice .speechVoices() .first { $0.language == "ja-JP" }! @Published var rate: Float = 0.5 @Published var pitchMultiplier: Float = 1 @Published var volume: Float = 1 private let synthesizer: AVSpeechSynthesizer = { let s = AVSpeechSynthesizer() s.usesApplicationAudioSession = false return s }() var voices: [AVSpeechSynthesisVoice] { AVSpeechSynthesisVoice.speechVoices() } func speak() { let utterance = AVSpeechUtterance(string: text) utterance.voice = selectedVoice utterance.rate = rate utterance.pitchMultiplier = pitchMultiplier utterance.volume = volume synthesizer.speak(utterance) } }
  10. import ShazamKit class Matcher: ObservableObject { @Published private(set) var matchedItem:

    SHMatchedMediaItem? private let audioEngine = AudioEngine() func startMatching() async throws { let session = SHSession() // ϦΞϧλΠϜͳΦʔσΟΦϚονϯάͷ४උ try audioEngine.start(bufferSize: 2048) { buffer, audioTime in session.matchStreamingBuffer(buffer, at: audioTime) } // ೝࣝͨ͠ϝσΟΞΞΠςϜΛड͚औΔ for await case .match(let match) in session.results { await MainActor.run { matchedItem = match.mediaItems.first } } } } • ʢࣄલʹʣDeveloper ϙʔλϧͰAppService Λ௥Ճ • SHSessionΛ࡞੒ • ΦʔσΟΦσʔλΛ sessionʹྲྀ͠ࠐΉ • ΧλϩάσʔλͱϚο νͨ݁͠ՌΛऔಘ ָۂಛఆ
  11. SHMatchedMediaItemͷϓϩύςΟ܈ func explore(_ mediaItem: SHMatchedMediaItem) { mediaItem.title // λΠτϧ mediaItem.subtitle

    // αϒλΠτϧ mediaItem.artist // ΞʔςΟετ໊ mediaItem.artworkURL // ΞʔτϫʔΫURL mediaItem.genres // δϟϯϧͷ഑ྻ mediaItem.timeRanges // ࣌ؒൣғ mediaItem.matchOffset // ϚονՕॴ mediaItem.predictedCurrentMatchOffset // ݱࡏͷϚονՕॴͷ༧ଌ mediaItem.webURL // ShazamΧλϩάϖʔδ΁ͷϦϯΫ mediaItem.appleMusicID // AppleMusicID mediaItem.appleMusicURL // AppleMusicϖʔδ΁ͷϦϯΫ mediaItem.songs // MusicKitͷSongΦϒδΣΫτ // etc… }
  12. SHMatchedMediaItemͷ׆༻ // ָۂͷίϯτϩʔϧʹ͸ɺMusicKitΛ࢖͏ import MusicKit func play(_ mediaItem: SHMatchedMediaItem) async

    throws { guard case .authorized = await MusicAuthorization.request() else { return } // SHMatchedMediaItemͷAppleMusicؔ࿈ͷϓϩύςΟΛࢀর SystemMusicPlayer.shared.queue = .init(for: mediaItem.songs) try await SystemMusicPlayer.shared.play() }
  13. DEMOɿฉ͖औͬͨԻݯΛಛఆ struct ContentView: View { @StateObject private var matcher =

    Matcher() var body: some View { VStack(spacing: 56) { if let mediaItem = matcher.matchedItem { MatchedMediaItemView(mediaItem) } else if matcher.isActive { MatchedMediaItemView.loading() } RecordingButton { try? await matcher.startMatching() } }.padding() } }
  14. • Shazam CLIΛ࢖ͬͯ࡞ΕΔ • Իݯ͔ΒSignature (.shazamsignature)Λ࡞੒ • ೚ҙͷϝλσʔλ ΛؚΊͨϑΝΠϧ(.csv)Λ༻ҙ •

    SignatureͱCSVΛඥ෇͚ͯΧλϩάʢ.shazamcatalogʣΛ࡞੒ • Ի੠Ϛονϯά΍ΦϑηοτΛ༻͍ͨମݧߏஙʹ׆༻Մೳ ΧελϜΧλϩά
  15. SNAudioStreamAnalyzer class SoundAnalyzer: NSObject, ObservableObject { @Published private(set) var result:

    SNClassificationResult? private let audioEngine = AudioEngine() func startAnalyze() throws { // SNClassifySoundRequestΛ࡞੒ let request = try SNClassifySoundRequest(classifierIdentifier: .version1) // ϑΝΠϧೖྗͷ৔߹͸ɺSNAudioFileAnalyzerΛ࢖͏ let analyzer = SNAudioStreamAnalyzer(format: audioEngine.format) try analyzer.add(request, withObserver: self) try audioEngine.start(bufferSize: 2048) { buffer, time in // Ի੠σʔλΛྲྀ͜͠Ή analyzer.analyze(buffer, atAudioFramePosition: time.sampleTime) } } }
  16. SNAudioStreamAnalyzer extension SoundAnalyzer: SNResultsObserving { // ೝࣝ݁Ռ͕௨஌͞Εͯ͘Δ func request(_ request:

    SNRequest, didProduce result: SNResult) { DispatchQueue.main.async { self.result = result as? SNClassificationResult self.result?.classifications.first?.identifier // ೝࣝͨ͠Ի੠ͷϥϕϧ self.result?.classifications.first?.confidence // ೝࣝͨ͠Ի੠ͷ৴པ౓ } } }
  17. DEMOɿ ໐͍ͬͯΔָثΛಛఆ struct ContentView: View { // … @StateObject private

    var soundAnalyzer = SoundAnalyzer() var body: some View { ZStack(alignment: .bottom) { ScrollView { LazyVStack(spacing: 0) { // … BandImage(soundAnalyzer.result) } } RecordingButton { try? soundAnalyzer.startAnalyze() }.padding(.vertical) } } }
  18. ·ͱΊ • SFSpeechRecognizerɿݴޠೝࣝ • τϥϯεΫϦϓτ΍ɺݴޠΛϑοΫʹͨ͠ΞΫγϣϯ • AVSpeechSynthesizerɿൃ࿩ • Ի੠ग़ྗʹΑΔϑΟʔυόοΫ •

    ShazamKitɿϚονϯά • ࣄલʹΧλϩάͷ༻ҙ͕ඞཁ • ݴޠͰ͋Δඞཁ͕ແ͘ɺOffsetͷ׆༻΋Ͱ͖Δ • SoundAnalysisɿ෼ྨ • ϏϧτΠϯϞσϧͰ͋Ε͹͙͢ʹ׆༻Մೳ • ΧελϜͷϞσϧΛ༻ҙͯ͠ɺ೚ҙͷ෼ྨ΋૊ΈࠐΊΔ