This repository has been archived by the owner on May 6, 2022. It is now read-only.

Commit
Merge branch 'master' into ndw-fixes
noelweichbrodt committed Dec 22, 2020
2 parents b8b0892 + 9737434 commit ab69b5e
Showing 7 changed files with 60 additions and 100 deletions.
5 changes: 3 additions & 2 deletions Spokestack/Error.swift
@@ -68,11 +68,12 @@ public enum VADError: Error, Equatable, LocalizedError {
}
}

/// Errors thrown by implementors of the WakewordRecognizer protocol.
/// Errors thrown by command models.
/// - SeeAlso: `TFLiteWakewordRecognizer`, `TFLiteKeywordRecognizer`
public enum CommandModelError: Error, Equatable, LocalizedError {
/// The command recognizer was unable to configure the recognizer model(s).
case model(String)
/// The rcommand ecognizer encountered an error during the processing of the audio frame.
/// The command recognizer encountered an error during the processing of the audio frame.
case process(String)
/// The command recognizer encountered an error during the configuration or running of the filter model.
case filter(String)
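For context, a minimal sketch of consuming these errors from a pipeline failure callback, hedged against the cases visible in this commit (model, process, filter, plus the encode and detect cases used later in this diff):

```swift
// Minimal sketch: routing a CommandModelError delivered to a failure
// callback. Covers only the cases visible in this commit.
func handleRecognizerError(_ error: Error) {
    switch error {
    case CommandModelError.model(let message):
        print("recognizer model configuration failed: \(message)")
    case CommandModelError.process(let message):
        print("audio frame processing failed: \(message)")
    case CommandModelError.filter(let message):
        print("filter model failed: \(message)")
    case CommandModelError.encode(let message):
        print("encode model failed: \(message)")
    case CommandModelError.detect(let message):
        print("detect model failed: \(message)")
    default:
        print("unrelated error: \(error)")
    }
}
```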
9 changes: 4 additions & 5 deletions Spokestack/RingBuffer.swift
@@ -10,7 +10,7 @@ import Foundation

/// A simple circular buffer of values.
final class RingBuffer <T> {

// MARK: Public (properties)

/// The number of empty spaces remaining in the RingBuffer until it's full.
@@ -81,11 +81,10 @@ final class RingBuffer <T> {
func read() throws -> T {
if self.isEmpty {
throw RingBufferStateError.illegalState(message: "ring buffer is empty")
} else {
let value: T = self.data[self.rpos]
self.rpos = self.pos(self.rpos + 1)
return value
}
let value: T = self.data[self.rpos]
self.rpos = self.pos(self.rpos + 1)
return value
}

/// Writes the next value to the buffer.
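The simplified read() drops the redundant else branch: the throw already exits the function. A usage sketch, assuming the buffer starts empty (RingBuffer is internal, so this belongs in the module or a test target):

```swift
// Usage sketch for RingBuffer: fill to capacity, drain in write order,
// then observe the empty-state error. Assumes an initially empty buffer.
let buffer = RingBuffer<Float>(4, repeating: 0.0)
do {
    for sample in [0.1, 0.2, 0.3, 0.4] as [Float] {
        try buffer.write(sample)      // fill to capacity
    }
    while !buffer.isEmpty {
        print(try buffer.read())      // read back in write order
    }
    _ = try buffer.read()             // throws: ring buffer is empty
} catch {
    print("buffer state error: \(error)")
}
```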
11 changes: 3 additions & 8 deletions Spokestack/SpeechConfiguration.swift
@@ -21,9 +21,11 @@ import Foundation
public var fftWindowType: SignalProcessing.FFTWindowType = .hann
/// The desired linear Root Mean Squared (RMS) signal energy, which is used for signal normalization and should be tuned to the RMS target used during wakeword model training.
/// - SeeAlso: `TFLiteWakewordRecognizer`
@available(*, deprecated, message: "RMS normalization is no longer used during wakeword recognition.")
@objc public var rmsTarget: Float = 0.08
/// The Exponentially Weighted Moving Average (EWMA) update rate for the current Root Mean Squared (RMS) signal energy (0 for no RMS normalization).
/// - SeeAlso: `TFLiteWakewordRecognizer`
@available(*, deprecated, message: "RMS normalization is no longer used during wakeword recognition.")
@objc public var rmsAlpha: Float = 0.0
/// The size of the signal window used to calculate the STFT, in number of samples - should be a power of 2 for maximum efficiency.
/// - SeeAlso: `TFLiteWakewordRecognizer`
@@ -156,17 +158,11 @@ import Foundation
/// - Remark: ex: "yes,no"
/// - Warning: cannot contain spaces
/// - SeeAlso: `TFLiteKeywordRecognizer`
@objc public var keywords: String = "bed,bird,cat,dog,down,eight,five,four,go,happy,house,left,marvin,nine,no,off,on,one,right,seven,sheila,six,stop,three,tree,two,up,wow,yes,zero"
@objc public var keywords: String = ""
/// The name of the window function to apply to each audio frame before calculating the STFT.
/// - Remark: Currently the "hann" window is supported.
/// - SeeAlso: `TFLiteWakewordRecognizer`
public var keywordFFTWindowType: SignalProcessing.FFTWindowType = .hann
/// The desired linear Root Mean Squared (RMS) signal energy, which is used for signal normalization and should be tuned to the RMS target used during wakeword model training.
/// - SeeAlso: `TFLiteWakewordRecognizer`
@objc public var keywordRMSTarget: Float = 0.08
/// The Exponentially Weighted Moving Average (EWMA) update rate for the current Root Mean Squared (RMS) signal energy (0 for no RMS normalization).
/// - SeeAlso: `TFLiteWakewordRecognizer`
@objc public var keywordRMSAlpha: Float = 0.0
/// The size of the signal window used to calculate the STFT, in number of samples - should be a power of 2 for maximum efficiency.
/// - SeeAlso: `TFLiteWakewordRecognizer`
@objc public var keywordFFTWindowSize: Int = 512
@@ -185,5 +181,4 @@ import Foundation
/// The length of the sliding window of encoder output used as an input to the wakeword recognizer classifier, in milliseconds.
/// - SeeAlso: `TFLiteWakewordRecognizer`
@objc public var keywordEncodeLength: Int = 920
/// The threshold of the wakeword recognizer classifier's posterior output, above which the wakeword recognizer activates the pipeline, in the range [0, 1].
}
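With the keywords default now empty, clients must supply their own class list, and it must match the detect model's outputs (see the check added in TFLiteKeywordRecognizer below). A configuration sketch with placeholder model paths:

```swift
// Sketch: keyword recognition now requires an explicit keyword list.
// The model paths are hypothetical placeholders, not shipped assets.
let config = SpeechConfiguration()
config.keywords = "yes,no,stop,go"                      // comma-separated, no spaces
config.keywordFilterModelPath = "path/to/filter.tflite"
config.keywordEncodeModelPath = "path/to/encode.tflite"
config.keywordDetectModelPath = "path/to/detect.tflite"
```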
86 changes: 37 additions & 49 deletions Spokestack/TFLiteKeywordRecognizer.swift
@@ -10,17 +10,16 @@ import Foundation
import TensorFlowLite

@objc public class TFLiteKeywordRecognizer: NSObject {

/// Configuration for the recognizer.
@objc public var configuration: SpeechConfiguration

/// Global state for the speech pipeline.
@objc public var context: SpeechContext

private var isActive: Bool = false
private var activation = 0

private var rmsValue: Float = 0.0
private var prevSample: Float = 0.0
private var sampleWindow: RingBuffer<Float>!
private var fftFrame: Array<Float> = []
@@ -31,7 +30,7 @@ import TensorFlowLite
private var encodeState: RingBuffer<Float>!
private var encodeWindow: RingBuffer<Float>!
private var classes: [String] = []

// TensorFlowLite models
private var filterModel: Interpreter?
private var encodeModel: Interpreter?
@@ -40,14 +39,12 @@
case encode
case state
}



private var sampleCollector: Array<Float>?
private var fftFrameCollector: String?
private var filterCollector: Array<Float>?
private var encodeCollector: Array<Float>?



/// Initializes a TFLiteKeywordRecognizer instance.
///
/// A recognizer is initialized by, and receives `startStreaming` and `stopStreaming` events from, an instance of `SpeechPipeline`.
@@ -64,7 +61,6 @@ import TensorFlowLite
}

private func initialize(_ c: SpeechConfiguration) {
self.rmsValue = c.keywordRMSTarget
self.sampleWindow = RingBuffer(c.keywordFFTWindowSize, repeating: 0.0)
self.fftFrame = Array(repeating: 0.0, count: c.keywordFFTWindowSize)
self.fftWindow = SignalProcessing.fftWindowDispatch(windowType: c.keywordFFTWindowType, windowLength: c.keywordFFTWindowSize)
@@ -98,7 +94,7 @@ import TensorFlowLite
self.encodeModel = try Interpreter(modelPath: c.keywordEncodeModelPath)
if let model = self.encodeModel {
try model.allocateTensors()
if (model.inputTensorCount != Tensors.allCases.count) {
if model.inputTensorCount != Tensors.allCases.count {
throw CommandModelError.encode("Keyword encode model input dimension is \(model.inputTensorCount) which does not match the expected dimension \(Tensors.allCases.count)")
}
} else {
@@ -108,6 +104,9 @@
self.detectModel = try Interpreter(modelPath: c.keywordDetectModelPath)
if let model = self.detectModel {
try model.allocateTensors()
if model.outputTensorCount != self.classes.count {
throw CommandModelError.detect("The number of keywords defined by SpeechConfiguration.keywords does not match the number of keywords detected by SpeechConfiguration.keywordDetectModelPath.")
}
} else {
throw CommandModelError.detect("\(c.keywordDetectModelPath) could not be initialized")
}
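The added outputTensorCount check enforces a one-to-one pairing between configured keywords and detect-model outputs. The invariant in isolation:

```swift
// Sketch of the invariant enforced above, with illustrative values:
// the comma-separated keyword list must yield exactly one class per
// detect-model output tensor.
let classes = "yes,no,stop,go".split(separator: ",")
let outputTensorCount = 4   // in practice, read from the detect Interpreter
precondition(classes.count == outputTensorCount,
             "keywords must match detect model outputs one-to-one")
```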
@@ -117,20 +116,13 @@
}

private func sample(_ frame: Data) throws {
// Preallocate an array of data elements in the frame for use in RMS and sampling
// Preallocate an array of data elements in the frame for use in sampling
let elements: Array<Int16> = frame.elements()

// Update the rms normalization factors
// Maintain an ewma of the rms signal energy for speech samples
if self.configuration.keywordRMSAlpha > 0 {
self.rmsValue = self.configuration.keywordRMSAlpha * SignalProcessing.rms(frame, elements) + (1 - self.configuration.keywordRMSAlpha) * self.rmsValue
}

// Process all samples in the frame
for e in elements {
// Normalize and clip the 16-bit sample to the target rms energy
// Normalize and clip the 16-bit sample
var sample: Float = Float(e) / Float(Int16.max)
sample = sample * (self.configuration.keywordRMSTarget / self.rmsValue)
sample = max(-1.0, min(sample, 1.0))

// Run a pre-emphasis filter to balance high frequencies and eliminate any dc energy
Expand All @@ -148,7 +140,11 @@ import TensorFlowLite
// - advance the sample sliding window
try self.sampleWindow.write(sample)
if self.sampleWindow.isFull {
try self.analyze()
if self.isActive {
try self.analyze()
}
// rewind the sample window for another run
self.sampleWindow.rewind().seek(self.hopLength)
}
}
}
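With RMS normalization removed, the remaining per-sample transform is scale-to-[-1, 1], clipping, and the pre-emphasis filter. A standalone sketch of that filter (the coefficient is illustrative; Spokestack reads it from configuration):

```swift
// Standalone sketch of the pre-emphasis step in sample():
// y[n] = x[n] - a * x[n-1], a one-tap high-pass filter that removes
// DC energy and balances high frequencies before the STFT.
// The coefficient 0.97 is illustrative.
var prevSample: Float = 0.0
let preEmphasis: Float = 0.97

func preEmphasize(_ x: Float) -> Float {
    let y = x - preEmphasis * prevSample
    prevSample = x
    return y
}
```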
@@ -163,14 +159,10 @@
// Compute the stft spectrogram
self.fft.forward(&self.fftFrame)

// rewind the sample window for another run
self.sampleWindow.rewind().seek(self.hopLength)

if self.configuration.tracing.rawValue <= Trace.Level.DEBUG.rawValue {
self.fftFrameCollector? += "\(self.fftFrame)\n"
}

// filter() will utilize the stft spectrogram as input for the filter model
try self.filter()
}
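Note that rewind().seek(hopLength) moved from analyze() into sample(), so the window now advances even when the recognizer is inactive and analyze() is skipped. The resulting frame overlap, for reference:

```swift
// Overlap arithmetic for the sliding STFT window (illustrative values):
// each full window is rewound and advanced by hopLength, so consecutive
// frames share (windowSize - hopLength) samples.
let windowSize = 512          // keywordFFTWindowSize
let hopLength = 160           // e.g. 10 ms of samples at 16 kHz
let overlap = windowSize - hopLength
print("consecutive frames overlap by \(overlap) samples")   // 352
```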

Expand Down Expand Up @@ -324,34 +316,30 @@ extension TFLiteKeywordRecognizer : SpeechProcessor {
public func process(_ frame: Data) {
audioProcessingQueue.async {[weak self] in
guard let strongSelf = self else { return }
if strongSelf.context.isActive {
do {
do {
if strongSelf.context.isActive {
// sample every frame while active
try strongSelf.sample(frame)
if !strongSelf.isActive {
strongSelf.isActive = true
} else if
(strongSelf.isActive
&& strongSelf.activation <= strongSelf.configuration.wakeActiveMax)
||
strongSelf.activation <= strongSelf.configuration.wakeActiveMin {
// already sampled, but in the midst of activation, so don't deactivate yet
strongSelf.activation += strongSelf.configuration.frameWidth
} else {
strongSelf.run()
strongSelf.deactivate()
}
} else if strongSelf.isActive {
try strongSelf.sample(frame)
} catch let error {
strongSelf.context.dispatch { $0.failure(error: error) }
}
// sample every frame while active
if !strongSelf.isActive {
strongSelf.isActive = true
} else if
(strongSelf.isActive
&& strongSelf.activation <= strongSelf.configuration.wakeActiveMax)
||
strongSelf.activation <= strongSelf.configuration.wakeActiveMin {
// already sampled, but in the midst of activation, so don't deactivate yet
strongSelf.activation += strongSelf.configuration.frameWidth
} else {
strongSelf.run()
strongSelf.deactivate()
}
} else if strongSelf.isActive {
do {
try strongSelf.sample(frame)
} catch let error {
strongSelf.context.dispatch { $0.failure(error: error) }
}
strongSelf.run()
strongSelf.deactivate()
} catch let error {
strongSelf.context.dispatch { $0.failure(error: error) }
}
}
}
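The rewritten process() collapses the active path into a single do/catch; its deactivation condition is effectively a bounded timeout, sketched here with illustrative durations:

```swift
// Sketch of the activation-window logic in process(): once active, the
// recognizer keeps sampling and accumulating frame widths until the
// minimum window has elapsed and the maximum is exceeded, then it runs
// classification and deactivates. Durations in ms are illustrative.
var activation = 0
let frameWidth = 20
let wakeActiveMin = 2_000
let wakeActiveMax = 5_000

func stillActivating() -> Bool {
    if activation <= wakeActiveMax || activation <= wakeActiveMin {
        activation += frameWidth    // stay active; extend the window
        return true
    }
    return false                    // time to run() and deactivate()
}
```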
28 changes: 8 additions & 20 deletions Spokestack/TFLiteWakewordRecognizer.swift
@@ -59,9 +59,6 @@ import TensorFlowLite
private var fft: FFT!

// Audio Signal Normalization
private var rmsAlpha: Float = 0.0
private var rmsTarget: Float = 0.0
private var rmsValue: Float = 0.0
private var preEmphasis: Float = 0.0
private var prevSample: Float = 0.0

@@ -164,9 +161,6 @@
}

// Signal normalization
self.rmsAlpha = c.rmsAlpha
self.rmsTarget = c.rmsTarget
self.rmsValue = self.rmsTarget
self.preEmphasis = c.preEmphasis

// Sliding window buffers
@@ -195,21 +189,14 @@

private func sample(_ data: Data) throws -> Void {

// Preallocate an array of data elements in the frame for use in RMS and sampling
// Preallocate an array of data elements in the frame for use in sampling
let dataElements: Array<Int16> = data.elements()

// Update the rms normalization factors
// Maintain an ewma of the rms signal energy for speech samples
if self.rmsAlpha > 0 {
self.rmsValue = self.rmsAlpha * SignalProcessing.rms(data, dataElements) + (1 - self.rmsAlpha) * self.rmsValue
}

// Process all samples in the frame
for d in dataElements {

// Normalize and clip the 16-bit sample to the target rms energy
// Normalize and clip the 16-bit sample
var sample: Float = Float(d) / Float(Int16.max)
sample = sample * (self.rmsTarget / self.rmsValue)
sample = max(-1.0, min(sample, 1.0))

// Run a pre-emphasis filter to balance high frequencies and eliminate any dc energy
Expand All @@ -227,7 +214,11 @@ import TensorFlowLite
// - advance the sample sliding window
try self.sampleWindow.write(sample)
if self.sampleWindow.isFull {
try self.analyze()
if self.isSpeechDetected {
try self.analyze()
}
// rewind the sample window for another run
self.sampleWindow.rewind().seek(self.hopLength)
}
}
}
@@ -243,9 +234,6 @@ import TensorFlowLite
// Compute the stft spectrogram
self.fft.forward(&self.fftFrame)

// rewind the sample window for another run
self.sampleWindow.rewind().seek(self.hopLength)

if self.traceLevel.rawValue <= Trace.Level.DEBUG.rawValue {
self.fftFrameCollector? += "\(self.fftFrame)\n"
}
@@ -450,7 +438,7 @@ extension TFLiteWakewordRecognizer : SpeechProcessor {
} catch let error {
strongSelf.context.dispatch { $0.failure(error: error) }
}
// vad detection edge
// vad detection edge
} else if strongSelf.isSpeechDetected {
strongSelf.reset()
}
15 changes: 4 additions & 11 deletions SpokestackFrameworkExample/TFLiteKeywordViewController.swift
@@ -44,15 +44,7 @@ class TFLiteKeywordViewController: UIViewController {
return button
}()

lazy private var pipeline: SpeechPipeline = {
return try! SpeechPipelineBuilder()
.addListener(self)
.useProfile(.vadTriggerSpokestackSpeech)
.setProperty("tracing", ".DEBUG")
.setProperty("vadFallDelay", "1600")
.setDelegateDispatchQueue(DispatchQueue.main)
.build()
}()
private var pipeline: SpeechPipeline?

override func loadView() {

@@ -104,19 +96,20 @@
.useProfile(.vadTriggerKeyword)
.setProperty("tracing", Trace.Level.PERF)
.setProperty("keywordDetectModelPath", detectPath)
.setProperty("keywords", "bed,bird,cat,dog,down,eight,five,four,go,happy,house,left,marvin,nine,no,off,on,one,right,seven,sheila,six,stop,three,tree,two,up,wow,yes,zero")
.setProperty("keywordEncodeModelPath", encodePath)
.setProperty("keywordFilterModelPath", filterPath)
.build()
}

@objc func startRecordingAction(_ sender: Any) {
print("pipeline started")
self.pipeline.start()
self.pipeline?.start()
}

@objc func stopRecordingAction(_ sender: Any) {
print("pipeline finished")
self.pipeline.stop()
self.pipeline?.stop()
}

@objc func dismissViewController(_ sender: Any?) -> Void {
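The controller now holds an optional pipeline built during view setup instead of a try!-backed lazy property, so the start/stop actions use optional chaining and silently no-op if the build failed. An alternative sketch, not the committed code, that surfaces the failure:

```swift
// Alternative sketch, not the committed code: inside the view controller,
// surface a missing pipeline instead of silently no-op'ing via "?.".
@objc func startRecordingAction(_ sender: Any) {
    guard let pipeline = self.pipeline else {
        print("pipeline was not built; cannot start")
        return
    }
    print("pipeline started")
    pipeline.start()
}
```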
6 changes: 1 addition & 5 deletions SpokestackTests/TFLiteKeywordRecognizerTest.swift
@@ -23,11 +23,7 @@ class TFLiteKeywordRecognizerTest: XCTestCase {
self.config.keywordFilterModelPath = MockKeywordModels.filterPath
self.config.keywordDetectModelPath = MockKeywordModels.detectPath
self.context = SpeechContext(config)
//self.config.wakeActiveMin = 20
//self.config.wakeActiveMax = 100
//self.config.keywordMelFrameWidth = 1
self.config.keywordMelFrameLength = 16
//self.config.keywordEncodeLength = 1
self.recognizer = TFLiteKeywordRecognizer(config, context: self.context!)
}

@@ -156,7 +152,7 @@ class TFLiteKeywordecognizerTestDelegate: SpokestackDelegate {
func failure(error: Error) {
print(error)
guard let _ = asyncExpectation else {
XCTFail("TFLiteKeywordecognizerTestDelegate was not setup correctly. Missing XCTExpectation reference")
XCTFail("TFLiteKeywordRecognizerTestDelegate was not setup correctly. Missing XCTExpectation reference")
return
}
self.didError = true
