This repository has been archived by the owner on May 6, 2022. It is now read-only.

Commit
Merge branch 'master' into ndw-fixes
noelweichbrodt committed Dec 22, 2020
2 parents b8b0892 + 9737434 commit ab69b5e
Showing 7 changed files with 60 additions and 100 deletions.
5 changes: 3 additions & 2 deletions Spokestack/Error.swift
@@ -68,11 +68,12 @@ public enum VADError: Error, Equatable, LocalizedError {
}
}

/// Errors thrown by implementors of the WakewordRecognizer protocol.
/// Errors thrown by command models.
/// - SeeAlso: `TFLiteWakewordRecognizer`, `TFLiteKeywordRecognizer`
public enum CommandModelError: Error, Equatable, LocalizedError {
/// The command recognizer was unable to configure the recognizer model(s).
case model(String)
/// The rcommand ecognizer encountered an error during the processing of the audio frame.
/// The command recognizer encountered an error during the processing of the audio frame.
case process(String)
/// The command recognizer encountered an error during the configuration or running of the filter model.
case filter(String)
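For context, a minimal sketch of consuming these errors from a pipeline failure callback, hedged against the cases visible in this commit (model, process, filter, plus the encode and detect cases used later in this diff):

```swift
// Minimal sketch: routing a CommandModelError delivered to a failure
// callback. Covers only the cases visible in this commit.
func handleRecognizerError(_ error: Error) {
    switch error {
    case CommandModelError.model(let message):
        print("recognizer model configuration failed: \(message)")
    case CommandModelError.process(let message):
        print("audio frame processing failed: \(message)")
    case CommandModelError.filter(let message):
        print("filter model failed: \(message)")
    case CommandModelError.encode(let message):
        print("encode model failed: \(message)")
    case CommandModelError.detect(let message):
        print("detect model failed: \(message)")
    default:
        print("unrelated error: \(error)")
    }
}
```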
9 changes: 4 additions & 5 deletions Spokestack/RingBuffer.swift
@@ -10,7 +10,7 @@ import Foundation

/// A simple circular buffer of values.
final class RingBuffer <T> {

// MARK: Public (properties)

/// The number of empty spaces remaining in the RingBuffer until it's full.
@@ -81,11 +81,10 @@ final class RingBuffer <T> {
func read() throws -> T {
if self.isEmpty {
throw RingBufferStateError.illegalState(message: "ring buffer is empty")
} else {
let value: T = self.data[self.rpos]
self.rpos = self.pos(self.rpos + 1)
return value
}
let value: T = self.data[self.rpos]
self.rpos = self.pos(self.rpos + 1)
return value
}

/// Writes the next value to the buffer.
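The simplified read() drops the redundant else branch: the throw already exits the function. A usage sketch, assuming the buffer starts empty (RingBuffer is internal, so this belongs in the module or a test target):

```swift
// Usage sketch for RingBuffer: fill to capacity, drain in write order,
// then observe the empty-state error. Assumes an initially empty buffer.
let buffer = RingBuffer<Float>(4, repeating: 0.0)
do {
    for sample in [0.1, 0.2, 0.3, 0.4] as [Float] {
        try buffer.write(sample)      // fill to capacity
    }
    while !buffer.isEmpty {
        print(try buffer.read())      // read back in write order
    }
    _ = try buffer.read()             // throws: ring buffer is empty
} catch {
    print("buffer state error: \(error)")
}
```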
11 changes: 3 additions & 8 deletions Spokestack/SpeechConfiguration.swift
@@ -21,9 +21,11 @@ import Foundation
public var fftWindowType: SignalProcessing.FFTWindowType = .hann
/// The desired linear Root Mean Squared (RMS) signal energy, which is used for signal normalization and should be tuned to the RMS target used during wakeword model training.
/// - SeeAlso: `TFLiteWakewordRecognizer`
@available(*, deprecated, message: "RMS normalization is no longer used during wakeword recognition.")
@objc public var rmsTarget: Float = 0.08
/// The Exponentially Weighted Moving Average (EWMA) update rate for the current Root Mean Squared (RMS) signal energy (0 for no RMS normalization).
/// - SeeAlso: `TFLiteWakewordRecognizer`
@available(*, deprecated, message: "RMS normalization is no longer used during wakeword recognition.")
@objc public var rmsAlpha: Float = 0.0
/// The size of the signal window used to calculate the STFT, in number of samples - should be a power of 2 for maximum efficiency.
/// - SeeAlso: `TFLiteWakewordRecognizer`
@@ -156,17 +158,11 @@ import Foundation
/// - Remark: ex: "yes,no"
/// - Warning: cannot contain spaces
/// - SeeAlso: `TFLiteKeywordRecognizer`
@objc public var keywords: String = "bed,bird,cat,dog,down,eight,five,four,go,happy,house,left,marvin,nine,no,off,on,one,right,seven,sheila,six,stop,three,tree,two,up,wow,yes,zero"
@objc public var keywords: String = ""
/// The name of the window function to apply to each audio frame before calculating the STFT.
/// - Remark: Currently the "hann" window is supported.
/// - SeeAlso: `TFLiteWakewordRecognizer`
public var keywordFFTWindowType: SignalProcessing.FFTWindowType = .hann
/// The desired linear Root Mean Squared (RMS) signal energy, which is used for signal normalization and should be tuned to the RMS target used during wakeword model training.
/// - SeeAlso: `TFLiteWakewordRecognizer`
@objc public var keywordRMSTarget: Float = 0.08
/// The Exponentially Weighted Moving Average (EWMA) update rate for the current Root Mean Squared (RMS) signal energy (0 for no RMS normalization).
/// - SeeAlso: `TFLiteWakewordRecognizer`
@objc public var keywordRMSAlpha: Float = 0.0
/// The size of the signal window used to calculate the STFT, in number of samples - should be a power of 2 for maximum efficiency.
/// - SeeAlso: `TFLiteWakewordRecognizer`
@objc public var keywordFFTWindowSize: Int = 512
@@ -185,5 +181,4 @@ import Foundation
/// The length of the sliding window of encoder output used as an input to the wakeword recognizer classifier, in milliseconds.
/// - SeeAlso: `TFLiteWakewordRecognizer`
@objc public var keywordEncodeLength: Int = 920
/// The threshold of the wakeword recognizer classifier's posterior output, above which the wakeword recognizer activates the pipeline, in the range [0, 1].
}
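With the keywords default now empty, clients must supply their own class list, and it must match the detect model's outputs (see the check added in TFLiteKeywordRecognizer below). A configuration sketch with placeholder model paths:

```swift
// Sketch: keyword recognition now requires an explicit keyword list.
// The model paths are hypothetical placeholders, not shipped assets.
let config = SpeechConfiguration()
config.keywords = "yes,no,stop,go"                      // comma-separated, no spaces
config.keywordFilterModelPath = "path/to/filter.tflite"
config.keywordEncodeModelPath = "path/to/encode.tflite"
config.keywordDetectModelPath = "path/to/detect.tflite"
```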
86 changes: 37 additions & 49 deletions Spokestack/TFLiteKeywordRecognizer.swift
@@ -10,17 +10,16 @@ import Foundation
import TensorFlowLite

@objc public class TFLiteKeywordRecognizer: NSObject {

/// Configuration for the recognizer.
@objc public var configuration: SpeechConfiguration

/// Global state for the speech pipeline.
@objc public var context: SpeechContext

private var isActive: Bool = false
private var activation = 0

private var rmsValue: Float = 0.0
private var prevSample: Float = 0.0
private var sampleWindow: RingBuffer<Float>!
private var fftFrame: Array<Float> = []
@@ -31,7 +30,7 @@ import TensorFlowLite
private var encodeState: RingBuffer<Float>!
private var encodeWindow: RingBuffer<Float>!
private var classes: [String] = []

// TensorFlowLite models
private var filterModel: Interpreter?
private var encodeModel: Interpreter?
@@ -40,14 +39,12 @@
case encode
case state
}



private var sampleCollector: Array<Float>?
private var fftFrameCollector: String?
private var filterCollector: Array<Float>?
private var encodeCollector: Array<Float>?



/// Initializes a TFLiteKeywordRecognizer instance.
///
/// A recognizer is initialized by, and receives `startStreaming` and `stopStreaming` events from, an instance of `SpeechPipeline`.
@@ -64,7 +61,6 @@ import TensorFlowLite
}

private func initialize(_ c: SpeechConfiguration) {
self.rmsValue = c.keywordRMSTarget
self.sampleWindow = RingBuffer(c.keywordFFTWindowSize, repeating: 0.0)
self.fftFrame = Array(repeating: 0.0, count: c.keywordFFTWindowSize)
self.fftWindow = SignalProcessing.fftWindowDispatch(windowType: c.keywordFFTWindowType, windowLength: c.keywordFFTWindowSize)
@@ -98,7 +94,7 @@ import TensorFlowLite
self.encodeModel = try Interpreter(modelPath: c.keywordEncodeModelPath)
if let model = self.encodeModel {
try model.allocateTensors()
if (model.inputTensorCount != Tensors.allCases.count) {
if model.inputTensorCount != Tensors.allCases.count {
throw CommandModelError.encode("Keyword encode model input dimension is \(model.inputTensorCount) which does not match the expected dimension \(Tensors.allCases.count)")
}
} else {
@@ -108,6 +104,9 @@
self.detectModel = try Interpreter(modelPath: c.keywordDetectModelPath)
if let model = self.detectModel {
try model.allocateTensors()
if model.outputTensorCount != self.classes.count {
throw CommandModelError.detect("The number of keywords defined by SpeechConfiguration.keywords does not match the number of keywords detected by SpeechConfiguration.keywordDetectModelPath.")
}
} else {
throw CommandModelError.detect("\(c.keywordDetectModelPath) could not be initialized")
}
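The added outputTensorCount check enforces a one-to-one pairing between configured keywords and detect-model outputs. The invariant in isolation:

```swift
// Sketch of the invariant enforced above, with illustrative values:
// the comma-separated keyword list must yield exactly one class per
// detect-model output tensor.
let classes = "yes,no,stop,go".split(separator: ",")
let outputTensorCount = 4   // in practice, read from the detect Interpreter
precondition(classes.count == outputTensorCount,
             "keywords must match detect model outputs one-to-one")
```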
@@ -117,20 +116,13 @@
}

private func sample(_ frame: Data) throws {
// Preallocate an array of data elements in the frame for use in RMS and sampling
// Preallocate an array of data elements in the frame for use in sampling
let elements: Array<Int16> = frame.elements()

// Update the rms normalization factors
// Maintain an ewma of the rms signal energy for speech samples
if self.configuration.keywordRMSAlpha > 0 {
self.rmsValue = self.configuration.keywordRMSAlpha * SignalProcessing.rms(frame, elements) + (1 - self.configuration.keywordRMSAlpha) * self.rmsValue
}

// Process all samples in the frame
for e in elements {
// Normalize and clip the 16-bit sample to the target rms energy
// Normalize and clip the 16-bit sample
var sample: Float = Float(e) / Float(Int16.max)
sample = sample * (self.configuration.keywordRMSTarget / self.rmsValue)
sample = max(-1.0, min(sample, 1.0))

// Run a pre-emphasis filter to balance high frequencies and eliminate any dc energy
Expand All @@ -148,7 +140,11 @@ import TensorFlowLite
// - advance the sample sliding window
try self.sampleWindow.write(sample)
if self.sampleWindow.isFull {
try self.analyze()
if self.isActive {
try self.analyze()
}
// rewind the sample window for another run
self.sampleWindow.rewind().seek(self.hopLength)
}
}
}
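With RMS normalization removed, the remaining per-sample transform is scale-to-[-1, 1], clipping, and the pre-emphasis filter. A standalone sketch of that filter (the coefficient is illustrative; Spokestack reads it from configuration):

```swift
// Standalone sketch of the pre-emphasis step in sample():
// y[n] = x[n] - a * x[n-1], a one-tap high-pass filter that removes
// DC energy and balances high frequencies before the STFT.
// The coefficient 0.97 is illustrative.
var prevSample: Float = 0.0
let preEmphasis: Float = 0.97

func preEmphasize(_ x: Float) -> Float {
    let y = x - preEmphasis * prevSample
    prevSample = x
    return y
}
```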
@@ -163,14 +159,10 @@
// Compute the stft spectrogram
self.fft.forward(&self.fftFrame)

// rewind the sample window for another run
self.sampleWindow.rewind().seek(self.hopLength)

if self.configuration.tracing.rawValue <= Trace.Level.DEBUG.rawValue {
self.fftFrameCollector? += "\(self.fftFrame)\n"
}

// filter() will utilize the stft spectrogram as input for the filter model
try self.filter()
}
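Note that rewind().seek(hopLength) moved from analyze() into sample(), so the window now advances even when the recognizer is inactive and analyze() is skipped. The resulting frame overlap, for reference:

```swift
// Overlap arithmetic for the sliding STFT window (illustrative values):
// each full window is rewound and advanced by hopLength, so consecutive
// frames share (windowSize - hopLength) samples.
let windowSize = 512          // keywordFFTWindowSize
let hopLength = 160           // e.g. 10 ms of samples at 16 kHz
let overlap = windowSize - hopLength
print("consecutive frames overlap by \(overlap) samples")   // 352
```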

Expand Down Expand Up @@ -324,34 +316,30 @@ extension TFLiteKeywordRecognizer : SpeechProcessor {
public func process(_ frame: Data) {
audioProcessingQueue.async {[weak self] in
guard let strongSelf = self else { return }
if strongSelf.context.isActive {
do {
do {
if strongSelf.context.isActive {
// sample every frame while active
try strongSelf.sample(frame)
if !strongSelf.isActive {
strongSelf.isActive = true
} else if
(strongSelf.isActive
&& strongSelf.activation <= strongSelf.configuration.wakeActiveMax)
||
strongSelf.activation <= strongSelf.configuration.wakeActiveMin {
// already sampled, but in the midst of activation, so don't deactivate yet
strongSelf.activation += strongSelf.configuration.frameWidth
} else {
strongSelf.run()
strongSelf.deactivate()
}
} else if strongSelf.isActive {
try strongSelf.sample(frame)
} catch let error {
strongSelf.context.dispatch { $0.failure(error: error) }
}
// sample every frame while active
if !strongSelf.isActive {
strongSelf.isActive = true
} else if
(strongSelf.isActive
&& strongSelf.activation <= strongSelf.configuration.wakeActiveMax)
||
strongSelf.activation <= strongSelf.configuration.wakeActiveMin {
// already sampled, but in the midst of activation, so don't deactivate yet
strongSelf.activation += strongSelf.configuration.frameWidth
} else {
strongSelf.run()
strongSelf.deactivate()
}
} else if strongSelf.isActive {
do {
try strongSelf.sample(frame)
} catch let error {
strongSelf.context.dispatch { $0.failure(error: error) }
}
strongSelf.run()
strongSelf.deactivate()
} catch let error {
strongSelf.context.dispatch { $0.failure(error: error) }
}
}
}
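The rewritten process() collapses the active path into a single do/catch; its deactivation condition is effectively a bounded timeout, sketched here with illustrative durations:

```swift
// Sketch of the activation-window logic in process(): once active, the
// recognizer keeps sampling and accumulating frame widths until the
// minimum window has elapsed and the maximum is exceeded, then it runs
// classification and deactivates. Durations in ms are illustrative.
var activation = 0
let frameWidth = 20
let wakeActiveMin = 2_000
let wakeActiveMax = 5_000

func stillActivating() -> Bool {
    if activation <= wakeActiveMax || activation <= wakeActiveMin {
        activation += frameWidth    // stay active; extend the window
        return true
    }
    return false                    // time to run() and deactivate()
}
```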
28 changes: 8 additions & 20 deletions Spokestack/TFLiteWakewordRecognizer.swift
@@ -59,9 +59,6 @@ import TensorFlowLite
private var fft: FFT!

// Audio Signal Normalization
private var rmsAlpha: Float = 0.0
private var rmsTarget: Float = 0.0
private var rmsValue: Float = 0.0
private var preEmphasis: Float = 0.0
private var prevSample: Float = 0.0

@@ -164,9 +161,6 @@
}

// Signal normalization
self.rmsAlpha = c.rmsAlpha
self.rmsTarget = c.rmsTarget
self.rmsValue = self.rmsTarget
self.preEmphasis = c.preEmphasis

// Sliding window buffers
@@ -195,21 +189,14 @@

private func sample(_ data: Data) throws -> Void {

// Preallocate an array of data elements in the frame for use in RMS and sampling
// Preallocate an array of data elements in the frame for use in sampling
let dataElements: Array<Int16> = data.elements()

// Update the rms normalization factors
// Maintain an ewma of the rms signal energy for speech samples
if self.rmsAlpha > 0 {
self.rmsValue = self.rmsAlpha * SignalProcessing.rms(data, dataElements) + (1 - self.rmsAlpha) * self.rmsValue
}

// Process all samples in the frame
for d in dataElements {

// Normalize and clip the 16-bit sample to the target rms energy
// Normalize and clip the 16-bit sample
var sample: Float = Float(d) / Float(Int16.max)
sample = sample * (self.rmsTarget / self.rmsValue)
sample = max(-1.0, min(sample, 1.0))

// Run a pre-emphasis filter to balance high frequencies and eliminate any dc energy
Expand All @@ -227,7 +214,11 @@ import TensorFlowLite
// - advance the sample sliding window
try self.sampleWindow.write(sample)
if self.sampleWindow.isFull {
try self.analyze()
if self.isSpeechDetected {
try self.analyze()
}
// rewind the sample window for another run
self.sampleWindow.rewind().seek(self.hopLength)
}
}
}
@@ -243,9 +234,6 @@ import TensorFlowLite
// Compute the stft spectrogram
self.fft.forward(&self.fftFrame)

// rewind the sample window for another run
self.sampleWindow.rewind().seek(self.hopLength)

if self.traceLevel.rawValue <= Trace.Level.DEBUG.rawValue {
self.fftFrameCollector? += "\(self.fftFrame)\n"
}
@@ -450,7 +438,7 @@ extension TFLiteWakewordRecognizer : SpeechProcessor {
} catch let error {
strongSelf.context.dispatch { $0.failure(error: error) }
}
// vad detection edge
// vad detection edge
} else if strongSelf.isSpeechDetected {
strongSelf.reset()
}
15 changes: 4 additions & 11 deletions SpokestackFrameworkExample/TFLiteKeywordViewController.swift
@@ -44,15 +44,7 @@ class TFLiteKeywordViewController: UIViewController {
return button
}()

lazy private var pipeline: SpeechPipeline = {
return try! SpeechPipelineBuilder()
.addListener(self)
.useProfile(.vadTriggerSpokestackSpeech)
.setProperty("tracing", ".DEBUG")
.setProperty("vadFallDelay", "1600")
.setDelegateDispatchQueue(DispatchQueue.main)
.build()
}()
private var pipeline: SpeechPipeline?

override func loadView() {

@@ -104,19 +96,20 @@
.useProfile(.vadTriggerKeyword)
.setProperty("tracing", Trace.Level.PERF)
.setProperty("keywordDetectModelPath", detectPath)
.setProperty("keywords", "bed,bird,cat,dog,down,eight,five,four,go,happy,house,left,marvin,nine,no,off,on,one,right,seven,sheila,six,stop,three,tree,two,up,wow,yes,zero")
.setProperty("keywordEncodeModelPath", encodePath)
.setProperty("keywordFilterModelPath", filterPath)
.build()
}

@objc func startRecordingAction(_ sender: Any) {
print("pipeline started")
self.pipeline.start()
self.pipeline?.start()
}

@objc func stopRecordingAction(_ sender: Any) {
print("pipeline finished")
self.pipeline.stop()
self.pipeline?.stop()
}

@objc func dismissViewController(_ sender: Any?) -> Void {
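The controller now holds an optional pipeline built during view setup instead of a try!-backed lazy property, so the start/stop actions use optional chaining and silently no-op if the build failed. An alternative sketch, not the committed code, that surfaces the failure:

```swift
// Alternative sketch, not the committed code: inside the view controller,
// surface a missing pipeline instead of silently no-op'ing via "?.".
@objc func startRecordingAction(_ sender: Any) {
    guard let pipeline = self.pipeline else {
        print("pipeline was not built; cannot start")
        return
    }
    print("pipeline started")
    pipeline.start()
}
```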
6 changes: 1 addition & 5 deletions SpokestackTests/TFLiteKeywordRecognizerTest.swift
@@ -23,11 +23,7 @@ class TFLiteKeywordRecognizerTest: XCTestCase {
self.config.keywordFilterModelPath = MockKeywordModels.filterPath
self.config.keywordDetectModelPath = MockKeywordModels.detectPath
self.context = SpeechContext(config)
//self.config.wakeActiveMin = 20
//self.config.wakeActiveMax = 100
//self.config.keywordMelFrameWidth = 1
self.config.keywordMelFrameLength = 16
//self.config.keywordEncodeLength = 1
self.recognizer = TFLiteKeywordRecognizer(config, context: self.context!)
}

@@ -156,7 +152,7 @@ class TFLiteKeywordecognizerTestDelegate: SpokestackDelegate {
func failure(error: Error) {
print(error)
guard let _ = asyncExpectation else {
XCTFail("TFLiteKeywordecognizerTestDelegate was not setup correctly. Missing XCTExpectation reference")
XCTFail("TFLiteKeywordRecognizerTestDelegate was not setup correctly. Missing XCTExpectation reference")
return
}
self.didError = true
