Fixes and better loops (#36)

* named exceptions, cosmetic changes also changed hz2mel to accept floats instead of vectors * more fixes, small improvements broadcasting, named errors * stmvn + cosmetics generalize for AbstractVector * bump hdf5 and use symbols in dict - isdefined macro - removed HDF5.write for Bool and SubString * AbstractVector indexing, better toeplitz changed current toeplitz impl to accept same arguments, produce the same output as in current octave documentation. prev usage in levinson was dubious? * reduce allocs for delta, abstractvector stuff * improve lpc2cep loop + cosmetics * fixed lpc2cep * go back to using vector for lpc2cep * exp10, remove exp(log(*)) * small fixes and cosmetics * bump version? * actually throw errors, memoize weights * memoize dct_matrix * remove exp log
JuliaDSP · Oct 29, 2023 · b9808f9 · b9808f9 · davidavdav · Oct 29, 2023
1 parent a4b94ee
commit b9808f9
Show file tree

Hide file tree

Showing 8 changed files with 261 additions and 242 deletions.
diff --git a/Project.toml b/Project.toml
@@ -1,7 +1,7 @@
 name = "MFCC"
 uuid = "ca7b5df7-6146-5dcc-89ec-36256279a339"
 authors = ["David van Leeuwen"]
-version = "0.3.4"
+version = "0.3.5"
 
 [deps]
 DSP = "717857b8-e6f2-59f4-9121-6e50c889abd2"
@@ -10,18 +10,19 @@ FFTW = "7a1cc6ca-52ef-59f5-83cd-3a7055c09341"
 FileIO = "5789e2e9-d7fb-5bc7-8068-2c6fae9b9549"
 HDF5 = "f67ccb44-e63f-5c2f-98bd-6dc0ccc4ba2f"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
+Memoization = "6fafb56a-5788-4b4e-91ca-c0cea6611c73"
 SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b"
 Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
 WAV = "8149f6b0-98f6-5db9-b78f-408fbbb8ef88"
 
 [compat]
-julia = "1"
 DSP = "0.6, 0.7"
 FFTW = "1"
 FileIO = "1"
-HDF5 = "0.12 - 0.17"
+HDF5 = "0.15 - 0.17"
 SpecialFunctions = "0.8, 1, 2"
 WAV = "1"
+julia = "1"
 
 [extras]
 Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"

diff --git a/README.md b/README.md
@@ -8,7 +8,7 @@ The essential routine is re-coded from Dan Ellis's rastamat package, and paramet
 
 Please note that the feature-vector array consists of a vertical stacking of row-vector features.  This is consistent with the sense of direction of, e.g., `Base.cov()`, but inconsistent with, e.g., `DSP.spectrogram()` or `Clustering.kmeans()`.
 
-`mfcc()` has many parameters, but most of these are set to defaults that _should_ mimick HTK default parameter (not thoroughly tested).
+`mfcc()` has many parameters, but most of these are set to defaults that _should_ mimick HTK default parameters (not thoroughly tested).
 
 ## Feature extraction main routine
 
@@ -27,7 +27,7 @@ The actual routine for MFCC computation has many parameters, these are basically
 mfcc(x::Vector, sr=16000.0; wintime=0.025, steptime=0.01, numcep=13, lifterexp=-22, sumpower=false, preemph=0.97, dither=false, minfreq=0.0, maxfreq=sr/2, nbands=20, bwidth=1.0, dcttype=3, fbtype=:htkmel, usecmp=false, modelorder=0)
 ```
 
-  This is the main routine computing MFCCs.  `x` should be a 1D vector of `FloatingPoint` samples of speech, sampled at a frequency of `sr`.  Every `steptime` seconds, a frame of duration `wintime` is analysed.  The log energy in a filterbank of `nbands` bins is computed, and a cepstral (discrete cosine transform) representaion is made, keeping only the first `numcep` coefficients (including log energy).  The result is a tuple of three values:
+  This is the main routine computing MFCCs.  `x` should be a 1D vector of `FloatingPoint` samples of speech, sampled at a frequency of `sr`.  Every `steptime` seconds, a frame of duration `wintime` is analysed.  The log energy in a filterbank of `nbands` bins is computed, and a cepstral (discrete cosine transform) representation is made, keeping only the first `numcep` coefficients (including log energy).  The result is a tuple of three values:
 
  - a matrix of `numcep` columns with for each speech frame a row of MFCC coefficients
  - the power spectrum computed with `DSP.spectrogram()` from which the MFCCs are computed
@@ -41,15 +41,15 @@ We have defined a couple of standard sets of parameters that should function wel
 feacalc(wavfile::AbstractString, application::Symbol; kwargs...)
 ```
 This will compute speech features suitable for a specific `application`, which currently can be one of:
-- `:nbspeaker`: narrowband (telephone speech) speaker recognition: 19 MFCCs + log energy, delta's, energy-based speech activity detection, feature warping (399 samples)
+- `:nbspeaker`: narrowband (telephone speech) speaker recognition: 19 MFCCs + log energy, deltas, energy-based speech activity detection, feature warping (399 samples)
 - `:wbspeaker`: wideband speaker recognition: same as above but with wideband MFCC extraction parameters
 - `:language`: narrowband language recognition: Shifted Delta Cepstra, energy-based speech activity detection, feature warping (299 samples)
 - `:diarization`: 13 MFCCs, utterance mean and variance normalization
 
 The `kwargs...` parameters allow for various options in file format, feature augmentation, speech activity detection and MFCC parameter settings.  They trickle down to versions of `feacalc()` and `mfcc()` allow for more detailed specification of these parameters.
 
 `feacalc()` returns a tuple of three structures:
-- an `Array` of features, one row per frame
+- a `Matrix` of features, one row per frame
 - a `Dict` with metadata about the speech (length, SAD selected frames, etc.)
 - a `Dict` with the MFCC parameters used for feature extraction
 
@@ -62,7 +62,7 @@ This function reads an audio file from disk and represents the audio as an `Arra
 
 The `method` parameter determines what method is used for reading in the audio file:
 - `:wav`: use Julia's native [WAV](https://github.com/dancasimiro/WAV.jl) library to read RIFF/WAVE `.wav` files
-- `:sox`: use external `sox` program for figuring out the audio file format and converting to native represantation
+- `:sox`: use external `sox` program for figuring out the audio file format and converting to native representation
 - `:sph`: use external `w_decode` program to deal with (compressed) NIST sphere files
 
 ```julia
@@ -77,7 +77,7 @@ The `augtype` parameter specifies how the speech features are augmented.  This c
 - `:none` for no additional features
 - `:delta` for 1st order derivatives
 - `:ddelta` for first and second order derivatives
-- `:sdc` for replacement od the MFCCs with shifted delta cepstra with the default parameters `n, d, p, k = 7, 1, 3, 7`
+- `:sdc` for replacement of the MFCCs with shifted delta cepstra with the default parameters `n, d, p, k = 7, 1, 3, 7`
 
 The `normtype` parameter specifies how the features are normalized after extraction
 - `:none` for no normalization
@@ -95,7 +95,7 @@ The various applications actually have somewhat different parameter settings for
 warp(x::Matrix, w=399)
 ```
 
- This tansforms columns of `x` by short-time Gaussianization.  Each value in the middle of `w` rows is replaced with its normal deviate (the quantile function of the normal distribution) based on its rank within the `w` values.  The result has the same dimensions as `x`, but the values are chosen from a discrete set of `w` normal deviates.
+ This transforms columns of `x` by short-time Gaussianization.  Each value in the middle of `w` rows is replaced with its normal deviate (the quantile function of the normal distribution) based on its rank within the `w` values.  The result has the same dimensions as `x`, but the values are chosen from a discrete set of `w` normal deviates.
 
 ```julia
 znorm(x::Matrix)
@@ -121,7 +121,7 @@ The derivatives are computed over columns individually, and before the derivativ
 
 ### Shifted-Delta-Cepstra
 
-SDCs are features used for spoken language recognition, derived from typically MFCCs
+SDCs are features used for spoken language recognition, typically derived from MFCCs
 
 ```julia
 sdc(x::Matrix, n::Int=7, d::Int=1, p::Int=3, k::Int=7)

diff --git a/src/feacalc.jl b/src/feacalc.jl
@@ -17,87 +17,87 @@ function feacalc(wavfile::AbstractString; method=:wav, kwargs...)
         soxread(wavfile)
     elseif method == :sphere
         sphread(wavfile)
+    else
+        throw(ArgumentError(string("Method not supported: ", method)))
     end
     sr = Float64(sr)       # more reasonable sr
     feacalc(x; sr=sr, source=wavfile, kwargs...)
 end
 
 ## assume we have an array already
-function feacalc(x::AbstractArray; augtype=:ddelta, normtype=:warp, sadtype=:energy,
+function feacalc(x::AbstractVecOrMat; augtype=:ddelta, normtype=:warp, sadtype=:energy,
                  dynrange::Real=30., nwarp::Int=399, chan=:mono, sr::AbstractFloat=8000.0,
                  source=":array", defaults=:nbspeaker, mfccargs...)
-    if ndims(x) > 1
+    if x isa AbstractMatrix
         nsamples, nchan = size(x)
         if chan == :mono
             x = vec(mean(x; dims=2))            # average multiple channels for now
         elseif chan in (:a, :b)
-            channum = chan == :a ? 1 : 2
-            x = x[:,channum]
+            channum = chan == :b
+            x = x[:, begin+channum]
         elseif chan isa Integer
             if !(chan in 1:nchan)
-                error("Bad channel specification: ", chan)
+                throw(DomainError(chan, "Bad channel specification"))
             end
-            x = x[:,chan]
+            x = x[:, begin+chan-1]
             chan = (:a, :b)[chan]
         else
-            error("Unknown channel specification: ", chan)
+            throw(ArgumentError(string("Unknown channel specification: ", chan)))
         end
     else
         nsamples, nchan = length(x), 1
     end
     ## save some metadata
-    meta = Dict("nsamples" => nsamples, "sr" => sr, "source" => source,
-                "nchan" => nchan, "chan" => chan)
+    meta = Dict(:nsamples => nsamples, :sr => sr, :source => source,
+                :nchan => nchan, :chan => chan)
     preemph = 0.97
     preemph ^= 16000. / sr
 
     ## basic features
     m, pspec, params = mfcc(x, sr, defaults; preemph=preemph, mfccargs...)
-    meta["totnframes"] = nrow(m)
+    meta[:totnframes] = nrow(m)
 
     ## augment features
     if augtype in (:delta, :ddelta)
         d = deltas(m)
-        if augtype==:ddelta
+        if augtype == :ddelta
             dd = deltas(d)
             m = hcat(m, d, dd)
         else
             m = hcat(m, d)
         end
-    elseif augtype==:sdc
+    elseif augtype == :sdc
         m = sdc(m)
     end
-    meta["augtype"] = augtype
+    meta[:augtype] = augtype
 
-    if nrow(m)>0
-        if sadtype==:energy
+    if !isempty(m)
+        if sadtype == :energy
             speech = sad(pspec, sr; dynrange=dynrange)
-            params["dynrange"] = dynrange
-        elseif sadtype==:none
-            speech = collect(1:nrow(m))
+            params[:dynrange] = dynrange
+            ## perform SAD
+            m = m[speech,:]
+            meta[:speech] = map(UInt32, speech)
+        elseif sadtype == :none
+            nothing
         end
-    else
-        speech=Int[]
     end
-    meta["sadtype"] = sadtype
-    ## perform SAD
-    m = m[speech,:]
-    meta["speech"] = map(UInt32, speech)
-    meta["nframes"], meta["nfea"] = size(m)
+    meta[:sadtype] = sadtype
+    meta[:nframes], meta[:nfea] = size(m)
 
     ## normalization
-    if !iszero(nrow(m))
+    if !isempty(m)
         if normtype == :warp
             m = warp(m, nwarp)
-            params["warp"] = nwarp          # the default
+            params[:warp] = nwarp          # the default
         elseif normtype == :mvn
-            if nrow(m)>1
-                znorm!(m,1)
+            if nrow(m) > 1
+                znorm!(m, 1)
             else
                 fill!(m, 0)
             end
         end
-        meta["normtype"] = normtype
+        meta[:normtype] = normtype
     end
     return (map(Float32, m), meta, params)
 end
@@ -113,20 +113,19 @@ function feacalc(wavfile::AbstractString, application::Symbol; kwargs...)
     elseif application == :diarization
         feacalc(wavfile; defaults=:rasta, sadtype=:none, normtype=:mvn, augtype=:none, kwargs...)
     else
-        error("Unknown application ", application)
+        throw(ArgumentError(string("Unknown application: ", application)))
     end
 end
 
-function sad(pspec::AbstractMatrix{T}, sr::T, method=:energy; dynrange::T=30.) where T<:Float64
+function sad(pspec::AbstractMatrix, sr::T, method=:energy; dynrange::T=30.) where T<:Float64
     ## integrate power
     deltaf = size(pspec, 2) / (sr/2)
     minfreqi = round(Int, 300deltaf)
     maxfreqi = round(Int, 4000deltaf)
     summed_pspec = vec(sum(view(pspec, :, minfreqi:maxfreqi); dims=2))
     power = @. 10log10(summed_pspec)
     maxpow = maximum(power)
-    activity = power .> maxpow - dynrange
-    speech = findall(activity)
+    speech = findall(>(maxpow - dynrange), power)
     return speech
 end
 
@@ -136,8 +135,8 @@ function sad(wavfile::AbstractString, speechout::AbstractString, silout::Abstrac
     sr = Float64(sr)                            # more reasonable sr
     mx::Vector{Float64} = vec(mean(x; dims=2))  # average multiple channels for now
     m, pspec, meta = mfcc(mx, sr; preemph=0)
-    sp = sad(pspec, sr, dynrange=dynrange)
-    sl = round(Int, meta["steptime"] * sr)
+    sp = sad(pspec, sr; dynrange=dynrange)
+    sl = round(Int, meta[:steptime] * sr)
     xi = falses(axes(mx))
     for i in @view sp[begin:end-1]
         z = (i - 1) * sl

diff --git a/src/io.jl b/src/io.jl
@@ -8,9 +8,7 @@
 import HDF5
 
 ## encode non-HDF5 types in the key by adding type indicator---a poorman's solution
-HDF5.write(fd::HDF5.File, s::AbstractString, b::Bool) = write(fd, string(s,":Bool"), Int8(b))
 HDF5.write(fd::HDF5.File, s::AbstractString, sym::Symbol) = write(fd, string(s,":Symbol"), string(sym))
-HDF5.write(fd::HDF5.File, s::AbstractString, ss::SubString) = write(fd, s, ascii(ss))
 
 ## always save data in Float32
 ## the functiona arguments are the same as the output of feacalc
@@ -37,28 +35,26 @@ end
 ## FileIO.save(file::AbstractString, x::Matrix)
 
 ## the reverse encoding of Julia types.
-function retype(d::Dict)
-    r = Dict()
+function retype(d::Dict{<:AbstractString, T} where T)
+    r = Dict{Symbol, Any}()
     for (k, v) in d
-        if (m=match(r"^(.*):Bool$", k)) !== nothing
-            r[m.captures[1]] = v > 0
-        elseif (m=match(r"(.*):Symbol", k)) !== nothing
-            r[m.captures[1]] = Symbol(v)
+        if (m=match(r"(.*):Symbol", k)) !== nothing
+            r[Symbol(m.captures[1])] = Symbol(v)
         else
-            r[k] = v
+            r[Symbol(k)] = v
         end
     end
     r
 end
 
-## Try to handle missing elements in the hdf5 file more gacefully
+## Try to handle missing elements in the hdf5 file more gracefully
 function h5check(obj, name, content)
     HDF5.haskey(obj, content) || error('"', name, '"', " does not contain ", '"', content, '"')
     obj[content]
 end
 
-## Currently we always read the data in float64
-function feaload(file::AbstractString; meta=false, params=false)
+## Currently we always read the data in Float64
+function feaload(file::AbstractString; meta::Bool=false, params::Bool=false)
     HDF5.h5open(file, "r") do fd
         f = h5check(fd, file, "features")
         fea = map(Float64, read(h5check(f, "features", "data")))
@@ -76,7 +72,7 @@ function feaload(file::AbstractString; meta=false, params=false)
         if params
             push!(res, retype(read(h5check(f, "features", "params"))))
         end
-        tuple(res...)
+        Tuple(res)
     end
 end