diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index 283d84a..525e697 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -16,18 +16,20 @@ jobs: julia-version: ['1', '1.8', 'nightly'] julia-arch: [x64] os: [ubuntu-latest, macOS-latest] - exclude: - - os: macOS-latest - julia-arch: x86 steps: - - uses: actions/checkout@v2 - - uses: julia-actions/setup-julia@v1.6 + - uses: actions/checkout@v4 + - uses: julia-actions/setup-julia@latest with: version: ${{ matrix.julia-version }} arch: ${{ matrix.julia-arch }} - - uses: julia-actions/julia-runtest@v1.6.1 - - uses: julia-actions/julia-processcoverage@v1 - - uses: coverallsapp/github-action@master + - uses: julia-actions/julia-runtest@latest + - uses: julia-actions/julia-processcoverage@latest + - name: Upload coverage reports to Codecov + uses: codecov/codecov-action@v3 + with: + file: lcov.info + + - uses: coverallsapp/github-action@main with: path-to-lcov: lcov.info github-token: ${{ secrets.GITHUB_TOKEN }} @@ -40,7 +42,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Coveralls Finished - uses: coverallsapp/github-action@master + uses: coverallsapp/github-action@main with: github-token: ${{ secrets.GITHUB_TOKEN }} parallel-finished: true diff --git a/.github/workflows/TagBot.yml b/.github/workflows/TagBot.yml index f389611..3bd826b 100644 --- a/.github/workflows/TagBot.yml +++ b/.github/workflows/TagBot.yml @@ -8,13 +8,24 @@ on: lookback: default: 3 permissions: + actions: read + checks: read contents: write + deployments: read + issues: read + discussions: read + packages: read + pages: read + pull-requests: read + repository-projects: read + security-events: read + statuses: read jobs: TagBot: if: github.event_name == 'workflow_dispatch' || github.actor == 'JuliaTagBot' runs-on: ubuntu-latest steps: - - uses: JuliaRegistries/TagBot@v1 + - uses: JuliaRegistries/TagBot@latest with: token: ${{ secrets.GITHUB_TOKEN }} ssh: ${{ secrets.DOCUMENTER_KEY }} diff --git a/.travis.yml b/.travis.yml index 4ebfb0b..0d1f3cd 100644 --- a/.travis.yml +++ b/.travis.yml @@ -3,7 +3,6 @@ sudo: false os: - linux julia: - - 0.7 - - 1.0 - - 1.1 + - 1.6 + - 1 - nightly diff --git a/Project.toml b/Project.toml index 874078a..fba661b 100644 --- a/Project.toml +++ b/Project.toml @@ -13,6 +13,7 @@ LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" Memoization = "6fafb56a-5788-4b4e-91ca-c0cea6611c73" SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" +Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" WAV = "8149f6b0-98f6-5db9-b78f-408fbbb8ef88" [compat] diff --git a/README.md b/README.md index 1ad23ac..3276ed7 100644 --- a/README.md +++ b/README.md @@ -24,12 +24,12 @@ Extract MFCC features from the audio data in `x`, using parameter settings chara The actual routine for MFCC computation has many parameters, these are basically the same parameters as in Dan Ellis's rastamat package. ```julia -mfcc(x::Vector, sr=16000.0; wintime=0.025, steptime=0.01, numcep=13, lifterexp=-22, sumpower=false, preemph=0.97, dither=false, minfreq=0.0, maxfreq=sr/2, nbands=20, bwidth=1.0, dcttype=3, fbtype=:htkmel, usecmp=false, modelorder=0) +mfcc(x::AbstractVector{<:AbstractFloat}, sr::Real=16000.0; wintime=0.025, steptime=0.01, numcep=13, preemph=0.97, lifterexp=-22, nbands=20, minfreq=0.0, maxfreq=sr/2, fbtype=:htkmel, bwidth=1.0, modelorder=0, dcttype=3, dither::Real=false, sumpower::Bool=false, usecmp::Bool=false) ``` This is the main routine computing MFCCs. `x` should be a 1D vector of `FloatingPoint` samples of speech, sampled at a frequency of `sr`. Every `steptime` seconds, a frame of duration `wintime` is analysed. The log energy in a filterbank of `nbands` bins is computed, and a cepstral (discrete cosine transform) representation is made, keeping only the first `numcep` coefficients (including log energy). The result is a tuple of three values: - - a matrix of `numcep` columns with for each speech frame a row of MFCC coefficients + - a matrix with `numcep` columns and for each speech frame a row of MFCC coefficients - the power spectrum computed with `DSP.spectrogram()` from which the MFCCs are computed - a dictionary containing information about the parameters used for extracting the features. @@ -46,7 +46,7 @@ This will compute speech features suitable for a specific `application`, which c - `:language`: narrowband language recognition: Shifted Delta Cepstra, energy-based speech activity detection, feature warping (299 samples) - `:diarization`: 13 MFCCs, utterance mean and variance normalization -The `kwargs...` parameters allow for various options in file format, feature augmentation, speech activity detection and MFCC parameter settings. They trickle down to versions of `feacalc()` and `mfcc()` allow for more detailed specification of these parameters. +The `kwargs...` parameters allow for various options in file format, feature augmentation, speech activity detection and MFCC parameter settings. They trickle down to versions of `feacalc()` and `mfcc()`, allowing for more detailed specification of these parameters. `feacalc()` returns a tuple of three structures: - a `Matrix` of features, one row per frame @@ -58,7 +58,7 @@ The `kwargs...` parameters allow for various options in file format, feature aug ```julia feacalc(wavfile::AbstractString; method=:wav, kwargs...) ``` -This function reads an audio file from disk and represents the audio as an `Array`, and then runs the feature extraction. +This function reads an audio file from disk and represents the audio as a `Matrix`, and then runs the feature extraction. The `method` parameter determines what method is used for reading in the audio file: - `:wav`: use Julia's native [WAV](https://github.com/dancasimiro/WAV.jl) library to read RIFF/WAVE `.wav` files @@ -88,7 +88,7 @@ The `sad` parameter controls if Speech Activity Detection is carried out on the - `:none`: apply no SAD - `:energy`: apply energy based SAD, omitting frames with an energy less than `dynrange` below the maximum energy of the file. -The various applications actually have somewhat different parameter settings for the basic MFCC feature extraction, see the `defaults` parameter of `mfcc()` below. +The various applications actually have somewhat different parameter settings for the basic MFCC feature extraction, see the `defaults` parameter of `mfcc()` above. ### Feature warping, or short-time Gaussianization (Jason Pelecanos) ```julia @@ -127,4 +127,4 @@ SDCs are features used for spoken language recognition, typically derived from M sdc(x::Matrix, n::Int=7, d::Int=1, p::Int=3, k::Int=7) ``` -This function expands (MFCC) features in `x` by computing derivatives over `2d+1` consecutive frames for the first `n` columns of `x`, stacking derivatives shifted over `p` frames `k` times. Before the calculation, zero adding is added so that the number of rows of the resuls is the same as for `x`. +This function expands (MFCC) features in `x` by computing derivatives over `2d+1` consecutive frames for the first `n` columns of `x`, stacking derivatives shifted over `p` frames `k` times. Before the calculation, the deltas are zero padded so that the results have the same number of rows as `x`. diff --git a/src/MFCC.jl b/src/MFCC.jl index 5e47fbd..0ba4103 100644 --- a/src/MFCC.jl +++ b/src/MFCC.jl @@ -13,7 +13,7 @@ export feacalc using DSP using WAV using SpecialFunctions ## erfinv -using Statistics +using Statistics include("rasta.jl") include("mfccs.jl") diff --git a/src/feacalc.jl b/src/feacalc.jl index 6fc1fb5..11af55f 100644 --- a/src/feacalc.jl +++ b/src/feacalc.jl @@ -40,7 +40,7 @@ function feacalc(x::AbstractVecOrMat; augtype=:ddelta, normtype=:warp, sadtype=: throw(DomainError(chan, "Bad channel specification")) end x = x[:, begin+chan-1] - chan = (:a, :b)[chan] + chan = chan in 1:2 ? (:a, :b)[chan] : chan else throw(ArgumentError(string("Unknown channel specification: ", chan))) end @@ -106,7 +106,7 @@ end function feacalc(wavfile::AbstractString, application::Symbol; kwargs...) if application in (:speaker, :nbspeaker) feacalc(wavfile; defaults=:nbspeaker, kwargs...) - elseif application==:wbspeaker + elseif application == :wbspeaker feacalc(wavfile; defaults=:wbspeaker, kwargs...) elseif application == :language feacalc(wavfile; defaults=:rasta, nwarp=299, augtype=:sdc, kwargs...) @@ -130,7 +130,7 @@ function sad(pspec::AbstractMatrix, sr::T, method=:energy; dynrange::T=30.) wher end ## listen to SAD -function sad(wavfile::AbstractString, speechout::AbstractString, silout::AbstractString; dynrange::Float64=30.) +function sad(wavfile, speechout, silout; dynrange::Float64=30.) x, sr, nbits = wavread(wavfile) sr = Float64(sr) # more reasonable sr mx::Vector{Float64} = vec(mean(x; dims=2)) # average multiple channels for now diff --git a/src/mfccs.jl b/src/mfccs.jl index fb5e7e1..8dfc499 100644 --- a/src/mfccs.jl +++ b/src/mfccs.jl @@ -13,9 +13,9 @@ using Memoization ## Recoded from rastamat's "melfcc.m" (c) Dan Ellis. ## Defaults here are HTK parameters, this is contrary to melfcc function mfcc(x::AbstractVector{T}, sr::Real=16000.0; wintime=0.025, steptime=0.01, numcep=13, - lifterexp=-22, preemph=0.97, minfreq=0.0, maxfreq=sr/2, nbands=20, - bwidth=1.0, dcttype=3, fbtype=:htkmel, modelorder=0, sumpower::Bool=false, - dither::Bool=false, usecmp::Bool=false) where {T<:AbstractFloat} + preemph=0.97, lifterexp=-22, nbands=20, minfreq=0.0, maxfreq=sr/2, + fbtype=:htkmel, bwidth=1.0, modelorder=0, dcttype=3, dither::Real=false, + sumpower::Bool=false, usecmp::Bool=false) where {T<:AbstractFloat} if !iszero(preemph) x = filt(PolynomialRatio([1., -preemph], [1.]), x) end @@ -29,6 +29,8 @@ function mfcc(x::AbstractVector{T}, sr::Real=16000.0; wintime=0.025, steptime=0. if modelorder > 0 if dcttype != 1 throw(ArgumentError("Sorry, modelorder>0 and dcttype ≠ 1 is not implemented")) + elseif numcep > modelorder + throw(ArgumentError("modelorder cannot be less than numceps")) end # LPC analysis lpcas = dolpc(aspec, modelorder) @@ -46,7 +48,7 @@ function mfcc(x::AbstractVector{T}, sr::Real=16000.0; wintime=0.025, steptime=0. return (cepstra, pspec', meta) end -mfcc(x::AbstractMatrix{<:AbstractFloat}, sr::Real=16000.0; args...) = @distributed (tuple) for i in axes(x, 2) mfcc(x[:, i], sr; args...) end +mfcc(x::AbstractMatrix{<:AbstractFloat}, args...; kwargs...) = @distributed (tuple) for i in axes(x, 2) mfcc(x[:, i], args...; kwargs...) end ## default feature configurations, :rasta, :htk, :spkid_toolkit, :wbspeaker diff --git a/src/rasta.jl b/src/rasta.jl index 308ff75..cda4c36 100644 --- a/src/rasta.jl +++ b/src/rasta.jl @@ -155,7 +155,7 @@ function hynek_eql(bandcfhz) eql end -function postaud(x::AbstractMatrix{<:AbstractFloat}, fmax::Real, fbtype=:bark, broaden=false) +function postaud(x::AbstractMatrix{<:AbstractFloat}, fmax::Real, fbtype=:bark, broaden::Bool=false) nbands, nframes = size(x) nfpts = nbands + 2broaden if fbtype == :bark @@ -256,7 +256,7 @@ function spec2cep(spec::AbstractMatrix{T}, ncep::Int=13, dcttype::Int=2) where { end function lifter(x::AbstractArray{<:AbstractFloat}, lift::Real=0.6, invs::Bool=false) - ncep, nf = size(x) + ncep = nrow(x) if iszero(lift) return x elseif lift > 0 diff --git a/test/runtests.jl b/test/runtests.jl index b298588..311cbf3 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -5,15 +5,18 @@ using WAV using MFCC -using SpecialFunctions -using Statistics +using Test + +import MFCC: sad, levinson, toeplitz + +# test io x, meta, params = feacalc("bl2.wav", normtype=:none, method=:wav, augtype=:none, sadtype=:none) y = feaload("bl2.mfcc") @assert x == y -t_fn = "bl2.hdf5" +t_fn = "mfccs/bl2.hdf5" feasave(t_fn, x; meta=meta, params=params) mfcc_tup = feaload(t_fn; meta=true, params=true) @@ -23,20 +26,57 @@ mfcc_tup = feaload(t_fn; meta=true, params=true) rm(t_fn) -feacalc(x) -feacalc("bl2.wav") -y = wavread("bl2.wav")[1] -mfcc(y; fbtype=:bark) -mfcc(y; fbtype=:mel) -mfcc(y; fbtype=:fcmel) +y, sr = wavread("bl2.wav"); y_mat = y +y, sr = vec(y), Float64(sr) + +# test feacalc with different parameters +feacalc(y; normtype=:mvn) +feacalc(y_mat; sr=sr, chan=1) +feacalc(y_mat; sr=sr, chan=:a, augtype=:sdc) +feacalc(y; sr=sr, usecmp=true, modelorder=20, dcttype=1) + +# test mfcc with htk, matrix input +mfcc(repeat(y, 1, 2), sr, :htk) + +for defaults in (:nbspeaker, :wbspeaker, :language, :diarization) + feacalc("bl2.wav", defaults) +end + +for (fb, dcttype) in zip((:bark, :mel, :htkmel), (1, 3, 4)) + mfcc(y; usecmp=true, fbtype=fb, dcttype=dcttype) +end + +speech = sad("bl2.wav", devnull, devnull) z = warp(x) +z = warp(x, 100) z = deltas(x) +z = deltas(x, 1) z = znorm(x) z = stmvn(x) x = randn(100000) +l = levinson(x, 101) +z = lifter(x, 0.6, true) +z = stmvn(x, 400000) +z = warp(randn(1000)) p = powspec(x) a = audspec(p) +a = postaud(p, 8000, :bark, true) + +# test for invalid/unsupported arguments +@test_throws DomainError feacalc(y_mat; sr=sr, chan=2) +@test_throws ArgumentError feacalc(y_mat; sr=sr, chan=nothing) +@test_throws ArgumentError feacalc(y; sr=sr, usecmp=true, modelorder=1, dcttype=1) +@test_throws ArgumentError feacalc(y; sr=sr, usecmp=true, modelorder=1, dcttype=2) +@test_throws ArgumentError feacalc("bl2.wav", :bosespeaker) +@test_throws ArgumentError mfcc(y, sr, :pasta) +@test_throws ArgumentError postaud(a, 4000, :cough) +@test_throws "Lift number is too high (>10)" lifter(y, 100) +@test_throws "Negative lift must be integer" lifter(y, -0.6) +@test_throws ArgumentError levinson(Int[], 1) +@test_throws DomainError levinson(x, -1) + +@test_warn "First elements of a Toeplitz matrix should be equal." toeplitz([1+im]) println("Tests passed")