-
Notifications
You must be signed in to change notification settings - Fork 24
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit 0944f75
Showing
9 changed files
with
488 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,53 @@ | ||
# Densely Connected Time Delay Neural Network | ||
|
||
PyTorch implementation of Densely Connected Time Delay Neural Network (D-TDNN) in our paper ["Densely Connected Time Delay Neural Network for Speaker Verification"](https://www.isca-speech.org/archive/Interspeech_2020/abstracts/1275.html) (INTERSPEECH 2020). | ||
|
||
We provide the [pretrained models](https://github.com/yuyq96/D-TDNN/releases) which can be used in many tasks such as: | ||
|
||
- Speaker Verification | ||
- Speaker Adaptation for Speech Recognition | ||
- Speaker-Dependent Speech Separation | ||
- Multi-Speaker Text-to-Speech | ||
|
||
![D-TDNN & D-TDNN-SS](figure/D_TDNN.png) | ||
|
||
## Usage | ||
|
||
Data preparation | ||
* Install [Kaldi](https://github.com/kaldi-asr/kaldi) toolkit. | ||
* Download [VoxCeleb1 test set](http://www.robots.ox.ac.uk/~vgg/data/voxceleb/vox1.html) and unzip it. | ||
* Place `prepare_voxceleb1_test.sh` under `$kaldi_root/egs/voxceleb/v2` and change the `$datadir` and `$voxceleb1_root` in it. | ||
* Run `chmod +x prepare_voxceleb1_test.sh && ./prepare_voxceleb1_test.sh` to generate acoustic features ([30-Dim MFCCs](https://github.com/kaldi-asr/kaldi/blob/master/egs/voxceleb/v2/conf/mfcc.conf)). | ||
* Replace the `trials` under `$datadir/test_no_sil` with the [clean version](https://github.com/yuyq96/D-TDNN/releases). | ||
|
||
Test | ||
``` | ||
python main.py --root $datadir/test_no_sil --model D-TDNN --checkpoint model_zoo/dtdnn.pth --device cuda | ||
``` | ||
|
||
## Evaluation | ||
|
||
VoxCeleb1-O | ||
|
||
| Model | Emb. | Params (M) | Loss | Backend | EER (%) | DCF_0.01 | DCF_0.001 | | ||
| :---- | :--: | :--------: | :--: | :-----: | :-----: | :------: | :-------: | | ||
| [TDNN](https://github.com/yuyq96/D-TDNN/releases) | 512 | 4.2 | Softmax | PLDA | 2.34 | 0.28 | 0.38 | | ||
| E-TDNN | 512 | 6.1 | Softmax | PLDA | 2.08 | 0.26 | 0.41 | | ||
| F-TDNN | 512 | 12.4 | Softmax | PLDA | 1.89 | 0.21 | 0.29 | | ||
| [D-TDNN](https://github.com/yuyq96/D-TDNN/releases) | 512 | 2.8 | Softmax | Cosine | 1.81 | 0.20 | 0.28 | | ||
| D-TDNN-SS (0) | 512 | 3.0 | Softmax | Cosine | 1.55 | 0.20 | 0.30 | | ||
| D-TDNN-SS | 512 | 3.5 | Softmax | Cosine | 1.41 | 0.19 | 0.24 | | ||
| D-TDNN-SS | 128 | 3.1 | AAM-Softmax | Cosine | 1.22 | 0.13 | 0.20 | | ||
|
||
## Citation | ||
|
||
If you find that D-TDNN helps your research, please cite: | ||
``` | ||
@inproceedings{DBLP:conf/interspeech/YuL20, | ||
author = {Ya-Qi Yu and | ||
Wu-Jun Li}, | ||
title = {Densely Connected Time Delay Neural Network for Speaker Verification}, | ||
booktitle = {Annual Conference of the International Speech Communication Association (INTERSPEECH)}, | ||
year = {2020} | ||
} | ||
``` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
import os | ||
|
||
import kaldiio | ||
from torch.utils.data import Dataset | ||
|
||
|
||
class KaldiFeatDataset(Dataset):
    """Dataset of Kaldi feature matrices listed in ``<root>/feats.scp``.

    Every non-blank line of ``feats.scp`` is read as ``<utt-id> <location>``.
    The location string is stored as-is and handed to ``kaldiio.load_mat``
    when the item is accessed.
    """

    def __init__(self, root, transform=None):
        """Index the entries of ``feats.scp`` found under ``root``.

        Args:
            root: Directory that contains ``feats.scp``.
            transform: Optional callable applied to each loaded matrix.

        Raises:
            ValueError: If a non-blank line has no location after the id.
        """
        super(KaldiFeatDataset, self).__init__()
        self.transform = transform
        self.feats = []
        with open(os.path.join(root, 'feats.scp'), 'r') as f:
            for line_no, line in enumerate(f, 1):
                # Strip the newline and split on the first whitespace run only,
                # so the stored location carries no trailing '\n' and repeated
                # separators do not break the unpacking.
                fields = line.strip().split(None, 1)
                if not fields:
                    # Blank line (e.g. at the end of the file): nothing to index.
                    continue
                if len(fields) != 2:
                    raise ValueError(
                        "Malformed line {} in feats.scp: '{}'".format(line_no, line.strip()))
                utt, feats = fields
                self.feats.append((feats, utt))

    def __len__(self):
        """Return the number of indexed utterances."""
        return len(self.feats)

    def __getitem__(self, index):
        """Load one utterance.

        Returns:
            A ``(feats, utt)`` pair: the matrix returned by
            ``kaldiio.load_mat`` (after ``transform``, if one was given) and
            the utterance id.
        """
        feats, utt = self.feats[index]
        feats = kaldiio.load_mat(feats)
        if self.transform is not None:
            feats = self.transform(feats)
        return feats, utt
|
||
|
||
class Transpose2D(object):
    """Callable transform that swaps the two axes of a 2-D array."""

    def __call__(self, a):
        """Return ``a`` with axis 0 and axis 1 exchanged."""
        swapped_axes = (1, 0)
        return a.transpose(swapped_axes)
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,72 @@ | ||
import argparse | ||
import os | ||
|
||
import numpy as np | ||
import torch | ||
from numpy import linalg | ||
from torch.utils.data import DataLoader | ||
from tqdm import tqdm | ||
|
||
from data import KaldiFeatDataset, Transpose2D | ||
from metric import compute_fnr_fpr, compute_eer, compute_c_norm | ||
from model.tdnn import TDNN | ||
from model.dtdnn import DTDNN | ||
|
||
def _str2bool(value):
    """Convert a command-line string into a real boolean.

    ``type=bool`` would call ``bool()`` on the raw string, so any non-empty
    value (including ``'False'``) would be parsed as ``True``.

    Raises:
        argparse.ArgumentTypeError: If the text is not a recognised boolean.
    """
    text = value.strip().lower()
    if text in ('true', 't', 'yes', 'y', 'on', '1'):
        return True
    # The empty string stays False, matching what bool('') used to give.
    if text in ('false', 'f', 'no', 'n', 'off', '0', ''):
        return False
    raise argparse.ArgumentTypeError("expected a boolean value, got '{}'".format(value))


# Command-line interface of the evaluation script.
parser = argparse.ArgumentParser(description='Speaker Verification')
parser.add_argument('--root', default='data', type=str)
parser.add_argument('--model', default='D-TDNN', choices=['TDNN', 'D-TDNN'])
parser.add_argument('--checkpoint', default=None, type=str)
parser.add_argument('--device', default="cpu", choices=['cpu', 'cuda'])
parser.add_argument('--pin-memory', default=True, type=_str2bool)
|
||
|
||
def load_model():
    """Build the network selected by ``args.model`` and load its weights.

    Reads the module-level ``args`` and ``device`` globals. The checkpoint
    file must hold a dict with a ``'state_dict'`` entry.

    Returns:
        The model, moved to ``device``, with the checkpoint weights loaded.

    Raises:
        AssertionError: If ``args.checkpoint`` is not an existing file.
    """
    assert os.path.isfile(args.checkpoint), "No checkpoint found at '{}'".format(args.checkpoint)
    print('Loading checkpoint {}'.format(args.checkpoint))
    # Remap stored tensors onto the requested device; without map_location a
    # checkpoint saved from CUDA cannot be opened on a CPU-only machine.
    state_dict = torch.load(args.checkpoint, map_location=device)['state_dict']
    if args.model == 'TDNN':
        model = TDNN()
    else:
        model = DTDNN()
    model.to(device)
    model.load_state_dict(state_dict)
    return model
|
||
|
||
def test():
    """Score the trial list under ``args.root`` and print EER and min DCF.

    An embedding is extracted for every utterance of the dataset, each line
    of the ``trials`` file is scored with the cosine similarity of its two
    embeddings, and the equal error rate plus the minimum detection costs
    at target priors 0.01 and 0.001 are printed.
    """
    model = load_model()
    model.eval()

    dataset = KaldiFeatDataset(root=args.root, transform=Transpose2D())
    loader = DataLoader(dataset, batch_size=1, shuffle=False, num_workers=1, pin_memory=args.pin_memory)

    # Batch size is 1, so the first item of each batch is the utterance itself.
    utt2emb = {}
    with torch.no_grad():
        for data, utt in tqdm(loader):
            emb = model(data.to(device))
            utt2emb[utt[0]] = emb[0].cpu().numpy()

    scores, labels = [], []
    trials_path = os.path.join(args.root, 'trials')
    with open(trials_path, 'r') as f:
        for line in f:
            enroll_utt, test_utt, label = line.split(' ')
            enroll_emb = utt2emb[enroll_utt]
            test_emb = utt2emb[test_utt]
            # Cosine similarity between the two embeddings.
            norm_product = linalg.norm(enroll_emb) * linalg.norm(test_emb)
            scores.append(enroll_emb.dot(test_emb) / norm_product)
            labels.append(1 if label.strip() == 'target' else 0)

    scores = np.array(scores)
    labels = np.array(labels)
    fnr, fpr = compute_fnr_fpr(scores, labels)
    eer, th = compute_eer(fnr, fpr, True, scores)
    print('Equal error rate is {:6f}%, at threshold {:6f}'.format(eer * 100, th))
    for p_target, tag in ((0.01, '0.01'), (0.001, '0.001')):
        print('Minimum detection cost ({}) is {:6f}'.format(tag, compute_c_norm(fnr, fpr, p_target)))
|
||
|
||
# Script entry point: `args` and `device` are bound at module level because
# load_model() and test() read them as globals.
if __name__ == "__main__":
    args = parser.parse_args()
    device = torch.device(args.device)
    test()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
import numpy as np | ||
|
||
|
||
def compute_fnr_fpr(scores, labels):
    """Return false negative and false positive rates over sorted trials.

    Trials are ordered by ascending score. Entry ``i`` of the first array is
    the fraction of target trials (label 1) among the ``i + 1`` lowest-scoring
    trials; entry ``i`` of the second is one minus the same fraction computed
    for non-target trials (label 0).
    """
    order = np.argsort(scores)
    ranked_labels = labels[order]

    is_target = (ranked_labels == 1).astype('f8')
    is_nontarget = (ranked_labels == 0).astype('f8')

    miss_rate = is_target.cumsum() / is_target.sum()
    false_alarm_rate = 1 - is_nontarget.cumsum() / is_nontarget.sum()
    return miss_rate, false_alarm_rate
|
||
|
||
def compute_eer(fnr, fpr, requires_threshold=False, scores=None):
    """Compute the equal error rate (EER) from FNR and FPR curves.

    The operating point is the first index at which ``fnr - fpr`` is
    non-negative; the EER is the FNR one step before it (*kaldi style*).

    Args:
        fnr: False negative rates over the sorted trials.
        fpr: False positive rates over the same trials.
        requires_threshold: If true, also return the score at the
            operating point.
        scores: Trial scores; required when ``requires_threshold`` is true.

    Returns:
        ``eer``, or ``(eer, th)`` when ``requires_threshold`` is true.
    """
    diff_miss_fa = fnr - fpr
    x = np.flatnonzero(diff_miss_fa >= 0)[0]
    # Clamp the look-back: with x == 0, fnr[x - 1] would wrap around to the
    # last element and report the final miss rate instead of the first.
    eer = fnr[max(x - 1, 0)]
    if requires_threshold:
        assert scores is not None
        scores = np.sort(scores)
        th = scores[x]
        return eer, th
    return eer
|
||
|
||
def compute_c_norm(fnr, fpr, p_target, c_miss=1, c_fa=1):
    """Return the normalized minimum detection cost (DCF).

    The detection cost is evaluated at every operating point from the miss
    cost, the false-alarm cost and the prior probability of a target trial.
    Its minimum is then divided by the smaller of ``c_miss * p_target`` and
    ``c_fa * (1 - p_target)``.
    """
    p_nontarget = 1 - p_target
    miss_term = c_miss * fnr * p_target
    false_alarm_term = c_fa * fpr * p_nontarget
    best_cost = (miss_term + false_alarm_term).min()
    default_cost = min(c_miss * p_target, c_fa * p_nontarget)
    return best_cost / default_cost
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
from collections import OrderedDict | ||
|
||
from torch import nn | ||
|
||
from .layers import TDNNLayer, DenseTDNNBlock, TransitLayer, DenseLayer, StatsPool | ||
|
||
|
||
class DTDNN(nn.Module):
    """Densely connected TDNN assembled as one ``nn.Sequential``.

    The ``xvector`` pipeline is: an initial TDNN layer, two dense blocks each
    followed by a transit layer that halves the channel count, a statistics
    pooling layer and a final dense layer producing the embedding.
    """

    def __init__(self, feat_dim=30, embedding_size=512,
                 growth_rate=64, bn_size=2, init_channels=128,
                 config_str='batchnorm-relu'):
        super(DTDNN, self).__init__()

        first_layer = TDNNLayer(feat_dim, init_channels, 5, dilation=1, padding=-1,
                                config_str=config_str)
        self.xvector = nn.Sequential(OrderedDict([('tdnn', first_layer)]))

        # (num_layers, kernel_size, dilation) for each dense block, in order.
        block_specs = ((6, 3, 1), (12, 3, 3))
        channels = init_channels
        for idx, (num_layers, kernel_size, dilation) in enumerate(block_specs, 1):
            self.xvector.add_module('block%d' % idx, DenseTDNNBlock(
                num_layers=num_layers,
                in_channels=channels,
                out_channels=growth_rate,
                bn_channels=bn_size * growth_rate,
                kernel_size=kernel_size,
                dilation=dilation,
                config_str=config_str
            ))
            # The block's input width grows by growth_rate for each of its layers.
            channels += num_layers * growth_rate
            halved = channels // 2
            self.xvector.add_module(
                'transit%d' % idx,
                TransitLayer(channels, halved, bias=False, config_str=config_str))
            channels = halved

        self.xvector.add_module('stats', StatsPool())
        # The dense layer is fed twice the channel count after pooling.
        self.xvector.add_module('dense', DenseLayer(channels * 2, embedding_size, config_str='batchnorm_'))

    def forward(self, x):
        """Run ``x`` through the whole ``xvector`` pipeline."""
        return self.xvector(x)
Oops, something went wrong.