From 356fa8258d848e3b092c5b894f6a7b4cab63b29c Mon Sep 17 00:00:00 2001 From: Tom Ko Date: Sun, 17 Apr 2016 12:27:15 -0400 Subject: [PATCH 01/14] A new steps/data/reverberate_data_dir.py script --- egs/wsj/s5/steps/data/reverberate_data_dir.py | 299 ++++++++++++++++++ 1 file changed, 299 insertions(+) create mode 100755 egs/wsj/s5/steps/data/reverberate_data_dir.py diff --git a/egs/wsj/s5/steps/data/reverberate_data_dir.py b/egs/wsj/s5/steps/data/reverberate_data_dir.py new file mode 100755 index 00000000000..db93df16db3 --- /dev/null +++ b/egs/wsj/s5/steps/data/reverberate_data_dir.py @@ -0,0 +1,299 @@ +#!/usr/bin/env python +# Copyright 2016 Tom Ko +# Apache 2.0 +# script to generate reverberated data + +# we're using python 3.x style print but want it to work in python 2.x, +from __future__ import print_function +import argparse, glob, math, os, random, sys, warnings, copy, imp, ast + +train_lib = imp.load_source('ntl', 'steps/nnet3/nnet3_train_lib.py') + +class list_cyclic_iterator: + def __init__(self, list, random_seed = 0): + self.list_index = 0 + self.list = list + random.seed(random_seed) + random.shuffle(self.list) + + def next(self): + item = self.list[self.list_index] + self.list_index = (self.list_index + 1) % len(self.list) + return item + + +def GetArgs(): + # we add compulsary arguments as named arguments for readability + parser = argparse.ArgumentParser(description="Generate corrupted data" + "for neural network training", + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + + parser.add_argument("--rir-list-file", type=str, required = True, + help="RIR information file") + parser.add_argument("--noise-list-file", type=str, default = None, + help="Noise information file") + parser.add_argument("--num-replications", type=int, dest = "num_replica", default = 1, + help="Number of replicate to generated for the data") + parser.add_argument('--snrs', type=str, dest = "snr_string", default = '20:10:0', help='snrs to be used for corruption') + 
parser.add_argument('--prefix', type=str, default = None, help='prefix for the id of the corrupted utterances') + parser.add_argument("--speech-rvb-probability", type=float, default = 0.8, + help="Probability of reverberating the speech signal, e.g. 0 <= p <= 1") + parser.add_argument("--noise-adding-probability", type=float, default = 0.4, + help="Probability of adding point-source noises, e.g. 0 <= p <= 1") + parser.add_argument("--max-noises-added", type=int, default = 2, + help="Maximum number of point-source noises could be added") + parser.add_argument("input_dir", + help="Input data directory") + parser.add_argument("output_dir", + help="Output data directory") + + print(' '.join(sys.argv)) + + args = parser.parse_args() + args = CheckArgs(args) + + return args + +def CheckArgs(args): + if not os.path.exists(args.output_dir): + os.makedirs(args.output_dir) + + ## Check arguments. + if args.rir_list_file is None: + raise Exception("Rir information file must be provided") + + if not os.path.isfile(args.rir_list_file): + raise Exception(args.rir_list_file + "not found") + + if args.noise_list_file is not None: + if not os.path.isfile(args.noise_list_file): + raise Exception(args.noise_list_file + "not found") + + if args.num_replica > 1 and args.prefix is None: + args.prefix = "rvb" + warnings.warn("--prefix is set to 'rvb' as --num-replications is larger than 1.") + + return args + +def ParseFileToDict(file, assert2fields = False, value_processor = None): + if value_processor is None: + value_processor = lambda x: x[0] + + dict = {} + for line in open(file, 'r'): + parts = line.split() + if assert2fields: + assert(len(parts) == 2) + + dict[parts[0]] = value_processor(parts[1:]) + return dict + + +# This is the major function to generate pipeline command for the corruption +# The rir list would have the following format: +# --rir-id --room-id --receiver-position-id --source-position-id --rt-60 < --drr < location(support Kaldi IO strings) > +# The noise list 
would have the following format: +# --noise-id --noise-type --bg-fg-type --rir-file < location=(support Kaldi IO strings) > +def CorruptWav(wav_scp, durations, output_dir, room_list, noise_list, snr_string, num_replica, prefix, speech_rvb_probability, noise_adding_probability, max_noises_added): + rooms = list_cyclic_iterator(room_list, random_seed = 1) + noises = None + if len(noise_list) > 0: + noises = list_cyclic_iterator(noise_list, random_seed = 1) + snrs = list_cyclic_iterator(snr_string.split(':')) + command_list = [] + for i in range(num_replica): + keys = wav_scp.keys() + keys.sort() + for wav_id in keys: + wav_pipe = wav_scp[wav_id] + wav_dur = durations[wav_id] + if prefix is not None: + wav_id = prefix + str(i) + "_" + wav_id + command = "{0} {1} wav-reverberate".format(wav_id, wav_pipe) + + # pick the room + room = rooms.next() + command_opts = "" + noises_added = [] + snrs_added = [] + start_times_added = [] + if random.random() < speech_rvb_probability: + # pick the RIR to reverberate the speech + speech_rir = room['rir_list'][random.randint(0,len(room['rir_list'])-1)] + command_opts += "--impulse-response={0} ".format(speech_rir.rir_file_location) + # add the corresponding isotropic noise if there is any + if len(speech_rir.iso_noise_list) > 0: + isotropic_noise = speech_rir.iso_noise_list[random.randint(0,len(speech_rir.iso_noise_list)-1)] + noises_added.append("{0}".format(isotropic_noise.noise_file_location)) + snrs_added.append("{0}".format(snrs.next())) + start_times_added.append(round(random.random() * wav_dur, 2)) + + if noises is not None and random.random() < noise_adding_probability: + for k in range(random.randint(1, max_noises_added)): + # pick the RIR to reverberate the point-source noise + noise = noises.next() + noise_rir = room['rir_list'][random.randint(0,len(room['rir_list'])-1)] + start_times_added.append(round(random.random() * wav_dur, 2)) + noises_added.append("\"wav-reverberate --duration={2} --impulse-response={1} {0} - |\" 
".format(noise.noise_file_location, noise_rir.rir_file_location, round(random.random()*(wav_dur-start_times_added[-1]), 2))) + snrs_added.append("{0}".format(snrs.next())) + + if len(noises_added) > 1: + command_opts += "--additive-signals='{0}' ".format(','.join(noises_added)) + if len(snrs_added) > 1: + command_opts += "--snrs='{0}' ".format(','.join(snrs_added)) + if len(start_times_added) > 1: + command_opts += "--start-times='{0}' ".format(','.join(snrs_added)) + + if command_opts == "": + command = "{0} {1}\n".format(wav_id, wav_pipe) + else: + command = "{0} {1} wav-reverberate {2} - - |\n".format(wav_id, wav_pipe, command_opts) + + command_list.append(command) + + file_handle = open(output_dir + "/wav.scp", 'w') + file_handle.write("".join(command_list)) + file_handle.close() + + +# This function replicate the entries in files like text +def ReplicateFileType1(input_file, output_file, num_replica, prefix): + list = map(lambda x: x.strip(), open(input_file)) + f = open(output_file, "w") + for i in range(num_replica): + for line in list: + split1 = line.split() + if prefix is not None: + split1[0] = prefix + str(i) + "_" + split1[0] + print(" ".join(split1), file=f) + f.close() + + +# This function replicate the entries in files like segments, utt2spk +def ReplicateFileType2(input_file, output_file, num_replica, prefix): + list = map(lambda x: x.strip(), open(input_file)) + f = open(output_file, "w") + for i in range(num_replica): + for line in list: + split1 = line.split() + if prefix is not None: + split1[0] = prefix + str(i) + "_" + split1[0] + split1[1] = prefix + str(i) + "_" + split1[1] + print(" ".join(split1), file=f) + f.close() + + +def MakeCorruption(input_dir, output_dir, room_list, noise_list, snr_string, num_replica, prefix, speech_rvb_probability, noise_adding_probability, max_noises_added): + + if not os.path.isfile(input_dir + "/reco2dur"): + print("Getting the duration of the recordings..."); + train_lib.RunKaldiCommand("wav-to-duration 
--read-entire-file=true scp:{0}/wav.scp ark,t:{0}/reco2dur".format(input_dir)) + durations = ParseFileToDict(input_dir + "/reco2dur", value_processor = lambda x: float(x[0])) + wav_scp = ParseFileToDict(input_dir + "/wav.scp", value_processor = lambda x: " ".join(x)) + CorruptWav(wav_scp, durations, output_dir, room_list, noise_list, snr_string, num_replica, prefix, speech_rvb_probability, noise_adding_probability, max_noises_added) + + ReplicateFileType2(input_dir + "/utt2spk", output_dir + "/utt2spk", num_replica, prefix) + train_lib.RunKaldiCommand("utils/utt2spk_to_spk2utt.pl <{output_dir}/utt2spk >{output_dir}/spk2utt" + .format(output_dir = output_dir)) + + if os.path.isfile(input_dir + "/text"): + ReplicateFileType1(input_dir + "/text", output_dir + "/text", num_replica, prefix) + if os.path.isfile(input_dir + "/segments"): + ReplicateFileType2(input_dir + "/segments", output_dir + "/segments", num_replica, prefix) + if os.path.isfile(input_dir + "/reco2file_and_channel"): + ReplicateFileType2(input_dir + "/reco2file_and_channel", output_dir + "/reco2file_and_channel", num_replica, prefix) + + train_lib.RunKaldiCommand("utils/validate_data_dir.sh --no-feats {output_dir}" + .format(output_dir = output_dir)) + + +def ParseRirList(rir_list_file): + rir_parser = argparse.ArgumentParser() + rir_parser.add_argument('--rir-id', type=str, required=True, help='rir id') + rir_parser.add_argument('--room-id', type=str, required=True, help='room id') + rir_parser.add_argument('--receiver-position-id', type=str, default=None, help='receiver position id') + rir_parser.add_argument('--source-position-id', type=str, default=None, help='source position id') + rir_parser.add_argument('--rt60', type=float, default=None, help='RT60 is the time required for reflections of a direct sound to decay 60 dB.') + rir_parser.add_argument('--drr', type=float, default=None, help='Direct-to-reverberant-ratio of the impulse.') + rir_parser.add_argument('rir_file_location', type=str, 
help='rir file location') + + rir_list = [] + rir_lines = map(lambda x: x.strip(), open(rir_list_file)) + for line in rir_lines: + rir = rir_parser.parse_args(line.split()) + setattr(rir, "iso_noise_list", []) + rir.iso_noise_list = [] + rir_list.append(rir) + + return rir_list + +def MakeRoomList(rir_list): + room_list = [] + for i in range(len(rir_list)): + id = -1 + for j in range(len(room_list)): + if room_list[j]['room_id'] == rir_list[i].room_id: + id = j + break + if id == -1: + # add new room + room_list.append({'room_id': rir_list[i].room_id, 'rir_list': []}) + + room_list[id]['rir_list'].append(rir_list[i]) + + return room_list + +def ParseNoiseList(rir_list, noise_list_file): + noise_parser = argparse.ArgumentParser() + noise_parser.add_argument('--noise-id', type=str, required=True, help='noise id') + noise_parser.add_argument('--noise-type', type=str, required=True, help='the type of noise; i.e. isotropic or point-source', choices = ["isotropic", "point-source"]) + noise_parser.add_argument('--bg-fg-type', type=str, default="background", help='background or foregroun noise', choices = ["background", "foreground"]) + noise_parser.add_argument('--rir-file', type=str, default=None, help='compulsary if isotropic, should not be specified if point-source') + noise_parser.add_argument('noise_file_location', type=str, help='noise file location') + + point_noise_list = [] + noise_lines = map(lambda x: x.strip(), open(noise_list_file)) + for line in noise_lines: + noise = noise_parser.parse_args(line.split()) + if noise.noise_type == "isotropic": + if noise.rir_file is None: + raise Exception("--rir-file must be specified is --noise-type is point-source") + warnings.warn("No rir file specified for noise id {0}".format(noise.noise_id)) + else: + id = -1 + for j in range(len(rir_list)): + if noise.rir_file == rir_list[j].rir_file_location: + id = j + print(noise.rir_file) + rir_list[id].iso_noise_list.append(noise) + break; + if id == -1: + warnings.warn("Rir file 
specified for noise id {0} is not found in rir_list".format(noise.noise_id)) + else: + point_noise_list.append(noise) + + return (point_noise_list, rir_list) + +def Main(): + args = GetArgs() + rir_list = ParseRirList(args.rir_list_file) + noise_list = [] + if args.noise_list_file is not None: + noise_list, rir_list = ParseNoiseList(rir_list, args.noise_list_file) + print("Number of point-source noises is {0}".format(len(noise_list))) + room_list = MakeRoomList(rir_list) + + MakeCorruption(input_dir = args.input_dir, + output_dir = args.output_dir, + room_list = room_list, + noise_list = noise_list, + snr_string = args.snr_string, + num_replica = args.num_replica, + prefix = args.prefix, + speech_rvb_probability = args.speech_rvb_probability, + noise_adding_probability = args.noise_adding_probability, + max_noises_added = args.max_noises_added) + +if __name__ == "__main__": + Main() + From 8671e5937c76c787d01f0801cfacb892f75a360c Mon Sep 17 00:00:00 2001 From: Tom Ko Date: Fri, 22 Apr 2016 09:59:27 -0400 Subject: [PATCH 02/14] update function names; split snrs to background and foreground; user specified random seed; always handle isotropic noise as background noise --- egs/wsj/s5/steps/data/reverberate_data_dir.py | 131 +++++++++--------- 1 file changed, 69 insertions(+), 62 deletions(-) diff --git a/egs/wsj/s5/steps/data/reverberate_data_dir.py b/egs/wsj/s5/steps/data/reverberate_data_dir.py index db93df16db3..b06035de2a6 100755 --- a/egs/wsj/s5/steps/data/reverberate_data_dir.py +++ b/egs/wsj/s5/steps/data/reverberate_data_dir.py @@ -10,10 +10,9 @@ train_lib = imp.load_source('ntl', 'steps/nnet3/nnet3_train_lib.py') class list_cyclic_iterator: - def __init__(self, list, random_seed = 0): + def __init__(self, list): self.list_index = 0 self.list = list - random.seed(random_seed) random.shuffle(self.list) def next(self): @@ -24,17 +23,28 @@ def next(self): def GetArgs(): # we add compulsary arguments as named arguments for readability - parser = 
argparse.ArgumentParser(description="Generate corrupted data" - "for neural network training", + parser = argparse.ArgumentParser(description="Reverberate the data directory with an option " + "to add isotropic and point source noiseis. " + "This script only deals with single channel wave files. " + "If multi-channel noise/rir/speech files are provided one " + "of the channels will be randomly picked", formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument("--rir-list-file", type=str, required = True, - help="RIR information file") + help="RIR information file, the format of the file is " + "--rir-id --room-id " + "--receiver-position-id --source-position-id " + "--rt-60 < --drr < location(support Kaldi IO strings) >") parser.add_argument("--noise-list-file", type=str, default = None, - help="Noise information file") + help="Noise information file, the format of the file is" + "--noise-id --noise-type " + "--bg-fg-type " + "--rir-file " + "< location=(support Kaldi IO strings) >") parser.add_argument("--num-replications", type=int, dest = "num_replica", default = 1, help="Number of replicate to generated for the data") - parser.add_argument('--snrs', type=str, dest = "snr_string", default = '20:10:0', help='snrs to be used for corruption') + parser.add_argument('--foreground-snrs', type=str, dest = "foreground_snr_string", default = '20:10:0', help='snrs for foreground noises') + parser.add_argument('--background-snrs', type=str, dest = "background_snr_string", default = '20:10:0', help='snrs for background noises') parser.add_argument('--prefix', type=str, default = None, help='prefix for the id of the corrupted utterances') parser.add_argument("--speech-rvb-probability", type=float, default = 0.8, help="Probability of reverberating the speech signal, e.g. 0 <= p <= 1") @@ -42,6 +52,7 @@ def GetArgs(): help="Probability of adding point-source noises, e.g. 
0 <= p <= 1") parser.add_argument("--max-noises-added", type=int, default = 2, help="Maximum number of point-source noises could be added") + parser.add_argument('--random-seed', type=int, default=0, help='seed to be used in the randomization of impulese and noises') parser.add_argument("input_dir", help="Input data directory") parser.add_argument("output_dir", @@ -59,9 +70,6 @@ def CheckArgs(args): os.makedirs(args.output_dir) ## Check arguments. - if args.rir_list_file is None: - raise Exception("Rir information file must be provided") - if not os.path.isfile(args.rir_list_file): raise Exception(args.rir_list_file + "not found") @@ -90,26 +98,25 @@ def ParseFileToDict(file, assert2fields = False, value_processor = None): # This is the major function to generate pipeline command for the corruption -# The rir list would have the following format: -# --rir-id --room-id --receiver-position-id --source-position-id --rt-60 < --drr < location(support Kaldi IO strings) > -# The noise list would have the following format: -# --noise-id --noise-type --bg-fg-type --rir-file < location=(support Kaldi IO strings) > -def CorruptWav(wav_scp, durations, output_dir, room_list, noise_list, snr_string, num_replica, prefix, speech_rvb_probability, noise_adding_probability, max_noises_added): - rooms = list_cyclic_iterator(room_list, random_seed = 1) +def CorruptWav(wav_scp, durations, output_dir, room_list, noise_list, foreground_snr_array, background_snr_array, num_replica, prefix, speech_rvb_probability, noise_adding_probability, max_noises_added): + rooms = list_cyclic_iterator(room_list) noises = None if len(noise_list) > 0: - noises = list_cyclic_iterator(noise_list, random_seed = 1) - snrs = list_cyclic_iterator(snr_string.split(':')) + noises = list_cyclic_iterator(noise_list) + foreground_snrs = list_cyclic_iterator(foreground_snr_array) + background_snrs = list_cyclic_iterator(background_snr_array) command_list = [] for i in range(num_replica): keys = wav_scp.keys() 
keys.sort() for wav_id in keys: wav_pipe = wav_scp[wav_id] - wav_dur = durations[wav_id] + # check if it is really a pipe + if len(wav_pipe.split()) == 1: + wav_pipe = "cat {0} |".format(wav_pipe) + speech_dur = durations[wav_id] if prefix is not None: wav_id = prefix + str(i) + "_" + wav_id - command = "{0} {1} wav-reverberate".format(wav_id, wav_pipe) # pick the room room = rooms.next() @@ -124,25 +131,31 @@ def CorruptWav(wav_scp, durations, output_dir, room_list, noise_list, snr_string # add the corresponding isotropic noise if there is any if len(speech_rir.iso_noise_list) > 0: isotropic_noise = speech_rir.iso_noise_list[random.randint(0,len(speech_rir.iso_noise_list)-1)] - noises_added.append("{0}".format(isotropic_noise.noise_file_location)) - snrs_added.append("{0}".format(snrs.next())) - start_times_added.append(round(random.random() * wav_dur, 2)) + # extend the isotropic noise to the length of the speech waveform + noises_added.append("\"wav-reverberate --duration={1} {0} - |\" ".format(isotropic_noise.noise_file_location, speech_dur)) + snrs_added.append(background_snrs.next()) + start_times_added.append(0) if noises is not None and random.random() < noise_adding_probability: for k in range(random.randint(1, max_noises_added)): # pick the RIR to reverberate the point-source noise noise = noises.next() noise_rir = room['rir_list'][random.randint(0,len(room['rir_list'])-1)] - start_times_added.append(round(random.random() * wav_dur, 2)) - noises_added.append("\"wav-reverberate --duration={2} --impulse-response={1} {0} - |\" ".format(noise.noise_file_location, noise_rir.rir_file_location, round(random.random()*(wav_dur-start_times_added[-1]), 2))) - snrs_added.append("{0}".format(snrs.next())) - - if len(noises_added) > 1: + if noise.bg_fg_type == "background": + start_times_added.append(0) + noises_added.append("\"wav-reverberate --duration={2} --impulse-response={1} {0} - |\" ".format(noise.noise_file_location, noise_rir.rir_file_location, speech_dur)) + 
snrs_added.append(background_snrs.next()) + else: + start_times_added.append(round(random.random() * speech_dur, 2)) + noises_added.append("\"wav-reverberate --impulse-response={1} {0} - |\" ".format(noise.noise_file_location, noise_rir.rir_file_location)) + snrs_added.append(foreground_snrs.next()) + + if len(noises_added) > 0: command_opts += "--additive-signals='{0}' ".format(','.join(noises_added)) - if len(snrs_added) > 1: - command_opts += "--snrs='{0}' ".format(','.join(snrs_added)) - if len(start_times_added) > 1: - command_opts += "--start-times='{0}' ".format(','.join(snrs_added)) + if len(snrs_added) > 0: + command_opts += "--snrs='{0}' ".format(','.join(map(lambda x:str(x),snrs_added))) + if len(start_times_added) > 0: + command_opts += "--start-times='{0}' ".format(','.join(map(lambda x:str(x),start_times_added))) if command_opts == "": command = "{0} {1}\n".format(wav_id, wav_pipe) @@ -156,52 +169,45 @@ def CorruptWav(wav_scp, durations, output_dir, room_list, noise_list, snr_string file_handle.close() -# This function replicate the entries in files like text -def ReplicateFileType1(input_file, output_file, num_replica, prefix): +# This function replicate the entries in files like segments, utt2spk, text +def AddPrefixToFields(input_file, output_file, num_replica, prefix, field = [0]): list = map(lambda x: x.strip(), open(input_file)) f = open(output_file, "w") for i in range(num_replica): for line in list: - split1 = line.split() - if prefix is not None: - split1[0] = prefix + str(i) + "_" + split1[0] - print(" ".join(split1), file=f) - f.close() - - -# This function replicate the entries in files like segments, utt2spk -def ReplicateFileType2(input_file, output_file, num_replica, prefix): - list = map(lambda x: x.strip(), open(input_file)) - f = open(output_file, "w") - for i in range(num_replica): - for line in list: - split1 = line.split() - if prefix is not None: - split1[0] = prefix + str(i) + "_" + split1[0] - split1[1] = prefix + str(i) + "_" 
+ split1[1] - print(" ".join(split1), file=f) + if len(line) > 0 and line[0] != ';': + split1 = line.split() + for j in field: + if prefix is not None: + split1[j] = prefix + str(i) + "_" + split1[j] + print(" ".join(split1), file=f) + else: + print(line, file=f) f.close() -def MakeCorruption(input_dir, output_dir, room_list, noise_list, snr_string, num_replica, prefix, speech_rvb_probability, noise_adding_probability, max_noises_added): +def CreateReverberatedCopy(input_dir, output_dir, room_list, noise_list, foreground_snr_string, background_snr_string, num_replica, prefix, speech_rvb_probability, noise_adding_probability, max_noises_added): if not os.path.isfile(input_dir + "/reco2dur"): print("Getting the duration of the recordings..."); train_lib.RunKaldiCommand("wav-to-duration --read-entire-file=true scp:{0}/wav.scp ark,t:{0}/reco2dur".format(input_dir)) durations = ParseFileToDict(input_dir + "/reco2dur", value_processor = lambda x: float(x[0])) wav_scp = ParseFileToDict(input_dir + "/wav.scp", value_processor = lambda x: " ".join(x)) - CorruptWav(wav_scp, durations, output_dir, room_list, noise_list, snr_string, num_replica, prefix, speech_rvb_probability, noise_adding_probability, max_noises_added) + foreground_snr_array = map(lambda x: float(x), foreground_snr_string.split(':')) + background_snr_array = map(lambda x: float(x), background_snr_string.split(':')) + + CorruptWav(wav_scp, durations, output_dir, room_list, noise_list, foreground_snr_array, background_snr_array, num_replica, prefix, speech_rvb_probability, noise_adding_probability, max_noises_added) - ReplicateFileType2(input_dir + "/utt2spk", output_dir + "/utt2spk", num_replica, prefix) + AddPrefixToFields(input_dir + "/utt2spk", output_dir + "/utt2spk", num_replica, prefix, field = [0,1]) train_lib.RunKaldiCommand("utils/utt2spk_to_spk2utt.pl <{output_dir}/utt2spk >{output_dir}/spk2utt" .format(output_dir = output_dir)) if os.path.isfile(input_dir + "/text"): - ReplicateFileType1(input_dir + 
"/text", output_dir + "/text", num_replica, prefix) + AddPrefixToFields(input_dir + "/text", output_dir + "/text", num_replica, prefix, field =[0]) if os.path.isfile(input_dir + "/segments"): - ReplicateFileType2(input_dir + "/segments", output_dir + "/segments", num_replica, prefix) + AddPrefixToFields(input_dir + "/segments", output_dir + "/segments", num_replica, prefix, field = [0,1]) if os.path.isfile(input_dir + "/reco2file_and_channel"): - ReplicateFileType2(input_dir + "/reco2file_and_channel", output_dir + "/reco2file_and_channel", num_replica, prefix) + AddPrefixToFields(input_dir + "/reco2file_and_channel", output_dir + "/reco2file_and_channel", num_replica, prefix, field = [0,1]) train_lib.RunKaldiCommand("utils/validate_data_dir.sh --no-feats {output_dir}" .format(output_dir = output_dir)) @@ -247,7 +253,7 @@ def ParseNoiseList(rir_list, noise_list_file): noise_parser = argparse.ArgumentParser() noise_parser.add_argument('--noise-id', type=str, required=True, help='noise id') noise_parser.add_argument('--noise-type', type=str, required=True, help='the type of noise; i.e. 
isotropic or point-source', choices = ["isotropic", "point-source"]) - noise_parser.add_argument('--bg-fg-type', type=str, default="background", help='background or foregroun noise', choices = ["background", "foreground"]) + noise_parser.add_argument('--bg-fg-type', type=str, default="background", help='background or foreground noise', choices = ["background", "foreground"]) noise_parser.add_argument('--rir-file', type=str, default=None, help='compulsary if isotropic, should not be specified if point-source') noise_parser.add_argument('noise_file_location', type=str, help='noise file location') @@ -264,7 +270,6 @@ def ParseNoiseList(rir_list, noise_list_file): for j in range(len(rir_list)): if noise.rir_file == rir_list[j].rir_file_location: id = j - print(noise.rir_file) rir_list[id].iso_noise_list.append(noise) break; if id == -1: @@ -276,6 +281,7 @@ def ParseNoiseList(rir_list, noise_list_file): def Main(): args = GetArgs() + random.seed(args.random_seed) rir_list = ParseRirList(args.rir_list_file) noise_list = [] if args.noise_list_file is not None: @@ -283,11 +289,12 @@ def Main(): print("Number of point-source noises is {0}".format(len(noise_list))) room_list = MakeRoomList(rir_list) - MakeCorruption(input_dir = args.input_dir, + CreateReverberatedCopy(input_dir = args.input_dir, output_dir = args.output_dir, room_list = room_list, noise_list = noise_list, - snr_string = args.snr_string, + foreground_snr_string = args.foreground_snr_string, + background_snr_string = args.background_snr_string, num_replica = args.num_replica, prefix = args.prefix, speech_rvb_probability = args.speech_rvb_probability, From 99b4210cd403a53a620e5a7e7387ae2d38bb146e Mon Sep 17 00:00:00 2001 From: Tom Ko Date: Thu, 5 May 2016 05:34:38 -0400 Subject: [PATCH 03/14] Pick the RIRs and noises according to assigned probabilities. 
--- egs/wsj/s5/steps/data/reverberate_data_dir.py | 134 ++++++++++++------ 1 file changed, 94 insertions(+), 40 deletions(-) diff --git a/egs/wsj/s5/steps/data/reverberate_data_dir.py b/egs/wsj/s5/steps/data/reverberate_data_dir.py index b06035de2a6..a7d887e5f06 100755 --- a/egs/wsj/s5/steps/data/reverberate_data_dir.py +++ b/egs/wsj/s5/steps/data/reverberate_data_dir.py @@ -83,6 +83,40 @@ def CheckArgs(args): return args + +def PickItemFromDict(dict): + total_p = sum(dict[key].probability for key in dict.keys()) + p = random.uniform(0, total_p) + upto = 0 + for key in dict.keys(): + if upto + dict[key].probability >= p: + return dict[key] + upto += dict[key].probability + assert False, "Shouldn't get here" + + +def PickItemFromList(list): + total_p = sum(item.probability for item in list) + p = random.uniform(0, total_p) + upto = 0 + for item in list: + if upto + item.probability >= p: + return item + upto += item.probability + assert False, "Shouldn't get here" + + +def weighted_choice(choices): + total = sum(w for c, w in choices) + r = random.uniform(0, total) + upto = 0 + for c, w in choices: + if upto + w >= r: + return c + upto += w + assert False, "Shouldn't get here" + + def ParseFileToDict(file, assert2fields = False, value_processor = None): if value_processor is None: value_processor = lambda x: x[0] @@ -98,11 +132,7 @@ def ParseFileToDict(file, assert2fields = False, value_processor = None): # This is the major function to generate pipeline command for the corruption -def CorruptWav(wav_scp, durations, output_dir, room_list, noise_list, foreground_snr_array, background_snr_array, num_replica, prefix, speech_rvb_probability, noise_adding_probability, max_noises_added): - rooms = list_cyclic_iterator(room_list) - noises = None - if len(noise_list) > 0: - noises = list_cyclic_iterator(noise_list) +def CorruptWav(wav_scp, durations, output_dir, room_dict, noise_list, foreground_snr_array, background_snr_array, num_replica, prefix, speech_rvb_probability, 
noise_adding_probability, max_noises_added): foreground_snrs = list_cyclic_iterator(foreground_snr_array) background_snrs = list_cyclic_iterator(background_snr_array) command_list = [] @@ -119,28 +149,29 @@ def CorruptWav(wav_scp, durations, output_dir, room_list, noise_list, foreground wav_id = prefix + str(i) + "_" + wav_id # pick the room - room = rooms.next() + room = PickItemFromDict(room_dict) command_opts = "" noises_added = [] snrs_added = [] start_times_added = [] if random.random() < speech_rvb_probability: # pick the RIR to reverberate the speech - speech_rir = room['rir_list'][random.randint(0,len(room['rir_list'])-1)] + speech_rir = PickItemFromList(room.rir_list) command_opts += "--impulse-response={0} ".format(speech_rir.rir_file_location) # add the corresponding isotropic noise if there is any if len(speech_rir.iso_noise_list) > 0: - isotropic_noise = speech_rir.iso_noise_list[random.randint(0,len(speech_rir.iso_noise_list)-1)] + isotropic_noise = PickItemFromList(speech_rir.iso_noise_list) # extend the isotropic noise to the length of the speech waveform noises_added.append("\"wav-reverberate --duration={1} {0} - |\" ".format(isotropic_noise.noise_file_location, speech_dur)) snrs_added.append(background_snrs.next()) start_times_added.append(0) - if noises is not None and random.random() < noise_adding_probability: + # Add the point-source noise + if len(noise_list) > 0 and random.random() < noise_adding_probability: for k in range(random.randint(1, max_noises_added)): # pick the RIR to reverberate the point-source noise - noise = noises.next() - noise_rir = room['rir_list'][random.randint(0,len(room['rir_list'])-1)] + noise = PickItemFromList(noise_list) + noise_rir = PickItemFromList(room.rir_list) if noise.bg_fg_type == "background": start_times_added.append(0) noises_added.append("\"wav-reverberate --duration={2} --impulse-response={1} {0} - |\" ".format(noise.noise_file_location, noise_rir.rir_file_location, speech_dur)) @@ -186,7 +217,7 @@ def 
AddPrefixToFields(input_file, output_file, num_replica, prefix, field = [0]) f.close() -def CreateReverberatedCopy(input_dir, output_dir, room_list, noise_list, foreground_snr_string, background_snr_string, num_replica, prefix, speech_rvb_probability, noise_adding_probability, max_noises_added): +def CreateReverberatedCopy(input_dir, output_dir, room_dict, noise_list, foreground_snr_string, background_snr_string, num_replica, prefix, speech_rvb_probability, noise_adding_probability, max_noises_added): if not os.path.isfile(input_dir + "/reco2dur"): print("Getting the duration of the recordings..."); @@ -196,7 +227,7 @@ def CreateReverberatedCopy(input_dir, output_dir, room_list, noise_list, foregro foreground_snr_array = map(lambda x: float(x), foreground_snr_string.split(':')) background_snr_array = map(lambda x: float(x), background_snr_string.split(':')) - CorruptWav(wav_scp, durations, output_dir, room_list, noise_list, foreground_snr_array, background_snr_array, num_replica, prefix, speech_rvb_probability, noise_adding_probability, max_noises_added) + CorruptWav(wav_scp, durations, output_dir, room_dict, noise_list, foreground_snr_array, background_snr_array, num_replica, prefix, speech_rvb_probability, noise_adding_probability, max_noises_added) AddPrefixToFields(input_dir + "/utt2spk", output_dir + "/utt2spk", num_replica, prefix, field = [0,1]) train_lib.RunKaldiCommand("utils/utt2spk_to_spk2utt.pl <{output_dir}/utt2spk >{output_dir}/spk2utt" @@ -212,6 +243,21 @@ def CreateReverberatedCopy(input_dir, output_dir, room_list, noise_list, foregro train_lib.RunKaldiCommand("utils/validate_data_dir.sh --no-feats {output_dir}" .format(output_dir = output_dir)) +def SmoothProbability(list): + uniform_probability = 1 / float(len(list)) + for item in list: + if item.probability is None: + item.probability = uniform_probability + else: + # smooth the probability + item.probability = 0.3 * item.probability + 0.7 * uniform_probability + + sum_p = sum(item.probability 
for item in list) + # Normalize the probability + for item in list: + item.probability = item.probability / sum_p + + return list def ParseRirList(rir_list_file): rir_parser = argparse.ArgumentParser() @@ -221,6 +267,7 @@ def ParseRirList(rir_list_file): rir_parser.add_argument('--source-position-id', type=str, default=None, help='source position id') rir_parser.add_argument('--rt60', type=float, default=None, help='RT60 is the time required for reflections of a direct sound to decay 60 dB.') rir_parser.add_argument('--drr', type=float, default=None, help='Direct-to-reverberant-ratio of the impulse.') + rir_parser.add_argument('--probability', type=float, default=None, help='probability of the impulse.') rir_parser.add_argument('rir_file_location', type=str, help='rir file location') rir_list = [] @@ -228,26 +275,26 @@ def ParseRirList(rir_list_file): for line in rir_lines: rir = rir_parser.parse_args(line.split()) setattr(rir, "iso_noise_list", []) - rir.iso_noise_list = [] rir_list.append(rir) - return rir_list + return SmoothProbability(rir_list) -def MakeRoomList(rir_list): - room_list = [] - for i in range(len(rir_list)): - id = -1 - for j in range(len(room_list)): - if room_list[j]['room_id'] == rir_list[i].room_id: - id = j - break - if id == -1: + +def MakeRoomDict(rir_list): + room_dict = {} + for rir in rir_list: + if rir.room_id not in room_dict: # add new room - room_list.append({'room_id': rir_list[i].room_id, 'rir_list': []}) + room_dict[rir.room_id] = lambda: None + setattr(room_dict[rir.room_id], "rir_list", []) + setattr(room_dict[rir.room_id], "probability", 0) + room_dict[rir.room_id].rir_list.append(rir) + + for key in room_dict.keys(): + room_dict[key].probability = sum(rir.probability for rir in room_dict[key].rir_list) - room_list[id]['rir_list'].append(rir_list[i]) + return room_dict - return room_list def ParseNoiseList(rir_list, noise_list_file): noise_parser = argparse.ArgumentParser() @@ -255,29 +302,36 @@ def ParseNoiseList(rir_list, 
noise_list_file): noise_parser.add_argument('--noise-type', type=str, required=True, help='the type of noise; i.e. isotropic or point-source', choices = ["isotropic", "point-source"]) noise_parser.add_argument('--bg-fg-type', type=str, default="background", help='background or foreground noise', choices = ["background", "foreground"]) noise_parser.add_argument('--rir-file', type=str, default=None, help='compulsary if isotropic, should not be specified if point-source') + noise_parser.add_argument('--probability', type=float, default=None, help='probability of the noise.') noise_parser.add_argument('noise_file_location', type=str, help='noise file location') point_noise_list = [] + iso_noise_list = [] noise_lines = map(lambda x: x.strip(), open(noise_list_file)) for line in noise_lines: noise = noise_parser.parse_args(line.split()) if noise.noise_type == "isotropic": if noise.rir_file is None: - raise Exception("--rir-file must be specified is --noise-type is point-source") - warnings.warn("No rir file specified for noise id {0}".format(noise.noise_id)) + raise Exception("--rir-file must be specified if --noise-type is point-source") else: - id = -1 - for j in range(len(rir_list)): - if noise.rir_file == rir_list[j].rir_file_location: - id = j - rir_list[id].iso_noise_list.append(noise) - break; - if id == -1: - warnings.warn("Rir file specified for noise id {0} is not found in rir_list".format(noise.noise_id)) + iso_noise_list.append(noise) else: point_noise_list.append(noise) - return (point_noise_list, rir_list) + iso_noise_list = SmoothProbability(iso_noise_list) + + for iso_noise in iso_noise_list: + id = -1 + for j in range(len(rir_list)): + if iso_noise.rir_file == rir_list[j].rir_file_location: + id = j + rir_list[id].iso_noise_list.append(noise) + break; + if id == -1: + warnings.warn("Rir file specified for noise id {0} is not found in rir_list".format(iso_noise.noise_id)) + + return (SmoothProbability(point_noise_list), rir_list) + def Main(): args = 
GetArgs() @@ -287,11 +341,11 @@ def Main(): if args.noise_list_file is not None: noise_list, rir_list = ParseNoiseList(rir_list, args.noise_list_file) print("Number of point-source noises is {0}".format(len(noise_list))) - room_list = MakeRoomList(rir_list) + room_dict = MakeRoomDict(rir_list) CreateReverberatedCopy(input_dir = args.input_dir, output_dir = args.output_dir, - room_list = room_list, + room_dict = room_dict, noise_list = noise_list, foreground_snr_string = args.foreground_snr_string, background_snr_string = args.background_snr_string, From 0b7f06c11e45c3f53c7e4b21cd88357e053014d1 Mon Sep 17 00:00:00 2001 From: Tom Ko Date: Sat, 7 May 2016 20:22:49 -0400 Subject: [PATCH 04/14] Modify wav-reverberate.cc according to the new steps/data/reverberate_data_dir.py --- egs/wsj/s5/steps/data/reverberate_data_dir.py | 9 +- src/featbin/wav-reverberate.cc | 240 ++++++++++++------ 2 files changed, 171 insertions(+), 78 deletions(-) diff --git a/egs/wsj/s5/steps/data/reverberate_data_dir.py b/egs/wsj/s5/steps/data/reverberate_data_dir.py index a7d887e5f06..228eaebfba0 100755 --- a/egs/wsj/s5/steps/data/reverberate_data_dir.py +++ b/egs/wsj/s5/steps/data/reverberate_data_dir.py @@ -132,6 +132,9 @@ def ParseFileToDict(file, assert2fields = False, value_processor = None): # This is the major function to generate pipeline command for the corruption +# The generic command of wav-reverberate will be like: +# wav-reverberate --duration=t --impulse-response=rir.wav +# --additive-signals='noise1.wav,noise2.wav' --snrs='snr1,snr2' --start-times='s1,s2' input.wav output.wav def CorruptWav(wav_scp, durations, output_dir, room_dict, noise_list, foreground_snr_array, background_snr_array, num_replica, prefix, speech_rvb_probability, noise_adding_probability, max_noises_added): foreground_snrs = list_cyclic_iterator(foreground_snr_array) background_snrs = list_cyclic_iterator(background_snr_array) @@ -162,7 +165,7 @@ def CorruptWav(wav_scp, durations, output_dir, room_dict, 
noise_list, foreground if len(speech_rir.iso_noise_list) > 0: isotropic_noise = PickItemFromList(speech_rir.iso_noise_list) # extend the isotropic noise to the length of the speech waveform - noises_added.append("\"wav-reverberate --duration={1} {0} - |\" ".format(isotropic_noise.noise_file_location, speech_dur)) + noises_added.append("wav-reverberate --duration={1} {0} - |".format(isotropic_noise.noise_file_location, speech_dur)) snrs_added.append(background_snrs.next()) start_times_added.append(0) @@ -174,11 +177,11 @@ def CorruptWav(wav_scp, durations, output_dir, room_dict, noise_list, foreground noise_rir = PickItemFromList(room.rir_list) if noise.bg_fg_type == "background": start_times_added.append(0) - noises_added.append("\"wav-reverberate --duration={2} --impulse-response={1} {0} - |\" ".format(noise.noise_file_location, noise_rir.rir_file_location, speech_dur)) + noises_added.append("wav-reverberate --duration={2} --impulse-response={1} {0} - |".format(noise.noise_file_location, noise_rir.rir_file_location, speech_dur)) snrs_added.append(background_snrs.next()) else: start_times_added.append(round(random.random() * speech_dur, 2)) - noises_added.append("\"wav-reverberate --impulse-response={1} {0} - |\" ".format(noise.noise_file_location, noise_rir.rir_file_location)) + noises_added.append("wav-reverberate --impulse-response={1} {0} - |".format(noise.noise_file_location, noise_rir.rir_file_location)) snrs_added.append(foreground_snrs.next()) if len(noises_added) > 0: diff --git a/src/featbin/wav-reverberate.cc b/src/featbin/wav-reverberate.cc index d7599c5ea3d..56e2a0eb4f6 100644 --- a/src/featbin/wav-reverberate.cc +++ b/src/featbin/wav-reverberate.cc @@ -28,7 +28,8 @@ namespace kaldi { This function is to repeatedly concatenate signal1 by itself to match the length of signal2 and add the two signals together. 
*/ -void AddVectorsOfUnequalLength(const Vector &signal1, Vector *signal2) { +void AddVectorsOfUnequalLength(const Vector &signal1, + Vector *signal2) { for (int32 po = 0; po < signal2->Dim(); po += signal1.Dim()) { int32 block_length = signal1.Dim(); if (signal2->Dim() - po < block_length) block_length = signal2->Dim() - po; @@ -36,6 +37,18 @@ void AddVectorsOfUnequalLength(const Vector &signal1, Vector &signal1, int32 offset, + Vector *signal2) { + int32 add_length = std::min(signal2->Dim() - offset, signal1.Dim()); + if (add_length > 0) + signal2->Range(offset, add_length).AddVec(1.0, signal1.Range(0, add_length)); +} + + BaseFloat MaxAbsolute(const Vector &vector) { return std::max(std::abs(vector.Max()), std::abs(vector.Min())); } @@ -71,29 +84,44 @@ BaseFloat ComputeEarlyReverbEnergy(const Vector &rir, const Vector &rir, BaseFloat samp_freq, - BaseFloat snr_db, Vector *noise, +float DoReverberation(const Vector &rir, BaseFloat samp_freq, Vector *signal) { - if (noise->Dim()) { - float input_power = ComputeEarlyReverbEnergy(rir, *signal, samp_freq); - float noise_power = VecVec(*noise, *noise) / noise->Dim(); - float scale_factor = sqrt(pow(10, -snr_db / 10) * input_power / noise_power); - noise->Scale(scale_factor); - KALDI_VLOG(1) << "Noise signal is being scaled with " << scale_factor - << " to generate output with SNR " << snr_db << "db\n"; - } - + float signal_power = ComputeEarlyReverbEnergy(rir, *signal, samp_freq); FFTbasedBlockConvolveSignals(rir, signal); + return signal_power; +} - if (noise->Dim() > 0) { - AddVectorsOfUnequalLength(*noise, signal); +/* + The noise will be scaled before the addition + to match the given signal-to-noise ratio (SNR). 
+*/ +void AddNoise(Vector *noise, BaseFloat snr_db, + BaseFloat time, BaseFloat samp_freq, + BaseFloat signal_power, Vector *signal) { + float noise_power = VecVec(*noise, *noise) / noise->Dim(); + float scale_factor = sqrt(pow(10, -snr_db / 10) * signal_power / noise_power); + noise->Scale(scale_factor); + KALDI_VLOG(1) << "Noise signal is being scaled with " << scale_factor + << " to generate output with SNR " << snr_db << "db\n"; + int32 offset = time * samp_freq; + AddVectorsWithOffset(*noise, offset, signal); +} + +/* + This function converts a comma-separated string into a float vector. +*/ +void ReadCommaSeparatedCommand(const std::string &s, + std::vector *v) { + std::vector split_string; + SplitStringToVector(s, ",", true, &split_string); + for (size_t i = 0; i < split_string.size(); i++) { + float ret; + ConvertStringToReal(split_string[i], &ret); + v->push_back(ret); } } } @@ -107,20 +135,24 @@ int main(int argc, char *argv[]) { "room-impulse response (rir_matrix) and additive noise distortions\n" "(specified by corresponding files).\n" "Usage: wav-reverberate [options...] 
" - " \n" + "\n" "e.g.\n" - "wav-reverberate --noise-file=noise.wav \\\n" - " input.wav rir.wav output.wav\n"; + "wav-reverberate --duration=t --impulse-response=rir.wav " + "--additive-signals='noise1.wav,noise2.wav' --snrs='snr1,snr2' " + "--start-times='s1,s2' input.wav output.wav\n"; ParseOptions po(usage); - std::string noise_file; - BaseFloat snr_db = 20; + std::string rir_file; + std::string additive_signals; + std::string snrs; + std::string start_times; bool multi_channel_output = false; int32 input_channel = 0; int32 rir_channel = 0; int32 noise_channel = 0; bool normalize_output = true; BaseFloat volume = 0; + BaseFloat duration = 0; po.Register("multi-channel-output", &multi_channel_output, "Specifies if the output should be multi-channel or not"); @@ -133,14 +165,29 @@ int main(int argc, char *argv[]) { po.Register("noise-channel", &noise_channel, "Specifies the channel of the noise file, " "it will only be used when multi-channel-output is false"); - po.Register("noise-file", &noise_file, - "File with additive noise"); - po.Register("snr-db", &snr_db, - "Desired SNR(dB) of the output"); + po.Register("impulse-response", &rir_file, + "File with the impulse response for reverberating the input wave"); + po.Register("additive-signals", &additive_signals, + "A comma separated list of additive signals"); + po.Register("snrs", &snrs, + "A comma separated list of SNRs. The additive signals will be " + "scaled according to these SNRs."); + po.Register("start-times", &start_times, + "A comma separated list of start times referring to the " + "input signal. The additive signals will be added to the " + "input signal starting at the offset. 
If the start time " + "exceed the length of the input signal, the addition will " + "be ignored."); po.Register("normalize-output", &normalize_output, "If true, then after reverberating and " "possibly adding noise, scale so that the signal " "energy is the same as the original input signal."); + po.Register("duration", &duration, + "If nonzero, it specified the duration (secs) of the output " + "signal. If the duration t is less than the length of the " + "input signal, the first t secs of the signal is trimed, " + "otherwise, the signal will be repeated to" + "fulfill the duration specified."); po.Register("volume", &volume, "If nonzero, a scaling factor on the signal that is applied " "after reverberating and possibly adding noise. " @@ -148,7 +195,7 @@ int main(int argc, char *argv[]) { "if you had also specified --normalize-output=false."); po.Read(argc, argv); - if (po.NumArgs() != 3) { + if (po.NumArgs() != 2) { po.PrintUsage(); exit(1); } @@ -160,13 +207,14 @@ int main(int argc, char *argv[]) { } std::string input_wave_file = po.GetArg(1); - std::string rir_file = po.GetArg(2); - std::string output_wave_file = po.GetArg(3); + std::string output_wave_file = po.GetArg(2); WaveData input_wave; { + WaveHolder waveholder; Input ki(input_wave_file); - input_wave.Read(ki.Stream()); + waveholder.Read(ki.Stream()); + input_wave = waveholder.Value(); } const Matrix &input_matrix = input_wave.Data(); @@ -178,45 +226,70 @@ int main(int argc, char *argv[]) { << " #channel: " << num_input_channel; KALDI_ASSERT(input_channel < num_input_channel); - WaveData rir_wave; - { - Input ki(rir_file); - rir_wave.Read(ki.Stream()); - } - const Matrix &rir_matrix = rir_wave.Data(); - BaseFloat samp_freq_rir = rir_wave.SampFreq(); - int32 num_samp_rir = rir_matrix.NumCols(), - num_rir_channel = rir_matrix.NumRows(); - KALDI_VLOG(1) << "sampling frequency of rir: " << samp_freq_rir - << " #samples: " << num_samp_rir - << " #channel: " << num_rir_channel; - if (!multi_channel_output) { 
- KALDI_ASSERT(rir_channel < num_rir_channel); - } - - Matrix noise_matrix; - if (!noise_file.empty()) { - WaveData noise_wave; + Matrix rir_matrix; + BaseFloat samp_freq_rir = samp_freq_input; + int32 num_samp_rir = 1, + num_rir_channel = 1; + if (!rir_file.empty()) { + WaveData rir_wave; { - Input ki(noise_file); - noise_wave.Read(ki.Stream()); + WaveHolder waveholder; + Input ki(rir_file); + waveholder.Read(ki.Stream()); + rir_wave = waveholder.Value(); } - noise_matrix = noise_wave.Data(); - BaseFloat samp_freq_noise = noise_wave.SampFreq(); - int32 num_samp_noise = noise_matrix.NumCols(), - num_noise_channel = noise_matrix.NumRows(); - KALDI_VLOG(1) << "sampling frequency of noise: " << samp_freq_noise - << " #samples: " << num_samp_noise - << " #channel: " << num_noise_channel; - if (multi_channel_output) { - KALDI_ASSERT(num_rir_channel == num_noise_channel); - } else { - KALDI_ASSERT(noise_channel < num_noise_channel); + rir_matrix = rir_wave.Data(); + samp_freq_rir = rir_wave.SampFreq(); + num_samp_rir = rir_matrix.NumCols(); + num_rir_channel = rir_matrix.NumRows(); + KALDI_VLOG(1) << "sampling frequency of rir: " << samp_freq_rir + << " #samples: " << num_samp_rir + << " #channel: " << num_rir_channel; + if (!multi_channel_output) { + KALDI_ASSERT(rir_channel < num_rir_channel); + } + } + + std::vector > additive_signal_matrices; + if (!additive_signals.empty()) { + std::vector split_string; + SplitStringToVector(additive_signals, ",", true, &split_string); + for (size_t i = 0; i < split_string.size(); i++) { + WaveHolder waveholder; + Input ki(split_string[i]); + waveholder.Read(ki.Stream()); + WaveData additive_signal_wave = waveholder.Value(); + Matrix additive_signal_matrix = additive_signal_wave.Data(); + BaseFloat samp_freq = additive_signal_wave.SampFreq(); + KALDI_ASSERT(samp_freq == samp_freq_input); + int32 num_samp = additive_signal_matrix.NumCols(), + num_channel = additive_signal_matrix.NumRows(); + KALDI_VLOG(1) << "sampling frequency of 
additive signal: " << samp_freq + << " #samples: " << num_samp + << " #channel: " << num_channel; + if (multi_channel_output) { + KALDI_ASSERT(num_rir_channel == num_channel); + } else { + KALDI_ASSERT(noise_channel < num_channel); + } + + additive_signal_matrices.push_back(additive_signal_matrix); } } + std::vector snr_vector; + if (!snrs.empty()) { + ReadCommaSeparatedCommand(snrs, &snr_vector); + } + + std::vector start_time_vector; + if (!start_times.empty()) { + ReadCommaSeparatedCommand(start_times, &start_time_vector); + } + int32 num_output_channels = (multi_channel_output ? num_rir_channel : 1); - Matrix out_matrix(num_output_channels, num_samp_input); + int32 num_samp_output = (duration > 0 ? samp_freq_input * duration : num_samp_input); + Matrix out_matrix(num_output_channels, num_samp_output); for (int32 output_channel = 0; output_channel < num_output_channels; output_channel++) { Vector input(num_samp_input); @@ -224,18 +297,26 @@ int main(int argc, char *argv[]) { float power_before_reverb = VecVec(input, input) / input.Dim(); int32 this_rir_channel = (multi_channel_output ? output_channel : rir_channel); - Vector rir(num_samp_rir); - rir.CopyRowFromMat(rir_matrix, this_rir_channel); - rir.Scale(1.0 / (1 << 15)); - Vector noise(0); - if (!noise_file.empty()) { - noise.Resize(noise_matrix.NumCols()); - int32 this_noise_channel = (multi_channel_output ? output_channel : noise_channel); - noise.CopyRowFromMat(noise_matrix, this_noise_channel); + float early_energy = power_before_reverb; + if (!rir_file.empty()) { + Vector rir; + rir.Resize(num_samp_rir); + rir.CopyRowFromMat(rir_matrix, this_rir_channel); + rir.Scale(1.0 / (1 << 15)); + early_energy = DoReverberation(rir, samp_freq_rir, &input); } - DoReverberation(rir, samp_freq_rir, snr_db, &noise, &input); + if (additive_signal_matrices.size() > 0) { + Vector noise(0); + int32 this_noise_channel = (multi_channel_output ? 
output_channel : noise_channel); + for (int32 i = 0; i < additive_signal_matrices.size(); i++) { + noise.Resize(additive_signal_matrices[i].NumCols()); + noise.CopyRowFromMat(additive_signal_matrices[i], this_noise_channel); + AddNoise(&noise, snr_vector[i], start_time_vector[i], + samp_freq_input, early_energy, &input); + } + } float power_after_reverb = VecVec(input, input) / input.Dim(); @@ -244,7 +325,16 @@ int main(int argc, char *argv[]) { else if (normalize_output) input.Scale(sqrt(power_before_reverb / power_after_reverb)); - out_matrix.CopyRowFromVec(input, output_channel); + if (num_samp_output <= num_samp_input) { + // trim the signal from the start + out_matrix.CopyRowFromVec(input.Range(0, num_samp_output), output_channel); + } else { + // repeat the signal to fill up the duration + Vector extended_input(num_samp_output); + extended_input.SetZero(); + AddVectorsOfUnequalLength(input, &extended_input); + out_matrix.CopyRowFromVec(extended_input, output_channel); + } } WaveData out_wave(samp_freq_input, out_matrix); From 1068ec452fa77773284c30b03323329743c7a977 Mon Sep 17 00:00:00 2001 From: Tom Ko Date: Mon, 11 Jul 2016 03:51:59 -0400 Subject: [PATCH 05/14] Change the functions in signal.cc to extend the length of the convolved signal, the correct length should be original signal length + rir length - 1; add the shift option to wav-reverberate.cc --- egs/wsj/s5/steps/data/reverberate_data_dir.py | 26 ++++++------- src/feat/signal.cc | 38 +++++++++++-------- src/feat/signal.h | 7 ++++ src/featbin/wav-reverberate.cc | 31 ++++++++++++--- 4 files changed, 67 insertions(+), 35 deletions(-) diff --git a/egs/wsj/s5/steps/data/reverberate_data_dir.py b/egs/wsj/s5/steps/data/reverberate_data_dir.py index 228eaebfba0..e2f05b25aa1 100755 --- a/egs/wsj/s5/steps/data/reverberate_data_dir.py +++ b/egs/wsj/s5/steps/data/reverberate_data_dir.py @@ -41,7 +41,7 @@ def GetArgs(): "--bg-fg-type " "--rir-file " "< location=(support Kaldi IO strings) >") - 
parser.add_argument("--num-replications", type=int, dest = "num_replica", default = 1, + parser.add_argument("--num-replications", type=int, dest = "num_replicas", default = 1, help="Number of replicate to generated for the data") parser.add_argument('--foreground-snrs', type=str, dest = "foreground_snr_string", default = '20:10:0', help='snrs for foreground noises') parser.add_argument('--background-snrs', type=str, dest = "background_snr_string", default = '20:10:0', help='snrs for background noises') @@ -77,7 +77,7 @@ def CheckArgs(args): if not os.path.isfile(args.noise_list_file): raise Exception(args.noise_list_file + "not found") - if args.num_replica > 1 and args.prefix is None: + if args.num_replicas > 1 and args.prefix is None: args.prefix = "rvb" warnings.warn("--prefix is set to 'rvb' as --num-replications is larger than 1.") @@ -135,11 +135,11 @@ def ParseFileToDict(file, assert2fields = False, value_processor = None): # The generic command of wav-reverberate will be like: # wav-reverberate --duration=t --impulse-response=rir.wav # --additive-signals='noise1.wav,noise2.wav' --snrs='snr1,snr2' --start-times='s1,s2' input.wav output.wav -def CorruptWav(wav_scp, durations, output_dir, room_dict, noise_list, foreground_snr_array, background_snr_array, num_replica, prefix, speech_rvb_probability, noise_adding_probability, max_noises_added): +def CorruptWav(wav_scp, durations, output_dir, room_dict, noise_list, foreground_snr_array, background_snr_array, num_replicas, prefix, speech_rvb_probability, noise_adding_probability, max_noises_added): foreground_snrs = list_cyclic_iterator(foreground_snr_array) background_snrs = list_cyclic_iterator(background_snr_array) command_list = [] - for i in range(num_replica): + for i in range(num_replicas): keys = wav_scp.keys() keys.sort() for wav_id in keys: @@ -204,10 +204,10 @@ def CorruptWav(wav_scp, durations, output_dir, room_dict, noise_list, foreground # This function replicate the entries in files like segments, 
utt2spk, text -def AddPrefixToFields(input_file, output_file, num_replica, prefix, field = [0]): +def AddPrefixToFields(input_file, output_file, num_replicas, prefix, field = [0]): list = map(lambda x: x.strip(), open(input_file)) f = open(output_file, "w") - for i in range(num_replica): + for i in range(num_replicas): for line in list: if len(line) > 0 and line[0] != ';': split1 = line.split() @@ -220,7 +220,7 @@ def AddPrefixToFields(input_file, output_file, num_replica, prefix, field = [0]) f.close() -def CreateReverberatedCopy(input_dir, output_dir, room_dict, noise_list, foreground_snr_string, background_snr_string, num_replica, prefix, speech_rvb_probability, noise_adding_probability, max_noises_added): +def CreateReverberatedCopy(input_dir, output_dir, room_dict, noise_list, foreground_snr_string, background_snr_string, num_replicas, prefix, speech_rvb_probability, noise_adding_probability, max_noises_added): if not os.path.isfile(input_dir + "/reco2dur"): print("Getting the duration of the recordings..."); @@ -230,18 +230,18 @@ def CreateReverberatedCopy(input_dir, output_dir, room_dict, noise_list, foregro foreground_snr_array = map(lambda x: float(x), foreground_snr_string.split(':')) background_snr_array = map(lambda x: float(x), background_snr_string.split(':')) - CorruptWav(wav_scp, durations, output_dir, room_dict, noise_list, foreground_snr_array, background_snr_array, num_replica, prefix, speech_rvb_probability, noise_adding_probability, max_noises_added) + CorruptWav(wav_scp, durations, output_dir, room_dict, noise_list, foreground_snr_array, background_snr_array, num_replicas, prefix, speech_rvb_probability, noise_adding_probability, max_noises_added) - AddPrefixToFields(input_dir + "/utt2spk", output_dir + "/utt2spk", num_replica, prefix, field = [0,1]) + AddPrefixToFields(input_dir + "/utt2spk", output_dir + "/utt2spk", num_replicas, prefix, field = [0,1]) train_lib.RunKaldiCommand("utils/utt2spk_to_spk2utt.pl <{output_dir}/utt2spk 
>{output_dir}/spk2utt" .format(output_dir = output_dir)) if os.path.isfile(input_dir + "/text"): - AddPrefixToFields(input_dir + "/text", output_dir + "/text", num_replica, prefix, field =[0]) + AddPrefixToFields(input_dir + "/text", output_dir + "/text", num_replicas, prefix, field =[0]) if os.path.isfile(input_dir + "/segments"): - AddPrefixToFields(input_dir + "/segments", output_dir + "/segments", num_replica, prefix, field = [0,1]) + AddPrefixToFields(input_dir + "/segments", output_dir + "/segments", num_replicas, prefix, field = [0,1]) if os.path.isfile(input_dir + "/reco2file_and_channel"): - AddPrefixToFields(input_dir + "/reco2file_and_channel", output_dir + "/reco2file_and_channel", num_replica, prefix, field = [0,1]) + AddPrefixToFields(input_dir + "/reco2file_and_channel", output_dir + "/reco2file_and_channel", num_replicas, prefix, field = [0,1]) train_lib.RunKaldiCommand("utils/validate_data_dir.sh --no-feats {output_dir}" .format(output_dir = output_dir)) @@ -352,7 +352,7 @@ def Main(): noise_list = noise_list, foreground_snr_string = args.foreground_snr_string, background_snr_string = args.background_snr_string, - num_replica = args.num_replica, + num_replicas = args.num_replicas, prefix = args.prefix, speech_rvb_probability = args.speech_rvb_probability, noise_adding_probability = args.noise_adding_probability, diff --git a/src/feat/signal.cc b/src/feat/signal.cc index e8fbb0b84cf..12a9a710092 100644 --- a/src/feat/signal.cc +++ b/src/feat/signal.cc @@ -34,31 +34,34 @@ void ElementwiseProductOfFft(const Vector &a, Vector *b) { void ConvolveSignals(const Vector &filter, Vector *signal) { int32 signal_length = signal->Dim(); int32 filter_length = filter.Dim(); - Vector signal_padded(signal_length + filter_length - 1); + int32 output_length = signal_length + filter_length - 1; + Vector signal_padded(output_length); signal_padded.SetZero(); for (int32 i = 0; i < signal_length; i++) { for (int32 j = 0; j < filter_length; j++) { signal_padded(i + j) += 
(*signal)(i) * filter(j); } } - signal->CopyFromVec(signal_padded.Range(0, signal_length)); + signal->Resize(output_length); + signal->CopyFromVec(signal_padded); } void FFTbasedConvolveSignals(const Vector &filter, Vector *signal) { int32 signal_length = signal->Dim(); int32 filter_length = filter.Dim(); + int32 output_length = signal_length + filter_length - 1; - int32 fft_length = RoundUpToNearestPowerOfTwo(signal_length + filter_length - 1); + int32 fft_length = RoundUpToNearestPowerOfTwo(output_length); KALDI_VLOG(1) << "fft_length for full signal convolution is " << fft_length; SplitRadixRealFft srfft(fft_length); - Vector filter_padded(fft_length); + Vector filter_padded(fft_length); filter_padded.Range(0, filter_length).CopyFromVec(filter); srfft.Compute(filter_padded.Data(), true); - Vector signal_padded(fft_length); + Vector signal_padded(fft_length); signal_padded.Range(0, signal_length).CopyFromVec(*signal); srfft.Compute(signal_padded.Data(), true); @@ -67,12 +70,15 @@ void FFTbasedConvolveSignals(const Vector &filter, Vector srfft.Compute(signal_padded.Data(), false); signal_padded.Scale(1.0 / fft_length); - signal->CopyFromVec(signal_padded.Range(0, signal_length)); + signal->Resize(output_length); + signal->CopyFromVec(signal_padded.Range(0, output_length)); } void FFTbasedBlockConvolveSignals(const Vector &filter, Vector *signal) { int32 signal_length = signal->Dim(); int32 filter_length = filter.Dim(); + int32 output_length = signal_length + filter_length - 1; + signal->Resize(output_length, kCopyData); KALDI_VLOG(1) << "Length of the filter is " << filter_length; @@ -83,17 +89,17 @@ void FFTbasedBlockConvolveSignals(const Vector &filter, Vector srfft(fft_length); - Vector filter_padded(fft_length); + Vector filter_padded(fft_length); filter_padded.Range(0, filter_length).CopyFromVec(filter); srfft.Compute(filter_padded.Data(), true); - Vector temp_pad(filter_length - 1); + Vector temp_pad(filter_length - 1); temp_pad.SetZero(); - Vector 
signal_block_padded(fft_length); + Vector signal_block_padded(fft_length); - for (int32 po = 0; po < signal_length; po += block_length) { + for (int32 po = 0; po < output_length; po += block_length) { // get a block of the signal - int32 process_length = std::min(block_length, signal_length - po); + int32 process_length = std::min(block_length, output_length - po); signal_block_padded.SetZero(); signal_block_padded.Range(0, process_length).CopyFromVec(signal->Range(po, process_length)); @@ -105,17 +111,17 @@ void FFTbasedBlockConvolveSignals(const Vector &filter, VectorRange(po, block_length).CopyFromVec(signal_block_padded.Range(0, block_length)); signal->Range(po, filter_length - 1).AddVec(1.0, temp_pad); temp_pad.CopyFromVec(signal_block_padded.Range(block_length, filter_length - 1)); } else { - signal->Range(po, signal_length - po).CopyFromVec( - signal_block_padded.Range(0, signal_length - po)); - if (filter_length - 1 < signal_length - po) + signal->Range(po, output_length - po).CopyFromVec( + signal_block_padded.Range(0, output_length - po)); + if (filter_length - 1 < output_length - po) signal->Range(po, filter_length - 1).AddVec(1.0, temp_pad); else - signal->Range(po, signal_length - po).AddVec(1.0, temp_pad.Range(0, signal_length - po)); + signal->Range(po, output_length - po).AddVec(1.0, temp_pad.Range(0, output_length - po)); } } } diff --git a/src/feat/signal.h b/src/feat/signal.h index 7ff0ce33b52..c6c3eb50530 100644 --- a/src/feat/signal.h +++ b/src/feat/signal.h @@ -25,6 +25,13 @@ namespace kaldi { +/* + The following three functions have the same functionality but + different implementations, and therefore different efficiency. After the convolution, + the length of the signal will be extended to (original signal length + + filter length - 1). +*/ + /* This function implements a simple non-FFT-based convolution of two signals. 
It is suggested to use the FFT-based convolution function which is more diff --git a/src/featbin/wav-reverberate.cc b/src/featbin/wav-reverberate.cc index 56e2a0eb4f6..80b08307172 100644 --- a/src/featbin/wav-reverberate.cc +++ b/src/featbin/wav-reverberate.cc @@ -28,7 +28,7 @@ namespace kaldi { This function is to repeatedly concatenate signal1 by itself to match the length of signal2 and add the two signals together. */ -void AddVectorsOfUnequalLength(const Vector &signal1, +void AddVectorsOfUnequalLength(const VectorBase &signal1, Vector *signal2) { for (int32 po = 0; po < signal2->Dim(); po += signal1.Dim()) { int32 block_length = signal1.Dim(); @@ -87,6 +87,8 @@ BaseFloat ComputeEarlyReverbEnergy(const Vector &rir, const Vector &rir, BaseFloat samp_freq, Vector *signal) { @@ -147,6 +149,7 @@ int main(int argc, char *argv[]) { std::string snrs; std::string start_times; bool multi_channel_output = false; + bool shift_output = true; int32 input_channel = 0; int32 rir_channel = 0; int32 noise_channel = 0; @@ -156,6 +159,14 @@ int main(int argc, char *argv[]) { po.Register("multi-channel-output", &multi_channel_output, "Specifies if the output should be multi-channel or not"); + po.Register("shift-output", &shift_output, + "If true, the reverberated waveform will be shifted by the " + "amount of the peak position of the RIR and the length of " + "the output waveform will be equal to the input waveform." + "If false, the length of the output waveform will be " + "equal to (original input length + rir length - 1). 
" + "This value is default true and " + "it only affects the output when RIR file is provided."); po.Register("input-wave-channel", &input_channel, "Specifies the channel to be used from input as only a " "single channel will be used to generate reverberated output"); @@ -228,8 +239,8 @@ int main(int argc, char *argv[]) { Matrix rir_matrix; BaseFloat samp_freq_rir = samp_freq_input; - int32 num_samp_rir = 1, - num_rir_channel = 1; + int32 num_samp_rir = 0, + num_rir_channel = 0; if (!rir_file.empty()) { WaveData rir_wave; { @@ -287,8 +298,11 @@ int main(int argc, char *argv[]) { ReadCommaSeparatedCommand(start_times, &start_time_vector); } + int32 shift_index = 0; int32 num_output_channels = (multi_channel_output ? num_rir_channel : 1); - int32 num_samp_output = (duration > 0 ? samp_freq_input * duration : num_samp_input); + int32 num_samp_output = (duration > 0 ? samp_freq_input * duration : + (shift_output ? num_samp_input : + num_samp_input + num_samp_rir - 1)); Matrix out_matrix(num_output_channels, num_samp_output); for (int32 output_channel = 0; output_channel < num_output_channels; output_channel++) { @@ -305,6 +319,11 @@ int main(int argc, char *argv[]) { rir.CopyRowFromMat(rir_matrix, this_rir_channel); rir.Scale(1.0 / (1 << 15)); early_energy = DoReverberation(rir, samp_freq_rir, &input); + if (shift_output) { + // find the position of the peak of the impulse response + // and shift the output waveform by this amount + rir.Max(&shift_index); + } } if (additive_signal_matrices.size() > 0) { @@ -327,12 +346,12 @@ int main(int argc, char *argv[]) { if (num_samp_output <= num_samp_input) { // trim the signal from the start - out_matrix.CopyRowFromVec(input.Range(0, num_samp_output), output_channel); + out_matrix.CopyRowFromVec(input.Range(shift_index, num_samp_output), output_channel); } else { // repeat the signal to fill up the duration Vector extended_input(num_samp_output); extended_input.SetZero(); - AddVectorsOfUnequalLength(input, &extended_input); + 
AddVectorsOfUnequalLength(input.Range(shift_index, num_samp_input), &extended_input); out_matrix.CopyRowFromVec(extended_input, output_channel); } } From fdb576dff7d1940f333b6ee05f92a2d98319669f Mon Sep 17 00:00:00 2001 From: Tom Ko Date: Wed, 13 Jul 2016 01:20:45 -0400 Subject: [PATCH 06/14] Adding more comments and remove duplicate function in reverberate_data_dir.py --- egs/wsj/s5/steps/data/reverberate_data_dir.py | 62 ++++++++----------- 1 file changed, 25 insertions(+), 37 deletions(-) diff --git a/egs/wsj/s5/steps/data/reverberate_data_dir.py b/egs/wsj/s5/steps/data/reverberate_data_dir.py index e2f05b25aa1..6f4418e7aca 100755 --- a/egs/wsj/s5/steps/data/reverberate_data_dir.py +++ b/egs/wsj/s5/steps/data/reverberate_data_dir.py @@ -25,9 +25,11 @@ def GetArgs(): # we add compulsary arguments as named arguments for readability parser = argparse.ArgumentParser(description="Reverberate the data directory with an option " "to add isotropic and point source noiseis. " - "This script only deals with single channel wave files. " - "If multi-channel noise/rir/speech files are provided one " - "of the channels will be randomly picked", + "Usage: reverberate_data_dir.py [options...] " + "E.g. 
reverberate_data_dir.py --rir-list-file rir_list " + "--foreground-snrs 20:10:15:5:0 --background-snrs 20:10:15:5:0 " + "--noise-list-file noise_list --speech-rvb-probability 1 --num-replications 2 " + "--random-seed 1 data/train data/train_rvb", formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument("--rir-list-file", type=str, required = True, @@ -84,37 +86,21 @@ def CheckArgs(args): return args -def PickItemFromDict(dict): - total_p = sum(dict[key].probability for key in dict.keys()) +# This function pick the item according to the associated probability +# The input could be either a dictinoary of a list +def PickItemWithProbability(x): + if isinstance(x, dict): + plist = list(set(x.values())) + else: + plist = x + total_p = sum(item.probability for item in plist) p = random.uniform(0, total_p) - upto = 0 - for key in dict.keys(): - if upto + dict[key].probability >= p: - return dict[key] - upto += dict[key].probability - assert False, "Shouldn't get here" - - -def PickItemFromList(list): - total_p = sum(item.probability for item in list) - p = random.uniform(0, total_p) - upto = 0 - for item in list: - if upto + item.probability >= p: + accumulate_p = 0 + for item in plist: + if accumulate_p + item.probability >= p: return item - upto += item.probability - assert False, "Shouldn't get here" - - -def weighted_choice(choices): - total = sum(w for c, w in choices) - r = random.uniform(0, total) - upto = 0 - for c, w in choices: - if upto + w >= r: - return c - upto += w - assert False, "Shouldn't get here" + accumulate_p += item.probability + assert False, "Shouldn't get here as the accumulated probability should always equal to 1" def ParseFileToDict(file, assert2fields = False, value_processor = None): @@ -152,18 +138,18 @@ def CorruptWav(wav_scp, durations, output_dir, room_dict, noise_list, foreground wav_id = prefix + str(i) + "_" + wav_id # pick the room - room = PickItemFromDict(room_dict) + room = PickItemWithProbability(room_dict) 
command_opts = "" noises_added = [] snrs_added = [] start_times_added = [] if random.random() < speech_rvb_probability: # pick the RIR to reverberate the speech - speech_rir = PickItemFromList(room.rir_list) + speech_rir = PickItemWithProbability(room.rir_list) command_opts += "--impulse-response={0} ".format(speech_rir.rir_file_location) # add the corresponding isotropic noise if there is any if len(speech_rir.iso_noise_list) > 0: - isotropic_noise = PickItemFromList(speech_rir.iso_noise_list) + isotropic_noise = PickItemWithProbability(speech_rir.iso_noise_list) # extend the isotropic noise to the length of the speech waveform noises_added.append("wav-reverberate --duration={1} {0} - |".format(isotropic_noise.noise_file_location, speech_dur)) snrs_added.append(background_snrs.next()) @@ -173,8 +159,8 @@ def CorruptWav(wav_scp, durations, output_dir, room_dict, noise_list, foreground if len(noise_list) > 0 and random.random() < noise_adding_probability: for k in range(random.randint(1, max_noises_added)): # pick the RIR to reverberate the point-source noise - noise = PickItemFromList(noise_list) - noise_rir = PickItemFromList(room.rir_list) + noise = PickItemWithProbability(noise_list) + noise_rir = PickItemWithProbability(room.rir_list) if noise.bg_fg_type == "background": start_times_added.append(0) noises_added.append("wav-reverberate --duration={2} --impulse-response={1} {0} - |".format(noise.noise_file_location, noise_rir.rir_file_location, speech_dur)) @@ -283,6 +269,8 @@ def ParseRirList(rir_list_file): return SmoothProbability(rir_list) +# This function crate the room dictinoary from the rir list +# The key of the returned dictionary is the room id def MakeRoomDict(rir_list): room_dict = {} for rir in rir_list: From 3802fde4b4da335c1482da6519f3f8d58145200f Mon Sep 17 00:00:00 2001 From: Tom Ko Date: Thu, 14 Jul 2016 03:29:02 -0400 Subject: [PATCH 07/14] Change option --max-noises-added to --max-noises-per-minute --- 
egs/wsj/s5/steps/data/reverberate_data_dir.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/egs/wsj/s5/steps/data/reverberate_data_dir.py b/egs/wsj/s5/steps/data/reverberate_data_dir.py index 6f4418e7aca..3f6f04fb340 100755 --- a/egs/wsj/s5/steps/data/reverberate_data_dir.py +++ b/egs/wsj/s5/steps/data/reverberate_data_dir.py @@ -49,11 +49,11 @@ def GetArgs(): parser.add_argument('--background-snrs', type=str, dest = "background_snr_string", default = '20:10:0', help='snrs for background noises') parser.add_argument('--prefix', type=str, default = None, help='prefix for the id of the corrupted utterances') parser.add_argument("--speech-rvb-probability", type=float, default = 0.8, - help="Probability of reverberating the speech signal, e.g. 0 <= p <= 1") + help="Probability of reverberating a speech signal, e.g. 0 <= p <= 1") parser.add_argument("--noise-adding-probability", type=float, default = 0.4, help="Probability of adding point-source noises, e.g. 
0 <= p <= 1") - parser.add_argument("--max-noises-added", type=int, default = 2, - help="Maximum number of point-source noises could be added") + parser.add_argument("--max-noises-per-minute", type=int, default = 2, + help="This controls the maximum number of point-source noises that could be added to a recording according to its duration") parser.add_argument('--random-seed', type=int, default=0, help='seed to be used in the randomization of impulese and noises') parser.add_argument("input_dir", help="Input data directory") @@ -121,7 +121,7 @@ def ParseFileToDict(file, assert2fields = False, value_processor = None): # The generic command of wav-reverberate will be like: # wav-reverberate --duration=t --impulse-response=rir.wav # --additive-signals='noise1.wav,noise2.wav' --snrs='snr1,snr2' --start-times='s1,s2' input.wav output.wav -def CorruptWav(wav_scp, durations, output_dir, room_dict, noise_list, foreground_snr_array, background_snr_array, num_replicas, prefix, speech_rvb_probability, noise_adding_probability, max_noises_added): +def CorruptWav(wav_scp, durations, output_dir, room_dict, noise_list, foreground_snr_array, background_snr_array, num_replicas, prefix, speech_rvb_probability, noise_adding_probability, max_noises_per_minute): foreground_snrs = list_cyclic_iterator(foreground_snr_array) background_snrs = list_cyclic_iterator(background_snr_array) command_list = [] @@ -134,6 +134,7 @@ def CorruptWav(wav_scp, durations, output_dir, room_dict, noise_list, foreground if len(wav_pipe.split()) == 1: wav_pipe = "cat {0} |".format(wav_pipe) speech_dur = durations[wav_id] + max_noises_recording = math.floor(max_noises_per_minute * speech_dur / 60) if prefix is not None: wav_id = prefix + str(i) + "_" + wav_id @@ -157,7 +158,7 @@ def CorruptWav(wav_scp, durations, output_dir, room_dict, noise_list, foreground # Add the point-source noise if len(noise_list) > 0 and random.random() < noise_adding_probability: - for k in range(random.randint(1, 
max_noises_added)): + for k in range(random.randint(1, max_noises_recording)): # pick the RIR to reverberate the point-source noise noise = PickItemWithProbability(noise_list) noise_rir = PickItemWithProbability(room.rir_list) @@ -206,7 +207,7 @@ def AddPrefixToFields(input_file, output_file, num_replicas, prefix, field = [0] f.close() -def CreateReverberatedCopy(input_dir, output_dir, room_dict, noise_list, foreground_snr_string, background_snr_string, num_replicas, prefix, speech_rvb_probability, noise_adding_probability, max_noises_added): +def CreateReverberatedCopy(input_dir, output_dir, room_dict, noise_list, foreground_snr_string, background_snr_string, num_replicas, prefix, speech_rvb_probability, noise_adding_probability, max_noises_per_minute): if not os.path.isfile(input_dir + "/reco2dur"): print("Getting the duration of the recordings..."); @@ -216,7 +217,7 @@ def CreateReverberatedCopy(input_dir, output_dir, room_dict, noise_list, foregro foreground_snr_array = map(lambda x: float(x), foreground_snr_string.split(':')) background_snr_array = map(lambda x: float(x), background_snr_string.split(':')) - CorruptWav(wav_scp, durations, output_dir, room_dict, noise_list, foreground_snr_array, background_snr_array, num_replicas, prefix, speech_rvb_probability, noise_adding_probability, max_noises_added) + CorruptWav(wav_scp, durations, output_dir, room_dict, noise_list, foreground_snr_array, background_snr_array, num_replicas, prefix, speech_rvb_probability, noise_adding_probability, max_noises_per_minute) AddPrefixToFields(input_dir + "/utt2spk", output_dir + "/utt2spk", num_replicas, prefix, field = [0,1]) train_lib.RunKaldiCommand("utils/utt2spk_to_spk2utt.pl <{output_dir}/utt2spk >{output_dir}/spk2utt" @@ -344,7 +345,7 @@ def Main(): prefix = args.prefix, speech_rvb_probability = args.speech_rvb_probability, noise_adding_probability = args.noise_adding_probability, - max_noises_added = args.max_noises_added) + max_noises_per_minute = 
args.max_noises_per_minute) if __name__ == "__main__": Main() From 970def5d4387fb3f4331b28f8dd1d0d123318c1c Mon Sep 17 00:00:00 2001 From: Tom Ko Date: Fri, 15 Jul 2016 12:12:51 -0400 Subject: [PATCH 08/14] Adding data_lib.py; adding more comments, splitting large function in reverberate_data_dir.py --- egs/wsj/s5/steps/data/data_lib.py | 23 ++ egs/wsj/s5/steps/data/reverberate_data_dir.py | 287 ++++++++++++------ src/featbin/wav-reverberate.cc | 2 +- 3 files changed, 211 insertions(+), 101 deletions(-) create mode 100644 egs/wsj/s5/steps/data/data_lib.py diff --git a/egs/wsj/s5/steps/data/data_lib.py b/egs/wsj/s5/steps/data/data_lib.py new file mode 100644 index 00000000000..52aa83cae81 --- /dev/null +++ b/egs/wsj/s5/steps/data/data_lib.py @@ -0,0 +1,23 @@ +import subprocess +#import logging +#import math +#import re +#import time +#import argparse + +def RunKaldiCommand(command, wait = True): + """ Runs commands frequently seen in Kaldi scripts. These are usually a + sequence of commands connected by pipes, so we use shell=True """ + #logger.info("Running the command\n{0}".format(command)) + p = subprocess.Popen(command, shell = True, + stdout = subprocess.PIPE, + stderr = subprocess.PIPE) + + if wait: + [stdout, stderr] = p.communicate() + if p.returncode is not 0: + raise Exception("There was an error while running the command {0}\n".format(command)+"-"*10+"\n"+stderr) + return stdout, stderr + else: + return p + diff --git a/egs/wsj/s5/steps/data/reverberate_data_dir.py b/egs/wsj/s5/steps/data/reverberate_data_dir.py index 3f6f04fb340..92114b23614 100755 --- a/egs/wsj/s5/steps/data/reverberate_data_dir.py +++ b/egs/wsj/s5/steps/data/reverberate_data_dir.py @@ -7,19 +7,7 @@ from __future__ import print_function import argparse, glob, math, os, random, sys, warnings, copy, imp, ast -train_lib = imp.load_source('ntl', 'steps/nnet3/nnet3_train_lib.py') - -class list_cyclic_iterator: - def __init__(self, list): - self.list_index = 0 - self.list = list - 
random.shuffle(self.list) - - def next(self): - item = self.list[self.list_index] - self.list_index = (self.list_index + 1) % len(self.list) - return item - +data_lib = imp.load_source('ntl', 'steps/data/data_lib.py') def GetArgs(): # we add compulsary arguments as named arguments for readability @@ -50,8 +38,10 @@ def GetArgs(): parser.add_argument('--prefix', type=str, default = None, help='prefix for the id of the corrupted utterances') parser.add_argument("--speech-rvb-probability", type=float, default = 0.8, help="Probability of reverberating a speech signal, e.g. 0 <= p <= 1") - parser.add_argument("--noise-adding-probability", type=float, default = 0.4, + parser.add_argument("--pointsource-noise-addition-probability", type=float, default = 0.4, help="Probability of adding point-source noises, e.g. 0 <= p <= 1") + parser.add_argument("--isotropic-noise-addition-probability", type=float, default = 0.4, + help="Probability of adding isotropic noises, e.g. 0 <= p <= 1") parser.add_argument("--max-noises-per-minute", type=int, default = 2, help="This controls the maximum number of point-source noises that could be added to a recording according to its duration") parser.add_argument('--random-seed', type=int, default=0, help='seed to be used in the randomization of impulese and noises') @@ -86,6 +76,18 @@ def CheckArgs(args): return args +class list_cyclic_iterator: + def __init__(self, list): + self.list_index = 0 + self.list = list + random.shuffle(self.list) + + def next(self): + item = self.list[self.list_index] + self.list_index = (self.list_index + 1) % len(self.list) + return item + + # This function pick the item according to the associated probability # The input could be either a dictinoary of a list def PickItemWithProbability(x): @@ -116,78 +118,144 @@ def ParseFileToDict(file, assert2fields = False, value_processor = None): dict[parts[0]] = value_processor(parts[1:]) return dict +def WriteDictToFile(dict, file_name): + file = open(file_name, 'w') + 
keys = dict.keys() + keys.sort() + for key in keys: + value = dict[key] + if type(value) in [list, tuple] : + if type(value) is tuple: + value = list(value) + value.sort() + value = ' '.join(value) + file.write('{0}\t{1}\n'.format(key, value)) + file.close() + + +# This function returns only the isotropic noises according to the specified RIR id +def FilterIsotropicNoiseList(iso_noise_list, rir_id): + filtered_list = [] + for noise in iso_noise_list: + if noise.rir_id == rir_id: + filtered_list.append(noise) + + return filtered_list + +def GenerateReverberationOpts(room_dict, # the room dictionary, please refer to MakeRoomDict() for the format + point_noise_list, # the point source noise list + iso_noise_list, # the isotropic noise list + foreground_snrs, # the SNR for adding the foreground noises + background_snrs, # the SNR for adding the background noises + speech_rvb_probability, # Probability of reverberating a speech signal + isotropic_noise_addition_probability, # Probability of adding isotropic noises + pointsource_noise_addition_probability, # Probability of adding point-source noises + speech_dur, # duration of the recording + max_noises_recording # Maximum number of point-source noises that can be added + ): + reverberate_opts = "" + noises_added = [] + snrs_added = [] + start_times_added = [] + # Randomly select the room + room = PickItemWithProbability(room_dict) + # Randomly select the RIR in the room + speech_rir = PickItemWithProbability(room.rir_list) + if random.random() < speech_rvb_probability: + # pick the RIR to reverberate the speech + reverberate_opts += "--impulse-response={0} ".format(speech_rir.rir_file_location) + + rir_iso_noise_list = FilterIsotropicNoiseList(iso_noise_list, speech_rir.rir_id) + # Add the corresponding isotropic noise associated with the selected RIR + if len(rir_iso_noise_list) > 0 and random.random() < isotropic_noise_addition_probability: + isotropic_noise = PickItemWithProbability(rir_iso_noise_list) + # extend the 
isotropic noise to the length of the speech waveform + noises_added.append("wav-reverberate --duration={1} {0} - |".format(isotropic_noise.noise_file_location, speech_dur)) + snrs_added.append(background_snrs.next()) + start_times_added.append(0) + + # Add the point-source noise + if len(point_noise_list) > 0 and random.random() < pointsource_noise_addition_probability: + for k in range(random.randint(1, max_noises_recording)): + # pick the RIR to reverberate the point-source noise + noise = PickItemWithProbability(point_noise_list) + noise_rir = PickItemWithProbability(room.rir_list) + if noise.bg_fg_type == "background": + start_times_added.append(0) + noises_added.append("wav-reverberate --duration={2} --impulse-response={1} {0} - |".format(noise.noise_file_location, noise_rir.rir_file_location, speech_dur)) + snrs_added.append(background_snrs.next()) + else: + start_times_added.append(round(random.random() * speech_dur, 2)) + noises_added.append("wav-reverberate --impulse-response={1} {0} - |".format(noise.noise_file_location, noise_rir.rir_file_location)) + snrs_added.append(foreground_snrs.next()) + + assert len(noises_added) == len(snrs_added) + assert len(noises_added) == len(start_times_added) + + if len(noises_added) > 0: + reverberate_opts += "--additive-signals='{0}' ".format(','.join(noises_added)) + reverberate_opts += "--snrs='{0}' ".format(','.join(map(lambda x:str(x),snrs_added))) + reverberate_opts += "--start-times='{0}' ".format(','.join(map(lambda x:str(x),start_times_added))) -# This is the major function to generate pipeline command for the corruption + return reverberate_opts + +# This is the main function to generate pipeline command for the corruption # The generic command of wav-reverberate will be like: # wav-reverberate --duration=t --impulse-response=rir.wav # --additive-signals='noise1.wav,noise2.wav' --snrs='snr1,snr2' --start-times='s1,s2' input.wav output.wav -def CorruptWav(wav_scp, durations, output_dir, room_dict, noise_list, 
foreground_snr_array, background_snr_array, num_replicas, prefix, speech_rvb_probability, noise_adding_probability, max_noises_per_minute): +def CorruptWav(wav_scp, # the dictionary of which elements are the IO of the speech recordings + durations, # the dictionary of which elements are the duration (in sec) of the speech recordings + output_dir, # output directory to write the corrupted wav.scp + room_dict, # the room dictionary, please refer to MakeRoomDict() for the format + point_noise_list, # the point source noise list + iso_noise_list, # the isotropic noise list + foreground_snr_array, # the SNR for adding the foreground noises + background_snr_array, # the SNR for adding the background noises + num_replicas, # Number of replicate to generated for the data + prefix, # prefix for the id of the corrupted utterances + speech_rvb_probability, # Probability of reverberating a speech signal + isotropic_noise_addition_probability, # Probability of adding isotropic noises + pointsource_noise_addition_probability, # Probability of adding point-source noises + max_noises_per_minute # maximum number of point-source noises that can be added to a recording according to its duration + ): foreground_snrs = list_cyclic_iterator(foreground_snr_array) background_snrs = list_cyclic_iterator(background_snr_array) - command_list = [] + corrupted_wav_scp = {} for i in range(num_replicas): keys = wav_scp.keys() keys.sort() for wav_id in keys: - wav_pipe = wav_scp[wav_id] + wav_original_pipe = wav_scp[wav_id] # check if it is really a pipe - if len(wav_pipe.split()) == 1: - wav_pipe = "cat {0} |".format(wav_pipe) + if len(wav_original_pipe.split()) == 1: + wav_original_pipe = "cat {0} |".format(wav_original_pipe) speech_dur = durations[wav_id] max_noises_recording = math.floor(max_noises_per_minute * speech_dur / 60) if prefix is not None: - wav_id = prefix + str(i) + "_" + wav_id - - # pick the room - room = PickItemWithProbability(room_dict) - command_opts = "" - noises_added = 
[] - snrs_added = [] - start_times_added = [] - if random.random() < speech_rvb_probability: - # pick the RIR to reverberate the speech - speech_rir = PickItemWithProbability(room.rir_list) - command_opts += "--impulse-response={0} ".format(speech_rir.rir_file_location) - # add the corresponding isotropic noise if there is any - if len(speech_rir.iso_noise_list) > 0: - isotropic_noise = PickItemWithProbability(speech_rir.iso_noise_list) - # extend the isotropic noise to the length of the speech waveform - noises_added.append("wav-reverberate --duration={1} {0} - |".format(isotropic_noise.noise_file_location, speech_dur)) - snrs_added.append(background_snrs.next()) - start_times_added.append(0) - - # Add the point-source noise - if len(noise_list) > 0 and random.random() < noise_adding_probability: - for k in range(random.randint(1, max_noises_recording)): - # pick the RIR to reverberate the point-source noise - noise = PickItemWithProbability(noise_list) - noise_rir = PickItemWithProbability(room.rir_list) - if noise.bg_fg_type == "background": - start_times_added.append(0) - noises_added.append("wav-reverberate --duration={2} --impulse-response={1} {0} - |".format(noise.noise_file_location, noise_rir.rir_file_location, speech_dur)) - snrs_added.append(background_snrs.next()) - else: - start_times_added.append(round(random.random() * speech_dur, 2)) - noises_added.append("wav-reverberate --impulse-response={1} {0} - |".format(noise.noise_file_location, noise_rir.rir_file_location)) - snrs_added.append(foreground_snrs.next()) - - if len(noises_added) > 0: - command_opts += "--additive-signals='{0}' ".format(','.join(noises_added)) - if len(snrs_added) > 0: - command_opts += "--snrs='{0}' ".format(','.join(map(lambda x:str(x),snrs_added))) - if len(start_times_added) > 0: - command_opts += "--start-times='{0}' ".format(','.join(map(lambda x:str(x),start_times_added))) + new_wav_id = prefix + str(i) + "_" + wav_id + else: + new_wav_id = wav_id + + reverberate_opts = 
GenerateReverberationOpts(room_dict, # the room dictionary, please refer to MakeRoomDict() for the format + point_noise_list, # the point source noise list + iso_noise_list, # the isotropic noise list + foreground_snrs, # the SNR for adding the foreground noises + background_snrs, # the SNR for adding the background noises + speech_rvb_probability, # Probability of reverberating a speech signal + isotropic_noise_addition_probability, # Probability of adding isotropic noises + pointsource_noise_addition_probability, # Probability of adding point-source noises + speech_dur, # duration of the recording + max_noises_recording # Maximum number of point-source noises that can be added + ) - if command_opts == "": - command = "{0} {1}\n".format(wav_id, wav_pipe) + if reverberate_opts == "": + wav_corrupted_pipe = "{0}".format(wav_original_pipe) else: - command = "{0} {1} wav-reverberate {2} - - |\n".format(wav_id, wav_pipe, command_opts) + wav_corrupted_pipe = "{0} wav-reverberate {1} - - |".format(wav_original_pipe, reverberate_opts) - command_list.append(command) + corrupted_wav_scp[new_wav_id] = wav_corrupted_pipe - file_handle = open(output_dir + "/wav.scp", 'w') - file_handle.write("".join(command_list)) - file_handle.close() + WriteDictToFile(corrupted_wav_scp, output_dir + "/wav.scp") # This function replicate the entries in files like segments, utt2spk, text @@ -207,20 +275,37 @@ def AddPrefixToFields(input_file, output_file, num_replicas, prefix, field = [0] f.close() -def CreateReverberatedCopy(input_dir, output_dir, room_dict, noise_list, foreground_snr_string, background_snr_string, num_replicas, prefix, speech_rvb_probability, noise_adding_probability, max_noises_per_minute): +# This function creates multiple copies of the necessary files, e.g. utt2spk, wav.scp ... 
+def CreateReverberatedCopy(input_dir, + output_dir, + room_dict, # the room dictionary, please refer to MakeRoomDict() for the format + point_noise_list, # the point source noise list + iso_noise_list, # the isotropic noise list + foreground_snr_string, # the SNR for adding the foreground noises + background_snr_string, # the SNR for adding the background noises + num_replicas, # Number of replicate to generated for the data + prefix, # prefix for the id of the corrupted utterances + speech_rvb_probability, # Probability of reverberating a speech signal + isotropic_noise_addition_probability, # Probability of adding isotropic noises + pointsource_noise_addition_probability, # Probability of adding point-source noises + max_noises_per_minute # maximum number of point-source noises that can be added to a recording according to its duration + ): if not os.path.isfile(input_dir + "/reco2dur"): print("Getting the duration of the recordings..."); - train_lib.RunKaldiCommand("wav-to-duration --read-entire-file=true scp:{0}/wav.scp ark,t:{0}/reco2dur".format(input_dir)) + data_lib.RunKaldiCommand("wav-to-duration --read-entire-file=true scp:{0}/wav.scp ark,t:{0}/reco2dur".format(input_dir)) durations = ParseFileToDict(input_dir + "/reco2dur", value_processor = lambda x: float(x[0])) wav_scp = ParseFileToDict(input_dir + "/wav.scp", value_processor = lambda x: " ".join(x)) foreground_snr_array = map(lambda x: float(x), foreground_snr_string.split(':')) background_snr_array = map(lambda x: float(x), background_snr_string.split(':')) - CorruptWav(wav_scp, durations, output_dir, room_dict, noise_list, foreground_snr_array, background_snr_array, num_replicas, prefix, speech_rvb_probability, noise_adding_probability, max_noises_per_minute) + CorruptWav(wav_scp, durations, output_dir, room_dict, point_noise_list, iso_noise_list, + foreground_snr_array, background_snr_array, num_replicas, prefix, + speech_rvb_probability, isotropic_noise_addition_probability, + 
pointsource_noise_addition_probability, max_noises_per_minute) AddPrefixToFields(input_dir + "/utt2spk", output_dir + "/utt2spk", num_replicas, prefix, field = [0,1]) - train_lib.RunKaldiCommand("utils/utt2spk_to_spk2utt.pl <{output_dir}/utt2spk >{output_dir}/spk2utt" + data_lib.RunKaldiCommand("utils/utt2spk_to_spk2utt.pl <{output_dir}/utt2spk >{output_dir}/spk2utt" .format(output_dir = output_dir)) if os.path.isfile(input_dir + "/text"): @@ -230,10 +315,12 @@ def CreateReverberatedCopy(input_dir, output_dir, room_dict, noise_list, foregro if os.path.isfile(input_dir + "/reco2file_and_channel"): AddPrefixToFields(input_dir + "/reco2file_and_channel", output_dir + "/reco2file_and_channel", num_replicas, prefix, field = [0,1]) - train_lib.RunKaldiCommand("utils/validate_data_dir.sh --no-feats {output_dir}" + data_lib.RunKaldiCommand("utils/validate_data_dir.sh --no-feats {output_dir}" .format(output_dir = output_dir)) -def SmoothProbability(list): + +# This function smooths the probability distribution in the list +def SmoothProbabilityDistribution(list): uniform_probability = 1 / float(len(list)) for item in list: if item.probability is None: @@ -242,13 +329,15 @@ def SmoothProbability(list): # smooth the probability item.probability = 0.3 * item.probability + 0.7 * uniform_probability - sum_p = sum(item.probability for item in list) # Normalize the probability + sum_p = sum(item.probability for item in list) for item in list: item.probability = item.probability / sum_p return list +# This function creates the RIR list +# Each item in the list contains the following arguments def ParseRirList(rir_list_file): rir_parser = argparse.ArgumentParser() rir_parser.add_argument('--rir-id', type=str, required=True, help='rir id') @@ -267,11 +356,14 @@ def ParseRirList(rir_list_file): setattr(rir, "iso_noise_list", []) rir_list.append(rir) - return SmoothProbability(rir_list) + return SmoothProbabilityDistribution(rir_list) -# This function crate the room dictinoary from the 
rir list -# The key of the returned dictionary is the room id +# This function divides the global RIR list into local lists +# according to the room where the RIRs are generated +# It returns the room dictionary indexed by the room id +# Each element in the room dictionary contains a local RIR list +# and the probability of the corresponding room def MakeRoomDict(rir_list): room_dict = {} for rir in rir_list: @@ -282,18 +374,21 @@ def MakeRoomDict(rir_list): setattr(room_dict[rir.room_id], "probability", 0) room_dict[rir.room_id].rir_list.append(rir) + # the probability of the room is the sum of probabilities of its RIR for key in room_dict.keys(): room_dict[key].probability = sum(rir.probability for rir in room_dict[key].rir_list) return room_dict - -def ParseNoiseList(rir_list, noise_list_file): +# This function creates the point-source noise list +# and the isotropic noise list from the noise information file +# Each item in the list contains the following arguments +def ParseNoiseList(noise_list_file): noise_parser = argparse.ArgumentParser() noise_parser.add_argument('--noise-id', type=str, required=True, help='noise id') noise_parser.add_argument('--noise-type', type=str, required=True, help='the type of noise; i.e. 
isotropic or point-source', choices = ["isotropic", "point-source"]) noise_parser.add_argument('--bg-fg-type', type=str, default="background", help='background or foreground noise', choices = ["background", "foreground"]) - noise_parser.add_argument('--rir-file', type=str, default=None, help='compulsary if isotropic, should not be specified if point-source') + noise_parser.add_argument('--rir-id', type=str, default=None, help='compulsary if isotropic, should not be specified if point-source') noise_parser.add_argument('--probability', type=float, default=None, help='probability of the noise.') noise_parser.add_argument('noise_file_location', type=str, help='noise file location') @@ -303,26 +398,15 @@ def ParseNoiseList(rir_list, noise_list_file): for line in noise_lines: noise = noise_parser.parse_args(line.split()) if noise.noise_type == "isotropic": - if noise.rir_file is None: - raise Exception("--rir-file must be specified if --noise-type is point-source") + if noise.rir_id is None: + raise Exception("--rir-id must be specified if --noise-type is isotropic") else: iso_noise_list.append(noise) else: point_noise_list.append(noise) - iso_noise_list = SmoothProbability(iso_noise_list) - - for iso_noise in iso_noise_list: - id = -1 - for j in range(len(rir_list)): - if iso_noise.rir_file == rir_list[j].rir_file_location: - id = j - rir_list[id].iso_noise_list.append(noise) - break; - if id == -1: - warnings.warn("Rir file specified for noise id {0} is not found in rir_list".format(iso_noise.noise_id)) - - return (SmoothProbability(point_noise_list), rir_list) + return (SmoothProbabilityDistribution(point_noise_list), + SmoothProbabilityDistribution(iso_noise_list)) def Main(): @@ -331,20 +415,23 @@ def Main(): rir_list = ParseRirList(args.rir_list_file) noise_list = [] if args.noise_list_file is not None: - noise_list, rir_list = ParseNoiseList(rir_list, args.noise_list_file) - print("Number of point-source noises is {0}".format(len(noise_list))) + point_noise_list, 
iso_noise_list = ParseNoiseList(args.noise_list_file) + print("Number of point-source noises is {0}".format(len(point_noise_list))) + print("Number of isotropic noises is {0}".format(len(iso_noise_list))) room_dict = MakeRoomDict(rir_list) CreateReverberatedCopy(input_dir = args.input_dir, output_dir = args.output_dir, room_dict = room_dict, - noise_list = noise_list, + point_noise_list = point_noise_list, + iso_noise_list = iso_noise_list, foreground_snr_string = args.foreground_snr_string, background_snr_string = args.background_snr_string, num_replicas = args.num_replicas, prefix = args.prefix, speech_rvb_probability = args.speech_rvb_probability, - noise_adding_probability = args.noise_adding_probability, + isotropic_noise_addition_probability = args.isotropic_noise_addition_probability, + pointsource_noise_addition_probability = args.pointsource_noise_addition_probability, max_noises_per_minute = args.max_noises_per_minute) if __name__ == "__main__": diff --git a/src/featbin/wav-reverberate.cc b/src/featbin/wav-reverberate.cc index 80b08307172..683b8be6177 100644 --- a/src/featbin/wav-reverberate.cc +++ b/src/featbin/wav-reverberate.cc @@ -165,7 +165,7 @@ int main(int argc, char *argv[]) { "the output waveform will be equal to the input waveform." "If false, the length of the output waveform will be " "equal to (original input length + rir length - 1). 
" - "This value is default true and " + "This value is true by default and " "it only affects the output when RIR file is provided."); po.Register("input-wave-channel", &input_channel, "Specifies the channel to be used from input as only a " From d335c718e3b1849afe40a61fc436c0e098c6a36e Mon Sep 17 00:00:00 2001 From: Tom Ko Date: Tue, 19 Jul 2016 03:38:19 -0400 Subject: [PATCH 09/14] adding AddPointSourceNoise() --- egs/wsj/s5/steps/data/reverberate_data_dir.py | 111 +++++++++++------- 1 file changed, 71 insertions(+), 40 deletions(-) diff --git a/egs/wsj/s5/steps/data/reverberate_data_dir.py b/egs/wsj/s5/steps/data/reverberate_data_dir.py index 92114b23614..53d3aa44973 100755 --- a/egs/wsj/s5/steps/data/reverberate_data_dir.py +++ b/egs/wsj/s5/steps/data/reverberate_data_dir.py @@ -36,11 +36,11 @@ def GetArgs(): parser.add_argument('--foreground-snrs', type=str, dest = "foreground_snr_string", default = '20:10:0', help='snrs for foreground noises') parser.add_argument('--background-snrs', type=str, dest = "background_snr_string", default = '20:10:0', help='snrs for background noises') parser.add_argument('--prefix', type=str, default = None, help='prefix for the id of the corrupted utterances') - parser.add_argument("--speech-rvb-probability", type=float, default = 0.8, + parser.add_argument("--speech-rvb-probability", type=float, default = 1.0, help="Probability of reverberating a speech signal, e.g. 0 <= p <= 1") - parser.add_argument("--pointsource-noise-addition-probability", type=float, default = 0.4, + parser.add_argument("--pointsource-noise-addition-probability", type=float, default = 1.0, help="Probability of adding point-source noises, e.g. 0 <= p <= 1") - parser.add_argument("--isotropic-noise-addition-probability", type=float, default = 0.4, + parser.add_argument("--isotropic-noise-addition-probability", type=float, default = 1.0, help="Probability of adding isotropic noises, e.g. 
0 <= p <= 1") parser.add_argument("--max-noises-per-minute", type=int, default = 2, help="This controls the maximum number of point-source noises that could be added to a recording according to its duration") @@ -88,8 +88,9 @@ def next(self): return item -# This function pick the item according to the associated probability -# The input could be either a dictinoary of a list +# This functions picks an item from the collection according to the associated probability distribution. +# The probability estimate of each item in the collection is stored in the "probability" field of +# the particular item. x : a collection (list or dictionary) where the values contain a field called probability def PickItemWithProbability(x): if isinstance(x, dict): plist = list(set(x.values())) @@ -105,6 +106,8 @@ def PickItemWithProbability(x): assert False, "Shouldn't get here as the accumulated probability should always equal to 1" +# This function parses a file and pack the data into a dictionary +# It is useful for parsing file like wav.scp, utt2spk, text...etc def ParseFileToDict(file, assert2fields = False, value_processor = None): if value_processor is None: value_processor = lambda x: x[0] @@ -118,6 +121,7 @@ def ParseFileToDict(file, assert2fields = False, value_processor = None): dict[parts[0]] = value_processor(parts[1:]) return dict +# This function creates a file and write the content of a dictionary into it def WriteDictToFile(dict, file_name): file = open(file_name, 'w') keys = dict.keys() @@ -134,6 +138,7 @@ def WriteDictToFile(dict, file_name): # This function returns only the isotropic noises according to the specified RIR id +# Please refer to ParseNoiseList() for the format of iso_noise_list def FilterIsotropicNoiseList(iso_noise_list, rir_id): filtered_list = [] for noise in iso_noise_list: @@ -142,6 +147,33 @@ def FilterIsotropicNoiseList(iso_noise_list, rir_id): return filtered_list + +def AddPointSourceNoise(noise_addition_descriptor, # descriptor to store the 
information of the noise added + room, # the room selected + point_noise_list, # the point source noise list + pointsource_noise_addition_probability, # Probability of adding point-source noises + foreground_snrs, # the SNR for adding the foreground noises + background_snrs, # the SNR for adding the background noises + speech_dur, # duration of the recording + max_noises_recording # Maximum number of point-source noises that can be added + ): + if len(point_noise_list) > 0 and random.random() < pointsource_noise_addition_probability: + for k in range(random.randint(1, max_noises_recording)): + # pick the RIR to reverberate the point-source noise + noise = PickItemWithProbability(point_noise_list) + noise_rir = PickItemWithProbability(room.rir_list) + if noise.bg_fg_type == "background": + noise_addition_descriptor['noise_io'].append("wav-reverberate --duration={2} --impulse-response={1} {0} - |".format(noise.noise_file_location, noise_rir.rir_file_location, speech_dur)) + noise_addition_descriptor['start_times'].append(0) + noise_addition_descriptor['snrs'].append(background_snrs.next()) + else: + noise_addition_descriptor['noise_io'].append("wav-reverberate --impulse-response={1} {0} - |".format(noise.noise_file_location, noise_rir.rir_file_location)) + noise_addition_descriptor['start_times'].append(round(random.random() * speech_dur, 2)) + noise_addition_descriptor['snrs'].append(foreground_snrs.next()) + + return noise_addition_descriptor + + def GenerateReverberationOpts(room_dict, # the room dictionary, please refer to MakeRoomDict() for the format point_noise_list, # the point source noise list iso_noise_list, # the isotropic noise list @@ -154,9 +186,9 @@ def GenerateReverberationOpts(room_dict, # the room dictionary, please refer to max_noises_recording # Maximum number of point-source noises that can be added ): reverberate_opts = "" - noises_added = [] - snrs_added = [] - start_times_added = [] + noise_addition_descriptor = {'noise_io': [], + 
'start_times': [], + 'snrs': []} # Randomly select the room room = PickItemWithProbability(room_dict) # Randomly select the RIR in the room @@ -170,35 +202,30 @@ def GenerateReverberationOpts(room_dict, # the room dictionary, please refer to if len(rir_iso_noise_list) > 0 and random.random() < isotropic_noise_addition_probability: isotropic_noise = PickItemWithProbability(rir_iso_noise_list) # extend the isotropic noise to the length of the speech waveform - noises_added.append("wav-reverberate --duration={1} {0} - |".format(isotropic_noise.noise_file_location, speech_dur)) - snrs_added.append(background_snrs.next()) - start_times_added.append(0) - - # Add the point-source noise - if len(point_noise_list) > 0 and random.random() < pointsource_noise_addition_probability: - for k in range(random.randint(1, max_noises_recording)): - # pick the RIR to reverberate the point-source noise - noise = PickItemWithProbability(point_noise_list) - noise_rir = PickItemWithProbability(room.rir_list) - if noise.bg_fg_type == "background": - start_times_added.append(0) - noises_added.append("wav-reverberate --duration={2} --impulse-response={1} {0} - |".format(noise.noise_file_location, noise_rir.rir_file_location, speech_dur)) - snrs_added.append(background_snrs.next()) - else: - start_times_added.append(round(random.random() * speech_dur, 2)) - noises_added.append("wav-reverberate --impulse-response={1} {0} - |".format(noise.noise_file_location, noise_rir.rir_file_location)) - snrs_added.append(foreground_snrs.next()) - - assert len(noises_added) == len(snrs_added) - assert len(noises_added) == len(start_times_added) - - if len(noises_added) > 0: - reverberate_opts += "--additive-signals='{0}' ".format(','.join(noises_added)) - reverberate_opts += "--snrs='{0}' ".format(','.join(map(lambda x:str(x),snrs_added))) - reverberate_opts += "--start-times='{0}' ".format(','.join(map(lambda x:str(x),start_times_added))) + noise_addition_descriptor['noise_io'].append("wav-reverberate 
--duration={1} {0} - |".format(isotropic_noise.noise_file_location, speech_dur)) + noise_addition_descriptor['start_times'].append(0) + noise_addition_descriptor['snrs'].append(background_snrs.next()) + + noise_addition_descriptor = AddPointSourceNoise(noise_addition_descriptor, # descriptor to store the information of the noise added + room, # the room selected + point_noise_list, # the point source noise list + pointsource_noise_addition_probability, # Probability of adding point-source noises + foreground_snrs, # the SNR for adding the foreground noises + background_snrs, # the SNR for adding the background noises + speech_dur, # duration of the recording + max_noises_recording # Maximum number of point-source noises that can be added + ) + + assert len(noise_addition_descriptor['noise_io']) == len(noise_addition_descriptor['start_times']) + assert len(noise_addition_descriptor['noise_io']) == len(noise_addition_descriptor['snrs']) + if len(noise_addition_descriptor['noise_io']) > 0: + reverberate_opts += "--additive-signals='{0}' ".format(','.join(noise_addition_descriptor['noise_io'])) + reverberate_opts += "--start-times='{0}' ".format(','.join(map(lambda x:str(x), noise_addition_descriptor['start_times']))) + reverberate_opts += "--snrs='{0}' ".format(','.join(map(lambda x:str(x), noise_addition_descriptor['snrs']))) return reverberate_opts + # This is the main function to generate pipeline command for the corruption # The generic command of wav-reverberate will be like: # wav-reverberate --duration=t --impulse-response=rir.wav @@ -320,14 +347,14 @@ def CreateReverberatedCopy(input_dir, # This function smooths the probability distribution in the list -def SmoothProbabilityDistribution(list): +def SmoothProbabilityDistribution(list, smoothing_weight=0.3): uniform_probability = 1 / float(len(list)) for item in list: if item.probability is None: item.probability = uniform_probability else: # smooth the probability - item.probability = 0.3 * item.probability + 
0.7 * uniform_probability + item.probability = (1 - smoothing_weight) * item.probability + smoothing_weight * uniform_probability # Normalize the probability sum_p = sum(item.probability for item in list) @@ -337,11 +364,13 @@ def SmoothProbabilityDistribution(list): return list # This function creates the RIR list -# Each item in the list contains the following arguments +# Each noise item in the list contains the following attributes: +# rir_id, room_id, receiver_position_id, source_position_id, rt60, drr, probability +# Please refer to the help messages in the parser for the meaning of these attributes def ParseRirList(rir_list_file): rir_parser = argparse.ArgumentParser() - rir_parser.add_argument('--rir-id', type=str, required=True, help='rir id') - rir_parser.add_argument('--room-id', type=str, required=True, help='room id') + rir_parser.add_argument('--rir-id', type=str, required=True, help='This id is unique for each RIR and the noise may associate with a particular RIR by refering to this id') + rir_parser.add_argument('--room-id', type=str, required=True, help='This is the room that where the RIR is generated') rir_parser.add_argument('--receiver-position-id', type=str, default=None, help='receiver position id') rir_parser.add_argument('--source-position-id', type=str, default=None, help='source position id') rir_parser.add_argument('--rt60', type=float, default=None, help='RT60 is the time required for reflections of a direct sound to decay 60 dB.') @@ -382,7 +411,9 @@ def MakeRoomDict(rir_list): # This function creates the point-source noise list # and the isotropic noise list from the noise information file -# Each item in the list contains the following arguments +# Each noise item in the list contains the following attributes: +# noise_id, noise_type, bg_fg_type, rir_id, probability, noise_file_location +# Please refer to the help messages in the parser for the meaning of these attributes def ParseNoiseList(noise_list_file): noise_parser = 
argparse.ArgumentParser() noise_parser.add_argument('--noise-id', type=str, required=True, help='noise id') From ebfba00a1d4742c50db9ffb57504d885dc033c63 Mon Sep 17 00:00:00 2001 From: Tom Ko Date: Wed, 20 Jul 2016 23:51:42 -0400 Subject: [PATCH 10/14] Fixing spelling mistake and modifying comments --- ...ta_lib.py => data_dir_manipulation_lib.py} | 5 - egs/wsj/s5/steps/data/reverberate_data_dir.py | 99 +++++++++++-------- 2 files changed, 56 insertions(+), 48 deletions(-) rename egs/wsj/s5/steps/data/{data_lib.py => data_dir_manipulation_lib.py} (90%) diff --git a/egs/wsj/s5/steps/data/data_lib.py b/egs/wsj/s5/steps/data/data_dir_manipulation_lib.py similarity index 90% rename from egs/wsj/s5/steps/data/data_lib.py rename to egs/wsj/s5/steps/data/data_dir_manipulation_lib.py index 52aa83cae81..1f7253d4891 100644 --- a/egs/wsj/s5/steps/data/data_lib.py +++ b/egs/wsj/s5/steps/data/data_dir_manipulation_lib.py @@ -1,9 +1,4 @@ import subprocess -#import logging -#import math -#import re -#import time -#import argparse def RunKaldiCommand(command, wait = True): """ Runs commands frequently seen in Kaldi scripts. These are usually a diff --git a/egs/wsj/s5/steps/data/reverberate_data_dir.py b/egs/wsj/s5/steps/data/reverberate_data_dir.py index 53d3aa44973..f9a6617fe00 100755 --- a/egs/wsj/s5/steps/data/reverberate_data_dir.py +++ b/egs/wsj/s5/steps/data/reverberate_data_dir.py @@ -7,12 +7,12 @@ from __future__ import print_function import argparse, glob, math, os, random, sys, warnings, copy, imp, ast -data_lib = imp.load_source('ntl', 'steps/data/data_lib.py') +data_lib = imp.load_source('dml', 'steps/data/data_dir_manipulation_lib.py') def GetArgs(): - # we add compulsary arguments as named arguments for readability + # we add required arguments as named arguments for readability parser = argparse.ArgumentParser(description="Reverberate the data directory with an option " - "to add isotropic and point source noiseis. " + "to add isotropic and point source noises. 
" "Usage: reverberate_data_dir.py [options...] " "E.g. reverberate_data_dir.py --rir-list-file rir_list " "--foreground-snrs 20:10:15:5:0 --background-snrs 20:10:15:5:0 " @@ -22,20 +22,24 @@ def GetArgs(): parser.add_argument("--rir-list-file", type=str, required = True, help="RIR information file, the format of the file is " - "--rir-id --room-id " + "--rir-id --room-id " "--receiver-position-id --source-position-id " - "--rt-60 < --drr < location(support Kaldi IO strings) >") + "--rt-60 --drr " + "E.g. --rir-id 00001 --room-id 001 --receiver-position-id 001 --source-position-id 00001 " + "--rt60 0.58 --drr -4.885 data/impulses/Room001-00001.wav") parser.add_argument("--noise-list-file", type=str, default = None, help="Noise information file, the format of the file is" - "--noise-id --noise-type " - "--bg-fg-type " - "--rir-file " - "< location=(support Kaldi IO strings) >") + "--noise-id --noise-type " + "--bg-fg-type " + "--rir-file " + " " + "E.g. --noise-id 001 --noise-type isotropic --rir-id 00019 iso_noise.wav") parser.add_argument("--num-replications", type=int, dest = "num_replicas", default = 1, help="Number of replicate to generated for the data") - parser.add_argument('--foreground-snrs', type=str, dest = "foreground_snr_string", default = '20:10:0', help='snrs for foreground noises') - parser.add_argument('--background-snrs', type=str, dest = "background_snr_string", default = '20:10:0', help='snrs for background noises') - parser.add_argument('--prefix', type=str, default = None, help='prefix for the id of the corrupted utterances') + parser.add_argument('--foreground-snrs', type=str, dest = "foreground_snr_string", default = '20:10:0', help='When foreground noises are being added the script will iterate through these SNRs.') + parser.add_argument('--background-snrs', type=str, dest = "background_snr_string", default = '20:10:0', help='When background noises are being added the script will iterate through these SNRs.') + 
parser.add_argument('--prefix', type=str, default = None, help='This prefix will modified for each reverberated copy, by adding additional affixes.') parser.add_argument("--speech-rvb-probability", type=float, default = 1.0, help="Probability of reverberating a speech signal, e.g. 0 <= p <= 1") parser.add_argument("--pointsource-noise-addition-probability", type=float, default = 1.0, @@ -44,7 +48,7 @@ def GetArgs(): help="Probability of adding isotropic noises, e.g. 0 <= p <= 1") parser.add_argument("--max-noises-per-minute", type=int, default = 2, help="This controls the maximum number of point-source noises that could be added to a recording according to its duration") - parser.add_argument('--random-seed', type=int, default=0, help='seed to be used in the randomization of impulese and noises') + parser.add_argument('--random-seed', type=int, default=0, help='seed to be used in the randomization of impulses and noises') parser.add_argument("input_dir", help="Input data directory") parser.add_argument("output_dir", @@ -132,7 +136,7 @@ def WriteDictToFile(dict, file_name): if type(value) is tuple: value = list(value) value.sort() - value = ' '.join(value) + value = ' '.join(str(value)) file.write('{0}\t{1}\n'.format(key, value)) file.close() @@ -150,18 +154,20 @@ def FilterIsotropicNoiseList(iso_noise_list, rir_id): def AddPointSourceNoise(noise_addition_descriptor, # descriptor to store the information of the noise added room, # the room selected - point_noise_list, # the point source noise list + pointsource_noise_list, # the point source noise list pointsource_noise_addition_probability, # Probability of adding point-source noises foreground_snrs, # the SNR for adding the foreground noises background_snrs, # the SNR for adding the background noises speech_dur, # duration of the recording max_noises_recording # Maximum number of point-source noises that can be added ): - if len(point_noise_list) > 0 and random.random() < pointsource_noise_addition_probability: + 
if len(pointsource_noise_list) > 0 and random.random() < pointsource_noise_addition_probability: for k in range(random.randint(1, max_noises_recording)): # pick the RIR to reverberate the point-source noise - noise = PickItemWithProbability(point_noise_list) + noise = PickItemWithProbability(pointsource_noise_list) noise_rir = PickItemWithProbability(room.rir_list) + # If it is a background noise, the noise will be extended and be added to the whole speech + # if it is a foreground noise, the noise will not extended and be added at a random time of the speech if noise.bg_fg_type == "background": noise_addition_descriptor['noise_io'].append("wav-reverberate --duration={2} --impulse-response={1} {0} - |".format(noise.noise_file_location, noise_rir.rir_file_location, speech_dur)) noise_addition_descriptor['start_times'].append(0) @@ -175,7 +181,7 @@ def AddPointSourceNoise(noise_addition_descriptor, # descriptor to store the in def GenerateReverberationOpts(room_dict, # the room dictionary, please refer to MakeRoomDict() for the format - point_noise_list, # the point source noise list + pointsource_noise_list, # the point source noise list iso_noise_list, # the isotropic noise list foreground_snrs, # the SNR for adding the foreground noises background_snrs, # the SNR for adding the background noises @@ -190,6 +196,7 @@ def GenerateReverberationOpts(room_dict, # the room dictionary, please refer to 'start_times': [], 'snrs': []} # Randomly select the room + # Here the room probability is a sum of the probabilities of the RIRs recorded in the room. 
room = PickItemWithProbability(room_dict) # Randomly select the RIR in the room speech_rir = PickItemWithProbability(room.rir_list) @@ -208,7 +215,7 @@ def GenerateReverberationOpts(room_dict, # the room dictionary, please refer to noise_addition_descriptor = AddPointSourceNoise(noise_addition_descriptor, # descriptor to store the information of the noise added room, # the room selected - point_noise_list, # the point source noise list + pointsource_noise_list, # the point source noise list pointsource_noise_addition_probability, # Probability of adding point-source noises foreground_snrs, # the SNR for adding the foreground noises background_snrs, # the SNR for adding the background noises @@ -225,16 +232,26 @@ def GenerateReverberationOpts(room_dict, # the room dictionary, please refer to return reverberate_opts +# This function generates a new id from the input id +# This is needed when we have to create multiple copies of the original data +def GetNewId(id, prefix=None, copy=0): + if prefix is not None: + new_id = prefix + str(copy) + "_" + id + else: + new_id = id + + return new_id + # This is the main function to generate pipeline command for the corruption # The generic command of wav-reverberate will be like: # wav-reverberate --duration=t --impulse-response=rir.wav # --additive-signals='noise1.wav,noise2.wav' --snrs='snr1,snr2' --start-times='s1,s2' input.wav output.wav -def CorruptWav(wav_scp, # the dictionary of which elements are the IO of the speech recordings - durations, # the dictionary of which elements are the duration (in sec) of the speech recordings +def GenerateReverberatedWavScp(wav_scp, # a dictionary whose values are the Kaldi-IO strings of the speech recordings + durations, # a dictionary whose values are the duration (in sec) of the speech recordings output_dir, # output directory to write the corrupted wav.scp room_dict, # the room dictionary, please refer to MakeRoomDict() for the format - point_noise_list, # the point source noise list 
+ pointsource_noise_list, # the point source noise list iso_noise_list, # the isotropic noise list foreground_snr_array, # the SNR for adding the foreground noises background_snr_array, # the SNR for adding the background noises @@ -251,20 +268,16 @@ def CorruptWav(wav_scp, # the dictionary of which elements are the IO of the sp for i in range(num_replicas): keys = wav_scp.keys() keys.sort() - for wav_id in keys: - wav_original_pipe = wav_scp[wav_id] + for recording_id in keys: + wav_original_pipe = wav_scp[recording_id] # check if it is really a pipe if len(wav_original_pipe.split()) == 1: wav_original_pipe = "cat {0} |".format(wav_original_pipe) - speech_dur = durations[wav_id] + speech_dur = durations[recording_id] max_noises_recording = math.floor(max_noises_per_minute * speech_dur / 60) - if prefix is not None: - new_wav_id = prefix + str(i) + "_" + wav_id - else: - new_wav_id = wav_id reverberate_opts = GenerateReverberationOpts(room_dict, # the room dictionary, please refer to MakeRoomDict() for the format - point_noise_list, # the point source noise list + pointsource_noise_list, # the point source noise list iso_noise_list, # the isotropic noise list foreground_snrs, # the SNR for adding the foreground noises background_snrs, # the SNR for adding the background noises @@ -280,7 +293,8 @@ def CorruptWav(wav_scp, # the dictionary of which elements are the IO of the sp else: wav_corrupted_pipe = "{0} wav-reverberate {1} - - |".format(wav_original_pipe, reverberate_opts) - corrupted_wav_scp[new_wav_id] = wav_corrupted_pipe + new_recording_id = GetNewId(recording_id, prefix, i) + corrupted_wav_scp[new_recording_id] = wav_corrupted_pipe WriteDictToFile(corrupted_wav_scp, output_dir + "/wav.scp") @@ -294,8 +308,7 @@ def AddPrefixToFields(input_file, output_file, num_replicas, prefix, field = [0] if len(line) > 0 and line[0] != ';': split1 = line.split() for j in field: - if prefix is not None: - split1[j] = prefix + str(i) + "_" + split1[j] + split1[j] = 
GetNewId(split1[j], prefix, i) print(" ".join(split1), file=f) else: print(line, file=f) @@ -306,7 +319,7 @@ def AddPrefixToFields(input_file, output_file, num_replicas, prefix, field = [0] def CreateReverberatedCopy(input_dir, output_dir, room_dict, # the room dictionary, please refer to MakeRoomDict() for the format - point_noise_list, # the point source noise list + pointsource_noise_list, # the point source noise list iso_noise_list, # the isotropic noise list foreground_snr_string, # the SNR for adding the foreground noises background_snr_string, # the SNR for adding the background noises @@ -326,7 +339,7 @@ def CreateReverberatedCopy(input_dir, foreground_snr_array = map(lambda x: float(x), foreground_snr_string.split(':')) background_snr_array = map(lambda x: float(x), background_snr_string.split(':')) - CorruptWav(wav_scp, durations, output_dir, room_dict, point_noise_list, iso_noise_list, + GenerateReverberatedWavScp(wav_scp, durations, output_dir, room_dict, pointsource_noise_list, iso_noise_list, foreground_snr_array, background_snr_array, num_replicas, prefix, speech_rvb_probability, isotropic_noise_addition_probability, pointsource_noise_addition_probability, max_noises_per_minute) @@ -391,7 +404,7 @@ def ParseRirList(rir_list_file): # This function divides the global RIR list into local lists # according to the room where the RIRs are generated # It returns the room dictionary indexed by the room id -# Each element in the room dictionary contains a local RIR list +# Its values are objects with two attributes: a local RIR list # and the probability of the corresponding room def MakeRoomDict(rir_list): room_dict = {} @@ -419,11 +432,11 @@ def ParseNoiseList(noise_list_file): noise_parser.add_argument('--noise-id', type=str, required=True, help='noise id') noise_parser.add_argument('--noise-type', type=str, required=True, help='the type of noise; i.e. 
isotropic or point-source', choices = ["isotropic", "point-source"]) noise_parser.add_argument('--bg-fg-type', type=str, default="background", help='background or foreground noise', choices = ["background", "foreground"]) - noise_parser.add_argument('--rir-id', type=str, default=None, help='compulsary if isotropic, should not be specified if point-source') + noise_parser.add_argument('--rir-id', type=str, default=None, help='required if isotropic, should not be specified if point-source') noise_parser.add_argument('--probability', type=float, default=None, help='probability of the noise.') noise_parser.add_argument('noise_file_location', type=str, help='noise file location') - point_noise_list = [] + pointsource_noise_list = [] iso_noise_list = [] noise_lines = map(lambda x: x.strip(), open(noise_list_file)) for line in noise_lines: @@ -434,9 +447,9 @@ def ParseNoiseList(noise_list_file): else: iso_noise_list.append(noise) else: - point_noise_list.append(noise) + pointsource_noise_list.append(noise) - return (SmoothProbabilityDistribution(point_noise_list), + return (SmoothProbabilityDistribution(pointsource_noise_list), SmoothProbabilityDistribution(iso_noise_list)) @@ -446,15 +459,15 @@ def Main(): rir_list = ParseRirList(args.rir_list_file) noise_list = [] if args.noise_list_file is not None: - point_noise_list, iso_noise_list = ParseNoiseList(args.noise_list_file) - print("Number of point-source noises is {0}".format(len(point_noise_list))) + pointsource_noise_list, iso_noise_list = ParseNoiseList(args.noise_list_file) + print("Number of point-source noises is {0}".format(len(pointsource_noise_list))) print("Number of isotropic noises is {0}".format(len(iso_noise_list))) room_dict = MakeRoomDict(rir_list) CreateReverberatedCopy(input_dir = args.input_dir, output_dir = args.output_dir, room_dict = room_dict, - point_noise_list = point_noise_list, + pointsource_noise_list = pointsource_noise_list, iso_noise_list = iso_noise_list, foreground_snr_string = 
args.foreground_snr_string, background_snr_string = args.background_snr_string, From 617982b2d490ea009cb60de63f34aee186ae99d7 Mon Sep 17 00:00:00 2001 From: Tom Ko Date: Tue, 26 Jul 2016 05:44:21 -0400 Subject: [PATCH 11/14] Modify the aspire recipe to use the new reverberate_data_dir.py; fixing bugs in reverberate_data_dir.py; add aspire_prep_rir_noise_list.py for generating rir_list and noise_list for aspire --- .../aspire_prep_rir_noise_list.py | 79 +++++++++++++++++++ .../local/multi_condition/run_nnet2_common.sh | 29 ++++--- egs/wsj/s5/steps/data/reverberate_data_dir.py | 73 +++++++++++------ src/feat/signal.cc | 12 +-- 4 files changed, 151 insertions(+), 42 deletions(-) create mode 100755 egs/aspire/s5/local/multi_condition/aspire_prep_rir_noise_list.py diff --git a/egs/aspire/s5/local/multi_condition/aspire_prep_rir_noise_list.py b/egs/aspire/s5/local/multi_condition/aspire_prep_rir_noise_list.py new file mode 100755 index 00000000000..9dd7a38d183 --- /dev/null +++ b/egs/aspire/s5/local/multi_condition/aspire_prep_rir_noise_list.py @@ -0,0 +1,79 @@ +#!/usr/bin/env python +# Copyright 2016 Tom Ko +# Apache 2.0 +# script to generate rir_list and noise_list in aspire + +# we're using python 3.x style print but want it to work in python 2.x, +from __future__ import print_function +import argparse, glob, math, os, sys + + +def GetArgs(): + parser = argparse.ArgumentParser(description="Prepare rir_list and noise_list for Aspire " + "Usage: reverberate_data_dir.py [options...] " + "E.g. 
reverberate_data_dir.py " + "data/impulses_noises data/impulses_noises/info", + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + + parser.add_argument("input_dir", help="Input data directory") + parser.add_argument("output_dir", help="Output data directory") + print(' '.join(sys.argv)) + args = parser.parse_args() + + return args + + +# This function generate the rir_list file for the aspire real RIR +def GenerateRirListFile(input_dir, output_dir): + rir_list_file = open(output_dir + "/rir_list", 'w') + rir_id = 1 + room_id = 1 + for db in ["RVB2014", "RWCP", "air"]: + rir_files = glob.glob(input_dir + "/{0}_*.wav".format(db)) + for rir in rir_files: + filename = rir.split('/')[-1] + if "noise" not in filename: + rir_list_file.write('--rir-id {0} --room-id {1} {2}\n'.format(str(rir_id).zfill(5), str(room_id).zfill(3), rir)) + rir_id += 1 + room_id += 1 + rir_list_file.close() + + +# This function generate the noise_list file from the aspire noise-rir pair +def GenerateNoiseListFile(input_dir, output_dir): + noise_list_file = open(output_dir + "/noise_list", 'w') + noise_files = glob.glob(input_dir + "/*_type*_noise*.wav") + noise_id = 1 + for noise_file in noise_files: + parts = noise_file.split('/')[-1].split('_') + db_name = parts[0] + type_num = parts[1] + noise_pattern = '_'.join(parts[3:len(parts)-1]) + if db_name == "RWCP": + type_num = "type*" + matched_rir_files = glob.glob(input_dir + "/{0}_{1}_rir_{2}*.wav".format(db_name, type_num, noise_pattern)) + noise_line = "--noise-id {0} --noise-type isotropic ".format(str(noise_id).zfill(5)) + for rir in matched_rir_files: + noise_line += "--rir-linkage {0} ".format(rir) + noise_line += "{0}".format(noise_file) + noise_list_file.write("{0}\n".format(noise_line)) + noise_id += 1 + noise_list_file.close() + + +def Main(): + args = GetArgs() + + if not os.path.exists(args.output_dir): + os.makedirs(args.output_dir) + + # generating the rir_list file for the new steps/data/reverberate_data_dir.py + 
GenerateRirListFile(args.input_dir, args.output_dir) + + # generating the noise_list file for the new steps/data/reverberate_data_dir.py + GenerateNoiseListFile(args.input_dir, args.output_dir) + + +if __name__ == "__main__": + Main() + diff --git a/egs/aspire/s5/local/multi_condition/run_nnet2_common.sh b/egs/aspire/s5/local/multi_condition/run_nnet2_common.sh index 5b6424a1d86..15bb922726c 100755 --- a/egs/aspire/s5/local/multi_condition/run_nnet2_common.sh +++ b/egs/aspire/s5/local/multi_condition/run_nnet2_common.sh @@ -7,6 +7,8 @@ stage=1 snrs="20:10:15:5:0" +foreground_snrs="20:10:15:5:0" +background_snrs="20:10:15:5:0" num_data_reps=3 ali_dir=exp/ db_string="'air' 'rwcp' 'rvb2014'" # RIR dbs to be used in the experiment @@ -31,24 +33,29 @@ if [ $stage -le 1 ]; then --RIR-home $RIR_home \ data/impulses_noises || exit 1; + # Generate the rir_list and noise_list for the reverberate_data_dir.py to corrupt the data + python local/multi_condition/aspire_prep_rir_noise_list.py data/impulses_noises data/impulses_noises/info + # corrupt the fisher data to generate multi-condition data - # for data_dir in train dev test; do for data_dir in train dev test; do if [ "$data_dir" == "train" ]; then num_reps=$num_data_reps else num_reps=1 fi - reverb_data_dirs= - for i in `seq 1 $num_reps`; do - cur_dest_dir=" data/temp_${data_dir}_${i}" - local/multi_condition/reverberate_data_dir.sh --random-seed $i \ - --snrs "$snrs" --log-dir exp/make_corrupted_wav \ - data/${data_dir} data/impulses_noises $cur_dest_dir - reverb_data_dirs+=" $cur_dest_dir" - done - utils/combine_data.sh --extra-files utt2uniq data/${data_dir}_rvb $reverb_data_dirs - rm -rf $reverb_data_dirs + python steps/data/reverberate_data_dir.py \ + --prefix "rev" \ + --rir-list-file data/impulses_noises/info/rir_list \ + --noise-list-file data/impulses_noises/info/noise_list \ + --foreground-snrs $foreground_snrs \ + --background-snrs $background_snrs \ + --speech-rvb-probability 1 \ + 
--pointsource-noise-addition-probability 1 \ + --isotropic-noise-addition-probability 1 \ + --num-replications $num_reps \ + --max-noises-per-minute 1 \ + --random-seed 1 \ + data/${data_dir} data/${data_dir}_rvb done # create the dev, test and eval sets from the aspire recipe diff --git a/egs/wsj/s5/steps/data/reverberate_data_dir.py b/egs/wsj/s5/steps/data/reverberate_data_dir.py index f9a6617fe00..52b07e669b9 100755 --- a/egs/wsj/s5/steps/data/reverberate_data_dir.py +++ b/egs/wsj/s5/steps/data/reverberate_data_dir.py @@ -67,11 +67,11 @@ def CheckArgs(args): ## Check arguments. if not os.path.isfile(args.rir_list_file): - raise Exception(args.rir_list_file + "not found") + raise Exception(args.rir_list_file + " not found") if args.noise_list_file is not None: if not os.path.isfile(args.noise_list_file): - raise Exception(args.noise_list_file + "not found") + raise Exception(args.noise_list_file + " not found") if args.num_replicas > 1 and args.prefix is None: args.prefix = "rvb" @@ -146,8 +146,10 @@ def WriteDictToFile(dict, file_name): def FilterIsotropicNoiseList(iso_noise_list, rir_id): filtered_list = [] for noise in iso_noise_list: - if noise.rir_id == rir_id: - filtered_list.append(noise) + for id in noise.rir_linkage: + if id == rir_id: + filtered_list.append(noise) + break return filtered_list @@ -161,7 +163,7 @@ def AddPointSourceNoise(noise_addition_descriptor, # descriptor to store the in speech_dur, # duration of the recording max_noises_recording # Maximum number of point-source noises that can be added ): - if len(pointsource_noise_list) > 0 and random.random() < pointsource_noise_addition_probability: + if len(pointsource_noise_list) > 0 and random.random() < pointsource_noise_addition_probability and max_noises_recording > 1: for k in range(random.randint(1, max_noises_recording)): # pick the RIR to reverberate the point-source noise noise = PickItemWithProbability(pointsource_noise_list) @@ -265,7 +267,7 @@ def 
GenerateReverberatedWavScp(wav_scp, # a dictionary whose values are the Kal foreground_snrs = list_cyclic_iterator(foreground_snr_array) background_snrs = list_cyclic_iterator(background_snr_array) corrupted_wav_scp = {} - for i in range(num_replicas): + for i in range(1, num_replicas+1): keys = wav_scp.keys() keys.sort() for recording_id in keys: @@ -303,7 +305,7 @@ def GenerateReverberatedWavScp(wav_scp, # a dictionary whose values are the Kal def AddPrefixToFields(input_file, output_file, num_replicas, prefix, field = [0]): list = map(lambda x: x.strip(), open(input_file)) f = open(output_file, "w") - for i in range(num_replicas): + for i in range(1, num_replicas+1): for line in list: if len(line) > 0 and line[0] != ';': split1 = line.split() @@ -361,18 +363,19 @@ def CreateReverberatedCopy(input_dir, # This function smooths the probability distribution in the list def SmoothProbabilityDistribution(list, smoothing_weight=0.3): - uniform_probability = 1 / float(len(list)) - for item in list: - if item.probability is None: - item.probability = uniform_probability - else: - # smooth the probability - item.probability = (1 - smoothing_weight) * item.probability + smoothing_weight * uniform_probability - - # Normalize the probability - sum_p = sum(item.probability for item in list) - for item in list: - item.probability = item.probability / sum_p + if len(list) > 0: + uniform_probability = 1 / float(len(list)) + for item in list: + if item.probability is None: + item.probability = uniform_probability + else: + # smooth the probability + item.probability = (1 - smoothing_weight) * item.probability + smoothing_weight * uniform_probability + + # Normalize the probability + sum_p = sum(item.probability for item in list) + for item in list: + item.probability = item.probability / sum_p return list @@ -422,17 +425,28 @@ def MakeRoomDict(rir_list): return room_dict + +# This function check if the RIR IO string is listed in the input rir_list file +# It returns the RIR id if 
the io string is found +def ValidateRirIO(rir_io_str, rir_list): + for rir in rir_list: + if rir_io_str == rir.rir_file_location: + return rir.rir_id + + return "Not found" + + # This function creates the point-source noise list # and the isotropic noise list from the noise information file # Each noise item in the list contains the following attributes: -# noise_id, noise_type, bg_fg_type, rir_id, probability, noise_file_location +# noise_id, noise_type, bg_fg_type, rir_linkage, probability, noise_file_location # Please refer to the help messages in the parser for the meaning of these attributes -def ParseNoiseList(noise_list_file): +def ParseNoiseList(noise_list_file, rir_list): noise_parser = argparse.ArgumentParser() noise_parser.add_argument('--noise-id', type=str, required=True, help='noise id') noise_parser.add_argument('--noise-type', type=str, required=True, help='the type of noise; i.e. isotropic or point-source', choices = ["isotropic", "point-source"]) noise_parser.add_argument('--bg-fg-type', type=str, default="background", help='background or foreground noise', choices = ["background", "foreground"]) - noise_parser.add_argument('--rir-id', type=str, default=None, help='required if isotropic, should not be specified if point-source') + noise_parser.add_argument('--rir-linkage', type=str, action='append', default=None, help='required if isotropic, should not be specified if point-source, this option can be repeatly added to define multiple noise-rir association, the rir linkage can either be a RIR id or a RIR file path') noise_parser.add_argument('--probability', type=float, default=None, help='probability of the noise.') noise_parser.add_argument('noise_file_location', type=str, help='noise file location') @@ -442,9 +456,18 @@ def ParseNoiseList(noise_list_file): for line in noise_lines: noise = noise_parser.parse_args(line.split()) if noise.noise_type == "isotropic": - if noise.rir_id is None: - raise Exception("--rir-id must be specified if 
--noise-type is isotropic") + if noise.rir_linkage is None: + raise Exception("--rir-linkage must be specified if --noise-type is isotropic") else: + for r in range(0, len(noise.rir_linkage)): + if not noise.rir_linkage[r].isdigit(): + # this is a RIR IO string, validate if it exist in the input rir_list and return the RIR id + result = ValidateRirIO(noise.rir_linkage[r], rir_list) + if result == "Not found": + raise Exception("RIR {0} specified by isotropic noise {1} not found".format(noise.rir_linkage[r], noise.noise_id)) + else: + noise.rir_linkage[r] = result + iso_noise_list.append(noise) else: pointsource_noise_list.append(noise) @@ -459,7 +482,7 @@ def Main(): rir_list = ParseRirList(args.rir_list_file) noise_list = [] if args.noise_list_file is not None: - pointsource_noise_list, iso_noise_list = ParseNoiseList(args.noise_list_file) + pointsource_noise_list, iso_noise_list = ParseNoiseList(args.noise_list_file, rir_list) print("Number of point-source noises is {0}".format(len(pointsource_noise_list))) print("Number of isotropic noises is {0}".format(len(iso_noise_list))) room_dict = MakeRoomDict(rir_list) diff --git a/src/feat/signal.cc b/src/feat/signal.cc index 12a9a710092..a206d399804 100644 --- a/src/feat/signal.cc +++ b/src/feat/signal.cc @@ -35,7 +35,7 @@ void ConvolveSignals(const Vector &filter, Vector *signal) int32 signal_length = signal->Dim(); int32 filter_length = filter.Dim(); int32 output_length = signal_length + filter_length - 1; - Vector signal_padded(output_length); + Vector signal_padded(output_length); signal_padded.SetZero(); for (int32 i = 0; i < signal_length; i++) { for (int32 j = 0; j < filter_length; j++) { @@ -57,11 +57,11 @@ void FFTbasedConvolveSignals(const Vector &filter, Vector SplitRadixRealFft srfft(fft_length); - Vector filter_padded(fft_length); + Vector filter_padded(fft_length); filter_padded.Range(0, filter_length).CopyFromVec(filter); srfft.Compute(filter_padded.Data(), true); - Vector signal_padded(fft_length); + 
Vector signal_padded(fft_length); signal_padded.Range(0, signal_length).CopyFromVec(*signal); srfft.Compute(signal_padded.Data(), true); @@ -89,13 +89,13 @@ void FFTbasedBlockConvolveSignals(const Vector &filter, Vector srfft(fft_length); - Vector filter_padded(fft_length); + Vector filter_padded(fft_length); filter_padded.Range(0, filter_length).CopyFromVec(filter); srfft.Compute(filter_padded.Data(), true); - Vector temp_pad(filter_length - 1); + Vector temp_pad(filter_length - 1); temp_pad.SetZero(); - Vector signal_block_padded(fft_length); + Vector signal_block_padded(fft_length); for (int32 po = 0; po < output_length; po += block_length) { // get a block of the signal From 93a2295552fd59ccce36c4074c36653bbc2dc332 Mon Sep 17 00:00:00 2001 From: Tom Ko Date: Wed, 27 Jul 2016 00:02:35 -0400 Subject: [PATCH 12/14] Changing isotropic noise linkage to a room instead of a particular rir; Support using string as room id --- .../aspire_prep_rir_noise_list.py | 28 +++++--- .../local/multi_condition/run_nnet2_common.sh | 2 +- egs/wsj/s5/steps/data/reverberate_data_dir.py | 72 ++++++++++--------- 3 files changed, 57 insertions(+), 45 deletions(-) diff --git a/egs/aspire/s5/local/multi_condition/aspire_prep_rir_noise_list.py b/egs/aspire/s5/local/multi_condition/aspire_prep_rir_noise_list.py index 9dd7a38d183..c07eed60d10 100755 --- a/egs/aspire/s5/local/multi_condition/aspire_prep_rir_noise_list.py +++ b/egs/aspire/s5/local/multi_condition/aspire_prep_rir_noise_list.py @@ -23,19 +23,31 @@ def GetArgs(): return args -# This function generate the rir_list file for the aspire real RIR +# This function generates the rir_list file for the real RIRs being in ASpIRE experiments. 
+# It assumes the availability of data/impulses_noises directory prepared by local/multi_condition/prepare_impulses_noises.sh def GenerateRirListFile(input_dir, output_dir): rir_list_file = open(output_dir + "/rir_list", 'w') rir_id = 1 - room_id = 1 for db in ["RVB2014", "RWCP", "air"]: rir_files = glob.glob(input_dir + "/{0}_*.wav".format(db)) + rir_files.sort() for rir in rir_files: filename = rir.split('/')[-1] if "noise" not in filename: - rir_list_file.write('--rir-id {0} --room-id {1} {2}\n'.format(str(rir_id).zfill(5), str(room_id).zfill(3), rir)) + parts = filename.split('_') + db_name = parts[0] + type_num = parts[1] + if db == "RVB2014": + noise_pattern = parts[3] + elif db == "RWCP" and len(parts) == 4: + noise_pattern = parts[3] + else: + noise_pattern = '_'.join(parts[3:len(parts)-1]) + + # We use the string as the room id + room_id = db_name + "_" + noise_pattern + rir_list_file.write('--rir-id {0} --room-id {1} {2}\n'.format(str(rir_id).zfill(5), room_id, rir)) rir_id += 1 - room_id += 1 rir_list_file.close() @@ -43,18 +55,16 @@ def GenerateRirListFile(input_dir, output_dir): def GenerateNoiseListFile(input_dir, output_dir): noise_list_file = open(output_dir + "/noise_list", 'w') noise_files = glob.glob(input_dir + "/*_type*_noise*.wav") + noise_files.sort() noise_id = 1 for noise_file in noise_files: parts = noise_file.split('/')[-1].split('_') db_name = parts[0] type_num = parts[1] noise_pattern = '_'.join(parts[3:len(parts)-1]) - if db_name == "RWCP": - type_num = "type*" - matched_rir_files = glob.glob(input_dir + "/{0}_{1}_rir_{2}*.wav".format(db_name, type_num, noise_pattern)) noise_line = "--noise-id {0} --noise-type isotropic ".format(str(noise_id).zfill(5)) - for rir in matched_rir_files: - noise_line += "--rir-linkage {0} ".format(rir) + room_id = db_name + "_" + noise_pattern + noise_line += "--room-linkage {0} ".format(room_id) noise_line += "{0}".format(noise_file) noise_list_file.write("{0}\n".format(noise_line)) noise_id += 1 diff 
--git a/egs/aspire/s5/local/multi_condition/run_nnet2_common.sh b/egs/aspire/s5/local/multi_condition/run_nnet2_common.sh index 15bb922726c..78942c053f3 100755 --- a/egs/aspire/s5/local/multi_condition/run_nnet2_common.sh +++ b/egs/aspire/s5/local/multi_condition/run_nnet2_common.sh @@ -6,7 +6,6 @@ . cmd.sh stage=1 -snrs="20:10:15:5:0" foreground_snrs="20:10:15:5:0" background_snrs="20:10:15:5:0" num_data_reps=3 @@ -34,6 +33,7 @@ if [ $stage -le 1 ]; then data/impulses_noises || exit 1; # Generate the rir_list and noise_list for the reverberate_data_dir.py to corrupt the data + # this script just assumes air rwcp rvb2014 databases python local/multi_condition/aspire_prep_rir_noise_list.py data/impulses_noises data/impulses_noises/info # corrupt the fisher data to generate multi-condition data diff --git a/egs/wsj/s5/steps/data/reverberate_data_dir.py b/egs/wsj/s5/steps/data/reverberate_data_dir.py index 52b07e669b9..8c25a8211ab 100755 --- a/egs/wsj/s5/steps/data/reverberate_data_dir.py +++ b/egs/wsj/s5/steps/data/reverberate_data_dir.py @@ -137,19 +137,33 @@ def WriteDictToFile(dict, file_name): value = list(value) value.sort() value = ' '.join(str(value)) - file.write('{0}\t{1}\n'.format(key, value)) + file.write('{0} {1}\n'.format(key, value)) file.close() -# This function returns only the isotropic noises according to the specified RIR id +# This function creates the utt2uniq file from the utterance id in utt2spk file +def CreateCorruptedUtt2uniq(input_dir, output_dir, num_replicas, prefix): + corrupted_utt2uniq = {} + # Parse the utt2spk to get the utterance id + utt2spk = ParseFileToDict(input_dir + "/utt2spk", value_processor = lambda x: " ".join(x)) + keys = utt2spk.keys() + keys.sort() + for i in range(1, num_replicas+1): + for utt_id in keys: + new_utt_id = GetNewId(utt_id, prefix, i) + corrupted_utt2uniq[new_utt_id] = utt_id + + WriteDictToFile(corrupted_utt2uniq, output_dir + "/utt2uniq") + + +# This function returns only the isotropic noises according 
to the specified room # Please refer to ParseNoiseList() for the format of iso_noise_list -def FilterIsotropicNoiseList(iso_noise_list, rir_id): +def FilterIsotropicNoiseList(iso_noise_list, room_id): filtered_list = [] for noise in iso_noise_list: - for id in noise.rir_linkage: - if id == rir_id: - filtered_list.append(noise) - break + if noise.room_linkage == room_id: + filtered_list.append(noise) + break return filtered_list @@ -206,7 +220,7 @@ def GenerateReverberationOpts(room_dict, # the room dictionary, please refer to # pick the RIR to reverberate the speech reverberate_opts += "--impulse-response={0} ".format(speech_rir.rir_file_location) - rir_iso_noise_list = FilterIsotropicNoiseList(iso_noise_list, speech_rir.rir_id) + rir_iso_noise_list = FilterIsotropicNoiseList(iso_noise_list, speech_rir.room_id) # Add the corresponding isotropic noise associated with the selected RIR if len(rir_iso_noise_list) > 0 and random.random() < isotropic_noise_addition_probability: isotropic_noise = PickItemWithProbability(rir_iso_noise_list) @@ -267,9 +281,9 @@ def GenerateReverberatedWavScp(wav_scp, # a dictionary whose values are the Kal foreground_snrs = list_cyclic_iterator(foreground_snr_array) background_snrs = list_cyclic_iterator(background_snr_array) corrupted_wav_scp = {} + keys = wav_scp.keys() + keys.sort() for i in range(1, num_replicas+1): - keys = wav_scp.keys() - keys.sort() for recording_id in keys: wav_original_pipe = wav_scp[recording_id] # check if it is really a pipe @@ -350,6 +364,13 @@ def CreateReverberatedCopy(input_dir, data_lib.RunKaldiCommand("utils/utt2spk_to_spk2utt.pl <{output_dir}/utt2spk >{output_dir}/spk2utt" .format(output_dir = output_dir)) + if os.path.isfile(input_dir + "/utt2uniq"): + AddPrefixToFields(input_dir + "/utt2uniq", output_dir + "/utt2uniq", num_replicas, prefix, field =[0]) + else: + # Create the utt2uniq file + CreateCorruptedUtt2uniq(input_dir, output_dir, num_replicas, prefix) + + if os.path.isfile(input_dir + "/text"): 
AddPrefixToFields(input_dir + "/text", output_dir + "/text", num_replicas, prefix, field =[0]) if os.path.isfile(input_dir + "/segments"): @@ -426,27 +447,17 @@ def MakeRoomDict(rir_list): return room_dict -# This function check if the RIR IO string is listed in the input rir_list file -# It returns the RIR id if the io string is found -def ValidateRirIO(rir_io_str, rir_list): - for rir in rir_list: - if rir_io_str == rir.rir_file_location: - return rir.rir_id - - return "Not found" - - # This function creates the point-source noise list # and the isotropic noise list from the noise information file # Each noise item in the list contains the following attributes: -# noise_id, noise_type, bg_fg_type, rir_linkage, probability, noise_file_location +# noise_id, noise_type, bg_fg_type, room_linkage, probability, noise_file_location # Please refer to the help messages in the parser for the meaning of these attributes -def ParseNoiseList(noise_list_file, rir_list): +def ParseNoiseList(noise_list_file): noise_parser = argparse.ArgumentParser() noise_parser.add_argument('--noise-id', type=str, required=True, help='noise id') noise_parser.add_argument('--noise-type', type=str, required=True, help='the type of noise; i.e. 
isotropic or point-source', choices = ["isotropic", "point-source"]) noise_parser.add_argument('--bg-fg-type', type=str, default="background", help='background or foreground noise', choices = ["background", "foreground"]) - noise_parser.add_argument('--rir-linkage', type=str, action='append', default=None, help='required if isotropic, should not be specified if point-source, this option can be repeatly added to define multiple noise-rir association, the rir linkage can either be a RIR id or a RIR file path') + noise_parser.add_argument('--room-linkage', type=str, default=None, help='required if isotropic, should not be specified if point-source.') noise_parser.add_argument('--probability', type=float, default=None, help='probability of the noise.') noise_parser.add_argument('noise_file_location', type=str, help='noise file location') @@ -456,18 +467,9 @@ def ParseNoiseList(noise_list_file, rir_list): for line in noise_lines: noise = noise_parser.parse_args(line.split()) if noise.noise_type == "isotropic": - if noise.rir_linkage is None: - raise Exception("--rir-linkage must be specified if --noise-type is isotropic") + if noise.room_linkage is None: + raise Exception("--room-linkage must be specified if --noise-type is isotropic") else: - for r in range(0, len(noise.rir_linkage)): - if not noise.rir_linkage[r].isdigit(): - # this is a RIR IO string, validate if it exist in the input rir_list and return the RIR id - result = ValidateRirIO(noise.rir_linkage[r], rir_list) - if result == "Not found": - raise Exception("RIR {0} specified by isotropic noise {1} not found".format(noise.rir_linkage[r], noise.noise_id)) - else: - noise.rir_linkage[r] = result - iso_noise_list.append(noise) else: pointsource_noise_list.append(noise) @@ -482,7 +484,7 @@ def Main(): rir_list = ParseRirList(args.rir_list_file) noise_list = [] if args.noise_list_file is not None: - pointsource_noise_list, iso_noise_list = ParseNoiseList(args.noise_list_file, rir_list) + pointsource_noise_list, 
iso_noise_list = ParseNoiseList(args.noise_list_file) print("Number of point-source noises is {0}".format(len(pointsource_noise_list))) print("Number of isotropic noises is {0}".format(len(iso_noise_list))) room_dict = MakeRoomDict(rir_list) From cbe576282f472c8337aeb4c56ef2601fb0d8a25f Mon Sep 17 00:00:00 2001 From: Tom Ko Date: Wed, 27 Jul 2016 10:47:14 -0400 Subject: [PATCH 13/14] Change comments in wav-reverberate.cc --- src/featbin/wav-reverberate.cc | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/featbin/wav-reverberate.cc b/src/featbin/wav-reverberate.cc index 683b8be6177..c19bc21cd84 100644 --- a/src/featbin/wav-reverberate.cc +++ b/src/featbin/wav-reverberate.cc @@ -197,8 +197,9 @@ int main(int argc, char *argv[]) { "If nonzero, it specified the duration (secs) of the output " "signal. If the duration t is less than the length of the " "input signal, the first t secs of the signal is trimed, " - "otherwise, the signal will be repeated to" - "fulfill the duration specified."); + "otherwise, the signal will be repeated to " + "fulfill the duration specified. This option is useful for " + "extending the length of isotropic noises."); po.Register("volume", &volume, "If nonzero, a scaling factor on the signal that is applied " "after reverberating and possibly adding noise. " From d34f5971e729919a6b406b41d977a89d31a0b087 Mon Sep 17 00:00:00 2001 From: Tom Ko Date: Sun, 17 Apr 2016 12:27:15 -0400 Subject: [PATCH 14/14] A new steps/data/reverberate_data_dir.py script update function names; split snrs to background and foreground; user specified random seed; always handle isotropic noise as background noise Pick the RIRs and noises according to assigned probabilities. 
Modify wav-reverberate.cc according to the new steps/data/reverberate_data_dir.py Change the functions in signal.cc to extend the length of the convolved signal, the correct length should be original signal length + rir length - 1; add the shift option to wav-reverberate.cc Adding more comments and remove duplicate function in reverberate_data_dir.py Change option --max-noises-added to --max-noises-per-minute Adding data_lib.py; adding more comments, splitting large function in reverberate_data_dir.py adding AddPointSourceNoise() Fixing spelling mistake and modifying comments Modify the aspire recipe to use the new reverberate_data_dir.py; fixing bugs in reverberate_data_dir.py; add aspire_prep_rir_noise_list.py for generating rir_list and noise_list for aspire Changing isotropic noise linkage to a room instead of a particular rir; Support using string as room id Change comments in wav-reverberate.cc --- .../aspire_prep_rir_noise_list.py | 89 +++ .../local/multi_condition/run_nnet2_common.sh | 31 +- .../steps/data/data_dir_manipulation_lib.py | 18 + egs/wsj/s5/steps/data/reverberate_data_dir.py | 508 ++++++++++++++++++ src/feat/signal.cc | 28 +- src/feat/signal.h | 7 + src/featbin/wav-reverberate.cc | 260 ++++++--- 7 files changed, 843 insertions(+), 98 deletions(-) create mode 100755 egs/aspire/s5/local/multi_condition/aspire_prep_rir_noise_list.py create mode 100644 egs/wsj/s5/steps/data/data_dir_manipulation_lib.py create mode 100755 egs/wsj/s5/steps/data/reverberate_data_dir.py diff --git a/egs/aspire/s5/local/multi_condition/aspire_prep_rir_noise_list.py b/egs/aspire/s5/local/multi_condition/aspire_prep_rir_noise_list.py new file mode 100755 index 00000000000..c07eed60d10 --- /dev/null +++ b/egs/aspire/s5/local/multi_condition/aspire_prep_rir_noise_list.py @@ -0,0 +1,89 @@ +#!/usr/bin/env python +# Copyright 2016 Tom Ko +# Apache 2.0 +# script to generate rir_list and noise_list in aspire + +# we're using python 3.x style print but want it to work in python 
2.x, +from __future__ import print_function +import argparse, glob, math, os, sys + + +def GetArgs(): + parser = argparse.ArgumentParser(description="Prepare rir_list and noise_list for Aspire " + "Usage: reverberate_data_dir.py [options...] " + "E.g. reverberate_data_dir.py " + "data/impulses_noises data/impulses_noises/info", + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + + parser.add_argument("input_dir", help="Input data directory") + parser.add_argument("output_dir", help="Output data directory") + print(' '.join(sys.argv)) + args = parser.parse_args() + + return args + + +# This function generates the rir_list file for the real RIRs being in ASpIRE experiments. +# It assumes the availability of data/impulses_noises directory prepared by local/multi_condition/prepare_impulses_noises.sh +def GenerateRirListFile(input_dir, output_dir): + rir_list_file = open(output_dir + "/rir_list", 'w') + rir_id = 1 + for db in ["RVB2014", "RWCP", "air"]: + rir_files = glob.glob(input_dir + "/{0}_*.wav".format(db)) + rir_files.sort() + for rir in rir_files: + filename = rir.split('/')[-1] + if "noise" not in filename: + parts = filename.split('_') + db_name = parts[0] + type_num = parts[1] + if db == "RVB2014": + noise_pattern = parts[3] + elif db == "RWCP" and len(parts) == 4: + noise_pattern = parts[3] + else: + noise_pattern = '_'.join(parts[3:len(parts)-1]) + + # We use the string as the room id + room_id = db_name + "_" + noise_pattern + rir_list_file.write('--rir-id {0} --room-id {1} {2}\n'.format(str(rir_id).zfill(5), room_id, rir)) + rir_id += 1 + rir_list_file.close() + + +# This function generate the noise_list file from the aspire noise-rir pair +def GenerateNoiseListFile(input_dir, output_dir): + noise_list_file = open(output_dir + "/noise_list", 'w') + noise_files = glob.glob(input_dir + "/*_type*_noise*.wav") + noise_files.sort() + noise_id = 1 + for noise_file in noise_files: + parts = noise_file.split('/')[-1].split('_') + db_name = parts[0] + 
type_num = parts[1] + noise_pattern = '_'.join(parts[3:len(parts)-1]) + noise_line = "--noise-id {0} --noise-type isotropic ".format(str(noise_id).zfill(5)) + room_id = db_name + "_" + noise_pattern + noise_line += "--room-linkage {0} ".format(room_id) + noise_line += "{0}".format(noise_file) + noise_list_file.write("{0}\n".format(noise_line)) + noise_id += 1 + noise_list_file.close() + + +def Main(): + args = GetArgs() + + if not os.path.exists(args.output_dir): + os.makedirs(args.output_dir) + + # generating the rir_list file for the new steps/data/reverberate_data_dir.py + GenerateRirListFile(args.input_dir, args.output_dir) + + # generating the noise_list file for the new steps/data/reverberate_data_dir.py + GenerateNoiseListFile(args.input_dir, args.output_dir) + + +if __name__ == "__main__": + Main() + diff --git a/egs/aspire/s5/local/multi_condition/run_nnet2_common.sh b/egs/aspire/s5/local/multi_condition/run_nnet2_common.sh index 5b6424a1d86..78942c053f3 100755 --- a/egs/aspire/s5/local/multi_condition/run_nnet2_common.sh +++ b/egs/aspire/s5/local/multi_condition/run_nnet2_common.sh @@ -6,7 +6,8 @@ . 
cmd.sh stage=1 -snrs="20:10:15:5:0" +foreground_snrs="20:10:15:5:0" +background_snrs="20:10:15:5:0" num_data_reps=3 ali_dir=exp/ db_string="'air' 'rwcp' 'rvb2014'" # RIR dbs to be used in the experiment @@ -31,24 +32,30 @@ if [ $stage -le 1 ]; then --RIR-home $RIR_home \ data/impulses_noises || exit 1; + # Generate the rir_list and noise_list for the reverberate_data_dir.py to corrupt the data + # this script just assumes air rwcp rvb2014 databases + python local/multi_condition/aspire_prep_rir_noise_list.py data/impulses_noises data/impulses_noises/info + # corrupt the fisher data to generate multi-condition data - # for data_dir in train dev test; do for data_dir in train dev test; do if [ "$data_dir" == "train" ]; then num_reps=$num_data_reps else num_reps=1 fi - reverb_data_dirs= - for i in `seq 1 $num_reps`; do - cur_dest_dir=" data/temp_${data_dir}_${i}" - local/multi_condition/reverberate_data_dir.sh --random-seed $i \ - --snrs "$snrs" --log-dir exp/make_corrupted_wav \ - data/${data_dir} data/impulses_noises $cur_dest_dir - reverb_data_dirs+=" $cur_dest_dir" - done - utils/combine_data.sh --extra-files utt2uniq data/${data_dir}_rvb $reverb_data_dirs - rm -rf $reverb_data_dirs + python steps/data/reverberate_data_dir.py \ + --prefix "rev" \ + --rir-list-file data/impulses_noises/info/rir_list \ + --noise-list-file data/impulses_noises/info/noise_list \ + --foreground-snrs $foreground_snrs \ + --background-snrs $background_snrs \ + --speech-rvb-probability 1 \ + --pointsource-noise-addition-probability 1 \ + --isotropic-noise-addition-probability 1 \ + --num-replications $num_reps \ + --max-noises-per-minute 1 \ + --random-seed 1 \ + data/${data_dir} data/${data_dir}_rvb done # create the dev, test and eval sets from the aspire recipe diff --git a/egs/wsj/s5/steps/data/data_dir_manipulation_lib.py b/egs/wsj/s5/steps/data/data_dir_manipulation_lib.py new file mode 100644 index 00000000000..1f7253d4891 --- /dev/null +++ 
b/egs/wsj/s5/steps/data/data_dir_manipulation_lib.py @@ -0,0 +1,18 @@ +import subprocess + +def RunKaldiCommand(command, wait = True): + """ Runs commands frequently seen in Kaldi scripts. These are usually a + sequence of commands connected by pipes, so we use shell=True """ + #logger.info("Running the command\n{0}".format(command)) + p = subprocess.Popen(command, shell = True, + stdout = subprocess.PIPE, + stderr = subprocess.PIPE) + + if wait: + [stdout, stderr] = p.communicate() + if p.returncode is not 0: + raise Exception("There was an error while running the command {0}\n".format(command)+"-"*10+"\n"+stderr) + return stdout, stderr + else: + return p + diff --git a/egs/wsj/s5/steps/data/reverberate_data_dir.py b/egs/wsj/s5/steps/data/reverberate_data_dir.py new file mode 100755 index 00000000000..8c25a8211ab --- /dev/null +++ b/egs/wsj/s5/steps/data/reverberate_data_dir.py @@ -0,0 +1,508 @@ +#!/usr/bin/env python +# Copyright 2016 Tom Ko +# Apache 2.0 +# script to generate reverberated data + +# we're using python 3.x style print but want it to work in python 2.x, +from __future__ import print_function +import argparse, glob, math, os, random, sys, warnings, copy, imp, ast + +data_lib = imp.load_source('dml', 'steps/data/data_dir_manipulation_lib.py') + +def GetArgs(): + # we add required arguments as named arguments for readability + parser = argparse.ArgumentParser(description="Reverberate the data directory with an option " + "to add isotropic and point source noises. " + "Usage: reverberate_data_dir.py [options...] " + "E.g. 
reverberate_data_dir.py --rir-list-file rir_list " + "--foreground-snrs 20:10:15:5:0 --background-snrs 20:10:15:5:0 " + "--noise-list-file noise_list --speech-rvb-probability 1 --num-replications 2 " + "--random-seed 1 data/train data/train_rvb", + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + + parser.add_argument("--rir-list-file", type=str, required = True, + help="RIR information file, the format of the file is " + "--rir-id --room-id " + "--receiver-position-id --source-position-id " + "--rt-60 --drr " + "E.g. --rir-id 00001 --room-id 001 --receiver-position-id 001 --source-position-id 00001 " + "--rt60 0.58 --drr -4.885 data/impulses/Room001-00001.wav") + parser.add_argument("--noise-list-file", type=str, default = None, + help="Noise information file, the format of the file is" + "--noise-id --noise-type " + "--bg-fg-type " + "--rir-file " + " " + "E.g. --noise-id 001 --noise-type isotropic --rir-id 00019 iso_noise.wav") + parser.add_argument("--num-replications", type=int, dest = "num_replicas", default = 1, + help="Number of replicate to generated for the data") + parser.add_argument('--foreground-snrs', type=str, dest = "foreground_snr_string", default = '20:10:0', help='When foreground noises are being added the script will iterate through these SNRs.') + parser.add_argument('--background-snrs', type=str, dest = "background_snr_string", default = '20:10:0', help='When background noises are being added the script will iterate through these SNRs.') + parser.add_argument('--prefix', type=str, default = None, help='This prefix will modified for each reverberated copy, by adding additional affixes.') + parser.add_argument("--speech-rvb-probability", type=float, default = 1.0, + help="Probability of reverberating a speech signal, e.g. 0 <= p <= 1") + parser.add_argument("--pointsource-noise-addition-probability", type=float, default = 1.0, + help="Probability of adding point-source noises, e.g. 
0 <= p <= 1") + parser.add_argument("--isotropic-noise-addition-probability", type=float, default = 1.0, + help="Probability of adding isotropic noises, e.g. 0 <= p <= 1") + parser.add_argument("--max-noises-per-minute", type=int, default = 2, + help="This controls the maximum number of point-source noises that could be added to a recording according to its duration") + parser.add_argument('--random-seed', type=int, default=0, help='seed to be used in the randomization of impulses and noises') + parser.add_argument("input_dir", + help="Input data directory") + parser.add_argument("output_dir", + help="Output data directory") + + print(' '.join(sys.argv)) + + args = parser.parse_args() + args = CheckArgs(args) + + return args + +def CheckArgs(args): + if not os.path.exists(args.output_dir): + os.makedirs(args.output_dir) + + ## Check arguments. + if not os.path.isfile(args.rir_list_file): + raise Exception(args.rir_list_file + " not found") + + if args.noise_list_file is not None: + if not os.path.isfile(args.noise_list_file): + raise Exception(args.noise_list_file + " not found") + + if args.num_replicas > 1 and args.prefix is None: + args.prefix = "rvb" + warnings.warn("--prefix is set to 'rvb' as --num-replications is larger than 1.") + + return args + + +class list_cyclic_iterator: + def __init__(self, list): + self.list_index = 0 + self.list = list + random.shuffle(self.list) + + def next(self): + item = self.list[self.list_index] + self.list_index = (self.list_index + 1) % len(self.list) + return item + + +# This functions picks an item from the collection according to the associated probability distribution. +# The probability estimate of each item in the collection is stored in the "probability" field of +# the particular item. 
# x : a collection (list or dictionary) where the values contain a field called probability
# Returns the selected item; raises ValueError if the collection is empty.
def PickItemWithProbability(x):
    if isinstance(x, dict):
        # Visit the values in sorted-key order rather than in set order, so that
        # the outcome is reproducible for a given random seed. A value stored
        # under several keys is still considered only once.
        plist = []
        seen_ids = set()
        for key in sorted(x):
            item = x[key]
            if id(item) not in seen_ids:
                seen_ids.add(id(item))
                plist.append(item)
    else:
        plist = x
    if len(plist) == 0:
        raise ValueError("Cannot pick an item from an empty collection")

    total_p = sum(item.probability for item in plist)
    p = random.uniform(0, total_p)
    accumulate_p = 0
    for item in plist:
        if accumulate_p + item.probability >= p:
            return item
        accumulate_p += item.probability
    # Rounding errors in the running sum can leave it marginally below p;
    # in that case the last item is the one that should have been selected.
    return plist[-1]


# This function parses a file and packs the data into a dictionary
# It is useful for parsing files like wav.scp, utt2spk, text...etc
# file            : path of the file to read; the first field of each line is the key
# assert2fields   : if True, assert that every non-empty line has exactly two fields
# value_processor : function mapping the list of remaining fields to the stored value,
#                   by default the first remaining field is stored
def ParseFileToDict(file, assert2fields = False, value_processor = None):
    if value_processor is None:
        value_processor = lambda x: x[0]

    parsed = {}
    with open(file, 'r') as f:
        for line in f:
            parts = line.split()
            if len(parts) == 0:
                # an empty line has no key, skip it
                continue
            if assert2fields:
                assert(len(parts) == 2)

            parsed[parts[0]] = value_processor(parts[1:])
    return parsed


# This function creates a file and writes the content of a dictionary into it
# Each line has the form "<key> <value>" and the lines are sorted by key.
# If a value is a list or a tuple, its elements are sorted and joined by spaces.
def WriteDictToFile(dict, file_name):
    with open(file_name, 'w') as file:
        for key in sorted(dict):
            value = dict[key]
            if isinstance(value, (list, tuple)):
                # join the elements themselves, not the characters of str(value);
                # sorted() also leaves the caller's list untouched
                value = ' '.join(str(element) for element in sorted(value))
            file.write('{0} {1}\n'.format(key, value))


# This function creates the utt2uniq file from the utterance id in utt2spk file
# Every replicated utterance id is mapped back to the original utterance id.
def CreateCorruptedUtt2uniq(input_dir, output_dir, num_replicas, prefix):
    corrupted_utt2uniq = {}
    # Parse the utt2spk to get the utterance id
    utt2spk = ParseFileToDict(input_dir + "/utt2spk", value_processor = lambda x: " ".join(x))
    keys = sorted(utt2spk)
    for i in range(1, num_replicas+1):
        for utt_id in keys:
            new_utt_id = GetNewId(utt_id, prefix, i)
            corrupted_utt2uniq[new_utt_id] = utt_id

    WriteDictToFile(corrupted_utt2uniq, output_dir + "/utt2uniq")


# This function returns only the isotropic
# noises according to the specified room
# Please refer to ParseNoiseList() for the format of iso_noise_list
def FilterIsotropicNoiseList(iso_noise_list, room_id):
    # Keep every noise linked to the room (not only the first match), so that
    # the caller can choose among them according to their probabilities.
    return [noise for noise in iso_noise_list if noise.room_linkage == room_id]


# This function randomly adds point-source noises to the noise descriptor.
# Each added noise is reverberated with an RIR picked from the selected room.
# It returns the (updated) noise_addition_descriptor.
def AddPointSourceNoise(noise_addition_descriptor,  # descriptor to store the information of the noise added
                        room,  # the room selected
                        pointsource_noise_list,  # the point source noise list
                        pointsource_noise_addition_probability,  # Probability of adding point-source noises
                        foreground_snrs,  # the SNR for adding the foreground noises
                        background_snrs,  # the SNR for adding the background noises
                        speech_dur,  # duration of the recording
                        max_noises_recording  # Maximum number of point-source noises that can be added
                        ):
    # the caller may pass the result of math.floor(), which is a float in python 2.x
    max_noises_recording = int(max_noises_recording)
    # ">= 1" : a recording that is allowed exactly one noise must still be able to get it
    if (len(pointsource_noise_list) > 0
            and random.random() < pointsource_noise_addition_probability
            and max_noises_recording >= 1):
        for k in range(random.randint(1, max_noises_recording)):
            # pick the RIR to reverberate the point-source noise
            noise = PickItemWithProbability(pointsource_noise_list)
            noise_rir = PickItemWithProbability(room.rir_list)
            # If it is a background noise, the noise will be extended and be added to the whole speech
            # if it is a foreground noise, the noise will not be extended and be added at a random time of the speech
            if noise.bg_fg_type == "background":
                noise_addition_descriptor['noise_io'].append("wav-reverberate --duration={2} --impulse-response={1} {0} - |".format(noise.noise_file_location, noise_rir.rir_file_location, speech_dur))
                noise_addition_descriptor['start_times'].append(0)
                noise_addition_descriptor['snrs'].append(background_snrs.next())
            else:
                noise_addition_descriptor['noise_io'].append("wav-reverberate --impulse-response={1} {0} - |".format(noise.noise_file_location, noise_rir.rir_file_location))
                noise_addition_descriptor['start_times'].append(round(random.random() * speech_dur, 2))
                noise_addition_descriptor['snrs'].append(foreground_snrs.next())

    return noise_addition_descriptor


# This function randomly decides the room, the RIR and the noises for one recording
# and returns the corresponding option string for wav-reverberate.
# The returned string is empty if neither reverberation nor noise is applied.
def GenerateReverberationOpts(room_dict,  # the room dictionary, please refer to MakeRoomDict() for the format
                              pointsource_noise_list,  # the point source noise list
                              iso_noise_list,  # the isotropic noise list
                              foreground_snrs,  # the SNR for adding the foreground noises
                              background_snrs,  # the SNR for adding the background noises
                              speech_rvb_probability,  # Probability of reverberating a speech signal
                              isotropic_noise_addition_probability,  # Probability of adding isotropic noises
                              pointsource_noise_addition_probability,  # Probability of adding point-source noises
                              speech_dur,  # duration of the recording
                              max_noises_recording  # Maximum number of point-source noises that can be added
                              ):
    reverberate_opts = ""
    noise_addition_descriptor = {'noise_io': [],
                                 'start_times': [],
                                 'snrs': []}
    # Randomly select the room
    # Here the room probability is a sum of the probabilities of the RIRs recorded in the room.
    room = PickItemWithProbability(room_dict)
    # Randomly select the RIR in the room
    speech_rir = PickItemWithProbability(room.rir_list)
    if random.random() < speech_rvb_probability:
        # pick the RIR to reverberate the speech
        reverberate_opts += "--impulse-response={0} ".format(speech_rir.rir_file_location)

    rir_iso_noise_list = FilterIsotropicNoiseList(iso_noise_list, speech_rir.room_id)
    # Add the corresponding isotropic noise associated with the selected RIR
    if len(rir_iso_noise_list) > 0 and random.random() < isotropic_noise_addition_probability:
        isotropic_noise = PickItemWithProbability(rir_iso_noise_list)
        # extend the isotropic noise to the length of the speech waveform
        noise_addition_descriptor['noise_io'].append("wav-reverberate --duration={1} {0} - |".format(isotropic_noise.noise_file_location, speech_dur))
        noise_addition_descriptor['start_times'].append(0)
        noise_addition_descriptor['snrs'].append(background_snrs.next())

    noise_addition_descriptor = AddPointSourceNoise(noise_addition_descriptor,  # descriptor to store the information of the noise added
                                                    room,  # the room selected
                                                    pointsource_noise_list,  # the point source noise list
                                                    pointsource_noise_addition_probability,  # Probability of adding point-source noises
                                                    foreground_snrs,  # the SNR for adding the foreground noises
                                                    background_snrs,  # the SNR for adding the background noises
                                                    speech_dur,  # duration of the recording
                                                    max_noises_recording  # Maximum number of point-source noises that can be added
                                                    )

    # the three lists of the descriptor are parallel, one entry per added noise
    assert len(noise_addition_descriptor['noise_io']) == len(noise_addition_descriptor['start_times'])
    assert len(noise_addition_descriptor['noise_io']) == len(noise_addition_descriptor['snrs'])
    if len(noise_addition_descriptor['noise_io']) > 0:
        reverberate_opts += "--additive-signals='{0}' ".format(','.join(noise_addition_descriptor['noise_io']))
        reverberate_opts += "--start-times='{0}' ".format(','.join(str(x) for x in noise_addition_descriptor['start_times']))
        reverberate_opts += "--snrs='{0}' ".format(','.join(str(x) for x in noise_addition_descriptor['snrs']))

    return reverberate_opts


# This function generates a new id from the input id
# This is needed when we have to create multiple copies of the original data
# The new id is "<prefix><copy>_<id>"; the id is returned unchanged if prefix is None
def GetNewId(id, prefix=None, copy=0):
    if prefix is not None:
        new_id = prefix + str(copy) + "_" + id
    else:
        new_id = id

    return new_id


# This is the main function to generate pipeline command for the corruption
# The generic command of wav-reverberate will be like:
# wav-reverberate --duration=t --impulse-response=rir.wav
# --additive-signals='noise1.wav,noise2.wav' --snrs='snr1,snr2' --start-times='s1,s2' input.wav output.wav
def GenerateReverberatedWavScp(wav_scp,  # a dictionary whose values are the Kaldi-IO strings of the speech recordings
                               durations,  # a dictionary whose values are the duration (in sec) of the speech recordings
                               output_dir,  # output directory to write the corrupted wav.scp
                               room_dict,  # the room dictionary, please refer to MakeRoomDict() for the format
                               pointsource_noise_list,  # the point source noise list
                               iso_noise_list,  # the isotropic noise list
                               foreground_snr_array,  # the SNR for adding the foreground noises
                               background_snr_array,  # the SNR for adding the background noises
                               num_replicas,  # Number of replicate to generated for the data
                               prefix,  # prefix for the id of the corrupted utterances
                               speech_rvb_probability,  # Probability of reverberating a speech signal
                               isotropic_noise_addition_probability,  # Probability of adding isotropic noises
                               pointsource_noise_addition_probability,  # Probability of adding point-source noises
                               max_noises_per_minute  # maximum number of point-source noises that can be added to a recording according to its duration
                               ):
    foreground_snrs = list_cyclic_iterator(foreground_snr_array)
    background_snrs = list_cyclic_iterator(background_snr_array)
    corrupted_wav_scp = {}
    # sorted() works on the key view of python 3.x as well as on the key list of python 2.x
    keys = sorted(wav_scp)
    for i in range(1, num_replicas+1):
        for recording_id in keys:
            wav_original_pipe = wav_scp[recording_id]
            # check if it is really a pipe
            if len(wav_original_pipe.split()) == 1:
                wav_original_pipe = "cat {0} |".format(wav_original_pipe)
            speech_dur = durations[recording_id]
            # math.floor() returns a float in python 2.x, the count has to be an integer
            max_noises_recording = int(math.floor(max_noises_per_minute * speech_dur / 60))

            reverberate_opts = GenerateReverberationOpts(room_dict,  # the room dictionary, please refer to MakeRoomDict() for the format
                                                         pointsource_noise_list,  # the point source noise list
                                                         iso_noise_list,  # the isotropic noise list
                                                         foreground_snrs,  # the SNR for adding the foreground noises
                                                         background_snrs,  # the SNR for adding the background noises
                                                         speech_rvb_probability,  # Probability of reverberating a speech signal
                                                         isotropic_noise_addition_probability,  # Probability of adding isotropic noises
                                                         pointsource_noise_addition_probability,  # Probability of adding point-source noises
                                                         speech_dur,  # duration of the recording
                                                         max_noises_recording  # Maximum number of point-source noises that can be added
                                                         )

            if reverberate_opts == "":
                wav_corrupted_pipe = "{0}".format(wav_original_pipe)
            else:
                wav_corrupted_pipe = "{0} wav-reverberate {1} - - |".format(wav_original_pipe, reverberate_opts)

            new_recording_id = GetNewId(recording_id, prefix, i)
            corrupted_wav_scp[new_recording_id] = wav_corrupted_pipe

    WriteDictToFile(corrupted_wav_scp, output_dir + "/wav.scp")


# This function replicates the entries in files like segments, utt2spk, text
# field : the indices of the fields (columns) which should get the new id
# Empty lines and lines starting with ';' are copied without modification.
def AddPrefixToFields(input_file, output_file, num_replicas, prefix, field = (0,)):
    # The lines are stored in a real list as they are traversed once per replica
    with open(input_file) as f_in:
        lines = [line.strip() for line in f_in]
    with open(output_file, "w") as f_out:
        for i in range(1, num_replicas+1):
            for line in lines:
                if len(line) > 0 and line[0] != ';':
                    split1 = line.split()
                    for j in field:
                        split1[j] = GetNewId(split1[j], prefix, i)
                    f_out.write(" ".join(split1) + "\n")
                else:
                    f_out.write(line + "\n")


# This function creates multiple copies of the necessary files, e.g. utt2spk, wav.scp ...
# The recording durations are read from <input_dir>/reco2dur, which is generated
# with wav-to-duration if it does not exist yet.
def CreateReverberatedCopy(input_dir,
                           output_dir,
                           room_dict,  # the room dictionary, please refer to MakeRoomDict() for the format
                           pointsource_noise_list,  # the point source noise list
                           iso_noise_list,  # the isotropic noise list
                           foreground_snr_string,  # the SNR for adding the foreground noises
                           background_snr_string,  # the SNR for adding the background noises
                           num_replicas,  # Number of replicate to generated for the data
                           prefix,  # prefix for the id of the corrupted utterances
                           speech_rvb_probability,  # Probability of reverberating a speech signal
                           isotropic_noise_addition_probability,  # Probability of adding isotropic noises
                           pointsource_noise_addition_probability,  # Probability of adding point-source noises
                           max_noises_per_minute  # maximum number of point-source noises that can be added to a recording according to its duration
                           ):

    if not os.path.isfile(input_dir + "/reco2dur"):
        print("Getting the duration of the recordings...")
        data_lib.RunKaldiCommand("wav-to-duration --read-entire-file=true scp:{0}/wav.scp ark,t:{0}/reco2dur".format(input_dir))
    durations = ParseFileToDict(input_dir + "/reco2dur", value_processor = lambda x: float(x[0]))
    wav_scp = ParseFileToDict(input_dir + "/wav.scp", value_processor = lambda x: " ".join(x))
    # The SNRs have to be real lists (not the iterator returned by map() in python 3.x)
    # as they are shuffled and indexed later on
    foreground_snr_array = [float(x) for x in foreground_snr_string.split(':')]
    background_snr_array = [float(x) for x in background_snr_string.split(':')]

    GenerateReverberatedWavScp(wav_scp, durations, output_dir, room_dict, pointsource_noise_list, iso_noise_list,
                               foreground_snr_array, background_snr_array, num_replicas, prefix,
                               speech_rvb_probability, isotropic_noise_addition_probability,
                               pointsource_noise_addition_probability, max_noises_per_minute)

    AddPrefixToFields(input_dir + "/utt2spk", output_dir + "/utt2spk", num_replicas, prefix, field = [0,1])
    data_lib.RunKaldiCommand("utils/utt2spk_to_spk2utt.pl <{output_dir}/utt2spk >{output_dir}/spk2utt"
                             .format(output_dir = output_dir))

    if os.path.isfile(input_dir + "/utt2uniq"):
        AddPrefixToFields(input_dir + "/utt2uniq", output_dir + "/utt2uniq", num_replicas, prefix, field =[0])
    else:
        # Create the utt2uniq file
        CreateCorruptedUtt2uniq(input_dir, output_dir, num_replicas, prefix)

    if os.path.isfile(input_dir + "/text"):
        AddPrefixToFields(input_dir + "/text", output_dir + "/text", num_replicas, prefix, field =[0])
    if os.path.isfile(input_dir + "/segments"):
        AddPrefixToFields(input_dir + "/segments", output_dir + "/segments", num_replicas, prefix, field = [0,1])
    if os.path.isfile(input_dir + "/reco2file_and_channel"):
        AddPrefixToFields(input_dir + "/reco2file_and_channel", output_dir + "/reco2file_and_channel", num_replicas, prefix, field = [0,1])

    data_lib.RunKaldiCommand("utils/validate_data_dir.sh --no-feats {output_dir}"
                             .format(output_dir = output_dir))


# This function smooths the probability distribution in the list
# Items without a probability get the uniform probability, the other items are
# interpolated with the uniform probability using the smoothing weight.
# The probabilities are normalized afterwards; the (modified) input list is returned.
def SmoothProbabilityDistribution(list, smoothing_weight=0.3):
    if len(list) > 0:
        uniform_probability = 1 / float(len(list))
        for item in list:
            if item.probability is None:
                item.probability = uniform_probability
            else:
                # smooth the probability
                item.probability = (1 - smoothing_weight) * item.probability + smoothing_weight * uniform_probability

        # Normalize the probability
        sum_p = sum(item.probability for item in list)
        for item in list:
            item.probability = item.probability / sum_p

    return list


# This function creates the RIR list
# Each rir item in the list contains the following attributes:
# rir_id, room_id, receiver_position_id, source_position_id, rt60, drr, probability
# Please refer to the help messages in the parser for the meaning of these attributes
def ParseRirList(rir_list_file):
    rir_parser = argparse.ArgumentParser()
    rir_parser.add_argument('--rir-id', type=str, required=True, help='This id is unique for each RIR and the noise may associate with a particular RIR by refering to this id')
    rir_parser.add_argument('--room-id', type=str, required=True, help='This is the room that where the RIR is generated')
    rir_parser.add_argument('--receiver-position-id', type=str, default=None, help='receiver position id')
    rir_parser.add_argument('--source-position-id', type=str, default=None, help='source position id')
    rir_parser.add_argument('--rt60', type=float, default=None, help='RT60 is the time required for reflections of a direct sound to decay 60 dB.')
    rir_parser.add_argument('--drr', type=float, default=None, help='Direct-to-reverberant-ratio of the impulse.')
    rir_parser.add_argument('--probability', type=float, default=None, help='probability of the impulse.')
    rir_parser.add_argument('rir_file_location', type=str, help='rir file location')

    rir_list = []
    with open(rir_list_file) as f:
        for line in f:
            fields = line.split()
            if len(fields) == 0:
                # a blank line does not describe any RIR; parsing it would
                # only make argparse terminate the script
                continue
            rir = rir_parser.parse_args(fields)
            setattr(rir, "iso_noise_list", [])
            rir_list.append(rir)

    return SmoothProbabilityDistribution(rir_list)


# A room, i.e. the RIRs recorded in it and the probability of selecting it
class _Room(object):
    def __init__(self):
        self.rir_list = []
        self.probability = 0


# This function divides the global RIR list into local lists
# according to the room where the RIRs are generated
# It returns the room dictionary indexed by the room id
# Its values are objects with two attributes: a local RIR list
# and the probability of the corresponding room
def MakeRoomDict(rir_list):
    room_dict = {}
    for rir in rir_list:
        if rir.room_id not in room_dict:
            # add new room
            room_dict[rir.room_id] = _Room()
        room_dict[rir.room_id].rir_list.append(rir)

    # the probability of the room is the sum of probabilities of its RIR
    for room in room_dict.values():
        room.probability = sum(rir.probability for rir in room.rir_list)

    return room_dict


# This function creates
the point-source noise list +# and the isotropic noise list from the noise information file +# Each noise item in the list contains the following attributes: +# noise_id, noise_type, bg_fg_type, room_linkage, probability, noise_file_location +# Please refer to the help messages in the parser for the meaning of these attributes +def ParseNoiseList(noise_list_file): + noise_parser = argparse.ArgumentParser() + noise_parser.add_argument('--noise-id', type=str, required=True, help='noise id') + noise_parser.add_argument('--noise-type', type=str, required=True, help='the type of noise; i.e. isotropic or point-source', choices = ["isotropic", "point-source"]) + noise_parser.add_argument('--bg-fg-type', type=str, default="background", help='background or foreground noise', choices = ["background", "foreground"]) + noise_parser.add_argument('--room-linkage', type=str, default=None, help='required if isotropic, should not be specified if point-source.') + noise_parser.add_argument('--probability', type=float, default=None, help='probability of the noise.') + noise_parser.add_argument('noise_file_location', type=str, help='noise file location') + + pointsource_noise_list = [] + iso_noise_list = [] + noise_lines = map(lambda x: x.strip(), open(noise_list_file)) + for line in noise_lines: + noise = noise_parser.parse_args(line.split()) + if noise.noise_type == "isotropic": + if noise.room_linkage is None: + raise Exception("--room-linkage must be specified if --noise-type is isotropic") + else: + iso_noise_list.append(noise) + else: + pointsource_noise_list.append(noise) + + return (SmoothProbabilityDistribution(pointsource_noise_list), + SmoothProbabilityDistribution(iso_noise_list)) + + +def Main(): + args = GetArgs() + random.seed(args.random_seed) + rir_list = ParseRirList(args.rir_list_file) + noise_list = [] + if args.noise_list_file is not None: + pointsource_noise_list, iso_noise_list = ParseNoiseList(args.noise_list_file) + print("Number of point-source noises is 
{0}".format(len(pointsource_noise_list))) + print("Number of isotropic noises is {0}".format(len(iso_noise_list))) + room_dict = MakeRoomDict(rir_list) + + CreateReverberatedCopy(input_dir = args.input_dir, + output_dir = args.output_dir, + room_dict = room_dict, + pointsource_noise_list = pointsource_noise_list, + iso_noise_list = iso_noise_list, + foreground_snr_string = args.foreground_snr_string, + background_snr_string = args.background_snr_string, + num_replicas = args.num_replicas, + prefix = args.prefix, + speech_rvb_probability = args.speech_rvb_probability, + isotropic_noise_addition_probability = args.isotropic_noise_addition_probability, + pointsource_noise_addition_probability = args.pointsource_noise_addition_probability, + max_noises_per_minute = args.max_noises_per_minute) + +if __name__ == "__main__": + Main() + diff --git a/src/feat/signal.cc b/src/feat/signal.cc index e8fbb0b84cf..a206d399804 100644 --- a/src/feat/signal.cc +++ b/src/feat/signal.cc @@ -34,22 +34,25 @@ void ElementwiseProductOfFft(const Vector &a, Vector *b) { void ConvolveSignals(const Vector &filter, Vector *signal) { int32 signal_length = signal->Dim(); int32 filter_length = filter.Dim(); - Vector signal_padded(signal_length + filter_length - 1); + int32 output_length = signal_length + filter_length - 1; + Vector signal_padded(output_length); signal_padded.SetZero(); for (int32 i = 0; i < signal_length; i++) { for (int32 j = 0; j < filter_length; j++) { signal_padded(i + j) += (*signal)(i) * filter(j); } } - signal->CopyFromVec(signal_padded.Range(0, signal_length)); + signal->Resize(output_length); + signal->CopyFromVec(signal_padded); } void FFTbasedConvolveSignals(const Vector &filter, Vector *signal) { int32 signal_length = signal->Dim(); int32 filter_length = filter.Dim(); + int32 output_length = signal_length + filter_length - 1; - int32 fft_length = RoundUpToNearestPowerOfTwo(signal_length + filter_length - 1); + int32 fft_length = 
RoundUpToNearestPowerOfTwo(output_length); KALDI_VLOG(1) << "fft_length for full signal convolution is " << fft_length; SplitRadixRealFft srfft(fft_length); @@ -67,12 +70,15 @@ void FFTbasedConvolveSignals(const Vector &filter, Vector srfft.Compute(signal_padded.Data(), false); signal_padded.Scale(1.0 / fft_length); - signal->CopyFromVec(signal_padded.Range(0, signal_length)); + signal->Resize(output_length); + signal->CopyFromVec(signal_padded.Range(0, output_length)); } void FFTbasedBlockConvolveSignals(const Vector &filter, Vector *signal) { int32 signal_length = signal->Dim(); int32 filter_length = filter.Dim(); + int32 output_length = signal_length + filter_length - 1; + signal->Resize(output_length, kCopyData); KALDI_VLOG(1) << "Length of the filter is " << filter_length; @@ -91,9 +97,9 @@ void FFTbasedBlockConvolveSignals(const Vector &filter, Vector signal_block_padded(fft_length); - for (int32 po = 0; po < signal_length; po += block_length) { + for (int32 po = 0; po < output_length; po += block_length) { // get a block of the signal - int32 process_length = std::min(block_length, signal_length - po); + int32 process_length = std::min(block_length, output_length - po); signal_block_padded.SetZero(); signal_block_padded.Range(0, process_length).CopyFromVec(signal->Range(po, process_length)); @@ -105,17 +111,17 @@ void FFTbasedBlockConvolveSignals(const Vector &filter, VectorRange(po, block_length).CopyFromVec(signal_block_padded.Range(0, block_length)); signal->Range(po, filter_length - 1).AddVec(1.0, temp_pad); temp_pad.CopyFromVec(signal_block_padded.Range(block_length, filter_length - 1)); } else { - signal->Range(po, signal_length - po).CopyFromVec( - signal_block_padded.Range(0, signal_length - po)); - if (filter_length - 1 < signal_length - po) + signal->Range(po, output_length - po).CopyFromVec( + signal_block_padded.Range(0, output_length - po)); + if (filter_length - 1 < output_length - po) signal->Range(po, filter_length - 1).AddVec(1.0, temp_pad); 
else - signal->Range(po, signal_length - po).AddVec(1.0, temp_pad.Range(0, signal_length - po)); + signal->Range(po, output_length - po).AddVec(1.0, temp_pad.Range(0, output_length - po)); } } } diff --git a/src/feat/signal.h b/src/feat/signal.h index 7ff0ce33b52..c6c3eb50530 100644 --- a/src/feat/signal.h +++ b/src/feat/signal.h @@ -25,6 +25,13 @@ namespace kaldi { +/* + The following three functions are having the same functionality but + different implementations so as the efficiency. After the convolution, + the length of the signal will be extended to (original signal length + + filter length - 1). +*/ + /* This function implements a simple non-FFT-based convolution of two signals. It is suggested to use the FFT-based convolution function which is more diff --git a/src/featbin/wav-reverberate.cc b/src/featbin/wav-reverberate.cc index d7599c5ea3d..c19bc21cd84 100644 --- a/src/featbin/wav-reverberate.cc +++ b/src/featbin/wav-reverberate.cc @@ -28,7 +28,8 @@ namespace kaldi { This function is to repeatedly concatenate signal1 by itself to match the length of signal2 and add the two signals together. 
*/ -void AddVectorsOfUnequalLength(const Vector &signal1, Vector *signal2) { +void AddVectorsOfUnequalLength(const VectorBase &signal1, + Vector *signal2) { for (int32 po = 0; po < signal2->Dim(); po += signal1.Dim()) { int32 block_length = signal1.Dim(); if (signal2->Dim() - po < block_length) block_length = signal2->Dim() - po; @@ -36,6 +37,18 @@ void AddVectorsOfUnequalLength(const Vector &signal1, Vector &signal1, int32 offset, + Vector *signal2) { + int32 add_length = std::min(signal2->Dim() - offset, signal1.Dim()); + if (add_length > 0) + signal2->Range(offset, add_length).AddVec(1.0, signal1.Range(0, add_length)); +} + + BaseFloat MaxAbsolute(const Vector &vector) { return std::max(std::abs(vector.Max()), std::abs(vector.Min())); } @@ -71,29 +84,46 @@ BaseFloat ComputeEarlyReverbEnergy(const Vector &rir, const Vector &rir, BaseFloat samp_freq, - BaseFloat snr_db, Vector *noise, +float DoReverberation(const Vector &rir, BaseFloat samp_freq, Vector *signal) { - if (noise->Dim()) { - float input_power = ComputeEarlyReverbEnergy(rir, *signal, samp_freq); - float noise_power = VecVec(*noise, *noise) / noise->Dim(); - float scale_factor = sqrt(pow(10, -snr_db / 10) * input_power / noise_power); - noise->Scale(scale_factor); - KALDI_VLOG(1) << "Noise signal is being scaled with " << scale_factor - << " to generate output with SNR " << snr_db << "db\n"; - } - + float signal_power = ComputeEarlyReverbEnergy(rir, *signal, samp_freq); FFTbasedBlockConvolveSignals(rir, signal); + return signal_power; +} - if (noise->Dim() > 0) { - AddVectorsOfUnequalLength(*noise, signal); +/* + The noise will be scaled before the addition + to match the given signal-to-noise ratio (SNR). 
+*/ +void AddNoise(Vector *noise, BaseFloat snr_db, + BaseFloat time, BaseFloat samp_freq, + BaseFloat signal_power, Vector *signal) { + float noise_power = VecVec(*noise, *noise) / noise->Dim(); + float scale_factor = sqrt(pow(10, -snr_db / 10) * signal_power / noise_power); + noise->Scale(scale_factor); + KALDI_VLOG(1) << "Noise signal is being scaled with " << scale_factor + << " to generate output with SNR " << snr_db << "db\n"; + int32 offset = time * samp_freq; + AddVectorsWithOffset(*noise, offset, signal); +} + +/* + This function converts comma-spearted string into float vector. +*/ +void ReadCommaSeparatedCommand(const std::string &s, + std::vector *v) { + std::vector split_string; + SplitStringToVector(s, ",", true, &split_string); + for (size_t i = 0; i < split_string.size(); i++) { + float ret; + ConvertStringToReal(split_string[i], &ret); + v->push_back(ret); } } } @@ -107,23 +137,36 @@ int main(int argc, char *argv[]) { "room-impulse response (rir_matrix) and additive noise distortions\n" "(specified by corresponding files).\n" "Usage: wav-reverberate [options...] 
" - " \n" + "\n" "e.g.\n" - "wav-reverberate --noise-file=noise.wav \\\n" - " input.wav rir.wav output.wav\n"; + "wav-reverberate --duration=t --impulse-response=rir.wav " + "--additive-signals='noise1.wav,noise2.wav' --snrs='snr1,snr2' " + "--start-times='s1,s2' input.wav output.wav\n"; ParseOptions po(usage); - std::string noise_file; - BaseFloat snr_db = 20; + std::string rir_file; + std::string additive_signals; + std::string snrs; + std::string start_times; bool multi_channel_output = false; + bool shift_output = true; int32 input_channel = 0; int32 rir_channel = 0; int32 noise_channel = 0; bool normalize_output = true; BaseFloat volume = 0; + BaseFloat duration = 0; po.Register("multi-channel-output", &multi_channel_output, "Specifies if the output should be multi-channel or not"); + po.Register("shift-output", &shift_output, + "If true, the reverberated waveform will be shifted by the " + "amount of the peak position of the RIR and the length of " + "the output waveform will be equal to the input waveform." + "If false, the length of the output waveform will be " + "equal to (original input length + rir length - 1). 
" + "This value is true by default and " + "it only affects the output when RIR file is provided."); po.Register("input-wave-channel", &input_channel, "Specifies the channel to be used from input as only a " "single channel will be used to generate reverberated output"); @@ -133,14 +176,30 @@ int main(int argc, char *argv[]) { po.Register("noise-channel", &noise_channel, "Specifies the channel of the noise file, " "it will only be used when multi-channel-output is false"); - po.Register("noise-file", &noise_file, - "File with additive noise"); - po.Register("snr-db", &snr_db, - "Desired SNR(dB) of the output"); + po.Register("impulse-response", &rir_file, + "File with the impulse response for reverberating the input wave"); + po.Register("additive-signals", &additive_signals, + "A comma separated list of additive signals"); + po.Register("snrs", &snrs, + "A comma separated list of SNRs. The additive signals will be " + "scaled according to these SNRs."); + po.Register("start-times", &start_times, + "A comma separated list of start times referring to the " + "input signal. The additive signals will be added to the " + "input signal starting at the offset. If the start time " + "exceed the length of the input signal, the addition will " + "be ignored."); po.Register("normalize-output", &normalize_output, "If true, then after reverberating and " "possibly adding noise, scale so that the signal " "energy is the same as the original input signal."); + po.Register("duration", &duration, + "If nonzero, it specified the duration (secs) of the output " + "signal. If the duration t is less than the length of the " + "input signal, the first t secs of the signal is trimed, " + "otherwise, the signal will be repeated to " + "fulfill the duration specified. This option is useful for " + "extending the length of isotropic noises."); po.Register("volume", &volume, "If nonzero, a scaling factor on the signal that is applied " "after reverberating and possibly adding noise. 
" @@ -148,7 +207,7 @@ int main(int argc, char *argv[]) { "if you had also specified --normalize-output=false."); po.Read(argc, argv); - if (po.NumArgs() != 3) { + if (po.NumArgs() != 2) { po.PrintUsage(); exit(1); } @@ -160,13 +219,14 @@ int main(int argc, char *argv[]) { } std::string input_wave_file = po.GetArg(1); - std::string rir_file = po.GetArg(2); - std::string output_wave_file = po.GetArg(3); + std::string output_wave_file = po.GetArg(2); WaveData input_wave; { + WaveHolder waveholder; Input ki(input_wave_file); - input_wave.Read(ki.Stream()); + waveholder.Read(ki.Stream()); + input_wave = waveholder.Value(); } const Matrix &input_matrix = input_wave.Data(); @@ -178,45 +238,73 @@ int main(int argc, char *argv[]) { << " #channel: " << num_input_channel; KALDI_ASSERT(input_channel < num_input_channel); - WaveData rir_wave; - { - Input ki(rir_file); - rir_wave.Read(ki.Stream()); - } - const Matrix &rir_matrix = rir_wave.Data(); - BaseFloat samp_freq_rir = rir_wave.SampFreq(); - int32 num_samp_rir = rir_matrix.NumCols(), - num_rir_channel = rir_matrix.NumRows(); - KALDI_VLOG(1) << "sampling frequency of rir: " << samp_freq_rir - << " #samples: " << num_samp_rir - << " #channel: " << num_rir_channel; - if (!multi_channel_output) { - KALDI_ASSERT(rir_channel < num_rir_channel); - } - - Matrix noise_matrix; - if (!noise_file.empty()) { - WaveData noise_wave; + Matrix rir_matrix; + BaseFloat samp_freq_rir = samp_freq_input; + int32 num_samp_rir = 0, + num_rir_channel = 0; + if (!rir_file.empty()) { + WaveData rir_wave; { - Input ki(noise_file); - noise_wave.Read(ki.Stream()); + WaveHolder waveholder; + Input ki(rir_file); + waveholder.Read(ki.Stream()); + rir_wave = waveholder.Value(); } - noise_matrix = noise_wave.Data(); - BaseFloat samp_freq_noise = noise_wave.SampFreq(); - int32 num_samp_noise = noise_matrix.NumCols(), - num_noise_channel = noise_matrix.NumRows(); - KALDI_VLOG(1) << "sampling frequency of noise: " << samp_freq_noise - << " #samples: " << 
num_samp_noise - << " #channel: " << num_noise_channel; - if (multi_channel_output) { - KALDI_ASSERT(num_rir_channel == num_noise_channel); - } else { - KALDI_ASSERT(noise_channel < num_noise_channel); + rir_matrix = rir_wave.Data(); + samp_freq_rir = rir_wave.SampFreq(); + num_samp_rir = rir_matrix.NumCols(); + num_rir_channel = rir_matrix.NumRows(); + KALDI_VLOG(1) << "sampling frequency of rir: " << samp_freq_rir + << " #samples: " << num_samp_rir + << " #channel: " << num_rir_channel; + if (!multi_channel_output) { + KALDI_ASSERT(rir_channel < num_rir_channel); + } + } + + std::vector > additive_signal_matrices; + if (!additive_signals.empty()) { + std::vector split_string; + SplitStringToVector(additive_signals, ",", true, &split_string); + for (size_t i = 0; i < split_string.size(); i++) { + WaveHolder waveholder; + Input ki(split_string[i]); + waveholder.Read(ki.Stream()); + WaveData additive_signal_wave = waveholder.Value(); + Matrix additive_signal_matrix = additive_signal_wave.Data(); + BaseFloat samp_freq = additive_signal_wave.SampFreq(); + KALDI_ASSERT(samp_freq == samp_freq_input); + int32 num_samp = additive_signal_matrix.NumCols(), + num_channel = additive_signal_matrix.NumRows(); + KALDI_VLOG(1) << "sampling frequency of additive signal: " << samp_freq + << " #samples: " << num_samp + << " #channel: " << num_channel; + if (multi_channel_output) { + KALDI_ASSERT(num_rir_channel == num_channel); + } else { + KALDI_ASSERT(noise_channel < num_channel); + } + + additive_signal_matrices.push_back(additive_signal_matrix); } } + std::vector snr_vector; + if (!snrs.empty()) { + ReadCommaSeparatedCommand(snrs, &snr_vector); + } + + std::vector start_time_vector; + if (!start_times.empty()) { + ReadCommaSeparatedCommand(start_times, &start_time_vector); + } + + int32 shift_index = 0; int32 num_output_channels = (multi_channel_output ? num_rir_channel : 1); - Matrix out_matrix(num_output_channels, num_samp_input); + int32 num_samp_output = (duration > 0 ? 
samp_freq_input * duration : + (shift_output ? num_samp_input : + num_samp_input + num_samp_rir - 1)); + Matrix out_matrix(num_output_channels, num_samp_output); for (int32 output_channel = 0; output_channel < num_output_channels; output_channel++) { Vector input(num_samp_input); @@ -224,18 +312,31 @@ int main(int argc, char *argv[]) { float power_before_reverb = VecVec(input, input) / input.Dim(); int32 this_rir_channel = (multi_channel_output ? output_channel : rir_channel); - Vector rir(num_samp_rir); - rir.CopyRowFromMat(rir_matrix, this_rir_channel); - rir.Scale(1.0 / (1 << 15)); - Vector noise(0); - if (!noise_file.empty()) { - noise.Resize(noise_matrix.NumCols()); - int32 this_noise_channel = (multi_channel_output ? output_channel : noise_channel); - noise.CopyRowFromMat(noise_matrix, this_noise_channel); + float early_energy = power_before_reverb; + if (!rir_file.empty()) { + Vector rir; + rir.Resize(num_samp_rir); + rir.CopyRowFromMat(rir_matrix, this_rir_channel); + rir.Scale(1.0 / (1 << 15)); + early_energy = DoReverberation(rir, samp_freq_rir, &input); + if (shift_output) { + // find the position of the peak of the impulse response + // and shift the output waveform by this amount + rir.Max(&shift_index); + } } - DoReverberation(rir, samp_freq_rir, snr_db, &noise, &input); + if (additive_signal_matrices.size() > 0) { + Vector noise(0); + int32 this_noise_channel = (multi_channel_output ? 
output_channel : noise_channel); + for (int32 i = 0; i < additive_signal_matrices.size(); i++) { + noise.Resize(additive_signal_matrices[i].NumCols()); + noise.CopyRowFromMat(additive_signal_matrices[i], this_noise_channel); + AddNoise(&noise, snr_vector[i], start_time_vector[i], + samp_freq_input, early_energy, &input); + } + } float power_after_reverb = VecVec(input, input) / input.Dim(); @@ -244,7 +345,16 @@ int main(int argc, char *argv[]) { else if (normalize_output) input.Scale(sqrt(power_before_reverb / power_after_reverb)); - out_matrix.CopyRowFromVec(input, output_channel); + if (num_samp_output <= num_samp_input) { + // trim the signal from the start + out_matrix.CopyRowFromVec(input.Range(shift_index, num_samp_output), output_channel); + } else { + // repeat the signal to fill up the duration + Vector extended_input(num_samp_output); + extended_input.SetZero(); + AddVectorsOfUnequalLength(input.Range(shift_index, num_samp_input), &extended_input); + out_matrix.CopyRowFromVec(extended_input, output_channel); + } } WaveData out_wave(samp_freq_input, out_matrix);