diff --git a/CHANGES.rst b/CHANGES.rst
index 8dc0d4899..c1eece9d7 100644
--- a/CHANGES.rst
+++ b/CHANGES.rst
@@ -11,7 +11,7 @@ New features:
 * Bar tracking functionality (#316)
 * Added `quantize_notes` function (#327)
 * Added global key evaluation (#336)
-* Added key recognition feature and program (#345)
+* Added key recognition feature and program (#345, #381)
 
 Bug fixes:
 
diff --git a/README.rst b/README.rst
index 6c3262cf5..c492c9c3a 100644
--- a/README.rst
+++ b/README.rst
@@ -355,10 +355,9 @@ References
     Proceedings of IEEE International Workshop on Machine Learning for
     Signal Processing (MLSP), 2016.
 .. [18] Filip Korzeniowski and Gerhard Widmer,
-    *End-to-End Musical Key Estimation Using a Convolutional Neural Network*,
-    Proceedings of the 25th European Signal Processing Conference (EUSIPCO),
-    2017.
-
+    *Genre-Agnostic Key Classification with Convolutional Neural Networks*,
+    Proceedings of the 19th International Society for Music Information
+    Retrieval Conference (ISMIR), 2018.
 
 Acknowledgements
 ================
diff --git a/bin/KeyRecognition b/bin/KeyRecognition
index 7d5717579..885b8a247 100755
--- a/bin/KeyRecognition
+++ b/bin/KeyRecognition
@@ -23,13 +23,9 @@ def main():
     using a Convolutional Neural Network, as described in
 
     Filip Korzeniowski and Gerhard Widmer,
-    "End-to-End Musical Key Estimation Using a Convolutional Neural Network",
-    In Proceedings of the 25th European Signal Processing Conference (EUSIPCO),
-    Kos, Greece, 2017.
-
-    The model used here differs slightly from the one in the paper: it was
-    trained on snippets of audio instead of full songs, and using a dataset
-    that includes (mostly piano) classical music.
+    "Genre-Agnostic Key Classification with Convolutional Neural Networks",
+    In Proceedings of the 19th International Society for Music Information
+    Retrieval Conference (ISMIR), Paris, France, 2018.
 
     This program can be run in 'single' file mode to process a single audio
-    file and write the recognised chords to STDOUT or the given output file.
+    file and write the recognised key to STDOUT or the given output file.
@@ -42,7 +38,7 @@ def main():
         $ KeyRecognition batch [-o OUTPUT_DIR] [-s OUTPUT_SUFFIX] FILES
 
     If no output directory is given, the program writes the files with the
-    extracted chords to the same location as the audio files.
+    extracted key to the same location as the audio files.
 
     The 'pickle' mode can be used to store the used parameters to be able to
     exactly reproduce experiments.
@@ -51,7 +47,7 @@ def main():
     ''')
     # version
     p.add_argument('--version', action='version',
-                   version='KeyRecognition.2017')
+                   version='KeyRecognition.2018')
     io_arguments(p, output_suffix='.key.txt')
     ActivationsProcessor.add_arguments(p)
 
diff --git a/madmom/features/key.py b/madmom/features/key.py
index b677089fe..70de408c1 100644
--- a/madmom/features/key.py
+++ b/madmom/features/key.py
@@ -38,6 +38,10 @@ def key_prediction_to_label(prediction):
     return KEY_LABELS[prediction[0].argmax()]
 
 
+def add_axis(x):
+    return x[np.newaxis, ...]
+
+
 class CNNKeyRecognitionProcessor(SequentialProcessor):
     """
     Recognise the global key of a musical piece using a Convolutional Neural
@@ -48,16 +52,13 @@ class CNNKeyRecognitionProcessor(SequentialProcessor):
     nn_files : list, optional
         List with trained CNN model files. Per default ('None'), an ensemble
         of networks will be used.
-    single_net : bool, optional
-        Use only a single CNN for prediction. This speeds up processing, but
-        slightly worsens the results.
 
     References
     ----------
     .. [1] Filip Korzeniowski and Gerhard Widmer,
-           "End-to-End Musical Key Estimation Using a Convolutional Neural
-           Network", In Proceedings of the 25th European Signal Processing
-           Conference (EUSIPCO), Kos, Greece, 2017.
+           "Genre-Agnostic Key Classification with Convolutional Neural
+           Networks", In Proceedings of the 19th International Society for
+           Music Information Retrieval Conference (ISMIR), Paris, France, 2018.
 
     Examples
     --------
@@ -68,12 +69,10 @@ class CNNKeyRecognitionProcessor(SequentialProcessor):
     >>> proc  # doctest: +ELLIPSIS
     <madmom.features.key.CNNKeyRecognitionProcessor object at 0x...>
     >>> proc('tests/data/audio/sample.wav')  # doctest: +NORMALIZE_WHITESPACE
-    array([[0.     , 0.     , 0.00001, 0.00012, 0.     , 0.     ,
-            0.00151, 0.     , 0.     , 0.     , 0.00003, 0.81958,
-            0.     , 0.     , 0.     , 0.01747, 0.     , 0.     ,
-            0.00001, 0.     , 0.00006, 0.     , 0.00001, 0.16119]],
-          dtype=float32)
-
+    array([[0.03426, 0.0331 , 0.02979, 0.04423, 0.04215, 0.0311 , 0.05225,
+            0.04263, 0.04141, 0.02907, 0.03755, 0.09546, 0.0431 , 0.02792,
+            0.02138, 0.05589, 0.03276, 0.02786, 0.02415, 0.04608, 0.05329,
+            0.02804, 0.03868, 0.08786]])
     """
 
     def __init__(self, nn_files=None, **kwargs):
@@ -81,6 +80,7 @@ def __init__(self, nn_files=None, **kwargs):
         from ..audio.stft import ShortTimeFourierTransformProcessor
         from ..audio.spectrogram import LogarithmicFilteredSpectrogramProcessor
         from ..ml.nn import NeuralNetworkEnsemble
+        from ..ml.nn.activations import softmax
         from ..models import KEY_CNN
 
         # spectrogram computation
@@ -97,5 +97,5 @@ def __init__(self, nn_files=None, **kwargs):
 
         # create processing pipeline
         super(CNNKeyRecognitionProcessor, self).__init__([
-            sig, frames, stft, spec, nn
+            sig, frames, stft, spec, nn, add_axis, softmax
         ])
diff --git a/madmom/ml/nn/layers.py b/madmom/ml/nn/layers.py
index 124cf0de1..2d6dfc46a 100644
--- a/madmom/ml/nn/layers.py
+++ b/madmom/ml/nn/layers.py
@@ -661,6 +661,7 @@ def _kernel_margins(kernel_shape, margin_shift):
     start_x, end_x, start_y, end_y : tuple
         Indices determining the valid part of the convolution output.
""" + start_x = int(np.floor(kernel_shape[0] / 2.)) start_y = int(np.floor(kernel_shape[1] / 2.)) @@ -671,6 +672,8 @@ def _kernel_margins(kernel_shape, margin_shift): end_x -= margin_shift else: end_x = start_x + start_x = start_x if start_x > 0 else None + end_x = -end_x if end_x > 0 else None if kernel_shape[1] % 2 == 0: end_y = start_y - 1 @@ -678,8 +681,10 @@ def _kernel_margins(kernel_shape, margin_shift): end_y -= margin_shift else: end_y = start_y + start_y = start_y if start_y > 0 else None + end_y = -end_y if end_y > 0 else None - return start_x, -end_x, start_y, -end_y + return start_x, end_x, start_y, end_y try: diff --git a/madmom/models b/madmom/models index 7e277510b..31fe6ce52 160000 --- a/madmom/models +++ b/madmom/models @@ -1 +1 @@ -Subproject commit 7e277510bb8cc46bcbffc84070eaa84a88bc2fe5 +Subproject commit 31fe6ce52affb794c2e3511908fbeccfd68f6925 diff --git a/setup.py b/setup.py index ae324ef5d..ad3507169 100644 --- a/setup.py +++ b/setup.py @@ -41,7 +41,7 @@ 'models/chords/*/*', 'models/chroma/*/*', 'models/downbeats/*/*', - 'models/key/*/*', + 'models/key/2018/*', 'models/notes/*/*', 'models/onsets/*/*', 'models/patterns/*/*', diff --git a/tests/data/activations/sample.key_cnn.npz b/tests/data/activations/sample.key_cnn.npz index 08b43d106..6dffb854f 100644 Binary files a/tests/data/activations/sample.key_cnn.npz and b/tests/data/activations/sample.key_cnn.npz differ diff --git a/tests/data/activations/sample2.key_cnn.npz b/tests/data/activations/sample2.key_cnn.npz index 87ff5f6dd..5623a0420 100644 Binary files a/tests/data/activations/sample2.key_cnn.npz and b/tests/data/activations/sample2.key_cnn.npz differ diff --git a/tests/test_ml_nn.py b/tests/test_ml_nn.py index 54dfaf926..20301b0fd 100644 --- a/tests/test_ml_nn.py +++ b/tests/test_ml_nn.py @@ -506,3 +506,83 @@ def test_constant_padding(self): self.assertTrue(np.allclose(out[:, :, 3:-3], data)) self.assertTrue(np.allclose(out[:, :, :3], 2.2)) self.assertTrue(np.allclose(out[:, :, -3:], 2.2)) + + +class ConvolutionalLayerClassTest(unittest.TestCase): + + W1 = np.array([[[[0.69557322]], + [[0.45649655]], + [[0.58179561]]], + [[[0.20438251]], + [[0.17404747]], + [[0.41624290]]]]) + + W3 = np.array([[[[0.57353216, 0.72422232, 0.15716315], + [0.82000373, 0.26902348, 0.69203708], + [0.45564084, 0.89265194, 0.98080186]], + [[0.44920649, 0.52442715, 0.33103038], + [0.24536095, 0.49307102, 0.28850389], + [0.38324254, 0.46965330, 0.76865911]], + [[0.44225901, 0.34989312, 0.92381997], + [0.32123710, 0.04856574, 0.87387125], + [0.70175767, 0.38149251, 0.40178089]]], + [[[0.28197446, 0.35315104, 0.53862099], + [0.01224023, 0.94672135, 0.87194315], + [0.69193064, 0.27611521, 0.51076897]], + [[0.22228372, 0.58605351, 0.17730248], + [0.10949298, 0.43124835, 0.71336330], + [0.57694486, 0.44623928, 0.11774881]], + [[0.76850363, 0.46740177, 0.76900027], + [0.61551742, 0.62841514, 0.05235070], + [0.01321052, 0.93591818, 0.61256317]]]]) + + B = np.array([0.27614033, 0.87995416, 0.23540803]) + + O1 = np.array([[[0.97171354, 1.33645070, 0.81720366], + [0.27614033, 0.87995416, 0.23540803], + [0.27614033, 0.87995416, 0.23540803], + [0.27614033, 0.87995416, 0.23540803], + [0.48052284, 1.05400163, 0.65165093]], + [[0.27614033, 0.87995416, 0.23540803], + [0.97171354, 1.33645070, 0.81720366], + [0.27614033, 0.87995416, 0.23540803], + [0.48052284, 1.05400163, 0.65165093], + [0.27614033, 0.87995416, 0.23540803]], + [[0.27614033, 0.87995416, 0.23540803], + [0.27614033, 0.87995416, 0.23540803], + [1.17609602, 1.51049817, 1.23344656], + 
+                    [0.27614033, 0.87995416, 0.23540803],
+                    [0.27614033, 0.87995416, 0.23540803]],
+                   [[0.27614033, 0.87995416, 0.23540803],
+                    [0.48052284, 1.05400163, 0.65165093],
+                    [0.27614033, 0.87995416, 0.23540803],
+                    [0.97171354, 1.33645070, 0.81720366],
+                    [0.27614033, 0.87995416, 0.23540803]],
+                   [[0.48052284, 1.05400163, 0.65165093],
+                    [0.27614033, 0.87995416, 0.23540803],
+                    [0.27614033, 0.87995416, 0.23540803],
+                    [0.27614033, 0.87995416, 0.23540803],
+                    [0.97171354, 1.33645070, 0.81720366]]])
+
+    O3 = np.array([[[2.38147223, 2.81317455, 1.89651736],
+                    [2.05779099, 2.38843173, 2.54209157],
+                    [2.61057651, 2.39648026, 2.56985398]],
+                   [[2.35418737, 2.29051489, 2.02105685],
+                    [4.27677071, 3.77638656, 2.53863951],
+                    [2.84045803, 2.85248774, 2.44744130]],
+                   [[2.90905416, 2.44869238, 2.34779163],
+                    [3.13685429, 2.75457102, 1.92640647],
+                    [2.61026680, 2.70863968, 1.74057683]]])
+
+    def setUp(self):
+        self.layer1x1 = ConvolutionalLayer(ConvolutionalLayerClassTest.W1,
+                                           ConvolutionalLayerClassTest.B)
+        self.layer3x3 = ConvolutionalLayer(ConvolutionalLayerClassTest.W3,
+                                           ConvolutionalLayerClassTest.B)
+        self.data = np.stack([np.eye(5), np.eye(5)[:, ::-1]], axis=-1)
+
+    def test_activate(self):
+        out1 = self.layer1x1.activate(self.data)
+        self.assertTrue(np.allclose(out1, ConvolutionalLayerClassTest.O1))
+        out3 = self.layer3x3.activate(self.data)
+        self.assertTrue(np.allclose(out3, ConvolutionalLayerClassTest.O3))
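
Note on madmom/features/key.py: the processing pipeline now ends in `add_axis` and `softmax`, so the ensemble-averaged network output is normalised to a `(1, 24)` array of key-class probabilities (hence the updated doctest output, whose values sum to 1 instead of being near-saturated). A minimal usage sketch, assuming the 2018 key models from the `madmom/models` submodule are installed::

    from madmom.features.key import (CNNKeyRecognitionProcessor,
                                     key_prediction_to_label)

    proc = CNNKeyRecognitionProcessor()
    # (1, 24) softmax probabilities over the 24 major/minor key classes
    prediction = proc('tests/data/audio/sample.wav')
    # key_prediction_to_label() maps the argmax to its name via KEY_LABELS
    print(key_prediction_to_label(prediction))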
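Note on madmom/ml/nn/layers.py: `_kernel_margins` previously returned negated margins to be used directly as slice indices, which breaks when a margin is 0, because `-0 == 0` and a slice ending at 0 selects nothing. The patch therefore maps zero margins to `None`. A small illustration of the pitfall, in plain NumPy and independent of madmom::

    import numpy as np

    a = np.arange(6)

    # a 3x3 kernel leaves a margin of 1 on each side of the valid output:
    print(a[1:-1])       # [1 2 3 4]

    # a 1x1 kernel leaves no margin, but negating a zero margin is still 0,
    # so the old 'a[start:-end]' form selected an empty slice:
    print(a[0:-0])       # []

    # mapping zero margins to None keeps the full extent instead:
    print(a[None:None])  # [0 1 2 3 4 5]

This is presumably also why the new `ConvolutionalLayerClassTest` covers a 1x1 kernel (`W1`), the case the old slicing got wrong.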
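Note on tests/test_ml_nn.py: the expected output `O1` can be verified by hand, since a 1x1 convolution is just a per-pixel linear map across input channels. The sketch below recomputes it with plain NumPy; reading the weight axes as (input channel, output feature, kernel rows, kernel columns) is an assumption inferred from the test shapes, not documented API::

    import numpy as np

    # constants copied from the test: W1 has shape (2, 3, 1, 1), i.e.
    # 2 input channels, 3 output feature maps, and a 1x1 kernel
    W1 = np.array([0.69557322, 0.45649655, 0.58179561,
                   0.20438251, 0.17404747, 0.41624290]).reshape(2, 3, 1, 1)
    B = np.array([0.27614033, 0.87995416, 0.23540803])

    # the test input: identity in channel 0, flipped identity in channel 1
    data = np.stack([np.eye(5), np.eye(5)[:, ::-1]], axis=-1)

    # out[i, j, k] = sum_c data[i, j, c] * W1[c, k, 0, 0] + B[k]
    out = np.einsum('ijc,ck->ijk', data, W1[:, :, 0, 0]) + B

    print(out[0, 0])  # ~[0.97171, 1.33645, 0.81720] == O1[0, 0]
    print(out[2, 2])  # ~[1.17610, 1.51050, 1.23345] == O1[2, 2]
    print(out[0, 1])  # == B, since both input channels are zero there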