Skip to content

Commit

Permalink
Merge pull request #381 from CPJKU/key2018
Browse files Browse the repository at this point in the history
Integrated new Key Classification Model (ISMIR 2018)
  • Loading branch information
fdlm authored Nov 5, 2018
2 parents cf34790 + f9cb6e8 commit 870189b
Show file tree
Hide file tree
Showing 10 changed files with 110 additions and 30 deletions.
2 changes: 1 addition & 1 deletion CHANGES.rst
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ New features:
* Bar tracking functionality (#316)
* Added `quantize_notes` function (#327)
* Added global key evaluation (#336)
* Added key recognition feature and program (#345)
* Added key recognition feature and program (#345, #381)

Bug fixes:

Expand Down
7 changes: 3 additions & 4 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -355,10 +355,9 @@ References
Proceedings of IEEE International Workshop on Machine Learning for Signal
Processing (MLSP), 2016.
.. [18] Filip Korzeniowski and Gerhard Widmer,
*End-to-End Musical Key Estimation Using a Convolutional Neural Network*,
Proceedings of the 25th European Signal Processing Conference (EUSIPCO),
2017.
*Genre-Agnostic Key Classification with Convolutional Neural Networks*,
Proceedings of the 19th International Society for Music Information
Retrieval Conference (ISMIR), 2018.
Acknowledgements
================
Expand Down
14 changes: 5 additions & 9 deletions bin/KeyRecognition
Original file line number Diff line number Diff line change
Expand Up @@ -23,13 +23,9 @@ def main():
using a Convolutional Neural Network, as described in
Filip Korzeniowski and Gerhard Widmer,
"End-to-End Musical Key Estimation Using a Convolutional Neural Network",
In Proceedings of the 25th European Signal Processing Conference (EUSIPCO),
Kos, Greece, 2017.
The model used here differs slightly from the one in the paper: it was
trained on snippets of audio instead of full songs, and using a dataset
that includes (mostly piano) classical music.
"Genre-Agnostic Key Classification with Convolutional Neural Networks",
In Proceedings of the 19th International Society for Music Information
Retrieval Conference (ISMIR), Paris, France, 2018.
This program can be run in 'single' file mode to process a single audio
file and write the recognised key to STDOUT or the given output file.
Expand All @@ -42,7 +38,7 @@ def main():
$ KeyRecognition batch [-o OUTPUT_DIR] [-s OUTPUT_SUFFIX] FILES
If no output directory is given, the program writes the files with the
extracted chords to the same location as the audio files.
extracted key to the same location as the audio files.
The 'pickle' mode can be used to store the used parameters to be able to
exactly reproduce experiments.
Expand All @@ -51,7 +47,7 @@ def main():
)
# version
p.add_argument('--version', action='version',
version='KeyRecognition.2017')
version='KeyRecognition.2018')
io_arguments(p, output_suffix='.key.txt')
ActivationsProcessor.add_arguments(p)

Expand Down
26 changes: 13 additions & 13 deletions madmom/features/key.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,10 @@ def key_prediction_to_label(prediction):
return KEY_LABELS[prediction[0].argmax()]


def add_axis(x):
    """Return `x` with a new leading (batch) axis of length 1."""
    return np.expand_dims(x, axis=0)


class CNNKeyRecognitionProcessor(SequentialProcessor):
"""
Recognise the global key of a musical piece using a Convolutional Neural
Expand All @@ -48,16 +52,13 @@ class CNNKeyRecognitionProcessor(SequentialProcessor):
nn_files : list, optional
List with trained CNN model files. Per default ('None'), an ensemble
of networks will be used.
single_net : bool, optional
Use only a single CNN for prediction. This speeds up processing, but
slightly worsens the results.
References
----------
.. [1] Filip Korzeniowski and Gerhard Widmer,
"End-to-End Musical Key Estimation Using a Convolutional Neural
Network", In Proceedings of the 25th European Signal Processing
Conference (EUSIPCO), Kos, Greece, 2017.
"Genre-Agnostic Key Classification with Convolutional Neural
Networks", In Proceedings of the 19th International Society for
Music Information Retrieval Conference (ISMIR), Paris, France, 2018.
Examples
--------
Expand All @@ -68,19 +69,18 @@ class CNNKeyRecognitionProcessor(SequentialProcessor):
>>> proc # doctest: +ELLIPSIS
<madmom.features.key.CNNKeyRecognitionProcessor object at 0x...>
>>> proc('tests/data/audio/sample.wav') # doctest: +NORMALIZE_WHITESPACE
array([[0. , 0. , 0.00001, 0.00012, 0. , 0. ,
0.00151, 0. , 0. , 0. , 0.00003, 0.81958,
0. , 0. , 0. , 0.01747, 0. , 0. ,
0.00001, 0. , 0.00006, 0. , 0.00001, 0.16119]],
dtype=float32)
array([[0.03426, 0.0331 , 0.02979, 0.04423, 0.04215, 0.0311 , 0.05225,
0.04263, 0.04141, 0.02907, 0.03755, 0.09546, 0.0431 , 0.02792,
0.02138, 0.05589, 0.03276, 0.02786, 0.02415, 0.04608, 0.05329,
0.02804, 0.03868, 0.08786]])
"""

def __init__(self, nn_files=None, **kwargs):
from ..audio.signal import SignalProcessor, FramedSignalProcessor
from ..audio.stft import ShortTimeFourierTransformProcessor
from ..audio.spectrogram import LogarithmicFilteredSpectrogramProcessor
from ..ml.nn import NeuralNetworkEnsemble
from ..ml.nn.activations import softmax
from ..models import KEY_CNN

# spectrogram computation
Expand All @@ -97,5 +97,5 @@ def __init__(self, nn_files=None, **kwargs):

# create processing pipeline
super(CNNKeyRecognitionProcessor, self).__init__([
sig, frames, stft, spec, nn
sig, frames, stft, spec, nn, add_axis, softmax
])
7 changes: 6 additions & 1 deletion madmom/ml/nn/layers.py
Original file line number Diff line number Diff line change
Expand Up @@ -661,6 +661,7 @@ def _kernel_margins(kernel_shape, margin_shift):
start_x, end_x, start_y, end_y : tuple
Indices determining the valid part of the convolution output.
"""

start_x = int(np.floor(kernel_shape[0] / 2.))
start_y = int(np.floor(kernel_shape[1] / 2.))

Expand All @@ -671,15 +672,19 @@ def _kernel_margins(kernel_shape, margin_shift):
end_x -= margin_shift
else:
end_x = start_x
start_x = start_x if start_x > 0 else None
end_x = -end_x if end_x > 0 else None

if kernel_shape[1] % 2 == 0:
end_y = start_y - 1
start_y += margin_shift
end_y -= margin_shift
else:
end_y = start_y
start_y = start_y if start_y > 0 else None
end_y = -end_y if end_y > 0 else None

return start_x, -end_x, start_y, -end_y
return start_x, end_x, start_y, end_y


try:
Expand Down
2 changes: 1 addition & 1 deletion madmom/models
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@
'models/chords/*/*',
'models/chroma/*/*',
'models/downbeats/*/*',
'models/key/*/*',
'models/key/2018/*',
'models/notes/*/*',
'models/onsets/*/*',
'models/patterns/*/*',
Expand Down
Binary file modified tests/data/activations/sample.key_cnn.npz
Binary file not shown.
Binary file modified tests/data/activations/sample2.key_cnn.npz
Binary file not shown.
80 changes: 80 additions & 0 deletions tests/test_ml_nn.py
Original file line number Diff line number Diff line change
Expand Up @@ -506,3 +506,83 @@ def test_constant_padding(self):
self.assertTrue(np.allclose(out[:, :, 3:-3], data))
self.assertTrue(np.allclose(out[:, :, :3], 2.2))
self.assertTrue(np.allclose(out[:, :, -3:], 2.2))


class ConvolutionalLayerClassTest(unittest.TestCase):
    """Test ConvolutionalLayer.activate() with 1x1 and 3x3 kernels."""

    # 1x1 convolution weights, shape (2, 3, 1, 1).
    # Presumably (in_channels, out_channels, kernel_h, kernel_w), mapping
    # the 2-channel input to 3 output maps — TODO confirm against
    # ConvolutionalLayer's weight layout.
    W1 = np.array([[[[0.69557322]],
                    [[0.45649655]],
                    [[0.58179561]]],
                   [[[0.20438251]],
                    [[0.17404747]],
                    [[0.41624290]]]])

    # 3x3 convolution weights, shape (2, 3, 3, 3); same assumed layout as W1
    # but with a 3x3 spatial kernel.
    W3 = np.array([[[[0.57353216, 0.72422232, 0.15716315],
                     [0.82000373, 0.26902348, 0.69203708],
                     [0.45564084, 0.89265194, 0.98080186]],
                    [[0.44920649, 0.52442715, 0.33103038],
                     [0.24536095, 0.49307102, 0.28850389],
                     [0.38324254, 0.46965330, 0.76865911]],
                    [[0.44225901, 0.34989312, 0.92381997],
                     [0.32123710, 0.04856574, 0.87387125],
                     [0.70175767, 0.38149251, 0.40178089]]],
                   [[[0.28197446, 0.35315104, 0.53862099],
                     [0.01224023, 0.94672135, 0.87194315],
                     [0.69193064, 0.27611521, 0.51076897]],
                    [[0.22228372, 0.58605351, 0.17730248],
                     [0.10949298, 0.43124835, 0.71336330],
                     [0.57694486, 0.44623928, 0.11774881]],
                    [[0.76850363, 0.46740177, 0.76900027],
                     [0.61551742, 0.62841514, 0.05235070],
                     [0.01321052, 0.93591818, 0.61256317]]]])

    # Bias, one value per output feature map (3 maps).
    B = np.array([0.27614033, 0.87995416, 0.23540803])

    # Expected activation of the 1x1 layer on `self.data`, shape (5, 5, 3).
    # Positions where both input channels are zero contain exactly the bias
    # values B; the non-bias rows coincide with the diagonals of the two
    # stacked identity matrices.
    O1 = np.array([[[0.97171354, 1.33645070, 0.81720366],
                    [0.27614033, 0.87995416, 0.23540803],
                    [0.27614033, 0.87995416, 0.23540803],
                    [0.27614033, 0.87995416, 0.23540803],
                    [0.48052284, 1.05400163, 0.65165093]],
                   [[0.27614033, 0.87995416, 0.23540803],
                    [0.97171354, 1.33645070, 0.81720366],
                    [0.27614033, 0.87995416, 0.23540803],
                    [0.48052284, 1.05400163, 0.65165093],
                    [0.27614033, 0.87995416, 0.23540803]],
                   [[0.27614033, 0.87995416, 0.23540803],
                    [0.27614033, 0.87995416, 0.23540803],
                    [1.17609602, 1.51049817, 1.23344656],
                    [0.27614033, 0.87995416, 0.23540803],
                    [0.27614033, 0.87995416, 0.23540803]],
                   [[0.27614033, 0.87995416, 0.23540803],
                    [0.48052284, 1.05400163, 0.65165093],
                    [0.27614033, 0.87995416, 0.23540803],
                    [0.97171354, 1.33645070, 0.81720366],
                    [0.27614033, 0.87995416, 0.23540803]],
                   [[0.48052284, 1.05400163, 0.65165093],
                    [0.27614033, 0.87995416, 0.23540803],
                    [0.27614033, 0.87995416, 0.23540803],
                    [0.27614033, 0.87995416, 0.23540803],
                    [0.97171354, 1.33645070, 0.81720366]]])

    # Expected activation of the 3x3 layer on `self.data`, shape (3, 3, 3).
    # The smaller spatial size suggests a 'valid'-style convolution of the
    # 5x5 input with a 3x3 kernel — TODO confirm padding mode.
    O3 = np.array([[[2.38147223, 2.81317455, 1.89651736],
                    [2.05779099, 2.38843173, 2.54209157],
                    [2.61057651, 2.39648026, 2.56985398]],
                   [[2.35418737, 2.29051489, 2.02105685],
                    [4.27677071, 3.77638656, 2.53863951],
                    [2.84045803, 2.85248774, 2.44744130]],
                   [[2.90905416, 2.44869238, 2.34779163],
                    [3.13685429, 2.75457102, 1.92640647],
                    [2.61026680, 2.70863968, 1.74057683]]])

    def setUp(self):
        # Layers under test: one with a 1x1 kernel, one with a 3x3 kernel,
        # both sharing the same bias vector.
        self.layer1x1 = ConvolutionalLayer(ConvolutionalLayerClassTest.W1,
                                           ConvolutionalLayerClassTest.B)
        self.layer3x3 = ConvolutionalLayer(ConvolutionalLayerClassTest.W3,
                                           ConvolutionalLayerClassTest.B)
        # Input of shape (5, 5, 2): a 5x5 identity matrix and its
        # left-right mirror stacked as two channels on the last axis.
        self.data = np.stack([np.eye(5), np.eye(5)[:, ::-1]], axis=-1)

    def test_activate(self):
        # Compare layer outputs against the precomputed expectations.
        out1 = self.layer1x1.activate(self.data)
        self.assertTrue(np.allclose(out1, ConvolutionalLayerClassTest.O1))
        out3 = self.layer3x3.activate(self.data)
        self.assertTrue(np.allclose(out3, ConvolutionalLayerClassTest.O3))

0 comments on commit 870189b

Please sign in to comment.