fix bug in mds computation for NumpyDatasets with missing data

tschuelia · Nov 23, 2023 · 2bbd5d0 · 2bbd5d0
1 parent 2da791e
commit 2bbd5d0
Show file tree

Hide file tree

Showing 3 changed files with 19 additions and 2 deletions.
diff --git a/pandora/dataset.py b/pandora/dataset.py
@@ -1203,7 +1203,7 @@ def run_mds(
         input_data[input_data == self._missing_value] = np.nan
 
         distance_matrix, populations = distance_metric(
-            self.input_data, self.populations, imputation
+            input_data, self.populations, imputation
         )
         if distance_matrix.shape[0] != populations.shape[0]:
             raise PandoraException(

diff --git a/setup.cfg b/setup.cfg
@@ -6,7 +6,7 @@ long_description_content_type = text/markdown
 url = https://github.com/tschuelia/Pandora
 author = Julia Haag
 author_email = julia.haag@h-its.org
-version = 1.0.6
+version = 1.0.7
 classifiers =
     Programming Language :: Python :: 3.8
     Programming Language :: Python :: 3.9

diff --git a/tests/test_dataset.py b/tests/test_dataset.py
@@ -562,6 +562,23 @@ def test_run_mds(
             check_names=False,
         )
 
+    def test_run_mds_fst_distance_with_missing_data(self):
+        # The following dataset contains missing data (given as 0 in the input, see missing_value below).
+        # Since the default dtype is uint8, missing values should be represented by the value 255.
+        # However, prior to the distance matrix computation, 255 should be replaced by np.nan.
+        # If this does not work properly, the distance matrix computation will fail.
+        test_data = np.asarray(
+            [[0, 1, 1, 1, 1, 1, 1], [2, 2, 0, 2, 2, 2, 2], [1, 2, 1, 0, 2, 1, 1]]
+        )
+        sample_ids = pd.Series(["sample1", "sample2", "sample3"])
+        populations = pd.Series(["population1", "population2", "population3"])
+        dataset = NumpyDataset(
+            test_data, sample_ids, populations, missing_value=0, dtype=np.uint8
+        )
+        dataset.run_mds(
+            n_components=2, distance_metric=fst_population_distance, imputation=None
+        )
+
     def test_numpy_dataset_from_eigenfiles(self, example_eigen_dataset_prefix):
         np_dataset = numpy_dataset_from_eigenfiles(example_eigen_dataset_prefix)