Skip to content

Commit

Permalink
fix bug in mds computation for NumpyDatasets with missing data
Browse files Browse the repository at this point in the history
  • Loading branch information
tschuelia committed Nov 23, 2023
1 parent 2da791e commit 2bbd5d0
Show file tree
Hide file tree
Showing 3 changed files with 19 additions and 2 deletions.
2 changes: 1 addition & 1 deletion pandora/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -1203,7 +1203,7 @@ def run_mds(
input_data[input_data == self._missing_value] = np.nan

distance_matrix, populations = distance_metric(
self.input_data, self.populations, imputation
input_data, self.populations, imputation
)
if distance_matrix.shape[0] != populations.shape[0]:
raise PandoraException(
Expand Down
2 changes: 1 addition & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ long_description_content_type = text/markdown
url = https://github.com/tschuelia/Pandora
author = Julia Haag
author_email = julia.haag@h-its.org
version = 1.0.6
version = 1.0.7
classifiers =
Programming Language :: Python :: 3.8
Programming Language :: Python :: 3.9
Expand Down
17 changes: 17 additions & 0 deletions tests/test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -562,6 +562,23 @@ def test_run_mds(
check_names=False,
)

def test_run_mds_fst_distance_with_missing_data(self):
    # Regression (smoke) test: run MDS with the FST population distance on a
    # NumpyDataset that contains missing entries. There is no explicit
    # assertion; the call only has to complete without raising.
    #
    # Every 0 in the matrix below counts as missing, because missing_value=0
    # is handed to NumpyDataset.
    #
    # Per the original note: with the uint8 dtype, missing entries are
    # expected to be represented by 255 and must be replaced by np.nan before
    # the distance matrix is computed, otherwise that computation fails.
    # NOTE(review): the internal 255 encoding is not visible from this test;
    # confirm against NumpyDataset.
    genotype_rows = [
        [0, 1, 1, 1, 1, 1, 1],
        [2, 2, 0, 2, 2, 2, 2],
        [1, 2, 1, 0, 2, 1, 1],
    ]
    dataset = NumpyDataset(
        np.asarray(genotype_rows),
        pd.Series(["sample1", "sample2", "sample3"]),
        pd.Series(["population1", "population2", "population3"]),
        missing_value=0,
        dtype=np.uint8,
    )
    dataset.run_mds(
        n_components=2,
        distance_metric=fst_population_distance,
        imputation=None,
    )

def test_numpy_dataset_from_eigenfiles(self, example_eigen_dataset_prefix):
np_dataset = numpy_dataset_from_eigenfiles(example_eigen_dataset_prefix)

Expand Down

0 comments on commit 2bbd5d0

Please sign in to comment.