Skip to content

Commit

Permalink
wip: replace scikit-learn MDS
Browse files Browse the repository at this point in the history
  • Loading branch information
tschuelia committed Apr 9, 2024
1 parent 227f877 commit b97874b
Show file tree
Hide file tree
Showing 3 changed files with 23 additions and 36 deletions.
48 changes: 18 additions & 30 deletions pandora/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,14 +13,14 @@

import numpy as np
import pandas as pd
from allel.stats.distance import pcoa
from numpy import typing as npt
from sklearn.decomposition import PCA as sklearnPCA
from sklearn.manifold import MDS as sklearnMDS

from pandora.custom_errors import PandoraConfigException, PandoraException
from pandora.custom_types import Executable
from pandora.distance_metrics import euclidean_sample_distance
from pandora.embedding import Embedding, from_sklearn_mds, from_smartpca
from pandora.embedding import Embedding, from_smartpca, mds_from_dataframe
from pandora.imputation import impute_data


Expand Down Expand Up @@ -760,7 +760,7 @@ def run_mds(
"""Performs MDS analysis using the data provided in this class and assigns its MDS Embedding result to self.mds.
The FST matrix is generated using the EIGENSOFT smartpca tool.
The subsequent MDS analysis is performed using the scikit-learn MDS implementation.
The subsequent MDS analysis is performed using the scikit-allel MDS implementation (PCoA).
Please note that since EIGENSOFT is used for the FST computation, all samples with the population set to
``Ignore`` will not be used for the MDS computation.
Expand Down Expand Up @@ -803,16 +803,10 @@ def run_mds(
smartpca=smartpca, result_prefix=result_dir / self.name, redo=redo
)

# TODO: replace with scikit-allel (adjust documentation)
mds = sklearnMDS(
n_components=n_components,
dissimilarity="precomputed",
normalized_stress=False,
random_state=42,
)
embedding = mds.fit_transform(fst_matrix)
embedding, explained_variance = pcoa(fst_matrix)
embedding = pd.DataFrame(
data=embedding, columns=[f"D{i}" for i in range(n_components)]
data=embedding[:, :n_components],
columns=[f"D{i}" for i in range(n_components)],
)
embedding["population"] = populations.values

Expand All @@ -826,11 +820,11 @@ def run_mds(
]
)

self.mds = from_sklearn_mds(
self.mds = mds_from_dataframe(
embedding,
pd.Series(sample_ids),
pd.Series(populations),
np.asarray([mds.stress_] * n_components),
explained_variance[:n_components],
)

def bootstrap(
Expand Down Expand Up @@ -1026,7 +1020,8 @@ def get_windows(
class NumpyDataset:
"""Class structure to represent a population genetics dataset in numerical format.
This class provides methods to perform PCA and MDS analyses on the provided numerical data using scikit-learn.
This class provides methods to perform PCA (using scikit-learn) and MDS (using scikit-allel) analyses on the
provided numerical data.
It further provides methods to generate a bootstrap replicate dataset (SNPs resampled with replacement) and
to generate overlapping sliding windows of sub-datasets.
Expand Down Expand Up @@ -1173,9 +1168,9 @@ def run_mds(
) -> None:
"""Performs MDS analysis using the data provided in this class.
The distance matrix is generated using the
provided distance_metric callable. The subsequent MDS analysis is performed using the scikit-learn MDS
implementation. The result of the MDS analysis is an Embedding object assigned to ``self.mds``.
The distance matrix is generated using the provided distance_metric callable.
The subsequent MDS analysis is performed using the scikit-allel MDS implementation (PCoA).
The result of the MDS analysis is an Embedding object assigned to ``self.mds``.
Parameters
----------
Expand Down Expand Up @@ -1228,25 +1223,18 @@ def run_mds(
f"Got {n_dims} dimensions in the distance matrix, but asked for {n_components}."
)

# TODO: replace with scikit-allel (adjust documentation)
mds = sklearnMDS(
n_components,
dissimilarity="precomputed",
normalized_stress=False,
random_state=42,
)
embedding = mds.fit_transform(distance_matrix)

embedding, explained_variance = pcoa(distance_matrix)
embedding = pd.DataFrame(
data=embedding, columns=[f"D{i}" for i in range(n_components)]
data=embedding[:, :n_components],
columns=[f"D{i}" for i in range(n_components)],
)
embedding["population"] = populations.values

self.mds = from_sklearn_mds(
self.mds = mds_from_dataframe(
embedding,
self.sample_ids,
self.populations,
np.asarray([mds.stress_] * n_components),
explained_variance[:n_components],
)

def bootstrap(self, seed: Optional[int] = None) -> NumpyDataset:
Expand Down
4 changes: 2 additions & 2 deletions pandora/embedding.py
Original file line number Diff line number Diff line change
Expand Up @@ -284,7 +284,7 @@ def from_smartpca(evec: pathlib.Path, eval: pathlib.Path) -> Embedding:
)


def from_sklearn_mds(
def mds_from_dataframe(
embedding: pd.DataFrame,
sample_ids: pd.Series,
populations: pd.Series,
Expand Down Expand Up @@ -343,7 +343,7 @@ def from_sklearn_mds(
# we can directly use the embedding data as mds_data
mds_data = embedding
# add the sample_id column
mds_data["sample_id"] = sample_ids
mds_data["sample_id"] = sample_ids.values
else:
# otherwise we need to iterate the embedding data and duplicate the results for each sample in sample_ids
mds_data = []
Expand Down
7 changes: 3 additions & 4 deletions pandora/embedding_comparison.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
from pandora.embedding import Embedding


def filter_samples(embedding: Embedding, samples_to_keep: List[str]) -> Embedding:
def _filter_samples(embedding: Embedding, samples_to_keep: List[str]) -> Embedding:
"""Filters the given Embedding object by removing all samples not contained in samples_to_keep.
Parameters
Expand Down Expand Up @@ -97,7 +97,6 @@ def _clip_missing_samples_for_comparison(
reference : Embedding
Passed ``reference`` Embedding, but only containing the samples present in both Embeddings (``comparable``, ``reference``).
"""
# TODO: this can be simplified using the pandas `align` method...
comp_data = comparable.embedding
ref_data = reference.embedding

Expand All @@ -106,8 +105,8 @@ def _clip_missing_samples_for_comparison(

shared_samples = sorted(comp_ids.intersection(ref_ids))

comparable_clipped = filter_samples(comparable, shared_samples)
reference_clipped = filter_samples(reference, shared_samples)
comparable_clipped = _filter_samples(comparable, shared_samples)
reference_clipped = _filter_samples(reference, shared_samples)

assert (
comparable_clipped.embedding_matrix.shape
Expand Down

0 comments on commit b97874b

Please sign in to comment.