Skip to content

Commit

Permalink
Merge branch 'development'
Browse files Browse the repository at this point in the history
Fixed issue with using non-sparse affinity matrix with sknetwork. Updated defaults to commonly used settings.
  • Loading branch information
aron0093 committed Nov 29, 2021
2 parents c15dd9c + 2bd2c67 commit 2b05201
Show file tree
Hide file tree
Showing 4 changed files with 41 additions and 18 deletions.
35 changes: 25 additions & 10 deletions cytopath/markov_chain_sampler.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import math
import numpy as np
import scvelo
from scipy import sparse
from tqdm import tqdm
from joblib import Parallel, delayed
Expand Down Expand Up @@ -48,7 +49,7 @@ def markov_sim(j, sim_number, max_steps, root_cells, clusters, trans_matrix, tra
clust_state[k, i] = clusters[currState[k, i]]
return currState, np.sum(-np.log(prob_state), axis=1), clust_state

def sampling(data, matrix_key = 'T_forward', cluster_key = 'louvain', max_steps=200, traj_number=1000, sim_number=2000,
def sampling(data, matrix_key = 'T_forward', cluster_key = 'louvain', max_steps=30, traj_number=500, sim_number=500,
end_point_probability=0.95, root_cell_probability=0.95, end_points=[], root_cells=[], end_clusters = [], root_clusters=[], min_clusters=3,
normalize=False, unique=True, num_cores=1, copy=False):

Expand All @@ -60,11 +61,11 @@ def sampling(data, matrix_key = 'T_forward', cluster_key = 'louvain', max_steps=
Annotated data matrix with transition probabilities, end points and clustering.
matrix_key: Key for accessing transition matrix stored in adata.uns
cluster_key: Clustering of cells to use for terminal region annotation.
max_steps: `integer` (default: 200)
max_steps: `integer` (default: 30)
Maximum number of steps in the simulation.
traj_number: `integer`(default: 1000)
traj_number: `integer`(default: 500)
Minimum number of samples generated for each terminal region.
sim_number: `integer`(default: 2000)
sim_number: `integer`(default: 500)
Initial number of samples generated for each terminal region.
end_point_probability: 'float' (default:0.95)
End point probability threshold at which a cell is considered to be a terminal state for samples.
Expand Down Expand Up @@ -174,12 +175,28 @@ def sampling(data, matrix_key = 'T_forward', cluster_key = 'louvain', max_steps=
if len(end_clusters) == 0:
try:
end_points = np.asarray(np.where(np.asarray(adata.obs["end_points"]) >= end_point_probability))[0]
#TODO
'''
# Recluster end points if based on probability
end_points_adata = adata[end_points]
scvelo.tl.louvain(end_points_adata)
adata.obs['updated_'+cluster_key] = adata.obs[cluster_key].astype(str).values
adata.obs.iloc[end_points, adata.obs.columns.get_loc('updated_'+cluster_key)] = 'End_'+ end_points_adata.obs['louvain'].astype(str).values
del end_points_adata
cluster_key = 'updated_'+cluster_key
clusters = adata.obs[cluster_key].astype(str).values
adata.uns['run_info'] = {'cluster_key': cluster_key}
'''

except:
raise ValueError('End points must be provided in adata.obs["end_points"]. Run cytopath.terminal_states')
else:
end_points = np.asarray(np.where(np.asarray(adata.obs[cluster_key].isin(end_clusters))))[0]
end_points = np.asarray(np.where(np.asarray(adata.obs[cluster_key].isin(end_clusters))))[0]

adata.uns['run_info']['end_points'] = end_points

end_point_clusters = adata.obs[cluster_key][end_points].astype(str).values
end_clusters_ = np.unique(end_point_clusters)

Expand All @@ -197,9 +214,6 @@ def sampling(data, matrix_key = 'T_forward', cluster_key = 'louvain', max_steps=
print("Sampling round " + str(count))
count+=1

# TODO: Add transition probability matrix update that removes transitions exclusive to end points sampled adequately
# to increase sampling efficiency for larger datasets.

# Parallelisation over the cells: 1 Thread for the samples of each cell.
if len(root_cells) == 1:
all_seq, prob_all_seq, cluster_seq_all = markov_sim(0, sim_number_, max_steps, root_cells, clusters, trans_matrix,
Expand All @@ -208,7 +222,6 @@ def sampling(data, matrix_key = 'T_forward', cluster_key = 'louvain', max_steps=
results = Parallel(n_jobs=num_cores)(delayed(markov_sim)(j, sim_number_, max_steps, root_cells, clusters, trans_matrix,
trans_matrix_indice_zeros,
trans_matrix_probabilites_zeros) for j in tqdm(range(len(root_cells))))

# Results are stored in np.arrays
all_seq = np.empty((len(root_cells)*sim_number_, max_steps), dtype=np.int32)
prob_all_seq = np.empty((len(root_cells)*sim_number_), dtype=np.float32)
Expand All @@ -225,6 +238,8 @@ def sampling(data, matrix_key = 'T_forward', cluster_key = 'louvain', max_steps=
indices = [idx for idx in np.arange(all_seq.shape[0]) if np.unique(cluster_seq_all[idx]).shape[0] >= min_clusters]
if len(indices) == 0:
raise ValueError('Zero samples obtained. Try increasing max_steps.')
elif len(indices) < 0.1*traj_number:
warnings.warn('Less than 10% of requested samples obtained. The process may run out of memory before converging.', ResourceWarning)
all_seq = [all_seq[i] for i in sorted(indices)]
prob_all_seq = [prob_all_seq[i] for i in sorted(indices)]
cluster_seq_all = [cluster_seq_all[i] for i in sorted(indices)]
Expand Down
14 changes: 11 additions & 3 deletions cytopath/trajectory_estimation/cyto_path.py
Original file line number Diff line number Diff line change
Expand Up @@ -301,14 +301,14 @@ def cytopath(adata, basis="umap", neighbors_basis='pca', surrogate_cell=False, f
adata.uns['trajectories']["cells_along_trajectories_each_step"] = np.rec.fromrecords(cells_along_trajectories,
dtype=[('End point', 'U32'), ('Trajectory', int),
('Step', int), ('Cell', int), ('Allignment Score', float)])
def estimate_cell_data(adata, groupby='median', weighted=False):
def estimate_cell_data(adata, groupby='mean', weighted=True):
"""Calculates the pseudotime per trajectory for each cell.
Arguments
---------
adata: :class:`~anndata.AnnData`
Annotated data matrix with end points. Must run cytopath.cytopath before.
groupby: str(default: max)
groupby: str (default: mean)
One of max_step, min_step, max, min, median or mean. Grouping method for determining the alignment score per cell per trajectory
weighted: Boolean (default:False)
If groupby is mean, reports mean pseduotime weighted by alignment score if true.
Expand All @@ -318,8 +318,16 @@ def estimate_cell_data(adata, groupby='median', weighted=False):
raise ValueError("Run cytopath.cytopath before estimating pseudotime.")

cytopath_data = pd.DataFrame(adata.uns['trajectories']['cells_along_trajectories_each_step'].copy())

if groupby == 'max_step':
cytopath_data_ = cytopath_data.loc[cytopath_data['Step']==cytopath_data.groupby(['End point', 'Trajectory', 'Cell'])["Step"].transform('max')]
cytopath_data_ = cytopath_data_.groupby(['End point', 'Trajectory', 'Cell']).mean()

elif groupby == 'min_step':
cytopath_data_ = cytopath_data.loc[cytopath_data['Step']==cytopath_data.groupby(['End point', 'Trajectory', 'Cell'])["Step"].transform('min')]
cytopath_data_ = cytopath_data_.groupby(['End point', 'Trajectory', 'Cell']).mean()

if groupby == 'max':
elif groupby == 'max':
cytopath_data_ = cytopath_data.loc[cytopath_data['Allignment Score']==cytopath_data.groupby(['End point', 'Trajectory', 'Cell'])["Allignment Score"].transform('max')]
cytopath_data_ = cytopath_data_.groupby(['End point', 'Trajectory', 'Cell']).mean()

Expand Down
6 changes: 3 additions & 3 deletions cytopath/trajectory_estimation/sample_clustering.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from sklearn.preprocessing import minmax_scale
from sklearn.cluster import AffinityPropagation, DBSCAN, KMeans, AgglomerativeClustering, OPTICS
from sknetwork.clustering import Louvain
from scipy import stats, spatial
from scipy import stats, spatial, sparse

# Function to define terminal regions for trajectories
def end_point_cluster(adata):
Expand Down Expand Up @@ -77,7 +77,7 @@ def preclustering(adata, all_seq_cluster, sequence_coordinates, basis="umap", di
distances[j+i,i] = haus

affinity=-distances
affinity = minmax_scale(affinity, axis=1)
affinity = sparse.csr_matrix(minmax_scale(affinity, axis=1))

# Perform clustering using hausdorff distance
print('Clustering using hausdorff distances')
Expand Down Expand Up @@ -141,7 +141,7 @@ def clustering(adata, sequence_coordinates, cluster_chains, cluster_strength, cl
average_cl_d[i,j] = hausdorff_distance(cluster_chains[i], cluster_chains[j], distance=distance)

average_cl_d_affinity=-average_cl_d
average_cl_d_affinity = minmax_scale(average_cl_d_affinity, axis=1)
average_cl_d_affinity = sparse.csr_matrix(minmax_scale(average_cl_d_affinity, axis=1))

print('Clustering using hausdorff distances')
if n_clusters==None:
Expand Down
4 changes: 2 additions & 2 deletions cytopath/trajectory_inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@
from .trajectory_estimation.cyto_path import cytopath, estimate_cell_data
from .trajectory_estimation.cytopath_merger import cytopath_merger

def trajectories(data, smoothing=False, alignment_cutoff=0.0, basis='umap', neighbors_basis='pca', fill_cluster=True, n_neighbors_cluster=30, cluster_freq=0.1,
groupby='median', weighted=False, surrogate_cell=False, n_neighbors_alignment='auto', cluster_num=1, method = 'kmeans', distance = 'cosine', num_cores=1, copy=False):
def trajectories(data, smoothing=False, alignment_cutoff=0.0, basis='umap', neighbors_basis='pca', fill_cluster=True, n_neighbors_cluster=30, cluster_freq=0.5,
groupby='mean', weighted=True, surrogate_cell=False, n_neighbors_alignment='auto', cluster_num=None, method = 'kmeans', distance = 'euclidean', num_cores=1, copy=False):

adata = data.copy() if copy else data

Expand Down

0 comments on commit 2b05201

Please sign in to comment.