-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathATAC_HVF.py
76 lines (58 loc) · 2.94 KB
/
ATAC_HVF.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
'''
Description:
Select highly variable features for ATAC data.
Reference:
[1] https://github.com/colomemaria/epiScanpy/blob/0498043252c58b4c4187007c215dd6d2143d121b/episcanpy/preprocessing/_quality_control.py#L64
'''
from scipy.sparse import issparse
def cal_var(adata):
"""
Show distribution plots of cells sharing features and variability score.
"""
if issparse(adata.X):
adata.var['n_cells'] = adata.X.sum(axis=0).tolist()[0] ## double check this
adata.var['prop_shared_cells'] = adata.var['n_cells']/len(adata.obs_names.tolist())
adata.var['variability_score'] = [1-abs(n-0.5) for n in adata.var['prop_shared_cells']]
else:
adata.var['n_cells'] = adata.X.sum(axis=0)
adata.var['prop_shared_cells'] = adata.var['n_cells']/len(adata.obs_names.tolist())
adata.var['variability_score'] = [1-abs(n-0.5) for n in adata.var['prop_shared_cells']]
def select_var_feature(adata, min_score=0.5, nb_features=None, copy=False):
"""
This function computes a variability score to rank the most variable features across all cells.
Then it selects the most variable features according to either a specified number of features (nb_features) or a maximum variance score (max_score).
Parameters
----------
adata: adata object
min_score: minimum threshold variability score to retain features,
where 1 is the score of the most variable features and 0.5 is the score of the least variable features.
nb_features: default value is None, if specify it will select a the top most variable features.
if the nb_features is larger than the total number of feature, it filters based on the max_score argument
show: default value True, it will plot the distribution of var.
copy: return a new adata object if copy == True.
Returns
-------
Depending on ``copy``, returns a new AnnData object or overwrite the input
"""
if copy:
inplace=False
else:
inplace=True
adata = adata.copy() if not inplace else adata
# calculate variability score
cal_var(adata) # adds variability score for each feature
# adata.var['variablility_score'] = abs(adata.var['prop_shared_cells']-0.5)
var_annot = adata.var.sort_values(ascending=False, by ='variability_score')
selected_vars = var_annot.iloc[:nb_features]
# calculate the max score to get a specific number of feature
if nb_features != None and nb_features < len(adata.var_names):
min_score = var_annot['variability_score'][nb_features]
adata_tmp = adata[:,adata.var['variability_score']>=min_score].copy()
## return the filtered AnnData objet.
if not inplace:
# adata_tmp = adata[:,adata.var['variability_score']>=min_score]
adata_tmp = adata[:,selected_vars.index.values]
return(adata_tmp)
else:
# adata._inplace_subset_var(adata.var['variability_score']>=min_score)
adata._inplace_subset_var(selected_vars.id.values)