-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfind_correlation2.py
45 lines (42 loc) · 1.7 KB
/
find_correlation2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
#!/usr/bin/python
'''
Return the pairwise correlation values between the columns of a
pandas DataFrame as one flat list (NaN values are dropped).
Columns should be cells and rows (index) should be genes.
Correlation options: 'pearson' (default), 'spearman', 'kendall'.
masking = False (default): the full correlation matrix is used.
masking = True: the upper half of the matrix is masked, including the
diagonal (100% self-correlation) line, so each pair is returned once.
'''
import pandas as pd
import numpy as np
def find_correlation(df=None, masking=False, correlation='pearson'):
    '''
    Return the pairwise column correlations of ``df`` as a flat list.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose columns are correlated against each other.  Every
        column must be convertible with ``pd.to_numeric``.  The caller's
        frame is not modified.
    masking : bool
        False (default): return every entry of the square correlation
        matrix, row by row (each pair twice, plus the diagonal).
        True: return only the entries strictly below the diagonal, row by
        row, so that each pair of columns appears exactly once and the
        self-correlations are left out.
    correlation : str
        Method forwarded to ``DataFrame.corr``: 'pearson' (default),
        'spearman' or 'kendall'.

    Returns
    -------
    list of float
        The selected correlation values with NaN entries removed.

    Raises
    ------
    ValueError
        Propagated from ``pd.to_numeric`` (a column holds values that
        cannot be parsed as numbers) or from ``DataFrame.corr`` (unknown
        correlation method).
    '''
    # Convert on a new frame instead of assigning back into `df`, so the
    # caller's data is left untouched.  `apply` walks the columns by
    # position, which also copes with duplicated column labels.
    numeric_df = df.apply(pd.to_numeric)

    # Compute the correlation matrix once.  `.values` replaces the
    # `as_matrix()` call, which no longer exists in pandas >= 1.0.
    corr_values = np.asarray(
        numeric_df.corr(method=correlation).values, dtype=float)

    # Only the literal True switches masking on (unchanged behaviour).
    if masking is True:
        # Strict lower triangle (k=-1 skips the diagonal).  The indices
        # come back in row-major order, the same order obtained by
        # flattening the matrix and discarding the masked cells.
        rows, cols = np.tril_indices_from(corr_values, k=-1)
        selected = corr_values[rows, cols]
    else:
        selected = corr_values.ravel()

    # Drop undefined correlations (e.g. from constant columns).
    return selected[~np.isnan(selected)].tolist()