-
Notifications
You must be signed in to change notification settings - Fork 2
/
08_multilayer_analysis.py
132 lines (110 loc) · 7.36 KB
/
08_multilayer_analysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
"""
Script responsible to run the analysis in the multilayer section of the paper.
"""
import argparse
import pickle
import pandas as pd
COMBINATIONS = {
'all_tissues': ['Adipose_Subcutaneous', 'Adipose_Visceral_Omentum', 'Adrenal_Gland', 'Artery_Aorta',
'Artery_Coronary', 'Artery_Tibial', 'Brain_Amygdala', 'Brain_Anterior_cingulate', 'Brain_Caudate',
'Brain_Cerebellar', 'Brain_Cerebellum', 'Brain_Cortex', 'Brain_Frontal_Cortex', 'Brain_Hippocampus',
'Brain_Hypothalamus', 'Brain_Nucleus', 'Brain_Putamen', 'Brain_Spinal_cord',
'Brain_Substantia_nigra', 'Breast_Mammary_Tissue', 'Cells_Cultured', 'Cells_EBV', 'Colon_Sigmoid',
'Colon_Transverse', 'Esophagus_Gastro', 'Esophagus_Mucosa', 'Esophagus_Muscularis', 'Heart_Atrial',
'Heart_L_Vent', 'Kidney_Cortex', 'Liver', 'Lung', 'Minor_Salivary', 'Muscle_Skeletal',
'Nerve_Tibial', 'Ovary', 'Pancreas', 'Pituitary', 'Prostate', 'Skin_Not_Sun_Epsd', 'Skin_Sun_Epsd',
'Small_Intestine', 'Spleen', 'Stomach', 'Testis', 'Thyroid', 'Uterus', 'Vagina', 'Whole_Blood'],
'only_brain': ['Brain_Amygdala', 'Brain_Anterior_cingulate', 'Brain_Caudate', 'Brain_Cerebellar',
'Brain_Cerebellum', 'Brain_Cortex', 'Brain_Frontal_Cortex', 'Brain_Hippocampus',
'Brain_Hypothalamus', 'Brain_Nucleus', 'Brain_Putamen', 'Brain_Spinal_cord',
'Brain_Substantia_nigra'],
'non_brain': ['Adipose_Subcutaneous', 'Adipose_Visceral_Omentum', 'Adrenal_Gland', 'Artery_Aorta',
'Artery_Coronary', 'Artery_Tibial', 'Breast_Mammary_Tissue', 'Cells_Cultured', 'Cells_EBV',
'Colon_Sigmoid', 'Colon_Transverse', 'Esophagus_Gastro', 'Esophagus_Mucosa', 'Esophagus_Muscularis',
'Heart_Atrial', 'Heart_L_Vent', 'Kidney_Cortex', 'Liver', 'Lung', 'Minor_Salivary', 'Muscle_Skeletal',
'Nerve_Tibial', 'Ovary', 'Pancreas', 'Pituitary', 'Prostate', 'Skin_Not_Sun_Epsd', 'Skin_Sun_Epsd',
'Small_Intestine', 'Spleen', 'Stomach', 'Testis', 'Thyroid', 'Uterus', 'Vagina', 'Whole_Blood'],
'brain_and_wholeblood': ['Brain_Amygdala', 'Brain_Anterior_cingulate', 'Brain_Caudate', 'Brain_Cerebellar',
'Brain_Cerebellum', 'Brain_Cortex', 'Brain_Frontal_Cortex', 'Brain_Hippocampus',
'Brain_Hypothalamus', 'Brain_Nucleus', 'Brain_Putamen', 'Brain_Spinal_cord',
'Brain_Substantia_nigra', 'Whole_Blood'],
'brain_and_intestines': ['Brain_Amygdala', 'Brain_Anterior_cingulate', 'Brain_Caudate', 'Brain_Cerebellar',
'Brain_Cerebellum', 'Brain_Cortex', 'Brain_Frontal_Cortex', 'Brain_Hippocampus',
'Brain_Hypothalamus', 'Brain_Nucleus', 'Brain_Putamen', 'Brain_Spinal_cord',
'Brain_Substantia_nigra', 'Colon_Sigmoid', 'Colon_Transverse', 'Small_Intestine']
}
# This function return the global multilayer matrix for a multilayer matrix
def global_community_multilayer_matrix(partitions, names):
# Create an empty pandas dataframe with the names of columns and rows given by the genes names
globalmultiplexity = pd.DataFrame(0, columns=names, index=names)
for j in range(0, len(names)):
for k in range(j + 1, len(names)):
for partition in partitions:
if names[j] in partition and names[k] in partition:
if partition[names[j]] == partition[names[k]]:
globalmultiplexity.at[names[j], names[k]] += 1
return globalmultiplexity
def dictionary_from_multilayer_matrix(gl2, value):
# Dictionary where I will save the output of the community detection
mydict = {}
# In this list I will save the values that I wanna put in the dictionary
# in this way I can check if I already have a gene and I don't have double genes there
list_values = []
for i in range(0, gl2.shape[0]):
# Create a dictionary where for each gene
# I have the list of the other genes that are with him tot number of times
# This will give the name of the row, i.e. of the gene of interest in a certain row that I want to consider
row_name = gl2.iloc[i, :].name
# Look for the vector where a certain condition is created
test = gl2.iloc[i, :] == value
single_list = []
# With this I can check if the value that I am considering in the i=th row of the matrix
# is already there or not
if row_name not in list_values:
for j in range(0, gl2.shape[1]):
if test[j] == True:
single_list.append(gl2.columns.values[j])
list_values.append(gl2.columns.values[j])
if len(single_list) > 0:
mydict[row_name] = single_list
return mydict
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument("--combination",
help="Which combination number to try in the array COMBINATIONS defined at the beginning of this file")
args = parser.parse_args()
combination = args.combination
print("Going with", combination)
# Create an empty list where I will save all the dictionaries of the various genes
list_dic = []
# Going over the tissues in that combination
for tissue_name in COMBINATIONS[combination]:
print("Reading", tissue_name)
# Then I can obtain the correlation matrix relative to a given tissue, that we have already obtained before
# Read the correlation matrix file
corr_mat = pd.read_pickle("data/corr_" + tissue_name + ".pkl")
# The columns of the correlation matrix will tell me the name of the genes that are in each community
genes = corr_mat.columns.values
# "m" gives the community number/id for each gene.
# "m" is basically an array with the same length as genes variable and each element has a number
# corresponding to the community number id
m, _ = pickle.load(open("results/louvain_modules_" + tissue_name + ".pkl", "rb"))
# At this point I can create a dictionary to then pass it to the global multiplexity index code
dictionary = dict(zip(genes, m))
# Append the dictionary each time at the total list_dic element
list_dic.append(dictionary)
# Create a list that will be needed to save the names to give to the global community multilayer matrix
names_total = []
for i in range(0, len(list_dic)):
name = list(list_dic[i].keys())
names_total = list(set(names_total + name))
global_multiplexity_matrix = global_community_multilayer_matrix(list_dic, names_total)
# Save the global multiplexity matrix
global_multiplexity_matrix.to_csv("results/global_multiplexity_" + combination + "_all.csv")
# Create a dictionary where I save the genes that always appear together across the various layers
dict_mat = dictionary_from_multilayer_matrix(global_multiplexity_matrix, len(COMBINATIONS[combination]))
# WRITE THE FINAL FILE, WHERE EACH ROW IS A SET OF GENES THAT ALWAYS APPEAR TOGETHER
# Change the name of the file accordingly
with open("results/global_multiplexity_" + combination + "_genes.csv", 'w') as file:
for key, val in dict_mat.items():
file.write(key + "\t" + str(val) + "\n")