-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrunAnalysis.py
136 lines (81 loc) · 3.95 KB
/
runAnalysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Jan 17 13:48:34 2023
@author: Ozan Ozisik
"""
import os
from enrichmentAnalysis import performEnrichmentAnalysis, applyOrsum
from NetworkAnalysis import NetworkAnalysis
from idMapping import getIDMappingDict, mapIDsInGeneGroupsDict
#### Parameters
## Choose the analyses you want to perform
runEnrichment=True
runAverageDistanceBetweenGroups=True
runRWROverlap=True
runDSD=False
rwrTopNumber=50
rwrOverlapBootstrapNumber=5 #set to a small number for people to try the code, 1000 takes time
orsumNumberOfTermsToPlot=20
averageDistanceBootstrapNumber=2000
resultFolder='Results'
enrichmentFolder=resultFolder+ os.sep+ 'EnrichmentResults'
summaryFolder=enrichmentFolder+os.sep+ 'Summary'
networkAnalysisFolder=resultFolder+ os.sep+ 'NetworkResults'
geneGroups=['DoGs', 'CoGs', 'PoGs']
pathsForGeneGroups={'DoGs':'./Data/DiseaseGenes/DoGs.txt',
'CoGs':'./Data/DiseaseGenes/CoGs.txt',
'PoGs':'./Data/DiseaseGenes/PoGs.txt'}
networkFiles=['Data/Networks/PPI_HiUnion_LitBM_APID_gene_names_190123.tsv',
'Data/Networks/Pathways_reactome_gene_names_190123.tsv',
'Data/Networks/Complexes_gene_names_190123.tsv']
#networkFiles=['Data/Networks/Biogrid-HSapiens-physical-4.4.222.txt']
geneIDMappingFile='Data/HGNC 230616.txt'
## The URL for the g:Profiler request.
## We performed the analysis with e109_eg56_p17_1d3191d (database updated on 29/03/2023)
gProfilerBaseUrl='https://biit.cs.ut.ee/gprofiler_archive3/e109_eg56_p17'
## If up-to-date data will be used,
## gProfilerBaseUrl='https://biit.cs.ut.ee/gprofiler'
## can be used. In that case the gmt files for orsum should also be updated.
#### Initialization
if not os.path.exists(resultFolder):
os.makedirs(resultFolder)
print('Result folder \"'+resultFolder+'\" is created.')
#### Reading DoGs, CoGs and PoGs
geneGroupsDict=dict()
allgenes=[]
for groupName, path in pathsForGeneGroups.items():
with open(path) as f:
geneGroupsDict[groupName]=[line.rstrip() for line in f]
allgenes=allgenes+geneGroupsDict[groupName]
#### Enrichment analysis
if runEnrichment:
if not os.path.exists(enrichmentFolder):
os.makedirs(enrichmentFolder)
print('Enrichment result folder \"'+enrichmentFolder+'\" is created.')
## Mapping gene symbols to ensembl IDs because g:Profiler discards genes
## which have ambiguity - which are mapped to different ensembl IDs
## by HGNC and NCBI. Here we use HGNC to do the mapping.
idMappingDict=getIDMappingDict(geneIDMappingFile,'Approved symbol', 'Ensembl ID(supplied by Ensembl)')
idMappedGeneGroupsDict=mapIDsInGeneGroupsDict(geneGroupsDict, idMappingDict)
sources=['GO:BP', 'GO:CC']
performEnrichmentAnalysis(idMappedGeneGroupsDict, enrichmentFolder, summaryFolder, sources=sources, gProfiler_base_url=gProfilerBaseUrl)
## The gmt file should be changed according to the data used for enrichment analysis.
for source in sources:
applyOrsum(gmt='Data/Annotation/hsapiens.'+source+'.name.gmt',
source=source,
geneGroupsDict=idMappedGeneGroupsDict,
summaryFolder=summaryFolder,
outputFolder='orsum'+source.replace(':',''),
numberOfTermsToPlot=orsumNumberOfTermsToPlot
)
#### Network analysis
if runAverageDistanceBetweenGroups or runDSD or runRWROverlap:
networkAnalysis=NetworkAnalysis(networkFiles, geneGroupsDict, networkAnalysisFolder)
if runAverageDistanceBetweenGroups:
networkAnalysis.calculateAverageDistanceBetweenGroups(averageDistanceBootstrapNumber)
if runDSD:
dfRWRScoresDict=networkAnalysis.calculateDiffusionStateDistance()
if runRWROverlap:
networkAnalysis.calculateRWRTopNodesOverlap(rwrTopNumber, orsumNumberOfTermsToPlot, rwrOverlapBootstrapNumber)
networkAnalysis.fLog.close()