-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathpageRank.py
138 lines (103 loc) · 3.85 KB
/
pageRank.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
import igraph as ig
import pickle
import subprocess
from functools import reduce
import pandas as pd
def build_month_list(start, end):
""" Builds a list of all months in a given range
Args:
start (int): year describing the start of the data
end (int): year describing the end of the data (inclusive)
Returns:
list: list of all update months in format "YYYY-MM"
"""
updates = []
for year in range(start, end + 1): # all years through the given end
for month in range(1, 13): #include 12 months
if month < 10:
updates.append(str(year) + "-0" + str(month))
else:
updates.append(str(year) + "-" + str(month))
return updates
def load_graph(month):
""" Loads a igraph graph (stored in pickle format)
Args:
fp (string): filepath to igraph graph
Returns:
igraph graph: Graph of 1 month of SureChemBL
"""
subprocess.run([
"rclone",
"copy",
"SureChemBL_Patents:Graphs/G_cpd_" + month + ".p",
"/scratch/jmalloy3/",
])
return pickle.load(file=open("/scratch/jmalloy3/G_cpd_" + month +
".p", "rb"))
def get_pageRank(G, month):
""" Calculates the pagerank of the graph of a given month
Args:
G (igraph Graph): graph of a specific month
month (str): month (to be used for file IO labeling)
"""
#Calculate pagerank
G.vs["pagerank"] = G.pagerank()
#Zip ChEMBL ids and pagerank values (don't care about the graph structure)
pickle.dump(list(zip(G.vs["name"], G.vs["pagerank"])),
file=open("/scratch/jmalloy3/pagerank_" + month + ".p", "wb"))
#Ensure pagerank analysis ends up in Google Drive
subprocess.run([
"rclone", "copy", "/scratch/jmalloy3/pagerank_" + month + ".p",
"SureChemBL_Patents:PageRank/"
])
#Remove files from scratch to keep it empty-ish
subprocess.run(["rm", "/scratch/jmalloy3/pagerank_" + month + ".p"])
subprocess.run(["rm", "/scratch/jmalloy3/G_cpd_" + month + ".p"])
def tidy_pageRank(month):
""" Tidyverse pagerank data
Args:
month (str): month of specific analysis
Returns:
pandas dataframe: dataframe containing SureChemBL id and pagerank value, for given month
"""
fp = "SureChemBL_Patents:PageRank/"
#Load pagerank results from GDrive
subprocess.run([
"rclone",
"copy",
fp + "pagerank_" + month + ".p",
"~/Patents/PageRank/",
])
#create dataframe from pagerank results, with columns "ID" and month
return pd.DataFrame(pickle.load(
file=open("~/Patents/PageRank/pagerank_" + month + ".p", "rb")),
columns=["ID", str(month)])
def create_dataframe(dataframes):
""" Build an overarching dataframe of all pagerank values
In the form ID: value1, value2, value3... where valueN is each month value
Args:
dataframes (list): list of dataframes containing ID and monthly pagerank values
Returns:
None (writes full dataframe & dropped nan dataframe to /scratch)
"""
#Merges dataframe
df = reduce(
lambda left, right: pd.merge(left, right, on=["ID"], how="outer"),
dataframes)
df.to_csv("/scratch/jmalloy3/pagerank_analysis.csv")
#Drops nans, so only compounds present in every month appear (may not be useful, but could be?)
df = df.dropna()
df.to_csv("scratch/jmalloy3/pagerank_lastingCpds.csv")
def main():
#Calculate pagerank for all months in dataset
months = build_month_list(1980, 2019)
# for month in months:
# G = load_graph(month)
# get_pageRank(G, month)
#Tidy data from pagerank
dataframes = []
for month in months:
dataframes.append(tidy_pageRank(month))
create_dataframe(dataframes)
if __name__ == "__main__":
main()