# patent_scraping.py
import urllib.request as request
import urllib.error as error
import time
import json
import pickle
from random import sample

import pandas as pd
from tqdm import tqdm


def get_patentIDs(fp, n):
    """ Return a random sample of patent IDs from all patents in the SureChemBL dataset

    Args:
        fp (str): filepath to the pickled patent_ID_index dictionary
        n (int): number of patents to sample

    Returns:
        list: list of n patent IDs
    """
    with open(fp, "rb") as f:
        patent_id_dict = pickle.load(f)
    return sample(list(patent_id_dict), n)
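
# A minimal usage sketch (the pickle path matches the one referenced in main()
# and is assumed to exist):
#   ids = get_patentIDs("Data/CpdPatentIdsDates/patent_ID_index_dict.p", 100)
#   print(ids[:5])
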
def save_patentJSON(url, patent):
    """ Save the JSON record for a given patent in Data/Patents/

    Args:
        url (http.client.HTTPResponse): open response from the PubChem patent API
        patent (str): patent ID
    """
    patresp = json.loads(url.read().decode("latin-1"))

    # Serialize the parsed response and write it to Data/Patents/patent_<ID>.json
    json_object = json.dumps(patresp, indent=4)
    with open("Data/Patents/patent_{}.json".format(patent), "w") as outfile:
        outfile.write(json_object)
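
# For example, save_patentJSON(url, "US-12345678-A1") would write the parsed
# response to Data/Patents/patent_US-12345678-A1.json (hypothetical patent ID).
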
def get_patentData(patent):
    """ Download a JSON with patent information from the PubChem patent API

    Args:
        patent (str): patent ID
    """
    patentapi = "https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/data/patent/{}/JSON/".format(
        patent)
    try:
        url = request.urlopen(patentapi)
        save_patentJSON(url, patent)
    except error.HTTPError:
        # Wait 5 seconds and retry once before giving up
        print("Request to {} failed; retrying after 5 seconds".format(patentapi))
        time.sleep(5)
        try:
            url = request.urlopen(patentapi)
            save_patentJSON(url, patent)
        except error.HTTPError as err:
            print(err)
    except Exception as e:
        print(e)
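
# A minimal sketch of the sample-then-download flow (mirrors the commented-out
# block in main(); assumes the Data/Patents/ directory already exists):
#   patents = get_patentIDs("Data/CpdPatentIdsDates/patent_ID_index_dict.p", 10)
#   for patent in tqdm(patents):
#       get_patentData(patent)
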
def get_author_patents(name):
    """ Download a JSON of all patent records associated with a specific author

    NOTE: TAKES ~24 HOURS FOR FULL DATASET

    Args:
        name (str): name of the patent author to search
    """
    # The href is split into two parts so the name terms can be inserted in the middle
    href_1 = "https://pubchem.ncbi.nlm.nih.gov/sdq/sdqagent.cgi?infmt=json&outfmt=json&query={%22download%22:%22*%22,%22collection%22:%22patent%22,%22where%22:{%22ands%22:["
    href_2 = "]},%22order%22:[%22relevancescore,desc%22],%22start%22:1,%22limit%22:10000000}"

    # Each part of the name becomes its own comma-separated clause in the
    # query's "ands" list, wrapped in the URL-encoded quotes (%22) that
    # PubChem's query format requires
    names = name.split()
    href_1 += ",".join("{%22*%22:%22" + subname + "%22}" for subname in names)

    # Merge the full href and fetch the record (if something fails along the way, just skip it)
    href = href_1 + href_2
    try:
        get_url = request.urlopen(href)
        # Save the record as JSON in Data/Patents/Patent_Author_Records/
        with open("Data/Patents/Patent_Author_Records/" +
                  name.replace(" ", "_") + ".json", "w") as f:
            json_data = json.loads(get_url.read())
            json.dump(json_data, f, indent=4)
    except Exception:
        pass
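
# For reference, the URL-encoded query built above decodes to JSON along these
# lines ("John Smith" is a hypothetical author; %22 is a double quote):
#   {"download": "*", "collection": "patent",
#    "where": {"ands": [{"*": "John"}, {"*": "Smith"}]},
#    "order": ["relevancescore,desc"], "start": 1, "limit": 10000000}
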
def get_assignee_patents(name):
    """ Download a JSON of all patent records associated with a specific assignee

    NOTE: takes ~10 hours with the list of assignee names

    Args:
        name (str): name of the patent assignee to search
    """
    href_1 = "https://pubchem.ncbi.nlm.nih.gov/sdq/sdqagent.cgi?infmt=json&outfmt=json&query={%22download%22:%22*%22,%22collection%22:%22patent%22,%22where%22:{%22ands%22:[{%22*%22:%22"
    href_2 = "%22}]},%22order%22:[%22relevancescore,desc%22],%22start%22:1,%22limit%22:10000000}"

    # Search on the exact name minus its last token, with spaces URL-encoded -
    # probably should do this for authors too...
    name = " ".join(name.split()[:-1])
    name = name.replace(" ", "%20")

    # Merge the full href and fetch the record (if something fails along the way, just skip it)
    href = href_1 + name + href_2
    try:
        get_url = request.urlopen(href)
        # Save the record as JSON in Data/Patents/Patent_Assignee_Records/,
        # with underscores in place of spaces in the filename
        with open("Data/Patents/Patent_Assignee_Records/" +
                  name.replace("%20", "_") + ".json", "w") as f:
            json_data = json.loads(get_url.read())
            json.dump(json_data, f, indent=4)
    except Exception:
        pass
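
# Example of the name handling above ("Acme Chemical Corp" is hypothetical):
#   "Acme Chemical Corp" -> last token dropped -> query term "Acme%20Chemical"
#   -> record saved to Data/Patents/Patent_Assignee_Records/Acme_Chemical.json
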
def build_month_increments(start, stop):
    """ Build month increments in the form YYYY-MM

    Args:
        start (int): starting year
        stop (int): ending year (inclusive)

    Returns:
        list: list of strings in the form YYYY-MM (e.g., "1980-01")
    """
    return [
        "{}-{:02d}".format(year, month)
        for year in range(start, stop + 1)
        for month in range(1, 13)
    ]
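
# Example:
#   build_month_increments(1980, 1981)
#   -> ["1980-01", "1980-02", ..., "1980-12", "1981-01", ..., "1981-12"]
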
def get_all_cpds(patents):
    """ Get all compound IDs linked to the given patents, filtered down into a
    non-repeating list

    Args:
        patents (list): SureChemBL patent ID list
    """
    # Patent-compound edge files are stored one per month from 1980 through 2020
    months = build_month_increments(1980, 2020)
    files = [
        "Data/CpdPatentIdsDates/Patent_Cpd_Edges/patent_cpd_edges_" + month +
        ".p" for month in months
    ]

    # A set makes the membership test below O(1) instead of O(len(patents))
    patents = set(patents)

    cpds = []
    for file in tqdm(files):
        with open(file, "rb") as f:
            try:
                edges = pickle.load(f)
            except EOFError:
                edges = {}
        selected_patents = [p for p in edges.keys() if p in patents]
        for p in selected_patents:
            cpds.extend(edges[p])

    # Deduplicate compound IDs
    cpds = list(set(cpds))

    ## NOTE: the goal is ~1 million compounds (change the number of sampled
    ## patents otherwise)
    print(len(cpds))
    print(cpds[0:10])
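
# The loop above assumes each monthly edge file unpickles to a dict mapping a
# patent ID to a list of compound IDs, e.g. (hypothetical values):
#   {"US-12345678-A1": [101, 202, 303], ...}
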
def main():
    # #### Download n random patent IDs ####
    # fp = "Data/CpdPatentIdsDates/patent_ID_index_dict.p"
    # n = 10000
    # patents = get_patentIDs(fp, n)
    # for patent in tqdm(patents):
    #     get_patentData(patent)

    # #### Download patents associated with authors from the above n patents ####
    # names = pickle.load(file=open("Data/Patents/authors.p", "rb"))
    # for name in tqdm(names):
    #     get_author_patents(name)

    # #### Download patents associated with assignees from the above n patents ####
    # names = pickle.load(file=open("Data/Patents/assignees.p", "rb"))
    # for name in tqdm(names):
    #     get_assignee_patents(name)

    #### Get compounds associated with the authors' patents ####
    df = pd.read_csv("Data/Patents/patent_author_records.csv")

    ## Get all unique patents
    patents = set(df["ID"])

    # Randomly sample patents to make this computationally feasible (goal: ~1
    # million cpds); sample() needs a sequence, so convert the set to a list
    patents = sample(list(patents), 100000)
    get_all_cpds(patents)


if __name__ == "__main__":
    main()