-
Notifications
You must be signed in to change notification settings - Fork 0
/
obc_edited.py
148 lines (121 loc) · 5.32 KB
/
obc_edited.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
"""
"""
import xml.etree.ElementTree as ET
import requests
import datetime as dt
import os
# URL of the OBC IDE "publications" API; its JSON response is what articles
# are matched against.
OBC_IDE_URL = "http://devapi.onbc.io/publications"

# Absolute path of the directory that contains this script; the script entry
# point writes its output files here.
path1 = os.path.dirname(os.path.abspath(__file__))
class PubmedArticle():
    """A PubMed article reduced to the identifiers used for OBC matching.

    Attributes:
        xml: The ``xml.etree.ElementTree`` element the article was built from.
        pmid: Text of ``MedlineCitation/PMID``, or ``None`` when absent.
        doi: Text of the ``MedlineCitation/Article/ELocationID`` element whose
            ``EIdType`` attribute is ``"doi"``, or ``None`` when absent.
    """

    # Seconds to wait for the publications API before giving up, so a stalled
    # server cannot hang the script forever.
    _REQUEST_TIMEOUT_SECONDS = 30

    def __init__(self, xml):
        """Store *xml* and extract the PMID and DOI from it.

        Args:
            xml: An already-parsed ElementTree element (not an XML string),
                normally a ``<PubmedArticle>`` node.
        """
        self.xml = xml
        data = self._parse_xml(xml)
        self.doi = data["doi"]
        self.pmid = data["pmid"]

    def _parse_xml(self, root):
        """Return ``{"pmid": ..., "doi": ...}`` extracted from *root*.

        Args:
            root: Either a ``<PubmedArticle>`` element or an element that has
                ``<PubmedArticle>`` children (e.g. ``<PubmedArticleSet>``).
                In the latter case the last child carrying a value wins.

        Returns:
            A dict with keys ``"pmid"`` and ``"doi"``; a value is ``None``
            when the corresponding element is not present.
        """
        result = {"pmid": None, "doi": None}

        # When handed the article node itself, search it directly: looking
        # for 'PubmedArticle' children of a PubmedArticle finds nothing.
        if root.tag == "PubmedArticle":
            articles = [root]
        else:
            articles = root.findall("PubmedArticle")

        for article in articles:
            for pmid in article.findall("MedlineCitation/PMID"):
                result["pmid"] = pmid.text
            for location in article.findall("MedlineCitation/Article/ELocationID"):
                # The EIdType attribute lives on the ELocationID element.
                if location.get("EIdType") == "doi":
                    result["doi"] = location.text
        return result

    @staticmethod
    def _normalize_id(value, ignore_case=False):
        """Return *value* as a stripped string, or ``None`` if missing/blank."""
        if value is None:
            return None
        text = str(value).strip()
        if not text:
            return None
        return text.lower() if ignore_case else text

    def publication_in_obc(self, entries=None):
        """Return True if this article appears in the OBC publications list.

        Each entry is checked against the PMID first, then against the DOI.
        An identifier that is missing on either side never counts as a match.

        Args:
            entries: Optional iterable of publication dicts to search. When
                omitted, the list is fetched from ``OBC_IDE_URL``. Passing it
                lets a caller fetch the list once for many articles.
                NOTE(review): entries are assumed to be dicts that may hold
                ``"pmid"`` and ``"doi"`` keys -- confirm against the API.

        Returns:
            True when a matching entry is found, False otherwise.

        Raises:
            requests.RequestException: If the list has to be fetched and the
                request fails, times out, or returns an HTTP error status.
        """
        pmid = self._normalize_id(self.pmid)
        # DOI names are case-insensitive, so compare them lower-cased.
        doi = self._normalize_id(self.doi, ignore_case=True)
        if pmid is None and doi is None:
            # Nothing to match on; avoid a pointless network round trip.
            return False

        if entries is None:
            response = requests.get(
                OBC_IDE_URL, timeout=self._REQUEST_TIMEOUT_SECONDS
            )
            response.raise_for_status()
            entries = response.json()

        for entry in entries:
            # Normalizing through str() also matches a numeric pmid in the
            # JSON against the string read from the XML.
            if pmid is not None and pmid == self._normalize_id(entry.get("pmid")):
                return True
            if doi is not None and doi == self._normalize_id(
                entry.get("doi"), ignore_case=True
            ):
                return True
        return False
def sort_xml(path):
    """Split the articles of a PubMed XML file by presence in OBC.

    Every child of the document root is wrapped in a ``PubmedArticle`` and
    checked with ``publication_in_obc()``. For articles that are not in OBC,
    their grant ids are additionally indexed.

    Args:
        path: Path of the XML file to read; its root is expected to be a
            ``<PubmedArticleSet>`` node.

    Returns:
        A tuple ``(in_obc, not_in_obc, paper)``:
            in_obc: ``PubmedArticle`` objects found in OBC.
            not_in_obc: ``PubmedArticle`` objects not found in OBC.
            paper: ``defaultdict(list)`` mapping a grant id to the PMID text
                of every not-in-OBC article that lists it (one entry per
                ``GrantID`` element). The PMID is ``None`` if the article has
                no ``MedlineCitation/PMID`` element.
    """
    from collections import defaultdict

    in_obc = []
    not_in_obc = []
    paper = defaultdict(list)

    # Loop through the children of the <PubmedArticleSet> root node.
    for node in ET.parse(path).getroot():
        article = PubmedArticle(node)
        if article.publication_in_obc():
            in_obc.append(article)
            continue

        not_in_obc.append(article)
        # Look the PMID up by path instead of assuming it is the first child
        # of <MedlineCitation>, which fails when that element is missing.
        pmid = node.findtext('./MedlineCitation/PMID')
        for grant in node.findall('./MedlineCitation/Article/GrantList/Grant/GrantID'):
            text = grant.text
            if text is None:
                # An empty <GrantID/> carries nothing to index.
                continue
            parts = text.split(" ")
            # A three-token id is keyed by its last two tokens joined
            # together (the first token is dropped); any other id is keyed
            # by its text unchanged.
            key = parts[1] + parts[2] if len(parts) == 3 else text
            paper[key].append(pmid)
    return in_obc, not_in_obc, paper
def write_to_xml(lst, path):
    """Write a list of articles to *path* inside a <PubmedArticleSet> element.

    Args:
        lst: Iterable of objects with an ``xml`` attribute holding an
            ElementTree element (e.g. ``PubmedArticle`` instances).
        path: Destination file path; an existing file is overwritten.

    Returns:
        None.

    The output has no XML declaration or DOCTYPE. Each article is written on
    its own line between the opening and closing ``<PubmedArticleSet>`` tags.
    """
    # Serialize everything before opening the file, so a serialization error
    # does not leave a truncated file behind.
    serialized = [ET.tostring(article.xml).decode('utf-8') for article in lst]
    with open(path, 'w', encoding='utf-8') as f:
        f.write("<PubmedArticleSet>\n")
        f.writelines(text + "\n" for text in serialized)
        f.write("\n</PubmedArticleSet>")
#def get_xml():
# today=dt.datetime.today().strftime(("%Y-%m-%d"))
# filepath=path1+"\\"+("most_recent_publications_{}.xml".format(today))
# in_obc, not_in_obc,paper = sort_xml(filepath)
# print("In OBC: " + str(len(in_obc)))
# print("Not In OBC: " + str(len(not_in_obc)))
# write_to_xml(in_obc, path1+"\\"+'in_obc.xml')
# write_to_xml(not_in_obc, path1+"\\"+'not_in_obc.xml')
# return paper
if __name__ == '__main__':
    # Script entry point: sort one PubMed XML export and write the two
    # resulting article lists next to this script.
    import sys

    # Input file: first command-line argument if given, otherwise the
    # original hard-coded export path (kept as the default).
    default_filepath = 'C:\\Users\\AKSHATA\\Desktop\\OPS_Job\\obc-pubmed-master\\most_recent_publications_2019-03-19.xml'
    filepath = sys.argv[1] if len(sys.argv) > 1 else default_filepath

    in_obc, not_in_obc, paper = sort_xml(filepath)
    print("In OBC: " + str(len(in_obc)))
    print("Not In OBC: " + str(len(not_in_obc)))

    # os.path.join picks the right separator for the platform; a literal
    # "\\" only forms a directory path on Windows.
    write_to_xml(in_obc, os.path.join(path1, 'in_obc.xml'))
    write_to_xml(not_in_obc, os.path.join(path1, 'not_in_obc.xml'))