-
Notifications
You must be signed in to change notification settings - Fork 0
/
obc.py
100 lines (77 loc) · 3.16 KB
/
obc.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
"""
"""
import xml.etree.ElementTree as ET
import requests
# Endpoint whose JSON response PubmedArticle.publication_in_obc scans for
# entries carrying a matching "pmid" or "doi" value.
# NOTE(review): this is a plain-http "devapi" host -- confirm whether an
# https / production URL should be used instead.
OBC_IDE_URL = "http://devapi.onbc.io/publications"
class PubmedArticle:
    """One PubMed article plus the identifiers used to look it up in OBC.

    Attributes:
        xml: The ``<PubmedArticle>`` element the instance was built from.
        pmid: Text of ``<ArticleId IdType="pubmed">``, or None if absent.
        doi: Text of ``<ArticleId IdType="doi">``, or None if absent.
    """

    # Maps the IdType attribute of <ArticleId> to the key used in the result.
    _ID_TYPE_TO_KEY = {"pubmed": "pmid", "doi": "doi"}

    def __init__(self, xml):
        """Build an article from a single ``<PubmedArticle>`` node.

        Args:
            xml: An ``xml.etree.ElementTree.Element`` for one PubmedArticle
                node (not the enclosing PubmedArticleSet). A ``str`` or
                ``bytes`` holding that node's markup is also accepted and is
                parsed first, so ``self.xml`` is always an Element.
        """
        if isinstance(xml, (str, bytes)):
            xml = ET.fromstring(xml)
        self.xml = xml
        data = self._parse_xml(xml)
        self.doi = data["doi"]
        self.pmid = data["pmid"]

    def _parse_xml(self, root):
        """Extract the identifiers of interest from a PubmedArticle element.

        Args:
            root: The ``<PubmedArticle>`` element (not a PubmedArticleSet).

        Returns:
            A dict with keys ``"pmid"`` and ``"doi"``. A value is None when
            the node has no ``PubmedData/ArticleIdList`` or no ArticleId of
            that type. If an IdType occurs more than once the last one wins.
        """
        result = {"pmid": None, "doi": None}
        id_list = root.find("PubmedData/ArticleIdList")
        # Compare with None explicitly: an Element without children is falsy.
        if id_list is None:
            return result
        for article_id in id_list.findall("ArticleId"):
            key = self._ID_TYPE_TO_KEY.get(article_id.get("IdType"))
            if key is not None:
                result[key] = article_id.text
        # An earlier draft read the same identifiers from MedlineCitation
        # (PMID and ELocationID with EIdType="doi"); ArticleIdList is simpler
        # because it holds both in one place.
        return result

    def _matching_id(self, entry):
        """Return the entry's id value that matches this article, else None.

        The pmid is checked before the doi. An identifier that is missing
        (None) on either side never counts as a match.
        """
        for key, own in (("pmid", self.pmid), ("doi", self.doi)):
            theirs = entry.get(key)
            if own is None or theirs is None:
                continue
            # The XML side is always text; the API's type is not visible
            # here, so compare string forms to tolerate numeric ids.
            if str(theirs) == str(own):
                return theirs
        return None

    def publication_in_obc(self, entries=None):
        """Return True if this article appears among the OBC publications.

        Each entry is checked against the pmid first, then the doi.

        Args:
            entries: Optional iterable of dict-like API entries (with
                ``"pmid"`` / ``"doi"`` keys). When omitted, the list is
                downloaded from ``OBC_IDE_URL``; pass it in when checking
                many articles so the download happens only once.

        Returns:
            True on the first matching entry, otherwise False. The matching
            id is printed, as before.
        """
        if entries is None:
            entries = _fetch_obc_publications()
        for entry in entries:
            matched = self._matching_id(entry)
            if matched is not None:
                print(matched)
                return True
        return False


def _fetch_obc_publications():
    """Download and decode the publication entries from ``OBC_IDE_URL``.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            non-success HTTP status.
    """
    # Without a timeout the request can block indefinitely.
    response = requests.get(OBC_IDE_URL, timeout=30)
    response.raise_for_status()
    return response.json()


def sort_xml(path):
    """Split the articles of a PubmedArticleSet file by presence in OBC.

    Args:
        path: Path of an XML file whose root's children are article nodes.

    Returns:
        A tuple ``(in_obc, not_in_obc)`` of PubmedArticle lists, each in
        document order. Nodes without a pmid or doi end up in ``not_in_obc``.
    """
    root = ET.parse(path).getroot()
    # Fetch the publication list once for the whole file instead of once per
    # article; skip the request entirely when there is nothing to check.
    entries = _fetch_obc_publications() if len(root) else []

    in_obc = []
    not_in_obc = []
    for node in root:
        article = PubmedArticle(node)
        if article.publication_in_obc(entries):
            in_obc.append(article)
        else:
            not_in_obc.append(article)
    return in_obc, not_in_obc
def write_to_xml(lst, path):
    """Write the XML of every article in *lst* to the file at *path*.

    Args:
        lst: Iterable of objects exposing an ``xml`` Element attribute
            (e.g. PubmedArticle instances).
        path: Destination file; opened in text write mode, so any existing
            content is replaced.

    Each element is serialized with ``ET.tostring`` and followed by a
    newline. The elements are written back to back with no enclosing root
    element.
    """
    # Serialize everything before touching the file so a failure here does
    # not leave a truncated file behind.
    chunks = []
    for article in lst:
        markup = ET.tostring(article.xml).decode('utf-8')
        chunks.append(markup + "\n")

    with open(path, 'w') as out_file:
        out_file.write("".join(chunks))
def _main():
    """Sort the sample PubMed file and write both result lists to disk."""
    found, missing = sort_xml('most_recent_publications_2019-03-19.xml')

    # Report how many articles landed in each bucket.
    found_count = len(found)
    missing_count = len(missing)
    print("In OBC: " + str(found_count))
    print("Not In OBC: " + str(missing_count))

    outputs = (
        (found, 'in_obc_test.xml'),
        (missing, 'not_in_obc_test.xml'),
    )
    for articles, target in outputs:
        write_to_xml(articles, target)


if __name__ == '__main__':
    _main()