[MRG] Dataverse content provider #739
@@ -0,0 +1,129 @@
import os
import json
import shutil

from urllib.request import Request
from urllib.parse import urlparse, urlunparse, parse_qs

from .doi import DoiProvider
from ..utils import copytree, deep_get


class Dataverse(DoiProvider):
    """Provide contents of a Dataverse dataset."""

    def __init__(self):
        data_file = os.path.join(os.path.dirname(__file__), "dataverse.json")
        with open(data_file, "r") as fp:
            self.hosts = json.load(fp)["installations"]
        super().__init__()

    def detect(self, doi, ref=None, extra_args=None):
        """Trigger this provider for things that resolve to a Dataverse dataset.

        Handles:
        - DOI pointing to {siteURL}/dataset.xhtml?persistentId={persistentId}
        - DOI pointing to {siteURL}/file.xhtml?persistentId={persistentId}&...
        - URL {siteURL}/api/access/datafile/{fileId}

        Examples:
        - https://dataverse.harvard.edu/api/access/datafile/3323458
        - doi:10.7910/DVN/6ZXAGT
        - doi:10.7910/DVN/6ZXAGT/3YRRYJ
        """
        url = self.doi2url(doi)
        # Parse the URL to get the base for later API calls
        parsed_url = urlparse(url)

        # Check if the URL matches any known Dataverse installation, bail if not.
        host = next(
            (
                host
                for host in self.hosts
                if urlparse(host["url"]).netloc == parsed_url.netloc
            ),
            None,
        )
        if host is None:
            return

        query_args = parse_qs(parsed_url.query)

        # Corner case handling
        if parsed_url.path.startswith("/file.xhtml"):
            # There is no way to get file information from its persistentId alone;
            # the only thing we can do is assume the DOI is structured as
            # "doi:<dataset_doi>/<file_doi>" and try to handle the dataset that way.
            new_doi = doi.rsplit("/", 1)[0]
            if new_doi == doi:
                # tough luck :( Avoid infinite recursion and exit.
                return
            return self.detect(new_doi)
        elif parsed_url.path.startswith("/api/access/datafile"):
            # A raw URL pointing to a datafile is a typical output of an External Tool integration
            entity_id = os.path.basename(parsed_url.path)
            search_query = "q=entityId:" + entity_id + "&type=file"
            # Knowing the file identifier, query the search API to find the parent dataset
            search_url = urlunparse(
                parsed_url._replace(path="/api/search", query=search_query)
            )
            self.log.debug("Querying Dataverse: " + search_url)
            resp = self.urlopen(search_url).read()
            data = json.loads(resp.decode("utf-8"))["data"]
            if data["count_in_response"] != 1:
                self.log.debug(
                    "Dataverse search query failed!\n - doi: {}\n - url: {}\n - resp: {}\n".format(
                        doi, url, json.dumps(data)
                    )
                )
                return

            self.record_id = deep_get(data, "items.0.dataset_persistent_id")
[Review comment] Could there be a case where more than 1 item will be returned? Should you warn about that here?
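One way to make that case visible (an illustration only, not part of the diff): the existing `!= 1` branch could log at warning level instead of debug, so a zero-item or multi-item response is surfaced to users. This assumes `self.log.warning` is available, which seems reasonable given the provider already calls `self.log.debug`:

            if data["count_in_response"] != 1:
                # more than one (or zero) matching items is unexpected; make it visible
                self.log.warning(
                    "Dataverse search returned {} results for entityId {}".format(
                        data["count_in_response"], entity_id
                    )
                )
                return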
        elif (
            parsed_url.path.startswith("/dataset.xhtml")
            and "persistentId" in query_args
        ):
            self.record_id = deep_get(query_args, "persistentId.0")

        if hasattr(self, "record_id"):
            return {"record": self.record_id, "host": host}

    def fetch(self, spec, output_dir, yield_output=False):
        """Fetch and unpack a Dataverse dataset."""
        record_id = spec["record"]
        host = spec["host"]

        yield "Fetching Dataverse record {}.\n".format(record_id)
        req = Request(
            "{}/api/datasets/:persistentId?persistentId={}".format(
                host["url"], record_id
            ),
            headers={"accept": "application/json"},
        )
        resp = self.urlopen(req)
        record = json.loads(resp.read().decode("utf-8"))["data"]

        for fobj in deep_get(record, "latestVersion.files"):
            file_url = "{}/api/access/datafile/{}".format(
                host["url"], deep_get(fobj, "dataFile.id")
            )
            filename = os.path.join(fobj.get("directoryLabel", ""), fobj["label"])

            file_ref = {"download": file_url, "filename": filename}
            fetch_map = {key: key for key in file_ref.keys()}

            for line in self.fetch_file(file_ref, fetch_map, output_dir):
                yield line
        new_subdirs = os.listdir(output_dir)
        # if there is only one new subdirectory move its contents
        # to the top level directory
        if len(new_subdirs) == 1 and os.path.isdir(
            os.path.join(output_dir, new_subdirs[0])
        ):
            d = new_subdirs[0]
            copytree(os.path.join(output_dir, d), output_dir)
            shutil.rmtree(os.path.join(output_dir, d))

    @property
    def content_id(self):
        """The Dataverse persistent identifier."""
        return self.record_id
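For orientation (not part of the diff), a minimal sketch of how this provider is driven, based only on the `detect()`/`fetch()` interface shown above; the DOI comes from the docstring examples and the output directory path is an arbitrary placeholder:

from repo2docker.contentproviders import Dataverse

dv = Dataverse()
spec = dv.detect("doi:10.7910/DVN/6ZXAGT")  # returns {"record": ..., "host": ...} or None
if spec is not None:
    for msg in dv.fetch(spec, "/tmp/dataverse-example"):
        print(msg, end="")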
@@ -0,0 +1,144 @@
import json
import os
import pytest

from io import BytesIO
from tempfile import TemporaryDirectory
from unittest.mock import patch
from urllib.request import urlopen, Request

from repo2docker.contentproviders import Dataverse


test_dv = Dataverse()
harvard_dv = next((_ for _ in test_dv.hosts if _["name"] == "Harvard Dataverse"))
cimmyt_dv = next((_ for _ in test_dv.hosts if _["name"] == "CIMMYT Research Data"))
test_hosts = [
    (
        [
            "doi:10.7910/DVN/6ZXAGT/3YRRYJ",
            "10.7910/DVN/6ZXAGT",
            "https://dataverse.harvard.edu/api/access/datafile/3323458",
            "hdl:11529/10016",
        ],
        [
            {"host": harvard_dv, "record": "doi:10.7910/DVN/6ZXAGT"},
            {"host": cimmyt_dv, "record": "hdl:11529/10016"},
        ],
    )
]
test_responses = {
    "doi:10.7910/DVN/6ZXAGT/3YRRYJ": (
        "https://dataverse.harvard.edu/file.xhtml"
        "?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ"
    ),
    "doi:10.7910/DVN/6ZXAGT": (
        "https://dataverse.harvard.edu/dataset.xhtml"
        "?persistentId=doi:10.7910/DVN/6ZXAGT"
    ),
    "10.7910/DVN/6ZXAGT": (
        "https://dataverse.harvard.edu/dataset.xhtml"
        "?persistentId=doi:10.7910/DVN/6ZXAGT"
    ),
    "https://dataverse.harvard.edu/api/access/datafile/3323458": "https://dataverse.harvard.edu/api/access/datafile/3323458",
    "hdl:11529/10016": "https://data.cimmyt.org/dataset.xhtml?persistentId=hdl:11529/10016",
}
test_search = {
    "data": {
        "count_in_response": 1,
        "items": [{"dataset_persistent_id": "doi:10.7910/DVN/6ZXAGT"}],
    }
}


@pytest.mark.parametrize("test_input, expected", test_hosts)
def test_detect_dataverse(test_input, expected):
    def doi_resolver(url):
        return test_responses.get(url)

    with patch.object(Dataverse, "urlopen") as fake_urlopen, patch.object(
        Dataverse, "doi2url", side_effect=doi_resolver
    ) as fake_doi2url:
        fake_urlopen.return_value.read.return_value = json.dumps(test_search).encode()
        # valid Dataverse DOIs trigger this content provider
        assert Dataverse().detect(test_input[0]) == expected[0]
        assert fake_doi2url.call_count == 2  # File, then dataset
        assert Dataverse().detect(test_input[1]) == expected[0]
        assert Dataverse().detect(test_input[2]) == expected[0]
        # only two of the three calls above have to resolve a DOI
        assert fake_urlopen.call_count == 1
        assert Dataverse().detect(test_input[3]) == expected[1]

    with patch.object(Dataverse, "urlopen") as fake_urlopen:
        # Don't trigger the Dataverse content provider
        assert Dataverse().detect("/some/path/here") is None
        assert Dataverse().detect("https://example.com/path/here") is None
        # don't handle DOIs that aren't from Dataverse
        fake_urlopen.return_value.url = (
            "http://joss.theoj.org/papers/10.21105/joss.01277"
        )
        assert Dataverse().detect("https://doi.org/10.21105/joss.01277") is None


@pytest.fixture
def dv_files(tmpdir):

    f1 = tmpdir.join("some-file.txt")
    f1.write("some content")

    f2 = tmpdir.mkdir("directory").join("some-other-file.txt")
    f2.write("some other content")

    f3 = tmpdir.join("directory").mkdir("subdirectory").join("the-other-file.txt")
    f3.write("yet another content")

    return [f1, f2, f3]


def test_dataverse_fetch(dv_files):
    mock_response_ds_query = BytesIO(
        json.dumps(
            {
                "data": {
                    "latestVersion": {
                        "files": [
                            {"dataFile": {"id": 1}, "label": "some-file.txt"},
                            {
                                "dataFile": {"id": 2},
                                "label": "some-other-file.txt",
                                "directoryLabel": "directory",
                            },
                            {
                                "dataFile": {"id": 3},
                                "label": "the-other-file.txt",
                                "directoryLabel": "directory/subdirectory",
                            },
                        ]
                    }
                }
            }
        ).encode("utf-8")
    )
    spec = {"host": harvard_dv, "record": "doi:10.7910/DVN/6ZXAGT"}

    dv = Dataverse()

    def mock_urlopen(self, req):
        if isinstance(req, Request):
            return mock_response_ds_query
        else:
            file_no = int(req.split("/")[-1]) - 1
            return urlopen("file://{}".format(dv_files[file_no]))

    with patch.object(Dataverse, "urlopen", new=mock_urlopen):
        with TemporaryDirectory() as d:
            output = []
            for l in dv.fetch(spec, d):
                output.append(l)

            unpacked_files = set(os.listdir(d))
            expected = set(["directory", "some-file.txt"])
            assert expected == unpacked_files
            assert os.path.isfile(
                os.path.join(d, "directory", "subdirectory", "the-other-file.txt")
            )
[Review comment] To check I parsed the above lines properly (the use of next() to handle the "didn't work" case made me read it a few times), is the following code equivalent (I think yes):
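The reviewer's snippet is not included in this excerpt. As an illustration only, an explicit-loop version of the host lookup in detect() that should behave the same as the next(..., None) expression might look like:

host = None
for h in self.hosts:
    if urlparse(h["url"]).netloc == parsed_url.netloc:
        host = h
        break
if host is None:
    return

Both versions take the first installation whose netloc matches the parsed URL and fall back to None (and therefore an early return) when nothing matches.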