-
Notifications
You must be signed in to change notification settings - Fork 360
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[MRG] Dataverse content provider (#739)
[MRG] Dataverse content provider
- Loading branch information
Showing
7 changed files
with
308 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,129 @@ | ||
import os | ||
import json | ||
import shutil | ||
|
||
from urllib.request import Request | ||
from urllib.parse import urlparse, urlunparse, parse_qs | ||
|
||
from .doi import DoiProvider | ||
from ..utils import copytree, deep_get | ||
|
||
|
||
class Dataverse(DoiProvider):
    """Provide contents of a Dataverse dataset."""

    def __init__(self):
        # Registry of known Dataverse installations, shipped next to this
        # module as dataverse.json under the "installations" key.
        data_file = os.path.join(os.path.dirname(__file__), "dataverse.json")
        with open(data_file, "r") as fp:
            self.hosts = json.load(fp)["installations"]
        super().__init__()

    def detect(self, doi, ref=None, extra_args=None):
        """Trigger this provider for things that resolve to a Dataverse dataset.

        Handles:
        - DOI pointing to {siteURL}/dataset.xhtml?persistentId={persistentId}
        - DOI pointing to {siteURL}/file.xhtml?persistentId={persistentId}&...
        - URL {siteURL}/api/access/datafile/{fileId}

        Examples:
        - https://dataverse.harvard.edu/api/access/datafile/3323458
        - doi:10.7910/DVN/6ZXAGT
        - doi:10.7910/DVN/6ZXAGT/3YRRYJ

        Returns a spec dict ``{"record": ..., "host": ...}`` when the DOI/URL
        points at a known Dataverse installation, otherwise ``None``.
        """
        url = self.doi2url(doi)
        # Parse the url, to get the base for later API calls
        parsed_url = urlparse(url)

        # Check if the url matches any known Dataverse installation, bail if not.
        host = next(
            (
                host
                for host in self.hosts
                if urlparse(host["url"]).netloc == parsed_url.netloc
            ),
            None,
        )
        if host is None:
            return

        query_args = parse_qs(parsed_url.query)

        # Corner case handling
        if parsed_url.path.startswith("/file.xhtml"):
            # There's no way of getting file information using its persistentId, the only thing we can do is assume that doi
            # is structured as "doi:<dataset_doi>/<file_doi>" and try to handle dataset that way.
            new_doi = doi.rsplit("/", 1)[0]
            if new_doi == doi:
                # tough luck :( Avoid infinite recursion and exit.
                return
            return self.detect(new_doi)
        elif parsed_url.path.startswith("/api/access/datafile"):
            # Raw url pointing to a datafile is a typical output from an External Tool integration
            entity_id = os.path.basename(parsed_url.path)
            search_query = "q=entityId:" + entity_id + "&type=file"
            # Knowing the file identifier query search api to get parent dataset
            search_url = urlunparse(
                parsed_url._replace(path="/api/search", query=search_query)
            )
            self.log.debug("Querying Dataverse: " + search_url)
            resp = self.urlopen(search_url).read()
            data = json.loads(resp.decode("utf-8"))["data"]
            if data["count_in_response"] != 1:
                # BUG FIX: json.dump() writes to a file object and would raise
                # a TypeError here; json.dumps() returns the string we log.
                self.log.debug(
                    "Dataverse search query failed!\n - doi: {}\n - url: {}\n - resp: {}\n".format(
                        doi, url, json.dumps(data)
                    )
                )
                return

            self.record_id = deep_get(data, "items.0.dataset_persistent_id")
        elif (
            parsed_url.path.startswith("/dataset.xhtml")
            and "persistentId" in query_args
        ):
            self.record_id = deep_get(query_args, "persistentId.0")

        if hasattr(self, "record_id"):
            return {"record": self.record_id, "host": host}

    def fetch(self, spec, output_dir, yield_output=False):
        """Fetch and unpack a Dataverse dataset.

        Downloads every file in the dataset's latest version into
        ``output_dir``, yielding progress messages. If the download produced a
        single top-level directory its contents are moved up one level.
        """
        record_id = spec["record"]
        host = spec["host"]

        yield "Fetching Dataverse record {}.\n".format(record_id)
        req = Request(
            "{}/api/datasets/:persistentId?persistentId={}".format(
                host["url"], record_id
            ),
            headers={"accept": "application/json"},
        )
        resp = self.urlopen(req)
        record = json.loads(resp.read().decode("utf-8"))["data"]

        for fobj in deep_get(record, "latestVersion.files"):
            file_url = "{}/api/access/datafile/{}".format(
                host["url"], deep_get(fobj, "dataFile.id")
            )
            # Files may carry an optional directoryLabel giving their subdir.
            filename = os.path.join(fobj.get("directoryLabel", ""), fobj["label"])

            file_ref = {"download": file_url, "filename": filename}
            fetch_map = {key: key for key in file_ref.keys()}

            for line in self.fetch_file(file_ref, fetch_map, output_dir):
                yield line

        new_subdirs = os.listdir(output_dir)
        # if there is only one new subdirectory move its contents
        # to the top level directory
        # BUG FIX: os.listdir() returns bare names; the isdir() check must be
        # done on the full path inside output_dir, otherwise it is resolved
        # against the current working directory.
        if len(new_subdirs) == 1 and os.path.isdir(
            os.path.join(output_dir, new_subdirs[0])
        ):
            d = new_subdirs[0]
            copytree(os.path.join(output_dir, d), output_dir)
            shutil.rmtree(os.path.join(output_dir, d))

    @property
    def content_id(self):
        """The Dataverse persistent identifier."""
        return self.record_id
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,144 @@ | ||
import json | ||
import os | ||
import pytest | ||
|
||
from io import BytesIO | ||
from tempfile import TemporaryDirectory | ||
from unittest.mock import patch | ||
from urllib.request import urlopen, Request | ||
|
||
from repo2docker.contentproviders import Dataverse | ||
|
||
|
||
# Shared fixtures: a provider instance, two installations looked up from the
# bundled registry, and canned inputs / expected outputs for the tests below.
test_dv = Dataverse()
harvard_dv = next(inst for inst in test_dv.hosts if inst["name"] == "Harvard Dataverse")
cimmyt_dv = next(inst for inst in test_dv.hosts if inst["name"] == "CIMMYT Research Data")

test_hosts = [
    (
        [
            "doi:10.7910/DVN/6ZXAGT/3YRRYJ",
            "10.7910/DVN/6ZXAGT",
            "https://dataverse.harvard.edu/api/access/datafile/3323458",
            "hdl:11529/10016",
        ],
        [
            {"record": "doi:10.7910/DVN/6ZXAGT", "host": harvard_dv},
            {"record": "hdl:11529/10016", "host": cimmyt_dv},
        ],
    )
]

# Map of DOI/URL -> the URL the (mocked) resolver returns for it.
test_responses = {
    "doi:10.7910/DVN/6ZXAGT/3YRRYJ": (
        "https://dataverse.harvard.edu/file.xhtml"
        "?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ"
    ),
    "doi:10.7910/DVN/6ZXAGT": (
        "https://dataverse.harvard.edu/dataset.xhtml"
        "?persistentId=doi:10.7910/DVN/6ZXAGT"
    ),
    "10.7910/DVN/6ZXAGT": (
        "https://dataverse.harvard.edu/dataset.xhtml"
        "?persistentId=doi:10.7910/DVN/6ZXAGT"
    ),
    "https://dataverse.harvard.edu/api/access/datafile/3323458": "https://dataverse.harvard.edu/api/access/datafile/3323458",
    "hdl:11529/10016": "https://data.cimmyt.org/dataset.xhtml?persistentId=hdl:11529/10016",
}

# Canned /api/search response: exactly one hit pointing at the parent dataset.
test_search = {
    "data": {
        "count_in_response": 1,
        "items": [{"dataset_persistent_id": "doi:10.7910/DVN/6ZXAGT"}],
    }
}
|
||
|
||
@pytest.mark.parametrize("test_input, expected", test_hosts)
def test_detect_dataverse(test_input, expected):
    """Known Dataverse DOIs/URLs are detected; unrelated inputs are not."""

    def fake_resolver(requested):
        return test_responses.get(requested)

    with patch.object(Dataverse, "urlopen") as fake_urlopen, patch.object(
        Dataverse, "doi2url", side_effect=fake_resolver
    ) as fake_doi2url:
        fake_urlopen.return_value.read.return_value = json.dumps(test_search).encode()
        # valid Dataverse DOIs trigger this content provider
        assert Dataverse().detect(test_input[0]) == expected[0]
        assert fake_doi2url.call_count == 2  # File, then dataset
        assert Dataverse().detect(test_input[1]) == expected[0]
        assert Dataverse().detect(test_input[2]) == expected[0]
        # only two of the three calls above have to resolve a DOI
        assert fake_urlopen.call_count == 1
        assert Dataverse().detect(test_input[3]) == expected[1]

    with patch.object(Dataverse, "urlopen") as fake_urlopen:
        # Don't trigger the Dataverse content provider
        assert Dataverse().detect("/some/path/here") is None
        assert Dataverse().detect("https://example.com/path/here") is None
        # don't handle DOIs that aren't from Dataverse
        fake_urlopen.return_value.url = (
            "http://joss.theoj.org/papers/10.21105/joss.01277"
        )
        assert Dataverse().detect("https://doi.org/10.21105/joss.01277") is None
|
||
|
||
@pytest.fixture
def dv_files(tmpdir):
    """Create a three-file tree: top-level, in a directory, and nested deeper."""
    top_level = tmpdir.join("some-file.txt")
    top_level.write("some content")

    in_dir = tmpdir.mkdir("directory").join("some-other-file.txt")
    in_dir.write("some other content")

    nested = tmpdir.join("directory").mkdir("subdirectory").join("the-other-file.txt")
    nested.write("yet another content")

    return [top_level, in_dir, nested]
|
||
|
||
def test_dataverse_fetch(dv_files):
    """fetch() downloads every listed file, preserving directory labels."""
    dataset_files = [
        {"dataFile": {"id": 1}, "label": "some-file.txt"},
        {
            "dataFile": {"id": 2},
            "label": "some-other-file.txt",
            "directoryLabel": "directory",
        },
        {
            "dataFile": {"id": 3},
            "label": "the-other-file.txt",
            "directoryLabel": "directory/subdirectory",
        },
    ]
    mock_response_ds_query = BytesIO(
        json.dumps({"data": {"latestVersion": {"files": dataset_files}}}).encode(
            "utf-8"
        )
    )
    spec = {"host": harvard_dv, "record": "doi:10.7910/DVN/6ZXAGT"}

    dv = Dataverse()

    def mock_urlopen(self, req):
        # The dataset metadata query arrives as a Request object; file
        # downloads arrive as plain URL strings ending in the file id.
        if isinstance(req, Request):
            return mock_response_ds_query
        file_no = int(req.split("/")[-1]) - 1
        return urlopen("file://{}".format(dv_files[file_no]))

    with patch.object(Dataverse, "urlopen", new=mock_urlopen):
        with TemporaryDirectory() as d:
            output = list(dv.fetch(spec, d))

            unpacked_files = set(os.listdir(d))
            assert unpacked_files == {"directory", "some-file.txt"}
            assert os.path.isfile(
                os.path.join(d, "directory", "subdirectory", "the-other-file.txt")
            )