[MRG] Dataverse content provider #739

Merged 7 commits on Sep 18, 2019
1 change: 1 addition & 0 deletions MANIFEST.in
@@ -4,3 +4,4 @@ include setup.cfg
recursive-include repo2docker/buildpacks *
include versioneer.py
include repo2docker/_version.py
include repo2docker/contentproviders/dataverse.json
1 change: 1 addition & 0 deletions repo2docker/app.py
@@ -146,6 +146,7 @@ def _default_log_level(self):
            contentproviders.Local,
            contentproviders.Zenodo,
            contentproviders.Figshare,
            contentproviders.Dataverse,
            contentproviders.Git,
        ],
        config=True,
1 change: 1 addition & 0 deletions repo2docker/contentproviders/__init__.py
@@ -2,3 +2,4 @@
from .base import Local
from .zenodo import Zenodo
from .figshare import Figshare
from .dataverse import Dataverse
1 change: 1 addition & 0 deletions repo2docker/contentproviders/dataverse.json

Large diffs are not rendered by default.
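
The JSON file itself is not rendered above, but the shape the surrounding code relies on is roughly the following. This is a sketch reconstructed from how dataverse.py, setup.py, and the tests consume the file; real entries carry more keys and many more installations:

    {
        "installations": [
            {"name": "Harvard Dataverse", "url": "https://dataverse.harvard.edu"},
            {"name": "CIMMYT Research Data", "url": "https://data.cimmyt.org"}
        ]
    }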

129 changes: 129 additions & 0 deletions repo2docker/contentproviders/dataverse.py
@@ -0,0 +1,129 @@
import os
import json
import shutil

from urllib.request import Request
from urllib.parse import urlparse, urlunparse, parse_qs

from .doi import DoiProvider
from ..utils import copytree, deep_get


class Dataverse(DoiProvider):
    """Provide contents of a Dataverse dataset."""

    def __init__(self):
        data_file = os.path.join(os.path.dirname(__file__), "dataverse.json")
        with open(data_file, "r") as fp:
            self.hosts = json.load(fp)["installations"]
        super().__init__()

    def detect(self, doi, ref=None, extra_args=None):
        """Trigger this provider for things that resolve to a Dataverse dataset.

        Handles:
        - DOI pointing to {siteURL}/dataset.xhtml?persistentId={persistentId}
        - DOI pointing to {siteURL}/file.xhtml?persistentId={persistentId}&...
        - URL {siteURL}/api/access/datafile/{fileId}

        Examples:
        - https://dataverse.harvard.edu/api/access/datafile/3323458
        - doi:10.7910/DVN/6ZXAGT
        - doi:10.7910/DVN/6ZXAGT/3YRRYJ

        """
        url = self.doi2url(doi)
        # Parse the url to get the base for later API calls
        parsed_url = urlparse(url)

        # Check if the url matches any known Dataverse installation, bail if not.
        host = next(
            (
                host
                for host in self.hosts
                if urlparse(host["url"]).netloc == parsed_url.netloc
            ),
            None,
        )
        if host is None:
            return
Review comment (Member):

To check I parsed the above lines properly (the use of next() to handle the "didn't work" case made me read it a few times), is the following code equivalent (I think yes):

for host_ in self.hosts:
    if urlparse(host_["url"]).netloc == parsed_url.netloc:
        host = host_
        break  # without the break, the else clause below would always run
else:
    return
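
For reference, a runnable sketch of the next()-with-default pattern under discussion, using a hypothetical two-host list:

    from urllib.parse import urlparse

    hosts = [
        {"name": "Harvard Dataverse", "url": "https://dataverse.harvard.edu"},
        {"name": "CIMMYT Research Data", "url": "https://data.cimmyt.org"},
    ]
    netloc = "data.cimmyt.org"
    # next() returns the first item the generator yields, or the default (None)
    # when nothing matches; this is the same outcome as the loop above.
    host = next((h for h in hosts if urlparse(h["url"]).netloc == netloc), None)
    assert host is not None and host["name"] == "CIMMYT Research Data"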


        query_args = parse_qs(parsed_url.query)

        # Corner case handling
        if parsed_url.path.startswith("/file.xhtml"):
            # There's no way of getting file information using its persistentId;
            # the only thing we can do is assume the doi is structured as
            # "doi:<dataset_doi>/<file_doi>" and try to handle the dataset that way.
            new_doi = doi.rsplit("/", 1)[0]
            if new_doi == doi:
                # Tough luck :( avoid infinite recursion and exit.
                return
            return self.detect(new_doi)
        elif parsed_url.path.startswith("/api/access/datafile"):
            # A raw URL pointing to a datafile is typical output from an
            # External Tool integration
            entity_id = os.path.basename(parsed_url.path)
            search_query = "q=entityId:" + entity_id + "&type=file"
            # Knowing the file identifier, query the search API to get the parent dataset
            search_url = urlunparse(
                parsed_url._replace(path="/api/search", query=search_query)
            )
            self.log.debug("Querying Dataverse: " + search_url)
            resp = self.urlopen(search_url).read()
Review comment (Contributor):

Suggestion: log.debug the URL before calling it; this could be important error-finding information for users.

            data = json.loads(resp.decode("utf-8"))["data"]
            if data["count_in_response"] != 1:
                self.log.debug(
                    "Dataverse search query failed!\n - doi: {}\n - url: {}\n - resp: {}\n".format(
                        doi, url, json.dumps(data)
                    )
                )
                return

            self.record_id = deep_get(data, "items.0.dataset_persistent_id")
Review comment (Contributor):

Could there be a case where more than 1 item will be returned? Should you warn about that here?
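
One possible refinement along the lines of this question (hypothetical, not part of this diff) would be to warn loudly instead of only logging at debug level:

    # Hypothetical variant of the count check above:
    if data["count_in_response"] != 1:
        self.log.warning(
            "Dataverse search for entityId %s returned %s results, expected exactly 1",
            entity_id,
            data["count_in_response"],
        )
        return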

        elif (
            parsed_url.path.startswith("/dataset.xhtml")
            and "persistentId" in query_args
        ):
            self.record_id = deep_get(query_args, "persistentId.0")

        if hasattr(self, "record_id"):
            return {"record": self.record_id, "host": host}

    def fetch(self, spec, output_dir, yield_output=False):
        """Fetch and unpack a Dataverse dataset."""
        record_id = spec["record"]
        host = spec["host"]

        yield "Fetching Dataverse record {}.\n".format(record_id)
        req = Request(
            "{}/api/datasets/:persistentId?persistentId={}".format(
                host["url"], record_id
            ),
            headers={"accept": "application/json"},
        )
        resp = self.urlopen(req)
        record = json.loads(resp.read().decode("utf-8"))["data"]

        for fobj in deep_get(record, "latestVersion.files"):
            file_url = "{}/api/access/datafile/{}".format(
                host["url"], deep_get(fobj, "dataFile.id")
            )
            filename = os.path.join(fobj.get("directoryLabel", ""), fobj["label"])

            file_ref = {"download": file_url, "filename": filename}
            fetch_map = {key: key for key in file_ref.keys()}

            for line in self.fetch_file(file_ref, fetch_map, output_dir):
                yield line

        new_subdirs = os.listdir(output_dir)
        # if there is only one new subdirectory move its contents
        # to the top level directory
        if len(new_subdirs) == 1 and os.path.isdir(
            os.path.join(output_dir, new_subdirs[0])
        ):
            d = new_subdirs[0]
            copytree(os.path.join(output_dir, d), output_dir)
            shutil.rmtree(os.path.join(output_dir, d))

    @property
    def content_id(self):
        """The Dataverse persistent identifier."""
        return self.record_id
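
For orientation, a minimal end-to-end sketch of how the provider would be driven by hand; repo2docker normally does this through its content provider loop, the DOI is the Harvard example from the docstring, and the fetch requires network access:

    from repo2docker.contentproviders import Dataverse

    dv = Dataverse()
    spec = dv.detect("doi:10.7910/DVN/6ZXAGT")  # {"record": ..., "host": ...} or None
    if spec is not None:
        for message in dv.fetch(spec, "./output"):  # yields progress messages
            print(message, end="")
        print(dv.content_id)  # the dataset's persistent identifier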
32 changes: 31 additions & 1 deletion setup.py
@@ -1,3 +1,4 @@
from distutils.cmd import Command
from setuptools import setup, find_packages
import sys
import versioneer
@@ -8,6 +9,35 @@
with open("README.md", encoding="utf8") as f:
readme = f.read()


class GenerateDataverseInstallationsFileCommand(Command):
description = "Generate Dataverse installations data map"
user_options = []

def initialize_options(self):
self.url = (
"https://services.dataverse.harvard.edu/miniverse/map/installations-json"
)

def finalize_options(self):
pass

def run(self):
from urllib.request import urlopen
import json

resp = urlopen(self.url, timeout=5)
resp_body = resp.read()
data = json.loads(resp_body.decode("utf-8"))
if "installations" not in data:
raise ValueError("Malformed installation map.")
with open("repo2docker/contentproviders/dataverse.json", "wb") as fp:
fp.write(resp_body)


__cmdclass = versioneer.get_cmdclass()
__cmdclass["generate_dataverse_file"] = GenerateDataverseInstallationsFileCommand

setup(
    name="jupyter-repo2docker",
    version=versioneer.get_version(),
@@ -48,7 +78,7 @@
    ],
    packages=find_packages(),
    include_package_data=True,
-    cmdclass=versioneer.get_cmdclass(),
+    cmdclass=__cmdclass,
    entry_points={
        "console_scripts": [
            "jupyter-repo2docker = repo2docker.__main__:main",
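
With the command registered as generate_dataverse_file, refreshing the bundled installations map should amount to running, from the repository root and with network access to the Harvard service:

    python setup.py generate_dataverse_file

which validates the response and overwrites repo2docker/contentproviders/dataverse.json in place.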
144 changes: 144 additions & 0 deletions tests/unit/contentproviders/test_dataverse.py
@@ -0,0 +1,144 @@
import json
import os
import pytest

from io import BytesIO
from tempfile import TemporaryDirectory
from unittest.mock import patch
from urllib.request import urlopen, Request

from repo2docker.contentproviders import Dataverse


test_dv = Dataverse()
harvard_dv = next((_ for _ in test_dv.hosts if _["name"] == "Harvard Dataverse"))
cimmyt_dv = next((_ for _ in test_dv.hosts if _["name"] == "CIMMYT Research Data"))
test_hosts = [
    (
        [
            "doi:10.7910/DVN/6ZXAGT/3YRRYJ",
            "10.7910/DVN/6ZXAGT",
            "https://dataverse.harvard.edu/api/access/datafile/3323458",
            "hdl:11529/10016",
        ],
        [
            {"host": harvard_dv, "record": "doi:10.7910/DVN/6ZXAGT"},
            {"host": cimmyt_dv, "record": "hdl:11529/10016"},
        ],
    )
]
test_responses = {
    "doi:10.7910/DVN/6ZXAGT/3YRRYJ": (
        "https://dataverse.harvard.edu/file.xhtml"
        "?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ"
    ),
    "doi:10.7910/DVN/6ZXAGT": (
        "https://dataverse.harvard.edu/dataset.xhtml"
        "?persistentId=doi:10.7910/DVN/6ZXAGT"
    ),
    "10.7910/DVN/6ZXAGT": (
        "https://dataverse.harvard.edu/dataset.xhtml"
        "?persistentId=doi:10.7910/DVN/6ZXAGT"
    ),
    "https://dataverse.harvard.edu/api/access/datafile/3323458": "https://dataverse.harvard.edu/api/access/datafile/3323458",
    "hdl:11529/10016": "https://data.cimmyt.org/dataset.xhtml?persistentId=hdl:11529/10016",
}
test_search = {
    "data": {
        "count_in_response": 1,
        "items": [{"dataset_persistent_id": "doi:10.7910/DVN/6ZXAGT"}],
    }
}


@pytest.mark.parametrize("test_input, expected", test_hosts)
def test_detect_dataverse(test_input, expected):
    def doi_resolver(url):
        return test_responses.get(url)

    with patch.object(Dataverse, "urlopen") as fake_urlopen, patch.object(
        Dataverse, "doi2url", side_effect=doi_resolver
    ) as fake_doi2url:
        fake_urlopen.return_value.read.return_value = json.dumps(test_search).encode()
        # valid Dataverse DOIs trigger this content provider
        assert Dataverse().detect(test_input[0]) == expected[0]
        assert fake_doi2url.call_count == 2  # File, then dataset
        assert Dataverse().detect(test_input[1]) == expected[0]
        assert Dataverse().detect(test_input[2]) == expected[0]
        # only two of the three calls above have to resolve a DOI
        assert fake_urlopen.call_count == 1
        assert Dataverse().detect(test_input[3]) == expected[1]

    with patch.object(Dataverse, "urlopen") as fake_urlopen:
        # Don't trigger the Dataverse content provider
        assert Dataverse().detect("/some/path/here") is None
        assert Dataverse().detect("https://example.com/path/here") is None
        # don't handle DOIs that aren't from Dataverse
        fake_urlopen.return_value.url = (
            "http://joss.theoj.org/papers/10.21105/joss.01277"
        )
        assert Dataverse().detect("https://doi.org/10.21105/joss.01277") is None


@pytest.fixture
def dv_files(tmpdir):
    f1 = tmpdir.join("some-file.txt")
    f1.write("some content")

    f2 = tmpdir.mkdir("directory").join("some-other-file.txt")
    f2.write("some other content")

    f3 = tmpdir.join("directory").mkdir("subdirectory").join("the-other-file.txt")
    f3.write("yet another content")

    return [f1, f2, f3]


def test_dataverse_fetch(dv_files):
    mock_response_ds_query = BytesIO(
        json.dumps(
            {
                "data": {
                    "latestVersion": {
                        "files": [
                            {"dataFile": {"id": 1}, "label": "some-file.txt"},
                            {
                                "dataFile": {"id": 2},
                                "label": "some-other-file.txt",
                                "directoryLabel": "directory",
                            },
                            {
                                "dataFile": {"id": 3},
                                "label": "the-other-file.txt",
                                "directoryLabel": "directory/subdirectory",
                            },
                        ]
                    }
                }
            }
        ).encode("utf-8")
    )
    spec = {"host": harvard_dv, "record": "doi:10.7910/DVN/6ZXAGT"}

    dv = Dataverse()

    def mock_urlopen(self, req):
        if isinstance(req, Request):
            return mock_response_ds_query
        else:
            file_no = int(req.split("/")[-1]) - 1
            return urlopen("file://{}".format(dv_files[file_no]))

    with patch.object(Dataverse, "urlopen", new=mock_urlopen):
        with TemporaryDirectory() as d:
            output = []
            for line in dv.fetch(spec, d):
                output.append(line)

            unpacked_files = set(os.listdir(d))
            expected = set(["directory", "some-file.txt"])
            assert expected == unpacked_files
            assert os.path.isfile(
                os.path.join(d, "directory", "subdirectory", "the-other-file.txt")
            )