[MRG] Dataverse content provider (#739)
betatim committed Sep 18, 2019
2 parents 649a2c6 + 4df4fd6 commit c98e6ac
Showing 7 changed files with 308 additions and 1 deletion.
1 change: 1 addition & 0 deletions MANIFEST.in
@@ -4,3 +4,4 @@ include setup.cfg
recursive-include repo2docker/buildpacks *
include versioneer.py
include repo2docker/_version.py
include repo2docker/contentproviders/dataverse.json
1 change: 1 addition & 0 deletions repo2docker/app.py
@@ -146,6 +146,7 @@ def _default_log_level(self):
contentproviders.Local,
contentproviders.Zenodo,
contentproviders.Figshare,
contentproviders.Dataverse,
contentproviders.Git,
],
config=True,
1 change: 1 addition & 0 deletions repo2docker/contentproviders/__init__.py
@@ -2,3 +2,4 @@
from .base import Local
from .zenodo import Zenodo
from .figshare import Figshare
from .dataverse import Dataverse
1 change: 1 addition & 0 deletions repo2docker/contentproviders/dataverse.json

Large diffs are not rendered by default.

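The bundled dataverse.json file is a generated map of known Dataverse installations; the setup.py command added below downloads it from the Harvard installations endpoint. Its full contents are not rendered here, but judging from how the file is consumed (detect() reads the "installations" key and matches on each entry's "url", while the tests look entries up by "name"), a hypothetical excerpt would look roughly like this:

{
    "installations": [
        {"name": "Harvard Dataverse", "url": "https://dataverse.harvard.edu"},
        {"name": "CIMMYT Research Data", "url": "https://data.cimmyt.org"}
    ]
}
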
129 changes: 129 additions & 0 deletions repo2docker/contentproviders/dataverse.py
@@ -0,0 +1,129 @@
import os
import json
import shutil

from urllib.request import Request
from urllib.parse import urlparse, urlunparse, parse_qs

from .doi import DoiProvider
from ..utils import copytree, deep_get


class Dataverse(DoiProvider):
"""Provide contents of a Dataverse dataset."""

def __init__(self):
data_file = os.path.join(os.path.dirname(__file__), "dataverse.json")
with open(data_file, "r") as fp:
self.hosts = json.load(fp)["installations"]
super().__init__()

def detect(self, doi, ref=None, extra_args=None):
"""Trigger this provider for things that resolve to a Dataverse dataset.
Handles:
- DOI pointing to {siteURL}/dataset.xhtml?persistentId={persistentId}
- DOI pointing to {siteURL}/file.xhtml?persistentId={persistentId}&...
- URL {siteURL}/api/access/datafile/{fileId}
Examples:
- https://dataverse.harvard.edu/api/access/datafile/3323458
- doi:10.7910/DVN/6ZXAGT
- doi:10.7910/DVN/6ZXAGT/3YRRYJ
"""
url = self.doi2url(doi)
# Parse the URL to get the base for later API calls
parsed_url = urlparse(url)

# Check whether the URL matches any known Dataverse installation; bail if not.
host = next(
(
host
for host in self.hosts
if urlparse(host["url"]).netloc == parsed_url.netloc
),
None,
)
if host is None:
return

query_args = parse_qs(parsed_url.query)

# Corner case handling
if parsed_url.path.startswith("/file.xhtml"):
# There's no way of getting file information using its persistentId; the only thing we can do
# is assume the DOI is structured as "doi:<dataset_doi>/<file_doi>" and handle the parent dataset instead.
new_doi = doi.rsplit("/", 1)[0]
if new_doi == doi:
# tough luck :( Avoid infinite recursion and exit.
return
return self.detect(new_doi)
elif parsed_url.path.startswith("/api/access/datafile"):
# A raw URL pointing to a datafile is the typical output of an External Tool integration
entity_id = os.path.basename(parsed_url.path)
search_query = "q=entityId:" + entity_id + "&type=file"
# Knowing the file identifier, query the search API to find the parent dataset
search_url = urlunparse(
parsed_url._replace(path="/api/search", query=search_query)
)
self.log.debug("Querying Dataverse: " + search_url)
resp = self.urlopen(search_url).read()
data = json.loads(resp.decode("utf-8"))["data"]
if data["count_in_response"] != 1:
self.log.debug(
"Dataverse search query failed!\n - doi: {}\n - url: {}\n - resp: {}\n".format(
doi, url, json.dumps(data)
)
)
return

self.record_id = deep_get(data, "items.0.dataset_persistent_id")
elif (
parsed_url.path.startswith("/dataset.xhtml")
and "persistentId" in query_args
):
self.record_id = deep_get(query_args, "persistentId.0")

if hasattr(self, "record_id"):
return {"record": self.record_id, "host": host}

def fetch(self, spec, output_dir, yield_output=False):
"""Fetch and unpack a Dataverse dataset."""
record_id = spec["record"]
host = spec["host"]

yield "Fetching Dataverse record {}.\n".format(record_id)
req = Request(
"{}/api/datasets/:persistentId?persistentId={}".format(
host["url"], record_id
),
headers={"accept": "application/json"},
)
resp = self.urlopen(req)
record = json.loads(resp.read().decode("utf-8"))["data"]

for fobj in deep_get(record, "latestVersion.files"):
file_url = "{}/api/access/datafile/{}".format(
host["url"], deep_get(fobj, "dataFile.id")
)
filename = os.path.join(fobj.get("directoryLabel", ""), fobj["label"])

file_ref = {"download": file_url, "filename": filename}
fetch_map = {key: key for key in file_ref.keys()}

for line in self.fetch_file(file_ref, fetch_map, output_dir):
yield line

new_subdirs = os.listdir(output_dir)
# if there is only one new subdirectory, move its contents
# to the top level directory
if len(new_subdirs) == 1 and os.path.isdir(os.path.join(output_dir, new_subdirs[0])):
d = new_subdirs[0]
copytree(os.path.join(output_dir, d), output_dir)
shutil.rmtree(os.path.join(output_dir, d))

@property
def content_id(self):
"""The Dataverse persistent identifier."""
return self.record_id
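Taken together, detect() turns a supported DOI or URL into a spec of the form {"record": <persistent id>, "host": <installation entry>}, and fetch() streams every file in the dataset's latest version into the output directory. A minimal standalone sketch of how the two methods fit together (for illustration only; it needs network access to resolve the DOI, and within repo2docker the configured content providers are simply tried in order until one's detect() returns a spec):

from tempfile import TemporaryDirectory
from repo2docker.contentproviders import Dataverse

provider = Dataverse()
# Returns a spec dict for recognised Dataverse DOIs/URLs, or None otherwise
spec = provider.detect("doi:10.7910/DVN/6ZXAGT")
if spec is not None:
    with TemporaryDirectory() as checkout:
        # fetch() is a generator that yields progress messages as it downloads
        for message in provider.fetch(spec, checkout):
            print(message, end="")
        print("Fetched dataset with content id", provider.content_id)
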
32 changes: 31 additions & 1 deletion setup.py
@@ -1,3 +1,4 @@
from distutils.cmd import Command
from setuptools import setup, find_packages
import sys
import versioneer
@@ -8,6 +9,35 @@
with open("README.md", encoding="utf8") as f:
readme = f.read()


class GenerateDataverseInstallationsFileCommand(Command):
description = "Generate Dataverse installations data map"
user_options = []

def initialize_options(self):
self.url = (
"https://services.dataverse.harvard.edu/miniverse/map/installations-json"
)

def finalize_options(self):
pass

def run(self):
from urllib.request import urlopen
import json

resp = urlopen(self.url, timeout=5)
resp_body = resp.read()
data = json.loads(resp_body.decode("utf-8"))
if "installations" not in data:
raise ValueError("Malformed installation map.")
with open("repo2docker/contentproviders/dataverse.json", "wb") as fp:
fp.write(resp_body)


__cmdclass = versioneer.get_cmdclass()
__cmdclass["generate_dataverse_file"] = GenerateDataverseInstallationsFileCommand

setup(
name="jupyter-repo2docker",
version=versioneer.get_version(),
@@ -48,7 +78,7 @@
],
packages=find_packages(),
include_package_data=True,
cmdclass=versioneer.get_cmdclass(),
cmdclass=__cmdclass,
entry_points={
"console_scripts": [
"jupyter-repo2docker = repo2docker.__main__:main",
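With the custom command registered in __cmdclass, the bundled installations map can be refreshed by running "python setup.py generate_dataverse_file", which, as the run() method above shows, downloads the installations list from the Harvard miniverse endpoint, checks that it contains an "installations" key, and writes the raw response to repo2docker/contentproviders/dataverse.json. Presumably this is intended as a maintainer step before a release rather than something end users need to run.
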
144 changes: 144 additions & 0 deletions tests/unit/contentproviders/test_dataverse.py
@@ -0,0 +1,144 @@
import json
import os
import pytest

from io import BytesIO
from tempfile import TemporaryDirectory
from unittest.mock import patch
from urllib.request import urlopen, Request

from repo2docker.contentproviders import Dataverse


test_dv = Dataverse()
harvard_dv = next((_ for _ in test_dv.hosts if _["name"] == "Harvard Dataverse"))
cimmyt_dv = next((_ for _ in test_dv.hosts if _["name"] == "CIMMYT Research Data"))
test_hosts = [
(
[
"doi:10.7910/DVN/6ZXAGT/3YRRYJ",
"10.7910/DVN/6ZXAGT",
"https://dataverse.harvard.edu/api/access/datafile/3323458",
"hdl:11529/10016",
],
[
{"host": harvard_dv, "record": "doi:10.7910/DVN/6ZXAGT"},
{"host": cimmyt_dv, "record": "hdl:11529/10016"},
],
)
]
test_responses = {
"doi:10.7910/DVN/6ZXAGT/3YRRYJ": (
"https://dataverse.harvard.edu/file.xhtml"
"?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ"
),
"doi:10.7910/DVN/6ZXAGT": (
"https://dataverse.harvard.edu/dataset.xhtml"
"?persistentId=doi:10.7910/DVN/6ZXAGT"
),
"10.7910/DVN/6ZXAGT": (
"https://dataverse.harvard.edu/dataset.xhtml"
"?persistentId=doi:10.7910/DVN/6ZXAGT"
),
"https://dataverse.harvard.edu/api/access/datafile/3323458": "https://dataverse.harvard.edu/api/access/datafile/3323458",
"hdl:11529/10016": "https://data.cimmyt.org/dataset.xhtml?persistentId=hdl:11529/10016",
}
test_search = {
"data": {
"count_in_response": 1,
"items": [{"dataset_persistent_id": "doi:10.7910/DVN/6ZXAGT"}],
}
}


@pytest.mark.parametrize("test_input, expected", test_hosts)
def test_detect_dataverse(test_input, expected):
def doi_resolver(url):
return test_responses.get(url)

with patch.object(Dataverse, "urlopen") as fake_urlopen, patch.object(
Dataverse, "doi2url", side_effect=doi_resolver
) as fake_doi2url:
fake_urlopen.return_value.read.return_value = json.dumps(test_search).encode()
# valid Dataverse DOIs trigger this content provider
assert Dataverse().detect(test_input[0]) == expected[0]
assert fake_doi2url.call_count == 2 # File, then dataset
assert Dataverse().detect(test_input[1]) == expected[0]
assert Dataverse().detect(test_input[2]) == expected[0]
# only two of the three calls above have to resolve a DOI
assert fake_urlopen.call_count == 1
assert Dataverse().detect(test_input[3]) == expected[1]

with patch.object(Dataverse, "urlopen") as fake_urlopen:
# Don't trigger the Dataverse content provider
assert Dataverse().detect("/some/path/here") is None
assert Dataverse().detect("https://example.com/path/here") is None
# don't handle DOIs that aren't from Dataverse
fake_urlopen.return_value.url = (
"http://joss.theoj.org/papers/10.21105/joss.01277"
)
assert Dataverse().detect("https://doi.org/10.21105/joss.01277") is None


@pytest.fixture
def dv_files(tmpdir):

f1 = tmpdir.join("some-file.txt")
f1.write("some content")

f2 = tmpdir.mkdir("directory").join("some-other-file.txt")
f2.write("some other content")

f3 = tmpdir.join("directory").mkdir("subdirectory").join("the-other-file.txt")
f3.write("yet another content")

return [f1, f2, f3]


def test_dataverse_fetch(dv_files):
mock_response_ds_query = BytesIO(
json.dumps(
{
"data": {
"latestVersion": {
"files": [
{"dataFile": {"id": 1}, "label": "some-file.txt"},
{
"dataFile": {"id": 2},
"label": "some-other-file.txt",
"directoryLabel": "directory",
},
{
"dataFile": {"id": 3},
"label": "the-other-file.txt",
"directoryLabel": "directory/subdirectory",
},
]
}
}
}
).encode("utf-8")
)
spec = {"host": harvard_dv, "record": "doi:10.7910/DVN/6ZXAGT"}

dv = Dataverse()

def mock_urlopen(self, req):
if isinstance(req, Request):
return mock_response_ds_query
else:
file_no = int(req.split("/")[-1]) - 1
return urlopen("file://{}".format(dv_files[file_no]))

with patch.object(Dataverse, "urlopen", new=mock_urlopen):
with TemporaryDirectory() as d:
output = []
for l in dv.fetch(spec, d):
output.append(l)

unpacked_files = set(os.listdir(d))
expected = set(["directory", "some-file.txt"])
assert expected == unpacked_files
assert os.path.isfile(
os.path.join(d, "directory", "subdirectory", "the-other-file.txt")
)
