From 331a61032467507a5bd29b6c925cc78b7a73e88c Mon Sep 17 00:00:00 2001
From: "Kacper Kowalik (Xarthisius)"
Date: Wed, 10 Jul 2019 17:44:03 -0500
Subject: [PATCH 1/7] Add Dataverse content provider

---
 MANIFEST.in                                 |   1 +
 repo2docker/app.py                          |   1 +
 repo2docker/contentproviders/__init__.py    |   1 +
 repo2docker/contentproviders/dataverse.json |   1 +
 repo2docker/contentproviders/dataverse.py   | 129 ++++++++++++++++++++
 5 files changed, 133 insertions(+)
 create mode 100644 repo2docker/contentproviders/dataverse.json
 create mode 100644 repo2docker/contentproviders/dataverse.py

diff --git a/MANIFEST.in b/MANIFEST.in
index 7e1dcf861..997a413f5 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -4,3 +4,4 @@ include setup.cfg
 recursive-include repo2docker/buildpacks *
 include versioneer.py
 include repo2docker/_version.py
+include repo2docker/contentproviders/dataverse.json

diff --git a/repo2docker/app.py b/repo2docker/app.py
index ba0a03135..d8ddb046f 100644
--- a/repo2docker/app.py
+++ b/repo2docker/app.py
@@ -146,6 +146,7 @@ def _default_log_level(self):
             contentproviders.Local,
             contentproviders.Zenodo,
             contentproviders.Figshare,
+            contentproviders.Dataverse,
             contentproviders.Git,
         ],
         config=True,

diff --git a/repo2docker/contentproviders/__init__.py b/repo2docker/contentproviders/__init__.py
index f7f9369bc..c0cf82b8a 100644
--- a/repo2docker/contentproviders/__init__.py
+++ b/repo2docker/contentproviders/__init__.py
@@ -2,3 +2,4 @@
 from .base import Local
 from .zenodo import Zenodo
 from .figshare import Figshare
+from .dataverse import Dataverse

diff --git a/repo2docker/contentproviders/dataverse.json b/repo2docker/contentproviders/dataverse.json
new file mode 100644
index 000000000..7007be6a5
--- /dev/null
+++ b/repo2docker/contentproviders/dataverse.json
@@ -0,0 +1 @@
+{"installations": [{"id": 1740, "name": "Abacus", "full_name": "Abacus (British Columbia Research Libraries' Data Services) Dataverse", "is_active": true, "description": "Open for researchers associated with British Columbia universities to deposit data.", "lat": 49.259982, "lng": -123.250212, "logo": "https://dvn-h-prod.hz.lib.harvard.edu/media/logos/abacus-46x46.jpg", "url": "https://dvn.library.ubc.ca/dvn/", "slug": "abacus", "version": "3.6"}, {"id": 1771, "name": "ADA Dataverse", "full_name": "Australian Data Archive", "is_active": true, "description": "The Australian Data Archive provides a national service for collecting, preserving, publishing and accessing digital research data.", "lat": -35.343784, "lng": 149.082977, "logo": "https://dvn-h-prod.hz.lib.harvard.edu/media/logos/ada-46x46.jpg", "url": "https://dataverse.ada.edu.au/", "slug": "ada-dataverse", "version": "4.6.1"}, {"id": 1773, "name": "AUSSDA Dataverse", "full_name": "Austrian Social Science Data Archive", "is_active": true, "description": "AUSSDA - The Austrian Social Science Data Archive makes social science data accessible, creating opportunities for research and data reuse, benefitting science and society.
AUSSDA serves as the Austrian representative in the Consortium of European Social Science Data Archives (CESSDA ERIC).", "lat": 48.210033, "lng": 16.363449, "logo": "https://dvn-h-prod.hz.lib.harvard.edu/media/logos/aussda-46x46.png", "url": "https://data.aussda.at/", "slug": "aussda-dataverse", "version": "4.6.2"}, {"id": 1778, "name": "Botswana Harvard Data", "full_name": "Botswana Harvard AIDS Institute Partnership", "is_active": true, "description": "The Botswana Harvard AIDS Institute Partneship is a world-renowned educational institution of excellence in research and education pertinent to HIV/AIDS and other emerging public health challenges. Established in 1996, the Botswana Harvard AIDS Institute Partnership (BHP) is a collaborative research and training initiative between Botswana\u2019s Ministry of Health and Wellness and the Harvard T.H. Chan School of Public Health AIDS Initiative. The BHP Dataverse is a data repository for all the research done at BHP. Raw data, anonymised data and final analysis data for every research. This repository will achieve and also easy data sharing within the organisation and outside.", "lat": -24.653257, "lng": 25.906792, "logo": "https://dvn-h-prod.hz.lib.harvard.edu/media/logos/botswana-46x46.png", "url": "https://dataverse.bhp.org.bw/", "slug": "botswana-harvard-data", "version": "4.9.4"}, {"id": 1759, "name": "Catalogues (CDSP)", "full_name": "Catalogues (CDSP)", "is_active": true, "description": "Open for researchers and organizations associated with\r\nFrench universities to deposit data. Hosted by the Center for\r\nSocio-Political Data (Sciences Po and CNRS).", "lat": 48.854027, "lng": 2.328351, "logo": "https://dvn-h-prod.hz.lib.harvard.edu/media/logos/catalogues-46x46.png", "url": "https://catalogues.cdsp.sciences-po.fr/", "slug": "catalogues-cdsp", "version": "4.6.1"}, {"id": 1763, "name": "CIFOR", "full_name": "Center for International Forestry Research (CIFOR) Dataverse", "is_active": true, "description": "Center for International Forestry Research (CIFOR) Dataverse", "lat": -6.594293, "lng": 106.806000, "logo": "https://dvn-h-prod.hz.lib.harvard.edu/media/logos/CIFOR-46x46.png", "url": "https://data.cifor.org/dataverse/s", "slug": "cifor", "version": "4.6"}, {"id": 1741, "name": "CIMMYT Research Data", "full_name": "International Maize and Wheat Improvement Center", "is_active": true, "description": "Free, open access repository of research data and software produced and developed by CIMMYT scientists.", "lat": 19.531535, "lng": -98.846064, "logo": "https://dvn-h-prod.hz.lib.harvard.edu/media/logos/cimmyt-46x46.jpg", "url": "http://data.cimmyt.org/", "slug": "cimmyt-research-data", "version": "3.0"}, {"id": 1764, "name": "CIRAD", "full_name": "CIRAD Dataverse", "is_active": true, "description": "Organisme fran\u00e7ais de recherche agronomique et de coop\u00e9ration internationale pour le d\u00e9veloppement durable des r\u00e9gions tropicales et m\u00e9diterran\u00e9ennes, les activit\u00e9s du CIRAD rel\u00e8vent des sciences du vivant, des sciences sociales et des sciences de l\u2019ing\u00e9nieur appliqu\u00e9es \u00e0 l\u2019agriculture, \u00e0 l\u2019alimentation, \u00e0 l\u2019environnement et \u00e0 la gestion des territoires.\r\n\r\nFrench agricultural research and international cooperation organization working for the sustainable development of tropical and Mediterranean regions, CIRAD's activities concern the life sciences, social sciences and engineering sciences, applied to agriculture, the environment and territorial 
management.", "lat": 43.650089, "lng": 3.869122, "logo": "https://dvn-h-prod.hz.lib.harvard.edu/media/logos/cirad-46x46.jpg", "url": "https://dataverse.cirad.fr/", "slug": "cirad", "version": "4.5"}, {"id": 1774, "name": "Dalhousie University Dataverse", "full_name": "Dalhousie University", "is_active": true, "description": "Share, publish and get credit for your data. Find and cite research data from across all research fields.", "lat": 44.637484, "lng": -63.591220, "logo": "https://dvn-h-prod.hz.lib.harvard.edu/media/logos/dalhousie-46x46.png", "url": "https://dataverse.library.dal.ca", "slug": "dalhousie-university-dataverse", "version": "4.7"}, {"id": 1777, "name": "Data Inra", "full_name": "National Institute of Agricultural Research (INRA)", "is_active": true, "description": "INRA is Europe\u2019s top agricultural research institute and the world\u2019s number two centre for the agricultural sciences. Data Inra is offered by INRA as part of its mission to open the results of its research.\r\n\r\nData Inra will share research data in relation with food, nutrition, agriculture and environment. It includes experimental, simulation and observation data, omic data, survey and text data.\r\n\r\nOnly data produced by or in collaboration with INRA will be hosted in the repository, but anyone can access the metadata and the open data.", "lat": 48.801407, "lng": 2.130122, "logo": "https://dvn-h-prod.hz.lib.harvard.edu/media/logos/inra-46x46-2.jpg", "url": "https://data.inra.fr/", "slug": "data-inra", "version": "4.5.1"}, {"id": 1743, "name": "DataSpace@HKUST", "full_name": "DataSpace@HKUST", "is_active": true, "description": "", "lat": 22.336281, "lng": 114.266721, "logo": "https://dvn-h-prod.hz.lib.harvard.edu/media/logos/dataspace-46x46.jpg", "url": "https://dataspace.ust.hk/", "slug": "dataspacehkust", "version": "4.2"}, {"id": 1762, "name": "Dataverse e-cienciaDatos", "full_name": "Dataverse e-cienciaDatos", "is_active": true, "description": "Repositorio de Datos del Consorcio Madro\u00f1o.", "lat": 40.416775, "lng": -3.749200, "logo": "https://dvn-h-prod.hz.lib.harvard.edu/media/logos/consorciomadrono-46x46.gif", "url": "https://edatos.consorciomadrono.es/", "slug": "dataverse-e-cienciadatos", "version": "4.8.4"}, {"id": 1742, "name": "DataverseNL", "full_name": "DataverseNL", "is_active": true, "description": "Open for researchers and organizations associated with Dutch universities to deposit data.", "lat": 52.547260, "lng": 5.242346, "logo": "https://dvn-h-prod.hz.lib.harvard.edu/media/logos/logosdataverseNL-46x46.png", "url": "https://dataverse.nl/", "slug": "dataversenl", "version": "4.6.1"}, {"id": 1767, "name": "DataverseNO", "full_name": "Dataverse Network Norway", "is_active": true, "description": "Research data archive open for Norwegian research institutions. Operated by UiT The Arctic University of Norway.", "lat": 69.649208, "lng": 18.955324, "logo": "https://dvn-h-prod.hz.lib.harvard.edu/media/logos/dataverseNO-46x46.png", "url": "https://dataverse.no/", "slug": "dataverseno", "version": "4.9.4"}, {"id": 1768, "name": "DR-NTU (Data)", "full_name": "Nanyang Technological University", "is_active": true, "description": "The institutional open access research data repository for Nanyang Technological University (NTU). 
NTU researchers are encouraged to use DR-NTU (Data) to deposit, publish and archive their final research data in order to make their research data discoverable, accessible and reusable.", "lat": 1.348668, "lng": 103.683104, "logo": "https://dvn-h-prod.hz.lib.harvard.edu/media/logos/NTULogo_2.gif", "url": "https://researchdata.ntu.edu.sg", "slug": "dr-ntu-data", "version": "4.7.1"}, {"id": 1744, "name": "Fudan University", "full_name": "Fudan University Dataverse", "is_active": true, "description": "Open for Fudan University affiliated researchers to deposit data.", "lat": 31.298531, "lng": 121.501446, "logo": "https://dvn-h-prod.hz.lib.harvard.edu/media/logos/fudan-46x46.png", "url": "https://dvn.fudan.edu.cn/home/", "slug": "fudan-university", "version": "4.x"}, {"id": 1791, "name": "G\u00f6ttingen Research Online", "full_name": "G\u00f6ttingen eResearch Alliance", "is_active": true, "description": "G\u00f6ttingen Research Online is an institutional repository for the publication of research data at the G\u00f6ttingen Campus. It is managed by the G\u00f6ttingen eResearch Alliance, a joint group of the G\u00f6ttingen State and University Library and Gesellschaft f\u00fcr wissenschaftliche Datenverarbeitung mbH G\u00f6ttingen.", "lat": 51.533713, "lng": 9.932198, "logo": "https://dvn-h-prod.hz.lib.harvard.edu/media/logos/gottingen-46x46_ZryDdtE.jpg", "url": "https://data.gro.uni-goettingen.de/", "slug": "gottingen-research-online", "version": "4.14"}, {"id": 1745, "name": "Harvard Dataverse", "full_name": "Harvard University", "is_active": true, "description": "Share, archive, and get credit for your data. Find and cite data across all research fields.", "lat": 42.380098, "lng": -71.116629, "logo": "https://dvn-h-prod.hz.lib.harvard.edu/media/logos/harvard-46x46.png", "url": "https://dataverse.harvard.edu", "slug": "harvard-dataverse", "version": "4.12"}, {"id": 1746, "name": "HeiDATA", "full_name": "Heidelberg University", "is_active": true, "description": "Open for Heidelberg University affiliated researchers to deposit data.", "lat": 49.398750, "lng": 8.672434, "logo": "https://dvn-h-prod.hz.lib.harvard.edu/media/logos/heidelberg-46x46.jpg", "url": "https://heidata.uni-heidelberg.de/", "slug": "heidata", "version": "4.8.2"}, {"id": 1747, "name": "IBICT", "full_name": "IBICT (Brazil)", "is_active": true, "description": "The network Cariniana, cariniana.ibict.br, is funded entirely by the Brazilian government and in particular by MCTI (Minist\u00e9rio da Ci\u00eancia, Tecnologia e Inova\u00e7\u00e3o). It is a project for long-term preservation of scientific publications in Brazil.", "lat": -15.805842, "lng": -47.881369, "logo": "https://dvn-h-prod.hz.lib.harvard.edu/media/logos/ibict-46x46.jpg", "url": "https://repositoriopesquisas.ibict.br/", "slug": "ibict", "version": "4.5.1"}, {"id": 1757, "name": "ICRISAT", "full_name": "ICRISAT", "is_active": true, "description": "International Crops Research Institute for the Semi-Arid Tropics. 
Free open data repository of ICRISAT research data including Social science, Phenotypic, Genotypic, Spatial and Soil & Weather data which are linked with open publications.", "lat": 17.385000, "lng": 78.486700, "logo": "https://dvn-h-prod.hz.lib.harvard.edu/media/logos/icrisat-46x46.png", "url": "http://dataverse.icrisat.org/", "slug": "icrisat", "version": "4.8.1"}, {"id": 1784, "name": "ICWSM", "full_name": "International AAAI Conference on Web and Social Media", "is_active": true, "description": "Datasets from the International AAAI Conference on Web and Social Media (ICWSM).", "lat": 37.432057, "lng": -122.175297, "logo": "https://dvn-h-prod.hz.lib.harvard.edu/media/logos/icwsm-46x46.jpg", "url": "https://dataverse.mpi-sws.org/dataverse/icwsm", "slug": "icwsm", "version": "4.8.5"}, {"id": 1782, "name": "Ifsttar Dataverse", "full_name": "French Institute of Science and Technology for Transport, Development and Networks", "is_active": true, "description": "Ifsttar Dataverse is an institutional repository for research data of the French Institute of Science and Technology for Transport, Development and Networks. It catalogues research data in the field of transports, spatial planning and civil engineering.", "lat": 48.852800, "lng": 2.602700, "logo": "https://dvn-h-prod.hz.lib.harvard.edu/media/logos/ifsttar-46x46.jpg", "url": "https://research-data.ifsttar.fr/dataverse/data", "slug": "ifsttar-dataverse", "version": "4.10.1"}, {"id": 1748, "name": "IISH Dataverse", "full_name": "International Institute of Social History", "is_active": true, "description": "The IISH Dataverse contains micro-, meso-, and macro-level datasets on social and economic history.", "lat": 52.369021, "lng": 4.939226, "logo": "https://dvn-h-prod.hz.lib.harvard.edu/media/logos/internationalInstituteOfSocialHistory-46x46.jpg", "url": "https://datasets.socialhistory.org/", "slug": "iish-dataverse", "version": "4.3"}, {"id": 1783, "name": "International Potato Center", "full_name": "International Potato Center", "is_active": true, "description": "Centro Internacional De La Papa (International Potato Center) is a member of the CGIAR Consortium, an international organization made up of 15 centers engaged in research for a food secure future.", "lat": -12.077791, "lng": -76.946888, "logo": "https://dvn-h-prod.hz.lib.harvard.edu/media/logos/cip-46x46.png", "url": "https://data.cipotato.org/dataverse.xhtml", "slug": "international-potato-center", "version": "4.8.1"}, {"id": 1749, "name": "Johns Hopkins University", "full_name": "Johns Hopkins", "is_active": true, "description": "Johns Hopkins University Data Archive", "lat": 39.329055, "lng": -76.620335, "logo": "https://dvn-h-prod.hz.lib.harvard.edu/media/logos/johns-46x46.jpg", "url": "https://archive.data.jhu.edu/", "slug": "johns-hopkins-university", "version": "4.6"}, {"id": 1750, "name": "Libra Data", "full_name": "Libra Data (University of Virginia)", "is_active": true, "description": "Libra Data is a place for UVA researchers to share data publicly, and is part of the Libra Scholarly Repository suite of services which includes works of UVA scholarship such as articles, books, theses, and data.", "lat": 38.034578, "lng": -78.507394, "logo": "https://dvn-h-prod.hz.lib.harvard.edu/media/logos/libra-46x46.jpg", "url": "https://dataverse.lib.virginia.edu/", "slug": "libra-data", "version": "4.7.1"}, {"id": 1776, "name": "LIPI Dataverse", "full_name": "Lembaga Ilmu Pengetahuan Indonesia (LIPI) Dataverse", "is_active": true, "description": "The Repositori Ilmiah Nasional 
(RIN) is a means to share, preserve, cite, explore, and analyze research data. RIN increases data availability and allows others to reproduce research more easily. Researchers, data authors, publishers, data distributors, and affiliate institutions all receive academic credit and web visibility. Researchers, agencies, and funders have full control over research data.", "lat": -6.228771, "lng": 106.818082, "logo": "https://dvn-h-prod.hz.lib.harvard.edu/media/logos/LIPI-46x46.png", "url": "https://data.lipi.go.id", "slug": "lipi-dataverse", "version": "4.6.2"}, {"id": 1756, "name": "Maine Dataverse Network", "full_name": "Maine Dataverse Network", "is_active": true, "description": "A service brought to you by the ACG@UMaine. The way Supercomputing should be!", "lat": 44.901349, "lng": -68.671815, "logo": "https://dvn-h-prod.hz.lib.harvard.edu/media/logos/uOfMaine-46x46.jpg", "url": "http://dataverse.acg.maine.edu/dvn/", "slug": "maine-dataverse-network", "version": "3.5.1"}, {"id": 1787, "name": "MELDATA", "full_name": "International Center for Agriculture Research in the Dry Areas", "is_active": true, "description": "The Dataverse portal of the International Center for Agricultural Research in Dry Ares.", "lat": 33.888630, "lng": 35.495480, "logo": "https://dvn-h-prod.hz.lib.harvard.edu/media/logos/meldata-46x46.png", "url": "http://data.mel.cgiar.org", "slug": "meldata", "version": "4.8.5"}, {"id": 1786, "name": "NIE Data Repository", "full_name": "National Institute of Education", "is_active": true, "description": "NIE Data Repository is the institutional open access research data repository for National Institute of Education, Singapore. NIE researchers are encouraged to use NIE Data Repository to deposit, publish and archive their final research data in order to make their research data discoverable, accessible and reusable.", "lat": 1.349115, "lng": 103.678829, "logo": "https://dvn-h-prod.hz.lib.harvard.edu/media/logos/nie-46x46.png", "url": "https://researchdata.nie.edu.sg", "slug": "nie-data-repository", "version": "4.10.1"}, {"id": 1752, "name": "Peking University", "full_name": "Peking University", "is_active": true, "description": "Peking University Open Research Data Platform", "lat": 39.993923, "lng": 116.306539, "logo": "https://dvn-h-prod.hz.lib.harvard.edu/media/logos/peking-46x46.jpg", "url": "http://opendata.pku.edu.cn/", "slug": "peking-university", "version": "4.0"}, {"id": 1781, "name": "QDR Main Collection", "full_name": "Qualitative Data Repository", "is_active": true, "description": "QDR curates, stores, preserves, publishes, and enables the download of digital data generated through qualitative and multi-method research in the social sciences. The repository develops and disseminates guidance for managing, sharing, citing, and reusing qualitative data, and contributes to the generation of common standards for doing so. 
QDR\u2019s overarching goals are to make sharing qualitative data customary in the social sciences, to broaden access to social science data, and to strengthen qualitative and multi-method research.", "lat": 43.038013, "lng": -76.135566, "logo": "https://dvn-h-prod.hz.lib.harvard.edu/media/logos/qdr-46x46_kCBOWwk.png", "url": "https://data.qdr.syr.edu", "slug": "qdr-main-collection", "version": "4.10.1"}, {"id": 1780, "name": "Reposit\u00f3rio de Dados de Pesquisa da UFABC", "full_name": "Universidade Federal do ABC (UFABC)", "is_active": true, "description": "", "lat": -23.643807, "lng": -46.528304, "logo": "https://dvn-h-prod.hz.lib.harvard.edu/media/logos/ufabc-46x46_pWiXhTC.png", "url": "http://dataverse.ufabc.edu.br", "slug": "repositorio-de-dados-de-pesquisa-da-ufabc", "version": "4.8.5"}, {"id": 1785, "name": "Reposit\u00f3rio de Dados de Pesquisa do ILEEL", "full_name": "Institute of Linguistics and Literature (Federal University of Uberl\u00e2ndia)", "is_active": true, "description": "Research data repository of Institute of Linguistics and Literature / Federal University of Uberlandia", "lat": -18.908702, "lng": -48.291944, "logo": "https://dvn-h-prod.hz.lib.harvard.edu/media/logos/ileel-46x46_JPxxGBm.png", "url": "http://dataverse.ileel.ufu.br", "slug": "repositorio-de-dados-de-pesquisa-do-ileel", "version": "4.11"}, {"id": 1789, "name": "Repositorio de Datos de Investigaci\u00f3n Universidad del Rosario", "full_name": "Universidad del Rosario", "is_active": true, "description": "Explore research data from Universidad del Rosario affiliated researchers.", "lat": 4.600598, "lng": -74.073352, "logo": "https://dvn-h-prod.hz.lib.harvard.edu/media/logos/ur-46x46_ohTqcfg.png", "url": "http://research-data.urosario.edu.co/", "slug": "repositorio-de-datos-de-investigacion-universidad-del-rosario", "version": "4.9.4"}, {"id": 1758, "name": "Scholars Portal", "full_name": "Scholars Portal Dataverse", "is_active": true, "description": "Open for researchers and organizations associated with Ontario universities to deposit data.", "lat": 43.653200, "lng": -79.383200, "logo": "https://dvn-h-prod.hz.lib.harvard.edu/media/logos/scholarsportal-46x46.jpg", "url": "https://dataverse.scholarsportal.info/", "slug": "scholars-portal", "version": "4.10.1"}, {"id": 1761, "name": "Texas Data Repository Dataverse", "full_name": "Texas Data Repository Dataverse", "is_active": true, "description": "A statewide archive of research data from Texas Digital Library (TDL) member institutions.", "lat": 30.307182, "lng": -97.755996, "logo": "https://dvn-h-prod.hz.lib.harvard.edu/media/logos/tdl-46x46.png", "url": "https://dataverse.tdl.org/", "slug": "texas-data-repository-dataverse", "version": "4.7.1"}, {"id": 1755, "name": "UAL Dataverse", "full_name": "University of Alberta Libraries Dataverse", "is_active": true, "description": "Open for University of Alberta affiliated researchers to deposit data.", "lat": 53.494321, "lng": -113.549027, "logo": "https://dvn-h-prod.hz.lib.harvard.edu/media/logos/uOfAlberta-46x46.jpg", "url": "https://dataverse.library.ualberta.ca/dvn/", "slug": "ual-dataverse", "version": "4.5.1"}, {"id": 1788, "name": "UCLA Dataverse", "full_name": "UCLA Data Science Center", "is_active": true, "description": "", "lat": 34.068900, "lng": -118.445200, "logo": "https://dvn-h-prod.hz.lib.harvard.edu/media/logos/ucla-46x46.png", "url": "https://dataverse.ucla.edu/dataverse/dataverse", "slug": "ucla-dataverse", "version": "4.13"}, {"id": 1772, "name": "UNB Libraries Dataverse", "full_name": 
"University of New Brunswick Libraries", "is_active": true, "description": "", "lat": 45.964993, "lng": -66.646332, "logo": "https://dvn-h-prod.hz.lib.harvard.edu/media/logos/unbDataverse-46x46.png", "url": "https://dataverse.lib.unb.ca/", "slug": "unb-libraries-dataverse", "version": "4.8.2"}, {"id": 1751, "name": "UNC Dataverse", "full_name": "Odum Institute for Research in Social Science", "is_active": true, "description": "Open for all researchers worldwide from all disciplines to deposit data. The Odum Institute also offers multiple data curation service levels. For more information, go to http://www.irss.unc.edu/odum/contentPrimary.jsp?nodeid=5.", "lat": 35.905022, "lng": -79.050851, "logo": "https://dvn-h-prod.hz.lib.harvard.edu/media/logos/unc-46x46.png", "url": "https://dataverse.unc.edu/", "slug": "unc-dataverse", "version": "4.7.1"}, {"id": 1790, "name": "Universit\u00e0 degli Studi di Milano", "full_name": "University of Milan", "is_active": true, "description": "The University of Milan is home to important research teams operating in the university's extremely rich variety of scientific-disciplinary sectors. Besides taking part in the most relevant national and international research programs, the University is also very active in the field of technology transfer and developing applications for scientific research results. dataverse.unimi.it is the repository for research data offered to all researchers in disciplines from health sciences to laws, from economics to hard sciences, from humanities to mathematics.", "lat": 45.460100, "lng": 9.194600, "logo": "https://dvn-h-prod.hz.lib.harvard.edu/media/logos/milan-46x46.png", "url": "https://dataverse.unimi.it/", "slug": "universita-degli-studi-di-milano", "version": null}, {"id": 1765, "name": "University of Manitoba Dataverse", "full_name": "University of Manitoba Dataverse", "is_active": true, "description": "", "lat": 49.895077, "lng": -97.138451, "logo": "https://dvn-h-prod.hz.lib.harvard.edu/media/logos/manitoba-46x46.jpg", "url": "https://dataverse.lib.umanitoba.ca/", "slug": "university-of-manitoba-dataverse", "version": "4.8.4"}, {"id": 1775, "name": "UWI", "full_name": "The University of the West Indies", "is_active": true, "description": "The University of the West Indies Research Datasets Repository.", "lat": 18.006372, "lng": -76.747148, "logo": "https://dvn-h-prod.hz.lib.harvard.edu/media/logos/uwi-46x46.png", "url": "http://dataverse.sta.uwi.edu/", "slug": "uwi", "version": "4.8.4"}, {"id": 1779, "name": "VTTI", "full_name": "Virginia Tech Transportation Institute", "is_active": true, "description": "Transportation data repository maintained by the Virginia Tech Transportation Institute.", "lat": 37.190102, "lng": -80.396776, "logo": "https://dvn-h-prod.hz.lib.harvard.edu/media/logos/vtti-46x46_Q6PC6r7.png", "url": "https://dataverse.vtti.vt.edu/", "slug": "vtti", "version": "4.9.4"}]} diff --git a/repo2docker/contentproviders/dataverse.py b/repo2docker/contentproviders/dataverse.py new file mode 100644 index 000000000..15f2ec469 --- /dev/null +++ b/repo2docker/contentproviders/dataverse.py @@ -0,0 +1,129 @@ +import os +import json +import shutil + +from urllib.request import Request +from urllib.parse import urlparse, urlunparse +from zipfile import ZipFile + +from .doi import DoiProvider +from ..utils import copytree, deep_get, is_doi, normalize_doi + + +class Dataverse(DoiProvider): + """Provide contents of a Dataverse dataset.""" + + def __init__(self): + data_file = os.path.join(os.path.dirname(__file__), 
"dataverse.json") + with open(data_file, "r") as fp: + self.hosts = json.load(fp)["installations"] + super().__init__() + + def detect(self, doi, ref=None, extra_args=None): + """Trigger this provider for things that resolve to a Dataverse dataset. + + Handles: + - DOI pointing to {siteURL}/dataset.xhtml?persistentId={persistentId} + - DOI pointing to {siteURL}/file.xhtml?persistentId={persistentId}&... + - URL {siteURL}/api/access/datafile/{fileId} + + Examples: + - https://dataverse.harvard.edu/api/access/datafile/3323458 + - doi:10.7910/DVN/6ZXAGT + - doi:10.7910/DVN/6ZXAGT/3YRRYJ + + """ + url = self.doi2url(doi) + + # Check if the url matches any known Dataverse installation, bail if not. + host = next((host for host in self.hosts if url.startswith(host["url"])), None) + if host is None: + return + + # Parse the url, to get the base for later API calls + parsed_url = urlparse(url) + + # Corner case handling + if parsed_url.path.startswith("/file.xhtml"): + # There's no way of getting file information using its persistentId, the only thing we can do is assume that doi + # is structured as "doi:/" and try to handle dataset that way. + new_doi = doi.rsplit("/", 1)[0] + if new_doi == doi: + # tough luck :( Avoid inifite recursion and exit. + return + return self.detect(new_doi) + elif parsed_url.path.startswith("/api/access/datafile"): + # Raw url pointing to a datafile is a typical output from an External Tool integration + entity_id = os.path.basename(parsed_url.path) + search_query = "q=entityId:" + entity_id + "&type=file" + # Knowing the file identifier query search api to get parent dataset + search_url = urlunparse( + parsed_url._replace(path="/api/search", query=search_query) + ) + resp = self.urlopen(search_url).read() + data = json.loads(resp.decode("utf-8"))["data"] + if data["count_in_response"] != 1: + self.log.debug("Dataverse search query failed!") + self.log.debug(" - doi = " + doi) + self.log.debug(" - url = " + url) + self.log.debug(" - resp = " + json.dumps(data)) + return + + self.record_id = deep_get(data, "items.0.dataset_persistent_id") + elif is_doi(doi): + self.record_id = "doi:" + normalize_doi(doi) + + if hasattr(self, "record_id"): + return {"record": self.record_id, "host": host} + + def fetch(self, spec, output_dir, yield_output=False): + """Fetch and unpack a Dataverse dataset.""" + record_id = spec["record"] + host = spec["host"] + + yield "Fetching Dataverse record {}.\n".format(record_id) + req = Request( + "{}/api/datasets/:persistentId?persistentId={}".format( + host["url"], record_id + ), + headers={"accept": "application/json"}, + ) + resp = self.urlopen(req) + record = json.loads(resp.read().decode("utf-8"))["data"] + + # In order to fetch entire dataset we build a list of file IDs we want to fetch + # and then receive a zip file containing all of them. + # TODO: Dataverse has a limit for the zipfile size (see + # https://github.com/jupyter/repo2docker/pull/739#issuecomment-510834729) + # If size of the dataset is grater than 100MB individual files should be downloaded. 
From 8edafd0d077a934be75768edf16b8de27916e6ed Mon Sep 17 00:00:00 2001
From: "Kacper Kowalik (Xarthisius)"
Date: Fri, 13 Sep 2019 11:59:36 -0500
Subject: [PATCH 2/7] Add tests

---
 tests/unit/contentproviders/test_dataverse.py | 121 ++++++++++++++++++
 1 file changed, 121 insertions(+)
 create mode 100644 tests/unit/contentproviders/test_dataverse.py

diff --git a/tests/unit/contentproviders/test_dataverse.py b/tests/unit/contentproviders/test_dataverse.py
new file mode 100644
index 000000000..370da54cb
--- /dev/null
+++ b/tests/unit/contentproviders/test_dataverse.py
@@ -0,0 +1,121 @@
+import json
+import os
+import pytest
+
+from contextlib import contextmanager
+from io import BytesIO
+from tempfile import TemporaryDirectory, NamedTemporaryFile
+from unittest.mock import patch
+from urllib.request import urlopen, Request
+from zipfile import ZipFile
+
+from repo2docker.contentproviders import Dataverse
+
+
+test_dv = Dataverse()
+harvard_dv = next((_ for _ in test_dv.hosts if _["id"] == 1745))
+test_hosts = [
+    (
+        [
+            "doi:10.7910/DVN/6ZXAGT/3YRRYJ",
+            "10.7910/DVN/6ZXAGT",
+            "https://dataverse.harvard.edu/api/access/datafile/3323458",
+        ],
+        {"host": harvard_dv, "record": "doi:10.7910/DVN/6ZXAGT"},
+    )
+]
+test_responses = {
+    "doi:10.7910/DVN/6ZXAGT/3YRRYJ": (
+        "https://dataverse.harvard.edu/file.xhtml"
+        "?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ"
+    ),
+    "doi:10.7910/DVN/6ZXAGT": (
+        "https://dataverse.harvard.edu/dataset.xhtml"
+        "?persistentId=doi:10.7910/DVN/6ZXAGT"
+    ),
+    "10.7910/DVN/6ZXAGT": (
+        "https://dataverse.harvard.edu/dataset.xhtml"
+        "?persistentId=doi:10.7910/DVN/6ZXAGT"
+    ),
+    "https://dataverse.harvard.edu/api/access/datafile/3323458": "https://dataverse.harvard.edu/api/access/datafile/3323458",
+}
+test_search = {
+    "data": {
+        "count_in_response": 1,
+        "items": [{"dataset_persistent_id": "doi:10.7910/DVN/6ZXAGT"}],
+    }
+}
+
+
+@pytest.mark.parametrize("test_input, expected", test_hosts)
+def test_detect_dataverse(test_input, expected):
+    def doi_resolver(url):
+        return test_responses.get(url)
+
+    with patch.object(Dataverse, "urlopen") as fake_urlopen, patch.object(
+        Dataverse, "doi2url", side_effect=doi_resolver
+    ) as fake_doi2url:
+        fake_urlopen.return_value.read.return_value = json.dumps(test_search).encode()
+        # valid Dataverse DOIs trigger this content provider
+        assert Dataverse().detect(test_input[0]) == expected
+        assert fake_doi2url.call_count == 2  # File, then dataset
+        assert Dataverse().detect(test_input[1]) == expected
+        assert Dataverse().detect(test_input[2]) == expected
+        # only two of the three calls above have to resolve a DOI
+        assert fake_urlopen.call_count == 1
+
+    with patch.object(Dataverse, "urlopen") as fake_urlopen:
+        # Don't trigger the Dataverse content provider
+        assert Dataverse().detect("/some/path/here") is None
+        assert Dataverse().detect("https://example.com/path/here") is None
+        # don't handle DOIs that aren't from Dataverse
+        fake_urlopen.return_value.url = (
+            "http://joss.theoj.org/papers/10.21105/joss.01277"
+        )
+        assert Dataverse().detect("https://doi.org/10.21105/joss.01277") is None
+
+
+@contextmanager
+def dv_archive(prefix="a_directory"):
+    with NamedTemporaryFile(suffix=".zip") as zfile:
+        with ZipFile(zfile.name, mode="w") as zip:
+            zip.writestr("{}/some-file.txt".format(prefix), "some content")
+            zip.writestr("{}/some-other-file.txt".format(prefix), "some more content")
+
+        yield zfile.name
+
+
+def test_dataverse_fetch():
+    mock_response_ds_query = BytesIO(
+        json.dumps(
+            {
+                "data": {
+                    "latestVersion": {
+                        "files": [{"dataFile": {"id": 1}}, {"dataFile": {"id": 2}}]
+                    }
+                }
+            }
+        ).encode("utf-8")
+    )
+    spec = {"host": harvard_dv, "record": "doi:10.7910/DVN/6ZXAGT"}
+    dv = Dataverse()
+
+    with dv_archive() as data_local_path:
+
+        def mock_urlopen(self, req):
+            if isinstance(req, Request):
+                if "/api/datasets" in req.full_url:
+                    return mock_response_ds_query
+                elif "/api/access/datafiles" in req.full_url:
+                    assert req.full_url.endswith("1,2")
+                    return urlopen("file://{}".format(data_local_path))
+
+        with patch.object(Dataverse, "urlopen", new=mock_urlopen):
+            with TemporaryDirectory() as d:
+                output = []
+                for l in dv.fetch(spec, d):
+                    output.append(l)
+
+                unpacked_files = set(os.listdir(d))
+                expected = set(["some-other-file.txt", "some-file.txt"])
+                assert expected == unpacked_files
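The detect() path exercised by these tests leans on repo2docker's deep_get helper to walk the mocked search response. As a reading aid, here is a stand-in that is assumed to be equivalent to the real helper in repo2docker's utils module:

    from functools import reduce

    # Assumed-equivalent sketch of repo2docker's deep_get: resolve a
    # dot-separated path, treating numeric components as list indices.
    def deep_get(obj, path):
        def getter(node, key):
            return node[int(key)] if isinstance(node, list) else node[key]
        return reduce(getter, path.split("."), obj)

    data = {
        "count_in_response": 1,
        "items": [{"dataset_persistent_id": "doi:10.7910/DVN/6ZXAGT"}],
    }
    print(deep_get(data, "items.0.dataset_persistent_id"))  # doi:10.7910/DVN/6ZXAGT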
From 063fd4dd9a709c43a83a3fcfe8374645055a1fa7 Mon Sep 17 00:00:00 2001
From: "Kacper Kowalik (Xarthisius)"
Date: Fri, 13 Sep 2019 13:40:31 -0500
Subject: [PATCH 3/7] Add a command for generating dataverse data file

---
 setup.py | 32 +++++++++++++++++++++++++++++++-
 1 file changed, 31 insertions(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 9b7ce5223..4df1c522a 100644
--- a/setup.py
+++ b/setup.py
@@ -1,3 +1,4 @@
+from distutils.cmd import Command
 from setuptools import setup, find_packages
 import sys
 import versioneer
@@ -8,6 +9,35 @@
 with open("README.md", encoding="utf8") as f:
     readme = f.read()
 
+
+class GenerateDataverseInstallationsFileCommand(Command):
+    description = "Generate Dataverse installations data map"
+    user_options = []
+
+    def initialize_options(self):
+        self.url = (
+            "https://services.dataverse.harvard.edu/miniverse/map/installations-json"
+        )
+
+    def finalize_options(self):
+        pass
+
+    def run(self):
+        from urllib.request import urlopen
+        import json
+
+        resp = urlopen(self.url, timeout=5)
+        resp_body = resp.read()
+        data = json.loads(resp_body.decode("utf-8"))
+        if "installations" not in data:
+            raise ValueError("Malformed installation map.")
+        with open("repo2docker/contentproviders/dataverse.json", "wb") as fp:
+            fp.write(resp_body)
+
+
+__cmdclass = versioneer.get_cmdclass()
+__cmdclass["generate_dataverse_file"] = GenerateDataverseInstallationsFileCommand
+
 setup(
     name="jupyter-repo2docker",
     version=versioneer.get_version(),
@@ -48,7 +78,7 @@
     ],
     packages=find_packages(),
     include_package_data=True,
-    cmdclass=versioneer.get_cmdclass(),
+    cmdclass=__cmdclass,
     entry_points={
         "console_scripts": [
            "jupyter-repo2docker = repo2docker.__main__:main",
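The new command is run as "python setup.py generate_dataverse_file" and simply snapshots the installations map into the package. A quick, illustrative sanity check of the regenerated file (the key names match the JSON added in PATCH 1/7):

    import json

    with open("repo2docker/contentproviders/dataverse.json") as fp:
        data = json.load(fp)

    # Every installation entry carries the url the provider matches against.
    assert "installations" in data
    assert all(host["url"].startswith("http") for host in data["installations"])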
From 14b758d327765fe0e41a6b06fcd5038f2efaa7ea Mon Sep 17 00:00:00 2001
From: "Kacper Kowalik (Xarthisius)"
Date: Mon, 16 Sep 2019 09:17:15 -0500
Subject: [PATCH 4/7] Improve debug logging

---
 repo2docker/contentproviders/dataverse.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/repo2docker/contentproviders/dataverse.py b/repo2docker/contentproviders/dataverse.py
index 15f2ec469..dee04b3fd 100644
--- a/repo2docker/contentproviders/dataverse.py
+++ b/repo2docker/contentproviders/dataverse.py
@@ -60,13 +60,15 @@ def detect(self, doi, ref=None, extra_args=None):
             search_url = urlunparse(
                 parsed_url._replace(path="/api/search", query=search_query)
             )
+            self.log.debug("Querying Dataverse: " + search_url)
             resp = self.urlopen(search_url).read()
             data = json.loads(resp.decode("utf-8"))["data"]
             if data["count_in_response"] != 1:
-                self.log.debug("Dataverse search query failed!")
-                self.log.debug(" - doi = " + doi)
-                self.log.debug(" - url = " + url)
-                self.log.debug(" - resp = " + json.dumps(data))
+                self.log.debug(
+                    "Dataverse search query failed!\n - doi: {}\n - url: {}\n - resp: {}\n".format(
+                        doi, url, json.dumps(data)
+                    )
+                )
                 return
 
             self.record_id = deep_get(data, "items.0.dataset_persistent_id")

From 30375d13df0c43968d77548f43591ee528cc618e Mon Sep 17 00:00:00 2001
From: "Kacper Kowalik (Xarthisius)"
Date: Mon, 16 Sep 2019 14:24:31 -0500
Subject: [PATCH 5/7] Use the persistent id from the url regardless of whether
 it's a DOI or not

---
 repo2docker/contentproviders/dataverse.py | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/repo2docker/contentproviders/dataverse.py b/repo2docker/contentproviders/dataverse.py
index dee04b3fd..8a9f7d0fe 100644
--- a/repo2docker/contentproviders/dataverse.py
+++ b/repo2docker/contentproviders/dataverse.py
@@ -3,11 +3,11 @@
 import shutil
 
 from urllib.request import Request
-from urllib.parse import urlparse, urlunparse
+from urllib.parse import urlparse, urlunparse, parse_qs
 from zipfile import ZipFile
 
 from .doi import DoiProvider
-from ..utils import copytree, deep_get, is_doi, normalize_doi
+from ..utils import copytree, deep_get
 
 
 class Dataverse(DoiProvider):
@@ -42,6 +42,7 @@ def detect(self, doi, ref=None, extra_args=None):
 
         # Parse the url, to get the base for later API calls
         parsed_url = urlparse(url)
+        query_args = parse_qs(parsed_url.query)
 
         # Corner case handling
         if parsed_url.path.startswith("/file.xhtml"):
@@ -72,8 +73,11 @@ def detect(self, doi, ref=None, extra_args=None):
                 return
 
             self.record_id = deep_get(data, "items.0.dataset_persistent_id")
-        elif is_doi(doi):
-            self.record_id = "doi:" + normalize_doi(doi)
+        elif (
+            parsed_url.path.startswith("/dataset.xhtml")
+            and "persistentId" in query_args
+        ):
+            self.record_id = deep_get(query_args, "persistentId.0")
 
         if hasattr(self, "record_id"):
             return {"record": self.record_id, "host": host}
@@ -127,5 +131,5 @@ def fetch(self, spec, output_dir, yield_output=False):
 
     @property
     def content_id(self):
-        """The Dataverse persistent identifier (could use internal dataset_id too)."""
+        """The Dataverse persistent identifier."""
         return self.record_id
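Concretely, the new dataset.xhtml branch pulls the persistent identifier straight out of the landing page's query string; with the URL used in the tests, the extraction works out like this (illustrative):

    from urllib.parse import parse_qs, urlparse

    url = (
        "https://dataverse.harvard.edu/dataset.xhtml"
        "?persistentId=doi:10.7910/DVN/6ZXAGT"
    )
    query_args = parse_qs(urlparse(url).query)
    # deep_get(query_args, "persistentId.0") boils down to:
    print(query_args["persistentId"][0])  # doi:10.7910/DVN/6ZXAGT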
From a2f8228b157cffe81c358fe3c3c063ab46ca91f8 Mon Sep 17 00:00:00 2001
From: "Kacper Kowalik (Xarthisius)"
Date: Mon, 16 Sep 2019 15:03:12 -0500
Subject: [PATCH 6/7] Match DV hosts based on netloc instead of url

---
 repo2docker/contentproviders/dataverse.py     | 13 ++++++++++---
 tests/unit/contentproviders/test_dataverse.py | 17 ++++++++++++-----
 2 files changed, 22 insertions(+), 8 deletions(-)

diff --git a/repo2docker/contentproviders/dataverse.py b/repo2docker/contentproviders/dataverse.py
index 8a9f7d0fe..791337578 100644
--- a/repo2docker/contentproviders/dataverse.py
+++ b/repo2docker/contentproviders/dataverse.py
@@ -34,14 +34,21 @@ def detect(self, doi, ref=None, extra_args=None):
 
         """
         url = self.doi2url(doi)
+        # Parse the url, to get the base for later API calls
+        parsed_url = urlparse(url)
 
         # Check if the url matches any known Dataverse installation, bail if not.
-        host = next((host for host in self.hosts if url.startswith(host["url"])), None)
+        host = next(
+            (
+                host
+                for host in self.hosts
+                if urlparse(host["url"]).netloc == parsed_url.netloc
+            ),
+            None,
+        )
         if host is None:
             return
 
-        # Parse the url, to get the base for later API calls
-        parsed_url = urlparse(url)
         query_args = parse_qs(parsed_url.query)
 
         # Corner case handling

diff --git a/tests/unit/contentproviders/test_dataverse.py b/tests/unit/contentproviders/test_dataverse.py
index 370da54cb..69ab9917b 100644
--- a/tests/unit/contentproviders/test_dataverse.py
+++ b/tests/unit/contentproviders/test_dataverse.py
@@ -13,15 +13,20 @@
 
 
 test_dv = Dataverse()
-harvard_dv = next((_ for _ in test_dv.hosts if _["id"] == 1745))
+harvard_dv = next((_ for _ in test_dv.hosts if _["name"] == "Harvard Dataverse"))
+cimmyt_dv = next((_ for _ in test_dv.hosts if _["name"] == "CIMMYT Research Data"))
 test_hosts = [
     (
         [
             "doi:10.7910/DVN/6ZXAGT/3YRRYJ",
             "10.7910/DVN/6ZXAGT",
             "https://dataverse.harvard.edu/api/access/datafile/3323458",
+            "hdl:11529/10016",
+        ],
+        [
+            {"host": harvard_dv, "record": "doi:10.7910/DVN/6ZXAGT"},
+            {"host": cimmyt_dv, "record": "hdl:11529/10016"},
         ],
-        {"host": harvard_dv, "record": "doi:10.7910/DVN/6ZXAGT"},
     )
 ]
 test_responses = {
@@ -38,6 +43,7 @@
         "?persistentId=doi:10.7910/DVN/6ZXAGT"
     ),
     "https://dataverse.harvard.edu/api/access/datafile/3323458": "https://dataverse.harvard.edu/api/access/datafile/3323458",
+    "hdl:11529/10016": "https://data.cimmyt.org/dataset.xhtml?persistentId=hdl:11529/10016",
 }
 test_search = {
     "data": {
@@ -57,12 +63,13 @@ def doi_resolver(url):
     ) as fake_doi2url:
         fake_urlopen.return_value.read.return_value = json.dumps(test_search).encode()
         # valid Dataverse DOIs trigger this content provider
-        assert Dataverse().detect(test_input[0]) == expected
+        assert Dataverse().detect(test_input[0]) == expected[0]
         assert fake_doi2url.call_count == 2  # File, then dataset
-        assert Dataverse().detect(test_input[1]) == expected
-        assert Dataverse().detect(test_input[2]) == expected
+        assert Dataverse().detect(test_input[1]) == expected[0]
+        assert Dataverse().detect(test_input[2]) == expected[0]
         # only two of the three calls above have to resolve a DOI
         assert fake_urlopen.call_count == 1
+        assert Dataverse().detect(test_input[3]) == expected[1]
 
     with patch.object(Dataverse, "urlopen") as fake_urlopen:
         # Don't trigger the Dataverse content provider
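The netloc comparison matters because installation entries and resolved dataset URLs do not always share a scheme or path prefix; the CIMMYT case exercised by the new hdl:11529/10016 test illustrates this (values taken from dataverse.json and the test above):

    from urllib.parse import urlparse

    # CIMMYT's entry in dataverse.json uses plain http, while resolved dataset
    # URLs use https, so a string-prefix test misses the installation.
    host_url = "http://data.cimmyt.org/"
    resolved = "https://data.cimmyt.org/dataset.xhtml?persistentId=hdl:11529/10016"

    print(resolved.startswith(host_url))                           # False
    print(urlparse(resolved).netloc == urlparse(host_url).netloc)  # True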
From 4df4fd6ab41a44b994f2ea8664ede9df89220020 Mon Sep 17 00:00:00 2001
From: "Kacper Kowalik (Xarthisius)"
Date: Tue, 17 Sep 2019 15:44:56 -0500
Subject: [PATCH 7/7] Download individual files instead of zip bundle

---
 repo2docker/contentproviders/dataverse.py     | 33 +++-----
 tests/unit/contentproviders/test_dataverse.py | 78 +++++++++++--------
 2 files changed, 57 insertions(+), 54 deletions(-)

diff --git a/repo2docker/contentproviders/dataverse.py b/repo2docker/contentproviders/dataverse.py
index 791337578..3aa679946 100644
--- a/repo2docker/contentproviders/dataverse.py
+++ b/repo2docker/contentproviders/dataverse.py
@@ -4,7 +4,6 @@
 
 from urllib.request import Request
 from urllib.parse import urlparse, urlunparse, parse_qs
-from zipfile import ZipFile
 
 from .doi import DoiProvider
 from ..utils import copytree, deep_get
@@ -104,34 +103,22 @@ def fetch(self, spec, output_dir, yield_output=False):
         resp = self.urlopen(req)
         record = json.loads(resp.read().decode("utf-8"))["data"]
 
-        # In order to fetch entire dataset we build a list of file IDs we want to fetch
-        # and then receive a zip file containing all of them.
-        # TODO: Dataverse has a limit for the zipfile size (see
-        # https://github.com/jupyter/repo2docker/pull/739#issuecomment-510834729)
-        # If size of the dataset is greater than 100MB individual files should be downloaded.
-        file_ids = [
-            str(deep_get(fobj, "dataFile.id"))
-            for fobj in deep_get(record, "latestVersion.files")
-        ]
-
-        req = Request(
-            "{}/api/access/datafiles/{}".format(host["url"], ",".join(file_ids))
-        )
+        for fobj in deep_get(record, "latestVersion.files"):
+            file_url = "{}/api/access/datafile/{}".format(
+                host["url"], deep_get(fobj, "dataFile.id")
+            )
+            filename = os.path.join(fobj.get("directoryLabel", ""), fobj["label"])
 
-        dst_fname = os.path.join(output_dir, "dataverse.zip")
-        with self.urlopen(req) as src, open(dst_fname, "wb") as dst:
-            yield "Fetching files bundle\n"
-            shutil.copyfileobj(src, dst)
+            file_ref = {"download": file_url, "filename": filename}
+            fetch_map = {key: key for key in file_ref.keys()}
 
-        yield "Extracting files\n"
-        with ZipFile(dst_fname) as zfile:
-            zfile.extractall(path=output_dir)
+            for line in self.fetch_file(file_ref, fetch_map, output_dir):
+                yield line
 
-        os.remove(dst_fname)
         new_subdirs = os.listdir(output_dir)
         # if there is only one new subdirectory move its contents
         # to the top level directory
-        if len(new_subdirs) == 1:
+        if len(new_subdirs) == 1 and os.path.isdir(os.path.join(output_dir, new_subdirs[0])):
             d = new_subdirs[0]
             copytree(os.path.join(output_dir, d), output_dir)
             shutil.rmtree(os.path.join(output_dir, d))

diff --git a/tests/unit/contentproviders/test_dataverse.py b/tests/unit/contentproviders/test_dataverse.py
index 69ab9917b..76d396544 100644
--- a/tests/unit/contentproviders/test_dataverse.py
+++ b/tests/unit/contentproviders/test_dataverse.py
@@ -2,12 +2,10 @@
 import os
 import pytest
 
-from contextlib import contextmanager
 from io import BytesIO
-from tempfile import TemporaryDirectory, NamedTemporaryFile
+from tempfile import TemporaryDirectory
 from unittest.mock import patch
 from urllib.request import urlopen, Request
-from zipfile import ZipFile
 
 from repo2docker.contentproviders import Dataverse
@@ -82,47 +80,65 @@ def doi_resolver(url):
         assert Dataverse().detect("https://doi.org/10.21105/joss.01277") is None
 
 
-@contextmanager
-def dv_archive(prefix="a_directory"):
-    with NamedTemporaryFile(suffix=".zip") as zfile:
-        with ZipFile(zfile.name, mode="w") as zip:
-            zip.writestr("{}/some-file.txt".format(prefix), "some content")
-            zip.writestr("{}/some-other-file.txt".format(prefix), "some more content")
+@pytest.fixture
+def dv_files(tmpdir):
 
-        yield zfile.name
+    f1 = tmpdir.join("some-file.txt")
+    f1.write("some content")
 
+    f2 = tmpdir.mkdir("directory").join("some-other-file.txt")
+    f2.write("some other content")
 
-def test_dataverse_fetch():
+    f3 = tmpdir.join("directory").mkdir("subdirectory").join("the-other-file.txt")
+    f3.write("yet another content")
+
+    return [f1, f2, f3]
+
+
+def test_dataverse_fetch(dv_files):
     mock_response_ds_query = BytesIO(
         json.dumps(
             {
                 "data": {
                     "latestVersion": {
-                        "files": [{"dataFile": {"id": 1}}, {"dataFile": {"id": 2}}]
+                        "files": [
+                            {"dataFile": {"id": 1}, "label": "some-file.txt"},
+                            {
+                                "dataFile": {"id": 2},
+                                "label": "some-other-file.txt",
+                                "directoryLabel": "directory",
+                            },
+                            {
+                                "dataFile": {"id": 3},
+                                "label": "the-other-file.txt",
+                                "directoryLabel": "directory/subdirectory",
+                            },
+                        ]
                     }
                 }
             }
         ).encode("utf-8")
     )
     spec = {"host": harvard_dv, "record": "doi:10.7910/DVN/6ZXAGT"}
+
     dv = Dataverse()
 
-    with dv_archive() as data_local_path:
-
-        def mock_urlopen(self, req):
-            if isinstance(req, Request):
-                if "/api/datasets" in req.full_url:
-                    return mock_response_ds_query
-                elif "/api/access/datafiles" in req.full_url:
-                    assert req.full_url.endswith("1,2")
-                    return urlopen("file://{}".format(data_local_path))
-
-        with patch.object(Dataverse, "urlopen", new=mock_urlopen):
-            with TemporaryDirectory() as d:
-                output = []
-                for l in dv.fetch(spec, d):
-                    output.append(l)
-
-                unpacked_files = set(os.listdir(d))
-                expected = set(["some-other-file.txt", "some-file.txt"])
-                assert expected == unpacked_files
+    def mock_urlopen(self, req):
+        if isinstance(req, Request):
+            return mock_response_ds_query
+        else:
+            file_no = int(req.split("/")[-1]) - 1
+            return urlopen("file://{}".format(dv_files[file_no]))
+
+    with patch.object(Dataverse, "urlopen", new=mock_urlopen):
+        with TemporaryDirectory() as d:
+            output = []
+            for l in dv.fetch(spec, d):
+                output.append(l)
+
+            unpacked_files = set(os.listdir(d))
+            expected = set(["directory", "some-file.txt"])
+            assert expected == unpacked_files
+            assert os.path.isfile(
+                os.path.join(d, "directory", "subdirectory", "the-other-file.txt")
+            )