From 09d156cc02c0a14f2909d0bc046ab25b75c8024b Mon Sep 17 00:00:00 2001 From: Leonardo Uieda Date: Thu, 16 Jan 2020 16:26:02 +0000 Subject: [PATCH 01/10] Use pooch.os_cache and pkg_resources in datasets With Pooch 0.7.0, the recommended way of loading the registry file is with `pkg_resources` (see fatiando/pooch#120). It's also better to use the default cache location so users can more easily clean up unused files. Because this is system specific, add the `verde.datasets.locate` function to return the cache folder location. --- data/examples/README.txt | 30 +++++++++++++----------------- doc/api/index.rst | 1 + verde/datasets/__init__.py | 1 + verde/datasets/sample_data.py | 26 ++++++++++++++++++++++++-- verde/tests/test_datasets.py | 12 ++++++++++++ 5 files changed, 51 insertions(+), 19 deletions(-) diff --git a/data/examples/README.txt b/data/examples/README.txt index 4b29661b1..8cf9df1f7 100644 --- a/data/examples/README.txt +++ b/data/examples/README.txt @@ -3,27 +3,23 @@ Sample Data =========== -Verde provides some sample data and ways of generating synthetic data through the -:mod:`verde.datasets` module. The sample data are automatically downloaded from the `Github -repository `__ to a folder on your computer the first -time you use them. After that, the data are loaded from this folder. The download is -managed by the :mod:`pooch` package. +Verde provides some sample data and ways of generating synthetic data through +the :mod:`verde.datasets` module. +Where are my data files? +------------------------ -Where is my data? ------------------ - -The data files are downloaded to a folder ``~/.verde/data/`` by default. This is the -*base data directory*. :mod:`pooch` will create a separate folder in the base directory -for each version of Verde. So for Verde 0.1, the base data dir is ``~/.verde/data/0.1``. -If you're using the latest development version from Github, the version is ``master``. - -You can change the base data directory by setting the ``VERDE_DATA_DIR`` environment -variable to a different path. +The sample data files are downloaded automatically by :mod:`pooch` the first +time you load them. The files are saved to the default cache location on your +operating system. The location varies depending on your system and +configuration. We provide the :func:`verde.datasets.locate` function if you +need to find the data storage location on your system. +You can change the base data directory by setting the ``VERDE_DATA_DIR`` +environment variable to the desired path. Available datasets ------------------ -These are the datasets currently available. Most also come with a function for setting -up a Cartopy map to display the data. +These are the datasets currently available. Most also come with a companion +function for setting up a Cartopy map to display the data. diff --git a/doc/api/index.rst b/doc/api/index.rst index b4c07aa60..4f1e0708f 100644 --- a/doc/api/index.rst +++ b/doc/api/index.rst @@ -93,6 +93,7 @@ Datasets .. autosummary:: :toctree: generated/ + datasets.locate datasets.CheckerBoard datasets.fetch_baja_bathymetry datasets.setup_baja_bathymetry_map diff --git a/verde/datasets/__init__.py b/verde/datasets/__init__.py index a70018574..e803e3cb2 100644 --- a/verde/datasets/__init__.py +++ b/verde/datasets/__init__.py @@ -1,6 +1,7 @@ # pylint: disable=missing-docstring from .synthetic import CheckerBoard from .sample_data import ( + locate, fetch_baja_bathymetry, setup_baja_bathymetry_map, fetch_rio_magnetic, diff --git a/verde/datasets/sample_data.py b/verde/datasets/sample_data.py index 6f975b7b6..793fee371 100644 --- a/verde/datasets/sample_data.py +++ b/verde/datasets/sample_data.py @@ -4,6 +4,7 @@ import os import warnings +import pkg_resources import numpy as np import pandas as pd import pooch @@ -22,13 +23,34 @@ warnings.simplefilter("default") POOCH = pooch.create( - path=["~", ".verde", "data"], + path=pooch.os_cache("verde"), base_url="https://github.com/fatiando/verde/raw/{version}/data/", version=full_version, version_dev="master", env="VERDE_DATA_DIR", ) -POOCH.load_registry(os.path.join(os.path.dirname(__file__), "registry.txt")) +POOCH.load_registry(pkg_resources.resource_stream("verde.datasets", "registry.txt")) + + +def locate(): + r""" + The absolute path to the sample data storage location on disk. + + This is where the data are saved on your computer. The location is + dependent on the operating system. The folder locations are defined by the + ``appdirs`` package (see the `appdirs documentation + `__). + + The location can be overwritten by the ``VERDE_DATA_DIR`` environment + variable to the desired destination. + + Returns + ------- + path : str + The local data storage location. + + """ + return str(POOCH.abspath) def _setup_map( diff --git a/verde/tests/test_datasets.py b/verde/tests/test_datasets.py index eb5dfce45..9bdb4711f 100644 --- a/verde/tests/test_datasets.py +++ b/verde/tests/test_datasets.py @@ -1,12 +1,15 @@ """ Test data fetching routines. """ +import os + import matplotlib.pyplot as plt import cartopy.crs as ccrs import pytest from ..datasets.sample_data import ( + locate, fetch_baja_bathymetry, setup_baja_bathymetry_map, fetch_rio_magnetic, @@ -18,6 +21,15 @@ ) +def test_datasets_locate(): + "Make sure the data cache location has the right package name" + path = locate() + assert os.path.exists(path) + # This is the most we can check in a platform independent way without + # testing appdirs itself. + assert "verde" in path + + def test_fetch_texas_wind(): "Make sure the data are loaded properly" data = fetch_texas_wind() From b12720f7888033da13b0d798d323aa2dfd0381eb Mon Sep 17 00:00:00 2001 From: Leonardo Uieda Date: Thu, 16 Jan 2020 16:37:15 +0000 Subject: [PATCH 02/10] Require pooch >= 0.7.0 Using pkg_resources was not allowed before then. --- requirements.txt | 2 +- setup.py | 9 ++++++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index bc6ae2c61..8ce202a6a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,4 +3,4 @@ scipy pandas xarray scikit-learn -pooch +pooch>=0.7.0 diff --git a/setup.py b/setup.py index a896659d6..993ab32f7 100644 --- a/setup.py +++ b/setup.py @@ -43,7 +43,14 @@ "verde.datasets": ["registry.txt"], "verde.tests": ["data/*", "baseline/*"], } -INSTALL_REQUIRES = ["numpy", "scipy", "pandas", "xarray", "scikit-learn", "pooch"] +INSTALL_REQUIRES = [ + "numpy", + "scipy", + "pandas", + "xarray", + "scikit-learn", + "pooch>=0.7.0", +] PYTHON_REQUIRES = ">=3.6" if __name__ == "__main__": From 439e1803e923f630d9acdc11bab0151c301020db Mon Sep 17 00:00:00 2001 From: Leonardo Uieda Date: Thu, 16 Jan 2020 16:40:03 +0000 Subject: [PATCH 03/10] Configure CI to save to predictable location This way we can copy the data to avoid downloads --- .azure-pipelines.yml | 10 ++++++---- .travis.yml | 5 +++-- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/.azure-pipelines.yml b/.azure-pipelines.yml index 15bcf87f6..3cf1bb434 100644 --- a/.azure-pipelines.yml +++ b/.azure-pipelines.yml @@ -77,6 +77,7 @@ jobs: CONDA_REQUIREMENTS: requirements.txt CONDA_REQUIREMENTS_DEV: requirements-dev.txt CONDA_INSTALL_EXTRA: "codecov" + VERDE_DATA_DIR: "$HOME/.verde/data/master" strategy: matrix: @@ -127,8 +128,8 @@ jobs: # Copy the test data to the cache folder - bash: | set -x -e - mkdir -p $HOME/.verde/data/master - cp -r data/* $HOME/.verde/data/master + mkdir -p $VERDE_DATA_DIR + cp -r data/* $VERDE_DATA_DIR displayName: Copy test data to cache # Install the package @@ -178,6 +179,7 @@ jobs: CONDA_REQUIREMENTS: requirements.txt CONDA_REQUIREMENTS_DEV: requirements-dev.txt CONDA_INSTALL_EXTRA: "codecov" + VERDE_DATA_DIR: "~/.verde/data/master" strategy: matrix: @@ -222,8 +224,8 @@ jobs: # Copy the test data to the cache folder - bash: | set -x -e - mkdir -p ~/.verde/data/master - cp -r data/* ~/.verde/data/master + mkdir -p $VERDE_DATA_DIR + cp -r data/* $VERDE_DATA_DIR displayName: Copy test data to cache # Install the package that we want to test diff --git a/.travis.yml b/.travis.yml index deaca4997..0cd690d96 100644 --- a/.travis.yml +++ b/.travis.yml @@ -23,6 +23,7 @@ env: # PyPI password for deploying releases (TWINE_PASSWORD) - secure: "Gvd2kH5bGIng7Wz3R4Md5d48qU0vYo0Sb4g7A1UAn8EOuWAcbkdSAq5yDiAp4pENeGceHQG0+jX+GQBZoSOMUpwAfhPkWG5HBIc+P/G+iTUyF2oELLCekcGgccPzwNgQt574FzM0PkC9L4hINNRjVtnFa+SIx72D2r1OdTvmk2+c4jXBZl52e4l5dU+Hjzwh22KNzAMtXDVuvr3NVdJZHA/ldTwEBUQfiLo2CGkgls6o8ZLixK0tCRGIFKlZko9WeBTzQYidloSo3EQx0eqiTz7qydm3UfCezA9UYPefGOtUaA/4ysqs8tgG8xrnx8NhhRqH9pfPAhgsCMwfmtibslNwH+C7gtbERT8lLY5NfU1xyDC4UxkjbwbzKQno/vPhiqEJ/uR458IdZbzUeWXlt+Rz+Dyj1lW7FqPLOl3Zpfgfv1swWqxjVwduV46c3nlgu9fEkAiEH2SzAtBlsQ2qwbJCZKXj+8Ps9FmaqvQ+SCOTAycgR9WnYoIIutpn0cs3k8zqqQyBq2zXJLkPHflVich8wKKaOsaFMCIKLWaOODCw5fLkfxck/QtlolGGFi3lh5W5p4Zxxr7KdL8f+UrkAb6gY9LStvqwe2rSG2olqc95+zozsMY/YHXTIG092WB3EmptwO9jL67D3AIVBKOdvcRYFetWMyY61ZmEK0s/43I=" - TWINE_USERNAME=Leonardo.Uieda + - VERDE_DATA_DIR="$HOME/.verde/data/master" # The file with the listed requirements to be installed by conda - CONDA_REQUIREMENTS=requirements.txt - CONDA_REQUIREMENTS_DEV=requirements-dev.txt @@ -65,8 +66,8 @@ matrix: # Setup the build environment before_install: # Copy sample data to the verde data dir to avoid downloading all the time - - mkdir -p $HOME/.verde/data/master - - cp -r data/* $HOME/.verde/data/master + - mkdir -p $VERDE_DATA_DIR + - cp -r data/* $VERDE_DATA_DIR # Get the Fatiando CI scripts - git clone --branch=1.2.0 --depth=1 https://github.com/fatiando/continuous-integration.git # Download and install miniconda and setup dependencies From 40a63f18905ddcacdacda0414c848fb15f5170df Mon Sep 17 00:00:00 2001 From: Leonardo Uieda Date: Thu, 16 Jan 2020 16:53:25 +0000 Subject: [PATCH 04/10] Remove unused os import --- verde/datasets/sample_data.py | 1 - 1 file changed, 1 deletion(-) diff --git a/verde/datasets/sample_data.py b/verde/datasets/sample_data.py index 793fee371..621e071ad 100644 --- a/verde/datasets/sample_data.py +++ b/verde/datasets/sample_data.py @@ -1,7 +1,6 @@ """ Functions to load sample data """ -import os import warnings import pkg_resources From da1b8805a4e96e24d59f6d17be1d394acda6f4f8 Mon Sep 17 00:00:00 2001 From: Leonardo Uieda Date: Mon, 20 Jan 2020 13:32:18 +0000 Subject: [PATCH 05/10] Remove "master" from VERDE_DATA_DIR It's appended by Pooch --- .azure-pipelines.yml | 12 ++++++------ .travis.yml | 6 +++--- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/.azure-pipelines.yml b/.azure-pipelines.yml index 3cf1bb434..665e6347e 100644 --- a/.azure-pipelines.yml +++ b/.azure-pipelines.yml @@ -77,7 +77,7 @@ jobs: CONDA_REQUIREMENTS: requirements.txt CONDA_REQUIREMENTS_DEV: requirements-dev.txt CONDA_INSTALL_EXTRA: "codecov" - VERDE_DATA_DIR: "$HOME/.verde/data/master" + VERDE_DATA_DIR: "$HOME/.verde/data" strategy: matrix: @@ -128,8 +128,8 @@ jobs: # Copy the test data to the cache folder - bash: | set -x -e - mkdir -p $VERDE_DATA_DIR - cp -r data/* $VERDE_DATA_DIR + mkdir -p ${VERDE_DATA_DIR}/master + cp -r data/* ${VERDE_DATA_DIR}/master displayName: Copy test data to cache # Install the package @@ -179,7 +179,7 @@ jobs: CONDA_REQUIREMENTS: requirements.txt CONDA_REQUIREMENTS_DEV: requirements-dev.txt CONDA_INSTALL_EXTRA: "codecov" - VERDE_DATA_DIR: "~/.verde/data/master" + VERDE_DATA_DIR: "~/.verde/data" strategy: matrix: @@ -224,8 +224,8 @@ jobs: # Copy the test data to the cache folder - bash: | set -x -e - mkdir -p $VERDE_DATA_DIR - cp -r data/* $VERDE_DATA_DIR + mkdir -p ${VERDE_DATA_DIR}/master + cp -r data/* ${VERDE_DATA_DIR}/master displayName: Copy test data to cache # Install the package that we want to test diff --git a/.travis.yml b/.travis.yml index 0cd690d96..9f3ebea86 100644 --- a/.travis.yml +++ b/.travis.yml @@ -23,7 +23,7 @@ env: # PyPI password for deploying releases (TWINE_PASSWORD) - secure: "Gvd2kH5bGIng7Wz3R4Md5d48qU0vYo0Sb4g7A1UAn8EOuWAcbkdSAq5yDiAp4pENeGceHQG0+jX+GQBZoSOMUpwAfhPkWG5HBIc+P/G+iTUyF2oELLCekcGgccPzwNgQt574FzM0PkC9L4hINNRjVtnFa+SIx72D2r1OdTvmk2+c4jXBZl52e4l5dU+Hjzwh22KNzAMtXDVuvr3NVdJZHA/ldTwEBUQfiLo2CGkgls6o8ZLixK0tCRGIFKlZko9WeBTzQYidloSo3EQx0eqiTz7qydm3UfCezA9UYPefGOtUaA/4ysqs8tgG8xrnx8NhhRqH9pfPAhgsCMwfmtibslNwH+C7gtbERT8lLY5NfU1xyDC4UxkjbwbzKQno/vPhiqEJ/uR458IdZbzUeWXlt+Rz+Dyj1lW7FqPLOl3Zpfgfv1swWqxjVwduV46c3nlgu9fEkAiEH2SzAtBlsQ2qwbJCZKXj+8Ps9FmaqvQ+SCOTAycgR9WnYoIIutpn0cs3k8zqqQyBq2zXJLkPHflVich8wKKaOsaFMCIKLWaOODCw5fLkfxck/QtlolGGFi3lh5W5p4Zxxr7KdL8f+UrkAb6gY9LStvqwe2rSG2olqc95+zozsMY/YHXTIG092WB3EmptwO9jL67D3AIVBKOdvcRYFetWMyY61ZmEK0s/43I=" - TWINE_USERNAME=Leonardo.Uieda - - VERDE_DATA_DIR="$HOME/.verde/data/master" + - VERDE_DATA_DIR="$HOME/.verde/data" # The file with the listed requirements to be installed by conda - CONDA_REQUIREMENTS=requirements.txt - CONDA_REQUIREMENTS_DEV=requirements-dev.txt @@ -66,8 +66,8 @@ matrix: # Setup the build environment before_install: # Copy sample data to the verde data dir to avoid downloading all the time - - mkdir -p $VERDE_DATA_DIR - - cp -r data/* $VERDE_DATA_DIR + - mkdir -p ${VERDE_DATA_DIR}/master + - cp -r data/* ${VERDE_DATA_DIR}/master # Get the Fatiando CI scripts - git clone --branch=1.2.0 --depth=1 https://github.com/fatiando/continuous-integration.git # Download and install miniconda and setup dependencies From f5229bde63ddaf9c945a5134cc128f6ef323f1c9 Mon Sep 17 00:00:00 2001 From: Leonardo Uieda Date: Mon, 20 Jan 2020 14:28:58 +0000 Subject: [PATCH 06/10] Close the registry resource --- verde/datasets/sample_data.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/verde/datasets/sample_data.py b/verde/datasets/sample_data.py index 621e071ad..76f7196ec 100644 --- a/verde/datasets/sample_data.py +++ b/verde/datasets/sample_data.py @@ -21,14 +21,15 @@ # Otherwise, DeprecationWarning won't be shown, kind of defeating the purpose. warnings.simplefilter("default") -POOCH = pooch.create( +REGISTRY = pooch.create( path=pooch.os_cache("verde"), base_url="https://github.com/fatiando/verde/raw/{version}/data/", version=full_version, version_dev="master", env="VERDE_DATA_DIR", ) -POOCH.load_registry(pkg_resources.resource_stream("verde.datasets", "registry.txt")) +with pkg_resources.resource_stream("verde.datasets", "registry.txt") as registry_file: + REGISTRY.load_registry(registry_file) def locate(): @@ -49,7 +50,7 @@ def locate(): The local data storage location. """ - return str(POOCH.abspath) + return str(REGISTRY.abspath) def _setup_map( @@ -96,7 +97,7 @@ def fetch_baja_bathymetry(): setup_baja_bathymetry_map: Utility function to help setup a Cartopy map. """ - data_file = POOCH.fetch("baja-bathymetry.csv.xz") + data_file = REGISTRY.fetch("baja-bathymetry.csv.xz") data = pd.read_csv(data_file, compression="xz") return data @@ -182,7 +183,7 @@ def fetch_rio_magnetic(): "in Verde v2.0.0. Use a different dataset instead.", DeprecationWarning, ) - data_file = POOCH.fetch("rio-magnetic.csv.xz") + data_file = REGISTRY.fetch("rio-magnetic.csv.xz") data = pd.read_csv(data_file, compression="xz") return data @@ -261,7 +262,7 @@ def fetch_california_gps(): setup_california_gps_map: Utility function to help setup a Cartopy map. """ - data_file = POOCH.fetch("california-gps.csv.xz") + data_file = REGISTRY.fetch("california-gps.csv.xz") data = pd.read_csv(data_file, compression="xz") return data @@ -322,7 +323,7 @@ def fetch_texas_wind(): setup_texas_wind_map: Utility function to help setup a Cartopy map. """ - data_file = POOCH.fetch("texas-wind.csv") + data_file = REGISTRY.fetch("texas-wind.csv") data = pd.read_csv(data_file) return data From 32c6a1aebd28e10828772829e0da2947e39cf649 Mon Sep 17 00:00:00 2001 From: Leonardo Uieda Date: Mon, 20 Jan 2020 14:35:40 +0000 Subject: [PATCH 07/10] Don't use HOME for data dir on Azure --- .azure-pipelines.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.azure-pipelines.yml b/.azure-pipelines.yml index 665e6347e..550e814ba 100644 --- a/.azure-pipelines.yml +++ b/.azure-pipelines.yml @@ -77,7 +77,7 @@ jobs: CONDA_REQUIREMENTS: requirements.txt CONDA_REQUIREMENTS_DEV: requirements-dev.txt CONDA_INSTALL_EXTRA: "codecov" - VERDE_DATA_DIR: "$HOME/.verde/data" + VERDE_DATA_DIR: "data_dir" strategy: matrix: @@ -179,7 +179,7 @@ jobs: CONDA_REQUIREMENTS: requirements.txt CONDA_REQUIREMENTS_DEV: requirements-dev.txt CONDA_INSTALL_EXTRA: "codecov" - VERDE_DATA_DIR: "~/.verde/data" + VERDE_DATA_DIR: "data_dir" strategy: matrix: From 692c268ca3ad36067e3c28487be70e90f3eb9bb2 Mon Sep 17 00:00:00 2001 From: Leonardo Uieda Date: Mon, 20 Jan 2020 14:40:52 +0000 Subject: [PATCH 08/10] Have to use Verde in the name for testing --- .azure-pipelines.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.azure-pipelines.yml b/.azure-pipelines.yml index 550e814ba..58536a104 100644 --- a/.azure-pipelines.yml +++ b/.azure-pipelines.yml @@ -77,7 +77,7 @@ jobs: CONDA_REQUIREMENTS: requirements.txt CONDA_REQUIREMENTS_DEV: requirements-dev.txt CONDA_INSTALL_EXTRA: "codecov" - VERDE_DATA_DIR: "data_dir" + VERDE_DATA_DIR: "verde_data_cache" strategy: matrix: @@ -179,7 +179,7 @@ jobs: CONDA_REQUIREMENTS: requirements.txt CONDA_REQUIREMENTS_DEV: requirements-dev.txt CONDA_INSTALL_EXTRA: "codecov" - VERDE_DATA_DIR: "data_dir" + VERDE_DATA_DIR: "verde_data_cache" strategy: matrix: From 6c162f687905ffd0b69ea93bb8912851ceb55c92 Mon Sep 17 00:00:00 2001 From: Leonardo Uieda Date: Mon, 20 Jan 2020 15:05:21 +0000 Subject: [PATCH 09/10] Check which files are in the data folder --- .azure-pipelines.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.azure-pipelines.yml b/.azure-pipelines.yml index 58536a104..9d176887a 100644 --- a/.azure-pipelines.yml +++ b/.azure-pipelines.yml @@ -144,6 +144,7 @@ jobs: - bash: | set -x -e source activate testing + ls ${VERDE_DATA_DIR}/master make test displayName: Test @@ -240,6 +241,7 @@ jobs: - bash: | set -x -e source activate testing + ls ${VERDE_DATA_DIR}/master make test displayName: Test From ccde8e9d496905df9c9d21b4d9b209f6d76e83a9 Mon Sep 17 00:00:00 2001 From: Leonardo Uieda Date: Mon, 20 Jan 2020 15:19:54 +0000 Subject: [PATCH 10/10] Use an absolute directory on Azure --- .azure-pipelines.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.azure-pipelines.yml b/.azure-pipelines.yml index 9d176887a..84b502ff7 100644 --- a/.azure-pipelines.yml +++ b/.azure-pipelines.yml @@ -77,7 +77,7 @@ jobs: CONDA_REQUIREMENTS: requirements.txt CONDA_REQUIREMENTS_DEV: requirements-dev.txt CONDA_INSTALL_EXTRA: "codecov" - VERDE_DATA_DIR: "verde_data_cache" + VERDE_DATA_DIR: "$(Agent.TempDirectory)/.verde/data" strategy: matrix: @@ -180,7 +180,7 @@ jobs: CONDA_REQUIREMENTS: requirements.txt CONDA_REQUIREMENTS_DEV: requirements-dev.txt CONDA_INSTALL_EXTRA: "codecov" - VERDE_DATA_DIR: "verde_data_cache" + VERDE_DATA_DIR: "$(Agent.TempDirectory)/.verde/data" strategy: matrix: