Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Optional pooch dependency #4666

Merged
merged 16 commits into from
May 11, 2020
Merged
Show file tree
Hide file tree
Changes from 15 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion requirements/default.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,3 @@ pillow>=4.3.0,!=7.1.0,!=7.1.1
imageio>=2.3.0
tifffile>=2019.7.26
PyWavelets>=1.1.1
pooch>=0.5.2
1 change: 1 addition & 0 deletions requirements/docs.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,4 @@ dask[array]>=0.15.0
cloudpickle>=0.2.1
pandas>=0.23.0
seaborn>=0.7.1
pooch>=0.5.2
1 change: 1 addition & 0 deletions requirements/optional.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,4 @@ pyamg
dask[array]>=0.15.0
# cloudpickle is necessary to provide the 'processes' scheduler for dask
cloudpickle>=0.2.1
pooch>=0.5.2
2 changes: 1 addition & 1 deletion skimage/_shared/testing.py
Original file line number Diff line number Diff line change
Expand Up @@ -214,7 +214,7 @@ def fetch(data_filename):
"""Attempt to fetch data, but if unavailable, skip the tests."""
try:
return data._fetch(data_filename)
except ConnectionError:
except (ConnectionError, ModuleNotFoundError):
pytest.skip(f'Unable to download {data_filename}')


Expand Down
199 changes: 148 additions & 51 deletions skimage/data/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,11 @@
- http://sipi.usc.edu/database/database.php

"""

import warnings
import sys
from warnings import warn
import numpy as np
import shutil

from distutils.version import LooseVersion as Version

from ..util.dtype import img_as_bool
from ._binary_blobs import binary_blobs
from ._registry import registry, legacy_registry, registry_urls
Expand Down Expand Up @@ -54,42 +51,51 @@
'shepp_logan_phantom',
'stereo_motorcycle']

skimage_distribution_dir = osp.join(osp.abspath(osp.dirname(__file__)), '..')


import pooch
from pooch.utils import file_hash


# Pooch expects a `+` to exist in development versions.
# Since scikit-image doesn't follow that convention, we have to manually
# remove `.dev` with a `+` if it exists.
# This helps pooch understand that it should look in master
# to find the required files
pooch_version = __version__.replace('.dev', '+')
url = "https://github.com/scikit-image/scikit-image/raw/{version}/skimage/"

# Create a new friend to manage your sample data storage
image_fetcher = pooch.create(
# Pooch uses appdirs to select an appropriate directory for the cache
# on each platform.
# https://github.com/ActiveState/appdirs
# On linux this converges to
# '$HOME/.cache/scikit-image'
# With a version qualifier
path=pooch.os_cache("scikit-image"),
base_url=url,
version=pooch_version,
env="SKIMAGE_DATADIR",
registry=registry,
urls=registry_urls,
)

data_dir = osp.join(str(image_fetcher.abspath), 'data')

os.makedirs(data_dir, exist_ok=True)
shutil.copy2(osp.join(skimage_distribution_dir, 'data', 'README.txt'),
osp.join(data_dir, 'README.txt'))
legacy_data_dir = osp.abspath(osp.dirname(__file__))
skimage_distribution_dir = osp.join(legacy_data_dir, '..')

try:
from pooch.utils import file_hash
except ModuleNotFoundError:
# Function taken from
# https://github.com/fatiando/pooch/blob/master/pooch/utils.py
def file_hash(fname, alg="sha256"):
"""
Calculate the hash of a given file.
Useful for checking if a file has changed or been corrupted.
Parameters
----------
fname : str
The name of the file.
alg : str
The type of the hashing algorithm
Returns
-------
hash : str
The hash of the file.
Examples
--------
>>> fname = "test-file-for-hash.txt"
>>> with open(fname, "w") as f:
... __ = f.write("content of the file")
>>> print(file_hash(fname))
0fc74468e6a9a829f103d069aeb2bb4f8646bad58bf146bb0e3379b759ec4a00
>>> import os
>>> os.remove(fname)
"""
import hashlib
if alg not in hashlib.algorithms_available:
raise ValueError(
"Algorithm '{}' not available in hashlib".format(alg))
# Calculate the hash in chunks to avoid overloading the memory
chunksize = 65536
hasher = hashlib.new(alg)
with open(fname, "rb") as fin:
buff = fin.read(chunksize)
while buff:
hasher.update(buff)
buff = fin.read(chunksize)
return hasher.hexdigest()


def _has_hash(path, expected_hash):
Expand All @@ -99,6 +105,51 @@ def _has_hash(path, expected_hash):
return file_hash(path) == expected_hash


def create_image_fetcher():
try:
import pooch
except ImportError:
# Without pooch, fallback on the standard data directory
# which for now, includes a few limited data samples
return None, legacy_data_dir

# Pooch expects a `+` to exist in development versions.
# Since scikit-image doesn't follow that convention, we have to manually
# remove `.dev` with a `+` if it exists.
# This helps pooch understand that it should look in master
# to find the required files
pooch_version = __version__.replace('.dev', '+')
url = "https://github.com/scikit-image/scikit-image/raw/{version}/skimage/"

# Create a new friend to manage your sample data storage
image_fetcher = pooch.create(
# Pooch uses appdirs to select an appropriate directory for the cache
# on each platform.
# https://github.com/ActiveState/appdirs
# On linux this converges to
# '$HOME/.cache/scikit-image'
# With a version qualifier
path=pooch.os_cache("scikit-image"),
base_url=url,
version=pooch_version,
env="SKIMAGE_DATADIR",
registry=registry,
urls=registry_urls,
)

data_dir = osp.join(str(image_fetcher.abspath), 'data')


return image_fetcher, data_dir


image_fetcher, data_dir = create_image_fetcher()

if image_fetcher is None:
has_pooch = False
else:
has_pooch = True

def _fetch(data_filename):
"""Fetch a given data file from either the local cache or the repository.

Expand All @@ -117,24 +168,27 @@ def _fetch(data_filename):

Raises
------
ValueError:
KeyError:
If the filename is not known to the scikit-image distribution.

ModuleNotFoundError:
If the filename is known to the scikit-image distribution but pooch
is not installed.

ConnectionError:
If scikit-image is unable to connect to the internet but the dataset
has not been downloaded yet.
If scikit-image is unable to connect to the internet but the
dataset has not been downloaded yet.
"""
resolved_path = osp.join(data_dir, '..', data_filename)
expected_hash = registry[data_filename]

# Case 1:
# The file may already be in the data_dir. We may have decided to ship it
# in the scikit-image distribution.
# The file may already be in the data_dir.
# We may have decided to ship it in the scikit-image distribution.
if _has_hash(resolved_path, expected_hash):
# Nothing to be done, file is where it is expected to be
return resolved_path


# Case 2:
# The user is using a cloned version of the github repo, which
# contains both the publicly shipped data, and test data.
Expand All @@ -148,8 +202,20 @@ def _fetch(data_filename):
return resolved_path

# Case 3:
# Pooch needs to download the data. Let the image fetcher to search for our
# data. A ConnectionError is raised if no internet connection is available.
# Pooch not found.
if image_fetcher is None:
raise ModuleNotFoundError(
"The requested file is part of the scikit-image distribution, "
"but requires the installation of an optional dependency, pooch. "
"To install pooch, use your preferred python package manager. "
"Follow installation instruction found at "
"https://scikit-image.org/docs/stable/install.html"
)

# Case 4:
# Pooch needs to download the data. Let the image fetcher to search for
# our data. A ConnectionError is raised if no internet connection is
# available.
try:
resolved_path = image_fetcher.fetch(data_filename)
except ConnectionError as err:
Expand All @@ -164,9 +230,23 @@ def _fetch(data_filename):
return resolved_path


# Fetch all legacy data so that it is available by default
for filename in legacy_registry:
_fetch(filename)
def _init_pooch():
os.makedirs(data_dir, exist_ok=True)
shutil.copy2(osp.join(skimage_distribution_dir, 'data', 'README.txt'),
osp.join(data_dir, 'README.txt'))

data_base_dir = osp.join(data_dir, '..')
# Fetch all legacy data so that it is available by default
for filename in legacy_registry:
_fetch(filename)


# This function creates directories, and has been the source of issues for
# downstream users, see
# https://github.com/scikit-image/scikit-image/issues/4660
# https://github.com/scikit-image/scikit-image/issues/4664
if has_pooch:
_init_pooch()


def download_all(directory=None):
Expand All @@ -176,6 +256,11 @@ def download_all(directory=None):
This allows us to use higher quality datasets, while keeping the
library download size small.

This function requires the installation of an optional dependency, pooch,
to download the full dataset. Follow installation instruction found at

https://scikit-image.org/docs/stable/install.html

Call this function to download all sample images making them available
offline on your machine.

Expand All @@ -184,6 +269,11 @@ def download_all(directory=None):
directory: path-like, optional
The directory where the dataset should be stored.

Raises
------
ModuleNotFoundError:
If pooch is not install, this error will be raised.

Notes
-----
scikit-image will only search for images stored in the default directory.
Expand All @@ -192,6 +282,13 @@ def download_all(directory=None):
data directory by inspecting the variable `skimage.data.data_dir`.
"""

if image_fetcher is None:
raise ModuleNotFoundError(
"To download all package, scikit-image needs an optional "
hmaarrfk marked this conversation as resolved.
Show resolved Hide resolved
"dependency, pooch."
"To install pooch, follow our installation instructions found at "
"https://scikit-image.org/docs/stable/install.html"
)
# Consider moving this kind of logic to Pooch
old_dir = image_fetcher.path
try:
Expand Down
1 change: 1 addition & 0 deletions skimage/data/_registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
'coffee.png',
'coins.png',
'color.png',
'cell.png',
'grass.png',
'gravel.png',
'horse.png',
Expand Down
54 changes: 32 additions & 22 deletions skimage/data/tests/test_data.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,39 @@
import os
from urllib import request

import numpy as np
import skimage.data as data
from skimage.data import image_fetcher
from skimage import io
from skimage._shared.testing import (
assert_equal, assert_almost_equal, fetch, skipif
)
from skimage._shared.testing import assert_equal, assert_almost_equal, fetch
import os
import pytest


# check whether github is reachable.
OFFLINE = request.urlopen('https://github.com').getcode() != 200
def test_data_dir():
# data_dir should be a directory people can use as a standard directory
# https://github.com/scikit-image/scikit-image/pull/3945#issuecomment-498141893
data_dir = data.data_dir
assert 'astronaut.png' in os.listdir(data_dir)


def test_download_all_with_pooch():
# jni first wrote this test with the intention of
# fully deleting the files in the data_dir,
# then ensure that the data gets downloaded accordingly.
# hmaarrfk raised the concern that this test wouldn't
# play well with parallel testing since we
# may be breaking the global state that certain other
# tests require, especially in parallel testing

# The second concern is that this test essentially uses
# alot of bandwidth, which is not fun for developers on
# lower speed connections.
# https://github.com/scikit-image/scikit-image/pull/4666/files/26d5138b25b958da6e97ebf979e9bc36f32c3568#r422604863
data_dir = data.data_dir
if image_fetcher is not None:
data.download_all()
assert len(os.listdir(data_dir)) > 50
else:
with pytest.raises(ModuleNotFoundError):
data.download_all()


def test_astronaut():
Expand Down Expand Up @@ -116,24 +139,11 @@ def test_cell():
data.cell()


@skipif(OFFLINE, reason='No internet connection')
def test_cells_3d():
"""Needs internet connection."""
path = fetch('data/cells.tif')
image = io.imread(path)
assert image.shape == (60, 256, 256)


def test_data_dir():
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why remove this test? I feel like it's important.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks, I added it back.

astro = data.astronaut()
datadir = data.data_dir
assert 'astronaut.png' in os.listdir(datadir)


@skipif(OFFLINE, reason='No internet connection')
def test_download_all():
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there a good reason to remove this test?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This test really doen't play with parallel testing.... affecting the global filesystem state is not fun.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

it also may induce a full download of all the files for the end users. not fun on our laptops when travelling with 4G connections.

datadir = data.data_dir
for filename in os.listdir(datadir):
os.remove(os.path.join(datadir, filename))
data.download_all()
assert len(os.listdir(datadir)) > 50