From 0a2b3cda3be00e296dc8b526fb9a4151e7463d11 Mon Sep 17 00:00:00 2001 From: andream Date: Fri, 14 Jun 2024 14:41:45 +0200 Subject: [PATCH 01/11] add support for fill_value and set_empty_bucket_to in bucket resampler get_sum --- pyresample/bucket/__init__.py | 47 ++++++++++++++++++++++++----------- 1 file changed, 32 insertions(+), 15 deletions(-) diff --git a/pyresample/bucket/__init__.py b/pyresample/bucket/__init__.py index 9851da571..317c2b64c 100644 --- a/pyresample/bucket/__init__.py +++ b/pyresample/bucket/__init__.py @@ -202,20 +202,30 @@ def _get_indices(self): target_shape = self.target_area.shape self.idxs = self.y_idxs * target_shape[1] + self.x_idxs - def get_sum(self, data, skipna=True): + def get_sum(self, data, fill_value=np.nan, skipna=True, + set_empty_bucket_to=0): """Calculate sums for each bin with drop-in-a-bucket resampling. Parameters ---------- data : Numpy or Dask array Data to be binned and summed. + fill_value : float + Fill value to mark missing/invalid values in the input data. + Default: np.nan skipna : boolean (optional) - If True, skips NaN values for the sum calculation - (similarly to Numpy's `nansum`). Buckets containing only NaN are set to zero. - If False, sets the bucket to NaN if one or more NaN values are present in the bucket - (similarly to Numpy's `sum`). - In both cases, empty buckets are set to 0. - Default: True + If True, skips missing values (as marked by NaN or `fill_value`) for the sum calculation + (similarly to Numpy's `nansum`). Buckets containing only missing values are set to `set_empty_bucket_to`. + If False, sets the bucket to fill_value if one or more missing values are present in the bucket + (similarly to Numpy's `sum`). + In both cases, empty buckets are set to `set_empty_bucket_to`. + Default: True + set_empty_bucket_to : float + Set empty buckets to the given value. Empty buckets are considered as the buckets with value 0. 
+ Note that a bucket could become 0 as the result of a sum + of positive and negative values. If the user needs to identify these zero-buckets reliably, + `get_count()` can be used for this purpose. + Default: np.nan Returns ------- @@ -228,8 +238,11 @@ def get_sum(self, data, skipna=True): data = data.data data = data.ravel() - # Remove NaN values from the data when used as weights - weights = da.where(np.isnan(data), 0, data) + # Remove fill_values values from the data when used as weights + if np.isnan(fill_value): + weights = da.where(np.isnan(data), 0, data) + else: + weights = da.where(data == fill_value, 0, data) # Rechunk indices to match the data chunking if weights.chunks != self.idxs.chunks: @@ -241,16 +254,20 @@ def get_sum(self, data, skipna=True): weights=weights, density=False) # TODO remove following line in favour of weights = data when dask histogram bug (issue #6935) is fixed - sums = self._mask_bins_with_nan_if_not_skipna(skipna, data, out_size, sums) + sums = self._mask_bins_with_nan_if_not_skipna(skipna, data, out_size, sums, fill_value) + sums = da.where(sums == 0, set_empty_bucket_to, sums) return sums.reshape(self.target_area.shape) - def _mask_bins_with_nan_if_not_skipna(self, skipna, data, out_size, statistic): + def _mask_bins_with_nan_if_not_skipna(self, skipna, data, out_size, statistic, fill_value): if not skipna: - nans = np.isnan(data) - nan_bins, _ = da.histogram(self.idxs[nans], bins=out_size, - range=(0, out_size)) - statistic = da.where(nan_bins > 0, np.nan, statistic) + if np.isnan(fill_value): + missing_val = np.isnan(data) + else: + missing_val = data == fill_value + missing_val_bins, _ = da.histogram(self.idxs[missing_val], bins=out_size, + range=(0, out_size)) + statistic = da.where(missing_val_bins > 0, fill_value, statistic) return statistic def _call_bin_statistic(self, statistic_method, data, fill_value=None, skipna=None): From e9e214dc01739a1b589951740293c2380bbdfe5a Mon Sep 17 00:00:00 2001 From: andream Date: Fri, 14 
Jun 2024 14:42:20 +0200 Subject: [PATCH 02/11] add new tests for fill_value and set_empty_bucket_to --- pyresample/test/test_bucket.py | 51 ++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/pyresample/test/test_bucket.py b/pyresample/test/test_bucket.py index b58f44727..c08af5108 100644 --- a/pyresample/test/test_bucket.py +++ b/pyresample/test/test_bucket.py @@ -174,10 +174,61 @@ def test_get_sum_nan_data_skipna_true(self): result = self._get_sum_result(data, skipna=True) # 2 + nan is 2 self.assertEqual(np.count_nonzero(result == 2.), 1) + # 5 is untouched in a single bin + self.assertEqual(np.count_nonzero(result == 5.), 1) # all-nan and rest is 0 self.assertEqual(np.count_nonzero(np.isnan(result)), 0) self.assertEqual(np.nanmin(result), 0) + def test_get_sum_non_default_fill_value_skipna_false(self): + """Test drop-in-a-bucket sum for data input with non-default fill_value and skipna=False.""" + data = da.from_array(np.array([[2., 255], [5., 255]]), + chunks=self.chunks) + + result = self._get_sum_result(data, skipna=False, fill_value=255) + # 2 + fill_value is fill_value, all-fill_value is fill_value + self.assertEqual(np.count_nonzero(result == 255), 2) + # 5 is untouched in a single bin + self.assertEqual(np.count_nonzero(result == 5.), 1) + # rest is 0 + self.assertEqual(np.nanmin(result), 0) + + def test_get_sum_non_default_fill_value_skipna_true(self): + """Test drop-in-a-bucket sum for data input with non-default fill_value and skipna=True.""" + data = da.from_array(np.array([[2., 255], [5., 255]]), + chunks=self.chunks) + + result = self._get_sum_result(data, skipna=True, fill_value=255) + # 2 + fill_value is 2 + self.assertEqual(np.count_nonzero(result == 2.), 1) + # all-missing and rest is 0 + self.assertEqual(np.count_nonzero(result == 255), 0) + self.assertEqual(np.nanmin(result), 0) + + def test_nonzero_set_empty_bucket_to_number(self): + """Test drop-in-a-bucket sum for non-zero set_empty_bucket_to np.nan.""" + data =
da.from_array(np.array([[2., np.nan], [5., np.nan]]), + chunks=self.chunks) + + result = self._get_sum_result(data, skipna=True, set_empty_bucket_to=4095) + # 5 is untouched in a single bin + self.assertEqual(np.count_nonzero(result == 5.), 1) + # all-nan and rest is 4095 + self.assertEqual(np.count_nonzero(result == 4095), 2048*2560-2) + self.assertEqual(np.nanmin(result), 2) + + def test_nonzero_set_empty_bucket_to_npnan(self): + """Test drop-in-a-bucket sum for non-zero set_empty_bucket_to np.nan.""" + data = da.from_array(np.array([[2., np.nan], [5., np.nan]]), + chunks=self.chunks) + + result = self._get_sum_result(data, skipna=True, set_empty_bucket_to=np.nan) + # 5 is untouched in a single bin + self.assertEqual(np.count_nonzero(result == 5.), 1) + # all-nan and rest is np.nan + self.assertEqual(np.count_nonzero(np.isnan(result)), 2048*2560-2) + self.assertEqual(np.nanmin(result), 2) + def test_get_count(self): """Test drop-in-a-bucket sum.""" with dask.config.set(scheduler=CustomScheduler(max_computes=0)): From 512a265247ec3283a50222aa7d186876a6a0c806 Mon Sep 17 00:00:00 2001 From: andream Date: Fri, 14 Jun 2024 15:17:45 +0200 Subject: [PATCH 03/11] add a few more spaces --- pyresample/test/test_bucket.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyresample/test/test_bucket.py b/pyresample/test/test_bucket.py index c08af5108..d7797a190 100644 --- a/pyresample/test/test_bucket.py +++ b/pyresample/test/test_bucket.py @@ -214,7 +214,7 @@ def test_nonzero_set_empty_bucket_to_number(self): # 5 is untouched in a single bin self.assertEqual(np.count_nonzero(result == 5.), 1) # all-nan and rest is 4095 - self.assertEqual(np.count_nonzero(result == 4095), 2048*2560-2) + self.assertEqual(np.count_nonzero(result == 4095), 2048 * 2560 - 2) self.assertEqual(np.nanmin(result), 2) def test_nonzero_set_empty_bucket_to_npnan(self): @@ -226,7 +226,7 @@ def test_nonzero_set_empty_bucket_to_npnan(self): # 5 is untouched in a single bin 
self.assertEqual(np.count_nonzero(result == 5.), 1) # all-nan and rest is np.nan - self.assertEqual(np.count_nonzero(np.isnan(result)), 2048*2560-2) + self.assertEqual(np.count_nonzero(np.isnan(result)), 2048 * 2560 - 2) self.assertEqual(np.nanmin(result), 2) def test_get_count(self): From 6fc18c64e3f51be41617a6fb4196a920be2a7cd6 Mon Sep 17 00:00:00 2001 From: andream Date: Tue, 18 Jun 2024 19:24:45 +0200 Subject: [PATCH 04/11] change arg name to empty_bucket_value and add replacement check --- pyresample/bucket/__init__.py | 13 +++++++------ pyresample/test/test_bucket.py | 12 ++++++------ 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/pyresample/bucket/__init__.py b/pyresample/bucket/__init__.py index 317c2b64c..cb99d94cc 100644 --- a/pyresample/bucket/__init__.py +++ b/pyresample/bucket/__init__.py @@ -202,8 +202,7 @@ def _get_indices(self): target_shape = self.target_area.shape self.idxs = self.y_idxs * target_shape[1] + self.x_idxs - def get_sum(self, data, fill_value=np.nan, skipna=True, - set_empty_bucket_to=0): + def get_sum(self, data, fill_value=np.nan, skipna=True, empty_bucket_value=0): """Calculate sums for each bin with drop-in-a-bucket resampling. Parameters @@ -215,12 +214,12 @@ def get_sum(self, data, fill_value=np.nan, skipna=True, Default: np.nan skipna : boolean (optional) If True, skips missing values (as marked by NaN or `fill_value`) for the sum calculation - (similarly to Numpy's `nansum`). Buckets containing only missing values are set to `set_empty_bucket_to`. + (similarly to Numpy's `nansum`). Buckets containing only missing values are set to `empty_bucket_value`. If False, sets the bucket to fill_value if one or more missing values are present in the bucket (similarly to Numpy's `sum`). - In both cases, empty buckets are set to `set_empty_bucket_to`. + In both cases, empty buckets are set to `empty_bucket_value`. 
Default: True - set_empty_bucket_to : float + empty_bucket_value : float Set empty buckets to the given value. Empty buckets are considered as the buckets with value 0. Note that a bucket could become 0 as the result of a sum of positive and negative values. If the user needs to identify these zero-buckets reliably, @@ -255,7 +254,9 @@ def get_sum(self, data, fill_value=np.nan, skipna=True, # TODO remove following line in favour of weights = data when dask histogram bug (issue #6935) is fixed sums = self._mask_bins_with_nan_if_not_skipna(skipna, data, out_size, sums, fill_value) - sums = da.where(sums == 0, set_empty_bucket_to, sums) + + if empty_bucket_value != 0: + sums = da.where(sums == 0, empty_bucket_value, sums) return sums.reshape(self.target_area.shape) diff --git a/pyresample/test/test_bucket.py b/pyresample/test/test_bucket.py index d7797a190..a36c5bd98 100644 --- a/pyresample/test/test_bucket.py +++ b/pyresample/test/test_bucket.py @@ -205,24 +205,24 @@ def test_get_sum_non_default_fill_value_skipna_true(self): self.assertEqual(np.count_nonzero(result == 255), 0) self.assertEqual(np.nanmin(result), 0) - def test_nonzero_set_empty_bucket_to_number(self): - """Test drop-in-a-bucket sum for non-zero set_empty_bucket_to np.nan.""" + def test_nonzero_empty_bucket_value_number(self): + """Test drop-in-a-bucket sum for non-zero empty_bucket_value set as number.""" data = da.from_array(np.array([[2., np.nan], [5., np.nan]]), chunks=self.chunks) - result = self._get_sum_result(data, skipna=True, set_empty_bucket_to=4095) + result = self._get_sum_result(data, skipna=True, empty_bucket_value=4095) # 5 is untouched in a single bin self.assertEqual(np.count_nonzero(result == 5.), 1) # all-nan and rest is 4095 self.assertEqual(np.count_nonzero(result == 4095), 2048 * 2560 - 2) self.assertEqual(np.nanmin(result), 2) - def test_nonzero_set_empty_bucket_to_npnan(self): - """Test drop-in-a-bucket sum for non-zero set_empty_bucket_to np.nan.""" + def 
test_nonzero_empty_bucket_valueo_npnan(self): + """Test drop-in-a-bucket sum for non-zero empty_bucket_value set as np.nan.""" data = da.from_array(np.array([[2., np.nan], [5., np.nan]]), chunks=self.chunks) - result = self._get_sum_result(data, skipna=True, set_empty_bucket_to=np.nan) + result = self._get_sum_result(data, skipna=True, empty_bucket_value=np.nan) # 5 is untouched in a single bin self.assertEqual(np.count_nonzero(result == 5.), 1) # all-nan and rest is np.nan From 7cbdc8c5df733397b03010b03795499c66c6db33 Mon Sep 17 00:00:00 2001 From: andream Date: Tue, 2 Jul 2024 14:35:57 +0200 Subject: [PATCH 05/11] fix default value in documentation --- pyresample/bucket/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyresample/bucket/__init__.py b/pyresample/bucket/__init__.py index cb99d94cc..2cf11588d 100644 --- a/pyresample/bucket/__init__.py +++ b/pyresample/bucket/__init__.py @@ -224,7 +224,7 @@ def get_sum(self, data, fill_value=np.nan, skipna=True, empty_bucket_value=0): Note that a bucket could become 0 as the result of a sum of positive and negative values. If the user needs to identify these zero-buckets reliably, `get_count()` can be used for this purpose. 
- Default: np.nan + Default: 0 Returns ------- From fffc7f771b5c403c196b4a7f84255fcff9271826 Mon Sep 17 00:00:00 2001 From: andream Date: Tue, 2 Jul 2024 14:46:22 +0200 Subject: [PATCH 06/11] implement get_invalid_mask --- pyresample/bucket/__init__.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/pyresample/bucket/__init__.py b/pyresample/bucket/__init__.py index 2cf11588d..a0aced488 100644 --- a/pyresample/bucket/__init__.py +++ b/pyresample/bucket/__init__.py @@ -238,10 +238,8 @@ def get_sum(self, data, fill_value=np.nan, skipna=True, empty_bucket_value=0): data = data.ravel() # Remove fill_values values from the data when used as weights - if np.isnan(fill_value): - weights = da.where(np.isnan(data), 0, data) - else: - weights = da.where(data == fill_value, 0, data) + invalid_mask = get_invalid_mask(data, fill_value) + weights = da.where(invalid_mask, 0, data) # Rechunk indices to match the data chunking if weights.chunks != self.idxs.chunks: @@ -262,10 +260,7 @@ def get_sum(self, data, fill_value=np.nan, skipna=True, empty_bucket_value=0): def _mask_bins_with_nan_if_not_skipna(self, skipna, data, out_size, statistic, fill_value): if not skipna: - if np.isnan(fill_value): - missing_val = np.isnan(data) - else: - missing_val = data == fill_value + missing_val = get_invalid_mask(data, fill_value) missing_val_bins, _ = da.histogram(self.idxs[missing_val], bins=out_size, range=(0, out_size)) statistic = da.where(missing_val_bins > 0, fill_value, statistic) @@ -474,6 +469,14 @@ def get_fractions(self, data, categories=None, fill_value=np.nan): return results +def get_invalid_mask(data, fill_value): + """Get a boolean array marking values equal to fill_value in data as True.""" + if np.isnan(fill_value): + return np.isnan(data) + else: + return data == fill_value + + def round_to_resolution(arr, resolution): """Round the values in *arr* to closest resolution element. 
From 3e4b16fd6f1d505fe21c5b5cb5a498086b142716 Mon Sep 17 00:00:00 2001 From: andream Date: Wed, 3 Jul 2024 12:33:44 +0200 Subject: [PATCH 07/11] switch to pytest --- pyresample/bucket/__init__.py | 4 +- pyresample/test/test_bucket.py | 737 +++++++++++++++++---------------- 2 files changed, 380 insertions(+), 361 deletions(-) diff --git a/pyresample/bucket/__init__.py b/pyresample/bucket/__init__.py index a0aced488..dc6c4f5ca 100644 --- a/pyresample/bucket/__init__.py +++ b/pyresample/bucket/__init__.py @@ -68,7 +68,7 @@ def _expand_bin_statistics(bins, unique_bin, unique_idx, weights_sorted): # assign the valid index to array weight_idx[unique_bin[~unique_bin.mask].data] = unique_idx[~unique_bin.mask] - return weights_sorted[weight_idx] # last value of weigths_sorted always nan + return weights_sorted[weight_idx] # last value of weights_sorted always nan @dask.delayed(pure=True) @@ -470,7 +470,7 @@ def get_fractions(self, data, categories=None, fill_value=np.nan): def get_invalid_mask(data, fill_value): - """Get a boolean array marking values equal to fill_value in data as True.""" + """Get a boolean array where values equal to fill_value in data are True.""" if np.isnan(fill_value): return np.isnan(data) else: diff --git a/pyresample/test/test_bucket.py b/pyresample/test/test_bucket.py index a36c5bd98..17f43ce74 100644 --- a/pyresample/test/test_bucket.py +++ b/pyresample/test/test_bucket.py @@ -17,379 +17,398 @@ # along with this program. If not, see . 
"""Test the bucket resampler.""" -import unittest from unittest.mock import MagicMock, patch import dask import dask.array as da import numpy as np import xarray as xr - +import pytest from pyresample import bucket, create_area_def from pyresample.geometry import AreaDefinition from pyresample.test.utils import CustomScheduler +CHUNKS = 2 -class Test(unittest.TestCase): - """Test bucket resampler.""" - - adef = AreaDefinition('eurol', 'description', '', +@pytest.fixture(scope="module") +def adef(): + return AreaDefinition('eurol', + 'description', + '', {'ellps': 'WGS84', 'lat_0': '90.0', 'lat_ts': '60.0', 'lon_0': '0.0', - 'proj': 'stere'}, 2560, 2048, + 'proj': 'stere'}, + 2560, + 2048, (-3780000.0, -7644000.0, 3900000.0, -1500000.0)) - chunks = 2 - lons = da.from_array(np.array([[25., 25.], [25., 25.]]), - chunks=chunks) - lats = da.from_array(np.array([[60., 60.00001], [60.2, 60.3]]), - chunks=chunks) - - def setUp(self): - self.resampler = bucket.BucketResampler(self.adef, self.lons, self.lats) - - @patch('pyresample.bucket.Proj') - @patch('pyresample.bucket.BucketResampler._get_indices') - def test_init(self, get_indices, prj): - resampler = bucket.BucketResampler(self.adef, self.lons, self.lats) - get_indices.assert_called_once() - prj.assert_called_once_with(self.adef.proj_dict) - self.assertTrue(hasattr(resampler, 'target_area')) - self.assertTrue(hasattr(resampler, 'source_lons')) - self.assertTrue(hasattr(resampler, 'source_lats')) - self.assertTrue(hasattr(resampler, 'x_idxs')) - self.assertTrue(hasattr(resampler, 'y_idxs')) - self.assertTrue(hasattr(resampler, 'idxs')) - self.assertTrue(hasattr(resampler, 'get_sum')) - self.assertTrue(hasattr(resampler, 'get_count')) - self.assertTrue(hasattr(resampler, 'get_min')) - self.assertTrue(hasattr(resampler, 'get_max')) - self.assertTrue(hasattr(resampler, 'get_abs_max')) - self.assertTrue(hasattr(resampler, 'get_average')) - self.assertTrue(hasattr(resampler, 'get_fractions')) - 
self.assertIsNone(resampler.counts) - - def test_round_to_resolution(self): - """Test rounding to given resolution.""" - # Scalar, integer resolution - self.assertEqual(bucket.round_to_resolution(5.5, 2.), 6) - # Scalar, non-integer resolution - self.assertEqual(bucket.round_to_resolution(5.5, 1.7), 5.1) - # List - self.assertTrue(np.all(bucket.round_to_resolution([4.2, 5.6], 2) == - np.array([4., 6.]))) - # Numpy array - self.assertTrue(np.all(bucket.round_to_resolution(np.array([4.2, 5.6]), 2) == - np.array([4., 6.]))) - # Dask array - self.assertTrue( - np.all(bucket.round_to_resolution(da.array([4.2, 5.6]), 2) == - np.array([4., 6.]))) - - def test_get_proj_coordinates(self): - """Test calculation of projection coordinates.""" - prj = MagicMock() - prj.return_value = ([3.1, 3.1, 3.1], [4.8, 4.8, 4.8]) - lons = [1., 1., 1.] - lats = [2., 2., 2.] - self.resampler.prj = prj - result = self.resampler._get_proj_coordinates(lons, lats) - prj.assert_called_once_with(lons, lats) - self.assertTrue(isinstance(result, np.ndarray)) - np.testing.assert_equal(result, np.array([[3.1, 3.1, 3.1], - [4.8, 4.8, 4.8]])) - - def test_get_bucket_indices(self): - """Test calculation of array indices.""" - # Ensure nothing is calculated - with dask.config.set(scheduler=CustomScheduler(max_computes=0)): - self.resampler._get_indices() - x_idxs, y_idxs = da.compute(self.resampler.x_idxs, - self.resampler.y_idxs) - np.testing.assert_equal(x_idxs, np.array([1710, 1710, 1707, 1705])) - np.testing.assert_equal(y_idxs, np.array([465, 465, 459, 455])) - - # Additional small test case - adef = create_area_def( - area_id='test', - projection={'proj': 'latlong'}, - width=2, height=2, - center=(0, 0), - resolution=10) - lons = da.from_array( - np.array([-10.0, -9.9, -0.1, 0, 0.1, 9.9, 10.0, -10.1, 0]), - chunks=2) - lats = da.from_array( - np.array([-10.0, -9.9, -0.1, 0, 0.1, 9.9, 10.0, 0, 10.1]), - chunks=2) - resampler = bucket.BucketResampler(source_lats=lats, - source_lons=lons, - 
target_area=adef) + + +@pytest.fixture(scope="module") +def lons(): + return da.from_array(np.array([[25., 25.], [25., 25.]]), chunks=CHUNKS) + + +@pytest.fixture(scope="module") +def lats(): + return da.from_array(np.array([[60., 60.00001], [60.2, 60.3]]), chunks=CHUNKS) + + +@pytest.fixture(scope="module") +def resampler(adef, lons, lats): + return bucket.BucketResampler(adef, lons, lats) + + +@patch('pyresample.bucket.Proj') +@patch('pyresample.bucket.BucketResampler._get_indices') +def test_init(get_indices, prj, adef, lons, lats): + """Test the init method of the BucketResampler""" + resampler = bucket.BucketResampler(adef, lons, lats) + + get_indices.assert_called_once() + prj.assert_called_once_with(adef.proj_dict) + + assert hasattr(resampler, 'target_area') + assert hasattr(resampler, 'source_lons') + assert hasattr(resampler, 'source_lats') + assert hasattr(resampler, 'x_idxs') + assert hasattr(resampler, 'y_idxs') + assert hasattr(resampler, 'idxs') + assert hasattr(resampler, 'get_sum') + assert hasattr(resampler, 'get_count') + assert hasattr(resampler, 'get_min') + assert hasattr(resampler, 'get_max') + assert hasattr(resampler, 'get_abs_max') + assert hasattr(resampler, 'get_average') + assert hasattr(resampler, 'get_fractions') + assert resampler.counts is None + + +def test_round_to_resolution(): + """Test rounding to given resolution.""" + # Scalar, integer resolution + assert bucket.round_to_resolution(5.5, 2.) 
== 6 + # Scalar, non-integer resolution + assert bucket.round_to_resolution(5.5, 1.7) == 5.1 + # List + assert np.all(bucket.round_to_resolution([4.2, 5.6], 2) == np.array([4., 6.])) + # Numpy array + assert np.all(bucket.round_to_resolution(np.array([4.2, 5.6]), 2) == np.array([4., 6.])) + # Dask array + assert np.all(bucket.round_to_resolution(da.array([4.2, 5.6]), 2) == np.array([4., 6.])) + + +def test_get_proj_coordinates(adef, lons, lats): + """Test calculation of projection coordinates.""" + resampler = bucket.BucketResampler(source_lats=lats, source_lons=lons, target_area=adef) + prj = MagicMock() + prj.return_value = ([3.1, 3.1, 3.1], [4.8, 4.8, 4.8]) + lons = [1., 1., 1.] + lats = [2., 2., 2.] + resampler.prj = prj + + result = resampler._get_proj_coordinates(lons, lats) + + prj.assert_called_once_with(lons, lats) + assert isinstance(result, np.ndarray) + np.testing.assert_equal(result, np.array([[3.1, 3.1, 3.1], + [4.8, 4.8, 4.8]])) + + +def test_get_bucket_indices(resampler): + """Test calculation of array indices.""" + # Ensure nothing is calculated + with dask.config.set(scheduler=CustomScheduler(max_computes=0)): resampler._get_indices() - np.testing.assert_equal(resampler.x_idxs, np.array([-1, 0, 0, 1, 1, 1, -1, -1, -1])) - np.testing.assert_equal(resampler.y_idxs, np.array([-1, 1, 1, 1, 0, 0, -1, -1, -1])) - - def _get_sum_result(self, data, **kwargs): - """Compute the bucket average with kwargs and check that no dask computation is performed.""" - with dask.config.set(scheduler=CustomScheduler(max_computes=0)): - result = self.resampler.get_sum(data, **kwargs) - return result.compute() - - def test_get_sum_valid_data(self): - """Test drop-in-a-bucket sum for valid data input.""" - data = da.from_array(np.array([[2., 3.], [7., 16.]]), - chunks=self.chunks) - - result = self._get_sum_result(data) - - # first two values are in same bin - self.assertEqual(np.count_nonzero(result == 5), 1) - # others are in separate bins - 
self.assertEqual(np.count_nonzero(result == 7), 1) - self.assertEqual(np.count_nonzero(result == 16), 1) - - self.assertEqual(result.shape, self.adef.shape) - - # Test that also xarray.DataArrays work (same output) - data = xr.DataArray(data) - np.testing.assert_array_equal(result, self._get_sum_result(data)) - - def test_get_sum_nan_data_skipna_false(self): - """Test drop-in-a-bucket sum for data input with nan and skipna False.""" - data = da.from_array(np.array([[2., np.nan], [5., np.nan]]), - chunks=self.chunks) - - result = self._get_sum_result(data, skipna=False) - # 2 + nan is nan, all-nan bin is nan - self.assertEqual(np.count_nonzero(np.isnan(result)), 2) - # rest is 0 - self.assertEqual(np.nanmin(result), 0) - - def test_get_sum_nan_data_skipna_true(self): - """Test drop-in-a-bucket sum for data input with nan and skipna True.""" - data = da.from_array(np.array([[2., np.nan], [5., np.nan]]), - chunks=self.chunks) - - result = self._get_sum_result(data, skipna=True) - # 2 + nan is 2 - self.assertEqual(np.count_nonzero(result == 2.), 1) - # 5 is untouched in a single bin - self.assertEqual(np.count_nonzero(result == 5.), 1) - # all-nan and rest is 0 - self.assertEqual(np.count_nonzero(np.isnan(result)), 0) - self.assertEqual(np.nanmin(result), 0) - - def test_get_sum_non_default_fill_value_skipna_false(self): - """Test drop-in-a-bucket sum for data input with non-default fill_value and skipna=False.""" - data = da.from_array(np.array([[2., 255], [5., 255]]), - chunks=self.chunks) - - result = self._get_sum_result(data, skipna=False, fill_value=255) - # 2 + fill_value is fill_value, all-fill_value is fill_value - self.assertEqual(np.count_nonzero(result == 255), 2) - # 5 is untouched in a single bin - self.assertEqual(np.count_nonzero(result == 5.), 1) - # rest is 0 - self.assertEqual(np.nanmin(result), 0) - - def test_get_sum_non_default_fill_value_skipna_true(self): - """Test drop-in-a-bucket sum for data input with non-default fill_value and 
skipna=True.""" - data = da.from_array(np.array([[2., 255], [5., 255]]), - chunks=self.chunks) - - result = self._get_sum_result(data, skipna=True, fill_value=255) - # 2 + fill_value is 2 - self.assertEqual(np.count_nonzero(result == 2.), 1) - # all-missing and rest is 0 - self.assertEqual(np.count_nonzero(result == 255), 0) - self.assertEqual(np.nanmin(result), 0) - - def test_nonzero_empty_bucket_value_number(self): - """Test drop-in-a-bucket sum for non-zero empty_bucket_value set as number.""" - data = da.from_array(np.array([[2., np.nan], [5., np.nan]]), - chunks=self.chunks) - - result = self._get_sum_result(data, skipna=True, empty_bucket_value=4095) - # 5 is untouched in a single bin - self.assertEqual(np.count_nonzero(result == 5.), 1) - # all-nan and rest is 4095 - self.assertEqual(np.count_nonzero(result == 4095), 2048 * 2560 - 2) - self.assertEqual(np.nanmin(result), 2) - - def test_nonzero_empty_bucket_valueo_npnan(self): - """Test drop-in-a-bucket sum for non-zero empty_bucket_value set as np.nan.""" - data = da.from_array(np.array([[2., np.nan], [5., np.nan]]), - chunks=self.chunks) - - result = self._get_sum_result(data, skipna=True, empty_bucket_value=np.nan) - # 5 is untouched in a single bin - self.assertEqual(np.count_nonzero(result == 5.), 1) - # all-nan and rest is np.nan - self.assertEqual(np.count_nonzero(np.isnan(result)), 2048 * 2560 - 2) - self.assertEqual(np.nanmin(result), 2) - - def test_get_count(self): - """Test drop-in-a-bucket sum.""" - with dask.config.set(scheduler=CustomScheduler(max_computes=0)): - result = self.resampler.get_count() - result = result.compute() - self.assertTrue(np.max(result) == 2) - self.assertEqual(np.sum(result == 1), 2) - self.assertEqual(np.sum(result == 2), 1) - self.assertTrue(self.resampler.counts is not None) - - def _get_min_result(self, data, **kwargs): - """Compute the bucket average with kwargs and check that no dask computation is performed.""" - with 
dask.config.set(scheduler=CustomScheduler(max_computes=0)): - result = self.resampler.get_min(data, **kwargs) - return result.compute() - - def test_get_min(self): - """Test min bucket resampling.""" - data = da.from_array(np.array([[2, 11], [5, np.nan]]), - chunks=self.chunks) - result = self._get_min_result(data) - # test multiple entries minimum - self.assertEqual(np.count_nonzero(result == 2), 1) - # test single entry minimum - self.assertEqual(np.count_nonzero(result == 5), 1) - # test that minimum of bucket with only nan is nan, and empty buckets are nan - self.assertEqual(np.count_nonzero(~np.isnan(result)), 2) - - def _get_max_result(self, data, **kwargs): - """Compute the bucket max with kwargs and check that no dask computation is performed.""" - with dask.config.set(scheduler=CustomScheduler(max_computes=0)): - result = self.resampler.get_max(data, **kwargs) - return result.compute() - - def test_get_max(self): - """Test max bucket resampling.""" - data = da.from_array(np.array([[2, 11], [5, np.nan]]), - chunks=self.chunks) - result = self._get_max_result(data) - # test multiple entries maximum - self.assertEqual(np.count_nonzero(result == 11), 1) - # test single entry maximum - self.assertEqual(np.count_nonzero(result == 5), 1) - # test that minimum of bucket with only nan is nan, and empty buckets are nan - self.assertEqual(np.count_nonzero(~np.isnan(result)), 2) - - def test_get_abs_max(self): - """Test abs max bucket resampling.""" - data = da.from_array(np.array([[2, -11], [5, np.nan]]), - chunks=self.chunks) - result = self._get_abs_max_result(data) - # test multiple entries absolute maximum - self.assertEqual(np.count_nonzero(result == -11), 1) - # test single entry maximum - self.assertEqual(np.count_nonzero(result == 5), 1) - # test that minimum of bucket with only nan is nan, and empty buckets are nan - self.assertEqual(np.count_nonzero(~np.isnan(result)), 2) - - def _get_abs_max_result(self, data, **kwargs): - """Compute the bucket abs max 
with kwargs and check that no dask computation is performed.""" - with dask.config.set(scheduler=CustomScheduler(max_computes=0)): - result = self.resampler.get_abs_max(data, **kwargs) - return result.compute() - - def _get_average_result(self, data, **kwargs): - """Compute the bucket average with kwargs and check that no dask computation is performed.""" - with dask.config.set(scheduler=CustomScheduler(max_computes=0)): - result = self.resampler.get_average(data, **kwargs) - return result.compute() - - def test_get_average_basic(self): - """Test averaging bucket resampling.""" - data = da.from_array(np.array([[2, 11], [5, np.nan]]), - chunks=self.chunks) - result = self._get_average_result(data) - # test multiple entries average - self.assertEqual(np.count_nonzero(result == 6.5), 1) - # test single entry average - self.assertEqual(np.count_nonzero(result == 5), 1) - # test that average of bucket with only nan is nan, and empty buckets are nan - self.assertEqual(np.count_nonzero(~np.isnan(result)), 2) - - def test_get_average_with_fillvalue_for_output(self): - """Test averaging bucket resampling with defined fill_value for output.""" - data = da.from_array(np.array([[2, 11], [5, np.nan]]), - chunks=self.chunks) - # test fill_value other than np.nan - result = self._get_average_result(data, fill_value=-1) - # check that all empty buckets are fill_value - self.assertEqual(np.count_nonzero(result != -1), 2) - - def test_get_average_skipna_true(self): - """Test averaging bucket resampling with skipna True.""" - # test skipna - data = da.from_array(np.array([[2, np.nan], [np.nan, np.nan]]), - chunks=self.chunks) - result = self._get_average_result(data, skipna=True) - # test that average of 2 and np.nan is 2 for skipna=True - self.assertEqual(np.count_nonzero(result == 2), 1) - - def test_get_average_skipna_false(self): - """Test averaging bucket resampling with skipna False.""" - data = da.from_array(np.array([[2, np.nan], [np.nan, np.nan]]), - chunks=self.chunks) - 
result = self._get_average_result(data, skipna=False) - # test that average of 2 and np.nan is nan for skipna=False - self.assertTrue(np.all(np.isnan(result))) - - def test_get_average_only_nan_input(self): - """Test averaging bucket resampling with only NaN as input.""" - data = da.from_array(np.array([[np.nan, np.nan], [np.nan, np.nan]]), - chunks=self.chunks) - result = self._get_average_result(data, skipna=True) - # test that average of np.nan and np.nan is np.nan for both skipna - self.assertTrue(np.all(np.isnan(result))) - np.testing.assert_array_equal(result, self._get_average_result(data, skipna=False)) - - def test_get_average_with_fill_value_in_input(self): - """Test averaging bucket resampling with fill_value in input and skipna True.""" - # test that fill_value in input is recognised as missing value - data = da.from_array(np.array([[2, -1], [-1, np.nan]]), - chunks=self.chunks) - result = self._get_average_result(data, fill_value=-1, skipna=True) - # test that average of 2 and -1 (missing value) is 2 - self.assertEqual(np.count_nonzero(result == 2), 1) - # test than all other buckets are -1 - self.assertEqual(np.count_nonzero(result != -1), 1) - - def test_resample_bucket_fractions(self): - """Test fraction calculations for categorical data.""" - data = da.from_array(np.array([[2, 4], [2, 2]]), - chunks=self.chunks) - categories = [1, 2, 3, 4] - with dask.config.set(scheduler=CustomScheduler(max_computes=0)): - result = self.resampler.get_fractions(data, categories=categories) - self.assertEqual(set(categories), set(result.keys())) - res = result[1].compute() - self.assertTrue(np.nanmax(res) == 0.) - res = result[2].compute() - self.assertTrue(np.nanmax(res) == 1.) - self.assertTrue(np.nanmin(res) == 0.5) - res = result[3].compute() - self.assertTrue(np.nanmax(res) == 0.) - res = result[4].compute() - self.assertTrue(np.nanmax(res) == 0.5) - self.assertTrue(np.nanmin(res) == 0.) 
- # There should be NaN values - self.assertTrue(np.any(np.isnan(res))) - - # Use a fill value - with dask.config.set(scheduler=CustomScheduler(max_computes=0)): - result = self.resampler.get_fractions(data, categories=categories, - fill_value=-1) - - # There should not be any NaN values - for i in categories: - res = result[i].compute() - self.assertFalse(np.any(np.isnan(res))) - self.assertTrue(np.min(res) == -1) - - # No categories given, need to compute the data once to get - # the categories - with dask.config.set(scheduler=CustomScheduler(max_computes=1)): - _ = self.resampler.get_fractions(data, categories=None) + x_idxs, y_idxs = da.compute(resampler.x_idxs, resampler.y_idxs) + np.testing.assert_equal(x_idxs, np.array([1710, 1710, 1707, 1705])) + np.testing.assert_equal(y_idxs, np.array([465, 465, 459, 455])) + + +def test_get_bucket_indices_on_latlong(): + """Test calculation of array indices on latlong grid.""" + adef = create_area_def( + area_id='test', + projection={'proj': 'latlong'}, + width=2, height=2, + center=(0, 0), + resolution=10) + lons = da.from_array(np.array([-10.0, -9.9, -0.1, 0, 0.1, 9.9, 10.0, -10.1, 0]), chunks=CHUNKS) + lats = da.from_array(np.array([-10.0, -9.9, -0.1, 0, 0.1, 9.9, 10.0, 0, 10.1]), chunks=CHUNKS) + resampler = bucket.BucketResampler(source_lats=lats, source_lons=lons, target_area=adef) + resampler._get_indices() + + np.testing.assert_equal(resampler.x_idxs, np.array([-1, 0, 0, 1, 1, 1, -1, -1, -1])) + np.testing.assert_equal(resampler.y_idxs, np.array([-1, 1, 1, 1, 0, 0, -1, -1, -1])) + + +def _get_sum_result(resampler, data, **kwargs): + """Compute the bucket average with kwargs and check that no dask computation is performed.""" + with dask.config.set(scheduler=CustomScheduler(max_computes=0)): + result = resampler.get_sum(data, **kwargs) + return result.compute() + + +def test_get_sum_valid_data(resampler, adef): + """Test drop-in-a-bucket sum for valid data input.""" + data = da.from_array(np.array([[2., 3.], [7., 
16.]]), chunks=CHUNKS) + result = _get_sum_result(resampler, data) + + # first two values are in same bin + assert np.count_nonzero(result == 5) == 1 + # others are in separate bins + assert np.count_nonzero(result == 7) == 1 + assert np.count_nonzero(result == 16) == 1 + assert result.shape == adef.shape + + # Test that also xarray.DataArrays work (same output) + data = xr.DataArray(data) + np.testing.assert_array_equal(result, _get_sum_result(resampler, data)) + + +def test_get_sum_nan_data_skipna_false(resampler): + """Test drop-in-a-bucket sum for data input with nan and skipna False.""" + data = da.from_array(np.array([[2., np.nan], [5., np.nan]]), chunks=CHUNKS) + result = _get_sum_result(resampler, data, skipna=False) + + # 2 + nan is nan, all-nan bin is nan + assert np.count_nonzero(np.isnan(result)) == 2 + # rest is 0 + assert np.nanmin(result) == 0 + + +def test_get_sum_nan_data_skipna_true(resampler): + """Test drop-in-a-bucket sum for data input with nan and skipna True.""" + data = da.from_array(np.array([[2., np.nan], [5., np.nan]]), chunks=CHUNKS) + result = _get_sum_result(resampler, data, skipna=True) + + # 2 + nan is 2 + assert np.count_nonzero(result == 2.) == 1 + # 5 is untouched in a single bin + assert np.count_nonzero(result == 5.) == 1 + # all-nan and rest is 0 + assert np.count_nonzero(np.isnan(result)) == 0 + assert np.nanmin(result) == 0 + + +def test_get_sum_non_default_fill_value_skipna_false(resampler): + """Test drop-in-a-bucket sum for data input with non-default fill_value and skipna=False.""" + data = da.from_array(np.array([[2., 255], [5., 255]]), chunks=CHUNKS) + result = _get_sum_result(resampler, data, skipna=False, fill_value=255) + + # 2 + fill_value is fill_value, all-fill_value is fill_value + assert np.count_nonzero(result == 255) == 2 + # 5 is untouched in a single bin + assert np.count_nonzero(result == 5.) 
== 1 + # rest is 0 + assert np.nanmin(result) == 0 + + +def test_get_sum_non_default_fill_value_skipna_true(resampler): + """Test drop-in-a-bucket sum for data input with non-default fill_value and skipna=True.""" + data = da.from_array(np.array([[2., 255], [5., 255]]), chunks=CHUNKS) + result = _get_sum_result(resampler, data, skipna=True, fill_value=255) + + # 2 + fill_value is 2 + assert np.count_nonzero(result == 2.) == 1 + # all-missing and rest is 0 + assert np.count_nonzero(result == 255) == 0 + assert np.nanmin(result) == 0 + + +def test_nonzero_empty_bucket_value_number(resampler): + """Test drop-in-a-bucket sum for non-zero empty_bucket_value set as number.""" + data = da.from_array(np.array([[2., np.nan], [5., np.nan]]), chunks=CHUNKS) + result = _get_sum_result(resampler, data, skipna=True, empty_bucket_value=4095) + + # 5 is untouched in a single bin + assert np.count_nonzero(result == 5.) == 1 + # all-nan and rest is 4095 + assert np.count_nonzero(result == 4095) == 2048 * 2560 - 2 + assert np.nanmin(result) == 2 + + +def test_nonzero_empty_bucket_value_npnan(resampler): + """Test drop-in-a-bucket sum for non-zero empty_bucket_value set as np.nan.""" + data = da.from_array(np.array([[2., np.nan], [5., np.nan]]), chunks=CHUNKS) + result = _get_sum_result(resampler, data, skipna=True, empty_bucket_value=np.nan) + + # 5 is untouched in a single bin + assert np.count_nonzero(result == 5.) 
== 1 + # all-nan and rest is np.nan + assert np.count_nonzero(np.isnan(result)) == 2048 * 2560 - 2 + assert np.nanmin(result) == 2 + + +def test_get_count(resampler): + """Test drop-in-a-bucket sum.""" + with dask.config.set(scheduler=CustomScheduler(max_computes=0)): + result = resampler.get_count() + result = result.compute() + assert np.max(result) == 2 + assert np.sum(result == 1) == 2 + assert np.sum(result == 2) == 1 + assert resampler.counts is not None + + +def _get_min_result(resampler, data, **kwargs): + """Compute the bucket average with kwargs and check that no dask computation is performed.""" + with dask.config.set(scheduler=CustomScheduler(max_computes=0)): + result = resampler.get_min(data, **kwargs) + return result.compute() + + +def test_get_min(resampler): + """Test min bucket resampling.""" + data = da.from_array(np.array([[2, 11], [5, np.nan]]), chunks=CHUNKS) + result = _get_min_result(resampler, data) + # test multiple entries minimum + assert np.count_nonzero(result == 2) == 1 + # test single entry minimum + assert np.count_nonzero(result == 5) == 1 + # test that minimum of bucket with only nan is nan, and empty buckets are nan + assert np.count_nonzero(~np.isnan(result)) == 2 + + +def _get_max_result(resampler, data, **kwargs): + """Compute the bucket max with kwargs and check that no dask computation is performed.""" + with dask.config.set(scheduler=CustomScheduler(max_computes=0)): + result = resampler.get_max(data, **kwargs) + return result.compute() + + +def test_get_max(resampler): + """Test max bucket resampling.""" + data = da.from_array(np.array([[2, 11], [5, np.nan]]), chunks=CHUNKS) + result = _get_max_result(resampler, data) + # test multiple entries maximum + assert np.count_nonzero(result == 11) == 1 + # test single entry maximum + assert np.count_nonzero(result == 5) == 1 + # test that minimum of bucket with only nan is nan, and empty buckets are nan + assert np.count_nonzero(~np.isnan(result)) == 2 + + +def 
_get_abs_max_result(resampler, data, **kwargs): + """Compute the bucket abs max with kwargs and check that no dask computation is performed.""" + with dask.config.set(scheduler=CustomScheduler(max_computes=0)): + result = resampler.get_abs_max(data, **kwargs) + return result.compute() + + +def test_get_abs_max(resampler): + """Test abs max bucket resampling.""" + data = da.from_array(np.array([[2, -11], [5, np.nan]]), chunks=CHUNKS) + result = _get_abs_max_result(resampler, data) + # test multiple entries absolute maximum + assert np.count_nonzero(result == -11) == 1 + # test single entry maximum + assert np.count_nonzero(result == 5) == 1 + # test that minimum of bucket with only nan is nan, and empty buckets are nan + assert np.count_nonzero(~np.isnan(result)) == 2 + + +def _get_average_result(resampler, data, **kwargs): + """Compute the bucket average with kwargs and check that no dask computation is performed.""" + with dask.config.set(scheduler=CustomScheduler(max_computes=0)): + result = resampler.get_average(data, **kwargs) + return result.compute() + + +def test_get_average_basic(resampler): + """Test averaging bucket resampling.""" + data = da.from_array(np.array([[2, 11], [5, np.nan]]), chunks=CHUNKS) + result = _get_average_result(resampler, data) + # test multiple entries average + assert np.count_nonzero(result == 6.5) == 1 + # test single entry average + assert np.count_nonzero(result == 5) == 1 + # test that average of bucket with only nan is nan, and empty buckets are nan + assert np.count_nonzero(~np.isnan(result)) == 2 + + +def test_get_average_with_fillvalue_for_output(resampler): + """Test averaging bucket resampling with defined fill_value for output.""" + data = da.from_array(np.array([[2, 11], [5, np.nan]]), chunks=CHUNKS) + # test fill_value other than np.nan + result = _get_average_result(resampler, data, fill_value=-1) + # check that all empty buckets are fill_value + assert np.count_nonzero(result != -1) == 2 + + +def 
test_get_average_skipna_true(resampler): + """Test averaging bucket resampling with skipna True.""" + # test skipna + data = da.from_array(np.array([[2, np.nan], [np.nan, np.nan]]), chunks=CHUNKS) + result = _get_average_result(resampler, data, skipna=True) + # test that average of 2 and np.nan is 2 for skipna=True + assert np.count_nonzero(result == 2) == 1 + + +def test_get_average_skipna_false(resampler): + """Test averaging bucket resampling with skipna False.""" + data = da.from_array(np.array([[2, np.nan], [np.nan, np.nan]]), chunks=CHUNKS) + result = _get_average_result(resampler, data, skipna=False) + # test that average of 2 and np.nan is nan for skipna=False + assert np.all(np.isnan(result)) + + +def test_get_average_only_nan_input(resampler): + """Test averaging bucket resampling with only NaN as input.""" + data = da.from_array(np.array([[np.nan, np.nan], [np.nan, np.nan]]), chunks=CHUNKS) + result = _get_average_result(resampler, data, skipna=True) + # test that average of np.nan and np.nan is np.nan for both skipna + assert np.all(np.isnan(result)) + np.testing.assert_array_equal(result, _get_average_result(resampler, data, skipna=False)) + + +def test_get_average_with_fill_value_in_input(resampler): + """Test averaging bucket resampling with fill_value in input and skipna True.""" + # test that fill_value in input is recognised as missing value + data = da.from_array(np.array([[2, -1], [-1, np.nan]]), chunks=CHUNKS) + result = _get_average_result(resampler, data, fill_value=-1, skipna=True) + # test that average of 2 and -1 (missing value) is 2 + assert np.count_nonzero(result == 2) == 1 + # test that all other buckets are -1 + assert np.count_nonzero(result != -1) == 1 + + +def test_resample_bucket_fractions(resampler): + """Test fraction calculations for categorical data.""" + data = da.from_array(np.array([[2, 4], [2, 2]]), chunks=CHUNKS) + categories = [1, 2, 3, 4] + with dask.config.set(scheduler=CustomScheduler(max_computes=0)): + result = 
resampler.get_fractions(data, categories=categories) + assert set(categories) == set(result.keys()) + + res = result[1].compute() + assert np.nanmax(res) == 0. + + res = result[2].compute() + assert np.nanmax(res) == 1. + assert np.nanmin(res) == 0.5 + + res = result[3].compute() + assert np.nanmax(res) == 0. + + res = result[4].compute() + assert np.nanmax(res) == 0.5 + assert np.nanmin(res) == 0. + # There should be NaN values + assert np.any(np.isnan(res)) + + # Use a fill value + with dask.config.set(scheduler=CustomScheduler(max_computes=0)): + result = resampler.get_fractions(data, categories=categories, fill_value=-1) + + # There should not be any NaN values + for i in categories: + res = result[i].compute() + assert not np.any(np.isnan(res)) + assert np.min(res) == -1 + + # No categories given, need to compute the data once to get + # the categories + with dask.config.set(scheduler=CustomScheduler(max_computes=1)): + _ = resampler.get_fractions(data, categories=None) From c8cd927cb04192705dece07006e4038ec73796dd Mon Sep 17 00:00:00 2001 From: andream Date: Wed, 3 Jul 2024 15:21:38 +0200 Subject: [PATCH 08/11] combine sum tests into one parametrised test --- pyresample/test/test_bucket.py | 118 +++++++++++++-------------------- 1 file changed, 46 insertions(+), 72 deletions(-) diff --git a/pyresample/test/test_bucket.py b/pyresample/test/test_bucket.py index 17f43ce74..a4663df7d 100644 --- a/pyresample/test/test_bucket.py +++ b/pyresample/test/test_bucket.py @@ -25,10 +25,14 @@ import xarray as xr import pytest from pyresample import bucket, create_area_def +from pyresample.bucket import get_invalid_mask from pyresample.geometry import AreaDefinition from pyresample.test.utils import CustomScheduler CHUNKS = 2 +WIDTH = 2560 +HEIGHT = 2048 + @pytest.fixture(scope="module") def adef(): @@ -167,78 +171,48 @@ def test_get_sum_valid_data(resampler, adef): np.testing.assert_array_equal(result, _get_sum_result(resampler, data)) -def 
test_get_sum_nan_data_skipna_false(resampler): - """Test drop-in-a-bucket sum for data input with nan and skipna False.""" - data = da.from_array(np.array([[2., np.nan], [5., np.nan]]), chunks=CHUNKS) - result = _get_sum_result(resampler, data, skipna=False) - - # 2 + nan is nan, all-nan bin is nan - assert np.count_nonzero(np.isnan(result)) == 2 - # rest is 0 - assert np.nanmin(result) == 0 - - -def test_get_sum_nan_data_skipna_true(resampler): - """Test drop-in-a-bucket sum for data input with nan and skipna True.""" - data = da.from_array(np.array([[2., np.nan], [5., np.nan]]), chunks=CHUNKS) - result = _get_sum_result(resampler, data, skipna=True) - - # 2 + nan is 2 - assert np.count_nonzero(result == 2.) == 1 - # 5 is untouched in a single bin - assert np.count_nonzero(result == 5.) == 1 - # all-nan and rest is 0 - assert np.count_nonzero(np.isnan(result)) == 0 - assert np.nanmin(result) == 0 - - -def test_get_sum_non_default_fill_value_skipna_false(resampler): - """Test drop-in-a-bucket sum for data input with non-default fill_value and skipna=False.""" - data = da.from_array(np.array([[2., 255], [5., 255]]), chunks=CHUNKS) - result = _get_sum_result(resampler, data, skipna=False, fill_value=255) - - # 2 + fill_value is fill_value, all-fill_value is fill_value - assert np.count_nonzero(result == 255) == 2 - # 5 is untouched in a single bin - assert np.count_nonzero(result == 5.) == 1 - # rest is 0 - assert np.nanmin(result) == 0 - - -def test_get_sum_non_default_fill_value_skipna_true(resampler): - """Test drop-in-a-bucket sum for data input with non-default fill_value and skipna=True.""" - data = da.from_array(np.array([[2., 255], [5., 255]]), chunks=CHUNKS) - result = _get_sum_result(resampler, data, skipna=True, fill_value=255) - - # 2 + fill_value is 2 - assert np.count_nonzero(result == 2.) 
== 1 - # all-missing and rest is 0 - assert np.count_nonzero(result == 255) == 0 - assert np.nanmin(result) == 0 - - -def test_nonzero_empty_bucket_value_number(resampler): - """Test drop-in-a-bucket sum for non-zero empty_bucket_value set as number.""" - data = da.from_array(np.array([[2., np.nan], [5., np.nan]]), chunks=CHUNKS) - result = _get_sum_result(resampler, data, skipna=True, empty_bucket_value=4095) - - # 5 is untouched in a single bin - assert np.count_nonzero(result == 5.) == 1 - # all-nan and rest is 4095 - assert np.count_nonzero(result == 4095) == 2048 * 2560 - 2 - assert np.nanmin(result) == 2 - - -def test_nonzero_empty_bucket_value_npnan(resampler): - """Test drop-in-a-bucket sum for non-zero empty_bucket_value set as np.nan.""" - data = da.from_array(np.array([[2., np.nan], [5., np.nan]]), chunks=CHUNKS) - result = _get_sum_result(resampler, data, skipna=True, empty_bucket_value=np.nan) - - # 5 is untouched in a single bin - assert np.count_nonzero(result == 5.) == 1 - # all-nan and rest is np.nan - assert np.count_nonzero(np.isnan(result)) == 2048 * 2560 - 2 - assert np.nanmin(result) == 2 +def _equal_or_both_nan(val1, val2): + return val1 == val2 or (np.isnan(val1) and np.isnan(val2)) + + +@pytest.mark.parametrize("skipna", [True, False]) +@pytest.mark.parametrize("fill_value", [np.nan, 255, -1]) +@pytest.mark.parametrize("empty_bucket_value", [0, 4095, np.nan, -1]) +def test_get_sum_skipna_fillvalue_empty_bucket_value(resampler, skipna, fill_value, empty_bucket_value): + """Test drop-in-a-bucket sum for invalid data input and according arguments.""" + data = da.from_array(np.array([[2., fill_value], [5., fill_value]]), chunks=CHUNKS) + result = _get_sum_result(resampler, data, + skipna=skipna, + fill_value=fill_value, + empty_bucket_value=empty_bucket_value) + n_target_bkt = WIDTH * HEIGHT + + # 5 is untouched in a single bin, in any case + n_bkt_with_val_5 = 1 + + if skipna: + # 2 + fill_value is 2 (nansum) + n_bkt_with_val_2 = 1 + # and 
fill_value+fill_value is empty_bucket_value, + # hence no fill_value bkt are left + n_bkt_with_val_fill_value = 0 + else: + # 2 + fill_value is fill_value (sum) + n_bkt_with_val_2 = 0 + # and fill_value + fill_value is fill_value, so + n_bkt_with_val_fill_value = 2 + + n_bkt_with_empty_value = n_target_bkt - n_bkt_with_val_fill_value - n_bkt_with_val_5 - n_bkt_with_val_2 + + # special case + if _equal_or_both_nan(fill_value, empty_bucket_value): + # the fill and empty values are equal, so they should be added up + n_bkt_with_empty_value = n_bkt_with_val_fill_value = n_bkt_with_empty_value + n_bkt_with_val_fill_value + + assert np.count_nonzero(result == 5.) == n_bkt_with_val_5 + assert np.count_nonzero(result == 2.) == n_bkt_with_val_2 + assert np.count_nonzero(get_invalid_mask(result, fill_value)) == n_bkt_with_val_fill_value + assert np.count_nonzero(get_invalid_mask(result, empty_bucket_value)) == n_bkt_with_empty_value def test_get_count(resampler): From f3acfb450a858a5039aa7466f69bfa2fdf9b9b26 Mon Sep 17 00:00:00 2001 From: andream Date: Wed, 3 Jul 2024 15:30:52 +0200 Subject: [PATCH 09/11] fix imports and docstrings --- pyresample/test/test_bucket.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/pyresample/test/test_bucket.py b/pyresample/test/test_bucket.py index a4663df7d..9c6e60895 100644 --- a/pyresample/test/test_bucket.py +++ b/pyresample/test/test_bucket.py @@ -22,8 +22,9 @@ import dask import dask.array as da import numpy as np -import xarray as xr import pytest +import xarray as xr + from pyresample import bucket, create_area_def from pyresample.bucket import get_invalid_mask from pyresample.geometry import AreaDefinition @@ -36,6 +37,7 @@ @pytest.fixture(scope="module") def adef(): + """Get AreaDefinition for tests.""" return AreaDefinition('eurol', 'description', '', @@ -51,23 +53,26 @@ def adef(): @pytest.fixture(scope="module") def lons(): + """Get longitudes for tests.""" return da.from_array(np.array([[25., 25.], 
[25., 25.]]), chunks=CHUNKS) @pytest.fixture(scope="module") def lats(): + """Get latitudes for tests.""" return da.from_array(np.array([[60., 60.00001], [60.2, 60.3]]), chunks=CHUNKS) @pytest.fixture(scope="module") def resampler(adef, lons, lats): + """Get initialised resampler for tests.""" return bucket.BucketResampler(adef, lons, lats) @patch('pyresample.bucket.Proj') @patch('pyresample.bucket.BucketResampler._get_indices') def test_init(get_indices, prj, adef, lons, lats): - """Test the init method of the BucketResampler""" + """Test the init method of the BucketResampler.""" resampler = bucket.BucketResampler(adef, lons, lats) get_indices.assert_called_once() From c91b53d0e2c40f07cd1467a004af91276c15cc50 Mon Sep 17 00:00:00 2001 From: andream Date: Tue, 23 Jul 2024 11:49:40 +0200 Subject: [PATCH 10/11] switch to private _get_invalid_mask --- pyresample/bucket/__init__.py | 6 +++--- pyresample/test/test_bucket.py | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/pyresample/bucket/__init__.py b/pyresample/bucket/__init__.py index dc6c4f5ca..bc2e3876c 100644 --- a/pyresample/bucket/__init__.py +++ b/pyresample/bucket/__init__.py @@ -238,7 +238,7 @@ def get_sum(self, data, fill_value=np.nan, skipna=True, empty_bucket_value=0): data = data.ravel() # Remove fill_values values from the data when used as weights - invalid_mask = get_invalid_mask(data, fill_value) + invalid_mask = _get_invalid_mask(data, fill_value) weights = da.where(invalid_mask, 0, data) # Rechunk indices to match the data chunking @@ -260,7 +260,7 @@ def get_sum(self, data, fill_value=np.nan, skipna=True, empty_bucket_value=0): def _mask_bins_with_nan_if_not_skipna(self, skipna, data, out_size, statistic, fill_value): if not skipna: - missing_val = get_invalid_mask(data, fill_value) + missing_val = _get_invalid_mask(data, fill_value) missing_val_bins, _ = da.histogram(self.idxs[missing_val], bins=out_size, range=(0, out_size)) statistic = da.where(missing_val_bins > 0, 
fill_value, statistic) @@ -469,7 +469,7 @@ def get_fractions(self, data, categories=None, fill_value=np.nan): return results -def get_invalid_mask(data, fill_value): +def _get_invalid_mask(data, fill_value): """Get a boolean array where values equal to fill_value in data are True.""" if np.isnan(fill_value): return np.isnan(data) diff --git a/pyresample/test/test_bucket.py b/pyresample/test/test_bucket.py index 9c6e60895..4ff9e8928 100644 --- a/pyresample/test/test_bucket.py +++ b/pyresample/test/test_bucket.py @@ -26,7 +26,7 @@ import xarray as xr from pyresample import bucket, create_area_def -from pyresample.bucket import get_invalid_mask +from pyresample.bucket import _get_invalid_mask from pyresample.geometry import AreaDefinition from pyresample.test.utils import CustomScheduler @@ -216,8 +216,8 @@ def test_get_sum_skipna_fillvalue_empty_bucket_value(resampler, skipna, fill_val assert np.count_nonzero(result == 5.) == n_bkt_with_val_5 assert np.count_nonzero(result == 2.) == n_bkt_with_val_2 - assert np.count_nonzero(get_invalid_mask(result, fill_value)) == n_bkt_with_val_fill_value - assert np.count_nonzero(get_invalid_mask(result, empty_bucket_value)) == n_bkt_with_empty_value + assert np.count_nonzero(_get_invalid_mask(result, fill_value)) == n_bkt_with_val_fill_value + assert np.count_nonzero(_get_invalid_mask(result, empty_bucket_value)) == n_bkt_with_empty_value def test_get_count(resampler): From 05920a801387a2417205c5bc2014743231683ff7 Mon Sep 17 00:00:00 2001 From: andream Date: Tue, 23 Jul 2024 12:50:16 +0200 Subject: [PATCH 11/11] update docstring --- pyresample/bucket/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyresample/bucket/__init__.py b/pyresample/bucket/__init__.py index bc2e3876c..a65830201 100644 --- a/pyresample/bucket/__init__.py +++ b/pyresample/bucket/__init__.py @@ -210,7 +210,7 @@ def get_sum(self, data, fill_value=np.nan, skipna=True, empty_bucket_value=0): data : Numpy or Dask array Data to be 
binned and summed. fill_value : float - Fill value to mark missing/invalid values in the input data. + Fill value of the input data marking missing/invalid values. Default: np.nan skipna : boolean (optional) If True, skips missing values (as marked by NaN or `fill_value`) for the sum calculation