Revised binning kind arguments. Added 'equal area' binning and 'custo…

…m' bin options
GlacioHack · Mar 15, 2021 · 3b33d4b · 3b33d4b
1 parent ecdfdea
commit 3b33d4b
Show file tree

Hide file tree

Showing 2 changed files with 76 additions and 18 deletions.
diff --git a/tests/test_volume.py b/tests/test_volume.py
@@ -1,8 +1,12 @@
+import warnings
+
 import geoutils as gu
 import numpy as np
 import pandas as pd
 
-import xdem
+with warnings.catch_warnings():
+    warnings.simplefilter("ignore")
+    import xdem
 
 xdem.examples.download_longyearbyen_examples(overwrite=False)
 
@@ -23,7 +27,11 @@ def test_bin_ddem(self):
         """Test dDEM binning."""
         ddem = self.dem_2009.data - self.dem_1990.data
 
-        ddem_bins = xdem.volume.hypsometric_binning(ddem.squeeze()[self.mask], self.dem_2009.data.squeeze()[self.mask])
+        ddem_bins = xdem.volume.hypsometric_binning(
+            ddem.squeeze()[self.mask],
+            self.dem_2009.data.squeeze()[self.mask],
+            bins=50,
+            kind="equal_height")
 
         ddem_bins_masked = xdem.volume.hypsometric_binning(
             np.ma.masked_array(ddem.squeeze(), mask=~self.mask),
@@ -116,3 +124,40 @@ def test_area_calculation(self):
 
         # The area of Scott Turnerbreen was around 3.4 km² in 1990, so this should be close to that number.
         assert 2e6 < bin_area.sum() < 5e6
+
+    def test_ddem_bin_methods(self):
+        """Test different dDEM binning methods."""
+        ddem = self.dem_2009.data - self.dem_1990.data
+
+        # equal height is already tested in test_bin_ddem
+
+        # Make a fixed amount of bins
+        equal_count_bins = xdem.volume.hypsometric_binning(
+            ddem.squeeze()[self.mask],
+            self.dem_2009.data.squeeze()[self.mask],
+            bins=50,
+            kind="equal_count"
+        )
+        assert equal_count_bins.shape[0] == 50
+
+        # Make 50 bins with approximately the same area (pixel count)
+        equal_area_bins = xdem.volume.hypsometric_binning(
+            ddem.squeeze()[self.mask],
+            self.dem_2009.data.squeeze()[self.mask],
+            bins=50,
+            kind="equal_area"
+        )
+
+        assert equal_area_bins.shape[0] == 50
+        # Make sure that the pixel count variation is low.
+        assert equal_area_bins["count"].std() < 1
+
+        # Try to feed the previous bins as "custom" bins to the function.
+        custom_bins = xdem.volume.hypsometric_binning(
+            ddem.squeeze()[self.mask],
+            self.dem_2009.data.squeeze()[self.mask],
+            bins=np.r_[equal_area_bins.index.left[0], equal_area_bins.index.right],
+            kind="custom"
+        )
+
+        assert custom_bins.shape[0] == equal_area_bins.shape[0]
diff --git a/xdem/volume.py b/xdem/volume.py
@@ -8,22 +8,23 @@
 import scipy.interpolate
 
 
-def hypsometric_binning(ddem: np.ndarray, ref_dem: np.ndarray, bin_size=50,
-                        normalized_bin_size: bool = False, aggregation_function: Callable = np.median) -> pd.DataFrame:
+def hypsometric_binning(ddem: np.ndarray, ref_dem: np.ndarray, bins: Union[float, np.ndarray] = 50.0,
+                        kind: str = "equal_height", aggregation_function: Callable = np.median) -> pd.DataFrame:
     """
     Separate the dDEM in discrete elevation bins.
 
     It is assumed that the dDEM is calculated as 'ref_dem - dem' (not 'dem - ref_dem').
 
     :param ddem: The dDEM as a 2D or 1D array.
     :param ref_dem: The reference DEM as a 2D or 1D array.
-    :param bin_size: The bin interval size in georeferenced units (or percent; 0-100, if normalized_bin_size=True)
-    :param normalized_bin_size: If the given bin size should be parsed as a percentage of the glacier's elevation range.
+    :param bins: The bin size, count, or array, depending on the binning method ('kind').
+    :param kind: The kind of binning to do. Choices: ['equal_height', 'equal_count', 'equal_area', 'custom'].
     :param aggregation_function: The function to aggregate the elevation values within a bin. Defaults to the median.
 
     :returns: A Pandas DataFrame with elevation bins and dDEM statistics.
     """
     assert ddem.shape == ref_dem.shape
+
     # Remove all nans, and flatten the inputs.
     nan_mask = np.logical_or(
         np.isnan(ddem) if not isinstance(ddem, np.ma.masked_array) else ddem.mask,
@@ -37,21 +38,33 @@ def hypsometric_binning(ddem: np.ndarray, ref_dem: np.ndarray, bin_size=50,
     mean_dem = ref_dem - (ddem / 2)
 
     # If the bin size should be seen as a percentage.
-    if normalized_bin_size:
-        assert 0 < bin_size < 100
-
-        # Get the statistical elevation range to normalize the bin size with
-        elevation_range = np.percentile(mean_dem, 99) - np.percentile(mean_dem, 1)
-        bin_size = elevation_range / bin_size
+    if kind == "equal_height":
+        zbins = np.arange(mean_dem.min(), mean_dem.max() + bins, step=bins)
+    elif kind == "equal_count":
+        # Make bins between mean_dem.min() and a little bit above mean_dem.max().
+        # The bin count has to be bins + 1 because zbins[0] will be a "below min value" bin, which will be irrelevant.
+        zbins = np.linspace(mean_dem.min(), mean_dem.max() + 1e-6 / bins, num=int(bins + 1))
+    elif kind == "equal_area":
+        # Make the percentile steps. The bins + 1 is explained above.
+        steps = np.linspace(0, 100, num=int(bins) + 1)
+        zbins = np.fromiter(
+            (np.percentile(mean_dem, step) for step in steps),
+            dtype=float
+        )
+        # The uppermost bin needs to be a tiny amount larger than the highest value to include it.
+        zbins[-1] += 1e-6
+    elif kind == "custom":
+        zbins = bins
+    else:
+        raise ValueError(f"Invalid bin kind: {kind}. Choices: ['equal_size', 'equal_count', 'equal_area', 'custom']")
 
     # Generate bins and get bin indices from the mean DEM
-    bins = np.arange(mean_dem.min(), mean_dem.max() + bin_size, step=bin_size)
-    indices = np.digitize(mean_dem, bins=bins)
+    indices = np.digitize(mean_dem, bins=zbins)
 
     # Calculate statistics for each bin.
     # If no values exist, all stats should be nans (except count with should be 0)
     # medians, means, stds, nmads = (np.zeros(shape=bins.shape[0] - 1, dtype=ddem.dtype) * np.nan, ) * 4
-    values = np.zeros(shape=bins.shape[0] - 1, dtype=ddem.dtype) * np.nan
+    values = np.zeros(shape=zbins.shape[0] - 1, dtype=ddem.dtype) * np.nan
     counts = np.zeros_like(values, dtype=int)
     for i in np.arange(indices.min(), indices.max() + 1):
         values_in_bin = ddem[indices == i]
@@ -68,7 +81,7 @@ def hypsometric_binning(ddem: np.ndarray, ref_dem: np.ndarray, bin_size=50,
 
     # Collect the results in a dataframe
     output = pd.DataFrame(
-        index=pd.IntervalIndex.from_breaks(bins),
+        index=pd.IntervalIndex.from_breaks(zbins),
         data=np.vstack([
             values, counts
         ]).T,
@@ -142,11 +155,11 @@ def calculate_hypsometry_area(ddem_bins: Union[pd.Series, pd.DataFrame], ref_dem
                                            kind="linear", fill_value="extrapolate")
     # Generate average elevations by subtracting half of the dDEM's values to the reference DEM
     if timeframe == "mean":
-        elevations = ref_dem - (ddem_func(ref_dem) / 2)
+        elevations = ref_dem - (ddem_func(ref_dem.data) / 2)
     elif timeframe == "reference":
         elevations = ref_dem
     elif timeframe == "nonreference":
-        elevations = ref_dem - ddem_func(ref_dem)
+        elevations = ref_dem - ddem_func(ref_dem.data)
 
     # Extract the bins from the dDEM series and compute the frequency of points in the bins.
     bins = np.r_[[ddem_bins.index.left[0]], ddem_bins.index.right]