Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add robust polynomial, sum of sinusoids fitting #151

Merged
merged 39 commits into from
Sep 8, 2021
Merged
Show file tree
Hide file tree
Changes from 22 commits
Commits
Show all changes
39 commits
Select commit Hold shift + click to select a range
f7f6108
Merge pull request #6 from GlacioHack/main
rhugonnet Mar 27, 2021
9c97f54
Merge branch 'GlacioHack:main' into main
rhugonnet Jun 17, 2021
4c4bfbf
add robust polynomial fit + tests
rhugonnet Jun 22, 2021
54872f8
add array dimension to docs
rhugonnet Jun 22, 2021
bafec5e
streamline test polynomial fit
rhugonnet Jun 22, 2021
204434a
fix polynomial fit tests
rhugonnet Jun 22, 2021
78e2ef7
import scikit-learn as optional dependency
rhugonnet Jun 22, 2021
eb397fa
Merge branch 'main' into angle_binning
rhugonnet Jun 22, 2021
20e8e5e
use new subsample function + small fixes
rhugonnet Jun 22, 2021
3c4ea4a
fix test
rhugonnet Jun 22, 2021
6abe090
add comments
rhugonnet Jun 22, 2021
e6034ea
fixes with Eriks comments
rhugonnet Jun 24, 2021
86d565b
improve tests with Erik comments
rhugonnet Jun 24, 2021
54d4a22
fix test
rhugonnet Jun 24, 2021
cc9398f
add draft for robust scaling using ML methods
rhugonnet Jun 24, 2021
6335fe7
draft robust sum of sin basinhopping
rhugonnet Jun 25, 2021
46f2c08
move nd binning to spatial_tools
rhugonnet Jun 25, 2021
549a734
finish basinhopping for sumfit
rhugonnet Jun 25, 2021
8e589e7
add tests for sum of sins
rhugonnet Jun 25, 2021
c19953b
move tests for nd binning
rhugonnet Jun 25, 2021
1043845
fix typing error
rhugonnet Jun 25, 2021
268e63a
fix tests
rhugonnet Jun 25, 2021
6e7cf23
Merge remote-tracking branch 'upstream/main' into angle_binning
rhugonnet Sep 6, 2021
5cfe2e7
rewrite tests with pytest.approx
rhugonnet Sep 6, 2021
1ca8c4f
use np.polyval instead of writing out the polynomial
rhugonnet Sep 6, 2021
f060cf9
rest of amaury comments
rhugonnet Sep 6, 2021
616bd7e
add fit module, refactor nmad into spatialstats
rhugonnet Sep 6, 2021
700bf85
fix tests
rhugonnet Sep 6, 2021
a2d4acf
finish refactor nmad, fix tests
rhugonnet Sep 7, 2021
dba11f4
increase error margin of test
rhugonnet Sep 7, 2021
28e96ed
try fixing test
rhugonnet Sep 7, 2021
aaf818f
add print statement to check values in CI
rhugonnet Sep 7, 2021
317c58c
move print statement to the right place
rhugonnet Sep 7, 2021
3051384
streamline comments
rhugonnet Sep 7, 2021
9dc9d5f
further streamline comments
rhugonnet Sep 7, 2021
5387422
remove print statement
rhugonnet Sep 7, 2021
917d905
subdivide scipy and sklearn into wrapper functions for reuse and clarity
rhugonnet Sep 7, 2021
6588c01
skip randomly failing test
rhugonnet Sep 7, 2021
0d2e7ec
fix skip syntax
rhugonnet Sep 7, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
207 changes: 207 additions & 0 deletions tests/test_spatial_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,11 @@

Author(s):
Erik S. Holmlund
rhugonnet marked this conversation as resolved.
Show resolved Hide resolved
Romain Hugonnet

"""
from __future__ import annotations

import os
import shutil
import subprocess
Expand All @@ -12,8 +15,10 @@

import geoutils as gu
import numpy as np
import pandas as pd
import pytest
import rasterio as rio
from sklearn.metrics import mean_squared_error, median_absolute_error

import xdem
from xdem import examples
Expand All @@ -28,6 +33,31 @@ def test_dem_subtraction():
assert np.nanmean(np.abs(diff.data)) < 100


def load_ref_and_diff() -> tuple[gu.georaster.Raster, gu.georaster.Raster, np.ndarray]:
    """
    Load the Longyearbyen example DEMs, coregister them with the Nuth & Kaab (2011)
    method, and return data to test statistics tools with.

    :returns: Tuple of (reference DEM raster, post-coregistration elevation
        difference raster, glacier mask array).
    """
    examples.download_longyearbyen_examples(overwrite=False)

    reference_raster = gu.georaster.Raster(examples.FILEPATHS["longyearbyen_ref_dem"])
    to_be_aligned_raster = gu.georaster.Raster(examples.FILEPATHS["longyearbyen_tba_dem"])
    glacier_mask = gu.geovector.Vector(examples.FILEPATHS["longyearbyen_glacier_outlines"])
    # Coregister on stable terrain only: exclude glacierized areas
    inlier_mask = ~glacier_mask.create_mask(reference_raster)

    nuth_kaab = xdem.coreg.NuthKaab()
    nuth_kaab.fit(reference_raster.data, to_be_aligned_raster.data,
                  inlier_mask=inlier_mask, transform=reference_raster.transform)
    aligned_raster = nuth_kaab.apply(to_be_aligned_raster.data, transform=reference_raster.transform)

    # Elevation difference after coregistration, as a raster on the reference grid
    diff = gu.Raster.from_array((reference_raster.data - aligned_raster),
                                transform=reference_raster.transform, crs=reference_raster.crs)
    mask = glacier_mask.create_mask(diff)

    return reference_raster, diff, mask



class TestMerging:
"""
Test cases for stacking and merging DEMs
Expand Down Expand Up @@ -178,6 +208,136 @@ def make_gdal_hillshade(filepath) -> np.ndarray:
# Make sure that this doesn't create weird division warnings.
xdem.spatial_tools.hillshade(dem.data, dem.res)

class TestRobustFitting:
    """Test cases for robust fitting of polynomials and sums of sinusoids."""

    @pytest.mark.parametrize("pkg_estimator", [('sklearn', 'Linear'), ('scipy', 'Linear'), ('sklearn', 'Theil-Sen'),
                                               ('sklearn', 'RANSAC'), ('sklearn', 'Huber')])
    def test_robust_polynomial_fit(self, pkg_estimator: tuple[str, str]) -> None:
        """Every package/estimator combination should recover a noise-free degree-3 polynomial."""

        np.random.seed(42)

        # x vector
        x = np.linspace(1, 10, 1000)
        # exact polynomial
        true_coefs = [-100, 5, 3, 2]
        y = true_coefs[0] + true_coefs[1] * x + true_coefs[2] * x ** 2 + true_coefs[3] * x ** 3

        coefs, deg = xdem.spatial_tools.robust_polynomial_fit(x, y, linear_pkg=pkg_estimator[0],
                                                              estimator=pkg_estimator[1], random_state=42)

        # Degree 3 can be found exactly, or degree 4 with a near-zero highest-order coefficient
        assert deg == 3 or deg == 4
        assert np.abs(coefs[0] - true_coefs[0]) <= 100
        assert np.abs(coefs[1] - true_coefs[1]) < 5
        assert np.abs(coefs[2] - true_coefs[2]) < 2
        assert np.abs(coefs[3] - true_coefs[3]) < 1

    def test_robust_polynomial_fit_noise_and_outliers(self) -> None:
        """Compare the robustness of the different estimators to added noise and outliers."""

        np.random.seed(42)

        # x vector
        x = np.linspace(1, 10, 1000)
        # exact polynomial
        true_coefs = [-100, 5, 3, 2]
        y = true_coefs[0] + true_coefs[1] * x + true_coefs[2] * x**2 + true_coefs[3] * x**3
        # add some noise on top
        y += np.random.normal(loc=0, scale=3, size=1000)
        # and some outliers
        y[50:75] = 0
        y[900:925] = 1000

        # test linear estimators
        coefs, deg = xdem.spatial_tools.robust_polynomial_fit(x, y, estimator='Linear', linear_pkg='scipy',
                                                              loss='soft_l1', f_scale=0.5)

        # scipy solution should be quite robust to outliers/noise (with the soft_l1 method and f_scale parameter)
        # however, it is subject to random processes inside the scipy function (couldn't find how to fix those...)
        assert deg == 3 or deg == 4  # can find degree 3, or 4 with coefficient close to 0
        assert np.abs(coefs[0] - true_coefs[0]) < 3
        assert np.abs(coefs[1] - true_coefs[1]) < 3
        assert np.abs(coefs[2] - true_coefs[2]) < 1
        assert np.abs(coefs[3] - true_coefs[3]) < 1

        # the sklearn Linear solution with MSE cost function will not be robust
        coefs2, deg2 = xdem.spatial_tools.robust_polynomial_fit(x, y, estimator='Linear', linear_pkg='sklearn',
                                                                cost_func=mean_squared_error, margin_improvement=50)
        assert deg2 != 3
        # using the median absolute error should improve the fit, but the parameters will still be hard to constrain
        coefs3, deg3 = xdem.spatial_tools.robust_polynomial_fit(x, y, estimator='Linear', linear_pkg='sklearn',
                                                                cost_func=median_absolute_error, margin_improvement=50)
        assert deg3 == 3
        assert np.abs(coefs3[0] - true_coefs[0]) > 50
        assert np.abs(coefs3[1] - true_coefs[1]) > 10
        assert np.abs(coefs3[2] - true_coefs[2]) > 5
        assert np.abs(coefs3[3] - true_coefs[3]) > 0.5

        # test robust estimators

        # Theil-Sen should have better coefficients
        coefs4, deg4 = xdem.spatial_tools.robust_polynomial_fit(x, y, estimator='Theil-Sen', random_state=42)
        assert deg4 == 3
        # high degree coefficients should be well constrained
        assert np.abs(coefs4[2] - true_coefs[2]) < 1
        assert np.abs(coefs4[3] - true_coefs[3]) < 1

        # RANSAC is not always optimal, here it does not work well
        coefs5, deg5 = xdem.spatial_tools.robust_polynomial_fit(x, y, estimator='RANSAC', random_state=42)
        assert deg5 != 3

        # Huber should perform well, close to the scipy robust solution
        coefs6, deg6 = xdem.spatial_tools.robust_polynomial_fit(x, y, estimator='Huber')
        assert deg6 == 3
        assert np.abs(coefs6[1] - true_coefs[1]) < 1
        assert np.abs(coefs6[2] - true_coefs[2]) < 1
        assert np.abs(coefs6[3] - true_coefs[3]) < 1

    def test_robust_sumsin_fit(self) -> None:
        """A noise-free sum of two sinusoids should be recovered almost exactly."""

        # x vector
        x = np.linspace(0, 10, 1000)
        # exact sum of sinusoids: (amplitude, frequency, phase) triplets for two frequencies
        true_coefs = np.array([(5, 1, np.pi), (3, 0.3, 0)]).flatten()
        y = xdem.spatial_tools._fitfun_sumofsin(x, params=true_coefs)

        # check that the function runs
        coefs, deg = xdem.spatial_tools.robust_sumsin_fit(x, y, random_state=42)

        # check that the estimated amplitudes correspond to the input
        for i in range(2):
            # absolute error: a signed difference would also pass for an arbitrarily large negative error
            assert np.abs(coefs[3 * i] - true_coefs[3 * i]) < 0.01

        # test that using custom arguments does not create an error
        bounds = [(3, 7), (0.1, 3), (0, 2*np.pi), (1, 7), (0.1, 1), (0, 2*np.pi), (0, 1), (0.1, 1), (0, 2*np.pi)]
        coefs, deg = xdem.spatial_tools.robust_sumsin_fit(x, y, bounds_amp_freq_phase=bounds, nb_frequency_max=2,
                                                          significant_res=0.01, random_state=42)

    def test_robust_sumsin_fit_noise_and_outliers(self) -> None:
        """The sum-of-sinusoid fit should remain close to the truth with noise and outliers added."""

        # test the robustness to outliers

        np.random.seed(42)
        # x vector
        x = np.linspace(0, 10, 1000)
        # exact sum of sinusoids: (amplitude, frequency, phase) triplets for two frequencies
        true_coefs = np.array([(5, 1, np.pi), (3, 0.3, 0)]).flatten()
        y = xdem.spatial_tools._fitfun_sumofsin(x, params=true_coefs)

        # adding some noise
        y += np.random.normal(loc=0, scale=0.25, size=1000)
        # and some outliers
        y[50:75] = -10
        y[900:925] = 10

        bounds = [(3, 7), (0.1, 3), (0, 2 * np.pi), (1, 7), (0.1, 1), (0, 2 * np.pi), (0, 1), (0.1, 1), (0, 2 * np.pi)]
        coefs, deg = xdem.spatial_tools.robust_sumsin_fit(x, y, random_state=42, bounds_amp_freq_phase=bounds)

        # should be less precise, but still on point
        # re-order output coefficients to match the input ordering
        if coefs[3] > coefs[0]:
            coefs = np.concatenate((coefs[3:], coefs[0:3]))

        # check values with absolute errors (a signed difference would pass for large negative errors)
        for i in range(2):
            assert np.abs(coefs[3 * i] - true_coefs[3 * i]) < 0.2
            assert np.abs(coefs[3 * i + 1] - true_coefs[3 * i + 1]) < 0.2
            # for the phase, account for the 2-pi periodicity when computing the error
            error_phase = min(np.abs(coefs[3 * i + 2] - true_coefs[3 * i + 2]),
                              np.abs(2 * np.pi - (coefs[3 * i + 2] - true_coefs[3 * i + 2])))
            assert error_phase < 0.2

class TestSubsample:
"""
Expand Down Expand Up @@ -233,3 +393,50 @@ def test_subsample(self, array):
assert len(indices) == np.ndim(array)
assert np.ndim(array[indices]) == 1
assert np.size(array[indices]) == int(np.size(array) * 0.3)


class TestBinning:
    """Tests for the N-dimensional binning of samples."""

    def test_nd_binning(self) -> None:
        """Check the dataframe returned by nd_binning for one, two and three binning variables."""

        ref, diff, mask = load_ref_and_diff()

        slope, aspect = xdem.coreg.calculate_slope_and_aspect(ref.data.squeeze())

        # Flattened elevation differences, shared by all the calls below
        dh = diff.data.flatten()

        # A single binning variable: 10 bins are created by default
        df = xdem.spatial_tools.nd_binning(values=dh, list_var=[slope.flatten()], list_var_names=['slope'])

        # One row per bin
        assert df.shape[0] == 10
        # The outermost bin edges coincide with the extrema of the binning variable
        assert np.nanmin(slope) == np.min(pd.IntervalIndex(df.slope).left)
        assert np.nanmax(slope) == np.max(pd.IntervalIndex(df.slope).right)

        # A single binning variable with an explicit bin count of 20
        df = xdem.spatial_tools.nd_binning(values=dh, list_var=[slope.flatten()], list_var_names=['slope'],
                                           list_var_bins=[[20]])
        assert df.shape[0] == 20

        # nmad goes up quite a bit with slope, we can expect a 10 m measurement error difference
        assert df.nmad.values[-1] - df.nmad.values[0] > 10

        # A user-provided statistic must also be accepted
        def percentile_80(a):
            return np.nanpercentile(a, 80)

        xdem.spatial_tools.nd_binning(values=dh, list_var=[slope.flatten()], list_var_names=['slope'],
                                      statistics=['count', percentile_80])

        # Two binning variables: two 1D binnings of 10 rows plus one 2D binning of 100 rows
        df = xdem.spatial_tools.nd_binning(values=dh, list_var=[slope.flatten(), ref.data.flatten()],
                                           list_var_names=['slope', 'elevation'])
        assert df.shape[0] == (10 + 10 + 100)

        # Three binning variables: three 1D (10 rows), three 2D (100 rows) and one 3D (1000 rows) binning
        df = xdem.spatial_tools.nd_binning(values=dh,
                                           list_var=[slope.flatten(), ref.data.flatten(), aspect.flatten()],
                                           list_var_names=['slope', 'elevation', 'aspect'])
        assert df.shape[0] == (1000 + 3 * 100 + 3 * 10)

50 changes: 1 addition & 49 deletions tests/test_spstats.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,6 @@

"""
from __future__ import annotations
from typing import Tuple

import warnings

import geoutils as gu
Expand Down Expand Up @@ -224,50 +222,4 @@ def test_patches_method(self):
mask=~mask.astype(bool).squeeze(),
gsd=diff.res[0],
area_size=10000
)

# NOTE(review): this is the deletion side of the diff — the same class is moved to
# tests/test_spatial_tools.py with xdem.spstats.nd_binning renamed to xdem.spatial_tools.nd_binning.
class TestBinning:
    """Tests for the N-dimensional binning of samples (old location, being removed)."""

    def test_nd_binning(self):
        # Load the reference DEM, coregistered elevation difference and glacier mask
        ref, diff, mask = load_ref_and_diff()

        slope, aspect = xdem.coreg.calculate_slope_and_aspect(ref.data.squeeze())

        # 1d binning, by default will create 10 bins
        df = xdem.spstats.nd_binning(values=diff.data.flatten(),list_var=[slope.flatten()],list_var_names=['slope'])

        # check length matches
        assert df.shape[0] == 10
        # check bin edges match the minimum and maximum of binning variable
        assert np.nanmin(slope) == np.min(pd.IntervalIndex(df.slope).left)
        assert np.nanmax(slope) == np.max(pd.IntervalIndex(df.slope).right)

        # 1d binning with 20 bins
        df = xdem.spstats.nd_binning(values=diff.data.flatten(), list_var=[slope.flatten()], list_var_names=['slope'],
                                     list_var_bins=[[20]])
        # check length matches
        assert df.shape[0] == 20

        # nmad goes up quite a bit with slope, we can expect a 10 m measurement error difference
        assert df.nmad.values[-1] - df.nmad.values[0] > 10

        # try custom stat
        def percentile_80(a):
            return np.nanpercentile(a, 80)

        # check the function runs with custom functions
        xdem.spstats.nd_binning(values=diff.data.flatten(),list_var=[slope.flatten()],list_var_names=['slope'], statistics=['count',percentile_80])

        # 2d binning
        df = xdem.spstats.nd_binning(values=diff.data.flatten(),list_var=[slope.flatten(),ref.data.flatten()],list_var_names=['slope','elevation'])

        # dataframe should contain two 1D binning of length 10 and one 2D binning of length 100
        assert df.shape[0] == (10 + 10 + 100)

        # nd binning
        df = xdem.spstats.nd_binning(values=diff.data.flatten(),list_var=[slope.flatten(),ref.data.flatten(),aspect.flatten()],list_var_names=['slope','elevation','aspect'])

        # dataframe should contain three 1D binning of length 10 and three 2D binning of length 100 and one 2D binning of length 1000
        assert df.shape[0] == (1000 + 3 * 100 + 3 * 10)

)
Loading