From b61ac1fbd639f482b7ecb661da47254695139299 Mon Sep 17 00:00:00 2001 From: Avik Basu <3485425+ab93@users.noreply.github.com> Date: Mon, 27 Mar 2023 16:37:42 -0700 Subject: [PATCH] fix: TanhScaler nan output for constant feature (#153) Signed-off-by: Avik Basu --- numalogic/preprocess/transformer.py | 18 +++++++++++++++--- pyproject.toml | 2 +- tests/preprocess/test_transformer.py | 20 ++++++++++++++++++++ 3 files changed, 36 insertions(+), 4 deletions(-) diff --git a/numalogic/preprocess/transformer.py b/numalogic/preprocess/transformer.py index c2e79256..0fb70175 100644 --- a/numalogic/preprocess/transformer.py +++ b/numalogic/preprocess/transformer.py @@ -54,7 +54,7 @@ def inverse_transform(self, X) -> ArrayLike: class TanhScaler(OneToOneFeatureMixin, TransformerMixin): r""" - Tanh Estimator applies tanh normalization to the Z-score, + Tanh Estimator applies column-wise tanh normalization to the Z-score, and scales the values between 0 and 1. After scaling, the data has a mean of 0.5. @@ -63,21 +63,29 @@ class TanhScaler(OneToOneFeatureMixin, TransformerMixin): Higher the value, the linear portion of the curve will have a higher slope but will reach the asymptote (flatten out) earlier. + Args: + coeff: float value determining the spread of the scores + eps: minimum range (max - min) below which the feature will be treated as constant. + In order to avoid division by zero or a very small number, + standard deviation will be set as 1 for that feature. + References: Nandakumar, Jain, Ross. 2005. Score Normalization in Multimodal Biometric Systems, Pattern Recognition 38, 2270-2285. 
https://web.cse.msu.edu/~rossarun/pubs/RossScoreNormalization_PR05.pdf """ - __slots__ = ("_coeff", "_std", "_mean") + __slots__ = ("_coeff", "_std", "_mean", "_eps") - def __init__(self, coeff: float = 0.2): + def __init__(self, coeff: float = 0.2, eps: float = 1e-10): self._coeff = coeff self._std = None self._mean = None + self._eps = eps def fit(self, x: npt.NDArray[float]) -> Self: self._mean = np.mean(x, axis=0) self._std = np.std(x, axis=0) + self._check_if_constant(x) return self def transform(self, x: npt.NDArray[float]) -> npt.NDArray[float]: @@ -86,3 +94,7 @@ def transform(self, x: npt.NDArray[float]) -> npt.NDArray[float]: def fit_transform(self, x: npt.NDArray[float], y=None, **_) -> npt.NDArray[float]: return self.fit(x).transform(x) + + def _check_if_constant(self, x: npt.NDArray[float]) -> None: + delta = np.max(x, axis=0) - np.min(x, axis=0) + self._std[delta < self._eps] = 1.0 diff --git a/pyproject.toml b/pyproject.toml index e6b5b6c2..83c8be6d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "numalogic" -version = "0.3.6" +version = "0.3.7" description = "Collection of operational Machine Learning models and tools." 
authors = ["Numalogic Developers"] packages = [{ include = "numalogic" }] diff --git a/tests/preprocess/test_transformer.py b/tests/preprocess/test_transformer.py index ac1fe094..6ef61b4a 100644 --- a/tests/preprocess/test_transformer.py +++ b/tests/preprocess/test_transformer.py @@ -42,6 +42,26 @@ def test_tanh_scaler_2(self): assert_array_less(x_scaled, np.ones_like(x_scaled)) assert_array_less(np.zeros_like(x_scaled), x_scaled) + def test_tanh_scaler_3(self): + x = np.random.randn(5, 3) + x[:, 1] = np.zeros(5) + + scaler = TanhScaler() + + x_scaled = scaler.fit_transform(x) + self.assertFalse(np.isnan(x_scaled[:, 1]).all()) + assert_array_less(x_scaled, np.ones_like(x_scaled)) + assert_array_less(np.zeros_like(x_scaled), x_scaled) + + def test_tanh_scaler_nan(self): + x = np.random.randn(5, 3) + x[:, 1] = np.zeros(5) + + scaler = TanhScaler(eps=0.0) + + x_scaled = scaler.fit_transform(x) + self.assertTrue(np.isnan(x_scaled[:, 1]).all()) + if __name__ == "__main__": unittest.main()