Updating dependencies #51

Merged (7 commits, Sep 10, 2021)
3 changes: 2 additions & 1 deletion diffprivlib/mechanisms/laplace.py
@@ -149,7 +149,8 @@ def randomise(self, value):
self._check_all(value)

scale = self.sensitivity / (self.epsilon - np.log(1 - self.delta))
standard_laplace = self._laplace_sampler(self._rng.random(), self._rng.random(), self._rng.random(), self._rng.random())
standard_laplace = self._laplace_sampler(self._rng.random(), self._rng.random(), self._rng.random(),
self._rng.random())

return value - scale * standard_laplace

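The only change in laplace.py is a line-length rewrap of the standard-Laplace sampling call; the noise scale is unchanged. For reference, a minimal sketch of how this mechanism is exercised (the epsilon, delta and sensitivity values below are illustrative assumptions, not taken from the PR):

```python
# Minimal sketch; parameter values are assumptions chosen for illustration.
from diffprivlib.mechanisms import Laplace

mech = Laplace(epsilon=1.0, delta=0.0, sensitivity=1.0)
# With delta = 0, scale = sensitivity / (epsilon - log(1 - delta)) = 1.0.
noisy = mech.randomise(0.5)
print(noisy)
```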
25 changes: 12 additions & 13 deletions diffprivlib/models/k_means.py
@@ -22,22 +22,24 @@

import numpy as np
import sklearn.cluster as sk_cluster
from sklearn.utils import check_array

from diffprivlib.accountant import BudgetAccountant
from diffprivlib.mechanisms import LaplaceBoundedDomain, GeometricFolded
from diffprivlib.utils import PrivacyLeakWarning, warn_unused_args
from diffprivlib.validation import check_bounds, clip_to_bounds
from diffprivlib.utils import PrivacyLeakWarning
from diffprivlib.validation import DiffprivlibMixin


class KMeans(sk_cluster.KMeans):
class KMeans(sk_cluster.KMeans, DiffprivlibMixin):
r"""K-Means clustering with differential privacy.

Implements the DPLloyd approach presented in [SCL16]_, leveraging the :class:`sklearn.cluster.KMeans` class for full
integration with Scikit Learn.

Parameters
----------
n_clusters : int, default: 8
The number of clusters to form as well as the number of centroids to generate.

epsilon : float, default: 1.0
Privacy parameter :math:`\epsilon`.

@@ -46,9 +48,6 @@ class KMeans(sk_cluster.KMeans):
the min/max of the entire data, or vectors with one entry per feature. If not provided, the bounds are computed
on the data when ``.fit()`` is first called, resulting in a :class:`.PrivacyLeakWarning`.

n_clusters : int, default: 8
The number of clusters to form as well as the number of centroids to generate.

accountant : BudgetAccountant, optional
Accountant to keep track of privacy budget.

@@ -75,14 +74,14 @@ class KMeans(sk_cluster.KMeans):

"""

def __init__(self, epsilon=1.0, bounds=None, n_clusters=8, accountant=None, **unused_args):
def __init__(self, n_clusters=8, *, epsilon=1.0, bounds=None, accountant=None, **unused_args):
super().__init__(n_clusters=n_clusters)

self.epsilon = epsilon
self.bounds = bounds
self.accountant = BudgetAccountant.load_default(accountant)

warn_unused_args(unused_args)
self._warn_unused_args(unused_args)

self.cluster_centers_ = None
self.bounds_processed = None
@@ -113,11 +112,11 @@ def fit(self, X, y=None, sample_weight=None):
self.accountant.check(self.epsilon, 0)

if sample_weight is not None:
warn_unused_args("sample_weight")
self._warn_unused_args("sample_weight")

del y

X = check_array(X, accept_sparse=False, dtype=[np.float64, np.float32])
X = self._validate_data(X, accept_sparse=False, dtype=[np.float64, np.float32])
n_samples, n_dims = X.shape

if n_samples < self.n_clusters:
@@ -131,8 +130,8 @@ def fit(self, X, y=None, sample_weight=None):
"privacy leakage, specify `bounds` for each dimension.", PrivacyLeakWarning)
self.bounds = (np.min(X, axis=0), np.max(X, axis=0))

self.bounds = check_bounds(self.bounds, n_dims, min_separation=1e-5)
X = clip_to_bounds(X, self.bounds)
self.bounds = self._check_bounds(self.bounds, n_dims, min_separation=1e-5)
X = self._clip_to_bounds(X, self.bounds)

centers = self._init_centers(n_dims)
labels = None
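Beyond the switch to the DiffprivlibMixin helpers, the KMeans constructor now mirrors scikit-learn by taking n_clusters first and making the differential-privacy arguments keyword-only. A usage sketch under the new signature (data, epsilon and bounds are invented for illustration):

```python
# Usage sketch for the post-PR signature; the data and parameter values are assumptions.
import numpy as np
from diffprivlib.models import KMeans

rng = np.random.default_rng(0)
X = rng.random((100, 2))

# Passing bounds explicitly avoids the PrivacyLeakWarning raised when fit()
# has to compute them from the data.
clf = KMeans(n_clusters=3, epsilon=1.0, bounds=(np.zeros(2), np.ones(2)))
clf.fit(X)
print(clf.cluster_centers_)
```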
21 changes: 10 additions & 11 deletions diffprivlib/models/linear_regression.py
@@ -48,14 +48,14 @@
import numpy as np
import sklearn.linear_model as sk_lr
from scipy.optimize import minimize
from sklearn.utils import check_X_y, check_array
from sklearn.utils import check_array
from sklearn.utils.validation import FLOAT_DTYPES

from diffprivlib.accountant import BudgetAccountant
from diffprivlib.mechanisms import Laplace, LaplaceFolded
from diffprivlib.tools import mean
from diffprivlib.utils import warn_unused_args, PrivacyLeakWarning
from diffprivlib.validation import check_bounds, clip_to_bounds
from diffprivlib.validation import check_bounds, clip_to_bounds, DiffprivlibMixin


# noinspection PyPep8Naming
@@ -161,7 +161,7 @@ def obj(omega):


# noinspection PyPep8Naming,PyAttributeOutsideInit
class LinearRegression(sk_lr.LinearRegression):
class LinearRegression(sk_lr.LinearRegression, DiffprivlibMixin):
r"""
Ordinary least squares Linear Regression with differential privacy.

@@ -211,17 +211,16 @@ class LinearRegression(sk_lr.LinearRegression):
regression analysis under differential privacy." arXiv preprint arXiv:1208.0219 (2012).

"""
def __init__(self, epsilon=1.0, bounds_X=None, bounds_y=None, fit_intercept=True, copy_X=True, accountant=None,
def __init__(self, *, epsilon=1.0, bounds_X=None, bounds_y=None, fit_intercept=True, copy_X=True, accountant=None,
**unused_args):
super().__init__(fit_intercept=fit_intercept, normalize=False, copy_X=copy_X, n_jobs=None)
super().__init__(fit_intercept=fit_intercept, copy_X=copy_X, n_jobs=None)

self.epsilon = epsilon
self.bounds_X = bounds_X
self.bounds_y = bounds_y
self.accountant = BudgetAccountant.load_default(accountant)
self.__repr__()

warn_unused_args(unused_args)
self._warn_unused_args(unused_args)

def fit(self, X, y, sample_weight=None):
"""
@@ -245,9 +244,9 @@ def fit(self, X, y, sample_weight=None):
self.accountant.check(self.epsilon, 0)

if sample_weight is not None:
warn_unused_args("sample_weight")
self._warn_unused_args("sample_weight")

X, y = check_X_y(X, y, accept_sparse=False, y_numeric=True, multi_output=True)
X, y = self._validate_data(X, y, accept_sparse=False, y_numeric=True, multi_output=True)

if self.bounds_X is None or self.bounds_y is None:
warnings.warn(
@@ -262,8 +261,8 @@ def fit(self, X, y, sample_weight=None):
if self.bounds_y is None:
self.bounds_y = (np.min(y, axis=0), np.max(y, axis=0))

self.bounds_X = check_bounds(self.bounds_X, X.shape[1])
self.bounds_y = check_bounds(self.bounds_y, y.shape[1] if y.ndim > 1 else 1)
self.bounds_X = self._check_bounds(self.bounds_X, X.shape[1])
self.bounds_y = self._check_bounds(self.bounds_y, y.shape[1] if y.ndim > 1 else 1)

n_features = X.shape[1]
n_targets = y.shape[1] if y.ndim > 1 else 1
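LinearRegression drops the normalize=False argument from the super().__init__ call (deprecated in newer scikit-learn releases), removes the stray self.__repr__() call, and routes validation through _validate_data, with all constructor arguments now keyword-only. A sketch of the resulting call pattern (toy data and bounds are assumptions):

```python
# Usage sketch under the keyword-only constructor above; values are assumptions.
import numpy as np
from diffprivlib.models import LinearRegression

rng = np.random.default_rng(0)
X = rng.random((50, 2))
y = X @ np.array([1.0, -2.0]) + 0.5

# bounds_X / bounds_y supplied up front so fit() need not infer them from the
# data, which would trigger a PrivacyLeakWarning.
model = LinearRegression(epsilon=1.0, bounds_X=(np.zeros(2), np.ones(2)), bounds_y=(-1.5, 1.5))
model.fit(X, y)
print(model.coef_, model.intercept_)
```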
18 changes: 9 additions & 9 deletions diffprivlib/models/logistic_regression.py
@@ -52,17 +52,17 @@
from sklearn.exceptions import ConvergenceWarning
from sklearn import linear_model
from sklearn.linear_model._logistic import _logistic_loss_and_grad
from sklearn.utils import check_X_y, check_array, check_consistent_length
from sklearn.utils import check_array, check_consistent_length
from sklearn.utils.fixes import _joblib_parallel_args
from sklearn.utils.multiclass import check_classification_targets

from diffprivlib.accountant import BudgetAccountant
from diffprivlib.mechanisms import Vector
from diffprivlib.utils import PrivacyLeakWarning, DiffprivlibCompatibilityWarning, warn_unused_args
from diffprivlib.validation import clip_to_norm
from diffprivlib.validation import DiffprivlibMixin


class LogisticRegression(linear_model.LogisticRegression):
class LogisticRegression(linear_model.LogisticRegression, DiffprivlibMixin):
r"""Logistic Regression (aka logit, MaxEnt) classifier with differential privacy.

This class implements regularised logistic regression using :ref:`Scipy's L-BFGS-B algorithm
@@ -166,7 +166,7 @@ class LogisticRegression(linear_model.LogisticRegression):

"""

def __init__(self, epsilon=1.0, data_norm=None, tol=1e-4, C=1.0, fit_intercept=True, max_iter=100, verbose=0,
def __init__(self, *, epsilon=1.0, data_norm=None, tol=1e-4, C=1.0, fit_intercept=True, max_iter=100, verbose=0,
warm_start=False, n_jobs=None, accountant=None, **unused_args):
super().__init__(penalty='l2', dual=False, tol=tol, C=C, fit_intercept=fit_intercept, intercept_scaling=1.0,
class_weight=None, random_state=None, solver='lbfgs', max_iter=max_iter, multi_class='ovr',
@@ -176,7 +176,7 @@ def __init__(self, epsilon=1.0, data_norm=None, tol=1e-4, C=1.0, fit_intercept=T
self.classes_ = None
self.accountant = BudgetAccountant.load_default(accountant)

warn_unused_args(unused_args)
self._warn_unused_args(unused_args)

# noinspection PyAttributeOutsideInit
def fit(self, X, y, sample_weight=None):
@@ -201,7 +201,7 @@ def fit(self, X, y, sample_weight=None):
self.accountant.check(self.epsilon, 0)

if sample_weight is not None:
warn_unused_args("sample_weight")
self._warn_unused_args("sample_weight")

if not isinstance(self.C, numbers.Real) or self.C < 0:
raise ValueError("Penalty term must be positive; got (C=%r)" % self.C)
@@ -211,8 +211,8 @@ def fit(self, X, y, sample_weight=None):
raise ValueError("Tolerance for stopping criteria must be positive; got (tol=%r)" % self.tol)

solver = _check_solver(self.solver, self.penalty, self.dual)
X, y = check_X_y(X, y, accept_sparse='csr', dtype=np.float64, order="C",
accept_large_sparse=solver != 'liblinear')
X, y = self._validate_data(X, y, accept_sparse='csr', dtype=float, order="C",
accept_large_sparse=solver != 'liblinear')
check_classification_targets(y)
self.classes_ = np.unique(y)
_, n_features = X.shape
@@ -223,7 +223,7 @@ def fit(self, X, y, sample_weight=None):
"privacy leakage, specify `data_norm` at initialisation.", PrivacyLeakWarning)
self.data_norm = np.linalg.norm(X, axis=1).max()

X = clip_to_norm(X, self.data_norm)
X = self._clip_to_norm(X, self.data_norm)

self.multi_class = _check_multi_class(self.multi_class, solver, len(self.classes_))

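As with the other models, LogisticRegression moves to the mixin helpers and _validate_data, and its constructor becomes keyword-only. A sketch of fitting with an explicit data_norm (the toy data and norm bound are assumptions):

```python
# Usage sketch; the dataset and data_norm below are illustrative assumptions.
import numpy as np
from diffprivlib.models import LogisticRegression

rng = np.random.default_rng(0)
X = rng.random((100, 2))
y = (X.sum(axis=1) > 1.0).astype(int)

# data_norm bounds the L2 norm of each row; rows exceeding it are clipped in
# fit(), and omitting it triggers a PrivacyLeakWarning.
clf = LogisticRegression(epsilon=1.0, data_norm=float(np.sqrt(2)))
clf.fit(X, y)
print(clf.predict(X[:5]))
```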
38 changes: 21 additions & 17 deletions diffprivlib/models/naive_bayes.py
@@ -22,16 +22,15 @@

import numpy as np
import sklearn.naive_bayes as sk_nb
from sklearn.utils import check_X_y
from sklearn.utils.multiclass import _check_partial_fit_first_call

from diffprivlib.accountant import BudgetAccountant
from diffprivlib.mechanisms import LaplaceBoundedDomain, GeometricTruncated, LaplaceTruncated
from diffprivlib.utils import PrivacyLeakWarning, warn_unused_args
from diffprivlib.validation import check_bounds, clip_to_bounds
from diffprivlib.validation import DiffprivlibMixin


class GaussianNB(sk_nb.GaussianNB):
class GaussianNB(sk_nb.GaussianNB, DiffprivlibMixin):
r"""Gaussian Naive Bayes (GaussianNB) with differential privacy

Inherits the :class:`sklearn.naive_bayes.GaussianNB` class from Scikit Learn and adds noise to satisfy differential
@@ -67,7 +66,7 @@ class GaussianNB(sk_nb.GaussianNB):
theta_ : array, shape (n_classes, n_features)
mean of each feature per class

sigma_ : array, shape (n_classes, n_features)
var_ : array, shape (n_classes, n_features)
variance of each feature per class

epsilon_ : float
@@ -81,7 +80,7 @@ class GaussianNB(sk_nb.GaussianNB):

"""

def __init__(self, epsilon=1.0, bounds=None, priors=None, var_smoothing=1e-9, accountant=None):
def __init__(self, *, epsilon=1.0, bounds=None, priors=None, var_smoothing=1e-9, accountant=None):
super().__init__(priors=priors, var_smoothing=var_smoothing)

self.epsilon = epsilon
@@ -94,16 +93,16 @@ def _partial_fit(self, X, y, classes=None, _refit=False, sample_weight=None):
if sample_weight is not None:
warn_unused_args("sample_weight")

X, y = check_X_y(X, y)
X, y = self._validate_data(X, y)

if self.bounds is None:
warnings.warn("Bounds have not been specified and will be calculated on the data provided. This will "
"result in additional privacy leakage. To ensure differential privacy and no additional "
"privacy leakage, specify bounds for each dimension.", PrivacyLeakWarning)
self.bounds = (np.min(X, axis=0), np.max(X, axis=0))

self.bounds = check_bounds(self.bounds, shape=X.shape[1])
X = clip_to_bounds(X, self.bounds)
self.bounds = self._check_bounds(self.bounds, shape=X.shape[1])
X = self._clip_to_bounds(X, self.bounds)

self.epsilon_ = self.var_smoothing

@@ -114,7 +113,7 @@ def _partial_fit(self, X, y, classes=None, _refit=False, sample_weight=None):
n_features = X.shape[1]
n_classes = len(self.classes_)
self.theta_ = np.zeros((n_classes, n_features))
self.sigma_ = np.zeros((n_classes, n_features))
self.var_ = np.zeros((n_classes, n_features))

self.class_count_ = np.zeros(n_classes, dtype=np.float64)

@@ -136,7 +135,7 @@ def _partial_fit(self, X, y, classes=None, _refit=False, sample_weight=None):
raise ValueError("Number of features %d does not match previous data %d." %
(X.shape[1], self.theta_.shape[1]))
# Put epsilon back in each time
self.sigma_[:, :] -= self.epsilon_
self.var_[:, :] -= self.epsilon_

classes = self.classes_

@@ -155,14 +154,14 @@ def _partial_fit(self, X, y, classes=None, _refit=False, sample_weight=None):

n_i = noisy_class_counts[_i]

new_theta, new_sigma = self._update_mean_variance(self.class_count_[i], self.theta_[i, :],
self.sigma_[i, :], X_i, n_noisy=n_i)
new_theta, new_var = self._update_mean_variance(self.class_count_[i], self.theta_[i, :],
self.var_[i, :], X_i, n_noisy=n_i)

self.theta_[i, :] = new_theta
self.sigma_[i, :] = new_sigma
self.var_[i, :] = new_var
self.class_count_[i] += n_i

self.sigma_[:, :] += self.epsilon_
self.var_[:, :] += self.epsilon_

# Update if only no priors is provided
if self.priors is None:
@@ -231,18 +230,18 @@ def _update_mean_variance(self, n_past, mu, var, X, sample_weight=None, n_noisy=
new_var = np.zeros((n_features,))

for feature in range(n_features):
_X = X[:, feature]
temp_x = X[:, feature]
lower, upper = self.bounds[0][feature], self.bounds[1][feature]
local_diameter = upper - lower

mech_mu = LaplaceTruncated(epsilon=local_epsilon, delta=0, sensitivity=local_diameter,
lower=lower * n_noisy, upper=upper * n_noisy)
_mu = mech_mu.randomise(_X.sum()) / n_noisy
_mu = mech_mu.randomise(temp_x.sum()) / n_noisy

local_sq_sens = max(_mu - lower, upper - _mu) ** 2
mech_var = LaplaceBoundedDomain(epsilon=local_epsilon, delta=0, sensitivity=local_sq_sens, lower=0,
upper=local_sq_sens * n_noisy)
_var = mech_var.randomise(((_X - _mu) ** 2).sum()) / n_noisy
_var = mech_var.randomise(((temp_x - _mu) ** 2).sum()) / n_noisy

new_mu[feature] = _mu
new_var[feature] = _var
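For context, the loop above privatises each feature's mean and variance with a truncated and a bounded-domain Laplace mechanism, respectively. A standalone sketch of the same two calls outside the class (the numeric inputs are assumptions, not values from the PR):

```python
# Standalone sketch of the mechanisms used above; all numbers are illustrative.
from diffprivlib.mechanisms import LaplaceBoundedDomain, LaplaceTruncated

n_noisy = 10             # noisy per-class count
lower, upper = 0.0, 1.0  # assumed feature bounds

# Noisy mean: randomise the feature sum, which lies in [lower * n, upper * n].
mech_mu = LaplaceTruncated(epsilon=0.5, delta=0, sensitivity=upper - lower,
                           lower=lower * n_noisy, upper=upper * n_noisy)
noisy_mean = mech_mu.randomise(4.2) / n_noisy

# Noisy variance: randomise the sum of squared deviations, bounded below by 0.
sq_sens = max(noisy_mean - lower, upper - noisy_mean) ** 2
mech_var = LaplaceBoundedDomain(epsilon=0.5, delta=0, sensitivity=sq_sens,
                                lower=0, upper=sq_sens * n_noisy)
noisy_var = mech_var.randomise(0.8) / n_noisy
```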
@@ -285,3 +284,8 @@ def _noisy_class_counts(self, y):
i = (i - sgn) % len(unique_y)

return noisy_counts

@property
def sigma_(self):
# Todo: Consider removing when sklearn v1.0 is required
return self.var_
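The sigma_ attribute is renamed to var_ in line with newer scikit-learn releases, and the property above keeps sigma_ as a read-only alias (the Todo notes it can be dropped once scikit-learn 1.0 is the minimum). A sketch of fitting the model and reading both names (dataset, labels and bounds are assumptions):

```python
# Usage sketch; the dataset, labels and bounds are illustrative assumptions.
import numpy as np
from diffprivlib.models import GaussianNB

rng = np.random.default_rng(0)
X = rng.random((60, 3))
y = rng.integers(0, 2, size=60)

clf = GaussianNB(epsilon=1.0, bounds=(np.zeros(3), np.ones(3)))
clf.fit(X, y)
print(clf.var_)    # per-class feature variances under the new name
print(clf.sigma_)  # read-only alias provided by the property above
```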