diff --git a/diffprivlib/mechanisms/laplace.py b/diffprivlib/mechanisms/laplace.py
index 10a8c1c..84dc349 100644
--- a/diffprivlib/mechanisms/laplace.py
+++ b/diffprivlib/mechanisms/laplace.py
@@ -149,7 +149,8 @@ def randomise(self, value):
         self._check_all(value)

         scale = self.sensitivity / (self.epsilon - np.log(1 - self.delta))

-        standard_laplace = self._laplace_sampler(self._rng.random(), self._rng.random(), self._rng.random(), self._rng.random())
+        standard_laplace = self._laplace_sampler(self._rng.random(), self._rng.random(), self._rng.random(),
+                                                 self._rng.random())

         return value - scale * standard_laplace
diff --git a/diffprivlib/models/k_means.py b/diffprivlib/models/k_means.py
index e4fae04..89d310f 100644
--- a/diffprivlib/models/k_means.py
+++ b/diffprivlib/models/k_means.py
@@ -22,15 +22,14 @@

 import numpy as np
 import sklearn.cluster as sk_cluster
-from sklearn.utils import check_array

 from diffprivlib.accountant import BudgetAccountant
 from diffprivlib.mechanisms import LaplaceBoundedDomain, GeometricFolded
-from diffprivlib.utils import PrivacyLeakWarning, warn_unused_args
-from diffprivlib.validation import check_bounds, clip_to_bounds
+from diffprivlib.utils import PrivacyLeakWarning
+from diffprivlib.validation import DiffprivlibMixin


-class KMeans(sk_cluster.KMeans):
+class KMeans(sk_cluster.KMeans, DiffprivlibMixin):
     r"""K-Means clustering with differential privacy.

     Implements the DPLloyd approach presented in [SCL16]_, leveraging the :class:`sklearn.cluster.KMeans` class for full
@@ -38,6 +37,9 @@ class KMeans(sk_cluster.KMeans):

     Parameters
     ----------
+    n_clusters : int, default: 8
+        The number of clusters to form as well as the number of centroids to generate.
+
     epsilon : float, default: 1.0
         Privacy parameter :math:`\epsilon`.

@@ -46,9 +48,6 @@ class KMeans(sk_cluster.KMeans):
         the min/max of the entire data, or vectors with one entry per feature. If not provided, the bounds are computed
         on the data when ``.fit()`` is first called, resulting in a :class:`.PrivacyLeakWarning`.

-    n_clusters : int, default: 8
-        The number of clusters to form as well as the number of centroids to generate.
-
     accountant : BudgetAccountant, optional
         Accountant to keep track of privacy budget.
@@ -75,14 +74,14 @@ class KMeans(sk_cluster.KMeans):

     """

-    def __init__(self, epsilon=1.0, bounds=None, n_clusters=8, accountant=None, **unused_args):
+    def __init__(self, n_clusters=8, *, epsilon=1.0, bounds=None, accountant=None, **unused_args):
         super().__init__(n_clusters=n_clusters)

         self.epsilon = epsilon
         self.bounds = bounds
         self.accountant = BudgetAccountant.load_default(accountant)

-        warn_unused_args(unused_args)
+        self._warn_unused_args(unused_args)

         self.cluster_centers_ = None
         self.bounds_processed = None
@@ -113,11 +112,11 @@ def fit(self, X, y=None, sample_weight=None):
         self.accountant.check(self.epsilon, 0)

         if sample_weight is not None:
-            warn_unused_args("sample_weight")
+            self._warn_unused_args("sample_weight")

         del y

-        X = check_array(X, accept_sparse=False, dtype=[np.float64, np.float32])
+        X = self._validate_data(X, accept_sparse=False, dtype=[np.float64, np.float32])

         n_samples, n_dims = X.shape
         if n_samples < self.n_clusters:
@@ -131,8 +130,8 @@ def fit(self, X, y=None, sample_weight=None):
                           "privacy leakage, specify `bounds` for each dimension.", PrivacyLeakWarning)
             self.bounds = (np.min(X, axis=0), np.max(X, axis=0))

-        self.bounds = check_bounds(self.bounds, n_dims, min_separation=1e-5)
-        X = clip_to_bounds(X, self.bounds)
+        self.bounds = self._check_bounds(self.bounds, n_dims, min_separation=1e-5)
+        X = self._clip_to_bounds(X, self.bounds)

         centers = self._init_centers(n_dims)
         labels = None
diff --git a/diffprivlib/models/linear_regression.py b/diffprivlib/models/linear_regression.py
index 4c91c1b..43b5c59 100644
--- a/diffprivlib/models/linear_regression.py
+++ b/diffprivlib/models/linear_regression.py
@@ -48,14 +48,14 @@
 import numpy as np
 import sklearn.linear_model as sk_lr
 from scipy.optimize import minimize
-from sklearn.utils import check_X_y, check_array
+from sklearn.utils import check_array
 from sklearn.utils.validation import FLOAT_DTYPES

 from diffprivlib.accountant import BudgetAccountant
 from diffprivlib.mechanisms import Laplace, LaplaceFolded
 from diffprivlib.tools import mean
 from diffprivlib.utils import warn_unused_args, PrivacyLeakWarning
-from diffprivlib.validation import check_bounds, clip_to_bounds
+from diffprivlib.validation import check_bounds, clip_to_bounds, DiffprivlibMixin


 # noinspection PyPep8Naming
@@ -161,7 +161,7 @@ def obj(omega):


 # noinspection PyPep8Naming,PyAttributeOutsideInit
-class LinearRegression(sk_lr.LinearRegression):
+class LinearRegression(sk_lr.LinearRegression, DiffprivlibMixin):
     r"""
     Ordinary least squares Linear Regression with differential privacy.

@@ -211,17 +211,16 @@ class LinearRegression(sk_lr.LinearRegression):
           regression analysis under differential privacy." arXiv preprint arXiv:1208.0219 (2012).
""" - def __init__(self, epsilon=1.0, bounds_X=None, bounds_y=None, fit_intercept=True, copy_X=True, accountant=None, + def __init__(self, *, epsilon=1.0, bounds_X=None, bounds_y=None, fit_intercept=True, copy_X=True, accountant=None, **unused_args): - super().__init__(fit_intercept=fit_intercept, normalize=False, copy_X=copy_X, n_jobs=None) + super().__init__(fit_intercept=fit_intercept, copy_X=copy_X, n_jobs=None) self.epsilon = epsilon self.bounds_X = bounds_X self.bounds_y = bounds_y self.accountant = BudgetAccountant.load_default(accountant) - self.__repr__() - warn_unused_args(unused_args) + self._warn_unused_args(unused_args) def fit(self, X, y, sample_weight=None): """ @@ -245,9 +244,9 @@ def fit(self, X, y, sample_weight=None): self.accountant.check(self.epsilon, 0) if sample_weight is not None: - warn_unused_args("sample_weight") + self._warn_unused_args("sample_weight") - X, y = check_X_y(X, y, accept_sparse=False, y_numeric=True, multi_output=True) + X, y = self._validate_data(X, y, accept_sparse=False, y_numeric=True, multi_output=True) if self.bounds_X is None or self.bounds_y is None: warnings.warn( @@ -262,8 +261,8 @@ def fit(self, X, y, sample_weight=None): if self.bounds_y is None: self.bounds_y = (np.min(y, axis=0), np.max(y, axis=0)) - self.bounds_X = check_bounds(self.bounds_X, X.shape[1]) - self.bounds_y = check_bounds(self.bounds_y, y.shape[1] if y.ndim > 1 else 1) + self.bounds_X = self._check_bounds(self.bounds_X, X.shape[1]) + self.bounds_y = self._check_bounds(self.bounds_y, y.shape[1] if y.ndim > 1 else 1) n_features = X.shape[1] n_targets = y.shape[1] if y.ndim > 1 else 1 diff --git a/diffprivlib/models/logistic_regression.py b/diffprivlib/models/logistic_regression.py index e488fb9..d122e50 100644 --- a/diffprivlib/models/logistic_regression.py +++ b/diffprivlib/models/logistic_regression.py @@ -52,17 +52,17 @@ from sklearn.exceptions import ConvergenceWarning from sklearn import linear_model from sklearn.linear_model._logistic import _logistic_loss_and_grad -from sklearn.utils import check_X_y, check_array, check_consistent_length +from sklearn.utils import check_array, check_consistent_length from sklearn.utils.fixes import _joblib_parallel_args from sklearn.utils.multiclass import check_classification_targets from diffprivlib.accountant import BudgetAccountant from diffprivlib.mechanisms import Vector from diffprivlib.utils import PrivacyLeakWarning, DiffprivlibCompatibilityWarning, warn_unused_args -from diffprivlib.validation import clip_to_norm +from diffprivlib.validation import DiffprivlibMixin -class LogisticRegression(linear_model.LogisticRegression): +class LogisticRegression(linear_model.LogisticRegression, DiffprivlibMixin): r"""Logistic Regression (aka logit, MaxEnt) classifier with differential privacy. 

     This class implements regularised logistic regression using :ref:`Scipy's L-BFGS-B algorithm
@@ -166,7 +166,7 @@ class LogisticRegression(linear_model.LogisticRegression):

     """

-    def __init__(self, epsilon=1.0, data_norm=None, tol=1e-4, C=1.0, fit_intercept=True, max_iter=100, verbose=0,
+    def __init__(self, *, epsilon=1.0, data_norm=None, tol=1e-4, C=1.0, fit_intercept=True, max_iter=100, verbose=0,
                  warm_start=False, n_jobs=None, accountant=None, **unused_args):
         super().__init__(penalty='l2', dual=False, tol=tol, C=C, fit_intercept=fit_intercept, intercept_scaling=1.0,
                          class_weight=None, random_state=None, solver='lbfgs', max_iter=max_iter, multi_class='ovr',
@@ -176,7 +176,7 @@ def __init__(self, epsilon=1.0, data_norm=None, tol=1e-4, C=1.0, fit_intercept=T
         self.classes_ = None
         self.accountant = BudgetAccountant.load_default(accountant)

-        warn_unused_args(unused_args)
+        self._warn_unused_args(unused_args)

     # noinspection PyAttributeOutsideInit
     def fit(self, X, y, sample_weight=None):
@@ -201,7 +201,7 @@ def fit(self, X, y, sample_weight=None):
         self.accountant.check(self.epsilon, 0)

         if sample_weight is not None:
-            warn_unused_args("sample_weight")
+            self._warn_unused_args("sample_weight")

         if not isinstance(self.C, numbers.Real) or self.C < 0:
             raise ValueError("Penalty term must be positive; got (C=%r)" % self.C)
@@ -211,8 +211,8 @@ def fit(self, X, y, sample_weight=None):
             raise ValueError("Tolerance for stopping criteria must be positive; got (tol=%r)" % self.tol)

         solver = _check_solver(self.solver, self.penalty, self.dual)
-        X, y = check_X_y(X, y, accept_sparse='csr', dtype=np.float64, order="C",
-                         accept_large_sparse=solver != 'liblinear')
+        X, y = self._validate_data(X, y, accept_sparse='csr', dtype=float, order="C",
+                                   accept_large_sparse=solver != 'liblinear')
         check_classification_targets(y)
         self.classes_ = np.unique(y)
         _, n_features = X.shape
@@ -223,7 +223,7 @@ def fit(self, X, y, sample_weight=None):
                           "privacy leakage, specify `data_norm` at initialisation.", PrivacyLeakWarning)
             self.data_norm = np.linalg.norm(X, axis=1).max()

-        X = clip_to_norm(X, self.data_norm)
+        X = self._clip_to_norm(X, self.data_norm)

         self.multi_class = _check_multi_class(self.multi_class, solver, len(self.classes_))
diff --git a/diffprivlib/models/naive_bayes.py b/diffprivlib/models/naive_bayes.py
index f75c60e..9a2565d 100644
--- a/diffprivlib/models/naive_bayes.py
+++ b/diffprivlib/models/naive_bayes.py
@@ -22,16 +22,15 @@

 import numpy as np
 import sklearn.naive_bayes as sk_nb
-from sklearn.utils import check_X_y
 from sklearn.utils.multiclass import _check_partial_fit_first_call

 from diffprivlib.accountant import BudgetAccountant
 from diffprivlib.mechanisms import LaplaceBoundedDomain, GeometricTruncated, LaplaceTruncated
 from diffprivlib.utils import PrivacyLeakWarning, warn_unused_args
-from diffprivlib.validation import check_bounds, clip_to_bounds
+from diffprivlib.validation import DiffprivlibMixin


-class GaussianNB(sk_nb.GaussianNB):
+class GaussianNB(sk_nb.GaussianNB, DiffprivlibMixin):
     r"""Gaussian Naive Bayes (GaussianNB) with differential privacy

     Inherits the :class:`sklearn.naive_bayes.GaussianNB` class from Scikit Learn and adds noise to satisfy differential
@@ -67,7 +66,7 @@ class GaussianNB(sk_nb.GaussianNB):
     theta_ : array, shape (n_classes, n_features)
         mean of each feature per class

-    sigma_ : array, shape (n_classes, n_features)
+    var_ : array, shape (n_classes, n_features)
         variance of each feature per class

     epsilon_ : float
@@ -81,7 +80,7 @@ class GaussianNB(sk_nb.GaussianNB):

     """

-    def __init__(self, epsilon=1.0, bounds=None, priors=None, var_smoothing=1e-9, accountant=None):
+    def __init__(self, *, epsilon=1.0, bounds=None, priors=None, var_smoothing=1e-9, accountant=None):
         super().__init__(priors=priors, var_smoothing=var_smoothing)

         self.epsilon = epsilon
@@ -94,7 +93,7 @@ def _partial_fit(self, X, y, classes=None, _refit=False, sample_weight=None):
         if sample_weight is not None:
             warn_unused_args("sample_weight")

-        X, y = check_X_y(X, y)
+        X, y = self._validate_data(X, y)

         if self.bounds is None:
             warnings.warn("Bounds have not been specified and will be calculated on the data provided. This will "
@@ -102,8 +101,8 @@ def _partial_fit(self, X, y, classes=None, _refit=False, sample_weight=None):
                           "privacy leakage, specify bounds for each dimension.", PrivacyLeakWarning)
             self.bounds = (np.min(X, axis=0), np.max(X, axis=0))

-        self.bounds = check_bounds(self.bounds, shape=X.shape[1])
-        X = clip_to_bounds(X, self.bounds)
+        self.bounds = self._check_bounds(self.bounds, shape=X.shape[1])
+        X = self._clip_to_bounds(X, self.bounds)

         self.epsilon_ = self.var_smoothing
@@ -114,7 +113,7 @@ def _partial_fit(self, X, y, classes=None, _refit=False, sample_weight=None):
             n_features = X.shape[1]
             n_classes = len(self.classes_)
             self.theta_ = np.zeros((n_classes, n_features))
-            self.sigma_ = np.zeros((n_classes, n_features))
+            self.var_ = np.zeros((n_classes, n_features))

             self.class_count_ = np.zeros(n_classes, dtype=np.float64)
@@ -136,7 +135,7 @@ def _partial_fit(self, X, y, classes=None, _refit=False, sample_weight=None):
                 raise ValueError("Number of features %d does not match previous data %d." %
                                  (X.shape[1], self.theta_.shape[1]))
             # Put epsilon back in each time
-            self.sigma_[:, :] -= self.epsilon_
+            self.var_[:, :] -= self.epsilon_

         classes = self.classes_
@@ -155,14 +154,14 @@ def _partial_fit(self, X, y, classes=None, _refit=False, sample_weight=None):

             n_i = noisy_class_counts[_i]

-            new_theta, new_sigma = self._update_mean_variance(self.class_count_[i], self.theta_[i, :],
-                                                              self.sigma_[i, :], X_i, n_noisy=n_i)
+            new_theta, new_var = self._update_mean_variance(self.class_count_[i], self.theta_[i, :],
+                                                            self.var_[i, :], X_i, n_noisy=n_i)

             self.theta_[i, :] = new_theta
-            self.sigma_[i, :] = new_sigma
+            self.var_[i, :] = new_var
             self.class_count_[i] += n_i

-        self.sigma_[:, :] += self.epsilon_
+        self.var_[:, :] += self.epsilon_

         # Update if only no priors is provided
         if self.priors is None:
@@ -231,18 +230,18 @@ def _update_mean_variance(self, n_past, mu, var, X, sample_weight=None, n_noisy=
         new_var = np.zeros((n_features,))

         for feature in range(n_features):
-            _X = X[:, feature]
+            temp_x = X[:, feature]
             lower, upper = self.bounds[0][feature], self.bounds[1][feature]
             local_diameter = upper - lower

             mech_mu = LaplaceTruncated(epsilon=local_epsilon, delta=0, sensitivity=local_diameter,
                                        lower=lower * n_noisy, upper=upper * n_noisy)
-            _mu = mech_mu.randomise(_X.sum()) / n_noisy
+            _mu = mech_mu.randomise(temp_x.sum()) / n_noisy

             local_sq_sens = max(_mu - lower, upper - _mu) ** 2
             mech_var = LaplaceBoundedDomain(epsilon=local_epsilon, delta=0, sensitivity=local_sq_sens, lower=0,
                                             upper=local_sq_sens * n_noisy)
-            _var = mech_var.randomise(((_X - _mu) ** 2).sum()) / n_noisy
+            _var = mech_var.randomise(((temp_x - _mu) ** 2).sum()) / n_noisy

             new_mu[feature] = _mu
             new_var[feature] = _var
@@ -285,3 +284,8 @@ def _noisy_class_counts(self, y):
                 i = (i - sgn) % len(unique_y)

         return noisy_counts
+
+    @property
+    def sigma_(self):
+        # Todo: Consider removing when sklearn v1.0 is required
+        return self.var_
diff --git a/diffprivlib/models/pca.py b/diffprivlib/models/pca.py
index 05fa8f5..7f562d1 100644
--- a/diffprivlib/models/pca.py
+++ b/diffprivlib/models/pca.py
@@ -53,12 +53,12 @@
 from diffprivlib.accountant import BudgetAccountant
 from diffprivlib.models.utils import covariance_eig
 from diffprivlib.tools import mean
-from diffprivlib.utils import warn_unused_args, copy_docstring, PrivacyLeakWarning
-from diffprivlib.validation import clip_to_norm, check_bounds
+from diffprivlib.utils import copy_docstring, PrivacyLeakWarning
+from diffprivlib.validation import DiffprivlibMixin


 # noinspection PyPep8Naming
-class PCA(sk_pca.PCA):
+class PCA(sk_pca.PCA, DiffprivlibMixin):
     r"""Principal component analysis (PCA) with differential privacy.

     This class is a child of :obj:`sklearn.decomposition.PCA`, with amendments to allow for the implementation of
@@ -86,13 +86,6 @@ class PCA(sk_pca.PCA):

             n_components == min(n_samples, n_features) - 1

-    centered : bool, default: False
-        If False, the data will be centered before calculating the principal components. This will be calculated with
-        differential privacy, consuming privacy budget from epsilon.
-
-        If True, the data is assumed to have been centered previously (e.g. using :class:`.StandardScaler`), and
-        therefore will not require the consumption of privacy budget to calculate the mean.
-
     epsilon : float, default: 1.0
         Privacy parameter :math:`\epsilon`. If ``centered=False``, half of epsilon is used to calculate the
         differentially private mean to center the data prior to the calculation of principal components.
@@ -105,6 +98,13 @@ class PCA(sk_pca.PCA):
         :class:`.PrivacyLeakWarning`, as it reveals information about the data. To preserve differential privacy
         fully, `data_norm` should be selected independently of the data, i.e. with domain knowledge.

+    centered : bool, default: False
+        If False, the data will be centered before calculating the principal components. This will be calculated with
+        differential privacy, consuming privacy budget from epsilon.
+
+        If True, the data is assumed to have been centered previously (e.g. using :class:`.StandardScaler`), and
+        therefore will not require the consumption of privacy budget to calculate the mean.
+
     bounds: tuple, optional
         Bounds of the data, provided as a tuple of the form (min, max). `min` and `max` can either be scalars, covering
         the min/max of the entire data, or vectors with one entry per feature. If not provided, the bounds are computed
@@ -184,7 +184,7 @@ class PCA(sk_pca.PCA):
           component analysis." In 2016 IEEE International Conference on Acoustics, Speech and Signal Processing
          (ICASSP), pp. 2339-2343. IEEE, 2016.
""" - def __init__(self, n_components=None, centered=False, epsilon=1.0, data_norm=None, bounds=None, copy=True, + def __init__(self, n_components=None, *, epsilon=1.0, data_norm=None, centered=False, bounds=None, copy=True, whiten=False, random_state=None, accountant=None, **unused_args): super().__init__(n_components=n_components, copy=copy, whiten=whiten, svd_solver='full', tol=0.0, iterated_power='auto', random_state=random_state) @@ -194,7 +194,7 @@ def __init__(self, n_components=None, centered=False, epsilon=1.0, data_norm=Non self.bounds = bounds self.accountant = BudgetAccountant.load_default(accountant) - warn_unused_args(unused_args) + self._warn_unused_args(unused_args) def _fit_full(self, X, n_components): self.accountant.check(self.epsilon, 0) @@ -213,7 +213,7 @@ def _fit_full(self, X, n_components): self.bounds = (np.min(X, axis=0), np.max(X, axis=0)) - self.bounds = check_bounds(self.bounds, n_features) + self.bounds = self._check_bounds(self.bounds, n_features) self.mean_ = mean(X, epsilon=self.epsilon / 2, bounds=self.bounds, axis=0, accountant=BudgetAccountant()) X -= self.mean_ @@ -224,28 +224,25 @@ def _fit_full(self, X, n_components): "privacy leakage, specify `data_norm` at initialisation.", PrivacyLeakWarning) self.data_norm = np.linalg.norm(X, axis=1).max() - X = clip_to_norm(X, self.data_norm) + X = self._clip_to_norm(X, self.data_norm) - s, u = covariance_eig(X, epsilon=self.epsilon if self.centered else self.epsilon / 2, norm=self.data_norm, - dims=n_components if isinstance(n_components, Integral) else None) - u, _ = svd_flip(u, np.zeros_like(u).T) - s = np.sqrt(s) + sigma_vec, u_mtx = covariance_eig(X, epsilon=self.epsilon if self.centered else self.epsilon / 2, + norm=self.data_norm, + dims=n_components if isinstance(n_components, Integral) else None) + u_mtx, _ = svd_flip(u_mtx, np.zeros_like(u_mtx).T) + sigma_vec = np.sqrt(sigma_vec) - components_ = u.T + components_ = u_mtx.T # Get variance explained by singular values - explained_variance_ = np.sort((s ** 2) / (n_samples - 1))[::-1] + explained_variance_ = np.sort((sigma_vec ** 2) / (n_samples - 1))[::-1] total_var = explained_variance_.sum() explained_variance_ratio_ = explained_variance_ / total_var - singular_values_ = s.copy() # Store the singular values. + singular_values_ = sigma_vec.copy() # Store the singular values. 

         # Post-process the number of components required
         if n_components == 'mle':
-            # TODO: Update when sklearn requirement changes to >= 0.23, removing try...except
-            try:
-                n_components = sk_pca._infer_dimension(explained_variance_, n_samples)
-            except AttributeError:
-                n_components = sk_pca._infer_dimension_(explained_variance_, n_samples, n_features)
+            n_components = sk_pca._infer_dimension(explained_variance_, n_samples)
         elif 0 < n_components < 1.0:
             # number of components for which the cumulated explained
             # variance percentage is superior to the desired threshold
@@ -268,7 +265,7 @@ def _fit_full(self, X, n_components):

         self.accountant.spend(self.epsilon, 0)

-        return u, s[:n_components], u.T
+        return u_mtx, sigma_vec[:n_components], u_mtx.T

     @copy_docstring(sk_pca.PCA.fit_transform)
     def fit_transform(self, X, y=None):
diff --git a/diffprivlib/models/standard_scaler.py b/diffprivlib/models/standard_scaler.py
index a2d51ae..aea9908 100644
--- a/diffprivlib/models/standard_scaler.py
+++ b/diffprivlib/models/standard_scaler.py
@@ -48,15 +48,11 @@
 import numpy as np
 import sklearn.preprocessing as sk_pp
 from sklearn.preprocessing._data import _handle_zeros_in_scale
-from sklearn.utils import check_array
-from sklearn.utils.validation import FLOAT_DTYPES

 from diffprivlib.accountant import BudgetAccountant
-from diffprivlib.utils import PrivacyLeakWarning, warn_unused_args
+from diffprivlib.utils import PrivacyLeakWarning
 from diffprivlib.tools import nanvar, nanmean
-from diffprivlib.validation import clip_to_bounds, check_bounds
-
-range_ = range
+from diffprivlib.validation import DiffprivlibMixin


 def _incremental_mean_and_var(X, epsilon, bounds, last_mean, last_variance, last_sample_count):
@@ -97,7 +93,7 @@ def _incremental_mean_and_var(X, epsilon, bounds, last_mean, last_variance, last


 # noinspection PyPep8Naming,PyAttributeOutsideInit
-class StandardScaler(sk_pp.StandardScaler):
+class StandardScaler(sk_pp.StandardScaler, DiffprivlibMixin):
     """Standardize features by removing the mean and scaling to unit variance, calculated with differential privacy
     guarantees. Differential privacy is guaranteed on the learned scaler with respect to the training sample; the
     transformed output will certainly not satisfy differential privacy.
@@ -171,7 +167,7 @@ class StandardScaler(sk_pp.StandardScaler):
     NaNs are treated as missing values: disregarded in fit, and maintained in transform.
     """  # noqa

-    def __init__(self, epsilon=1.0, bounds=None, copy=True, with_mean=True, with_std=True, accountant=None):
+    def __init__(self, *, epsilon=1.0, bounds=None, copy=True, with_mean=True, with_std=True, accountant=None):
         super().__init__(copy=copy, with_mean=with_mean, with_std=with_std)
         self.epsilon = epsilon
         self.bounds = bounds
@@ -201,24 +197,22 @@ def partial_fit(self, X, y=None, sample_weight=None):
         self.accountant.check(self.epsilon, 0)

         if sample_weight is not None:
-            warn_unused_args("sample_weight")
+            self._warn_unused_args("sample_weight")

         epsilon_0 = self.epsilon / 2 if self.with_std else self.epsilon

-        X = check_array(X, accept_sparse=False, copy=self.copy, estimator=self, dtype=FLOAT_DTYPES,
-                        force_all_finite='allow-nan')
-        # Hotfix for sklearn v 0.23
-        self.n_features_in_ = X.shape[1]
+        X = self._validate_data(X, accept_sparse=False, copy=self.copy, estimator=self, dtype=float,
+                                force_all_finite='allow-nan')

         if self.bounds is None:
-            warnings.warn("Range parameter hasn't been specified, so falling back to determining range from the data.\n"
-                          "This will result in additional privacy leakage. To ensure differential privacy with no "
-                          "additional privacy loss, specify `range` for each valued returned by np.mean().",
+            warnings.warn("Bounds parameter hasn't been specified, so falling back to determining bounds from the "
+                          "data.\n This will result in additional privacy leakage. To ensure differential privacy "
+                          "with no additional privacy loss, specify `bounds` for each value returned by np.mean().",
                           PrivacyLeakWarning)
             self.bounds = (np.min(X, axis=0), np.max(X, axis=0))

-        self.bounds = check_bounds(self.bounds, X.shape[1])
-        X = clip_to_bounds(X, self.bounds)
+        self.bounds = self._check_bounds(self.bounds, X.shape[1])
+        X = self._clip_to_bounds(X, self.bounds)

         # Even in the case of `with_mean=False`, we update the mean anyway. This is needed for the incremental
         # computation of the var See incr_mean_variance_axis and _incremental_mean_variance_axis
diff --git a/diffprivlib/utils.py b/diffprivlib/utils.py
index 6f32b8c..f087cc1 100644
--- a/diffprivlib/utils.py
+++ b/diffprivlib/utils.py
@@ -22,8 +22,6 @@

 import numpy as np

-from diffprivlib.validation import check_epsilon_delta
-

 def global_seed(seed):
     """Sets the seed for all random number generators, to guarantee reproducibility in experiments.
@@ -108,6 +106,8 @@ class Budget(tuple):

     """
     def __new__(cls, epsilon, delta):
+        from diffprivlib.validation import check_epsilon_delta
+
         check_epsilon_delta(epsilon, delta, allow_zero=True)

         return tuple.__new__(cls, (epsilon, delta))
diff --git a/diffprivlib/validation.py b/diffprivlib/validation.py
index 1bf855a..f825246 100644
--- a/diffprivlib/validation.py
+++ b/diffprivlib/validation.py
@@ -22,6 +22,8 @@

 import numpy as np

+from diffprivlib.utils import warn_unused_args
+

 def check_epsilon_delta(epsilon, delta, allow_zero=False):
     """Checks that epsilon and delta are valid values for differential privacy. Throws an error if checks fail,
@@ -203,3 +205,10 @@ def clip_to_bounds(array, bounds):
         clipped_array[:, feature] = np.clip(array[:, feature], lower[feature], upper[feature])

     return clipped_array
+
+
+class DiffprivlibMixin:
+    _check_bounds = staticmethod(check_bounds)
+    _clip_to_norm = staticmethod(clip_to_norm)
+    _clip_to_bounds = staticmethod(clip_to_bounds)
+    _warn_unused_args = staticmethod(warn_unused_args)
diff --git a/setup.py b/setup.py
index e377e74..bbc2762 100644
--- a/setup.py
+++ b/setup.py
@@ -20,11 +20,11 @@ def get_version(file_path):
     raise RuntimeError("Unable to find version string.")


-install_requires = ['numpy >= 1.17.0',
-                    'setuptools >= 39.0.1',
-                    'scikit-learn >= 0.22.0',
-                    'scipy >= 1.2.1',
-                    'joblib >= 0.11',
+install_requires = ['numpy >= 1.19.0',
+                    'setuptools >= 49.0.0',
+                    'scikit-learn >= 0.23.0',
+                    'scipy >= 1.5.0',
+                    'joblib >= 0.16.0',
                     ]

 docs_require = ['sphinx >= 1.4',
diff --git a/tests/mechanisms/test_PermuteAndFlip.py b/tests/mechanisms/test_PermuteAndFlip.py
index 7ac9307..c9e6699 100644
--- a/tests/mechanisms/test_PermuteAndFlip.py
+++ b/tests/mechanisms/test_PermuteAndFlip.py
@@ -176,7 +176,7 @@ def test_distrib_prob(self):

     def test_monotonic_distrib(self):
         epsilon = np.log(2)
-        runs = 20000
+        runs = 40000
         mech1 = self.mech(epsilon=epsilon, utility=[2, 1, 0], sensitivity=1, monotonic=True)
         mech2 = self.mech(epsilon=epsilon, utility=[2, 1, 1], sensitivity=1, monotonic=True)
         counts = np.zeros((2, 3))
diff --git a/tests/models/test_GaussianNB.py b/tests/models/test_GaussianNB.py
index 2ebc58d..579d84f 100644
--- a/tests/models/test_GaussianNB.py
+++ b/tests/models/test_GaussianNB.py
@@ -98,11 +98,17 @@ def test_different_results(self):
         for clf in [clf_dp, clf_non_private]:
             clf.fit(x_train, y_train)

+        # Todo: remove try...except when sklearn v1.0 is required
+        try:
+            nonprivate_var = clf_non_private.var_
+        except AttributeError:
+            nonprivate_var = clf_non_private.sigma_
+
         theta_diff = (clf_dp.theta_ - clf_non_private.theta_) ** 2
         self.assertGreater(theta_diff.sum(), 0)

-        sigma_diff = (clf_dp.sigma_ - clf_non_private.sigma_) ** 2
-        self.assertGreater(sigma_diff.sum(), 0)
+        var_diff = (clf_dp.var_ - nonprivate_var) ** 2
+        self.assertGreater(var_diff.sum(), 0)

     @pytest.mark.filterwarnings('ignore: numpy.ufunc size changed')
     def test_with_iris(self):
diff --git a/tests/models/test_KMeans.py b/tests/models/test_KMeans.py
index 49c88bc..f14fa1a 100644
--- a/tests/models/test_KMeans.py
+++ b/tests/models/test_KMeans.py
@@ -11,7 +11,7 @@ def test_not_none(self):

     def test_simple(self):
         global_seed(3141592653)
-        clf = KMeans(epsilon=10, bounds=(0, 1), n_clusters=3)
+        clf = KMeans(3, epsilon=10, bounds=(0, 1))

         X = np.zeros(6000) + 0.1
         X[:4000] = 0.5
@@ -46,7 +46,7 @@ def test_no_bounds(self):
         clf.fit(X)

     def test_predict(self):
-        clf = KMeans(epsilon=1, bounds=(0, 1), n_clusters=3)
+        clf = KMeans(3, epsilon=1, bounds=(0, 1))

         X = np.array([0.1, 0.1, 0.1, 0.1, 0.5, 0.5, 0.5, 0.5, 0.9, 0.9, 0.9]).reshape(-1, 1)
         clf.fit(X)
@@ -60,7 +60,7 @@ def test_predict(self):
         self.assertTrue(0 <= predicted[2] <= 2)

     def test_sample_weights(self):
-        clf = KMeans(30, (0, 1), 3)
+        clf = KMeans(3, epsilon=30, bounds=(0, 1))
         X = np.array([0.1, 0.1, 0.1, 0.1, 0.5, 0.5, 0.5, 0.5, 0.9, 0.9, 0.9]).reshape(-1, 1)

         with self.assertWarns(DiffprivlibCompatibilityWarning):
@@ -68,7 +68,7 @@ def test_sample_weights(self):

     def test_inf_epsilon(self):
         global_seed(3141592653)
-        clf = KMeans(float("inf"), (0, 1), 3)
+        clf = KMeans(3, epsilon=float("inf"), bounds=(0, 1))
         X = np.array([0.1, 0.1, 0.1, 0.1, 0.5, 0.5, 0.5, 0.5, 0.9, 0.9, 0.9]).reshape(-1, 1)

         clf.fit(X)
@@ -82,7 +82,7 @@ def test_many_features(self):
         X = np.random.random(size=(500, 3))
         bounds = (0, 1)

-        clf = KMeans(bounds=bounds, n_clusters=4)
+        clf = KMeans(4, bounds=bounds)
         clf.fit(X)

         centers = clf.cluster_centers_
@@ -96,7 +96,7 @@ def test_accountant(self):
         from diffprivlib.accountant import BudgetAccountant
         acc = BudgetAccountant()

-        clf = KMeans(30, (0, 1), 3, accountant=acc)
+        clf = KMeans(3, epsilon=30, bounds=(0, 1), accountant=acc)
         X = np.array([0.1, 0.1, 0.1, 0.1, 0.5, 0.5, 0.5, 0.5, 0.9, 0.9, 0.9]).reshape(-1, 1)
         clf.fit(X)
@@ -106,7 +106,7 @@ def test_accountant(self):
         self.assertEqual((60, 0), acc.total())

         with BudgetAccountant(15, 0) as acc2:
-            clf2 = KMeans(10, (0, 1), 3)
+            clf2 = KMeans(3, epsilon=10, bounds=(0, 1))
             clf2.fit(X)
             self.assertEqual((10, 0), acc2.total())
diff --git a/tests/models/test_LinearRegression.py b/tests/models/test_LinearRegression.py
index 7636dac..b294de3 100644
--- a/tests/models/test_LinearRegression.py
+++ b/tests/models/test_LinearRegression.py
@@ -104,7 +104,7 @@ def test_same_results(self):

         predict1 = clf.predict(X_test)

-        clf2 = linear_model.LinearRegression(normalize=False)
+        clf2 = linear_model.LinearRegression()
         clf2.fit(X_train, y_train)

         predict2 = clf2.predict(X_test)
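Usage sketch (illustrative, not part of the patch): the snippet below shows how the estimators are constructed once these changes are applied, mirroring the updated tests. It assumes a diffprivlib build that already contains this patch; the toy arrays are invented purely for illustration.

    import numpy as np
    from diffprivlib.models import GaussianNB, KMeans

    X = np.array([0.1, 0.1, 0.5, 0.5, 0.9, 0.9]).reshape(-1, 1)
    y = np.array([0, 0, 1, 1, 2, 2])

    # Privacy arguments are now keyword-only; n_clusters stays positional.
    kmeans = KMeans(3, epsilon=10, bounds=(0, 1)).fit(X)

    # GaussianNB stores per-class variances in var_; sigma_ remains as an alias.
    nb = GaussianNB(epsilon=1.0, bounds=(0, 1)).fit(X, y)
    assert np.allclose(nb.sigma_, nb.var_)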