Updating dependencies #51

Merged (7 commits, Sep 10, 2021)
3 changes: 2 additions & 1 deletion diffprivlib/mechanisms/laplace.py
@@ -149,7 +149,8 @@ def randomise(self, value):
self._check_all(value)

scale = self.sensitivity / (self.epsilon - np.log(1 - self.delta))
standard_laplace = self._laplace_sampler(self._rng.random(), self._rng.random(), self._rng.random(), self._rng.random())
standard_laplace = self._laplace_sampler(self._rng.random(), self._rng.random(), self._rng.random(),
self._rng.random())

return value - scale * standard_laplace

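The only change in laplace.py is a line-length rewrap of the standard-Laplace sampling call; the noise scale is unchanged. For reference, a minimal sketch of how this mechanism is exercised (the epsilon, delta and sensitivity values below are illustrative assumptions, not taken from the PR):

```python
# Minimal sketch; parameter values are assumptions chosen for illustration.
from diffprivlib.mechanisms import Laplace

mech = Laplace(epsilon=1.0, delta=0.0, sensitivity=1.0)
# With delta = 0, scale = sensitivity / (epsilon - log(1 - delta)) = 1.0.
noisy = mech.randomise(0.5)
print(noisy)
```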
25 changes: 12 additions & 13 deletions diffprivlib/models/k_means.py
@@ -22,22 +22,24 @@

import numpy as np
import sklearn.cluster as sk_cluster
from sklearn.utils import check_array

from diffprivlib.accountant import BudgetAccountant
from diffprivlib.mechanisms import LaplaceBoundedDomain, GeometricFolded
from diffprivlib.utils import PrivacyLeakWarning, warn_unused_args
from diffprivlib.validation import check_bounds, clip_to_bounds
from diffprivlib.utils import PrivacyLeakWarning
from diffprivlib.validation import DiffprivlibMixin


class KMeans(sk_cluster.KMeans):
class KMeans(sk_cluster.KMeans, DiffprivlibMixin):
r"""K-Means clustering with differential privacy.

Implements the DPLloyd approach presented in [SCL16]_, leveraging the :class:`sklearn.cluster.KMeans` class for full
integration with Scikit Learn.

Parameters
----------
n_clusters : int, default: 8
The number of clusters to form as well as the number of centroids to generate.

epsilon : float, default: 1.0
Privacy parameter :math:`\epsilon`.

@@ -46,9 +48,6 @@ class KMeans(sk_cluster.KMeans):
the min/max of the entire data, or vectors with one entry per feature. If not provided, the bounds are computed
on the data when ``.fit()`` is first called, resulting in a :class:`.PrivacyLeakWarning`.

n_clusters : int, default: 8
The number of clusters to form as well as the number of centroids to generate.

accountant : BudgetAccountant, optional
Accountant to keep track of privacy budget.

@@ -75,14 +74,14 @@ class KMeans(sk_cluster.KMeans):

"""

def __init__(self, epsilon=1.0, bounds=None, n_clusters=8, accountant=None, **unused_args):
def __init__(self, n_clusters=8, *, epsilon=1.0, bounds=None, accountant=None, **unused_args):
super().__init__(n_clusters=n_clusters)

self.epsilon = epsilon
self.bounds = bounds
self.accountant = BudgetAccountant.load_default(accountant)

warn_unused_args(unused_args)
self._warn_unused_args(unused_args)

self.cluster_centers_ = None
self.bounds_processed = None
@@ -113,11 +112,11 @@ def fit(self, X, y=None, sample_weight=None):
self.accountant.check(self.epsilon, 0)

if sample_weight is not None:
warn_unused_args("sample_weight")
self._warn_unused_args("sample_weight")

del y

X = check_array(X, accept_sparse=False, dtype=[np.float64, np.float32])
X = self._validate_data(X, accept_sparse=False, dtype=[np.float64, np.float32])
n_samples, n_dims = X.shape

if n_samples < self.n_clusters:
@@ -131,8 +130,8 @@ def fit(self, X, y=None, sample_weight=None):
"privacy leakage, specify `bounds` for each dimension.", PrivacyLeakWarning)
self.bounds = (np.min(X, axis=0), np.max(X, axis=0))

self.bounds = check_bounds(self.bounds, n_dims, min_separation=1e-5)
X = clip_to_bounds(X, self.bounds)
self.bounds = self._check_bounds(self.bounds, n_dims, min_separation=1e-5)
X = self._clip_to_bounds(X, self.bounds)

centers = self._init_centers(n_dims)
labels = None
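Beyond the switch to the DiffprivlibMixin helpers, the KMeans constructor now mirrors scikit-learn by taking n_clusters first and making the differential-privacy arguments keyword-only. A usage sketch under the new signature (data, epsilon and bounds are invented for illustration):

```python
# Usage sketch for the post-PR signature; the data and parameter values are assumptions.
import numpy as np
from diffprivlib.models import KMeans

rng = np.random.default_rng(0)
X = rng.random((100, 2))

# Passing bounds explicitly avoids the PrivacyLeakWarning raised when fit()
# has to compute them from the data.
clf = KMeans(n_clusters=3, epsilon=1.0, bounds=(np.zeros(2), np.ones(2)))
clf.fit(X)
print(clf.cluster_centers_)
```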
21 changes: 10 additions & 11 deletions diffprivlib/models/linear_regression.py
@@ -48,14 +48,14 @@
import numpy as np
import sklearn.linear_model as sk_lr
from scipy.optimize import minimize
from sklearn.utils import check_X_y, check_array
from sklearn.utils import check_array
from sklearn.utils.validation import FLOAT_DTYPES

from diffprivlib.accountant import BudgetAccountant
from diffprivlib.mechanisms import Laplace, LaplaceFolded
from diffprivlib.tools import mean
from diffprivlib.utils import warn_unused_args, PrivacyLeakWarning
from diffprivlib.validation import check_bounds, clip_to_bounds
from diffprivlib.validation import check_bounds, clip_to_bounds, DiffprivlibMixin


# noinspection PyPep8Naming
@@ -161,7 +161,7 @@ def obj(omega):


# noinspection PyPep8Naming,PyAttributeOutsideInit
class LinearRegression(sk_lr.LinearRegression):
class LinearRegression(sk_lr.LinearRegression, DiffprivlibMixin):
r"""
Ordinary least squares Linear Regression with differential privacy.

@@ -211,17 +211,16 @@ class LinearRegression(sk_lr.LinearRegression):
regression analysis under differential privacy." arXiv preprint arXiv:1208.0219 (2012).

"""
def __init__(self, epsilon=1.0, bounds_X=None, bounds_y=None, fit_intercept=True, copy_X=True, accountant=None,
def __init__(self, *, epsilon=1.0, bounds_X=None, bounds_y=None, fit_intercept=True, copy_X=True, accountant=None,
**unused_args):
super().__init__(fit_intercept=fit_intercept, normalize=False, copy_X=copy_X, n_jobs=None)
super().__init__(fit_intercept=fit_intercept, copy_X=copy_X, n_jobs=None)

self.epsilon = epsilon
self.bounds_X = bounds_X
self.bounds_y = bounds_y
self.accountant = BudgetAccountant.load_default(accountant)
self.__repr__()

warn_unused_args(unused_args)
self._warn_unused_args(unused_args)

def fit(self, X, y, sample_weight=None):
"""
@@ -245,9 +244,9 @@ def fit(self, X, y, sample_weight=None):
self.accountant.check(self.epsilon, 0)

if sample_weight is not None:
warn_unused_args("sample_weight")
self._warn_unused_args("sample_weight")

X, y = check_X_y(X, y, accept_sparse=False, y_numeric=True, multi_output=True)
X, y = self._validate_data(X, y, accept_sparse=False, y_numeric=True, multi_output=True)

if self.bounds_X is None or self.bounds_y is None:
warnings.warn(
@@ -262,8 +261,8 @@ def fit(self, X, y, sample_weight=None):
if self.bounds_y is None:
self.bounds_y = (np.min(y, axis=0), np.max(y, axis=0))

self.bounds_X = check_bounds(self.bounds_X, X.shape[1])
self.bounds_y = check_bounds(self.bounds_y, y.shape[1] if y.ndim > 1 else 1)
self.bounds_X = self._check_bounds(self.bounds_X, X.shape[1])
self.bounds_y = self._check_bounds(self.bounds_y, y.shape[1] if y.ndim > 1 else 1)

n_features = X.shape[1]
n_targets = y.shape[1] if y.ndim > 1 else 1
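LinearRegression drops the normalize=False argument from the super().__init__ call (deprecated in newer scikit-learn releases), removes the stray self.__repr__() call, and routes validation through _validate_data, with all constructor arguments now keyword-only. A sketch of the resulting call pattern (toy data and bounds are assumptions):

```python
# Usage sketch under the keyword-only constructor above; values are assumptions.
import numpy as np
from diffprivlib.models import LinearRegression

rng = np.random.default_rng(0)
X = rng.random((50, 2))
y = X @ np.array([1.0, -2.0]) + 0.5

# bounds_X / bounds_y supplied up front so fit() need not infer them from the
# data, which would trigger a PrivacyLeakWarning.
model = LinearRegression(epsilon=1.0, bounds_X=(np.zeros(2), np.ones(2)), bounds_y=(-1.5, 1.5))
model.fit(X, y)
print(model.coef_, model.intercept_)
```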
18 changes: 9 additions & 9 deletions diffprivlib/models/logistic_regression.py
@@ -52,17 +52,17 @@
from sklearn.exceptions import ConvergenceWarning
from sklearn import linear_model
from sklearn.linear_model._logistic import _logistic_loss_and_grad
from sklearn.utils import check_X_y, check_array, check_consistent_length
from sklearn.utils import check_array, check_consistent_length
from sklearn.utils.fixes import _joblib_parallel_args
from sklearn.utils.multiclass import check_classification_targets

from diffprivlib.accountant import BudgetAccountant
from diffprivlib.mechanisms import Vector
from diffprivlib.utils import PrivacyLeakWarning, DiffprivlibCompatibilityWarning, warn_unused_args
from diffprivlib.validation import clip_to_norm
from diffprivlib.validation import DiffprivlibMixin


class LogisticRegression(linear_model.LogisticRegression):
class LogisticRegression(linear_model.LogisticRegression, DiffprivlibMixin):
r"""Logistic Regression (aka logit, MaxEnt) classifier with differential privacy.

This class implements regularised logistic regression using :ref:`Scipy's L-BFGS-B algorithm
@@ -166,7 +166,7 @@ class LogisticRegression(linear_model.LogisticRegression):

"""

def __init__(self, epsilon=1.0, data_norm=None, tol=1e-4, C=1.0, fit_intercept=True, max_iter=100, verbose=0,
def __init__(self, *, epsilon=1.0, data_norm=None, tol=1e-4, C=1.0, fit_intercept=True, max_iter=100, verbose=0,
warm_start=False, n_jobs=None, accountant=None, **unused_args):
super().__init__(penalty='l2', dual=False, tol=tol, C=C, fit_intercept=fit_intercept, intercept_scaling=1.0,
class_weight=None, random_state=None, solver='lbfgs', max_iter=max_iter, multi_class='ovr',
@@ -176,7 +176,7 @@ def __init__(self, epsilon=1.0, data_norm=None, tol=1e-4, C=1.0, fit_intercept=T
self.classes_ = None
self.accountant = BudgetAccountant.load_default(accountant)

warn_unused_args(unused_args)
self._warn_unused_args(unused_args)

# noinspection PyAttributeOutsideInit
def fit(self, X, y, sample_weight=None):
@@ -201,7 +201,7 @@ def fit(self, X, y, sample_weight=None):
self.accountant.check(self.epsilon, 0)

if sample_weight is not None:
warn_unused_args("sample_weight")
self._warn_unused_args("sample_weight")

if not isinstance(self.C, numbers.Real) or self.C < 0:
raise ValueError("Penalty term must be positive; got (C=%r)" % self.C)
@@ -211,8 +211,8 @@ def fit(self, X, y, sample_weight=None):
raise ValueError("Tolerance for stopping criteria must be positive; got (tol=%r)" % self.tol)

solver = _check_solver(self.solver, self.penalty, self.dual)
X, y = check_X_y(X, y, accept_sparse='csr', dtype=np.float64, order="C",
accept_large_sparse=solver != 'liblinear')
X, y = self._validate_data(X, y, accept_sparse='csr', dtype=float, order="C",
accept_large_sparse=solver != 'liblinear')
check_classification_targets(y)
self.classes_ = np.unique(y)
_, n_features = X.shape
@@ -223,7 +223,7 @@ def fit(self, X, y, sample_weight=None):
"privacy leakage, specify `data_norm` at initialisation.", PrivacyLeakWarning)
self.data_norm = np.linalg.norm(X, axis=1).max()

X = clip_to_norm(X, self.data_norm)
X = self._clip_to_norm(X, self.data_norm)

self.multi_class = _check_multi_class(self.multi_class, solver, len(self.classes_))

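As with the other models, LogisticRegression moves to the mixin helpers and _validate_data, and its constructor becomes keyword-only. A sketch of fitting with an explicit data_norm (the toy data and norm bound are assumptions):

```python
# Usage sketch; the dataset and data_norm below are illustrative assumptions.
import numpy as np
from diffprivlib.models import LogisticRegression

rng = np.random.default_rng(0)
X = rng.random((100, 2))
y = (X.sum(axis=1) > 1.0).astype(int)

# data_norm bounds the L2 norm of each row; rows exceeding it are clipped in
# fit(), and omitting it triggers a PrivacyLeakWarning.
clf = LogisticRegression(epsilon=1.0, data_norm=float(np.sqrt(2)))
clf.fit(X, y)
print(clf.predict(X[:5]))
```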
38 changes: 21 additions & 17 deletions diffprivlib/models/naive_bayes.py
@@ -22,16 +22,15 @@

import numpy as np
import sklearn.naive_bayes as sk_nb
from sklearn.utils import check_X_y
from sklearn.utils.multiclass import _check_partial_fit_first_call

from diffprivlib.accountant import BudgetAccountant
from diffprivlib.mechanisms import LaplaceBoundedDomain, GeometricTruncated, LaplaceTruncated
from diffprivlib.utils import PrivacyLeakWarning, warn_unused_args
from diffprivlib.validation import check_bounds, clip_to_bounds
from diffprivlib.validation import DiffprivlibMixin


class GaussianNB(sk_nb.GaussianNB):
class GaussianNB(sk_nb.GaussianNB, DiffprivlibMixin):
r"""Gaussian Naive Bayes (GaussianNB) with differential privacy

Inherits the :class:`sklearn.naive_bayes.GaussianNB` class from Scikit Learn and adds noise to satisfy differential
@@ -67,7 +66,7 @@ class GaussianNB(sk_nb.GaussianNB):
theta_ : array, shape (n_classes, n_features)
mean of each feature per class

sigma_ : array, shape (n_classes, n_features)
var_ : array, shape (n_classes, n_features)
variance of each feature per class

epsilon_ : float
@@ -81,7 +80,7 @@ class GaussianNB(sk_nb.GaussianNB):

"""

def __init__(self, epsilon=1.0, bounds=None, priors=None, var_smoothing=1e-9, accountant=None):
def __init__(self, *, epsilon=1.0, bounds=None, priors=None, var_smoothing=1e-9, accountant=None):
super().__init__(priors=priors, var_smoothing=var_smoothing)

self.epsilon = epsilon
@@ -94,16 +93,16 @@ def _partial_fit(self, X, y, classes=None, _refit=False, sample_weight=None):
if sample_weight is not None:
warn_unused_args("sample_weight")

X, y = check_X_y(X, y)
X, y = self._validate_data(X, y)

if self.bounds is None:
warnings.warn("Bounds have not been specified and will be calculated on the data provided. This will "
"result in additional privacy leakage. To ensure differential privacy and no additional "
"privacy leakage, specify bounds for each dimension.", PrivacyLeakWarning)
self.bounds = (np.min(X, axis=0), np.max(X, axis=0))

self.bounds = check_bounds(self.bounds, shape=X.shape[1])
X = clip_to_bounds(X, self.bounds)
self.bounds = self._check_bounds(self.bounds, shape=X.shape[1])
X = self._clip_to_bounds(X, self.bounds)

self.epsilon_ = self.var_smoothing

@@ -114,7 +113,7 @@ def _partial_fit(self, X, y, classes=None, _refit=False, sample_weight=None):
n_features = X.shape[1]
n_classes = len(self.classes_)
self.theta_ = np.zeros((n_classes, n_features))
self.sigma_ = np.zeros((n_classes, n_features))
self.var_ = np.zeros((n_classes, n_features))

self.class_count_ = np.zeros(n_classes, dtype=np.float64)

@@ -136,7 +135,7 @@ def _partial_fit(self, X, y, classes=None, _refit=False, sample_weight=None):
raise ValueError("Number of features %d does not match previous data %d." %
(X.shape[1], self.theta_.shape[1]))
# Put epsilon back in each time
self.sigma_[:, :] -= self.epsilon_
self.var_[:, :] -= self.epsilon_

classes = self.classes_

@@ -155,14 +154,14 @@ def _partial_fit(self, X, y, classes=None, _refit=False, sample_weight=None):

n_i = noisy_class_counts[_i]

new_theta, new_sigma = self._update_mean_variance(self.class_count_[i], self.theta_[i, :],
self.sigma_[i, :], X_i, n_noisy=n_i)
new_theta, new_var = self._update_mean_variance(self.class_count_[i], self.theta_[i, :],
self.var_[i, :], X_i, n_noisy=n_i)

self.theta_[i, :] = new_theta
self.sigma_[i, :] = new_sigma
self.var_[i, :] = new_var
self.class_count_[i] += n_i

self.sigma_[:, :] += self.epsilon_
self.var_[:, :] += self.epsilon_

# Update if only no priors is provided
if self.priors is None:
@@ -231,18 +230,18 @@ def _update_mean_variance(self, n_past, mu, var, X, sample_weight=None, n_noisy=
new_var = np.zeros((n_features,))

for feature in range(n_features):
_X = X[:, feature]
temp_x = X[:, feature]
lower, upper = self.bounds[0][feature], self.bounds[1][feature]
local_diameter = upper - lower

mech_mu = LaplaceTruncated(epsilon=local_epsilon, delta=0, sensitivity=local_diameter,
lower=lower * n_noisy, upper=upper * n_noisy)
_mu = mech_mu.randomise(_X.sum()) / n_noisy
_mu = mech_mu.randomise(temp_x.sum()) / n_noisy

local_sq_sens = max(_mu - lower, upper - _mu) ** 2
mech_var = LaplaceBoundedDomain(epsilon=local_epsilon, delta=0, sensitivity=local_sq_sens, lower=0,
upper=local_sq_sens * n_noisy)
_var = mech_var.randomise(((_X - _mu) ** 2).sum()) / n_noisy
_var = mech_var.randomise(((temp_x - _mu) ** 2).sum()) / n_noisy

new_mu[feature] = _mu
new_var[feature] = _var
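For context, the loop above privatises each feature's mean and variance with a truncated and a bounded-domain Laplace mechanism, respectively. A standalone sketch of the same two calls outside the class (the numeric inputs are assumptions, not values from the PR):

```python
# Standalone sketch of the mechanisms used above; all numbers are illustrative.
from diffprivlib.mechanisms import LaplaceBoundedDomain, LaplaceTruncated

n_noisy = 10             # noisy per-class count
lower, upper = 0.0, 1.0  # assumed feature bounds

# Noisy mean: randomise the feature sum, which lies in [lower * n, upper * n].
mech_mu = LaplaceTruncated(epsilon=0.5, delta=0, sensitivity=upper - lower,
                           lower=lower * n_noisy, upper=upper * n_noisy)
noisy_mean = mech_mu.randomise(4.2) / n_noisy

# Noisy variance: randomise the sum of squared deviations, bounded below by 0.
sq_sens = max(noisy_mean - lower, upper - noisy_mean) ** 2
mech_var = LaplaceBoundedDomain(epsilon=0.5, delta=0, sensitivity=sq_sens,
                                lower=0, upper=sq_sens * n_noisy)
noisy_var = mech_var.randomise(0.8) / n_noisy
```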
@@ -285,3 +284,8 @@ def _noisy_class_counts(self, y):
i = (i - sgn) % len(unique_y)

return noisy_counts

@property
def sigma_(self):
# Todo: Consider removing when sklearn v1.0 is required
return self.var_
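The sigma_ attribute is renamed to var_ in line with newer scikit-learn releases, and the property above keeps sigma_ as a read-only alias (the Todo notes it can be dropped once scikit-learn 1.0 is the minimum). A sketch of fitting the model and reading both names (dataset, labels and bounds are assumptions):

```python
# Usage sketch; the dataset, labels and bounds are illustrative assumptions.
import numpy as np
from diffprivlib.models import GaussianNB

rng = np.random.default_rng(0)
X = rng.random((60, 3))
y = rng.integers(0, 2, size=60)

clf = GaussianNB(epsilon=1.0, bounds=(np.zeros(3), np.ones(3)))
clf.fit(X, y)
print(clf.var_)    # per-class feature variances under the new name
print(clf.sigma_)  # read-only alias provided by the property above
```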