[MNT] Diffprivlib 0.6.2 (#77)

* Deprecating n_features_ attribute in PCA
* Updating PCA test
* Adding parameter constraints and validation
* Updating github actions versions
* Updating more github actions versions
* Adding library legacy tests
* Adding tests to support Python 3.11
* Removing unreachable branch
* Updating version number
IBM · Dec 9, 2022 · 9369ae7 · 9369ae7
1 parent 049f463
commit 9369ae7
Show file tree

Hide file tree

Showing 16 changed files with 98 additions and 43 deletions.
diff --git a/.github/workflows/code.yml b/.github/workflows/code.yml
@@ -14,10 +14,10 @@ jobs:
   build:
     runs-on: ubuntu-latest
     steps:
-    - uses: actions/checkout@v2
-    - uses: actions/setup-python@v2
+    - uses: actions/checkout@v3
+    - uses: actions/setup-python@v4
       with:
-        python-version: 3.9
+        python-version: '3.10'
     - name: Install dependencies
       run: |
         python -m pip install --upgrade pip
@@ -28,7 +28,7 @@ jobs:
       run: |
         pytest --cov-report=xml --cov=diffprivlib --cov-append
     - name: Codecov upload
-      uses: codecov/codecov-action@v2.1.0
+      uses: codecov/codecov-action@v3
     - name: pycodestyle
       run: pycodestyle --max-line-length=120 diffprivlib
     - name: pylint

diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml
@@ -10,10 +10,10 @@ jobs:
     runs-on: ubuntu-latest
 
     steps:
-      - uses: actions/checkout@v2
-      - uses: actions/setup-python@v2
+      - uses: actions/checkout@v3
+      - uses: actions/setup-python@v4
         with:
-          python-version: 3.9
+          python-version: '3.10'
 
       - name: Update pip
         run: python -m pip install --upgrade pip
@@ -33,6 +33,6 @@ jobs:
           .
       - name: Publish distribution 📦 to PyPI
         if: startsWith(github.ref, 'refs/tags')
-        uses: pypa/gh-action-pypi-publish@master
+        uses: pypa/gh-action-pypi-publish@release/v1
         with:
           password: ${{ secrets.PYPI_API_TOKEN }}
diff --git a/.github/workflows/general.yml b/.github/workflows/general.yml
@@ -17,12 +17,12 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: ['3.8', '3.9', '3.10']
+        python-version: ['3.8', '3.9', '3.10', '3.11']
 
     steps:
-    - uses: actions/checkout@v2
+    - uses: actions/checkout@v3
     - name: Set up Python ${{ matrix.python-version }}
-      uses: actions/setup-python@v2
+      uses: actions/setup-python@v4
       with:
         python-version: ${{ matrix.python-version }}
     - name: Install dependencies

diff --git a/.github/workflows/libraries.yml b/.github/workflows/libraries.yml
@@ -25,13 +25,19 @@ jobs:
           - library: numpy
             version: 1.22.4
             python-version: '3.10'
+          - library: numpy
+            version: 1.23.5
+            python-version: '3.10'
 
           - library: scikit-learn
             version: 0.24.2
             python-version: 3.9
           - library: scikit-learn
             version: 1.0.2
             python-version: '3.10'
+          - library: scikit-learn
+            version: 1.1.3
+            python-version: '3.10'
 
           - library: scipy
             version: 1.7.3
@@ -44,9 +50,9 @@ jobs:
             python-version: '3.10'
 
     steps:
-    - uses: actions/checkout@v2
+    - uses: actions/checkout@v3
     - name: Set up Python ${{ matrix.python-version }}
-      uses: actions/setup-python@v2
+      uses: actions/setup-python@v4
       with:
         python-version: ${{ matrix.python-version }}
     - name: Install dependencies

diff --git a/README.md b/README.md
@@ -17,7 +17,7 @@ Use diffprivlib if you are looking to:
 - Explore the impact of differential privacy on machine learning accuracy using classification and clustering models 
 - Build your own differential privacy applications, using our extensive collection of mechanisms
 
-Diffprivlib supports Python versions 3.8 to 3.10.
+Diffprivlib supports Python versions 3.8 to 3.11.
 
 ## Getting started: [Machine learning with differential privacy in 30 seconds](https://github.com/IBM/differential-privacy-library/blob/main/notebooks/30seconds.ipynb)
 We're using the [Iris dataset](https://archive.ics.uci.edu/ml/datasets/iris), so let's load it and perform an 80/20 train/test split.

diff --git a/diffprivlib/__init__.py b/diffprivlib/__init__.py
@@ -29,4 +29,4 @@
 from diffprivlib import tools
 from diffprivlib.accountant import BudgetAccountant
 
-__version__ = '0.6.1'
+__version__ = '0.6.2'
diff --git a/diffprivlib/mechanisms/gaussian.py b/diffprivlib/mechanisms/gaussian.py
@@ -175,36 +175,33 @@ def b_minus(val):
 
         delta_0 = b_plus(0)
 
-        if delta_0 == 0:
-            alpha = 1
+        if delta_0 < 0:
+            target_func = b_plus
         else:
-            if delta_0 < 0:
-                target_func = b_plus
-            else:
-                target_func = b_minus
+            target_func = b_minus
 
-            # Find the starting interval by doubling the initial size until the target_func sign changes, as suggested
-            # in the paper
-            left = 0
-            right = 1
+        # Find the starting interval by doubling the initial size until the target_func sign changes, as suggested
+        # in the paper
+        left = 0
+        right = 1
 
-            while target_func(left) * target_func(right) > 0:
-                left = right
-                right *= 2
+        while target_func(left) * target_func(right) > 0:
+            left = right
+            right *= 2
 
-            # Binary search code copied from mechanisms.LaplaceBoundedDomain
-            old_interval_size = (right - left) * 2
+        # Binary search code copied from mechanisms.LaplaceBoundedDomain
+        old_interval_size = (right - left) * 2
 
-            while old_interval_size > right - left:
-                old_interval_size = right - left
-                middle = (right + left) / 2
+        while old_interval_size > right - left:
+            old_interval_size = right - left
+            middle = (right + left) / 2
 
-                if target_func(middle) * target_func(left) <= 0:
-                    right = middle
-                if target_func(middle) * target_func(right) <= 0:
-                    left = middle
+            if target_func(middle) * target_func(left) <= 0:
+                right = middle
+            if target_func(middle) * target_func(right) <= 0:
+                left = middle
 
-            alpha = np.sqrt(1 + (left + right) / 4) + (-1 if delta_0 < 0 else 1) * np.sqrt((left + right) / 4)
+        alpha = np.sqrt(1 + (left + right) / 4) + (-1 if delta_0 < 0 else 1) * np.sqrt((left + right) / 4)
 
         return alpha * self.sensitivity / np.sqrt(2 * self.epsilon)
 

diff --git a/diffprivlib/models/forest.py b/diffprivlib/models/forest.py
@@ -128,6 +128,9 @@ class RandomForestClassifier(skRandomForestClassifier, DiffprivlibMixin):  # pyl
 
     """
 
+    _parameter_constraints = DiffprivlibMixin._copy_parameter_constraints(
+        skRandomForestClassifier, "n_estimators", "n_jobs", "verbose", "random_state", "warm_start")
+
     def __init__(self, n_estimators=10, *, epsilon=1.0, bounds=None, classes=None, n_jobs=1, verbose=0, accountant=None,
                  random_state=None, max_depth=5, warm_start=False, shuffle=False, **unused_args):
         super().__init__(
@@ -145,7 +148,11 @@ def __init__(self, n_estimators=10, *, epsilon=1.0, bounds=None, classes=None, n
         self.shuffle = shuffle
         self.accountant = BudgetAccountant.load_default(accountant)
 
-        self.base_estimator = DecisionTreeClassifier()
+        # Todo: Remove when scikit-learn v1.2 is a min requirement
+        if hasattr(self, "estimator"):
+            self.estimator = DecisionTreeClassifier()
+        else:
+            self.base_estimator = DecisionTreeClassifier()
         self.estimator_params = ("max_depth", "epsilon", "bounds", "classes")
 
         self._warn_unused_args(unused_args)
@@ -170,6 +177,7 @@ def fit(self, X, y, sample_weight=None):
         self : object
             Fitted estimator.
         """
+        self._validate_params()
         self.accountant.check(self.epsilon, 0)
 
         if sample_weight is not None:
@@ -250,6 +258,7 @@ def fit(self, X, y, sample_weight=None):
         # that case. However, for joblib 0.12+ we respect any
         # parallel_backend contexts set at a higher level,
         # since correctness does not rely on using threads.
+        # Todo: Remove when scikit-learn v1.1 is a min requirement
         try:
             trees = Parallel(n_jobs=self.n_jobs, verbose=self.verbose, prefer="threads")(
                 delayed(_parallel_build_trees)(
@@ -332,9 +341,12 @@ class DecisionTreeClassifier(skDecisionTreeClassifier, DiffprivlibMixin):
 
     """
 
+    _parameter_constraints = DiffprivlibMixin._copy_parameter_constraints(
+        skDecisionTreeClassifier, "max_depth", "random_state")
+
     def __init__(self, max_depth=5, *, epsilon=1, bounds=None, classes=None, random_state=None, accountant=None,
                  **unused_args):
-        # TODO: Remove try...except when sklearn v1.0 is min-requirement
+        # Todo: Remove when scikit-learn v1.0 is a min requirement
         try:
             super().__init__(  # pylint: disable=unexpected-keyword-arg
                 criterion=None,
@@ -391,6 +403,7 @@ def fit(self, X, y, sample_weight=None, check_input=True):
         self : DecisionTreeClassifier
             Fitted estimator.
         """
+        self._validate_params()
         random_state = check_random_state(self.random_state)
 
         self.accountant.check(self.epsilon, 0)

diff --git a/diffprivlib/models/k_means.py b/diffprivlib/models/k_means.py
@@ -78,6 +78,9 @@ class KMeans(sk_cluster.KMeans, DiffprivlibMixin):
 
     """
 
+    _parameter_constraints = DiffprivlibMixin._copy_parameter_constraints(
+        sk_cluster.KMeans, "n_clusters", "random_state")
+
     def __init__(self, n_clusters=8, *, epsilon=1.0, bounds=None, random_state=None, accountant=None, **unused_args):
         super().__init__(n_clusters=n_clusters, random_state=random_state)
 
@@ -113,6 +116,7 @@ def fit(self, X, y=None, sample_weight=None):
         self : class
 
         """
+        self._validate_params()
         self.accountant.check(self.epsilon, 0)
 
         if sample_weight is not None:

diff --git a/diffprivlib/models/linear_regression.py b/diffprivlib/models/linear_regression.py
@@ -221,6 +221,10 @@ class LinearRegression(sk_lr.LinearRegression, DiffprivlibMixin):
         regression analysis under differential privacy." arXiv preprint arXiv:1208.0219 (2012).
 
     """
+
+    _parameter_constraints = DiffprivlibMixin._copy_parameter_constraints(
+        sk_lr.LinearRegression, "fit_intercept", "copy_X")
+
     def __init__(self, *, epsilon=1.0, bounds_X=None, bounds_y=None, fit_intercept=True, copy_X=True, random_state=None,
                  accountant=None, **unused_args):
         super().__init__(fit_intercept=fit_intercept, copy_X=copy_X, n_jobs=None)
@@ -253,6 +257,7 @@ def fit(self, X, y, sample_weight=None):
         self : returns an instance of self.
 
         """
+        self._validate_params()
         self.accountant.check(self.epsilon, 0)
 
         if sample_weight is not None:

diff --git a/diffprivlib/models/logistic_regression.py b/diffprivlib/models/logistic_regression.py
@@ -177,6 +177,10 @@ class LogisticRegression(linear_model.LogisticRegression, DiffprivlibMixin):
 
     """
 
+    _parameter_constraints = DiffprivlibMixin._copy_parameter_constraints(
+        linear_model.LogisticRegression, "tol", "C", "fit_intercept", "max_iter", "verbose", "warm_start", "n_jobs",
+        "random_state")
+
     def __init__(self, *, epsilon=1.0, data_norm=None, tol=1e-4, C=1.0, fit_intercept=True, max_iter=100, verbose=0,
                  warm_start=False, n_jobs=None, random_state=None, accountant=None, **unused_args):
         super().__init__(penalty='l2', dual=False, tol=tol, C=C, fit_intercept=fit_intercept, intercept_scaling=1.0,
@@ -209,13 +213,15 @@ def fit(self, X, y, sample_weight=None):
         self : class
 
         """
+        self._validate_params()
         self.accountant.check(self.epsilon, 0)
 
         if sample_weight is not None:
             self._warn_unused_args("sample_weight")
 
         random_state = check_random_state(self.random_state)
 
+        # Todo: Remove when scikit-learn v1.2 is a min requirement
         if not isinstance(self.C, numbers.Real) or self.C < 0:
             raise ValueError(f"Penalty term must be positive; got (C={self.C})")
         if not isinstance(self.max_iter, numbers.Integral) or self.max_iter < 0:

diff --git a/diffprivlib/models/pca.py b/diffprivlib/models/pca.py
@@ -159,7 +159,7 @@ class PCA(sk_pca.PCA, DiffprivlibMixin):
         svd_solver == 'full') this number is estimated from input data.  Otherwise it equals the parameter
         n_components, or the lesser value of n_features and n_samples if n_components is None.
 
-    n_features_ : int
+    n_features_in_ : int
         Number of features in the training data.
 
     n_samples_ : int
@@ -184,6 +184,10 @@ class PCA(sk_pca.PCA, DiffprivlibMixin):
         component analysis." In 2016 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP),
         pp. 2339-2343. IEEE, 2016.
     """
+
+    _parameter_constraints = DiffprivlibMixin._copy_parameter_constraints(
+        sk_pca.PCA, "n_components", "copy", "whiten", "random_state")
+
     def __init__(self, n_components=None, *, epsilon=1.0, data_norm=None, centered=False, bounds=None, copy=True,
                  whiten=False, random_state=None, accountant=None, **unused_args):
         super().__init__(n_components=n_components, copy=copy, whiten=whiten, svd_solver='full', tol=0.0,
@@ -196,6 +200,11 @@ def __init__(self, n_components=None, *, epsilon=1.0, data_norm=None, centered=F
 
         self._warn_unused_args(unused_args)
 
+    # Todo: Remove when scikit-learn v1.2 is a min requirement
+    @property
+    def n_features_(self):
+        return self.n_features_in_
+
     def _fit_full(self, X, n_components):
         self.accountant.check(self.epsilon, 0)
 
@@ -259,7 +268,7 @@ def _fit_full(self, X, n_components):
         else:
             self.noise_variance_ = 0.
 
-        self.n_samples_, self.n_features_ = n_samples, n_features
+        self.n_samples_ = n_samples
         self.components_ = components_[:n_components]
         self.n_components_ = n_components
         self.explained_variance_ = explained_variance_[:n_components]

diff --git a/diffprivlib/models/standard_scaler.py b/diffprivlib/models/standard_scaler.py
@@ -200,6 +200,7 @@ def partial_fit(self, X, y=None, sample_weight=None):
             Ignored by diffprivlib.  Present for consistency with sklearn API.
 
         """
+        self._validate_params()
         self.accountant.check(self.epsilon, 0)
 
         if sample_weight is not None:

diff --git a/diffprivlib/validation.py b/diffprivlib/validation.py
@@ -206,3 +206,16 @@ class DiffprivlibMixin:  # pylint: disable=too-few-public-methods
     _clip_to_norm = staticmethod(clip_to_norm)
     _clip_to_bounds = staticmethod(clip_to_bounds)
     _warn_unused_args = staticmethod(warn_unused_args)
+
+    # todo: remove when scikit-learn v1.2 is a min requirement
+    def _validate_params(self):
+        pass
+
+    @staticmethod
+    def _copy_parameter_constraints(cls, *args):
+        """Copies the parameter constraints for `*args` from `cls`
+        """
+        if not hasattr(cls, "_parameter_constraints"):
+            return {}
+
+        return {k: cls._parameter_constraints[k] for k in args if k in cls._parameter_constraints}
diff --git a/setup.py b/setup.py
@@ -62,6 +62,7 @@ def get_version(file_path):
         'Programming Language :: Python :: 3.8',
         'Programming Language :: Python :: 3.9',
         'Programming Language :: Python :: 3.10',
+        'Programming Language :: Python :: 3.11',
         'Topic :: Software Development :: Libraries',
         'Topic :: Software Development :: Libraries :: Python Modules',
         'Topic :: Scientific/Engineering',

diff --git a/tests/models/test_PCA.py b/tests/models/test_PCA.py
@@ -88,7 +88,7 @@ def test_inf_epsilon(self):
             self.assertAlmostEqual(clf.score(X), sk_clf.score(X))
             self.assertTrue(np.allclose(clf.get_precision(), sk_clf.get_precision()))
             self.assertTrue(np.allclose(clf.get_covariance(), sk_clf.get_covariance()))
-            self.assertTrue(np.allclose(np.abs((clf.components_ / sk_clf.components_).sum(axis=1)), clf.n_features_))
+            self.assertTrue(np.allclose(np.abs((clf.components_ / sk_clf.components_).sum(axis=1)), clf.n_features_in_))
 
     def test_big_epsilon(self):
         rng = check_random_state(2)