From cb1ab6929a59042a76b80799b777a473f7e9f79e Mon Sep 17 00:00:00 2001 From: Richard Preen Date: Tue, 11 Jun 2024 10:52:45 +0100 Subject: [PATCH] PEP 257 - docstring formatting (#286) * PEP 257 - docstring formatting * clean up after tests * clean up * fix wc test code coverage * clean up docstring example --- .codecov.yml | 1 + aisdc/__init__.py | 1 + aisdc/attacks/__init__.py | 1 + aisdc/attacks/attack.py | 10 +- aisdc/attacks/attack_report_formatter.py | 54 +++--- aisdc/attacks/attribute_attack.py | 92 +++++----- aisdc/attacks/failfast.py | 44 ++--- aisdc/attacks/likelihood_attack.py | 58 +++---- aisdc/attacks/multiple_attacks.py | 27 ++- aisdc/attacks/report.py | 7 +- aisdc/attacks/structural_attack.py | 107 +++++------- aisdc/attacks/target.py | 13 +- aisdc/attacks/worst_case_attack.py | 92 +++++----- aisdc/metrics.py | 47 +++--- aisdc/preprocessing/__init__.py | 1 + aisdc/preprocessing/loaders.py | 72 ++++---- aisdc/safemodel/__init__.py | 2 +- aisdc/safemodel/classifiers/dp_svc.py | 63 ++++--- .../classifiers/new_model_template.py | 24 +-- .../classifiers/safedecisiontreeclassifier.py | 28 +--- aisdc/safemodel/classifiers/safekeras.py | 96 +++-------- .../classifiers/saferandomforestclassifier.py | 17 +- aisdc/safemodel/classifiers/safesvc.py | 10 +- aisdc/safemodel/classifiers/safetf.py | 14 +- aisdc/safemodel/reporting.py | 5 +- aisdc/safemodel/safemodel.py | 157 ++++-------------- docs/source/conf.py | 4 +- examples/MIAandAIA_attacks_example.py | 13 +- examples/attribute_inference_example.py | 10 +- examples/lira_attack_example.py | 13 +- .../safemodel_attack_integration_bothcalls.py | 5 +- examples/worst_case_attack_example.py | 26 ++- pyproject.toml | 15 +- tests/__init__.py | 1 + tests/attacks/__init__.py | 1 + tests/attacks/test_attack_report_formatter.py | 3 +- tests/attacks/test_lira_attack.py | 4 +- tests/attacks/test_metrics.py | 2 - tests/attacks/test_structural_attack.py | 8 +- tests/attacks/test_worst_case_attack.py | 10 +- tests/conftest.py | 8 +- tests/preprocessing/__init__.py | 1 + tests/preprocessing/test_loaders.py | 8 +- tests/safemodel/__init__.py | 1 + tests/safemodel/test_attacks.py | 1 - tests/safemodel/test_safekeras2.py | 2 - tests/safemodel/test_safemodel.py | 7 +- .../generate_disclosure_risk_report.py | 16 +- .../user_story_1_researcher_template.py | 4 +- user_stories/user_story_1/user_story_1_tre.py | 20 +-- .../data_processing_researcher.py | 3 +- .../user_story_2_researcher_template.py | 4 +- user_stories/user_story_2/user_story_2_tre.py | 22 +-- .../user_story_3_researcher_template.py | 4 +- user_stories/user_story_3/user_story_3_tre.py | 20 +-- user_stories/user_story_4/user_story_4_tre.py | 20 +-- .../user_story_7_researcher_template.py | 4 +- user_stories/user_story_7/user_story_7_tre.py | 27 ++- .../data_processing_researcher.py | 3 +- .../user_story_8_researcher_template.py | 4 +- user_stories/user_story_8/user_story_8_tre.py | 27 ++- 61 files changed, 532 insertions(+), 832 deletions(-) diff --git a/.codecov.yml b/.codecov.yml index 8a9841a4..dcfbc6a8 100644 --- a/.codecov.yml +++ b/.codecov.yml @@ -4,4 +4,5 @@ ignore: - "setup.py" - "aisdc/safemodel/classifiers/new_model_template.py" - "aisdc/preprocessing" + - "user_stories" ... 
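For reference, the docstring convention this patch applies throughout is PEP 257: a one-line summary in the imperative mood ending in a period, with any further detail separated from the summary by a blank line. A minimal before/after sketch of the pattern (illustrative only, not copied from the diff):

# Before: indicative mood ("Creates"), no terminal period.
def make_report(self):
    """Creates the output report"""

# After: imperative one-line summary, blank line before the detail.
def make_report(self) -> dict:
    """Create the report.

    If self.report_name is not None, also save the information in
    json and pdf formats.
    """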
diff --git a/aisdc/__init__.py b/aisdc/__init__.py index e69de29b..874a2dca 100644 --- a/aisdc/__init__.py +++ b/aisdc/__init__.py @@ -0,0 +1 @@ +"""Tools for managing the statistical disclosure control of trained ML models.""" diff --git a/aisdc/attacks/__init__.py b/aisdc/attacks/__init__.py index e69de29b..a9aacd77 100644 --- a/aisdc/attacks/__init__.py +++ b/aisdc/attacks/__init__.py @@ -0,0 +1 @@ +"""Collection of attacks for assessing the privacy of trained ML models.""" diff --git a/aisdc/attacks/attack.py b/aisdc/attacks/attack.py index 30e55906..c15a1f10 100644 --- a/aisdc/attacks/attack.py +++ b/aisdc/attacks/attack.py @@ -1,4 +1,4 @@ -"""Attack.py - base class for an attack object.""" +"""Base class for an attack object.""" import inspect import json @@ -13,14 +13,15 @@ def __init__(self): self.attack_config_json_file_name = None def attack(self, target: Target) -> None: - """Method to run an attack.""" + """Run an attack.""" raise NotImplementedError def __str__(self): + """Return the string representation of an attack.""" raise NotImplementedError def _update_params_from_config_file(self) -> None: - """Reads a configuration file and loads it into a dictionary object.""" + """Read a configuration file and load it into a dictionary object.""" with open(self.attack_config_json_file_name, encoding="utf-8") as f: config = json.loads(f.read()) for key, value in config.items(): @@ -38,8 +39,7 @@ def _get_param_names(cls): return parameters def get_params(self): - """ - Get parameters for this attack. + """Get parameters for this attack. Returns ------- diff --git a/aisdc/attacks/attack_report_formatter.py b/aisdc/attacks/attack_report_formatter.py index b694d01d..50cfd1e6 100644 --- a/aisdc/attacks/attack_report_formatter.py +++ b/aisdc/attacks/attack_report_formatter.py @@ -19,11 +19,7 @@ def cleanup_files_for_release( release_dir="release_files", artefacts_dir="training_artefacts", ): - """ - Function that will move any files created throughout the release process and - sort them into appropriate folders. 
- """ - + """Move files created during the release process into appropriate folders.""" if not os.path.exists(release_dir): os.makedirs(release_dir) @@ -44,7 +40,7 @@ def cleanup_files_for_release( class GenerateJSONModule: - """Module that creates and appends to a JSON file.""" + """Create and append to a JSON file.""" def __init__(self, filename=None): self.filename = filename @@ -63,7 +59,6 @@ def __init__(self, filename=None): def add_attack_output(self, incoming_json, class_name): """Add a section of JSON to the file which is already open.""" - # Read the contents of the file and then clear the file with open(self.filename, "r+", encoding="utf-8") as f: file_contents = f.read() @@ -85,7 +80,7 @@ def add_attack_output(self, incoming_json, class_name): json.dump(file_data, f) def get_output_filename(self): - """Returns the filename of the JSON file which has been created.""" + """Return the filename of the JSON file which has been created.""" return self.filename def clean_file(self): @@ -106,19 +101,20 @@ def __init__(self): self.support_release = [] def process_dict(self): - """Function that produces a risk summary output based on analysis in this module.""" + """Produce a risk summary output based on analysis in this module.""" raise NotImplementedError() def get_recommendation(self): - """Function that returns the three recommendation buckets created by this module.""" + """Return the three recommendation buckets created by this module.""" return self.immediate_rejection, self.support_rejection, self.support_release def __str__(self): + """Return the string representation of an analysis module.""" raise NotImplementedError() class FinalRecommendationModule(AnalysisModule): # pylint: disable=too-many-instance-attributes - """Module that generates the first layer of a recommendation report.""" + """Generate the first layer of a recommendation report.""" def __init__(self, report: dict): super().__init__() @@ -239,6 +235,7 @@ def _statistically_significant_auc( self.support_release.append(msg) def process_dict(self): + """Return a dictionary summarising the metrics.""" self._tree_min_samples_leaf(self.MIN_SAMPLES_LEAF_SCORE) self._statistically_significant_auc( self.P_VAL_THRESH, @@ -258,19 +255,15 @@ def process_dict(self): summarised_score = self.INSTANCE_MODEL_WEIGHTING_SCORE output = {} - - # msg = "Final score (scale of 0-5, where 0 is least disclosive and 5 is recommend - # rejection)" - # output[msg] = summarised_score - return output def __str__(self): + """Return string representation of the final recommendation.""" return "Final Recommendation" class SummariseUnivariateMetricsModule(AnalysisModule): - """Module that summarises a set of chosen univariate metrics from the output dictionary.""" + """Summarise a set of chosen univariate metrics from the output dictionary.""" def __init__(self, report: dict, metrics_list=None): super().__init__() @@ -282,6 +275,7 @@ def __init__(self, report: dict, metrics_list=None): self.metrics_list = metrics_list def process_dict(self): + """Return a dictionary summarising the metrics.""" output_dict = {} for k in self.report.keys(): @@ -305,11 +299,12 @@ def process_dict(self): return output_dict def __str__(self): + """Return the string representation of a univariate metrics module.""" return "Summary of Univarite Metrics" class SummariseAUCPvalsModule(AnalysisModule): - """Module that summarises a list of AUC values.""" + """Summarise a list of AUC values.""" def __init__(self, report: dict, p_thresh: float = 0.05, correction: str = "bh"): 
super().__init__() @@ -319,10 +314,7 @@ def __init__(self, report: dict, p_thresh: float = 0.05, correction: str = "bh") self.correction = correction def _n_sig(self, p_val_list: list[float], correction: str = "none") -> int: - """Compute the number of significant p-vals in a list with different corrections for - multiple testing. - """ - + """Compute the number of significant p-vals with different corrections.""" if correction == "none": return len(np.where(np.array(p_val_list) <= self.p_thresh)[0]) if correction == "bh": @@ -362,6 +354,7 @@ def process_dict(self): return output def __str__(self): + """Return the string representation of an AUC p-values module.""" return f"Summary of AUC p-values at p = ({self.p_thresh})" @@ -377,11 +370,12 @@ def get_metric_list(self, input_dict: dict) -> list[float]: return metric_list def __str__(self): + """Return the string representation of an FDIF p-values module.""" return f"Summary of FDIF p-values at p = ({self.p_thresh})" class LogLogROCModule(AnalysisModule): - """Module that generates a log-log plot.""" + """Generate a log-log plot.""" def __init__(self, report: dict, output_folder=None, include_mean=True): super().__init__() @@ -438,11 +432,12 @@ def process_dict(self): return msg def __str__(self): + """Return the string representation of a ROC log plot module.""" return "ROC Log Plot" class GenerateTextReport: - """Module that generates a text report from a JSON input.""" + """Generate a text report from a JSON input.""" def __init__(self): self.text_out = [] @@ -455,8 +450,7 @@ def __init__(self): self.support_release = [] def _process_target_json(self): - """Function that creates a summary of a target model JSON file.""" - + """Create a summary of a target model JSON file.""" model_params_of_interest = [ "C", "kernel", @@ -502,8 +496,7 @@ def _process_target_json(self): self.text_out.append(output_string) def pretty_print(self, report: dict, title) -> str: - """Function that formats JSON code to make it more readable for TREs.""" - + """Format JSON code to make it more readable for TREs.""" returned_string = str(title) + "\n" for key in report.keys(): @@ -515,7 +508,7 @@ def pretty_print(self, report: dict, title) -> str: def process_attack_target_json( self, attack_filename: str, target_filename: str = None ): - """Function that creates a neat summary of an attack JSON file.""" + """Create a neat summary of an attack JSON file.""" self.attack_json_filename = attack_filename with open(attack_filename, encoding="utf-8") as f: @@ -575,8 +568,7 @@ def export_to_file( # pylint: disable=too-many-arguments release_dir="release_files", artefacts_dir="training_artefacts", ): - """Function that takes the input strings collected and combines into a neat text file.""" - + """Take the input strings collected and combine into a neat text file.""" copy_of_text_out = self.text_out self.text_out = [] diff --git a/aisdc/attacks/attribute_attack.py b/aisdc/attacks/attribute_attack.py index ce9205cd..b978adef 100644 --- a/aisdc/attacks/attribute_attack.py +++ b/aisdc/attacks/attribute_attack.py @@ -29,7 +29,7 @@ class AttributeAttack(Attack): - """Class to wrap the attribute inference attack code.""" + """Attribute inference attack.""" def __init__( # pylint: disable = too-many-arguments self, @@ -39,7 +39,7 @@ def __init__( # pylint: disable = too-many-arguments attack_config_json_file_name: str = None, target_path: str = None, ) -> None: - """Constructs an object to execute an attribute inference attack.
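The _n_sig helper above counts significant p-values, with the body of its Benjamini-Hochberg branch falling outside this hunk. As a standalone sketch of what a BH-corrected count looks like under the usual definition (an illustration, not the package's exact code):

import numpy as np

def n_significant_bh(p_vals, p_thresh=0.05):
    # BH compares the k-th smallest p-value against (k / m) * threshold.
    p_sorted = np.sort(np.asarray(p_vals))
    m = len(p_sorted)
    passing = np.nonzero(p_sorted <= np.arange(1, m + 1) / m * p_thresh)[0]
    # Reject every hypothesis up to the largest passing rank.
    return 0 if len(passing) == 0 else int(passing[-1]) + 1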
+ """Construct an object to execute an attribute inference attack. Parameters ---------- @@ -68,12 +68,13 @@ def __init__( # pylint: disable = too-many-arguments self.metadata: dict = {} def __str__(self): + """Return the name of the attack.""" return "Attribute inference attack" def attack(self, target: Target) -> None: - """Programmatic attack entry point. + """Run attribute inference attack. - To be used when code has access to Target class and trained target model + To be used when code has access to Target class and trained target model. Parameters ---------- @@ -83,22 +84,20 @@ def attack(self, target: Target) -> None: self.attack_metrics = _attribute_inference(target, self.n_cpu) def _construct_metadata(self) -> None: - """Constructs the metadata object. Called by the reporting method.""" + """Construct the metadata object.""" self.metadata = {} self.metadata["experiment_details"] = {} self.metadata["experiment_details"] = self.get_params() - self.metadata["attack"] = str(self) def make_report(self) -> dict: """Create the report. - Creates the output report. If self.report_name is not None, it will also save the - information in json and pdf formats. + Creates the output report. If self.report_name is not None, it will + also save the information in json and pdf formats. Returns ------- - output : dict Dictionary containing all attack output. """ @@ -129,20 +128,16 @@ def make_report(self) -> dict: return output def _get_attack_metrics_instances(self) -> dict: - """Constructs the instances metric calculated, during attacks.""" + """Construct the instances metric calculated, during attacks.""" attack_metrics_experiment = {} attack_metrics_instances = {} - attack_metrics_instances["instance_0"] = self.attack_metrics - attack_metrics_experiment["attack_instance_logger"] = attack_metrics_instances return attack_metrics_experiment def _unique_max(confidences: list[float], threshold: float) -> bool: - """Returns whether there is a unique maximum confidence value above - threshold. - """ + """Return if there is a unique maximum confidence value above threshold.""" if len(confidences) > 0: max_conf = np.max(confidences) if max_conf < threshold: @@ -157,7 +152,7 @@ def _unique_max(confidences: list[float], threshold: float) -> bool: def _get_inference_data( # pylint: disable=too-many-locals target: Target, feature_id: int, memberset: bool ) -> tuple[np.ndarray, np.ndarray, float]: - """Returns a dataset of each sample with the attributes to test.""" + """Return a dataset of each sample with the attributes to test.""" attack_feature: dict = target.features[feature_id] indices: list[int] = attack_feature["indices"] unique = np.unique(target.x_orig[:, feature_id]) @@ -198,11 +193,12 @@ def _infer( # pylint: disable=too-many-locals threshold: float, memberset: bool, ) -> tuple[int, int, float, int, int]: - """ - For each possible missing value, compute the confidence scores and - label with the target model; if the label matches the known target model - label for the original sample, and the highest confidence score is unique, - infer that attribute if the confidence score is greater than a threshold. + """Infer attribute. + + For each possible missing value, compute the confidence scores and label + with the target model; if the label matches the known target model label + for the original sample, and the highest confidence score is unique, infer + that attribute if the confidence score is greater than a threshold. 
""" logger.debug("Commencing attack on feature %d set %d", feature_id, int(memberset)) correct: int = 0 # number of correct inferences made @@ -234,7 +230,7 @@ def _infer( # pylint: disable=too-many-locals def report_categorical(results: dict) -> str: - """Returns a string report of the categorical results.""" + """Return a string report of the categorical results.""" results = results["categorical"] msg = "" for feature in results: @@ -256,7 +252,7 @@ def report_categorical(results: dict) -> str: def report_quantitative(results: dict) -> str: - """Returns a string report of the quantitative results.""" + """Return a string report of the quantitative results.""" results = results["quantitative"] msg = "" for feature in results: @@ -269,7 +265,7 @@ def report_quantitative(results: dict) -> str: def plot_quantitative_risk(res: dict, savefile: str = "") -> None: - """Generates bar chart showing quantitative value risk scores.""" + """Generate a bar chart showing quantitative value risk scores.""" logger.debug("Plotting quantitative feature risk scores") results = res["quantitative"] if len(results) < 1: # pragma: no cover @@ -306,7 +302,7 @@ def plot_quantitative_risk(res: dict, savefile: str = "") -> None: def plot_categorical_risk( # pylint: disable=too-many-locals res: dict, savefile: str = "" ) -> None: - """Generates bar chart showing categorical risk scores.""" + """Generate a bar chart showing categorical risk scores.""" logger.debug("Plotting categorical feature risk scores") results: list[dict] = res["categorical"] if len(results) < 1: # pragma: no cover @@ -347,7 +343,7 @@ def plot_categorical_risk( # pylint: disable=too-many-locals def plot_categorical_fraction( # pylint: disable=too-many-locals res: dict, savefile: str = "" ) -> None: - """Generates bar chart showing fraction of dataset inferred.""" + """Generate a bar chart showing fraction of dataset inferred.""" logger.debug("Plotting categorical feature tranche sizes") results: list[dict] = res["categorical"] if len(results) < 1: # pragma: no cover @@ -385,21 +381,8 @@ def plot_categorical_fraction( # pylint: disable=too-many-locals plt.show() -# def plot_from_file(filename: str, savefile: str = "") -> None: #pragma: no cover -# """Loads a results save file and plots risk scores. -# Has been tested but not iuncluded in unit tests or coverage -# at this stage -# """ -# logger.debug("Loading from results file: %s", filename) -# with open(filename + ".pickle", "rb") as handle: -# results = pickle.load(handle) -# plot_categorical_risk(results, savefile=savefile) -# plot_categorical_fraction(results, savefile=savefile) -# plot_quantitative_risk(results, savefile=savefile) - - def _infer_categorical(target: Target, feature_id: int, threshold: float) -> dict: - """Returns the training and test set risks of a categorical feature.""" + """Return the training and test set risks of a categorical feature.""" result: dict = { "name": target.features[feature_id]["name"], "train": _infer(target, feature_id, threshold, True), @@ -409,7 +392,8 @@ def _infer_categorical(target: Target, feature_id: int, threshold: float) -> dic def _is_categorical(target: Target, feature_id: int) -> bool: - """Returns whether a feature is categorical. + """Return whether a feature is categorical. + For simplicity, assumes integer datatypes are categorical. 
""" encoding: str = target.features[feature_id]["encoding"] @@ -424,10 +408,11 @@ def _attack_brute_force( n_cpu: int, attack_threshold: float = 0, ) -> list[dict]: - """ - Performs a brute force attribute inference attack by computing the target - model confidence scores for every value in the list and making an inference - if there is a unique highest confidence score that exceeds attack_threshold. + """Perform a brute force attribute inference attack. + + Computes the target model confidence scores for every value in the list and + makes an inference if there is a unique highest confidence score that + exceeds attack_threshold. """ logger.debug("Brute force attacking categorical features") args = [(target, feature_id, attack_threshold) for feature_id in features] @@ -446,12 +431,13 @@ def _get_bounds_risk_for_sample( # pylint: disable=too-many-locals,too-many-arg protection_limit: float = 0.1, feat_n: int = 100, ) -> bool: - """Returns a bool based on conditions surrounding upper and lower bounds of + """Return whether a quantitative feature is at risk for the sample. + + Returns a bool based on conditions surrounding upper and lower bounds of guesses that would lead to the same model confidence. Parameters ---------- - target_model : BaseEstimator Trained target model. feat_id : int @@ -515,7 +501,7 @@ def _get_bounds_risk_for_sample( # pylint: disable=too-many-locals,too-many-arg def _get_bounds_risk_for_feature( target_model: BaseEstimator, feature_id: int, samples: np.ndarray ) -> float: - """Returns the average feature risk score over a set of samples.""" + """Return the average feature risk score over a set of samples.""" feature_risk: int = 0 n_samples: int = len(samples) feat_min: float = np.min(samples[:, feature_id]) @@ -543,9 +529,7 @@ def _get_bounds_risk( x_train: np.ndarray, x_test: np.ndarray, ) -> dict: - """Returns a dictionary containing the training and test set risks of a - quantitative feature. 
- """ + """Return a dict containing the dataset risks of a quantitative feature.""" risk: dict = { "name": feature_name, "train": _get_bounds_risk_for_feature(target_model, feature_id, x_train), @@ -555,7 +539,7 @@ def _get_bounds_risk( def _get_bounds_risks(target: Target, features: list[int], n_cpu: int) -> list[dict]: - """Computes the bounds risk for all specified features.""" + """Compute the bounds risk for all specified features.""" logger.debug("Computing bounds risk for all specified features") args = [ ( @@ -599,7 +583,7 @@ def _attribute_inference(target: Target, n_cpu: int) -> dict: def create_aia_report(output: dict, name: str = "aia_report") -> FPDF: - """Creates PDF report.""" + """Create PDF report.""" metadata = output["metadata"] aia_metrics = output["attack_experiment_logger"]["attack_instance_logger"][ "instance_0" @@ -649,7 +633,7 @@ def _run_attack_from_configfile(args): def main(): - """Main method to parse args and invoke relevant code.""" + """Parse args and invoke relevant code.""" parser = argparse.ArgumentParser(add_help=False) subparsers = parser.add_subparsers() diff --git a/aisdc/attacks/failfast.py b/aisdc/attacks/failfast.py index 6137b29d..390713ab 100644 --- a/aisdc/attacks/failfast.py +++ b/aisdc/attacks/failfast.py @@ -1,4 +1,4 @@ -"""Failfast.py - class to evaluate metric for fail fast option.""" +"""Class to evaluate metric for fail fast option.""" from __future__ import annotations @@ -6,11 +6,11 @@ class FailFast: - """Class to check attack being successful or not for a given metric + """Class to check attack being successful or not for a given metric. + Note: An object of a FailFast is stateful and instance members - (success_count and fail_count) will preserve values - across repetitions for a test. For the new test - a new object will require to be instantiated. + (success_count and fail_count) will preserve values across repetitions for + a test. For the new test a new object will require to be instantiated. """ def __init__(self, attack_obj: Any): @@ -21,24 +21,24 @@ def __init__(self, attack_obj: Any): self.fail_count = 0 def check_attack_success(self, metric_dict: dict) -> bool: - """A function to check if attack was successful for a given metric. + """Check if attack was successful for a given metric. Parameters ---------- metric_dict : dict - a dictionary with all computed metric values + Dictionary with all computed metric values. Returns ------- success_status : bool - a boolean value is returned based on the comparison for a given threshold + Boolean value based on the comparison for a given threshold. Notes ----- - If value of a given metric value has a value meeting the threshold based on - the comparison type returns true otherwise it returns false. This function - also counts how many times the attack was successful (i.e. true) and - how many times it was not successful (i.e. false). + If value of a given metric value has a value meeting the threshold + based on the comparison type returns true otherwise it returns false. + This function also counts how many times the attack was successful + (i.e. true) and how many times it was not successful (i.e. false). 
""" metric_value = metric_dict[self.metric_name] success_status = False @@ -54,37 +54,29 @@ def check_attack_success(self, metric_dict: dict) -> bool: success_status = bool(metric_value == self.metric_success_thresh) elif self.comp_type == "not_eq": success_status = bool(metric_value != self.metric_success_thresh) - if success_status: - self._increment_success_count() + self.success_count += 1 else: - self._incremenet_fail_count() - + self.fail_count += 1 return success_status - def _increment_success_count(self) -> int: - self.success_count += 1 - - def _incremenet_fail_count(self) -> int: - self.fail_count += 1 - def get_success_count(self) -> int: - """Returns a count of attack being successful.""" + """Return a count of attack being successful.""" return self.success_count def get_fail_count(self): - """Returns a count of attack being not successful.""" + """Return a count of attack being not successful.""" return self.fail_count def get_attack_summary(self) -> dict: - """Returns a dictionary of counts of attack being successful and not successful.""" + """Return a dict of counts of attack being successful and not successful.""" summary = {} summary["success_count"] = self.success_count summary["fail_count"] = self.fail_count return summary def check_overall_attack_success(self, attack_obj: Any) -> bool: - """Returns true if the attack is successful for a given success count threshold.""" + """Return true if attack is successful for a given success count threshold.""" overall_success_status = False if self.success_count >= attack_obj.attack_metric_success_count_thresh: overall_success_status = True diff --git a/aisdc/attacks/likelihood_attack.py b/aisdc/attacks/likelihood_attack.py index 4c82e3ff..fcbf22e0 100644 --- a/aisdc/attacks/likelihood_attack.py +++ b/aisdc/attacks/likelihood_attack.py @@ -47,7 +47,7 @@ def predict_proba(self, test_X): def _logit(p: float) -> float: - """Standard logit function. + """Return standard logit. Parameters ---------- @@ -61,8 +61,9 @@ def _logit(p: float) -> float: Notes ----- - If p is close to 0 or 1, evaluating the log will result in numerical instabilities. - This code thresholds p at EPS and 1 - EPS where EPS defaults at 1e-16. + If p is close to 0 or 1, evaluating the log will result in numerical + instabilities. This code thresholds p at EPS and 1 - EPS where EPS + defaults at 1e-16. """ if p > 1 - EPS: # pylint:disable=consider-using-min-builtin p = 1 - EPS @@ -72,7 +73,7 @@ def _logit(p: float) -> float: class LIRAAttack(Attack): - """The main LIRA Attack class.""" + """The main LiRA Attack class.""" # pylint: disable=too-many-instance-attributes @@ -93,7 +94,7 @@ def __init__( # pylint: disable = too-many-arguments shadow_models_fail_fast: bool = False, target_path: str = None, ) -> None: - """Constructs an object to execute a LIRA attack. + """Construct an object to execute a LiRA attack. Parameters ---------- @@ -154,19 +155,19 @@ def __init__( # pylint: disable = too-many-arguments self.metadata = None def __str__(self): - return "LIRA Attack" + """Return the name of the attack.""" + return "LiRA Attack" def attack(self, target: Target) -> None: - """Programmatic attack running - Runs a LIRA attack from a Target object and a target model. + """Run a LiRA attack from a Target object and a target model. + + Needs to have x_train, x_test, y_train and y_test set. Parameters ---------- target : attacks.target.Target - target as an instance of the Target class. Needs to have x_train, - x_test, y_train and y_test set. 
+ target as an instance of the Target class. """ - shadow_clf = sklearn.base.clone(target.model) target = self._check_and_update_dataset(target) @@ -182,9 +183,9 @@ def attack(self, target: Target) -> None: ) def _check_and_update_dataset(self, target: Target) -> Target: - """ - Makes sure that it is ok to use the class variables to index the - prediction arrays. This has two steps: + """Check that it is safe to use class variables to index prediction arrays. + + This has two steps: 1. Replacing the values in y_train with their position in target.model.classes (will normally result in no change) 2. Removing from the test set any rows corresponding to classes that @@ -229,7 +230,8 @@ def run_scenario_from_preds( # pylint: disable = too-many-statements, too-many- y_shadow_train: Iterable[float], shadow_train_preds: Iterable[float], ) -> tuple[np.ndarray, np.ndarray, sklearn.base.BaseEstimator]: - """Implements the likelihood test, using the "offline" version + """Run the likelihood test, using the "offline" version. + See p.6 (top of second column) for details. Parameters @@ -262,7 +264,6 @@ def run_scenario_from_preds( # pylint: disable = too-many-statements, too-many- Examples -------- - >>> X, y = load_breast_cancer(return_X_y=True, as_frame=False) >>> train_X, test_X, train_y, test_y = train_test_split( >>> X, y, test_size=0.5, stratify=y @@ -280,7 +281,6 @@ def run_scenario_from_preds( # pylint: disable = too-many-statements, too-many- >>> n_shadow_models=100 >>> ) """ - logger = logging.getLogger("lr-scenario") n_train_rows, _ = X_target_train.shape @@ -400,7 +400,7 @@ def run_scenario_from_preds( # pylint: disable = too-many-statements, too-many- self.attack_metrics = [metrics.get_metrics(y_pred_proba, y_test)] def example(self) -> None: - """Runs an example attack using data from sklearn. + """Run an example attack using data from sklearn. Generates example data, trains a classifier and tuns the attack """ @@ -421,7 +421,7 @@ def example(self) -> None: ) def _construct_metadata(self) -> None: - """Constructs the metadata object. Called by the reporting method.""" + """Construct the metadata object.""" self.metadata = {} self.metadata["experiment_details"] = {} self.metadata["experiment_details"] = self.get_params() @@ -455,12 +455,11 @@ def _construct_metadata(self) -> None: def make_report(self) -> dict: """Create the report. - Creates the output report. If self.args.report_name is not None, it will also save the - information in json and pdf formats + Creates the output report. If self.args.report_name is not None, it + will also save the information in json and pdf formats. Returns ------- - output : Dict Dictionary containing all attack output """ @@ -493,7 +492,7 @@ def make_report(self) -> dict: return output def _get_attack_metrics_instances(self) -> dict: - """Constructs the metadata object, after attacks.""" + """Construct the metadata object after attacks.""" attack_metrics_experiment = {} attack_metrics_instances = {} @@ -507,11 +506,12 @@ def _get_attack_metrics_instances(self) -> dict: return attack_metrics_experiment def setup_example_data(self) -> None: - """Method to create example data and save (including config). Intended to allow users - to see how they would need to setup their own data. + """Create example data and save (including config). - Generates train and test data .csv files, train and test predictions .csv files and - a config.json file that can be used to run the attack from the command line. 
+ Intended to allow users to see how they would need to setup their own + data. Generates train and test data .csv files, train and test + predictions .csv files and a config.json file that can be used to run + the attack from the command line. """ X, y = load_breast_cancer(return_X_y=True) train_X, test_X, train_y, test_y = train_test_split( @@ -543,7 +543,7 @@ def setup_example_data(self) -> None: f.write(json.dumps(config)) def attack_from_config(self) -> None: # pylint: disable = too-many-locals - """Runs an attack based on the args parsed from the command line.""" + """Run an attack based on the args parsed from the command line.""" logger = logging.getLogger("run-attack") logger.info("Loading training data csv from %s", self.training_data_filename) training_data = np.loadtxt(self.training_data_filename, delimiter=",") @@ -638,7 +638,7 @@ def _run_attack_from_configfile(args): def main(): - """Main method to parse args and invoke relevant code.""" + """Parse args and invoke relevant code.""" parser = argparse.ArgumentParser(add_help=False) parser.add_argument( "-s", diff --git a/aisdc/attacks/multiple_attacks.py b/aisdc/attacks/multiple_attacks.py index 1cae78f0..48e4191c 100644 --- a/aisdc/attacks/multiple_attacks.py +++ b/aisdc/attacks/multiple_attacks.py @@ -1,8 +1,4 @@ -""" -An entry point to run multiple attacks including MIA (worst-case and LIRA) -and attribute inference attack using a single configuration file -with multiple attack configuration. -""" +"""Run multiple attacks including MIA and AIA using a single configuration file.""" from __future__ import annotations @@ -21,7 +17,7 @@ class MultipleAttacks(Attack): - """Class to wrap the MIA and AIA attack codes.""" + """Wrap the MIA and AIA attack codes.""" def __init__( self, @@ -29,21 +25,21 @@ def __init__( ) -> None: super().__init__() self.config_filename = config_filename - """Constructs an object to execute a worst case attack. + """Construct an object to execute a worst case attack. Parameters ---------- config_filename : str - name of the configuration file which has configurations in a single JSON file - to support running multiple attacks + name of the configuration file which has configurations in a single + JSON file to support running multiple attacks. """ def __str__(self): + """Return the name of the attack.""" return "Multiple Attacks (MIA and AIA) given configurations" def attack(self, target: Target) -> None: - """ - Runs attacks from a Target object and a target model. + """Run attacks from a Target object and a target model. Parameters ---------- @@ -86,7 +82,7 @@ def attack(self, target: Target) -> None: class ConfigFile: - """Module that creates a single JSON configuration file.""" + """Create a single JSON configuration file.""" def __init__( self, @@ -102,7 +98,6 @@ def __init__( def add_config(self, config_obj: Any, config_attack_type: str) -> None: """Add a section of JSON to the file which is already open.""" - # Read the contents of the file and then clear the file config_file_data = self.read_config_file() @@ -119,9 +114,7 @@ def add_config(self, config_obj: Any, config_attack_type: str) -> None: f.write(json.dumps(config_file_data)) def read_config_file(self) -> dict: - """Reads a JSON configuration file and returns dictionary - with a number of configuration objects. 
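A usage sketch for the ConfigFile and MultipleAttacks flow above; the add_config signature is taken from the diff, while the attack-type keys and parameter names are assumptions for illustration:

from aisdc.attacks.multiple_attacks import ConfigFile, MultipleAttacks

config = ConfigFile(filename="single_config.json")
# Each call appends one attack-configuration section to the JSON file.
config.add_config({"n_reps": 10}, "worst_case")      # key name assumed
config.add_config({"n_shadow_models": 100}, "lira")  # key name assumed

# MultipleAttacks reads every section back and runs each attack in turn:
attack_obj = MultipleAttacks(config_filename="single_config.json")
# attack_obj.attack(target)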
- """ + """Read a JSON config file and return dict with configuration objects.""" with open(self.filename, encoding="utf-8") as f: file_contents = f.read() if file_contents != "": @@ -142,7 +135,7 @@ def _run_attack_from_configfile(args): def main(): - """Main method to parse args and invoke relevant code.""" + """Parse args and invoke relevant code.""" parser = argparse.ArgumentParser(add_help=False) subparsers = parser.add_subparsers() diff --git a/aisdc/attacks/report.py b/aisdc/attacks/report.py index 178e3a73..043154d4 100644 --- a/aisdc/attacks/report.py +++ b/aisdc/attacks/report.py @@ -198,7 +198,6 @@ def create_mia_report(attack_output: dict) -> FPDF: Parameters ---------- - attack_output : dict dictionary with following items @@ -213,7 +212,6 @@ def create_mia_report(attack_output: dict) -> FPDF: Returns ------- - pdf : fpdf.FPDF fpdf document object """ @@ -307,7 +305,7 @@ def create_mia_report(attack_output: dict) -> FPDF: def add_output_to_pdf(report_dest: str, pdf_report: FPDF, attack_type: str) -> None: - """Creates pdf and appends contents if it already exists.""" + """Create pdf and append contents if it already exists.""" if os.path.exists(report_dest + ".pdf"): old_pdf = report_dest + ".pdf" new_pdf = report_dest + "_new.pdf" @@ -353,7 +351,6 @@ def create_lr_report(output: dict) -> FPDF: Parameters ---------- - output : dict dictionary with following items @@ -368,7 +365,6 @@ def create_lr_report(output: dict) -> FPDF: Returns ------- - pdf : fpdf.FPDF fpdf document object """ @@ -376,7 +372,6 @@ def create_lr_report(output: dict) -> FPDF: v for _, v in output["attack_experiment_logger"]["attack_instance_logger"].items() ][0] - # mia_metrics = output["attack_metrics"][0] metadata = output["metadata"] dest_log_roc = ( os.path.join( diff --git a/aisdc/attacks/structural_attack.py b/aisdc/attacks/structural_attack.py index b8017428..a9f26380 100644 --- a/aisdc/attacks/structural_attack.py +++ b/aisdc/attacks/structural_attack.py @@ -1,6 +1,7 @@ -""" +"""Structural attacks. + Runs a number of 'static' structural attacks based on: -(i) the target model's properties +(i) the target model's properties; (ii) the TREs risk appetite as applied to tables and standard regressions. """ @@ -31,27 +32,27 @@ def get_unnecessary_risk(model: BaseEstimator) -> bool: - """ - Checks whether a model's hyper-parameters against - a set of rules that predict the top 20% most risky. + """Check whether model hyperparameters are in the top 20% most risky. - This check is designed to assess whether a model is - likely to be **unnecessarily** risky, i.e., - whether it is highly likely that a different combination of hyper-parameters - would have led to model with similar or better accuracy on the task - but with lower membership inference risk. + This check is designed to assess whether a model is likely to be + **unnecessarily** risky, i.e., whether it is highly likely that a different + combination of hyper-parameters would have led to model with similar or + better accuracy on the task but with lower membership inference risk. The rules applied from an experimental study using a grid search in which: - max_features was one-hot encoded from the set [None, log2, sqrt] - splitter was encoded using 0=best, 1=random - The target models created were then subject to membership inference attacks (MIA) - and the hyper-param combinations rank-ordered according to MIA AUC. - Then a decision tree trained to recognise whether - hyper-params combintions were in the 20% most risky. 
The rules below were extracted from that tree for the 'least risky' nodes + The target models created were then subject to membership inference attacks + (MIA) and the hyper-param combinations rank-ordered according to MIA AUC. + Then a decision tree was trained to recognise whether hyper-params combinations + were in the 20% most risky. The rules below were extracted from that tree + for the 'least risky' nodes. + + Notes + ----- + Returns 1 if high risk, otherwise 0. """ - # Returns 1 if high risk, otherwise 0 if not isinstance( model, (DecisionTreeClassifier, RandomForestClassifier, XGBClassifier) ): @@ -118,7 +119,7 @@ def get_unnecessary_risk(model: BaseEstimator) -> bool: unnecessary_risk = 1 elif isinstance(model, XGBClassifier): - # checking whether params exist and using xgboost defaults if not using defaults + # check whether params exist and using xgboost defaults if not using defaults # from https://github.com/dmlc/xgboost/blob/master/python-package/xgboost/sklearn.py # and here: https://xgboost.readthedocs.io/en/stable/parameter.html n_estimators = int(model.n_estimators) if model.n_estimators else 100 @@ -137,7 +138,7 @@ def get_unnecessary_risk(model: BaseEstimator) -> bool: def get_tree_parameter_count(dtree: DecisionTreeClassifier) -> int: - """Reads the tree structure a returns the number of learned parameters.""" + """Read the tree structure and return the number of learned parameters.""" n_nodes = dtree.tree_.node_count left = dtree.tree_.children_left right = dtree.tree_.children_right @@ -155,7 +156,7 @@ def get_tree_parameter_count(dtree: DecisionTreeClassifier) -> int: def get_model_param_count(model: BaseEstimator) -> int: - """Returns the number of trained parameters in a model.""" + """Return the number of trained parameters in a model.""" n_params = 0 if isinstance(model, DecisionTreeClassifier): @@ -195,7 +196,7 @@ def get_model_param_count(model: BaseEstimator) -> int: class StructuralAttack(Attack): - """Class to wrap a number of attacks based on the static structure of a model.""" + """Structural attacks based on the static structure of a model.""" # pylint: disable=too-many-instance-attributes def __init__( # pylint: disable = too-many-arguments self, risk_appetite_config: str = "default", target_path: str = None, output_dir="outputs_structural", report_name="report_structural", ) -> None: - """Constructs an object to execute a structural attack. + """Construct an object to execute a structural attack. Parameters ---------- report_name : str name of the pdf and json output reports output_dir : str name of the directory where outputs are stored risk_appetite_config : str path to yaml file specifying TRE risk appetite """ - super().__init__() logger = logging.getLogger("structural_attack") self.target: Target = None @@ -258,12 +258,13 @@ def __init__( # pylint: disable = too-many-arguments self.report_name = report_name def __str__(self): + """Return the name of the attack.""" return "Structural attack" def attack(self, target: Target) -> None: - """Programmatic attack entry point. + """Run structural attack. - To be used when code has access to Target class and trained target model + To be used when code has access to Target class and trained target model.
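For illustration, how the hyperparameter screen above might be invoked; a sketch only, since whether a given configuration is actually flagged depends on the rule thresholds in this hunk:

from sklearn.tree import DecisionTreeClassifier
from aisdc.attacks.structural_attack import get_unnecessary_risk

# An unconstrained tree sits at the memorisation-prone end of the rules.
model = DecisionTreeClassifier(max_depth=None, min_samples_leaf=1)
if get_unnecessary_risk(model):
    print("hyper-parameters flagged as unnecessarily risky")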
Parameters ---------- @@ -295,11 +296,6 @@ def attack(self, target: Target) -> None: errstr = "len mismatch between equiv classes and " assert len(equiv_classes) == len(equiv_counts), errstr + "counts" assert len(equiv_classes) == len(equiv_members), errstr + "membership" - # print( - # f"equiv_classes is {equiv_classes}\n" - # f"equiv_counts is {equiv_counts}\n" - # # #f'equiv_members is {equiv_members}\n' - # ) # now assess the risk # Degrees of Freedom @@ -323,19 +319,14 @@ def attack(self, target: Target) -> None: self.lowvals_cd_risk = np.any(freqs < self.THRESHOLD).astype(int) def dt_get_equivalence_classes(self) -> tuple: - """ - Gets details of equivalence classes - based on white box inspection. - """ + """Get details of equivalence classes based on white box inspection.""" destinations = self.target.model.apply(self.target.x_train) ret_tuple = np.unique(destinations, return_counts=True) - # print(f'leaves and counts:\n{ret_tuple}\n') leaves = ret_tuple[0] counts = ret_tuple[1] members = [] for leaf in leaves: ingroup = np.asarray(destinations == leaf).nonzero()[0] - # print(f'ingroup {ingroup},count {len(ingroup)}') members.append(ingroup) equiv_classes = np.zeros((len(leaves), self.target.model.n_classes_)) @@ -347,10 +338,7 @@ def dt_get_equivalence_classes(self) -> tuple: return [equiv_classes, counts, members] def get_equivalence_classes(self) -> tuple: - """ - Gets details of equivalence classes - based on black box observation of probabilities. - """ + """Get details of equivalence classes based on predicted probabilities.""" uniques = np.unique(self.yprobs, axis=0, return_counts=True) equiv_classes = uniques[0] equiv_counts = uniques[1] @@ -358,21 +346,20 @@ def get_equivalence_classes(self) -> tuple: for prob_vals in equiv_classes: ingroup = np.unique(np.asarray(self.yprobs == prob_vals).nonzero()[0]) members.append(ingroup) - # print(equiv_counts) return [equiv_classes, equiv_counts, members] def _get_global_metrics(self, attack_metrics: list) -> dict: - """Summarise metrics from a metric list. + """Get dictionary summarising metrics from a metric list. + + Parameters + ---------- + attack_metrics : List + list of attack metrics to be reported. 
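The black-box equivalence-class step above reduces to a single np.unique call over the predicted probability vectors; a self-contained illustration of how a k-anonymity figure falls out of it:

import numpy as np

# Rows are per-sample probability vectors from the target model.
yprobs = np.array([[0.9, 0.1], [0.9, 0.1], [0.2, 0.8]])
equiv_classes, equiv_counts = np.unique(yprobs, axis=0, return_counts=True)
# Each distinct vector is an equivalence class; the smallest class size
# bounds the k-anonymity of the model's outputs (here 2 classes, k = 1).
k_anonymity = int(np.min(equiv_counts))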
Returns ------- global_metrics : Dict Dictionary of summary metrics - - Arguments - --------- - attack_metrics: List - list of attack metrics to be reported """ global_metrics = {} if attack_metrics is not None and len(attack_metrics) != 0: @@ -385,7 +372,7 @@ def _get_global_metrics(self, attack_metrics: list) -> dict: return global_metrics def _construct_metadata(self): - """Constructs the metadata object, after attacks.""" + """Construct the metadata object, after attacks.""" self.metadata = {} # Store all args self.metadata["experiment_details"] = {} @@ -395,14 +382,9 @@ def _construct_metadata(self): self.metadata["global_metrics"] = self._get_global_metrics(self.attack_metrics) def _get_attack_metrics_instances(self) -> dict: - """Constructs the metadata object, after attacks.""" + """Construct the metadata object, after attacks.""" attack_metrics_experiment = {} attack_metrics_instances = {} - - # for rep, name in enumerate(self.attack_metrics): - # #attack_metrics_instances["instance_" + str(rep)] = self.attack_metrics[rep] - # attack_metrics_instances["instance_" + str(name)] = self.__dict__.[name] - attack_metrics_experiment["attack_instance_logger"] = attack_metrics_instances attack_metrics_experiment["DoF_risk"] = self.DoF_risk attack_metrics_experiment["k_anonymity_risk"] = self.k_anonymity_risk @@ -412,34 +394,22 @@ def _get_attack_metrics_instances(self) -> dict: return attack_metrics_experiment def make_report(self) -> dict: - """Creates output dictionary structure and generates - pdf and json outputs if filenames are given. - """ + """Create output dict and generate pdf and json if filenames are given.""" output = {} output["log_id"] = str(uuid.uuid4()) output["log_time"] = datetime.now().strftime("%d/%m/%Y %H:%M:%S") - self._construct_metadata() output["metadata"] = self.metadata - output["attack_experiment_logger"] = self._get_attack_metrics_instances() - # output[ - # "dummy_attack_experiments_logger" - # ] = self._get_dummy_attack_metrics_experiments_instances() - report_dest = os.path.join(self.output_dir, self.report_name) json_attack_formatter = GenerateJSONModule(report_dest + ".json") json_report = report.create_json_report(output) json_attack_formatter.add_attack_output(json_report, "StructuralAttack") - - # pdf_report = report.create_mia_report(output) - # report.add_output_to_pdf(report_dest, pdf_report, "StructuralAttack") return output def _run_attack(args): """Initialise class and run attack.""" - attack_obj = StructuralAttack( risk_appetite_config=args.risk_appetite_config, target_path=args.target_path, @@ -454,8 +424,7 @@ def _run_attack(args): def _run_attack_from_configfile(args): - """Initialise class and run attack using config file.""" - + """Initialise class and run attack using config file.""" attack_obj = StructuralAttack( attack_config_json_file_name=str(args.attack_config_json_file_name), target_path=str(args.target_path), @@ -467,7 +436,7 @@ def _run_attack_from_configfile(args): def main(): - """Main method to parse arguments and invoke relevant method.""" + """Parse arguments and invoke relevant method.""" logger = logging.getLogger("main") parser = argparse.ArgumentParser(description="Perform a structural attack") diff --git a/aisdc/attacks/target.py b/aisdc/attacks/target.py index 528e94a9..71f88180 100644 --- a/aisdc/attacks/target.py +++ b/aisdc/attacks/target.py @@ -1,4 +1,4 @@ -"""Stores information about the target model and data.""" +"""Store information about the target model and data.""" from __future__ import annotations @@ -17,7 
+17,7 @@ class Target: # pylint: disable=too-many-instance-attributes - """Stores information about the target model and data.""" + """Store information about the target model and data.""" def __init__(self, model: sklearn.base.BaseEstimator | None = None) -> None: """Store information about a target model and associated data. @@ -253,7 +253,7 @@ def __load_data(self, path: str, target: dict) -> None: self.__load_numpy(path, target, "y_test_orig") def __ge(self) -> str: - """Returns the model generalisation error. + """Return the model generalisation error. Returns ------- @@ -276,7 +276,7 @@ def __ge(self) -> str: return "unknown" def save(self, path: str = "target", ext: str = "pkl") -> None: - """Saves the target class to persistent storage. + """Save the target class to persistent storage. Parameters ---------- @@ -308,7 +308,7 @@ def save(self, path: str = "target", ext: str = "pkl") -> None: json.dump(target, fp, indent=4, cls=NumpyArrayEncoder) def load(self, path: str = "target") -> None: - """Loads the target class from persistent storage. + """Load the target class from persistent storage. Parameters ---------- @@ -342,7 +342,7 @@ def load(self, path: str = "target") -> None: self.__load_data(path, target) def add_safemodel_results(self, data: list) -> None: - """Adds the results of safemodel disclosure checking. + """Add the results of safemodel disclosure checking. Parameters ---------- @@ -352,4 +352,5 @@ def add_safemodel_results(self, data: list) -> None: self.safemodel = data def __str__(self): + """Return the name of the dataset used.""" return self.name diff --git a/aisdc/attacks/worst_case_attack.py b/aisdc/attacks/worst_case_attack.py index 608da316..cbd18d70 100644 --- a/aisdc/attacks/worst_case_attack.py +++ b/aisdc/attacks/worst_case_attack.py @@ -1,4 +1,4 @@ -"""Runs a worst case attack based upon predictive probabilities.""" +"""Run a worst case attack based upon predictive probabilities.""" # pylint: disable = too-many-lines @@ -29,7 +29,7 @@ class WorstCaseAttack(Attack): - """Class to wrap the worst case attack code.""" + """Worst case attack.""" # pylint: disable=too-many-instance-attributes @@ -60,7 +60,7 @@ def __init__( # pylint: disable = too-many-arguments, too-many-locals, too-many attack_config_json_file_name: str = None, target_path: str = None, ) -> None: - """Constructs an object to execute a worst case attack. + """Construct an object to execute a worst case attack. Parameters ---------- @@ -125,7 +125,6 @@ def __init__( # pylint: disable = too-many-arguments, too-many-locals, too-many target_path : str path to the saved trained target model and target data """ - super().__init__() self.n_reps = n_reps self.reproduce_split = reproduce_split @@ -134,9 +133,8 @@ def __init__( # pylint: disable = too-many-arguments, too-many-locals, too-many x**2 for x in range(reproduce_split, reproduce_split + n_reps - 1) ] else: - reproduce_split = list( - dict.fromkeys(reproduce_split) - ) # remove potential duplicates + # remove potential duplicates + reproduce_split = list(dict.fromkeys(reproduce_split)) if len(reproduce_split) == n_reps: pass elif len(reproduce_split) > n_reps: @@ -199,12 +197,13 @@ def __init__( # pylint: disable = too-many-arguments, too-many-locals, too-many self.metadata = None def __str__(self): + """Return name of attack.""" return "WorstCase attack" def attack(self, target: Target) -> None: - """Programmatic attack entry point. + """Run worst case attack. 
- To be used when code has access to Target class and trained target model + To be used when code has access to Target class and trained target model. Parameters ---------- @@ -227,12 +226,12 @@ def attack(self, target: Target) -> None: ) def attack_from_prediction_files(self): - """Start an attack from saved prediction files. + """Run attack from saved prediction files. To be used when only saved predictions are available. - Filenames for the saved prediction files to be specified in the arguments provided - in the constructor + Filenames for the saved prediction files to be specified in the + arguments provided in the constructor. """ train_preds = np.loadtxt(self.training_preds_filename, delimiter=",") test_preds = np.loadtxt(self.test_preds_filename, delimiter=",") @@ -245,16 +244,14 @@ def attack_from_preds( train_correct: np.ndarray = None, test_correct: np.ndarray = None, ) -> None: - """ - Runs the attack based upon the predictions in train_preds and test_preds, and the params - stored in self.args. + """Run attack based upon the predictions in train_preds and test_preds. Parameters ---------- train_preds : np.ndarray - Array of train predictions. One row per example, one column per class (i.e. 2) + Array of train predictions. One row per example, one column per class. test_preds : np.ndarray - Array of test predictions. One row per example, one column per class (i.e. 2) + Array of test predictions. One row per example, one column per class. """ logger = logging.getLogger("attack-from-preds") logger.info("Running main attack repetitions") @@ -304,10 +301,12 @@ def _prepare_attack_data( train_correct: np.ndarray = None, test_correct: np.ndarray = None, ) -> tuple[np.ndarray, np.ndarray]: - """Prepare training data and labels for attack model - Combines the train and test preds into a single numpy array (optionally) sorting each - row to have the highest probabilities in the first column. Constructs a label array that - has ones corresponding to training rows and zeros to testing rows. + """Prepare training data and labels for attack model. + + Combines the train and test preds into a single numpy array + (optionally) sorting each row to have the highest probabilities in the + first column. Constructs a label array that has ones corresponding to + training rows and zeros to testing rows. """ logger = logging.getLogger("prep-attack-data") if self.sort_probs: @@ -333,8 +332,7 @@ def run_attack_reps( # pylint: disable = too-many-locals train_correct: np.ndarray = None, test_correct: np.ndarray = None, ) -> dict: - """ - Run actual attack reps from train and test predictions. + """Run actual attack reps from train and test predictions. Parameters ---------- @@ -411,15 +409,15 @@ def run_attack_reps( # pylint: disable = too-many-locals def _get_global_metrics(self, attack_metrics: list) -> dict: """Summarise metrics from a metric list. + Parameters + ---------- + attack_metrics : List + list of attack metrics dictionaries + Returns ------- global_metrics : Dict Dictionary of summary metrics - - Arguments - --------- - attack_metrics: List - list of attack metrics dictionaries """ global_metrics = {} if attack_metrics is not None and len(attack_metrics) != 0: @@ -455,10 +453,10 @@ def _get_global_metrics(self, attack_metrics: list) -> dict: return global_metrics - def _get_n_significant(self, p_val_list, p_thresh, bh_fdr_correction=False): - """ - Helper method to determine if values within a list of p-values are significant at - p_thresh. Can perform multiple testing correction. 
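A minimal sketch of the _prepare_attack_data step described above, run before the attack classifier is trained; shapes are illustrative and the real method also threads through the optional per-sample correctness flags:

import numpy as np

def prepare_attack_data(train_preds, test_preds, sort_probs=True):
    if sort_probs:
        # Sort each row descending so column order carries no class label.
        train_preds = -np.sort(-train_preds, axis=1)
        test_preds = -np.sort(-test_preds, axis=1)
    mi_x = np.vstack((train_preds, test_preds))
    # Membership labels: 1 = row came from training data, 0 = from test.
    mi_y = np.hstack((np.ones(len(train_preds)), np.zeros(len(test_preds))))
    return mi_x, mi_y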
+ def _get_n_significant(self, p_val_list, p_thresh, bh_fdr_correction=False) -> int: + """Return number of p-values significant at p_thresh. + + Can perform multiple testing correction. """ if not bh_fdr_correction: return sum(1 for p in p_val_list if p <= p_thresh) @@ -473,7 +471,7 @@ def _get_n_significant(self, p_val_list, p_thresh, bh_fdr_correction=False): return n_sig_bh def _generate_array(self, n_rows: int, beta: float) -> np.ndarray: - """Generate a single array of predictions, used when doing baseline experiments. + """Generate array of predictions, used when doing baseline experiments. Parameters ---------- @@ -486,14 +484,7 @@ def _generate_array(self, n_rows: int, beta: float) -> np.ndarray: ------- preds : np.ndarray Array of predictions. Two columns, n_rows rows - - Notes - ----- - - Examples - -------- """ - preds = np.zeros((n_rows, 2), float) for row_idx in range(n_rows): train_class = np.random.choice(2) @@ -534,19 +525,16 @@ def generate_arrays( return train_preds, test_preds def make_dummy_data(self) -> None: - """Makes dummy data for testing functionality. + """Make dummy data for testing functionality. Parameters ---------- args : dict Command line arguments - Returns - ------- - Notes ----- - Returns nothing but saves two .csv files + Returns nothing but saves two .csv files. """ logger = logging.getLogger("dummy-data") logger.info( @@ -566,7 +554,7 @@ def make_dummy_data(self) -> None: np.savetxt(self.test_preds_filename, test_preds, delimiter=",") def _construct_metadata(self): - """Constructs the metadata object, after attacks.""" + """Construct the metadata object after attacks.""" self.metadata = {} # Store all args self.metadata["experiment_details"] = {} @@ -581,7 +569,7 @@ def _construct_metadata(self): ) def _unpack_dummy_attack_metrics_experiments_instances(self) -> list: - """Constructs the metadata object, after attacks.""" + """Construct the metadata object after attacks.""" dummy_attack_metrics_instances = [] for exp_rep, _ in enumerate(self.dummy_attack_metrics): @@ -591,7 +579,7 @@ def _unpack_dummy_attack_metrics_experiments_instances(self) -> list: return dummy_attack_metrics_instances def _get_attack_metrics_instances(self) -> dict: - """Constructs the metadata object, after attacks.""" + """Construct the metadata object after attacks.""" attack_metrics_experiment = {} attack_metrics_instances = {} @@ -606,7 +594,7 @@ def _get_attack_metrics_instances(self) -> dict: return attack_metrics_experiment def _get_dummy_attack_metrics_experiments_instances(self) -> dict: - """Constructs the metadata object, after attacks.""" + """Construct the metadata object after attacks.""" dummy_attack_metrics_experiments = {} for exp_rep, _ in enumerate(self.dummy_attack_metrics): @@ -628,9 +616,7 @@ def _get_dummy_attack_metrics_experiments_instances(self) -> dict: return dummy_attack_metrics_experiments def make_report(self) -> dict: - """Creates output dictionary structure and generates - pdf and json outputs if filenames are given. 
- """ + """Create output dict and generate pdf and json if filenames are given.""" output = {} output["log_id"] = str(uuid.uuid4()) output["log_time"] = datetime.now().strftime("%d/%m/%Y %H:%M:%S") @@ -706,7 +692,7 @@ def _run_attack_from_configfile(args): def main(): - """Main method to parse arguments and invoke relevant method.""" + """Parse arguments and invoke relevant method.""" logger = logging.getLogger("main") parser = argparse.ArgumentParser( description=("Perform a worst case attack from saved model predictions") diff --git a/aisdc/metrics.py b/aisdc/metrics.py index 3c3c64f6..2591ad5f 100644 --- a/aisdc/metrics.py +++ b/aisdc/metrics.py @@ -68,7 +68,6 @@ def _tpr_at_fpr( tpr : float true positive rate at fpr """ - if fpr_perc: fpr /= 100.0 @@ -83,12 +82,12 @@ def _tpr_at_fpr( def _expected_auc_var(auc: float, num_pos: int, num_neg: int) -> float: - """Compute variance of AUC under assumption of uniform probabilities - uses the expression given as eqn (2) in https://cs.nyu.edu/~mohri/pub/area.pdf. + """Compute variance of AUC under assumption of uniform probabilities. + + Uses the expression given as eqn (2) in https://cs.nyu.edu/~mohri/pub/area.pdf. Parameters ---------- - auc : float auc value at which to compute the variance num_pos : int @@ -113,17 +112,19 @@ def _expected_auc_var(auc: float, num_pos: int, num_neg: int) -> float: def min_max_disc( y_true: np.ndarray, pred_probs: np.ndarray, x_prop: float = 0.1, log_p: bool = True ) -> tuple[float, float, float, float]: - """ - Non-average-case methods for MIA attacks. Considers actual frequency of membership - amongst samples with highest- and lowest- assessed probability of membership. If an - MIA method confidently asserts that 5% of samples are members and 5% of samples are - not, but cannot tell for the remaining 90% of samples, then these metrics will flag - this behaviour, but AUC/advantage may not. Since the difference may be noisy, a - p-value against a null of independence of true membership and assessed membership - probability (that is, membership probabilities are essentially random) is also used - as a metric (using a usual Gaussian approximation to binomial). If the p-value is - low and the frequency difference is high (>0.5) then the MIA attack is successful - for some samples. + """Return non-average-case methods for MIA attacks. + + Considers actual frequency of membership amongst samples with highest- and + lowest- assessed probability of membership. If an MIA method confidently + asserts that 5% of samples are members and 5% of samples are not, but + cannot tell for the remaining 90% of samples, then these metrics will flag + this behaviour, but AUC/advantage may not. Since the difference may be + noisy, a p-value against a null of independence of true membership and + assessed membership probability (that is, membership probabilities are + essentially random) is also used as a metric (using a usual Gaussian + approximation to binomial). If the p-value is low and the frequency + difference is high (>0.5) then the MIA attack is successful for some + samples. Parameters ---------- @@ -151,16 +152,12 @@ def min_max_disc( p-value or log-p value corresponding to mmd against null hypothesis that random variables corresponding to y and yp are independent. 
-    Notes
-    -----
-
    Examples
    --------
    >>> y = np.random.choice(2, 100)
    >>> yp = np.random.rand(100)
    >>> maxd, mind, mmd, pval = min_max_disc(y, yp, x_prop=0.2, log_p=True)
    """
-
    n_examples = int(np.ceil(len(y_true) * x_prop))
    pos_frequency = np.mean(y_true)  # average frequency
    y_order = np.argsort(pred_probs)  # ordering permutation
@@ -218,9 +215,7 @@ def get_probabilities(  # pylint: disable=too-many-locals
    y_test: np.ndarray = np.array([]),
    permute_rows: bool = False,
):
-    """
-    Given a prediction model and a dataset, calculate the predictions of the model for
-    each data sample in probability format.
+    """Get probabilities for a given model and dataset.

    Parameters
    ----------
@@ -242,7 +237,6 @@ def get_probabilities(  # pylint: disable=too-many-locals
        If permute_rows is set to true, y_test must also be supplied.
        The function will then return both the predicted probabilities and corresponding y_test
    """
-
    if permute_rows and (y_test is None):
        raise ValueError("If permute_rows is set to True, y_test must be supplied")

@@ -264,11 +258,11 @@ def get_probabilities(  # pylint: disable=too-many-locals
def get_metrics(  # pylint: disable=too-many-locals, too-many-statements
    y_pred_proba: np.ndarray, y_test: np.ndarray
):
-    """
-    Calculate metrics, including attacker advantage for MIA binary.
+    """Calculate metrics, including attacker advantage for MIA binary.

    Implemented as Definition 4 on https://arxiv.org/pdf/1709.01604.pdf
-    which is also implemented in tensorFlow-privacy https://github.com/tensorflow/privacy.
+    which is also implemented in tensorFlow-privacy
+    https://github.com/tensorflow/privacy.

    Parameters
    ----------
@@ -299,7 +293,6 @@ def get_metrics(  # pylint: disable=too-many-locals, too-many-statements
        * F1 Score - harmonic mean of precision and recall.
        * Advantage.
    """
-
    invalid_format = (
        "y_pred must be an array of shape [x,2] with elements of type float"
    )
diff --git a/aisdc/preprocessing/__init__.py b/aisdc/preprocessing/__init__.py
index e69de29b..0c69ca2a 100644
--- a/aisdc/preprocessing/__init__.py
+++ b/aisdc/preprocessing/__init__.py
@@ -0,0 +1 @@
+"""Handlers to pull in datasets and perform preprocessing."""
diff --git a/aisdc/preprocessing/loaders.py b/aisdc/preprocessing/loaders.py
index c6af20d0..515b0f07 100644
--- a/aisdc/preprocessing/loaders.py
+++ b/aisdc/preprocessing/loaders.py
@@ -1,7 +1,4 @@
-"""
-A set of useful handlers to pull in datasets common to the project and perform
-the appropriate pre-processing.
-"""
+"""Handlers to pull in datasets and perform preprocessing."""

# pylint: disable=import-error, invalid-name, consider-using-with, too-many-return-statements

@@ -35,16 +32,16 @@ class UnknownDataset(Exception):


class DataNotAvailable(Exception):
-    """Exception raised if the user asks for a dataset that they do not have the data for. I.e.
-    some datasets require a .csv file to have been downloaded.
-    """
+    """Exception raised if the user asks for a dataset that they do not have."""


def get_data_sklearn(  # pylint: disable = too-many-branches
    dataset_name: str, data_folder: str = os.path.join(PROJECT_ROOT_FOLDER, "data")
) -> Tuple[pd.DataFrame, pd.DataFrame]:
-    """Main entry method to return data in format sensible for sklearn. User passes a name and that
-    dataset is returned as a tuple of pandas DataFrames (data, labels).
+    """Get data in a format sensible for sklearn.
+
+    User passes a name and that dataset is returned as a tuple of pandas
+    DataFrames (data, labels).
    Parameters
    ----------
@@ -55,7 +52,6 @@ def get_data_sklearn(  # pylint: disable = too-many-branches

    Returns
    -------
-
    X : pd.DataFrame
        The input dataframe -- rows are examples, columns variables
    y : pd.DataFrame
@@ -63,7 +59,6 @@ def get_data_sklearn(  # pylint: disable = too-many-branches

    Notes
    -----
-
    The following datasets are available:
    mimic2-iaccd (requires data download)
    in-hospital-mortality (requires data download)
@@ -85,8 +80,13 @@

    Examples
    --------
-    >>> X, y = get_data_sklearn("mimic2-iaccd") # pull the mimic2-iaccd data
-    >>> X, y = get_data_sklearn("minmax iris") # pull the iris data and round continuous features
+    .. code-block:: python
+
+        # pull the mimic2-iaccd data
+        X, y = get_data_sklearn("mimic2-iaccd")
+
+        # pull the iris data and round continuous features
+        X, y = get_data_sklearn("minmax iris")
    """
    logger.info("DATASET FOLDER = %s", data_folder)

@@ -157,7 +157,7 @@ def get_data_sklearn(  # pylint: disable = too-many-branches


def _iris() -> Tuple[pd.DataFrame, pd.DataFrame]:
-    """Sklearn iris data - just first two classes."""
+    """Get the Sklearn iris data - just first two classes."""
    X, y = load_iris(return_X_y=True, as_frame=True)
    X = X[y < 2]
    y = y[y < 2]
@@ -165,8 +165,7 @@ def _iris() -> Tuple[pd.DataFrame, pd.DataFrame]:


def _nursery() -> Tuple[pd.DataFrame, pd.DataFrame]:
-    """The sklearn nursery dataset."""
-
+    """Return the sklearn nursery dataset."""
    data = fetch_openml(data_id=26, as_frame=True)

    target_encoder = LabelEncoder()
@@ -182,12 +181,14 @@ def _nursery() -> Tuple[pd.DataFrame, pd.DataFrame]:
    return feature_dataframe, target_dataframe


-# Patched to support non-flattened images. Same behaviour as before except if called with
-# flatten=False explicitly.
def _images_to_ndarray(
    images_dir: str, number_to_load: int, label: int, flatten: bool = True
) -> Tuple[np.array, np.array]:
-    """Grab number_to_load images from the images_dir and create a np array and label array."""
+    """Get number_to_load images from the images_dir and create arrays.
+
+    Patched to support non-flattened images.
+    Same behaviour as before except if called with flatten=False explicitly.
+    """
    folder_path = images_dir + os.sep
    images_names = sorted(os.listdir(folder_path))
    images_names = images_names[:number_to_load]
@@ -208,12 +209,11 @@ def _images_to_ndarray(
def _medical_mnist_loader(  # pylint: disable = too-many-locals
    data_folder: str, n_per_class: int, classes: List[str]
) -> Tuple[pd.DataFrame, pd.DataFrame]:
-    """
-    Load Medical MNIST into pandas format
-    borrows heavily from: https://www.kaggle.com/harelshattenstein/medical-mnist-knn
+    """Get Medical MNIST into pandas format.
+
+    Borrows heavily from: https://www.kaggle.com/harelshattenstein/medical-mnist-knn
    Creates a binary classification.
    """
-
    base_folder = os.path.join(
        data_folder,
        "kaggle-medical-mnist",
@@ -272,11 +272,11 @@ def _medical_mnist_loader(  # pylint: disable = too-many-locals
def _synth_ae(
    data_folder: str, n_rows: int = 5000
) -> Tuple[pd.DataFrame, pd.DataFrame]:
-    """
+    """Get the synthetic A&E data.
+
    First n_rows (default 5000) rows from the Synthetic A&E data from NHS England
    https://data.england.nhs.uk/dataset/a-e-synthetic-data/resource/81b068e5-6501-4840-a880-a8e7aa56890e # pylint: disable=line-too-long.
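+
+    A minimal usage sketch (assuming the CSV has already been downloaded;
+    the folder name below is illustrative):
+
+    .. code-block:: python
+
+        # expects <data_folder>/AE_England_synthetic.csv to exist
+        X, y = _synth_ae("data", n_rows=1000)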
""" - file_path = os.path.join(data_folder, "AE_England_synthetic.csv") if not os.path.exists(file_path): @@ -328,11 +328,10 @@ def _synth_ae( def _indian_liver(data_folder: str) -> Tuple[pd.DataFrame, pd.DataFrame]: - """ - Indian Liver Patient Dataset + """Get Indian Liver Patient Dataset. + https://archive.ics.uci.edu/ml/machine-learning-databases/00225/Indian%20Liver%20Patient%20Dataset%20(ILPD).csv # pylint: disable=line-too-long. """ - # (https://archive.ics.uci.edu/ml/datasets/ILPD+(Indian+Liver+Patient+Dataset) file_path = os.path.join(data_folder, "Indian Liver Patient Dataset (ILPD).csv") if not os.path.exists(file_path): help_message = f""" @@ -373,11 +372,12 @@ def _indian_liver(data_folder: str) -> Tuple[pd.DataFrame, pd.DataFrame]: def _in_hospital_mortality(data_folder: str) -> Tuple[pd.DataFrame, pd.DataFrame]: + """Get In-hospital mortality data. + + See: https://datadryad.org/stash/dataset/doi:10.5061/dryad.0p2ngf1zd. """ - In-hospital mortality data from this study: - https://datadryad.org/stash/dataset/doi:10.5061/dryad.0p2ngf1zd. - """ - # Check the data has been downloaded. If not throw an exception with instructions on how to + # Check the data has been downloaded. + # If not, throw an exception with instructions on how to # download, and where to store files = ["data01.csv", "doi_10.5061_dryad.0p2ngf1zd__v5.zip"] file_path = [os.path.join(data_folder, f) for f in files] @@ -413,9 +413,9 @@ def _in_hospital_mortality(data_folder: str) -> Tuple[pd.DataFrame, pd.DataFrame def _mimic_iaccd(data_folder: str) -> Tuple[pd.DataFrame, pd.DataFrame]: - """Loads the mimic_iaccd data and performs pre-processing.""" - - # Check the data has been downloaded. If not throw an exception with instructions on how to + """Get the mimic_iaccd data and perform preprocessing.""" + # Check the data has been downloaded. + # If not throw an exception with instructions on how to # download, and where to store file_path = os.path.join(data_folder, "mimic2-iaccd", "1.0", "full_cohort_data.csv") print(file_path, os.path.exists(file_path)) @@ -468,6 +468,8 @@ def _mimic_iaccd(data_folder: str) -> Tuple[pd.DataFrame, pd.DataFrame]: def _RDMP( # pylint: disable=too-many-locals, too-many-statements data_folder: str, ) -> Tuple[pd.DataFrame, pd.DataFrame]: + """Get the RDMP dataset.""" + def find_age(row): date_ = pd.to_datetime("01/06/2020") if row.date_of_death != row.date_of_death: diff --git a/aisdc/safemodel/__init__.py b/aisdc/safemodel/__init__.py index 72dd2622..4dd28a47 100644 --- a/aisdc/safemodel/__init__.py +++ b/aisdc/safemodel/__init__.py @@ -1,3 +1,3 @@ -"""Empty file.""" +"""Collection of defensive wrappers for preserving the privacy of ML models.""" from .reporting import get_reporting_string diff --git a/aisdc/safemodel/classifiers/dp_svc.py b/aisdc/safemodel/classifiers/dp_svc.py index fbc518ac..f47e2f3e 100644 --- a/aisdc/safemodel/classifiers/dp_svc.py +++ b/aisdc/safemodel/classifiers/dp_svc.py @@ -1,8 +1,4 @@ -""" -Differentially private SVC -James Liley -21/03/22. -""" +"""Differentially private SVC.""" import logging from typing import Any @@ -22,35 +18,39 @@ class DPSVC: - """ - Wrapper for differentially private SVM, implemented according to the method in. + """Differentially private SVM. - https://arxiv.org/pdf/0911.5708.pdf + Implemented according to: https://arxiv.org/pdf/0911.5708.pdf. 
- Essentially approximates an infinite-dimensional latent space (and corresponding kernel) with - a finite dimensional latent space, and adds noise to the normal to the separating hyperplane - in this latent space. + Essentially approximates an infinite-dimensional latent space (and + corresponding kernel) with a finite dimensional latent space, and adds + noise to the normal to the separating hyperplane in this latent space. Only currently implemented for a radial basis kernel, but could be extended. More specifically - - draws a set of dhat random vectors from a probability measure induced by the Fourier - transform of the kernel function + - draws a set of dhat random vectors from a probability measure induced by + the Fourier transform of the kernel function - approximates the kernel with a 2*dhat dimensional latent space - computes the separating hyperplane in this latent space with normal w - - then adds Laplacian noise to w and returns it along with the map to the latent space. + - then adds Laplacian noise to w and returns it along with the map to the + latent space. - The SKlearn SVM (see https://scikit-learn.org/stable/modules/svm.html#mathematical-formulation) + The SKlearn SVM (see + https://scikit-learn.org/stable/modules/svm.html#mathematical-formulation) minimises the function (1/2) ||w||_2 + C sum(zeta_i) - where 1-zeta_i≤ y_i (w phi(x_i) + b), where phi maps x to the latent space and zeta_i ≥ 0. + where 1-zeta_i≤ y_i (w phi(x_i) + b), where phi maps x to the latent space + and zeta_i ≥ 0. + This is equivalent to minimising (1/2) ||w||_2 + C/n sum(l(y_i,f_w(x_i))) - where l(x,y)=n*max(0,1- x.y), which is n-Lipschitz continuous in y (given x is in {-1,1}) + where l(x,y)=n*max(0,1- x.y), which is n-Lipschitz continuous in y (given x + is in {-1,1}) """ def __init__(self, C=1.0, gamma="scale", dhat=1000, eps=10, **kwargs): @@ -86,10 +86,7 @@ def phi_hat_multi(self, input_features): return phi_hat def k_hat_svm(self, x, y=None): - """ - Define the version which is sent to sklearn.svm. AFAICT python/numpy - doesn't have an 'outer' for arbitrary functions. 
- """ + """Define the version which is sent to sklearn.svm.""" phi_hat_x = self.phi_hat_multi(x) if y is None: phi_hat_y = phi_hat_x @@ -99,7 +96,6 @@ def k_hat_svm(self, x, y=None): def fit(self, train_features: Any, train_labels: Any) -> None: """Fit the model.""" - # Check that the data passed is np.ndarray if not isinstance(train_features, np.ndarray) or not isinstance( train_labels, np.ndarray @@ -136,9 +132,8 @@ def fit(self, train_features: Any, train_labels: Any) -> None: local_logger.warning( "gamma value passed in was zero, set to %g", SMALL_NUMBER ) - self.dpsvc_gamma = 1.0 / np.sqrt( - 2.0 * self.gamma - ) # alternative parameterisation + # alternative parameterisation + self.dpsvc_gamma = 1.0 / np.sqrt(2.0 * self.gamma) local_logger.info( "Gamma = %f (dp parameterisation = %f)", self.gamma, self.dpsvc_gamma @@ -157,9 +152,8 @@ def fit(self, train_features: Any, train_labels: Any) -> None: self.svc.fit(gram_matrix, train_labels) # Get separating hyperplane and intercept - alpha = ( - self.svc.dual_coef_ - ) # alpha from solved dual, multiplied by labels (-1,1) + # alpha from solved dual, multiplied by labels (-1,1) + alpha = self.svc.dual_coef_ xi = train_features[self.svc.support_, :] # support vectors x_i weights = np.zeros(2 * self.dhat) for i in range(alpha.shape[1]): @@ -172,7 +166,8 @@ def fit(self, train_features: Any, train_labels: Any) -> None: 0, self.lambdaval, len(weights) ) - # Logistic transform for predict_proba (rough): generate predictions (DP) for training data + # Logistic transform for predict_proba (rough): generate predictions + # (DP) for training data ypredn = np.zeros(n_data) for i in range(n_data): ypredn[i] = ( @@ -181,9 +176,7 @@ def fit(self, train_features: Any, train_labels: Any) -> None: ) local_logger.info("Fitting Platt scaling") - self.platt_transform.fit( - ypredn.reshape(-1, 1), train_labels - ) # was called ptransform + self.platt_transform.fit(ypredn.reshape(-1, 1), train_labels) def set_params(self, **kwargs) -> None: """Set params.""" @@ -204,13 +197,13 @@ def _raw_outputs(self, test_features: Any) -> np.ndarray: return out def predict(self, test_features: Any) -> np.ndarray: - """Make predictions.""" + """Return the predictions.""" out = self._raw_outputs(test_features) out = 1 * (out > 0) - return out # Predictions + return out def predict_proba(self, test_features: Any) -> np.ndarray: - """Predictive probabilities.""" + """Return the predictive probabilities.""" out = self._raw_outputs(test_features) pred_probs = self.platt_transform.predict_proba(out.reshape(-1, 1)) return pred_probs diff --git a/aisdc/safemodel/classifiers/new_model_template.py b/aisdc/safemodel/classifiers/new_model_template.py index 4f97dce4..3a38cbc6 100644 --- a/aisdc/safemodel/classifiers/new_model_template.py +++ b/aisdc/safemodel/classifiers/new_model_template.py @@ -1,7 +1,8 @@ -"""This is a template for implementing supplementary models -Obviously we have invented an sklearn ensemble called ModelToMakeSafer -Replace this with details of the model you wish to create a wrapper for -and then remove the comment which disables the pylint warning. +"""Template for implementing supplementary models. + +Obviously we have invented an sklearn ensemble called ModelToMakeSafer. Replace +this with details of the model you wish to create a wrapper for and then remove +the comment which disables the pylint warning. 
""" # pylint: disable=duplicate-code @@ -23,7 +24,7 @@ def check_present( item: str, curr_separate: dict, saved_separate: dict ) -> tuple[str, bool]: - """Checks item is present in both dicts and reports suitably.""" + """Check item is present in both dicts and report suitably.""" disclosive = False msg = "" if curr_separate[item] == "Absent" and saved_separate[item] == "Absent": @@ -44,7 +45,7 @@ class SafeModelToMakeSafe(SafeModel, ModelToMakeSafer): """Privacy protected ModelToMakeSafer.""" def __init__(self, **kwargs: Any) -> None: - """Creates model and applies constraints to params.""" + """Create model and apply constraints to params.""" SafeModel.__init__(self) self.k_anonymity = 0 self.basemodel_paramnames = [ @@ -88,7 +89,8 @@ def __init__(self, **kwargs: Any) -> None: def additional_checks( # pylint: disable=too-many-nested-blocks,too-many-branches self, curr_separate: dict, saved_separate: dict ) -> tuple[str, str]: - """ModelToMakeSafer specific checks + """Perform model specific checks. + This example shows how to deal with instances of sklearn's tree class as base estimators in a forest (line 99) or as single estimators (lines 114-118). @@ -152,18 +154,18 @@ def additional_checks( # pylint: disable=too-many-nested-blocks,too-many-branch return msg, disclosive def fit(self, x: np.ndarray, y: np.ndarray) -> None: - """Do fit and then store model dict.""" + """Fit model and store model dict.""" super().fit(x, y) self.k_anonymity = self.get_k_anonymity(x) self.saved_model = copy.deepcopy(self.__dict__) def get_k_anonymity(self, x: np.ndarray) -> int: - """Calculates the k-anonymity of a random forest model - as the minimum of the anonymity for each record. + """Calculate the k-anonymity of a random forest model. + + The k-anonymity is the minimum of the anonymity for each record. That is defined as the size of the set of records which appear in the same leaf as the record in every tree. """ - # dataset must be 2-D assert len(x.shape) == 2 diff --git a/aisdc/safemodel/classifiers/safedecisiontreeclassifier.py b/aisdc/safemodel/classifiers/safedecisiontreeclassifier.py index 5095b2f9..c960341d 100644 --- a/aisdc/safemodel/classifiers/safedecisiontreeclassifier.py +++ b/aisdc/safemodel/classifiers/safedecisiontreeclassifier.py @@ -16,9 +16,7 @@ def decision_trees_are_equal( tree1: DecisionTreeClassifier, tree2: DecisionTreeClassifier ) -> tuple[bool, str]: - """Compares two estimators of type sklearn.tree - e.g. two decisionTreeClassifiers. - """ + """Compare two estimators of type sklearn.tree.""" msg = "" same = True @@ -36,7 +34,6 @@ def decision_trees_are_equal( msg += get_reporting_string( name="basic_params_differ", length=num_differences ) - # f"Warning: basic parameters differ in {len(match)} places:\n" for i in range(num_differences): if match[i][0] == "change": msg += f"parameter {match[i][1]} changed from {match[i][2][1]} " @@ -52,7 +49,6 @@ def decision_trees_are_equal( except BaseException as error: # pylint:disable=broad-except #pragma:no cover msg += get_reporting_string(name="unable_to_check", error=error) - # f"Unable to check as an exception occurred: {error}" same = False return same, msg @@ -61,10 +57,10 @@ def decision_trees_are_equal( def decision_tree_internal_trees_are_equal( tree1_tree: Any, tree2_tree: Any ) -> tuple[bool, str]: - """Tests for equality of the internal structures in a sklearn.tree._tree - e.g. the structure, feature and threshold in each internal node etc. - """ + """Test for equality of the internal structures in a sklearn.tree._tree. 
+    For example, the structure, feature and threshold in each internal node etc.
+    """
    same = True
    msg = ""
    tree_internal_att_names = (
@@ -102,28 +98,23 @@ def decision_tree_internal_trees_are_equal(
                    msg += get_reporting_string(
                        name="internal_attribute_differs", attr=attr
                    )
-                    # f"internal tree attribute {attr} differs\n"
                    same = False
            else:
                if t1val != t2val:
                    msg += get_reporting_string(
                        name="internal_attribute_differs", attr=attr
                    )
-                    # f"internal tree attribute {attr} differs\n"
                    same = False

    except BaseException as error:  # pylint:disable=broad-except #pragma:no cover
        msg += get_reporting_string(name="exception_occurred", error=error)
-        # f"An exception occurred: {error}"

    return same, msg


def get_tree_k_anonymity(thetree: DecisionTreeClassifier, X: Any) -> int:
-    """Returns the smallest number of data items in any leaf."""
+    """Return the smallest number of data items in any leaf."""
    leaves = thetree.apply(X)
    uniqs_counts = np.unique(leaves, return_counts=True)
    k_anonymity = np.min(uniqs_counts[1])
-    # print(f' leaf ids {uniqs_counts[0]} and counts {uniqs_counts[1]}'
-    #      f'the k-anonymity of the tree is {k_anonymity}')
    return k_anonymity


@@ -131,7 +122,7 @@ class SafeDecisionTreeClassifier(SafeModel, DecisionTreeClassifier):  # pylint:
    """Privacy protected Decision Tree classifier."""

    def __init__(self, **kwargs: Any) -> None:
-        """Creates model and applies constraints to params."""
+        """Create model and apply constraints to params."""
        SafeModel.__init__(self)
        self.basemodel_paramnames = [
            "criterion",
@@ -180,17 +171,12 @@ def additional_checks(
            disclosive = True
        if len(curr_separate) > 1:
            msg += get_reporting_string(name="unexpected_item")
-            # (
-            #    "unexpected item in curr_seperate dict "
-            #    " passed by generic additional checks."
-            # )
        return msg, disclosive

    def fit(  # pylint: disable=arguments-differ
        self, x: np.ndarray, y: np.ndarray
    ) -> None:
-        """Do fit and then store k-anonymity and model dict."""
+        """Fit model and store k-anonymity and model dict."""
        super().fit(x, y)
        # calculate k-anonymity here since we have the training data
        leaves = self.apply(x)
diff --git a/aisdc/safemodel/classifiers/safekeras.py b/aisdc/safemodel/classifiers/safekeras.py
index 7c7cdebc..ae449527 100644
--- a/aisdc/safemodel/classifiers/safekeras.py
+++ b/aisdc/safemodel/classifiers/safekeras.py
@@ -1,31 +1,19 @@
-"""Safekeras.py:
-Jim Smith, Andrew McCarty and Richard Preen
-UWE 2022.
-""" - -# general imports +"""Privacy protected Keras model.""" import os import warnings - -# import sys from typing import Any, Tuple import numpy as np - -# tensorflow imports import tensorflow as tf import tensorflow_privacy as tfp from dictdiffer import diff from tensorflow.keras import Model as KerasModel # pylint: disable = import-error from tensorflow_privacy import compute_dp_sgd_privacy -# safemodel superclass from ..reporting import get_reporting_string from ..safemodel import SafeModel -# suppress numerous deprecatino warnings -# shut tensorflow up warnings.filterwarnings("ignore", category=DeprecationWarning) warnings.filterwarnings("ignore", category=FutureWarning) @@ -43,7 +31,7 @@ def same_configs(m1: Any, m2: Any) -> Tuple[bool, str]: - """Checks if two models havethe same architecture.""" + """Check if two models have the same architecture.""" num_layers = len(m1.layers) if len(m2.layers) != num_layers: errstr = get_reporting_string(name="different_layer_count") @@ -76,7 +64,7 @@ def same_configs(m1: Any, m2: Any) -> Tuple[bool, str]: def same_weights(m1: Any, m2: Any) -> Tuple[bool, str]: - """Checks if two nets with same architecture havethe same weights.""" + """Check if two nets with same architecture have the same weights.""" num_layers = len(m1.layers) if num_layers != len(m2.layers): return False, "different numbers of layers" @@ -89,14 +77,14 @@ def same_weights(m1: Any, m2: Any) -> Tuple[bool, str]: for dim in range(len(m1layer)): # pylint: disable=consider-using-enumerate m1d = m1layer[dim] m2d = m2layer[dim] - # print(type(m1d), m1d.shape) if not np.array_equal(m1d, m2d): # pragma: no cover return False, f"dimension {dim} of layer {layer} differs" return True, "weights match" def check_checkpoint_equality(v1: str, v2: str) -> Tuple[bool, str]: - """Compares two checkpoints saved with tensorflow save_model + """Compare two checkpoints saved with tensorflow save_model. + On the assumption that the optimiser is not going to be saved, and that the model is going to be saved in frozen form this only checks the architecture and weights layer by layer. 
@@ -108,13 +96,11 @@ def check_checkpoint_equality(v1: str, v2: str) -> Tuple[bool, str]: model1 = tf.keras.models.load_model(v1) except Exception as e: # pylint:disable=broad-except msg = get_reporting_string(name="error_reloading_model_v1", e=e) - # f"Error re-loading model from {v1}: {e}" return False, msg try: model2 = tf.keras.models.load_model(v2) except Exception as e: # pylint:disable=broad-except msg = get_reporting_string(name="error_reloading_model_v2", e=e) - # f"Error re-loading model from {v2}: {e}" return False, msg same_config, config_message = same_configs(model1, model2) @@ -133,8 +119,7 @@ def check_checkpoint_equality(v1: str, v2: str) -> Tuple[bool, str]: def check_DP_used(optimizer) -> Tuple[bool, str]: - """Checks whether the DP optimizer was actually the one used.""" - + """Check whether the DP optimizer was actually the one used.""" key_needed = "_was_dp_gradients_called" critical_val = optimizer.__dict__.get(key_needed, "missing") @@ -151,17 +136,16 @@ def check_DP_used(optimizer) -> Tuple[bool, str]: # not currently reachable because optimizer class does # not support assignment # but leave in to future-proof - reason = get_reporting_string( - name="unrecognised_combination" - ) # pragma: no cover - DPused = False # pragma: no cover + reason = get_reporting_string(name="unrecognised_combination") + DPused = False return DPused, reason def check_optimizer_allowed(optimizer) -> Tuple[bool, str]: - """Checks if the model's optimizer is in our white-list - default setting is not allowed. + """Check if the model's optimizer is in our white-list. + + Default setting is not allowed. """ allowed = False opt_type = str(type(optimizer)) @@ -174,7 +158,7 @@ def check_optimizer_allowed(optimizer) -> Tuple[bool, str]: def check_optimizer_is_DP(optimizer) -> Tuple[bool, str]: - """Checks whether optimizer is one of tensorflow's DP versions.""" + """Check whether optimizer is one of tensorflow's DP versions.""" DPused = False reason = "None" if "_was_dp_gradients_called" not in optimizer.__dict__: @@ -186,8 +170,8 @@ def check_optimizer_is_DP(optimizer) -> Tuple[bool, str]: def load_safe_keras_model(name: str = "undefined") -> Tuple[bool, Any]: - """ - Reads model from file in appropriate format. + """Read model from file in appropriate format. + Optimizer is deliberately excluded in the save. This is to prevent possibility of restarting training, which could offer possible back door into attacks. 
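+
+    A minimal usage sketch (the file name is illustrative):
+
+    .. code-block:: python
+
+        ok, model = load_safe_keras_model("safekeras.tf")
+        if ok:
+            predictions = model.predict(X_test)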
@@ -201,9 +185,7 @@ def load_safe_keras_model(name: str = "undefined") -> Tuple[bool, Any]: elif model_load_file[-3:] == ".tf": # load from tf - the_model = tf.keras.models.load_model( - model_load_file # , custom_objects={"SafeKerasModel"} - ) + the_model = tf.keras.models.load_model(model_load_file) load = tf.keras.models.load_model(model_load_file, compile="False") the_model.set_weights(load.get_weights()) @@ -213,7 +195,6 @@ def load_safe_keras_model(name: str = "undefined") -> Tuple[bool, Any]: if the_model is not None: return (True, the_model) - # else return (False, msg) @@ -222,29 +203,20 @@ class SafeKerasModel(KerasModel, SafeModel): # pylint: disable=too-many-instance-attributes def __init__(self, *args: Any, **kwargs: Any) -> None: - """Creates model and applies constraints to params.""" - - # the_args = args + """Create model and apply constraints to params.""" the_kwargs = kwargs # initialise all the values that get provided as options to keras # and also l2 norm clipping and learning rates, batch sizes - ##inputs = kwargs.get("inputs","notFound") - ##if inputs=="notFound": - ## inputs = args[0] if len(args) == 3 else None inputs = None if "inputs" in kwargs.keys(): # pylint: disable=consider-iterating-dictionary inputs = the_kwargs["inputs"] elif len(args) == 3: # defaults is for Model(input,outputs,names) inputs = args[0] self.outputs = None - ##outputs = kwargs.get("outputs","notFound") - ##if outputs=="notFound": - ## outputs = args[1] if len(args) == 3 else None if "outputs" in kwargs.keys(): # pylint: disable=consider-iterating-dictionary outputs = the_kwargs["outputs"] elif len(args) == 3: - # self.outputs = args[1] outputs = args[1] # call the keras super class first as this comes first in chain @@ -291,7 +263,8 @@ def __init__(self, *args: Any, **kwargs: Any) -> None: def dp_epsilon_met( self, num_examples: int, batch_size: int = 0, epochs: int = 0 ) -> Tuple[bool, str]: - """Checks if epsilon is sufficient for Differential Privacy + """Check if epsilon is sufficient for Differential Privacy. + Provides feedback to user if epsilon is not sufficient. """ privacy = compute_dp_sgd_privacy( @@ -307,9 +280,7 @@ def dp_epsilon_met( def check_epsilon( self, num_samples: int, batch_size: int, epochs: int ) -> Tuple[bool, str]: - """Computes the level of privacy guarantee is within recommended limits, - and produces feedback". - """ + """Check if the level of privacy guarantee is within recommended limits.""" msg = "" ok = False if batch_size == 0: @@ -336,7 +307,8 @@ def check_epsilon( def compile( self, optimizer=None, loss="categorical_crossentropy", metrics=["accuracy"] ): # pylint:disable=dangerous-default-value) - """ + """Compile the safe Keras model. + Replaces the optimiser with a DP variant if needed and creates the necessary DP params in the opt and loss dict, then calls tf compile. Allow None as default value for optimizer param because we explicitly @@ -394,7 +366,8 @@ def fit( # pylint:disable=too-many-arguments batch_size: int, refine_epsilon: bool = False, ) -> Any: - """ + """Fit a safe Keras model. + Overrides the tensorflow fit() method with some extra functionality: (i) records number of samples for checking DP epsilon values. (ii) does an automatic epsilon check and reports. @@ -402,8 +375,6 @@ def fit( # pylint:disable=too-many-arguments (iii) then calls the tensorflow fit() function. (iv) finally makes a saved copy of the newly fitted model. 
""" - - # pylint can't cope that we first declared these via a dict :( self.num_samples = X.shape[0] # pylint: disable=attribute-defined-outside-init self.epochs = epochs # pylint: disable=attribute-defined-outside-init self.batch_size = batch_size @@ -438,11 +409,11 @@ def fit( # pylint:disable=too-many-arguments return returnval def posthoc_check(self, verbose: bool = True) -> Tuple[str, bool]: - """Checks whether model should be considered unsafe - for example, has been changed since fit() was last run, + """Check whether the model should be considered unsafe. + + For example, has been changed since fit() was last run, or does not meet DP policy. """ - disclosive = False msg = "" @@ -510,34 +481,25 @@ def posthoc_check(self, verbose: bool = True) -> Tuple[str, bool]: return msg, False def save(self, name: str = "undefined") -> None: - """Writes model to file in appropriate format. + """Write model to file in appropriate format. Parameters ---------- - name : string The name of the file to save - Returns - ------- - Notes ----- - - No return value - Optimizer is deliberately excluded. To prevent possible to restart training and thus possible back door into attacks. """ - self.model_save_file = name while self.model_save_file == "undefined": print(get_reporting_string(name="input_filename_with_extension")) return thename = self.model_save_file.split(".") - # print(f'in save(), parsed filename is {thename}') if len(thename) == 1: print(get_reporting_string(name="filename_must_indicate_type")) # "file name must indicate type as a suffix") @@ -550,7 +512,6 @@ def save(self, name: str = "undefined") -> None: self, self.model_save_file, include_optimizer=False, - # save_traces=False, save_format=suffix, ) # pragma:no cover @@ -560,12 +521,9 @@ def save(self, name: str = "undefined") -> None: name="error_saving_file", suffix=suffix, er=er ) ) - # f"saving as a {suffix} file gave this error message: {er}") else: print( get_reporting_string( name="suffix_not_supported_for_type", model_type=self.model_type ) ) - # f"{suffix} file suffix not supported " - # f"for models of type {self.model_type}.\n" diff --git a/aisdc/safemodel/classifiers/saferandomforestclassifier.py b/aisdc/safemodel/classifiers/saferandomforestclassifier.py index 567cd07e..0934830d 100644 --- a/aisdc/safemodel/classifiers/saferandomforestclassifier.py +++ b/aisdc/safemodel/classifiers/saferandomforestclassifier.py @@ -19,7 +19,7 @@ class SafeRandomForestClassifier(SafeModel, RandomForestClassifier): """Privacy protected Random Forest classifier.""" def __init__(self, **kwargs: Any) -> None: - """Creates model and applies constraints to params.""" + """Create model and apply constraints to params.""" SafeModel.__init__(self) self.basemodel_paramnames = [ "n_estimators", @@ -61,9 +61,9 @@ def __init__(self, **kwargs: Any) -> None: def additional_checks( # pylint: disable=too-many-nested-blocks self, curr_separate: dict, saved_separate: dict ) -> tuple[str, str]: - """Random Forest-specific checks - would benefit from refactoring into simpler blocks perhaps. - NOTE that this is never called if the model has not been fitted. + """Perform Random Forest specific checks. + + NOTE: this is never called if the model has not been fitted. 
""" msg = "" disclosive = False @@ -72,7 +72,6 @@ def additional_checks( # pylint: disable=too-many-nested-blocks # template for class of things that make up forest if item == "estimator": if type(curr_separate[item]) != type(saved_separate[item]): - # msg += get_reporting_string(name="basic_params_differ",length=1) msg += get_reporting_string( name="param_changed_from_to", key="estimator", @@ -117,18 +116,18 @@ def additional_checks( # pylint: disable=too-many-nested-blocks # pylint: disable=arguments-differ def fit(self, x: np.ndarray, y: np.ndarray) -> None: - """Do fit and then store model dict.""" + """Fit model and store model dict.""" super().fit(x, y) self.k_anonymity = self.get_k_anonymity(x) self.saved_model = copy.deepcopy(self.__dict__) def get_k_anonymity(self, x: np.ndarray) -> int: - """Calculates the k-anonymity of a random forest model - as the minimum of the anonymity for each record. + """Calculate the k-anonymity of a random forest model. + + The k-anonymity is the minimum of the anonymity for each record. That is defined as the size of the set of records which appear in the same leaf as the record in every tree. """ - # dataset must be 2-D assert len(x.shape) == 2 diff --git a/aisdc/safemodel/classifiers/safesvc.py b/aisdc/safemodel/classifiers/safesvc.py index 6f74eea8..bac81a0e 100644 --- a/aisdc/safemodel/classifiers/safesvc.py +++ b/aisdc/safemodel/classifiers/safesvc.py @@ -15,7 +15,7 @@ class SafeSVC(SafeModel, DPSVC): """Privacy protected Support Vector Classifier.""" def __init__(self, C=1.0, gamma="scale", dhat=1000, eps=10, **kwargs) -> None: - """Initialises a differentially private SVC.""" + """Initialise a differentially private SVC.""" SafeModel.__init__(self) DPSVC.__init__(self, C=C, gamma=gamma, dhat=dhat, eps=eps, **kwargs) self.model_type: str = "SVC" @@ -32,7 +32,7 @@ def __init__(self, C=1.0, gamma="scale", dhat=1000, eps=10, **kwargs) -> None: self.examine_seperately_items = ["platt_transform", "svc"] def fit(self, train_features: np.ndarray, train_labels: np.ndarray) -> None: - """Do fit and then store model dict.""" + """Fit model and store model dict.""" super().fit(train_features, train_labels) self.saved_model = copy.deepcopy(self.__dict__) @@ -49,10 +49,8 @@ def additional_checks( if len(diffs_list) > 0: disclosive = True if len(diffs_list) == 1: - msg += f"structure {item} has one difference.\n" #: {diffs_list}" + msg += f"structure {item} has one difference.\n" else: - msg += ( - f"structure {item} has several differences.\n" #: {diffs_list}" - ) + msg += f"structure {item} has several differences.\n" return msg, disclosive diff --git a/aisdc/safemodel/classifiers/safetf.py b/aisdc/safemodel/classifiers/safetf.py index 3b685e51..d8776f60 100644 --- a/aisdc/safemodel/classifiers/safetf.py +++ b/aisdc/safemodel/classifiers/safetf.py @@ -1,7 +1,4 @@ -"""Work in progress to allow use of the DPModel classes -Jim smith 2022 -When ready, linting of the imports will be enabled. 
-""" +"""Privacy protected TensorFlow model.""" # pylint: disable=unused-import from typing import Any @@ -9,8 +6,6 @@ import tensorflow as tf import tensorflow_privacy as tf_privacy from tensorflow_privacy import DPModel - -# from tensorflow.keras import Model as KerasModel from tensorflow_privacy.privacy.analysis import compute_dp_sgd_privacy from tensorflow_privacy.privacy.optimizers import dp_optimizer_keras @@ -20,7 +15,6 @@ class Safe_tf_DPModel(SafeModel, DPModel): """Privacy Protected tensorflow_privacy DP-SGD subclass of Keras model.""" - # remove comment once model starts to be populated # pylint:disable=super-init-not-called def __init__( self, @@ -30,9 +24,5 @@ def __init__( *args: any, **kwargs: any, ) -> None: - """Creates model and applies constraints to parameters.""" - # safemodel.__init__(self) - # DPModel.__init__(self, **kwargs) - # self.model_type: str = "tf_DPModel" - # super().preliminary_check(apply_constraints=True, verbose=True) + """Create model and apply constraints to parameters.""" raise NotImplementedError diff --git a/aisdc/safemodel/reporting.py b/aisdc/safemodel/reporting.py index d6351a52..f182e276 100644 --- a/aisdc/safemodel/reporting.py +++ b/aisdc/safemodel/reporting.py @@ -2,11 +2,10 @@ def get_reporting_string(**kwargs): - """Returns a standard formatted string from a diction of f-strings. + """Return a standard formatted string from a dictionary of f-strings. Parameters ---------- - name : string The dictionary key and the name of the string to return. all-the-keywords : Any Type @@ -14,13 +13,11 @@ def get_reporting_string(**kwargs): Returns ------- - msg : string A standard message string. Notes ----- - Sometimes an f-string has no parameters. Sometimes there are multiple parameters embedded in the f-string. """ diff --git a/aisdc/safemodel/safemodel.py b/aisdc/safemodel/safemodel.py index 0a508f7c..718f777f 100644 --- a/aisdc/safemodel/safemodel.py +++ b/aisdc/safemodel/safemodel.py @@ -1,4 +1,4 @@ -"""This module contains prototypes of privacy safe model wrappers.""" +"""Prototypes of privacy safe model wrappers.""" from __future__ import annotations @@ -28,11 +28,10 @@ def check_min(key: str, val: Any, cur_val: Any) -> tuple[str, bool]: - """Checks minimum value constraint. + """Check minimum value constraint. Parameters ---------- - key : string The dictionary key to examine. val : Any Type @@ -43,14 +42,10 @@ def check_min(key: str, val: Any, cur_val: Any) -> tuple[str, bool]: Returns ------- - msg : string A message string. disclosive : bool A boolean value indicating whether the model is potentially disclosive. - - Notes - ----- """ if isinstance(cur_val, (int, float)): if cur_val < val: @@ -71,11 +66,10 @@ def check_min(key: str, val: Any, cur_val: Any) -> tuple[str, bool]: def check_max(key: str, val: Any, cur_val: Any) -> tuple[str, bool]: - """Checks maximum value constraint. + """Check maximum value constraint. Parameters ---------- - key : string The dictionary key to examine. val : Any Type @@ -85,14 +79,10 @@ def check_max(key: str, val: Any, cur_val: Any) -> tuple[str, bool]: Returns ------- - msg : string A message string. disclosive : bool A boolean value indicating whether the model is potentially disclosive. - - Notes - ----- """ if isinstance(cur_val, (int, float)): if cur_val > val: @@ -113,11 +103,10 @@ def check_max(key: str, val: Any, cur_val: Any) -> tuple[str, bool]: def check_equal(key: str, val: Any, cur_val: Any) -> tuple[str, bool]: - """Checks equality value constraint. + """Check equality value constraint. 
Parameters ---------- - key : string The dictionary key to examine. val : Any Type @@ -127,14 +116,10 @@ def check_equal(key: str, val: Any, cur_val: Any) -> tuple[str, bool]: Returns ------- - msg : string A message string. disclosive : bool A boolean value indicating whether the model is potentially disclosive. - - Notes - ----- """ if cur_val != val: disclosive = True @@ -148,11 +133,10 @@ def check_equal(key: str, val: Any, cur_val: Any) -> tuple[str, bool]: def check_type(key: str, val: Any, cur_val: Any) -> tuple[str, bool]: - """Checks the type of a value. + """Check the type of a value. Parameters ---------- - key : string The dictionary key to examine. val : Any Type @@ -162,14 +146,10 @@ def check_type(key: str, val: Any, cur_val: Any) -> tuple[str, bool]: Returns ------- - msg : string A message string. disclosive : bool A boolean value indicating whether the model is potentially disclosive. - - Notes - ----- """ if type(cur_val).__name__ != val: disclosive = True @@ -187,7 +167,6 @@ class SafeModel: # pylint: disable = too-many-instance-attributes Attributes ---------- - model_type : string A string describing the type of model. Default is "None". model: @@ -205,16 +184,17 @@ class SafeModel: # pylint: disable = too-many-instance-attributes researcher : string The researcher user-id used for logging - Notes - ----- - Examples -------- >>> safeRFModel = SafeRandomForestClassifier() >>> safeRFModel.fit(X, y) >>> safeRFModel.save(name="safe.pkl") >>> safeRFModel.preliminary_check() - >>> safeRFModel.request_release(path="safe", ext="pkl", target=target) + >>> safeRFModel.request_release( + ... path="safe", + ... ext="pkl", + ... target=target, + ... ) WARNING: model parameters may present a disclosure risk: - parameter min_samples_leaf = 1 identified as less than the recommended min value of 5. Changed parameter min_samples_leaf = 5. @@ -241,9 +221,7 @@ def __init__(self) -> None: self.researcher = "unknown" def get_params(self, deep=True): - """Gets dictionary of parameter values - restricted to those expected by base classifier. - """ + """Get a dictionary of parameter values restricted to those expected.""" the_params = {} for key, val in self.__dict__.items(): if key in self.basemodel_paramnames: @@ -253,30 +231,22 @@ def get_params(self, deep=True): return the_params def save(self, name: str = "undefined") -> None: - """Writes model to file in appropriate format. + """Write model to file in appropriate format. Note this is overloaded in SafeKerasClassifer to deal with tensorflow specifics. Parameters ---------- - name : string The name of the file to save - Returns - ------- - Notes ----- - - No return value - Optimizer is deliberately excluded. To prevent possible to restart training and thus possible back door into attacks. """ - self.model_save_file = name if self.model_save_file == "undefined": print("You must input a name with extension to save the model.") @@ -312,66 +282,14 @@ def save(self, name: str = "undefined") -> None: f"for models of type {self.model_type}." 
f"Error message was {type_err}" ) - # Overloaded in safekeras - # elif suffix in ("h5", "tf") and self.model_type == "KerasModel": - # try: - # tf.keras.models.save_model( - # self, - # self.model_save_file, - # include_optimizer=False, - # # save_traces=False, - # save_format=suffix, - # ) - - # except (ImportError, NotImplementedError) as exception_err: - # print( - # "saving as a {suffix} file gave this error message:" - # f"{exception_err}" - # ) else: print( f"{suffix} file suffix currently not supported " f"for models of type {self.model_type}.\n" ) - ## Load functionality not needed - # - provide directly by underlying pickle/joblib mechanisms - # and safekeras provides its own to deal with tensorflow - - # def load(self, name: str = "undefined") -> None: - # """reads model from file in appropriate format. - # Note that safekeras overloads this function. - - # Optimizer is deliberately excluded in the save - # To prevent possible to restart training and thus - # possible back door into attacks. - # Thus optimizer cannot be loaded. - # """ - # temp_file=None - # self.model_load_file = name - # if self.model_load_file == "undefined": - # print("You must input a file name with extension to load a model.") - # else: - # thename = self.model_save_file.split(".") - # suffix = self.model_save_file.split(".")[-1] - - # if suffix == ".pkl": # load from pickle - # with open(self.model_load_file, "rb") as file: - # temp_file = pickle.load(self, file) - # elif suffix == ".sav": # load from joblib - # temp_file = joblib.load(self, self.model_save_file) - # #safekeras overloads loads - # elif suffix in ("h5","tf") and self.model_type != "KerasModel": - # print("tensorflow objects saved as h5 or tf" - # "can only be loaded into models of type SafeKerasClassifier" - # ) - # else: - # print(f"loading from a {suffix} file is currently not supported") - - # return temp_file - def __get_constraints(self) -> dict: - """Gets constraints relevant to the model type from the master read-only file.""" + """Get constraints relevant to the model type from the a read-only file.""" rules: dict = {} rule_path = pathlib.Path(__file__).with_name("rules.json") with open(rule_path, encoding="utf-8") as json_file: @@ -382,7 +300,7 @@ def __get_constraints(self) -> dict: def __apply_constraints( self, operator: str, key: str, val: Any, cur_val: Any ) -> str: - """Applies a safe rule for a given parameter.""" + """Apply a safe rule for a given parameter.""" if operator == "is_type": if (val == "int") and (type(cur_val).__name__ == "float"): self.__dict__[key] = int(self.__dict__[key]) @@ -402,7 +320,8 @@ def __apply_constraints( def __check_model_param( self, rule: dict, apply_constraints: bool ) -> tuple[str, bool]: - """Checks whether a current model parameter violates a safe rule. + """Check whether a current model parameter violates a safe rule. + Optionally fixes violations. """ disclosive: bool = False @@ -430,7 +349,8 @@ def __check_model_param( def __check_model_param_and( self, rule: dict, apply_constraints: bool ) -> tuple[str, bool]: - """Checks whether current model parameters violate a logical AND rule. + """Check whether current model parameters violate a logical AND rule. + Optionally fixes violations. 
""" disclosive: bool = False @@ -443,7 +363,7 @@ def __check_model_param_and( return msg, disclosive def __check_model_param_or(self, rule: dict) -> tuple[str, bool]: - """Checks whether current model parameters violate a logical OR rule.""" + """Check whether current model parameters violate a logical OR rule.""" disclosive: bool = True msg: str = "" for arg in rule["subexpr"]: @@ -456,12 +376,12 @@ def __check_model_param_or(self, rule: dict) -> tuple[str, bool]: def preliminary_check( self, verbose: bool = True, apply_constraints: bool = False ) -> tuple[str, bool]: - """Checks whether current model parameters violate the safe rules. + """Check whether current model parameters violate the safe rules. + Optionally fixes violations. Parameters ---------- - verbose : bool A boolean value to determine increased output level. @@ -471,15 +391,11 @@ def preliminary_check( Returns ------- - msg : string A message string disclosive : bool A boolean value indicating whether the model is potentially disclosive. - - Notes - ----- """ disclosive: bool = False msg: str = "" @@ -510,16 +426,13 @@ def preliminary_check( return msg, disclosive def get_current_and_saved_models(self) -> tuple[dict, dict]: - """Makes a copy of self.__dict__ - and splits it into dicts for the current and saved versions. - """ + """Copy self.__dict__ and split into dicts for current and saved versions.""" current_model = {} attribute_names_as_list = copy.copy(list(self.__dict__.keys())) for key in attribute_names_as_list: if key not in self.ignore_items: - # logger.debug(f'copying {key}') try: value = self.__dict__[key] # jim added current_model[key] = copy.deepcopy(value) @@ -547,10 +460,7 @@ def get_current_and_saved_models(self) -> tuple[dict, dict]: def examine_seperate_items( self, curr_vals: dict, saved_vals: dict ) -> tuple[str, bool]: - """Comparison of more complex structures - in the super class we just check these model-specific items exist - in both current and saved copies. - """ + """Check model-specific items exist in both current and saved copies.""" msg = "" disclosive = False @@ -573,8 +483,7 @@ def examine_seperate_items( return msg, disclosive def posthoc_check(self) -> tuple[str, bool]: - """Checks whether model has been interfered with since fit() was last run.""" - + """Check whether model has been interfered with since fit() was last run.""" disclosive = False msg = "" @@ -629,19 +538,18 @@ def posthoc_check(self) -> tuple[str, bool]: def additional_checks( self, curr_separate: dict, saved_separate: dict ) -> tuple[str, bool]: - """Placeholder function for additional posthoc checks e.g. keras this + """Perform additional posthoc checks. + + Placeholder function for additional posthoc checks e.g. keras this version just checks that any lists have the same contents. Parameters ---------- - curr_separate : python dictionary - saved_separate : python dictionary Returns ------- - msg : string A message string disclosive : bool @@ -649,11 +557,9 @@ def additional_checks( Notes ----- - posthoc checking makes sure that the two dicts have the same set of keys as defined in the list self.examine_separately """ - msg = "" disclosive = False for item in self.examine_seperately_items: @@ -679,8 +585,7 @@ def additional_checks( return msg, disclosive def request_release(self, path: str, ext: str, target: Target = None) -> None: - """Saves model to filename specified and creates a report for the TRE - output checkers. + """Save model and create a report for the TRE output checkers. 
Parameters ---------- @@ -737,7 +642,7 @@ def run_attack( output_dir: str = "RES", report_name: str = "undefined", ) -> dict: - """Runs a specified attack on the trained model and saves a report to file. + """Run a specified attack on the trained model and save report to file. Parameters ---------- @@ -800,7 +705,5 @@ def run_attack( return metadata def __str__(self) -> str: # pragma: no cover - """Returns string with model description. - No point writing a test, especially as it depends on username. - """ + """Return string with model description.""" return self.model_type + " with parameters: " + str(self.__dict__) diff --git a/docs/source/conf.py b/docs/source/conf.py index 6ad7fdf9..0ba94786 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -1,5 +1,5 @@ -# Configuration file for the Sphinx documentation builder. -# +"""Configuration file for the Sphinx documentation builder.""" + # -- Path setup -------------------------------------------------------------- import os diff --git a/examples/MIAandAIA_attacks_example.py b/examples/MIAandAIA_attacks_example.py index 11632153..0af3eb0b 100644 --- a/examples/MIAandAIA_attacks_example.py +++ b/examples/MIAandAIA_attacks_example.py @@ -1,14 +1,9 @@ -"""Examples for running multiple attacks including the 'Membership Inferene Attack' -and the "Attribute Inference Attack" with a single configuration file have -multiple configurations. +"""Examples for running multiple attacks. -In the code, [Researcher] and [TRE] are used in comments to denote which bit is done by whom +Includes the Membership Inference Attack and the Attribute Inference Attack +with a single configuration file have multiple configurations. -Running -------- - -Invoke this code from the root AI-SDC folder with -python -m examples.MIAandAIA_attacks_example +Below, [Researcher] and [TRE] are used to denote which task is performed by whom. """ import json diff --git a/examples/attribute_inference_example.py b/examples/attribute_inference_example.py index a6ab8c52..fd571451 100644 --- a/examples/attribute_inference_example.py +++ b/examples/attribute_inference_example.py @@ -1,12 +1,4 @@ -""" -Example demonstrating the attribute inference attacks. - -Running -------- - -Invoke this code from the root AI-SDC folder with -python -m examples.attribute_inference_example -""" +"""Example demonstrating the attribute inference attacks.""" import json import os diff --git a/examples/lira_attack_example.py b/examples/lira_attack_example.py index 3da263d7..42430384 100644 --- a/examples/lira_attack_example.py +++ b/examples/lira_attack_example.py @@ -1,8 +1,9 @@ """Examples for using the likelihood ratio attack code. -This code simulates a MIA attack providing the attacker with as much information as possible. -i.e. they have a subset of rows that they _know_ were used for training. And a subset that they -know were not. They also have query access to the target model. +This code simulates a MIA attack providing the attacker with as much +information as possible. That is, they have a subset of rows that they _know_ +were used for training. And a subset that they know were not. They also have +query access to the target model. The attack proceeds as described in this paper: https://arxiv.org/pdf/2112.03570.pdf @@ -22,11 +23,7 @@ in the previous two steps, as well as specifications for the shadow models. 5. The attack is run with a command line command, creating a report. 
-Running -------- - -Invoke this code from the root AI-SDC folder with -python -m examples.lira_attack_example +Below, [Researcher] and [TRE] are used to denote which task is performed by whom. """ # pylint: disable = duplicate-code diff --git a/examples/safemodel_attack_integration_bothcalls.py b/examples/safemodel_attack_integration_bothcalls.py index 0ec4a808..58d43790 100644 --- a/examples/safemodel_attack_integration_bothcalls.py +++ b/examples/safemodel_attack_integration_bothcalls.py @@ -1,7 +1,4 @@ -"""Workimg on how to integrate attacks into safemosdel classes -Invoke this code from the root AI-SDC folder with -python -m examples.safemodel_attack_integration_bothcalls. -""" +"""Example showing how to integrate attacks into safemodel classes.""" import logging diff --git a/examples/worst_case_attack_example.py b/examples/worst_case_attack_example.py index aca768dc..abcf7589 100644 --- a/examples/worst_case_attack_example.py +++ b/examples/worst_case_attack_example.py @@ -1,23 +1,19 @@ """Examples for using the 'worst case' attack code. -This code simulates a MIA attack providing the attacker with as much information as possible. -i.e. they have a subset of rows that they _know_ were used for training. And a subset that they -know were not. They also have query access to the target model. +This code simulates a MIA attack providing the attacker with as much +information as possible. That is, they have a subset of rows that they _know_ +were used for training. And a subset that they know were not. They also have +query access to the target model. -They pass the training and non-training rows through the target model to get the predictive -probabilities. These are then used to train an _attack model_. And the attack model is evaluated -to see how well it can predict whether or not other examples were in the training set or not. +They pass the training and non-training rows through the target model to get +the predictive probabilities. These are then used to train an _attack model_. +And the attack model is evaluated to see how well it can predict whether or not +other examples were in the training set or not. -The code can be called from the command line, or accessed programmatically. Examples of both -are shown below. +The code can be called from the command line, or accessed programmatically. +Examples of both are shown below. -In the code, [Researcher] and [TRE] are used in comments to denote which bit is done by whom - -Running -------- - -Invoke this code from the root AI-SDC folder with -python -m examples.worst_case_attack_example +Below, [Researcher] and [TRE] are used to denote which task is performed by whom. """ import json diff --git a/pyproject.toml b/pyproject.toml index 1075166c..77f6499a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -35,6 +35,7 @@ max-module-lines = 1000 # Maximum number of lines in a module. 
indent-width = 4 line-length = 88 target-version = "py39" +extend-include = ["*.ipynb"] lint.select = [ # "ANN", # flake8-annotations @@ -42,7 +43,7 @@ lint.select = [ # "B", # flake8-bugbear # "C4", # flake8-comprehensions # "C90", # mccabe -# "D", # pydocstyle + "D", # pydocstyle # "DTZ", # flake8-datetimez # "E", # pycodestyle # "EM", # flake8-errmsg @@ -72,5 +73,15 @@ lint.select = [ "YTT", # flake8-2020 ] -[tool.ruff.per-file-ignores] +lint.ignore = [ +] + +[tool.ruff.lint.pydocstyle] +convention = "numpy" + +[tool.ruff.format] +docstring-code-format = true +docstring-code-line-length = 80 + +[tool.ruff.lint.extend-per-file-ignores] "tests/**/*" = ["S101"] diff --git a/tests/__init__.py b/tests/__init__.py index e69de29b..8980c6b8 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -0,0 +1 @@ +"""Tests for the aisdc package.""" diff --git a/tests/attacks/__init__.py b/tests/attacks/__init__.py index e69de29b..bdf12e0b 100644 --- a/tests/attacks/__init__.py +++ b/tests/attacks/__init__.py @@ -0,0 +1 @@ +"""Tests for the attacks package.""" diff --git a/tests/attacks/test_attack_report_formatter.py b/tests/attacks/test_attack_report_formatter.py index 1773cca9..292e4ec3 100644 --- a/tests/attacks/test_attack_report_formatter.py +++ b/tests/attacks/test_attack_report_formatter.py @@ -70,7 +70,7 @@ class TestGenerateReport(unittest.TestCase): """Class which tests the attack_report_formatter.py file.""" def process_json_from_file(self, json_formatted): - """Function which handles file input/output from the process_json function.""" + """Handle file input/output from the process_json function.""" filename = "test.json" output_filename = "results.txt" @@ -278,7 +278,6 @@ def test_instance_based(self): def test_min_samples_leaf(self): """Test process_json when the target model includes decision trees.""" - # test when min_samples_leaf > 5 json_formatted = get_test_report() diff --git a/tests/attacks/test_lira_attack.py b/tests/attacks/test_lira_attack.py index 931d63b6..2f119592 100644 --- a/tests/attacks/test_lira_attack.py +++ b/tests/attacks/test_lira_attack.py @@ -23,7 +23,7 @@ @pytest.fixture(name="dummy_classifier_setup") def fixture_dummy_classifier_setup(): - """Setup common things for DummyClassifier.""" + """Set up common things for DummyClassifier.""" dummy = DummyClassifier() X = np.array([[0.2, 0.8], [0.7, 0.3]]) return dummy, X @@ -46,7 +46,7 @@ def test_predict(dummy_classifier_setup): @pytest.fixture(name="lira_classifier_setup") def fixture_lira_classifier_setup(): - """Setup common things for LiRA.""" + """Set up common things for LiRA.""" X, y = load_breast_cancer(return_X_y=True, as_frame=False) train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.3) target_model = RandomForestClassifier( diff --git a/tests/attacks/test_metrics.py b/tests/attacks/test_metrics.py index 8f69bf3d..11a19741 100644 --- a/tests/attacks/test_metrics.py +++ b/tests/attacks/test_metrics.py @@ -85,7 +85,6 @@ def test_permute_rows_errors(self): def test_permute_rows_with_permute_rows(self): """Test permute_rows = True succeeds.""" - clf = DummyClassifier() testX = np.zeros((4, 2)) testY = np.zeros((4, 2)) @@ -134,7 +133,6 @@ def test_metrics(self): def test_mia_extremecase(self): """Test the extreme case mia in metrics.py.""" - # create actual values y = np.zeros(50000) y[:25] = 1 diff --git a/tests/attacks/test_structural_attack.py b/tests/attacks/test_structural_attack.py index 263a061d..00c5f877 100644 --- a/tests/attacks/test_structural_attack.py +++ 
b/tests/attacks/test_structural_attack.py @@ -20,8 +20,7 @@ def get_target(modeltype: str, **kwparams: dict) -> Target: - """Loads dataset and creates target of the desired type.""" - + """Load dataset and create target of the desired type.""" X, y = load_breast_cancer(return_X_y=True, as_frame=False) train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.3) @@ -53,7 +52,7 @@ def get_target(modeltype: str, **kwparams: dict) -> Target: def test_unnecessary_risk(): - """Checking the unnecessary rules.""" + """Check the unnecessary rules.""" # non-tree we have no evidence yet model = SVC() assert sa.get_unnecessary_risk(model) == 0, "no risk without evidence" @@ -169,7 +168,6 @@ def test_non_trees(): def test_dt(): """Test for decision tree classifier.""" - # 'non' disclosive' param_dict = {"max_depth": 1, "min_samples_leaf": 150} target = get_target("dt", **param_dict) @@ -203,7 +201,6 @@ def test_dt(): def test_adaboost(): """Test for adaboost classifier.""" - # 'non' disclosive' # - base estimator =None => DecisionTreeClassifier with max_depth 1 # also set THRESHOLD to 4 @@ -240,7 +237,6 @@ def test_adaboost(): def test_rf(): """Test for random forest classifier.""" - # 'non' disclosive' param_dict = {"max_depth": 1, "min_samples_leaf": 150, "n_estimators": 10} target = get_target("rf", **param_dict) diff --git a/tests/attacks/test_worst_case_attack.py b/tests/attacks/test_worst_case_attack.py index 605fcde2..8b09651f 100644 --- a/tests/attacks/test_worst_case_attack.py +++ b/tests/attacks/test_worst_case_attack.py @@ -111,11 +111,11 @@ def test_report_worstcase(): output_dir="test_output_worstcase", ) attack_obj.attack(target) - # attack_obj.make_dummy_data() cause exception when used like this! _ = attack_obj.make_report() # with one rep attack_obj = worst_case_attack.WorstCaseAttack( + reproduce_split=[5, 5], n_reps=1, n_dummy_reps=1, p_thresh=0.05, @@ -162,7 +162,6 @@ def test_attack_with_correct_feature(): def test_attack_from_predictions(): """Checks code that runs attacks from predictions.""" - X, y = load_breast_cancer(return_X_y=True, as_frame=False) train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.3) @@ -197,7 +196,6 @@ def test_attack_from_predictions(): def test_attack_from_predictions_no_dummy(): """Checks code that runs attacks from predictions.""" - X, y = load_breast_cancer(return_X_y=True, as_frame=False) train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.3) @@ -271,6 +269,7 @@ def test_attack_data_prep(): def test_attack_data_prep_with_correct_feature(): """Test the method that prepares the attack data. + This time, testing that the model correctness values are added, are always the final feature, and are not included in the sorting. """ @@ -303,13 +302,13 @@ def test_attack_data_prep_with_correct_feature(): def test_non_rf_mia(): - """Tests that it is possible to set the attack model via the args + """Test that it is possible to set the attack model via the args. + In this case, we set it as an SVC. But we set probability to false. If the code does indeed try and use the SVC (as we want) it will fail as it will try and access the predict_proba which won't work if probability=False. Hence, if the code throws an AttributeError we know it *is* trying to use the SVC.
""" - X, y = load_breast_cancer(return_X_y=True, as_frame=False) train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.3) @@ -331,7 +330,6 @@ def test_non_rf_mia(): def test_main(): """Test invocation via command line.""" - # option 1 testargs = ["prog", "make-dummy-data"] with patch.object(sys, "argv", testargs): diff --git a/tests/conftest.py b/tests/conftest.py index c67a5528..9ecfb436 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -2,6 +2,7 @@ import os import shutil +from datetime import date import numpy as np import pytest @@ -42,7 +43,6 @@ files = [ "1024-WorstCase.png", "2048-WorstCase.png", - "ATTACK_RESULTS09_06_2024.json", "attack.txt", "config.json", "config_structural_test.json", @@ -75,11 +75,17 @@ def _cleanup(): """Remove created files and directories.""" yield + for folder in folders: try: shutil.rmtree(folder) except Exception: # pylint: disable=broad-exception-caught pass + + files.append( # from attack_report_formater.py + "ATTACK_RESULTS" + str(date.today().strftime("%d_%m_%Y")) + ".json" + ) + for file in files: try: os.remove(file) diff --git a/tests/preprocessing/__init__.py b/tests/preprocessing/__init__.py index e69de29b..1ba534e4 100644 --- a/tests/preprocessing/__init__.py +++ b/tests/preprocessing/__init__.py @@ -0,0 +1 @@ +"""Tests for the preprocessing package.""" diff --git a/tests/preprocessing/test_loaders.py b/tests/preprocessing/test_loaders.py index 28450c4d..1fde7ae6 100644 --- a/tests/preprocessing/test_loaders.py +++ b/tests/preprocessing/test_loaders.py @@ -34,7 +34,8 @@ def test_get_sklearn_dataset(): - """Test ability to load some standard datasets + """Test ability to load some standard datasets. + These loaders only return binary versions. """ # test preprocessing with iris for speed @@ -121,10 +122,7 @@ def test_mimic(): def test_in_hospital(): - """Tests loading the in hospital mortality data - in two different ways. 
- """ - + """Tests loading the in hospital mortality data in two different ways.""" zip_file_name = os.path.join(DATA_FOLDER, "doi_10.5061_dryad.0p2ngf1zd__v5.zip") new_file_name = os.path.join(DATA_FOLDER, "doi_10.5061_dryad.0p2ngf1zd__v5.renamed") # first attempt reads from zip file diff --git a/tests/safemodel/__init__.py b/tests/safemodel/__init__.py index e69de29b..d1d33185 100644 --- a/tests/safemodel/__init__.py +++ b/tests/safemodel/__init__.py @@ -0,0 +1 @@ +"""Tests for the safemodel package.""" diff --git a/tests/safemodel/test_attacks.py b/tests/safemodel/test_attacks.py index 12c296d1..18fef622 100644 --- a/tests/safemodel/test_attacks.py +++ b/tests/safemodel/test_attacks.py @@ -26,7 +26,6 @@ def test_superclass(): def test_NumpyArrayEncoder(): """Conversion routine from reports.py.""" - i32 = np.int32(2) i64 = np.int64(2) twoDarray = np.zeros((2, 2)) diff --git a/tests/safemodel/test_safekeras2.py b/tests/safemodel/test_safekeras2.py index 264c6c8a..084c3862 100644 --- a/tests/safemodel/test_safekeras2.py +++ b/tests/safemodel/test_safekeras2.py @@ -133,7 +133,6 @@ def test_init_variants(): def test_same_configs(): # pylint: disable=too-many-locals """Test whether tests for equal configuration work.""" - model1, X, _, _, _ = make_small_model(num_hidden_layers=1) model2, _, _, _, _ = make_small_model(num_hidden_layers=2) model2a, _, _, _, _ = make_small_model(num_hidden_layers=2) @@ -320,7 +319,6 @@ def test_checkpoints_are_equal(): def test_load(): """Test the loading functionality.""" - # make a model, train then save it model, X, y, Xval, yval = make_small_model() loss = tf.keras.losses.CategoricalCrossentropy( diff --git a/tests/safemodel/test_safemodel.py b/tests/safemodel/test_safemodel.py index e6e2721d..88528974 100644 --- a/tests/safemodel/test_safemodel.py +++ b/tests/safemodel/test_safemodel.py @@ -25,7 +25,7 @@ class DummyClassifier: def __init__( self, at_least_5f=5.0, at_most_5i=5, exactly_boo="boo", keyA=True, keyB=True ): - """Dummy init.""" + """Instantiate a dummy classifier.""" self.at_least_5f = at_least_5f self.at_most_5i = at_most_5i self.exactly_boo = exactly_boo @@ -33,7 +33,7 @@ def __init__( self.keyB = keyB def fit(self, x: np.ndarray, y: np.ndarray): - """Dummy fit.""" + """Fit a dummy classifier.""" def predict(self, x: np.ndarray): # pragma: no cover """Predict all ones.""" @@ -85,7 +85,7 @@ def set_params(self, **kwargs): # pragma: no cover self.key = val # pylint:disable=attribute-defined-outside-init def fit(self, x: np.ndarray, y: np.ndarray): # noqa: ARG002 - """Dummy fit.""" + """Fit a safe dummy classifier.""" self.saved_model = copy.deepcopy(self.__dict__) @@ -354,7 +354,6 @@ def test_loads(): def test_apply_constraints(): """Test constraints can be applied as expected.""" - # wrong type model = SafeDummyClassifier() model.at_least_5f = 3.9 diff --git a/user_stories/generate_disclosure_risk_report.py b/user_stories/generate_disclosure_risk_report.py index e98fe4ce..6a37ef94 100644 --- a/user_stories/generate_disclosure_risk_report.py +++ b/user_stories/generate_disclosure_risk_report.py @@ -1,16 +1,14 @@ -""" -TRE script to run the code to do the disclosure risk checking for a -machine learning model that has been trained by a researcher. - -Researchers should fill out the relevant parameters in the .yaml file, which should be in the same -directory as this file +"""TRE script to perform disclosure checking for a trained ML model. 
-TREs can change the script that is run using the user_story parameter at the top of the file +Researchers should fill out the relevant parameters in the .yaml file, which +should be in the same directory as this file. TREs can change the script that +is run using the user_story parameter at the top of the file. To run this code: -python generate_disclosure_risk_report.py (with the .yaml file in the same directory) -NOTE: you should not need to change this file at all +python generate_disclosure_risk_report.py (with the .yaml file in the same directory) + +NOTE: you should not need to change this file at all. """ import argparse diff --git a/user_stories/user_story_1/user_story_1_researcher_template.py b/user_stories/user_story_1/user_story_1_researcher_template.py index 94c852b4..8437b1a5 100644 --- a/user_stories/user_story_1/user_story_1_researcher_template.py +++ b/user_stories/user_story_1/user_story_1_researcher_template.py @@ -1,5 +1,4 @@ -""" -RESEARCHER EXAMPLE FOR USER STORY 1. +"""RESEARCHER EXAMPLE FOR USER STORY 1. This file is an example of a researcher creating/training a machine learning model and requesting for it to be released. @@ -33,7 +32,6 @@ def main(): # pylint: disable=too-many-statements, disable=too-many-locals """Create and train a model to be released.""" - # This section is not necessary but helpful - cleans up files that are created by aisdc save_directory = "training_artefacts" print("Creating directory for training artefacts") diff --git a/user_stories/user_story_1/user_story_1_tre.py b/user_stories/user_story_1/user_story_1_tre.py index 1039e3f4..faf21ff9 100644 --- a/user_stories/user_story_1/user_story_1_tre.py +++ b/user_stories/user_story_1/user_story_1_tre.py @@ -1,12 +1,12 @@ -""" -TRE SCRIPT FOR USER STORY 1. -This file contains the code needed to run user story 1 +This file contains the code needed to run user story 1. -To run: change the user_story key inside the .yaml config file to '1', and run the -'generate_disclosure_risk_report.py' file +To run: change the user_story key inside the .yaml config file to '1', and run +the 'generate_disclosure_risk_report.py' file. -NOTE: you should not need to change this file at all, set all parameters via the .yaml file +NOTE: you should not need to change this file at all; set all parameters via +the .yaml file. """ import argparse @@ -21,7 +21,6 @@ def generate_report(directory, attack_results, target, outfile): """Generate report based on target model.""" - print() print("Acting as TRE...") print() @@ -40,8 +39,7 @@ def generate_report(directory, attack_results, target, outfile): def run_user_story(release_config: dict): - """Main method to parse arguments and then invoke report generation.""" - + """Run the user story, parsing arguments and then invoking report generation.""" generate_report( release_config["training_artefacts_dir"], release_config["attack_results"], @@ -50,7 +48,7 @@ def run_user_story(release_config: dict): ) -if __name__ == "__main__": # pragma:no cover +if __name__ == "__main__": parser = argparse.ArgumentParser( description=( "Generate a risk report after request_release() has been called by researcher" ) ) @@ -72,7 +70,7 @@ def run_user_story(release_config: dict): try: with open(args.config_file, encoding="utf-8") as handle: config = yaml.load(handle, Loader=yaml.loader.SafeLoader) - except AttributeError as error: # pragma:no cover + except AttributeError as error: print( "Invalid command.
Try --help to get more details" f"error message is {error}" diff --git a/user_stories/user_story_2/data_processing_researcher.py b/user_stories/user_story_2/data_processing_researcher.py index ee27d577..e3ad658a 100644 --- a/user_stories/user_story_2/data_processing_researcher.py +++ b/user_stories/user_story_2/data_processing_researcher.py @@ -1,5 +1,4 @@ -""" -SUPPORTING FILE FOR USER STORY 2. +"""SUPPORTING FILE FOR USER STORY 2. This file is an example of a function created by a researcher that will pre-process a dataset diff --git a/user_stories/user_story_2/user_story_2_researcher_template.py b/user_stories/user_story_2/user_story_2_researcher_template.py index 17391d78..1d45a163 100644 --- a/user_stories/user_story_2/user_story_2_researcher_template.py +++ b/user_stories/user_story_2/user_story_2_researcher_template.py @@ -1,5 +1,4 @@ -""" -RESEARCHER EXAMPLE FOR USER STORY 2. +"""RESEARCHER EXAMPLE FOR USER STORY 2. This file is an example of a researcher creating/training a machine learning model and to be released form a secure environment @@ -32,7 +31,6 @@ def run_user_story(): # pylint: disable=too-many-locals """Create and train a model to be released.""" - # This section is not necessary but helpful - cleans up files that are created by aisdc directory = "training_artefacts" print("Creating directory for training artefacts") diff --git a/user_stories/user_story_2/user_story_2_tre.py b/user_stories/user_story_2/user_story_2_tre.py index 75400f9d..c1148267 100644 --- a/user_stories/user_story_2/user_story_2_tre.py +++ b/user_stories/user_story_2/user_story_2_tre.py @@ -1,12 +1,12 @@ -""" -TRE SCRIPT FOR USER STORY 2. +"""TRE SCRIPT FOR USER STORY 2. -This file contains the code needed to run user story 2 +This file contains the code needed to run user story 2. -To run: change the user_story key inside the .yaml config file to '2', and run the -'generate_disclosure_risk_report.py' file +To run: change the user_story key inside the .yaml config file to '2', and run +the 'generate_disclosure_risk_report.py' file. -NOTE: you should not need to change this file at all, set all parameters via the .yaml file +NOTE: you should not need to change this file at all, set all parameters via +the .yaml file. """ import argparse @@ -27,7 +27,11 @@ def process_dataset(filename, function_name, data_to_be_processed): - """DO NOT CHANGE: a wrapper function that allows a callable function to be read from a file.""" + """Process dataset. + + DO NOT CHANGE: this is a wrapper function that allows a callable function + to be read from a file. 
+ """ spec = importlib.util.spec_from_file_location(function_name, filename) module = importlib.util.module_from_spec(spec) spec.loader.exec_module(module) @@ -48,7 +52,6 @@ def generate_report( outfile, ): # pylint: disable=too-many-locals, disable=too-many-arguments """Generate report based on target model.""" - print() print("Acting as TRE...") print( @@ -117,8 +120,7 @@ def generate_report( def run_user_story(release_config: dict): - """Main method to parse arguments and then invoke report generation.""" - + """Run the user story, parsing arguments and then invoking report generation.""" generate_report( release_config["data_processing_filename"], release_config["data_processing_function_name"], diff --git a/user_stories/user_story_3/user_story_3_researcher_template.py b/user_stories/user_story_3/user_story_3_researcher_template.py index 914a3397..d9b0a804 100644 --- a/user_stories/user_story_3/user_story_3_researcher_template.py +++ b/user_stories/user_story_3/user_story_3_researcher_template.py @@ -1,5 +1,4 @@ -""" -RESEARCHER EXAMPLE FOR USER STORY 3. +"""RESEARCHER EXAMPLE FOR USER STORY 3. This file is an example of a researcher creating/training a machine learning model and to be released form a secure environment @@ -31,7 +30,6 @@ def run_user_story(): # pylint: disable=too-many-locals """Create and train a model to be released.""" - # This section is not necessary but helpful - cleans up files that are created by aisdc directory = "training_artefacts" print("Creating directory for training artefacts") diff --git a/user_stories/user_story_3/user_story_3_tre.py b/user_stories/user_story_3/user_story_3_tre.py index 7324a66e..5f91ea92 100644 --- a/user_stories/user_story_3/user_story_3_tre.py +++ b/user_stories/user_story_3/user_story_3_tre.py @@ -1,12 +1,12 @@ -""" -TRE SCRIPT FOR USER STORY 3. +"""TRE SCRIPT FOR USER STORY 3. -This file contains the code needed to run user story 3 +This file contains the code needed to run user story 3. -To run: change the user_story key inside the .yaml config file to '3', and run the -'generate_disclosure_risk_report.py' file +To run: change the user_story key inside the .yaml config file to '3', and run +the 'generate_disclosure_risk_report.py' file. -NOTE: you should not need to change this file at all, set all parameters via the .yaml file +NOTE: you should not need to change this file at all, set all parameters via +the .yaml file. """ import argparse @@ -40,7 +40,6 @@ def generate_report( outfile, ): # pylint: disable=too-many-arguments, disable=too-many-locals """Generate report based on target model.""" - print() print("Acting as TRE...") print() @@ -123,8 +122,7 @@ def generate_report( def run_user_story(release_config: dict): - """Main method to parse arguments and then invoke report generation.""" - + """Run the user story, parsing arguments and then invoking report generation.""" generate_report( release_config["training_artefacts_dir"], release_config["target_model"], @@ -138,7 +136,7 @@ def run_user_story(release_config: dict): ) -if __name__ == "__main__": # pragma:no cover +if __name__ == "__main__": parser = argparse.ArgumentParser( description=( "Generate a risk report after request_release() has been called by researcher" @@ -160,7 +158,7 @@ def run_user_story(release_config: dict): try: with open(args.config_file, encoding="utf-8") as handle: config = yaml.load(handle, Loader=yaml.loader.SafeLoader) - except AttributeError as error: # pragma:no cover + except AttributeError as error: print( "Invalid command. 
Try --help to get more details" f"error message is {error}" diff --git a/user_stories/user_story_4/user_story_4_tre.py b/user_stories/user_story_4/user_story_4_tre.py index a85774ab..00c97873 100644 --- a/user_stories/user_story_4/user_story_4_tre.py +++ b/user_stories/user_story_4/user_story_4_tre.py @@ -1,12 +1,12 @@ -""" -TRE SCRIPT FOR USER STORY 4. +"""TRE SCRIPT FOR USER STORY 4. -This file contains the code needed to run user story 4 +This file contains the code needed to run user story 4. -To run: change the user_story key inside the .yaml config file to '4', and run the -'generate_disclosure_risk_report.py' file +To run: change the user_story key inside the .yaml config file to '4', and run +the 'generate_disclosure_risk_report.py' file. -NOTE: you should not need to change this file at all, set all parameters via the .yaml file +NOTE: you should not need to change this file at all, set all parameters via +the .yaml file. """ import argparse @@ -37,7 +37,6 @@ def generate_report( outfile, ): # pylint: disable=too-many-arguments, disable=too-many-locals """Generate report based on target model.""" - print() print("Acting as TRE...") print() @@ -96,8 +95,7 @@ def sort_prob_row(row): def run_user_story(release_config: dict): - """Main method to parse arguments and then invoke report generation.""" - + """Run the user story, parsing arguments and then invoking report generation.""" generate_report( release_config["training_artefacts_dir"], release_config["train_probabilities"], @@ -107,7 +105,7 @@ def run_user_story(release_config: dict): ) -if __name__ == "__main__": # pragma:no cover +if __name__ == "__main__": parser = argparse.ArgumentParser( description=( "Generate a risk report after request_release() has been called by researcher" @@ -129,7 +127,7 @@ def run_user_story(release_config: dict): try: with open(args.config_file, encoding="utf-8") as handle: config = yaml.load(handle, Loader=yaml.loader.SafeLoader) - except AttributeError as error: # pragma:no cover + except AttributeError as error: print( "Invalid command. Try --help to get more details" f"error message is {error}" diff --git a/user_stories/user_story_7/user_story_7_researcher_template.py b/user_stories/user_story_7/user_story_7_researcher_template.py index 52876467..a6becb40 100644 --- a/user_stories/user_story_7/user_story_7_researcher_template.py +++ b/user_stories/user_story_7/user_story_7_researcher_template.py @@ -1,5 +1,4 @@ -""" -RESEARCHER EXAMPLE FOR USER STORY 7. +"""RESEARCHER EXAMPLE FOR USER STORY 7. This file is an example of a researcher creating/training a machine learning model and to be released form a secure environment @@ -36,7 +35,6 @@ def run_user_story(): # pylint: disable=too-many-locals """Create and train model to be released.""" - # This section is not necessary but helpful - cleans up files that are created by aisdc directory = "training_artefacts" print("Creating directory for training artefacts") diff --git a/user_stories/user_story_7/user_story_7_tre.py b/user_stories/user_story_7/user_story_7_tre.py index 6a6292bd..8860a4c7 100644 --- a/user_stories/user_story_7/user_story_7_tre.py +++ b/user_stories/user_story_7/user_story_7_tre.py @@ -1,16 +1,16 @@ -""" -TRE SCRIPT FOR USER STORY 7. +"""TRE SCRIPT FOR USER STORY 7. -This file contains the code needed to run user story 7 +This file contains the code needed to run user story 7. 
-NOTE: this user story will not produce an output, this user story covers cases where the -researcher has not provided enough information -See user stories 1, 2 or 3 for guidance on what you need to release a model +NOTE: this user story will not produce an output; it covers cases where the +researcher has not provided enough information. See user stories 1, 2 or 3 +for guidance on what you need to release a model. -To run: change the user_story key inside the .yaml config file to '7', and run the -'generate_disclosure_risk_report.py' file +To run: change the user_story key inside the .yaml config file to '7', and run +the 'generate_disclosure_risk_report.py' file. -NOTE: you should not need to change this file at all, set all parameters via the .yaml file +NOTE: you should not need to change this file at all; set all parameters via +the .yaml file. """ import argparse @@ -21,7 +21,7 @@ def generate_report(directory, target_model_filepath): - """Main method to parse arguments and then invoke report generation.""" + """Parse arguments and then invoke report generation.""" print() print("Acting as TRE...") print( @@ -41,14 +41,13 @@ def generate_report(directory, target_model_filepath): def run_user_story(release_config: dict): - """Main method to parse arguments and then invoke report generation.""" - + """Run the user story, parsing arguments and then invoking report generation.""" generate_report( release_config["training_artefacts_dir"], release_config["target_model"] ) -if __name__ == "__main__": # pragma:no cover +if __name__ == "__main__": parser = argparse.ArgumentParser( description=( "Generate a risk report after request_release() has been called by researcher" ) ) @@ -70,7 +69,7 @@ def run_user_story(release_config: dict): try: with open(args.config_file, encoding="utf-8") as handle: config = yaml.load(handle, Loader=yaml.loader.SafeLoader) - except AttributeError as error: # pragma:no cover + except AttributeError as error: print( "Invalid command. Try --help to get more details" f"error message is {error}" diff --git a/user_stories/user_story_8/data_processing_researcher.py b/user_stories/user_story_8/data_processing_researcher.py index ee27d577..e3ad658a 100644 --- a/user_stories/user_story_8/data_processing_researcher.py +++ b/user_stories/user_story_8/data_processing_researcher.py @@ -1,5 +1,4 @@ -""" -SUPPORTING FILE FOR USER STORY 2. +"""SUPPORTING FILE FOR USER STORY 2. This file is an example of a function created by a researcher that will pre-process a dataset diff --git a/user_stories/user_story_8/user_story_8_researcher_template.py b/user_stories/user_story_8/user_story_8_researcher_template.py index f0b5501d..fd3eb7bd 100644 --- a/user_stories/user_story_8/user_story_8_researcher_template.py +++ b/user_stories/user_story_8/user_story_8_researcher_template.py @@ -1,5 +1,4 @@ -""" -RESEARCHER EXAMPLE FOR USER STORY 8. +"""RESEARCHER EXAMPLE FOR USER STORY 8.
This file is an example of a researcher creating/training a machine learning model and to be released from a secure environment @@ -30,7 +29,6 @@ def run_user_story(): # pylint: disable=too-many-locals """Create and train a model to be released.""" - # This section is not necessary but helpful - cleans up files that are created by aisdc directory = "training_artefacts" print("Creating directory for training artefacts") diff --git a/user_stories/user_story_8/user_story_8_tre.py b/user_stories/user_story_8/user_story_8_tre.py index 7457df78..41d71bf3 100644 --- a/user_stories/user_story_8/user_story_8_tre.py +++ b/user_stories/user_story_8/user_story_8_tre.py @@ -1,16 +1,16 @@ -""" -TRE SCRIPT FOR USER STORY 8. -This file contains the code needed to run user story 8 +This file contains the code needed to run user story 8. -NOTE: this user story will not produce an output, this user story covers cases where the -researcher has not provided enough information -See user stories 1, 2 or 3 for guidance on what you need to release a model +NOTE: this user story will not produce an output; it covers cases where the +researcher has not provided enough information. See user stories 1, 2 or 3 +for guidance on what you need to release a model. -To run: change the user_story key inside the .yaml config file to '8', and run the -'generate_disclosure_risk_report.py' file +To run: change the user_story key inside the .yaml config file to '8', and run +the 'generate_disclosure_risk_report.py' file. -NOTE: you should not need to change this file at all, set all parameters via the .yaml file +NOTE: you should not need to change this file at all; set all parameters via +the .yaml file. """ import argparse @@ -21,7 +21,7 @@ def generate_report(directory, target_model_filepath): - """Main method to parse arguments and then invoke report generation.""" + """Parse arguments and then invoke report generation.""" print() print("Acting as TRE...") print( @@ -41,14 +41,13 @@ def generate_report(directory, target_model_filepath): def run_user_story(release_config: dict): - """Main method to parse arguments and then invoke report generation.""" - + """Run the user story, parsing arguments and then invoking report generation.""" generate_report( release_config["training_artefacts_dir"], release_config["target_model"] ) -if __name__ == "__main__": # pragma:no cover +if __name__ == "__main__": parser = argparse.ArgumentParser( description=( "Generate a risk report after request_release() has been called by researcher" ) ) @@ -70,7 +69,7 @@ def run_user_story(release_config: dict): try: with open(args.config_file, encoding="utf-8") as handle: config = yaml.load(handle, Loader=yaml.loader.SafeLoader) - except AttributeError as error: # pragma:no cover + except AttributeError as error: print( "Invalid command. Try --help to get more details" f"error message is {error}"
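For reference, a minimal sketch of the docstring shape that the newly enabled pydocstyle rules (numpy convention) and the docstring-code-format option in pyproject.toml target; the function itself is hypothetical, written only to illustrate the convention:

    def scale(values: list, factor: float = 2.0) -> list:
        """Scale each value by a constant factor.

        Parameters
        ----------
        values : list
            Numbers to scale.
        factor : float
            Multiplier applied to each value.

        Returns
        -------
        list
            The scaled values.

        Examples
        --------
        >>> scale([1.0, 2.0])
        [2.0, 4.0]
        """
        return [v * factor for v in values]

The summary line is imperative and ends with a period, matching the docstring edits throughout this patch, and any code in the Examples section is what docstring-code-format reflows to the configured 80-character docstring-code-line-length.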