Minor save/load refactoring (#635)
ascillitoe authored Sep 29, 2022
1 parent df7edca commit 4af80a1
Showing 10 changed files with 104 additions and 44 deletions.
25 changes: 16 additions & 9 deletions alibi_detect/base.py
@@ -119,16 +119,14 @@ def get_config(self) -> dict:  # TODO - move to BaseDetector once config save/lo
         if self.config is not None:
             # Get config (stored in top-level self)
             cfg = self.config
-            # Get low-level nested detector (if needed)
-            detector = self._detector if hasattr(self, '_detector') else self  # type: ignore[attr-defined]
-            detector = detector._detector if hasattr(detector, '_detector') else detector  # type: ignore[attr-defined]
             # Add large artefacts back to config
             for key in LARGE_ARTEFACTS:
-                if key in cfg:  # self.config is validated, therefore if a key is not in cfg, it isn't valid to insert
-                    cfg[key] = getattr(detector, key)
+                if key in cfg and hasattr(self._nested_detector, key):
+                    cfg[key] = getattr(self._nested_detector, key)
             # Set x_ref_preprocessed flag
-            preprocess_at_init = getattr(detector, 'preprocess_at_init', True)  # If no preprocess_at_init, always true!
-            cfg['x_ref_preprocessed'] = preprocess_at_init and detector.preprocess_fn is not None
+            # If no preprocess_at_init, always true!
+            preprocess_at_init = getattr(self._nested_detector, 'preprocess_at_init', True)
+            cfg['x_ref_preprocessed'] = preprocess_at_init and self._nested_detector.preprocess_fn is not None
             return cfg
         else:
             raise NotImplementedError('Getting a config (or saving via a config file) is not yet implemented for this'
@@ -185,17 +183,26 @@ def _set_config(self, inputs):  # TODO - move to BaseDetector once config save/l
 
         # Overwrite any large artefacts with None to save memory. They'll be added back by get_config()
         for key in LARGE_ARTEFACTS:
-            if key in inputs:
+            if key in inputs and hasattr(self._nested_detector, key):
                 inputs[key] = None
 
         self.config.update(inputs)
 
+    @property
+    def _nested_detector(self):
+        """
+        The low-level nested detector.
+        """
+        detector = self._detector if hasattr(self, '_detector') else self  # type: ignore[attr-defined]
+        detector = detector._detector if hasattr(detector, '_detector') else detector  # type: ignore[attr-defined]
+        return detector
+
 
 @runtime_checkable
 class Detector(Protocol):
     """Type Protocol for all detectors.
-    Used for typing legacy save and load functionality in `alibi_detect.saving.tensorflow._saving.py`.
+    Used for typing legacy save and load functionality in `alibi_detect.saving._tensorflow.saving.py`.
     Note:
         This exists to distinguish between detectors with and without support for config saving and loading. Once all
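For context, the new `_nested_detector` property centralises the lookup that `get_config()` previously performed inline: it walks down up to two levels of `_detector` attributes to reach the low-level, backend-specific detector that holds `x_ref`, `preprocess_fn` and similar artefacts. A minimal sketch of that behaviour using hypothetical stand-in classes (not the real alibi-detect detectors):

```python
# Minimal sketch of the `_nested_detector` lookup added above.
# The wrapper classes here are hypothetical; only the property mirrors the diff.
class _BackendDetector:
    preprocess_at_init = True
    preprocess_fn = None

class _BackendWrapper:
    def __init__(self):
        self._detector = _BackendDetector()   # backend-specific implementation

class _OuterDetector:
    def __init__(self):
        self._detector = _BackendWrapper()    # e.g. a detector wrapping another detector

    @property
    def _nested_detector(self):
        # Walk at most two levels of `_detector`, as in base.py above
        detector = self._detector if hasattr(self, '_detector') else self
        detector = detector._detector if hasattr(detector, '_detector') else detector
        return detector

print(type(_OuterDetector()._nested_detector).__name__)  # -> _BackendDetector
```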
@@ -2,7 +2,7 @@
 
 load_detector_legacy, load_kernel_config_tf, load_embedding_tf, load_model_tf, load_optimizer_tf, \
     prep_model_and_emb_tf = import_optional(
-        'alibi_detect.saving.tensorflow._loading',
+        'alibi_detect.saving._tensorflow.loading',
         names=['load_detector_legacy',
                'load_kernel_config',
                'load_embedding',
@@ -11,11 +11,23 @@
                'prep_model_and_emb'])
 
 save_detector_legacy, save_model_config_tf = import_optional(
-    'alibi_detect.saving.tensorflow._saving',
+    'alibi_detect.saving._tensorflow.saving',
     names=['save_detector_legacy', 'save_model_config']
 )
 
 get_tf_dtype = import_optional(
-    'alibi_detect.saving.tensorflow._conversions',
+    'alibi_detect.saving._tensorflow.conversions',
     names=['get_tf_dtype']
 )
+
+__all__ = [
+    "load_detector_legacy",
+    "load_kernel_config_tf",
+    "load_embedding_tf",
+    "load_model_tf",
+    "load_optimizer_tf",
+    "prep_model_and_emb_tf",
+    "save_detector_legacy",
+    "save_model_config_tf",
+    "get_tf_dtype"
+]
File renamed without changes.
@@ -150,6 +150,9 @@ def save_embedding_config(embed: TransformerEmbedding,
     return cfg_embed
 
 
+#######################################################################################################
+# TODO: Everything below here is legacy saving code, and will be removed in the future
+#######################################################################################################
 def save_embedding_legacy(embed: TransformerEmbedding,
                           embed_args: dict,
                           filepath: Path) -> None:
@@ -177,9 +180,6 @@ def save_embedding_legacy(embed: TransformerEmbedding,
         dill.dump(embed_args, f)
 
 
-#######################################################################################################
-# TODO: Everything below here is legacy saving code, and will be removed in the future
-#######################################################################################################
 def save_detector_legacy(detector, filepath):
     detector_name = detector.meta['name']
 
4 changes: 2 additions & 2 deletions alibi_detect/saving/loading.py
@@ -12,7 +12,7 @@
 from transformers import AutoTokenizer
 
 from alibi_detect.saving.registry import registry
-from alibi_detect.saving.tensorflow import load_detector_legacy, load_embedding_tf, load_kernel_config_tf, \
+from alibi_detect.saving._tensorflow import load_detector_legacy, load_embedding_tf, load_kernel_config_tf, \
     load_model_tf, load_optimizer_tf, prep_model_and_emb_tf, get_tf_dtype
 from alibi_detect.saving.validate import validate_config
 from alibi_detect.base import Detector, ConfigurableDetector
@@ -129,7 +129,7 @@ def _load_detector_config(filepath: Union[str, os.PathLike]) -> ConfigurableDete
     # Backend
     backend = cfg.pop('backend')  # popping so that cfg left as kwargs + `name` when passed to _init_detector
     if backend.lower() != 'tensorflow':
-        raise NotImplementedError('Loading detectors with PyTorch or sklearn backend is not yet supported.')
+        raise NotImplementedError('Loading detectors with PyTorch, sklearn or keops backend is not yet supported.')
 
     # Init detector from config
     logger.info('Instantiating detector.')
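As the check above implies, only the TensorFlow backend can currently be restored from a detector config file. A rough sketch of what a caller sees for an unsupported backend (the saved-detector directory is hypothetical):

```python
# Sketch only: loading a config.toml whose `backend` is not 'tensorflow' raises.
from alibi_detect.saving import load_detector

try:
    cd = load_detector('./my_pytorch_detector')  # hypothetical dir with a pytorch-backend config.toml
except NotImplementedError as err:
    print(err)  # Loading detectors with PyTorch, sklearn or keops backend is not yet supported.
```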
2 changes: 1 addition & 1 deletion alibi_detect/saving/saving.py
@@ -14,7 +14,7 @@
 from alibi_detect.saving.loading import _replace, validate_config
 from alibi_detect.saving.registry import registry
 from alibi_detect.saving.schemas import SupportedModels
-from alibi_detect.saving.tensorflow import save_detector_legacy, save_model_config_tf
+from alibi_detect.saving._tensorflow import save_detector_legacy, save_model_config_tf
 from alibi_detect.base import Detector, ConfigurableDetector
 
 # do not extend pickle dispatch table so as not to change pickle behaviour
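Both modules sit behind the public `save_detector`/`load_detector` entry points. A minimal config-based round trip with the TensorFlow backend (detector choice and data are illustrative, not taken from this commit):

```python
import numpy as np
from alibi_detect.cd import MMDDrift
from alibi_detect.saving import save_detector, load_detector

x_ref = np.random.randn(100, 10).astype(np.float32)
cd = MMDDrift(x_ref, backend='tensorflow', p_val=.05)

save_detector(cd, './my_detector')            # writes config.toml plus referenced artefacts
cd_loaded = load_detector('./my_detector')    # reinstantiates an equivalent detector
preds = cd_loaded.predict(np.random.randn(100, 10).astype(np.float32))
```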
50 changes: 27 additions & 23 deletions alibi_detect/saving/tests/test_saving.py
@@ -176,7 +176,7 @@ def deep_kernel(request, backend, encoder_model):
 
 
 @fixture
-def classifier(backend, current_cases):
+def classifier_model(backend, current_cases):
     """
     Classification model with given input dimension and backend.
     """
@@ -204,7 +204,7 @@ def nlp_embedding_and_tokenizer(model_name, max_len, uae, backend):
 
     # Load tokenizer
     try:
-        tokenizer = AutoTokenizer.from_pretrained(model_name + 'TODO')
+        tokenizer = AutoTokenizer.from_pretrained(model_name)
     except (OSError, HTTPError):
         pytest.skip(f"Problem downloading {model_name} from huggingface.co")
     X = 'A dummy string'  # this will be padded to max_len
@@ -267,15 +267,18 @@ def preprocess_nlp(embedding, tokenizer, max_len, backend):
 
 
 @fixture
-def preprocess_hiddenoutput(classifier, backend):
+def preprocess_hiddenoutput(classifier_model, current_cases, backend):
     """
     Preprocess function to extract the softmax layer of a classifier (with the HiddenOutput utility function).
     """
+    _, _, data_params = current_cases["data"]
+    _, input_dim = data_params['data_shape']
+
     if backend == 'tensorflow':
-        model = HiddenOutput_tf(classifier, layer=-1)
+        model = HiddenOutput_tf(classifier_model, layer=-1, input_shape=(None, input_dim))
         preprocess_fn = partial(preprocess_drift_tf, model=model)
     else:
-        model = HiddenOutput_pt(classifier, layer=-1)
+        model = HiddenOutput_pt(classifier_model, layer=-1)
         preprocess_fn = partial(preprocess_drift_pt, model=model)
     return preprocess_fn
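The TensorFlow branch of this fixture follows the usual preprocessing recipe: wrap a trained classifier with `HiddenOutput` to expose one of its layers, then bind the wrapped model into `preprocess_drift`. A rough standalone sketch with a toy model (dimensions are illustrative):

```python
# Sketch of the TensorFlow preprocessing pattern exercised by the fixture above.
from functools import partial
import tensorflow as tf
from alibi_detect.cd.tensorflow import HiddenOutput, preprocess_drift

clf = tf.keras.Sequential([
    tf.keras.layers.Dense(32, activation='relu', input_shape=(10,)),
    tf.keras.layers.Dense(2, activation='softmax'),
])
# Expose the softmax layer's output as the preprocessing transform
model = HiddenOutput(clf, layer=-1, input_shape=(None, 10))
preprocess_fn = partial(preprocess_drift, model=model, batch_size=64)
```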

@@ -310,7 +313,7 @@ def test_load_simple_config(cfg, tmp_path):
 @parametrize_with_cases("data", cases=ContinuousData, prefix='data_')
 def test_save_ksdrift(data, preprocess_fn, tmp_path):
     """
-    Test KSDrift on continuous datasets, with UAE and classifier softmax output as preprocess_fn's. Only this
+    Test KSDrift on continuous datasets, with UAE and classifier_model softmax output as preprocess_fn's. Only this
     detector is tested with preprocessing strategies, as other detectors should see the same preprocess_fn output.
     Detector is saved and then loaded, with assertions checking that the reinstantiated detector is equivalent.
@@ -337,9 +340,9 @@ def test_save_ksdrift(data, preprocess_fn, tmp_path):
 
 @parametrize('preprocess_fn', [preprocess_nlp])
 @parametrize_with_cases("data", cases=TextData.movie_sentiment_data, prefix='data_')
-def test_save_ksdrift_nlp(data, preprocess_fn, max_len, enc_dim, tmp_path):
+def test_save_ksdrift_nlp(data, preprocess_fn, enc_dim, tmp_path):
     """
-    Test KSDrift on continuous datasets, with UAE and classifier softmax output as preprocess_fn's. Only this
+    Test KSDrift on continuous datasets, with UAE and classifier_model softmax output as preprocess_fn's. Only this
     detector is tested with embedding and embedding+uae, as other detectors should see the same preprocessed data.
     Detector is saved and then loaded, with assertions checking that the reinstantiated detector is equivalent.
@@ -350,7 +353,7 @@
                  p_val=P_VAL,
                  preprocess_fn=preprocess_fn,
                  preprocess_at_init=True,
-                 input_shape=(max_len,),
+                 input_shape=(768,),  # hardcoded to bert-base-cased for now
                  )
     save_detector(cd, tmp_path, legacy=False)
     cd_load = load_detector(tmp_path)
@@ -572,13 +575,13 @@ def test_save_tabulardrift(data, tmp_path):
 
 
 @parametrize_with_cases("data", cases=ContinuousData, prefix='data_')
-def test_save_classifierdrift(data, classifier, backend, tmp_path, seed):
+def test_save_classifierdrift(data, classifier_model, backend, tmp_path, seed):
     """ Test ClassifierDrift on continuous datasets."""
     # Init detector and predict
     X_ref, X_h0 = data
     with fixed_seed(seed):
         cd = ClassifierDrift(X_ref,
-                             model=classifier,
+                             model=classifier_model,
                              p_val=P_VAL,
                              n_folds=5,
                              backend=backend,
@@ -606,7 +609,7 @@
 
 
 @parametrize_with_cases("data", cases=ContinuousData, prefix='data_')
-def test_save_spotthediff(data, classifier, backend, tmp_path, seed):
+def test_save_spotthediff(data, classifier_model, backend, tmp_path, seed):
     """
     Test SpotTheDiffDrift on continuous datasets.
@@ -731,13 +734,13 @@ def test_save_contextmmddrift(data, kernel, backend, tmp_path, seed):
 
 
 @parametrize_with_cases("data", cases=ContinuousData, prefix='data_')
-def test_save_classifieruncertaintydrift(data, classifier, backend, tmp_path, seed):
+def test_save_classifieruncertaintydrift(data, classifier_model, backend, tmp_path, seed):
     """ Test ClassifierDrift on continuous datasets."""
     # Init detector and predict
     X_ref, X_h0 = data
     with fixed_seed(seed):
         cd = ClassifierUncertaintyDrift(X_ref,
-                                        model=classifier,
+                                        model=classifier_model,
                                         p_val=P_VAL,
                                         backend=backend,
                                         preds_type='probs',
@@ -1071,19 +1074,20 @@ def test_save_deepkernel(data, deep_kernel, backend, tmp_path):
     """
     # Get data dim
     X, _ = data
-    input_dim = X.shape[1]
+    input_shape = (X.shape[1],)
 
     # Save kernel to config
     filepath = tmp_path
     filename = 'mykernel'
     cfg_kernel = _save_kernel_config(deep_kernel, filepath, filename)
-    cfg_kernel['proj'], _ = _save_model_config(cfg_kernel['proj'], base_path=filepath, input_shape=input_dim,
+    cfg_kernel['proj'], _ = _save_model_config(cfg_kernel['proj'], base_path=filepath, input_shape=input_shape,
                                                backend=backend)
     cfg_kernel = _path2str(cfg_kernel)
-    cfg_kernel['proj'] = ModelConfig(**cfg_kernel['proj']).dict()  # Pass thru ModelConfig to set `custom_objects` etc
+    cfg_kernel['proj'] = ModelConfig(**cfg_kernel['proj']).dict()  # Pass thru ModelConfig to set `layers` etc
     cfg_kernel = DeepKernelConfig(**cfg_kernel).dict()  # pydantic validation
     assert cfg_kernel['proj']['src'] == 'model'
     assert cfg_kernel['proj']['custom_objects'] is None
+    assert cfg_kernel['proj']['layer'] is None
 
     # Resolve and load config
     cfg = {'kernel': cfg_kernel, 'backend': backend}
@@ -1115,10 +1119,10 @@ def test_save_preprocess(data, preprocess_fn, tmp_path, backend):
     # Save preprocess_fn to config
     filepath = tmp_path
     X_ref, X_h0 = data
-    input_dim = X_ref.shape[1]
+    input_shape = (X_ref.shape[1],)
     cfg_preprocess = _save_preprocess_config(preprocess_fn,
                                              backend=backend,
-                                             input_shape=input_dim,
+                                             input_shape=input_shape,
                                              filepath=filepath)
     cfg_preprocess = _path2str(cfg_preprocess)
     cfg_preprocess = PreprocessConfig(**cfg_preprocess).dict()  # pydantic validation
@@ -1136,7 +1140,7 @@
 
 @parametrize('preprocess_fn', [preprocess_nlp])
 @parametrize_with_cases("data", cases=TextData.movie_sentiment_data, prefix='data_')
-def test_save_preprocess_nlp(data, preprocess_fn, max_len, tmp_path, backend):
+def test_save_preprocess_nlp(data, preprocess_fn, tmp_path, backend):
     """
     Unit test for _save_preprocess_config and _load_preprocess_config, with text data.
@@ -1147,7 +1151,7 @@
     filepath = tmp_path
     cfg_preprocess = _save_preprocess_config(preprocess_fn,
                                              backend=backend,
-                                             input_shape=max_len,
+                                             input_shape=(768,),  # hardcoded to bert-base-cased for now
                                              filepath=filepath)
     cfg_preprocess = _path2str(cfg_preprocess)
     cfg_preprocess = PreprocessConfig(**cfg_preprocess).dict()  # pydantic validation
@@ -1185,8 +1189,8 @@ def test_save_model(data, model, layer, backend, tmp_path):
     """
     # Save model
     filepath = tmp_path
-    input_dim = data[0].shape[1]
-    cfg_model, _ = _save_model_config(model, base_path=filepath, input_shape=input_dim, backend=backend)
+    input_shape = (data[0].shape[1],)
+    cfg_model, _ = _save_model_config(model, base_path=filepath, input_shape=input_shape, backend=backend)
     cfg_model = _path2str(cfg_model)
     cfg_model = ModelConfig(**cfg_model).dict()
     assert tmp_path.joinpath('model').is_dir()
4 changes: 2 additions & 2 deletions alibi_detect/tests/test_dep_management.py
@@ -190,7 +190,7 @@ def test_fetching_utils_dependencies(opt_dep):
 
 
 def test_saving_tf_dependencies(opt_dep):
-    """Tests that the alibi_detect.saving.tensorflow module correctly protects against uninstalled optional
+    """Tests that the alibi_detect.saving._tensorflow module correctly protects against uninstalled optional
     dependencies.
     """
 
@@ -208,7 +208,7 @@ def test_saving_tf_dependencies(opt_dep):
             ('get_tf_dtype', ['tensorflow'])
     ]:
         dependency_map[dependency] = relations
-    from alibi_detect.saving import tensorflow as tf_saving
+    from alibi_detect.saving import _tensorflow as tf_saving
     check_correct_dependencies(tf_saving, dependency_map, opt_dep)
 
 
39 changes: 38 additions & 1 deletion doc/source/overview/saving.md
@@ -98,5 +98,42 @@ for the remaining detectors is in the [Roadmap](roadmap.md).
 ````
 
 ```{note}
-Saving/loading of detectors using PyTorch models and/or a PyTorch backend is currently not supported.
+For detectors with backends, or using preprocessing, save/load support is currently limited to TensorFlow models and backends.
 ```
+
+(supported_models)=
+## Supported ML models
+
+Alibi Detect drift detectors offer the option to perform [preprocessing](../cd/background.md#input-preprocessing)
+with user-defined machine learning models:
+
+```python
+model = ...  # TensorFlow model; tf.keras.Model or tf.keras.Sequential
+preprocess_fn = partial(preprocess_drift, model=model, batch_size=128)
+cd = MMDDrift(x_ref, backend='tensorflow', p_val=.05, preprocess_fn=preprocess_fn)
+```
+
+Additionally, some detectors are built upon models directly,
+for example the [Classifier](../cd/methods/classifierdrift.ipynb) drift detector requires a `model` to be passed
+as an argument:
+
+```python
+cd = ClassifierDrift(x_ref, model, p_val=.05, preds_type='probs')
+```
+
+In order for a detector to be saveable and loadable, any models contained within it (or referenced within a
+[detector configuration file](config_files.md#specifying-artefacts)) must fall within the family of supported models
+documented below.
+
+### TensorFlow models
+
+Alibi Detect supports any TensorFlow model that can be serialized to the
+[HDF5](https://www.tensorflow.org/guide/keras/save_and_serialize#keras_h5_format) format.
+Custom objects should be pre-registered with
+[register_keras_serializable](https://www.tensorflow.org/api_docs/python/tf/keras/utils/register_keras_serializable).
+```
+
+%### PyTorch
+%### scikit-learn
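To illustrate the pre-registration requirement described above (a sketch, not part of this commit's documentation), a custom Keras layer can be made HDF5-serializable as follows; the layer and package names are hypothetical:

```python
import tensorflow as tf

@tf.keras.utils.register_keras_serializable(package='my_package')
class Scale(tf.keras.layers.Layer):
    """Toy custom layer: multiplies inputs by a fixed factor."""
    def __init__(self, factor=2.0, **kwargs):
        super().__init__(**kwargs)
        self.factor = factor

    def call(self, x):
        return x * self.factor

    def get_config(self):
        cfg = super().get_config()
        cfg.update({'factor': self.factor})
        return cfg

model = tf.keras.Sequential([tf.keras.layers.Dense(5, input_shape=(3,)), Scale(1.5)])
model.save('model.h5')                               # HDF5 serialization
reloaded = tf.keras.models.load_model('model.h5')   # custom layer resolved via registration
```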
