Minor save/load refactoring (#635)
ascillitoe authored Sep 29, 2022
1 parent df7edca commit 4af80a1
Showing 10 changed files with 104 additions and 44 deletions.
25 changes: 16 additions & 9 deletions alibi_detect/base.py
@@ -119,16 +119,14 @@ def get_config(self) -> dict:  # TODO - move to BaseDetector once config save/lo
         if self.config is not None:
             # Get config (stored in top-level self)
             cfg = self.config
-            # Get low-level nested detector (if needed)
-            detector = self._detector if hasattr(self, '_detector') else self  # type: ignore[attr-defined]
-            detector = detector._detector if hasattr(detector, '_detector') else detector  # type: ignore[attr-defined]
             # Add large artefacts back to config
             for key in LARGE_ARTEFACTS:
-                if key in cfg:  # self.config is validated, therefore if a key is not in cfg, it isn't valid to insert
-                    cfg[key] = getattr(detector, key)
+                if key in cfg and hasattr(self._nested_detector, key):
+                    cfg[key] = getattr(self._nested_detector, key)
             # Set x_ref_preprocessed flag
-            preprocess_at_init = getattr(detector, 'preprocess_at_init', True)  # If no preprocess_at_init, always true!
-            cfg['x_ref_preprocessed'] = preprocess_at_init and detector.preprocess_fn is not None
+            # If no preprocess_at_init, always true!
+            preprocess_at_init = getattr(self._nested_detector, 'preprocess_at_init', True)
+            cfg['x_ref_preprocessed'] = preprocess_at_init and self._nested_detector.preprocess_fn is not None
             return cfg
         else:
             raise NotImplementedError('Getting a config (or saving via a config file) is not yet implemented for this'
@@ -185,17 +183,26 @@ def _set_config(self, inputs):  # TODO - move to BaseDetector once config save/l
 
         # Overwrite any large artefacts with None to save memory. They'll be added back by get_config()
         for key in LARGE_ARTEFACTS:
-            if key in inputs:
+            if key in inputs and hasattr(self._nested_detector, key):
                 inputs[key] = None
 
         self.config.update(inputs)
 
+    @property
+    def _nested_detector(self):
+        """
+        The low-level nested detector.
+        """
+        detector = self._detector if hasattr(self, '_detector') else self  # type: ignore[attr-defined]
+        detector = detector._detector if hasattr(detector, '_detector') else detector  # type: ignore[attr-defined]
+        return detector
+
 
 @runtime_checkable
 class Detector(Protocol):
     """Type Protocol for all detectors.
-    Used for typing legacy save and load functionality in `alibi_detect.saving.tensorflow._saving.py`.
+    Used for typing legacy save and load functionality in `alibi_detect.saving._tensorflow.saving.py`.
     Note:
         This exists to distinguish between detectors with and without support for config saving and loading. Once all
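For context, the new `_nested_detector` property centralises the lookup that `get_config()` previously performed inline: it walks down up to two levels of `_detector` attributes to reach the low-level, backend-specific detector that holds `x_ref`, `preprocess_fn` and similar artefacts. A minimal sketch of that behaviour using hypothetical stand-in classes (not the real alibi-detect detectors):

```python
# Minimal sketch of the `_nested_detector` lookup added above.
# The wrapper classes here are hypothetical; only the property mirrors the diff.
class _BackendDetector:
    preprocess_at_init = True
    preprocess_fn = None

class _BackendWrapper:
    def __init__(self):
        self._detector = _BackendDetector()   # backend-specific implementation

class _OuterDetector:
    def __init__(self):
        self._detector = _BackendWrapper()    # e.g. a detector wrapping another detector

    @property
    def _nested_detector(self):
        # Walk at most two levels of `_detector`, as in base.py above
        detector = self._detector if hasattr(self, '_detector') else self
        detector = detector._detector if hasattr(detector, '_detector') else detector
        return detector

print(type(_OuterDetector()._nested_detector).__name__)  # -> _BackendDetector
```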
@@ -2,7 +2,7 @@
 
 load_detector_legacy, load_kernel_config_tf, load_embedding_tf, load_model_tf, load_optimizer_tf, \
     prep_model_and_emb_tf = import_optional(
-        'alibi_detect.saving.tensorflow._loading',
+        'alibi_detect.saving._tensorflow.loading',
         names=['load_detector_legacy',
                'load_kernel_config',
                'load_embedding',
@@ -11,11 +11,23 @@
                'prep_model_and_emb'])
 
 save_detector_legacy, save_model_config_tf = import_optional(
-    'alibi_detect.saving.tensorflow._saving',
+    'alibi_detect.saving._tensorflow.saving',
     names=['save_detector_legacy', 'save_model_config']
 )
 
 get_tf_dtype = import_optional(
-    'alibi_detect.saving.tensorflow._conversions',
+    'alibi_detect.saving._tensorflow.conversions',
     names=['get_tf_dtype']
 )
+
+__all__ = [
+    "load_detector_legacy",
+    "load_kernel_config_tf",
+    "load_embedding_tf",
+    "load_model_tf",
+    "load_optimizer_tf",
+    "prep_model_and_emb_tf",
+    "save_detector_legacy",
+    "save_model_config_tf",
+    "get_tf_dtype"
+]
File renamed without changes.
@@ -150,6 +150,9 @@ def save_embedding_config(embed: TransformerEmbedding,
     return cfg_embed
 
 
+#######################################################################################################
+# TODO: Everything below here is legacy saving code, and will be removed in the future
+#######################################################################################################
 def save_embedding_legacy(embed: TransformerEmbedding,
                           embed_args: dict,
                           filepath: Path) -> None:
@@ -177,9 +180,6 @@ def save_embedding_legacy(embed: TransformerEmbedding,
         dill.dump(embed_args, f)
 
 
-#######################################################################################################
-# TODO: Everything below here is legacy saving code, and will be removed in the future
-#######################################################################################################
 def save_detector_legacy(detector, filepath):
     detector_name = detector.meta['name']
 
4 changes: 2 additions & 2 deletions alibi_detect/saving/loading.py
@@ -12,7 +12,7 @@
 from transformers import AutoTokenizer
 
 from alibi_detect.saving.registry import registry
-from alibi_detect.saving.tensorflow import load_detector_legacy, load_embedding_tf, load_kernel_config_tf, \
+from alibi_detect.saving._tensorflow import load_detector_legacy, load_embedding_tf, load_kernel_config_tf, \
     load_model_tf, load_optimizer_tf, prep_model_and_emb_tf, get_tf_dtype
 from alibi_detect.saving.validate import validate_config
 from alibi_detect.base import Detector, ConfigurableDetector
@@ -129,7 +129,7 @@ def _load_detector_config(filepath: Union[str, os.PathLike]) -> ConfigurableDete
     # Backend
     backend = cfg.pop('backend')  # popping so that cfg left as kwargs + `name` when passed to _init_detector
     if backend.lower() != 'tensorflow':
-        raise NotImplementedError('Loading detectors with PyTorch or sklearn backend is not yet supported.')
+        raise NotImplementedError('Loading detectors with PyTorch, sklearn or keops backend is not yet supported.')
 
     # Init detector from config
     logger.info('Instantiating detector.')
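As the check above implies, only the TensorFlow backend can currently be restored from a detector config file. A rough sketch of what a caller sees for an unsupported backend (the saved-detector directory is hypothetical):

```python
# Sketch only: loading a config.toml whose `backend` is not 'tensorflow' raises.
from alibi_detect.saving import load_detector

try:
    cd = load_detector('./my_pytorch_detector')  # hypothetical dir with a pytorch-backend config.toml
except NotImplementedError as err:
    print(err)  # Loading detectors with PyTorch, sklearn or keops backend is not yet supported.
```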
2 changes: 1 addition & 1 deletion alibi_detect/saving/saving.py
@@ -14,7 +14,7 @@
 from alibi_detect.saving.loading import _replace, validate_config
 from alibi_detect.saving.registry import registry
 from alibi_detect.saving.schemas import SupportedModels
-from alibi_detect.saving.tensorflow import save_detector_legacy, save_model_config_tf
+from alibi_detect.saving._tensorflow import save_detector_legacy, save_model_config_tf
 from alibi_detect.base import Detector, ConfigurableDetector
 
 # do not extend pickle dispatch table so as not to change pickle behaviour
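Both modules sit behind the public `save_detector`/`load_detector` entry points. A minimal config-based round trip with the TensorFlow backend (detector choice and data are illustrative, not taken from this commit):

```python
import numpy as np
from alibi_detect.cd import MMDDrift
from alibi_detect.saving import save_detector, load_detector

x_ref = np.random.randn(100, 10).astype(np.float32)
cd = MMDDrift(x_ref, backend='tensorflow', p_val=.05)

save_detector(cd, './my_detector')            # writes config.toml plus referenced artefacts
cd_loaded = load_detector('./my_detector')    # reinstantiates an equivalent detector
preds = cd_loaded.predict(np.random.randn(100, 10).astype(np.float32))
```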
50 changes: 27 additions & 23 deletions alibi_detect/saving/tests/test_saving.py
@@ -176,7 +176,7 @@ def deep_kernel(request, backend, encoder_model):
 
 
 @fixture
-def classifier(backend, current_cases):
+def classifier_model(backend, current_cases):
     """
     Classification model with given input dimension and backend.
     """
@@ -204,7 +204,7 @@ def nlp_embedding_and_tokenizer(model_name, max_len, uae, backend):
 
     # Load tokenizer
     try:
-        tokenizer = AutoTokenizer.from_pretrained(model_name + 'TODO')
+        tokenizer = AutoTokenizer.from_pretrained(model_name)
     except (OSError, HTTPError):
         pytest.skip(f"Problem downloading {model_name} from huggingface.co")
     X = 'A dummy string'  # this will be padded to max_len
@@ -267,15 +267,18 @@ def preprocess_nlp(embedding, tokenizer, max_len, backend):
 
 
 @fixture
-def preprocess_hiddenoutput(classifier, backend):
+def preprocess_hiddenoutput(classifier_model, current_cases, backend):
     """
     Preprocess function to extract the softmax layer of a classifier (with the HiddenOutput utility function).
     """
+    _, _, data_params = current_cases["data"]
+    _, input_dim = data_params['data_shape']
+
     if backend == 'tensorflow':
-        model = HiddenOutput_tf(classifier, layer=-1)
+        model = HiddenOutput_tf(classifier_model, layer=-1, input_shape=(None, input_dim))
         preprocess_fn = partial(preprocess_drift_tf, model=model)
     else:
-        model = HiddenOutput_pt(classifier, layer=-1)
+        model = HiddenOutput_pt(classifier_model, layer=-1)
         preprocess_fn = partial(preprocess_drift_pt, model=model)
     return preprocess_fn
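The TensorFlow branch of this fixture follows the usual preprocessing recipe: wrap a trained classifier with `HiddenOutput` to expose one of its layers, then bind the wrapped model into `preprocess_drift`. A rough standalone sketch with a toy model (dimensions are illustrative):

```python
# Sketch of the TensorFlow preprocessing pattern exercised by the fixture above.
from functools import partial
import tensorflow as tf
from alibi_detect.cd.tensorflow import HiddenOutput, preprocess_drift

clf = tf.keras.Sequential([
    tf.keras.layers.Dense(32, activation='relu', input_shape=(10,)),
    tf.keras.layers.Dense(2, activation='softmax'),
])
# Expose the softmax layer's output as the preprocessing transform
model = HiddenOutput(clf, layer=-1, input_shape=(None, 10))
preprocess_fn = partial(preprocess_drift, model=model, batch_size=64)
```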

@@ -310,7 +313,7 @@ def test_load_simple_config(cfg, tmp_path):
 @parametrize_with_cases("data", cases=ContinuousData, prefix='data_')
 def test_save_ksdrift(data, preprocess_fn, tmp_path):
     """
-    Test KSDrift on continuous datasets, with UAE and classifier softmax output as preprocess_fn's. Only this
+    Test KSDrift on continuous datasets, with UAE and classifier_model softmax output as preprocess_fn's. Only this
     detector is tested with preprocessing strategies, as other detectors should see the same preprocess_fn output.
     Detector is saved and then loaded, with assertions checking that the reinstantiated detector is equivalent.
@@ -337,9 +340,9 @@ def test_save_ksdrift(data, preprocess_fn, tmp_path):
 
 @parametrize('preprocess_fn', [preprocess_nlp])
 @parametrize_with_cases("data", cases=TextData.movie_sentiment_data, prefix='data_')
-def test_save_ksdrift_nlp(data, preprocess_fn, max_len, enc_dim, tmp_path):
+def test_save_ksdrift_nlp(data, preprocess_fn, enc_dim, tmp_path):
     """
-    Test KSDrift on continuous datasets, with UAE and classifier softmax output as preprocess_fn's. Only this
+    Test KSDrift on continuous datasets, with UAE and classifier_model softmax output as preprocess_fn's. Only this
     detector is tested with embedding and embedding+uae, as other detectors should see the same preprocessed data.
     Detector is saved and then loaded, with assertions checking that the reinstantiated detector is equivalent.
@@ -350,7 +353,7 @@
                  p_val=P_VAL,
                  preprocess_fn=preprocess_fn,
                  preprocess_at_init=True,
-                 input_shape=(max_len,),
+                 input_shape=(768,),  # hardcoded to bert-base-cased for now
                  )
     save_detector(cd, tmp_path, legacy=False)
     cd_load = load_detector(tmp_path)
@@ -572,13 +575,13 @@ def test_save_tabulardrift(data, tmp_path):
 
 
 @parametrize_with_cases("data", cases=ContinuousData, prefix='data_')
-def test_save_classifierdrift(data, classifier, backend, tmp_path, seed):
+def test_save_classifierdrift(data, classifier_model, backend, tmp_path, seed):
     """ Test ClassifierDrift on continuous datasets."""
     # Init detector and predict
     X_ref, X_h0 = data
     with fixed_seed(seed):
         cd = ClassifierDrift(X_ref,
-                             model=classifier,
+                             model=classifier_model,
                              p_val=P_VAL,
                              n_folds=5,
                              backend=backend,
@@ -606,7 +609,7 @@
 
 
 @parametrize_with_cases("data", cases=ContinuousData, prefix='data_')
-def test_save_spotthediff(data, classifier, backend, tmp_path, seed):
+def test_save_spotthediff(data, classifier_model, backend, tmp_path, seed):
     """
     Test SpotTheDiffDrift on continuous datasets.
@@ -731,13 +734,13 @@ def test_save_contextmmddrift(data, kernel, backend, tmp_path, seed):
 
 
 @parametrize_with_cases("data", cases=ContinuousData, prefix='data_')
-def test_save_classifieruncertaintydrift(data, classifier, backend, tmp_path, seed):
+def test_save_classifieruncertaintydrift(data, classifier_model, backend, tmp_path, seed):
     """ Test ClassifierDrift on continuous datasets."""
     # Init detector and predict
     X_ref, X_h0 = data
     with fixed_seed(seed):
         cd = ClassifierUncertaintyDrift(X_ref,
-                                        model=classifier,
+                                        model=classifier_model,
                                         p_val=P_VAL,
                                         backend=backend,
                                         preds_type='probs',
@@ -1071,19 +1074,20 @@ def test_save_deepkernel(data, deep_kernel, backend, tmp_path):
     """
     # Get data dim
     X, _ = data
-    input_dim = X.shape[1]
+    input_shape = (X.shape[1],)
 
     # Save kernel to config
     filepath = tmp_path
     filename = 'mykernel'
     cfg_kernel = _save_kernel_config(deep_kernel, filepath, filename)
-    cfg_kernel['proj'], _ = _save_model_config(cfg_kernel['proj'], base_path=filepath, input_shape=input_dim,
+    cfg_kernel['proj'], _ = _save_model_config(cfg_kernel['proj'], base_path=filepath, input_shape=input_shape,
                                                backend=backend)
     cfg_kernel = _path2str(cfg_kernel)
-    cfg_kernel['proj'] = ModelConfig(**cfg_kernel['proj']).dict()  # Pass thru ModelConfig to set `custom_objects` etc
+    cfg_kernel['proj'] = ModelConfig(**cfg_kernel['proj']).dict()  # Pass thru ModelConfig to set `layers` etc
     cfg_kernel = DeepKernelConfig(**cfg_kernel).dict()  # pydantic validation
     assert cfg_kernel['proj']['src'] == 'model'
     assert cfg_kernel['proj']['custom_objects'] is None
+    assert cfg_kernel['proj']['layer'] is None
 
     # Resolve and load config
     cfg = {'kernel': cfg_kernel, 'backend': backend}
@@ -1115,10 +1119,10 @@ def test_save_preprocess(data, preprocess_fn, tmp_path, backend):
     # Save preprocess_fn to config
     filepath = tmp_path
     X_ref, X_h0 = data
-    input_dim = X_ref.shape[1]
+    input_shape = (X_ref.shape[1],)
     cfg_preprocess = _save_preprocess_config(preprocess_fn,
                                              backend=backend,
-                                             input_shape=input_dim,
+                                             input_shape=input_shape,
                                              filepath=filepath)
     cfg_preprocess = _path2str(cfg_preprocess)
     cfg_preprocess = PreprocessConfig(**cfg_preprocess).dict()  # pydantic validation
@@ -1136,7 +1140,7 @@
 
 @parametrize('preprocess_fn', [preprocess_nlp])
 @parametrize_with_cases("data", cases=TextData.movie_sentiment_data, prefix='data_')
-def test_save_preprocess_nlp(data, preprocess_fn, max_len, tmp_path, backend):
+def test_save_preprocess_nlp(data, preprocess_fn, tmp_path, backend):
     """
     Unit test for _save_preprocess_config and _load_preprocess_config, with text data.
@@ -1147,7 +1151,7 @@
     filepath = tmp_path
     cfg_preprocess = _save_preprocess_config(preprocess_fn,
                                              backend=backend,
-                                             input_shape=max_len,
+                                             input_shape=(768,),  # hardcoded to bert-base-cased for now
                                              filepath=filepath)
     cfg_preprocess = _path2str(cfg_preprocess)
     cfg_preprocess = PreprocessConfig(**cfg_preprocess).dict()  # pydantic validation
@@ -1185,8 +1189,8 @@ def test_save_model(data, model, layer, backend, tmp_path):
     """
     # Save model
     filepath = tmp_path
-    input_dim = data[0].shape[1]
-    cfg_model, _ = _save_model_config(model, base_path=filepath, input_shape=input_dim, backend=backend)
+    input_shape = (data[0].shape[1],)
+    cfg_model, _ = _save_model_config(model, base_path=filepath, input_shape=input_shape, backend=backend)
     cfg_model = _path2str(cfg_model)
     cfg_model = ModelConfig(**cfg_model).dict()
     assert tmp_path.joinpath('model').is_dir()
4 changes: 2 additions & 2 deletions alibi_detect/tests/test_dep_management.py
@@ -190,7 +190,7 @@ def test_fetching_utils_dependencies(opt_dep):
 
 
 def test_saving_tf_dependencies(opt_dep):
-    """Tests that the alibi_detect.saving.tensorflow module correctly protects against uninstalled optional
+    """Tests that the alibi_detect.saving._tensorflow module correctly protects against uninstalled optional
     dependencies.
     """
 
@@ -208,7 +208,7 @@ def test_saving_tf_dependencies(opt_dep):
             ('get_tf_dtype', ['tensorflow'])
     ]:
         dependency_map[dependency] = relations
-    from alibi_detect.saving import tensorflow as tf_saving
+    from alibi_detect.saving import _tensorflow as tf_saving
     check_correct_dependencies(tf_saving, dependency_map, opt_dep)
 
 
39 changes: 38 additions & 1 deletion doc/source/overview/saving.md
@@ -98,5 +98,42 @@ for the remaining detectors is in the [Roadmap](roadmap.md).
 ````
 
 ```{note}
-Saving/loading of detectors using PyTorch models and/or a PyTorch backend is currently not supported.
+For detectors with backends, or using preprocessing, save/load support is currently limited to TensorFlow models and backends.
 ```
+
+(supported_models)=
+## Supported ML models
+
+Alibi Detect drift detectors offer the option to perform [preprocessing](../cd/background.md#input-preprocessing)
+with user-defined machine learning models:
+
+```python
+model = ...  # TensorFlow model; tf.keras.Model or tf.keras.Sequential
+preprocess_fn = partial(preprocess_drift, model=model, batch_size=128)
+cd = MMDDrift(x_ref, backend='tensorflow', p_val=.05, preprocess_fn=preprocess_fn)
+```
+
+Additionally, some detectors are built upon models directly,
+for example the [Classifier](../cd/methods/classifierdrift.ipynb) drift detector requires a `model` to be passed
+as an argument:
+
+```python
+cd = ClassifierDrift(x_ref, model, p_val=.05, preds_type='probs')
+```
+
+In order for a detector to be saveable and loadable, any models contained within it (or referenced within a
+[detector configuration file](config_files.md#specifying-artefacts)) must fall within the family of supported models
+documented below.
+
+### TensorFlow models
+
+Alibi Detect supports any TensorFlow model that can be serialized to the
+[HDF5](https://www.tensorflow.org/guide/keras/save_and_serialize#keras_h5_format) format.
+Custom objects should be pre-registered with
+[register_keras_serializable](https://www.tensorflow.org/api_docs/python/tf/keras/utils/register_keras_serializable).
+```
+
+%### PyTorch
+%### scikit-learn
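To illustrate the pre-registration requirement described above (a sketch, not part of this commit's documentation), a custom Keras layer can be made HDF5-serializable as follows; the layer and package names are hypothetical:

```python
import tensorflow as tf

@tf.keras.utils.register_keras_serializable(package='my_package')
class Scale(tf.keras.layers.Layer):
    """Toy custom layer: multiplies inputs by a fixed factor."""
    def __init__(self, factor=2.0, **kwargs):
        super().__init__(**kwargs)
        self.factor = factor

    def call(self, x):
        return x * self.factor

    def get_config(self):
        cfg = super().get_config()
        cfg.update({'factor': self.factor})
        return cfg

model = tf.keras.Sequential([tf.keras.layers.Dense(5, input_shape=(3,)), Scale(1.5)])
model.save('model.h5')                               # HDF5 serialization
reloaded = tf.keras.models.load_model('model.h5')   # custom layer resolved via registration
```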
