serialize backend and test improvements #246

Merged
merged 63 commits on Oct 15, 2019

Commits (63)

2851151
clean up #243
ardunn Oct 11, 2019
f8e6ca4
add serialize as method to DFMLAdaptor Base
ardunn Oct 11, 2019
e0e2ae2
wip better serialization
ardunn Oct 11, 2019
0617c5e
working serialization, no tests
ardunn Oct 11, 2019
b628b59
fixing #241
ardunn Oct 11, 2019
2b9ef9b
fix #234
ardunn Oct 11, 2019
b58ae35
fixes #230
ardunn Oct 11, 2019
ce6ba6d
add a from_preset method, closes #232
ardunn Oct 11, 2019
449dc11
add StructuralComplexity
ardunn Oct 11, 2019
50db0b4
update readme
ardunn Oct 11, 2019
c3fdf92
update workflows
ardunn Oct 11, 2019
84adc9f
fix #226
ardunn Oct 11, 2019
1ae52b7
wip serialization tests with new requirements
ardunn Oct 12, 2019
cc13cb1
try use global variable for temp backend
ardunn Oct 12, 2019
8bae684
add test for instantiating via presets, and separation into multiple …
ardunn Oct 12, 2019
dfa88f2
version test does not need external file.
ardunn Oct 12, 2019
d51ed32
add skip for intensive in debug tpot pipeline tests
ardunn Oct 12, 2019
bdcdc1a
removing pesky prints
ardunn Oct 12, 2019
a196ce2
wip fixing double saves of matpipes
ardunn Oct 14, 2019
d281392
fix tests
ardunn Oct 14, 2019
adf0287
update docs with new logo
ardunn Oct 14, 2019
fa00aab
make sure version pipe gets cleaned up in teardown
ardunn Oct 14, 2019
25737f3
wip improved digest
ardunn Oct 14, 2019
fef4146
wip two digest methods 2
ardunn Oct 14, 2019
1d6d348
working test for save_dict_to_file
ardunn Oct 14, 2019
9d14b4d
working and pretty test for save_dict_to_file
ardunn Oct 14, 2019
60e1c39
update ci configuration
ardunn Oct 14, 2019
5c194a4
working summary and details with tests
ardunn Oct 14, 2019
c5aab51
tmp - use git source for matminer until 0.6.1 is released
ardunn Oct 14, 2019
6394824
remove xlrd from requirements
ardunn Oct 14, 2019
f3722d8
change matminer back to 0.6.1
ardunn Oct 14, 2019
7c1d114
reenable all tests
ardunn Oct 14, 2019
0f4bbda
reenable single pipeline tests
ardunn Oct 14, 2019
d473ac7
wip docs
ardunn Oct 14, 2019
990b455
refactor summary --> summarize and details --> inspect
ardunn Oct 14, 2019
06c9b2b
wip docs 2
ardunn Oct 14, 2019
6885440
finish basic documentation
ardunn Oct 14, 2019
0d9968e
fix code-block of log in basic docs not looking right
ardunn Oct 14, 2019
16ada54
updates to basic docs
ardunn Oct 14, 2019
f6105fe
matminer version upgrade
ardunn Oct 14, 2019
add18da
fix logging, fixes #204
ardunn Oct 14, 2019
ffee0d4
add teardown to log test
ardunn Oct 14, 2019
a75f1c2
change all is_fit declarations in init to super calls to DFTransformer
ardunn Oct 14, 2019
4e04e75
wip adding warning for large numbers of handled nans
ardunn Oct 14, 2019
8309eda
fix #199
ardunn Oct 14, 2019
b82849f
wip working on ignored columns
ardunn Oct 14, 2019
1e77201
working ignore on predict no tests
ardunn Oct 14, 2019
ffb2cb9
passing tests and better logging for ignoring columns, fixes #228
ardunn Oct 14, 2019
231c494
reenable tests
ardunn Oct 14, 2019
d1fb510
fixed tests for datacleaner
ardunn Oct 14, 2019
f828935
update code docs, fixes #244
ardunn Oct 14, 2019
61e46fa
fix dumb pipeline ignore default
ardunn Oct 14, 2019
084f2d0
add ignore to benchmark
ardunn Oct 14, 2019
51e0c27
wip docs
ardunn Oct 14, 2019
69f1515
wip docs 2
ardunn Oct 15, 2019
6b5fa05
advanced usage is done
ardunn Oct 15, 2019
57982af
add matbench documentation
ardunn Oct 15, 2019
18f7194
add tutorials and clean up support
ardunn Oct 15, 2019
c584ade
updates to docs
ardunn Oct 15, 2019
745587e
close to getting final docs
ardunn Oct 15, 2019
6401a68
add rst files
ardunn Oct 15, 2019
0e575bd
docs in good shape
ardunn Oct 15, 2019
20b0721
update docs one last time [skip ci]
ardunn Oct 15, 2019
Files changed

2 changes: 1 addition & 1 deletion .circleci/config.yml
@@ -45,7 +45,7 @@ jobs:
coverage run setup.py test
coverage xml
python-codacy-coverage -r coverage.xml
no_output_timeout: 120m
no_output_timeout: 10m

- save_cache:
paths:
109 changes: 0 additions & 109 deletions .circleci/config_old.yml

This file was deleted.

2 changes: 1 addition & 1 deletion MANIFEST.in
@@ -1,5 +1,5 @@
include LICENSE
include CHANGELOG.md
include CONTRIBUTING.md
recursive-include automatminer *.txt *.py *.yaml *.json *.csv
recursive-include automatminer *.txt *.py *.yaml *.json *.csv *.p *.pickle
recursive-exclude benchdev *
8 changes: 3 additions & 5 deletions README.md
@@ -7,11 +7,9 @@ automatminer is an automatic prediction engine for materials properties.
|:----------:|:-------------:|:------:|:------:|
| [![CircleCI](https://img.shields.io/circleci/project/github/hackingmaterials/automatminer/master.svg)](https://circleci.com/gh/hackingmaterials/automatminer) | [![Codacy Badge](https://img.shields.io/codacy/coverage/aa63dd7aa85e480bbe0e924a02ad1540.svg?colorB=brightgreen)](https://www.codacy.com/app/ardunn/automatminer) | [![Codacy Badge](https://img.shields.io/codacy/grade/aa63dd7aa85e480bbe0e924a02ad1540.svg)](https://www.codacy.com/app/ardunn/automatminer) | [![PyPI version](https://img.shields.io/pypi/v/automatminer.svg?colorB=blue)](https://pypi.org/project/automatminer/) |

### Warning: Automatminer is currently at an experimental stage of development.
#### Please use in production at your own risk!

#### Automatminer requires the newest version of [matminer](https://github.com/hackingmaterials/matminer) (from git) to work properly!

- **Website (including work-in-progress documentation):** <http://hackingmaterials.lbl.gov/automatminer/>
- **Help/Support:** https://hackingmaterials.discourse.group/c/matminer/automatminer
- **Source:** <https://github.com/hackingmaterials/automatminer>

You may also be interested in the parent code of automatminer, matminer:
- **Matminer**: <https://github.com/hackingmaterials/matminer>
4 changes: 2 additions & 2 deletions automatminer/__init__.py
@@ -1,10 +1,10 @@
from automatminer.preprocessing import DataCleaner, FeatureReducer
from automatminer.automl import TPOTAdaptor
from automatminer.automl import TPOTAdaptor, SinglePipelineAdaptor
from automatminer.featurization import AutoFeaturizer
from automatminer.pipeline import MatPipe
from automatminer.presets import get_preset_config

__author__ = 'Alex Dunn, Qi Wang, Alex Ganose, Alireza Faghaninia, Anubhav Jain'
__author_email__ = 'ardunn@lbl.gov'
__license__ = 'Modified BSD'
__version__ = "2019.9.12"
__version__ = "2019.10.11"
151 changes: 97 additions & 54 deletions automatminer/automl/adaptors.py
@@ -8,9 +8,7 @@
"""
from collections import OrderedDict

from sklearn.pipeline import Pipeline
from tpot import TPOTClassifier, TPOTRegressor
from tpot.base import TPOTBase

from automatminer.automl.config.tpot_configs import TPOT_CLASSIFIER_CONFIG, \
TPOT_REGRESSOR_CONFIG
@@ -27,6 +25,8 @@
'Qi Wang <wqthu11@gmail.com>',
'Daniel Dopp <dbdopp@lbl.gov>']

_adaptor_tmp_backend = None


class TPOTAdaptor(DFMLAdaptor, LoggableMixin):
"""
@@ -61,6 +61,10 @@ class TPOTAdaptor(DFMLAdaptor, LoggableMixin):
best_models (OrderedDict): The best model names and their scores.
backend (TPOTBase): The TPOT object interface used for ML training.
models (OrderedDict): The raw sklearn-style models output by TPOT.

from_serialized (bool): Whether the backend is loaded from a serialized
instance. If True, the previous full TPOT data will not be available
due to pickling problems.
"""

def __init__(self, logger=True, **tpot_kwargs):
@@ -80,6 +84,10 @@ def __init__(self, logger=True, **tpot_kwargs):
self._features = None
self.logger = logger

self.from_serialized = False
self._best_models = None
super(DFMLAdaptor, self).__init__()

@log_progress(AMM_LOG_FIT_STR)
@set_fitted
def fit(self, df, target, **fit_kwargs):
@@ -148,69 +156,110 @@ def best_models(self):
best hyperparameter combination found.

"""
self.greater_score_is_better = is_greater_better(
self.backend.scoring_function)

# Get list of evaluated model names, cast to set and back
# to get unique model names, instantiate ordered model dictionary
evaluated_models = []
for key in self.backend.evaluated_individuals_.keys():
evaluated_models.append(key.split('(')[0])

model_names = list(set(evaluated_models))
models = OrderedDict({model: [] for model in model_names})

# This makes a dict of model names mapped to all runs of that model
for key, val in self.backend.evaluated_individuals_.items():
models[key.split('(')[0]].append(val)

# For each base model type sort the runs by best score
for model_name in model_names:
models[model_name].sort(
key=lambda x: x['internal_cv_score'],
reverse=self.greater_score_is_better
)

# Gets a simplified dict of the model to only its best run
# Sort the best individual models by type to best models overall
best_models = OrderedDict(
sorted({model: models[model][0] for model in models}.items(),
key=lambda x: x[1]['internal_cv_score'],
reverse=self.greater_score_is_better))

# Mapping of top models to just their score
scores = {model: best_models[model]['internal_cv_score']
for model in best_models}

# Sorted dict of top models just mapped to their top scores
best_models_and_scores = OrderedDict(
sorted(scores.items(),
key=lambda x: x[1],
reverse=self.greater_score_is_better))
self.models = models
return best_models_and_scores

if self.from_serialized:
return self._best_models
else:
self.greater_score_is_better = is_greater_better(
self.backend.scoring_function)

# Get list of evaluated model names, cast to set and back
# to get unique model names, instantiate ordered model dictionary
evaluated_models = []
for key in self.backend.evaluated_individuals_.keys():
evaluated_models.append(key.split('(')[0])
# evaluated_models.append(key)

model_names = list(set(evaluated_models))
models = OrderedDict({model: [] for model in model_names})

# This makes a dict of model names mapped to all runs of that model
for key, val in self.backend.evaluated_individuals_.items():
models[key.split('(')[0]].append(val)

# For each base model type sort the runs by best score
for model_name in model_names:
models[model_name].sort(
key=lambda x: x['internal_cv_score'],
reverse=self.greater_score_is_better
)

# Gets a simplified dict of the model to only its best run
# Sort the best individual models by type to best models overall
best_models = OrderedDict(
sorted({model: models[model][0] for model in models}.items(),
key=lambda x: x[1]['internal_cv_score'],
reverse=self.greater_score_is_better))

# Mapping of top models to just their score
scores = {model: best_models[model]['internal_cv_score']
for model in best_models}

# Sorted dict of top models just mapped to their top scores
best_models_and_scores = OrderedDict(
sorted(scores.items(),
key=lambda x: x[1],
reverse=self.greater_score_is_better))
self.models = models
return best_models_and_scores

@property
@check_fitted
def backend(self):
return self._backend

@property
@check_fitted
def best_pipeline(self):
if isinstance(self._backend, TPOTBase):
return self._backend.fitted_pipeline_
elif isinstance(self._backend, Pipeline):
if self.from_serialized:
# The TPOT backend is replaced by the best pipeline.
return self._backend
else:
raise TypeError("Backend type not recognized as TPOT or Pipeline")
return self._backend.fitted_pipeline_

@property
@check_fitted
def features(self):
return self._features

@property
@check_fitted
def fitted_target(self):
return self._fitted_target

@check_fitted
def serialize(self) -> None:
"""
Avoid TPOT pickling issues. Used by MatPipe during save.

Returns:
    None

"""
if not self.from_serialized:
global _adaptor_tmp_backend
_adaptor_tmp_backend = self._backend
# Necessary for getting best models post serialization
self._best_models = self.best_models
self._backend = self.best_pipeline
self.from_serialized = True

@check_fitted
def deserialize(self) -> None:
"""
Get the original TPOTAdaptor image back after serializing, with
(relatively) contained scope.

Returns:
None
"""
if not self.from_serialized:
global _adaptor_tmp_backend
self._backend = _adaptor_tmp_backend
_adaptor_tmp_backend = None
self.from_serialized = False


class SinglePipelineAdaptor(DFMLAdaptor, LoggableMixin):
"""
Expand All @@ -236,11 +285,6 @@ class SinglePipelineAdaptor(DFMLAdaptor, LoggableMixin):

mode (str): Either AMM_REG_NAME (regression) or AMM_CLF_NAME
(classification)
_regressor (BaseEstimator): The single pipeline to be used for
regression
_classifier (BaseEstimator): The single pipeline to be used for
classification

"""

def __init__(self, regressor, classifier, logger=True):
@@ -278,7 +322,7 @@ def fit(self, df, target, **fit_kwargs):
@property
@check_fitted
def backend(self):
return None
return self.best_pipeline

@property
@check_fitted
@@ -294,4 +338,3 @@ def features(self):
@check_fitted
def fitted_target(self):
return self._fitted_target
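
The serialize()/deserialize() pair added in this diff is a common workaround for objects whose backend cannot be pickled: before saving, the heavy TPOT object is stashed in a module-level temporary and replaced by its picklable fitted pipeline, and afterwards the original backend is restored in memory. Below is a minimal, self-contained sketch of that pattern; it is not the automatminer code — the class, names, toy data, and guard logic are illustrative only (see the diff above for the exact automatminer implementation).

import pickle
import threading

# Module-level stash, mirroring the role of _adaptor_tmp_backend above.
_tmp_backend = None


class ToyAdaptor:
    """Stand-in for TPOTAdaptor, only to illustrate the swap-and-restore idea."""

    def __init__(self, backend, best_pipeline):
        self.backend = backend              # stands in for the unpicklable TPOT object
        self.best_pipeline = best_pipeline  # a plain, picklable fitted pipeline
        self.from_serialized = False

    def serialize(self):
        """Swap the unpicklable backend for the picklable fitted pipeline."""
        global _tmp_backend
        if not self.from_serialized:
            _tmp_backend = self.backend
            self.backend = self.best_pipeline
            self.from_serialized = True

    def deserialize(self):
        """Restore the original backend stashed by serialize()."""
        global _tmp_backend
        if self.from_serialized:
            self.backend = _tmp_backend
            _tmp_backend = None
            self.from_serialized = False


# A thread lock cannot be pickled, standing in for the TPOT backend.
adaptor = ToyAdaptor(backend=threading.Lock(), best_pipeline={"model": "fitted"})

adaptor.serialize()
blob = pickle.dumps(adaptor)  # succeeds: only the picklable pipeline is stored
adaptor.deserialize()         # the in-memory object gets its full backend back

restored = pickle.loads(blob)
print(restored.from_serialized)  # True: loaded copies keep only the best pipeline
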
