
Commit

Add constraint capabilities, change tsemoa and optimizers to work with datasets
marcosfelt committed Aug 19, 2019
1 parent fcfe253 commit 8a24566
Showing 7 changed files with 278 additions and 113 deletions.
6 changes: 2 additions & 4 deletions summit/acquisition.py
@@ -101,7 +101,7 @@ def select_max(self, samples, num_evals=1):

        for i in range(num_evals):
            masked_samples = samples[mask, :]
-            Yfront, _ = pareto_efficient(Ynew)
+            Yfront, _ = pareto_efficient(Ynew, maximize=True)
            if len(Yfront) == 0:
                raise ValueError('Pareto front length too short')
@@ -112,7 +112,7 @@ def select_max(self, samples, num_evals=1):
            for sample in masked_samples:
                sample = sample.reshape(1, n)
                A = np.append(Ynew, sample, axis=0)
-                Afront, _ = pareto_efficient(A)
+                Afront, _ = pareto_efficient(A, maximize=True)
                hv = HvI.hypervolume(-Afront, [0, 0])
                hv_improvement.append(hv - hvY)

@@ -147,8 +147,6 @@ def hypervolume(pointset, ref):
    """Compute the absolute hypervolume of a *pointset* according to the
    reference point *ref*.
    """
-    warnings.warn("Falling back to the python version of hypervolume "
-                  "module. Expect this to be very slow.", RuntimeWarning)
    hv = _HyperVolume(ref)
    return hv.compute(pointset)
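
To see what the updated acquisition is computing, here is a minimal sketch of the hypervolume-improvement step, assuming pareto_efficient(Y, maximize=True) returns the Pareto front and its indices, and that fronts are negated so the reference point [0, 0] is dominated (the toy arrays are hypothetical):

import numpy as np

# Hypothetical maximization objectives; pareto_efficient and HvI come from summit
Ynew = np.array([[0.2, 0.9], [0.6, 0.6], [0.9, 0.1]])
Yfront, _ = pareto_efficient(Ynew, maximize=True)
hvY = HvI.hypervolume(-Yfront, [0, 0])

# Hypervolume improvement from adding one candidate point
sample = np.array([[0.8, 0.8]])
Afront, _ = pareto_efficient(np.append(Ynew, sample, axis=0), maximize=True)
improvement = HvI.hypervolume(-Afront, [0, 0]) - hvY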
51 changes: 44 additions & 7 deletions summit/data/dataset.py
@@ -14,6 +14,24 @@ class DataSet(pd.core.frame.DataFrame):
    ----
    Based on https://notes.mikejarrett.ca/storing-metadata-in-pandas-dataframes/
    """
+    def __init__(self, data=None, index=None, columns=None, metadata_columns=[], units=None, dtype=None, copy=False):
+        if isinstance(columns, pd.MultiIndex):
+            pass
+        elif columns is not None:
+            column_names = columns
+            if metadata_columns:
+                types = ['METADATA' if x in metadata_columns else 'DATA' for x in column_names]
+            else:
+                types = ['DATA' for _ in range(len(column_names))]
+            arrays = [column_names, types]
+            levels = ['NAME', 'TYPE']
+            if units:
+                arrays.append(units)
+                levels.append('UNITS')
+            tuples = list(zip(*arrays))
+            columns = pd.MultiIndex.from_tuples(tuples, names=levels)
+        pd.core.frame.DataFrame.__init__(self, data=data, index=index, columns=columns, dtype=dtype, copy=copy)
+
    @staticmethod
    def from_df(df: pd.DataFrame, metadata_columns: List = [],
                units: List = []):
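
With the new constructor, a DataSet can be built directly rather than via from_df; a short sketch (the column names, units, and values are hypothetical):

ds = DataSet([[0.5, 60.0, 'expt-1']],
             columns=['conversion', 'temperature', 'experiment_id'],
             metadata_columns=['experiment_id'],
             units=['%', 'degC', ''])
# Columns become a MultiIndex with NAME, TYPE (and optionally UNITS) levels;
# 'experiment_id' is tagged METADATA, the others DATA.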
@@ -83,7 +101,8 @@ def zero_to_one(self, small_tol=1.0e-5) -> np.ndarray:
        scaled[abs(scaled) < small_tol] = 0.0
        return scaled

-    def standardize(self, small_tol=1.0e-5) -> np.ndarray:
+    def standardize(self, small_tol=1.0e-5,
+                    return_mean=False, return_std=False, **kwargs) -> np.ndarray:
        """Standardize data columns by removing the mean and scaling to unit variance
        The standard score of each data column is calculated as:
@@ -97,6 +116,16 @@ def standardize(self, small_tol=1.0e-5) -> np.ndarray:
            The minimum value of any value in the final scaled array.
            This is used to prevent very small values that will cause
            issues in later calculations. Defaults to 1e-5.
+        return_mean: bool, optional
+            Return an array with the mean of each column in the DataSet
+        return_std: bool, optional
+            Return an array with the standard deviation of each column
+            in the DataSet
+        mean: array, optional
+            Pass a precalculated array of means for the columns
+        std: array, optional
+            Pass a precalculated array of standard deviations
+            for the columns
        Returns
        -------
@@ -110,11 +139,21 @@ def standardize(self, small_tol=1.0e-5) -> np.ndarray:
        """
        values = self.data_to_numpy()
        values = values.astype(np.float64)
-        mean = np.mean(values, axis=0)
-        sigma = np.std(values, axis=0)
+        mean = kwargs.get('mean', np.mean(values, axis=0))
+        sigma = kwargs.get('std', np.std(values, axis=0))
        standard = (values - mean) / sigma
        standard[abs(standard) < small_tol] = 0.0
-        return standard
+        if return_mean and return_std:
+            return standard, mean, sigma
+        elif return_mean:
+            return standard, mean
+        elif return_std:
+            return standard, sigma
+        else:
+            return standard

    @property
    def _constructor(self):
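
The new keywords let callers standardize training data once and then apply the same statistics to later data; a minimal sketch (ds_train and ds_new are hypothetical DataSets):

train_std, mean, std = ds_train.standardize(return_mean=True, return_std=True)
new_std = ds_new.standardize(mean=mean, std=std)  # reuse the training statistics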
@@ -182,6 +221,4 @@ def insert(self, loc, column, value, type='DATA', units=None, allow_duplicates=False):
        self.columns[loc][0] = column
        self.columns[loc][1] = type
        self.columns[loc][2] = units
-
-class ResultSet(DataSet):
-    data_column_types = ['input', 'output']

2 changes: 1 addition & 1 deletion summit/domain.py
@@ -357,7 +357,6 @@ def _html_table_rows(self):
        columns.append("")  # value column
        return ''.join([f"<td>{column}</td>" for column in columns])

-
class Domain:
    """Representation of the optimization domain
@@ -522,6 +521,7 @@ def from_dict(domain_dict):


    def __add__(self, obj):
+        # TODO: make this work with adding arrays of variables or constraints
        if isinstance(obj, Variable):
            if obj.is_objective and obj.variable_type != 'continuous':
                raise DomainError("Output variables must be continuous")
89 changes: 70 additions & 19 deletions summit/models.py
@@ -1,12 +1,12 @@
from summit.initial_design.latin_designer import lhs
from summit.data import DataSet

from GPy.models import GPRegression
from GPy.kern import Matern52
import numpy as np
from numpy import matlib
import scipy

from abc import ABC, abstractmethod
from sklearn.base import BaseEstimator, RegressorMixin


class Model(ABC):
@@ -23,7 +23,32 @@ def fit(self, X, Y):
    @abstractmethod
    def predict(self, X):
        pass

-class GPyModel(Model):
+class ModelGroup:
+    def __init__(self, models: dict):
+        self._models = models
+
+    @property
+    def models(self):
+        return self._models
+
+    def fit(self, X, y, **kwargs):
+        for column_name, model in self.models.items():
+            model.fit(X, y[[column_name]])
+
+    def predict(self, X, **kwargs):
+        """
+        Note
+        ----
+        This makes the assumption that each model returns an n_samples x 1 array
+        from its predict method.
+        """
+        result = [model.predict(X)[:, 0] for model in self.models.values()]
+        return np.array(result).T
+
+    def __getitem__(self, key):
+        return self.models[key]
+
+class GPyModel(BaseEstimator, RegressorMixin):
    ''' A Gaussian Process Regression model from GPy
    This is implemented as an alternative to the sklearn
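
A short usage sketch for ModelGroup, fitting one GP per output column (the DataSets X and y and the column names are hypothetical):

models = ModelGroup({'yield': GPyModel(input_dim=2),
                     'cost': GPyModel(input_dim=2)})
models.fit(X, y)           # each model is fit against its own output column
preds = models.predict(X)  # predictions stacked into an (n_samples, n_models) array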
@@ -54,7 +79,8 @@ def __init__(self, kernel=None, input_dim=None, noise_var=1.0, optimizer=None):
        else:
            if not input_dim:
                raise ValueError('input_dim must be specified if no kernel is specified.')
-            self._kernel = Matern52(input_dim=input_dim, ARD=True)
+            self.input_dim = input_dim
+            self._kernel = Matern52(input_dim=self.input_dim, ARD=True)
        self._noise_var = noise_var
        self._optimizer = optimizer
        self._model = None
@@ -63,10 +89,10 @@ def fit(self, X, y, num_restarts=10, max_iters=2000, parallel=False):
        """Fit Gaussian process regression model.
        Parameters
        ----------
-        X : array-like, shape = (n_samples, n_features)
-            Training data
-        y : array-like, shape = (n_samples, [n_output_dims])
-            Target values
+        X : DataSet
+            The data columns will be used as inputs for fitting the model
+        y : DataSet
+            The data columns will be used as outputs for fitting the model
        num_restarts : int, optional (default=10)
            The number of random restarts of the optimizer.
        max_iters : int, optional (default=2000)
@@ -79,7 +105,25 @@ def fit(self, X, y, num_restarts=10, max_iters=2000, parallel=False):
        self : returns an instance of self.
        -----
        """
-        self._model = GPRegression(X, y, self._kernel, noise_var=self._noise_var)
+        # Standardize inputs and outputs
+        if isinstance(X, DataSet):
+            X_std, self.input_mean, self.input_std = X.standardize(return_mean=True, return_std=True)
+        elif isinstance(X, np.ndarray):
+            self.input_mean = np.mean(X, axis=0)
+            self.input_std = np.std(X, axis=0)
+            X_std = (X - self.input_mean) / self.input_std
+            X_std[abs(X_std) < 1e-5] = 0.0
+
+        if isinstance(y, DataSet):
+            y_std, self.output_mean, self.output_std = y.standardize(return_mean=True, return_std=True)
+        elif isinstance(y, np.ndarray):
+            self.output_mean = np.mean(y, axis=0)
+            self.output_std = np.std(y, axis=0)
+            y_std = (y - self.output_mean) / self.output_std
+            y_std[abs(y_std) < 1e-5] = 0.0
+
+        # Initialize and fit model
+        self._model = GPRegression(X_std, y_std, self._kernel, noise_var=self._noise_var)
        if self._optimizer:
            self._model.optimize_restarts(num_restarts=num_restarts,
                                          verbose=False,
@@ -120,17 +164,24 @@ def predict(self, X,
            raise RuntimeError(
                "Not returning standard deviation of predictions when "
                "returning full covariance.")

-        m, v = self._model.predict(X)
-
-        if return_cov:
-            result = m, v
-        elif return_std:
-            result = m, self._model.Kdiag(X)
-        else:
-            result = m
-
-        return result
+        if isinstance(X, np.ndarray):
+            X_std = (X - self.input_mean) / self.input_std
+            X_std[abs(X_std) < 1e-5] = 0.0
+        elif isinstance(X, DataSet):
+            X_std = X.standardize(mean=self.input_mean, std=self.input_std)
+
+        m_std, v_std = self._model.predict(X_std)
+        m = m_std * self.output_std + self.output_mean
+
+        # if return_cov:
+        #     result = m, v
+        # elif return_std:
+        #     result = m, self._model.Kdiag(X)
+        # else:
+        #     result = m
+
+        return m

class AnalyticalModel(Model):
    ''' An analytical model instead of statistical model
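
Putting fit and predict together: standardization is now handled internally, so callers stay on the original scale throughout (X_train, y_train, and X_test are hypothetical DataSets):

model = GPyModel(input_dim=2)
model.fit(X_train, y_train, num_restarts=5)
y_pred = model.predict(X_test)  # means are de-standardized before being returned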
40 changes: 29 additions & 11 deletions summit/optimizers.py
@@ -7,7 +7,7 @@
from typing import List
from summit.domain import Domain, DomainError
from summit.initial_design import RandomDesigner
-# from .objective import ObjectiveWrapper
+from summit.data import DataSet

from abc import ABC, abstractmethod
import numpy as np
@@ -62,12 +62,17 @@ def is_multiobjective(self):
class NSGAII(Optimizer):
    def __init__(self, domain: Domain):
        Optimizer.__init__(self, domain)
+        # Set up platypus problem
        self.problem = pp.Problem(nvars=self.domain.num_variables(),
-                                 nobjs=len(self.domain.output_variables))
+                                 nobjs=len(self.domain.output_variables),
+                                 nconstrs=len(self.domain.constraints))
+        # Set maximization or minimization for each objective
+        j = 0
        for i, v in enumerate(self.domain.variables):
            if v.is_objective:
-                continue
-            if v.variable_type == "continuous":
+                direction = self.problem.MAXIMIZE if v.maximize else self.problem.MINIMIZE
+                self.problem.directions[j] = direction
+            elif v.variable_type == "continuous":
                self.problem.types[i] = pp.Real(v.lower_bound, v.upper_bound)
            elif v.variable_type == "discrete":
                # Select a subset of one of the available options
@@ -78,23 +83,36 @@ def __init__(self, domain: Domain):
            else:
                raise DomainError(f'{v.variable_type} is not a valid variable type.')

-    def _optimize(self, objective, **kwargs):
+        # Set up constraints
+        self.problem.constraints[:] = "<=0"
+
+    def _optimize(self, models, **kwargs):
+        input_columns = [v.name for v in self.domain.variables if not v.is_objective]
+        output_columns = [v.name for v in self.domain.variables if v.is_objective]
+
        def problem_wrapper(X):
-            np.atleast_2d(X)
-            result = objective(X)
-            return result.tolist()
+            X = DataSet(np.atleast_2d(X),
+                        columns=input_columns)
+            result = models.predict(X)
+
+            constraint_res = [X.eval(c.expression, resolvers=[X])
+                              for c in self.domain.constraints]
+            constraint_res = [c.tolist()[0] for c in constraint_res]
+
+            return result[0, :].tolist(), constraint_res

+        # Run optimization
        self.problem.function = problem_wrapper
-        self.problem.directions[:] = pp.Problem.MAXIMIZE
        algorithm = pp.NSGAII(self.problem)
        iterations = kwargs.get('iterations', 10000)
        algorithm.run(iterations)

        x = [[s.variables[i] for i in range(self.domain.num_variables())]
             for s in algorithm.result]
        x = np.array(x)
+        x = DataSet(x, columns=input_columns)
        y = [[s.objectives[i] for i in range(len(self.domain.output_variables))]
             for s in algorithm.result]
        y = np.array(y)
+        y = DataSet(y, columns=output_columns)
        return OptimizeResult(x=x, fun=y, success=True)

class MCOptimizer(Optimizer):
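
End to end, the optimizer now consumes models and a constrained domain; a sketch assuming the base Optimizer exposes a public optimize method wrapping _optimize (the domain and models objects are hypothetical):

optimizer = NSGAII(domain)              # picks up domain.constraints via nconstrs
result = optimizer.optimize(models, iterations=5000)
pareto_inputs = result.x                # DataSet of non-dominated input settings
pareto_outputs = result.fun             # DataSet of the corresponding objectives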