Merge pull request #3 from sustainable-processes/datasets
Datasets enabling easier problem specification
marcosfelt authored Aug 19, 2019
2 parents c4e8ccf + 8a24566 commit 804c85b
Showing 7 changed files with 329 additions and 118 deletions.
6 changes: 2 additions & 4 deletions summit/acquisition.py
@@ -101,7 +101,7 @@ def select_max(self, samples, num_evals=1):

for i in range(num_evals):
masked_samples = samples[mask, :]
-            Yfront, _ = pareto_efficient(Ynew)
+            Yfront, _ = pareto_efficient(Ynew, maximize=True)
if len(Yfront) ==0:
raise ValueError('Pareto front length too short')

@@ -112,7 +112,7 @@ def select_max(self, samples, num_evals=1):
for sample in masked_samples:
sample = sample.reshape(1,n)
A = np.append(Ynew, sample, axis=0)
-                Afront, _ = pareto_efficient(A)
+                Afront, _ = pareto_efficient(A, maximize=True)
hv = HvI.hypervolume(-Afront, [0,0])
hv_improvement.append(hv-hvY)

@@ -147,8 +147,6 @@ def hypervolume(pointset, ref):
"""Compute the absolute hypervolume of a *pointset* according to the
reference point *ref*.
"""
-    warnings.warn("Falling back to the python version of hypervolume "
-                  "module. Expect this to be very slow.", RuntimeWarning)
hv = _HyperVolume(ref)
return hv.compute(pointset)
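
A minimal sketch of calling this wrapper (assuming HvI is importable from summit.acquisition and exposes hypervolume as a static method, per the call in select_max above; the front is illustrative):

import numpy as np
from summit.acquisition import HvI  # assumed import path

# An illustrative maximizing Pareto front, negated as in select_max above
# so that the hypervolume is computed for minimization against ref = [0, 0]
front = np.array([[0.8, 0.2], [0.5, 0.5], [0.1, 0.9]])
hv = HvI.hypervolume(-front, [0, 0])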

51 changes: 44 additions & 7 deletions summit/data/dataset.py
@@ -14,6 +14,24 @@ class DataSet(pd.core.frame.DataFrame):
----
Based on https://notes.mikejarrett.ca/storing-metadata-in-pandas-dataframes/
"""
def __init__(self, data=None, index=None, columns=None, metadata_columns=[], units=None, dtype=None, copy=False):
if isinstance(columns, pd.MultiIndex):
pass
elif columns is not None:
column_names = columns
if metadata_columns:
types = ['METADATA' if x in metadata_columns else 'DATA' for x in column_names]
else:
types = ['DATA' for _ in range(len(column_names))]
arrays = [column_names, types]
levels = ['NAME', 'TYPE']
if units:
arrays.append(units)
levels.append('UNITS')
tuples=list(zip(*arrays))
columns = pd.MultiIndex.from_tuples(tuples, names=levels)
pd.core.frame.DataFrame.__init__(self, data=data, index=index, columns=columns, dtype=dtype, copy=copy)
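
A minimal sketch of what the new constructor enables (the column names, metadata flag, and units are illustrative):

from summit.data import DataSet

ds = DataSet(data=[[30.0, 0.1, 'run_1'], [50.0, 0.2, 'run_2']],
             columns=['temperature', 'flowrate', 'label'],
             metadata_columns=['label'],
             units=['degC', 'mL/min', ''])
# ds.columns is a MultiIndex with NAME, TYPE and UNITS levels;
# 'label' is tagged METADATA and the other columns DATA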

@staticmethod
def from_df(df: pd.DataFrame, metadata_columns: List=[],
units: List = []):
@@ -83,7 +101,8 @@ def zero_to_one(self, small_tol=1.0e-5) -> np.ndarray:
scaled[abs(scaled) < small_tol] = 0.0
return scaled

-    def standardize(self, small_tol=1.0e-5) -> np.ndarray:
+    def standardize(self, small_tol=1.0e-5,
+                    return_mean=False, return_std=False, **kwargs) -> np.ndarray:
"""Standardize data columns by removing the mean and scaling to unit variance
The standard score of each data column is calculated as:
@@ -97,6 +116,16 @@ def standardize(self, small_tol=1.0e-5) -> np.ndarray:
The minimum value of any value in the final scaled array.
This is used to prevent very small values that will cause
issues in later calculations. Defaults to 1e-5.
return_mean: bool, optional
Return an array with the mean of each column in the DataSet
return_std: bool, optional
Return an array with the standard deviation of each column
in the DataSet
mean: array, optional
Pass a precalculated array of means for the columns
std: array, optional
Pass a precalculated array of standard deviations
for the columns
Returns
-------
@@ -110,11 +139,21 @@ def standardize(self, small_tol=1.0e-5) -> np.ndarray:
"""
values = self.data_to_numpy()
values = values.astype(np.float64)
-        mean = np.mean(values, axis=0)
-        sigma = np.std(values, axis=0)
+        mean = kwargs.get('mean',
+                          np.mean(values, axis=0))
+        sigma = kwargs.get('std',
+                           np.std(values, axis=0))
standard = (values-mean)/sigma
standard[abs(standard) < small_tol] = 0.0
-        return standard
+        if return_mean and return_std:
+            return standard, mean, sigma
+        elif return_mean:
+            return standard, mean
+        elif return_std:
+            return standard, sigma
+        else:
+            return standard
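
A sketch of the intended round trip: standardize training data once, keep the statistics, and reuse them on new data via the new keyword arguments (ds_train and ds_new are illustrative DataSets):

train_std, mean, std = ds_train.standardize(return_mean=True, return_std=True)
new_std = ds_new.standardize(mean=mean, std=std)  # reuse the training statistics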

@property
def _constructor(self):
@@ -182,6 +221,4 @@ def insert(self, loc, column, value, type='DATA', units=None, allow_duplicates=F
self.columns[loc][0] = column
self.columns[loc][1] = type
self.columns[loc][2] = units

class ResultSet(DataSet):
data_column_types = ['input', 'output']

58 changes: 52 additions & 6 deletions summit/domain.py
@@ -176,6 +176,11 @@ def from_dict(variable_dict):
description=variable_dict['description'],
bounds=variable_dict['bounds'],
is_objective=variable_dict['is_objective'])

def __add__(self, other):
if isinstance(other, self.__class__):
return [self, other]
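
A sketch of what this enables (names and bounds are illustrative); note that adding variables of different classes falls through and returns None:

t = ContinuousVariable('temperature', 'reaction temperature', [1, 100])
p = ContinuousVariable('pressure', 'reaction pressure', [1, 5])
pair = t + p  # [t, p]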


class DiscreteVariable(Variable):
"""Representation of a discrete variable
@@ -335,6 +340,24 @@ def from_dict(variable_dict):
def _html_table_rows(self):
return self._make_html_table_rows(f"{self.num_examples} examples of {self.num_descriptors} descriptors")


class Constraint:
def __init__(self, expression):
self._expression = expression

@property
def expression(self):
return self._expression

def _html_table_rows(self):
columns = []
columns.append("") #name column
columns.append("constraint") #type column
columns.append(self.expression) #description columns
columns.append("") #value column
return ''.join([f"<td>{column}</td>" for column in columns])
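
A sketch of constructing a constraint; treating the expression as a string over variable names is an assumption based on how it is rendered in the HTML table:

con = Constraint('temperature + 2*flowrate < 100')
con.expression  # 'temperature + 2*flowrate < 100'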


class Domain:
"""Representation of the optimization domain
@@ -353,8 +376,9 @@ class Domain:
>>> domain += ContinuousVariable('temperature', 'reaction temperature', [1, 100])
"""
-    def __init__(self, variables:Optional[List[Type[Variable]]]=[]):
+    def __init__(self, variables=[], constraints=[]):
        self._variables = variables
+        self._constraints = constraints
#Check that all the output variables continuous
self.raise_noncontinuous_outputs()

@@ -363,6 +387,20 @@ def variables(self):
"""[List[Type[Variable]]]: List of variables in the domain"""
return self._variables

@property
def constraints(self):
return self._constraints

# def __getitem__(self, key):
# '''For accessing variables like a dictionary'''
# for v in self.variables:
# if v.name == key:
# return v
# raise KeyError(f'No variable {key} found in the domain.')

# def _ipython_key_completions_(self):
# return [v.name for v in self.variables]

@property
def input_variables(self):
input_variables = []
@@ -482,10 +520,16 @@ def from_dict(domain_dict):
return Domain(variables)


-    def __add__(self, var):
-        if var.is_objective and var.variable_type != 'continuous':
-            DomainError("Output variables must be continuous")
-        return Domain(self._variables + [var])
+    def __add__(self, obj):
+        #TODO: make this work with adding arrays of variable or constraints
+        if isinstance(obj, Variable):
+            if obj.is_objective and obj.variable_type != 'continuous':
+                raise DomainError("Output variables must be continuous")
+            return Domain(variables=self._variables + [obj], constraints=self.constraints)
+        elif isinstance(obj, Constraint):
+            return Domain(variables=self.variables, constraints = self.constraints + [obj])
+        else:
+            raise RuntimeError('Not a supported domain object.')
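
A sketch of composing a domain from variables and a constraint with the extended __add__ (the names and the expression are illustrative):

domain = Domain()
domain += ContinuousVariable('temperature', 'reaction temperature', [1, 100])
domain += ContinuousVariable('flowrate', 'reagent flowrate', [0.1, 1.0])
domain += Constraint('temperature + 100*flowrate < 150')
# domain.variables now holds two variables and domain.constraints one constraint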

def _repr_html_(self):
"""Build html string for table display in jupyter notebooks.
@@ -510,7 +554,9 @@ def _repr_html_(self):
return ''.join(html)

def _html_table_rows(self):
-        return ''.join(map(lambda l: l._html_table_rows(), self._variables))
+        variables = ''.join([v._html_table_rows() for v in self.variables])
+        constraints = ''.join([c._html_table_rows() for c in self.constraints])
+        return variables + constraints


class DomainError(Exception):
89 changes: 70 additions & 19 deletions summit/models.py
@@ -1,12 +1,12 @@
from summit.initial_design.latin_designer import lhs
from summit.data import DataSet

from GPy.models import GPRegression
from GPy.kern import Matern52
import numpy as np
from numpy import matlib
import scipy

from abc import ABC, abstractmethod
from sklearn.base import BaseEstimator, RegressorMixin


class Model(ABC):
@@ -23,7 +23,32 @@ def fit(self, X, Y):
def predict(self, X):
pass

-class GPyModel(Model):
class ModelGroup:
def __init__(self, models: dict):
self._models = models

@property
def models(self):
return self._models

def fit(self, X, y, **kwargs):
for column_name, model in self.models.items():
model.fit(X, y[[column_name]])

def predict(self, X, **kwargs):
"""
Note
-----
This makes the assumption that each model returns an n_samples x 1 array
from the predict method.
"""
result = [model.predict(X)[:, 0] for model in self.models.values()]
return np.array(result).T

def __getitem__(self, key):
return self.models[key]
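
A sketch of fitting one surrogate per objective column (the column names and data are illustrative; X_train and y_train are DataSets whose DATA columns feed the models):

group = ModelGroup({'yield': GPyModel(input_dim=2),
                    'cost': GPyModel(input_dim=2)})
group.fit(X_train, y_train)    # each model is fit against its own output column
preds = group.predict(X_test)  # shape (n_samples, 2): one column per model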

+class GPyModel(BaseEstimator, RegressorMixin):
''' A Gaussian Process Regression model from GPy
This is implemented as an alternative to the sklearn
@@ -54,7 +79,8 @@ def __init__(self, kernel=None, input_dim=None,noise_var=1.0, optimizer=None):
else:
if not input_dim:
raise ValueError('input_dim must be specified if no kernel is specified.')
-            self._kernel = Matern52(input_dim = input_dim, ARD=True)
+            self.input_dim = input_dim
+            self._kernel = Matern52(input_dim = self.input_dim, ARD=True)
self._noise_var = noise_var
self._optimizer = optimizer
self._model = None
@@ -63,10 +89,10 @@ def fit(self, X, y, num_restarts=10, max_iters=2000, parallel=False):
"""Fit Gaussian process regression model.
Parameters
----------
-        X : array-like, shape = (n_samples, n_features)
-            Training data
-        y : array-like, shape = (n_samples, [n_output_dims])
-            Target values
+        X : DataSet
+            The data columns will be used as inputs for fitting the model
+        y : DataSet
+            The data columns will be used as outputs for fitting the model
num_restarts : int, optional (default=10)
The number of random restarts of the optimizer.
max_iters : int, optional (default=2000)
@@ -79,7 +105,25 @@ def fit(self, X, y, num_restarts=10, max_iters=2000, parallel=False):
self : returns an instance of self.
-----
"""
-        self._model = GPRegression(X,y, self._kernel, noise_var=self._noise_var)
#Standardize inputs and outputs
if isinstance(X, DataSet):
X_std, self.input_mean, self.input_std = X.standardize(return_mean=True, return_std=True)
elif isinstance(X, np.ndarray):
self.input_mean = np.mean(X,axis=0)
self.input_std = np.std(X, axis=0)
X_std = (X-self.input_mean)/self.input_std
X_std[abs(X_std) < 1e-5] = 0.0

if isinstance(y, DataSet):
y_std, self.output_mean, self.output_std = y.standardize(return_mean=True, return_std=True)
elif isinstance(y, np.ndarray):
self.output_mean = np.mean(y,axis=0)
self.output_std = np.std(y, axis=0)
y_std = (y-self.output_mean)/self.output_std
y_std[abs(y_std) < 1e-5] = 0.0

#Initialize and fit model
self._model = GPRegression(X_std,y_std, self._kernel, noise_var=self._noise_var)
if self._optimizer:
self._model.optimize_restarts(num_restarts = num_restarts,
verbose=False,
@@ -120,17 +164,24 @@ def predict(self, X,
raise RuntimeError(
"Not returning standard deviation of predictions when "
"returning full covariance.")

-        m, v = self._model.predict(X)
-
-        if return_cov:
-            result = m, v
-        elif return_std:
-            result = m, self._model.Kdiag(X)
-        else:
-            result = m
-
-        return result
if isinstance(X, np.ndarray):
X_std = (X-self.input_mean)/self.input_std
X_std[abs(X_std) < 1e-5] = 0.0
elif isinstance(X, DataSet):
X_std = X.standardize(mean=self.input_mean, std=self.input_std)

m_std, v_std = self._model.predict(X_std)
m = m_std*self.output_std + self.output_mean

# if return_cov:
# result = m, v
# elif return_std:
# result = m, self._model.Kdiag(X)
# else:
# result = m

return m
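
A sketch of the round trip this gives: fit on raw arrays, predict back on the original scale (the data is synthetic and illustrative):

import numpy as np
from summit.models import GPyModel  # assumed import path

X = np.random.uniform(0, 100, size=(20, 2))
y = (X[:, [0]] - X[:, [1]]) / 50.0
model = GPyModel(input_dim=2)
model.fit(X, y)                # inputs and outputs standardized internally
y_pred = model.predict(X[:5])  # mean predictions, de-standardized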

class AnalyticalModel(Model):
''' An analytical model instead of statistical model