diff --git a/summit/acquisition.py b/summit/acquisition.py
index dd950d36..ba3a23c0 100644
--- a/summit/acquisition.py
+++ b/summit/acquisition.py
@@ -101,7 +101,7 @@ def select_max(self, samples, num_evals=1):
for i in range(num_evals):
masked_samples = samples[mask, :]
- Yfront, _ = pareto_efficient(Ynew)
+ Yfront, _ = pareto_efficient(Ynew, maximize=True)
        if len(Yfront) == 0:
raise ValueError('Pareto front length too short')
@@ -112,7 +112,7 @@ def select_max(self, samples, num_evals=1):
for sample in masked_samples:
sample = sample.reshape(1,n)
A = np.append(Ynew, sample, axis=0)
- Afront, _ = pareto_efficient(A)
+ Afront, _ = pareto_efficient(A, maximize=True)
hv = HvI.hypervolume(-Afront, [0,0])
hv_improvement.append(hv-hvY)
@@ -147,8 +147,6 @@ def hypervolume(pointset, ref):
"""Compute the absolute hypervolume of a *pointset* according to the
reference point *ref*.
"""
- warnings.warn("Falling back to the python version of hypervolume "
- "module. Expect this to be very slow.", RuntimeWarning)
hv = _HyperVolume(ref)
return hv.compute(pointset)
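With the maximize flag passed explicitly, the front-building and scoring steps read as follows. A minimal sketch using this hunk's own convention (build a maximization front, then negate it before the minimization-based hypervolume routine); the sample values are hypothetical, and `pareto_efficient` is the helper added in `summit/utils.py` at the bottom of this diff:

```python
import numpy as np
from summit.acquisition import HvI
from summit.utils import pareto_efficient

# Hypothetical two-objective samples; the first two points form the front.
Y = np.array([[1.0, 2.0], [2.0, 1.0], [0.5, 0.5]])

Yfront, indices = pareto_efficient(Y, maximize=True)  # keeps [1, 2] and [2, 1]
hv = HvI.hypervolume(-Yfront, [0, 0])                 # 3.0 for this front
```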
diff --git a/summit/data/dataset.py b/summit/data/dataset.py
index 1cb9cd3c..200b6b6b 100644
--- a/summit/data/dataset.py
+++ b/summit/data/dataset.py
@@ -14,6 +14,24 @@ class DataSet(pd.core.frame.DataFrame):
----
Based on https://notes.mikejarrett.ca/storing-metadata-in-pandas-dataframes/
"""
+ def __init__(self, data=None, index=None, columns=None, metadata_columns=[], units=None, dtype=None, copy=False):
+ if isinstance(columns, pd.MultiIndex):
+ pass
+ elif columns is not None:
+ column_names = columns
+ if metadata_columns:
+ types = ['METADATA' if x in metadata_columns else 'DATA' for x in column_names]
+ else:
+ types = ['DATA' for _ in range(len(column_names))]
+ arrays = [column_names, types]
+ levels = ['NAME', 'TYPE']
+ if units:
+ arrays.append(units)
+ levels.append('UNITS')
+            tuples = list(zip(*arrays))
+ columns = pd.MultiIndex.from_tuples(tuples, names=levels)
+ pd.core.frame.DataFrame.__init__(self, data=data, index=index, columns=columns, dtype=dtype, copy=copy)
+
@staticmethod
def from_df(df: pd.DataFrame, metadata_columns: List=[],
units: List = []):
@@ -83,7 +101,8 @@ def zero_to_one(self, small_tol=1.0e-5) -> np.ndarray:
scaled[abs(scaled) < small_tol] = 0.0
return scaled
- def standardize(self, small_tol=1.0e-5) -> np.ndarray:
+ def standardize(self, small_tol=1.0e-5,
+ return_mean=False, return_std=False, **kwargs) -> np.ndarray:
"""Standardize data columns by removing the mean and scaling to unit variance
The standard score of each data column is calculated as:
@@ -97,6 +116,16 @@ def standardize(self, small_tol=1.0e-5) -> np.ndarray:
The minimum value of any value in the final scaled array.
This is used to prevent very small values that will cause
            issues in later calculations. Defaults to 1e-5.
+ return_mean: bool, optional
+ Return an array with the mean of each column in the DataSet
+ return_std: bool, optional
+            Return an array with the standard deviation of each column
+ in the DataSet
+ mean: array, optional
+ Pass a precalculated array of means for the columns
+ std: array, optional
+ Pass a precalculated array of standard deviations
+ for the columns
Returns
-------
@@ -110,11 +139,21 @@ def standardize(self, small_tol=1.0e-5) -> np.ndarray:
"""
values = self.data_to_numpy()
values = values.astype(np.float64)
- mean = np.mean(values, axis=0)
- sigma = np.std(values, axis=0)
+
+ mean = kwargs.get('mean',
+ np.mean(values, axis=0))
+ sigma = kwargs.get('std',
+ np.std(values, axis=0))
standard = (values-mean)/sigma
standard[abs(standard) < small_tol] = 0.0
- return standard
+ if return_mean and return_std:
+ return standard, mean, sigma
+ elif return_mean:
+ return standard, mean
+ elif return_std:
+ return standard, sigma
+ else:
+ return standard
@property
def _constructor(self):
@@ -182,6 +221,4 @@ def insert(self, loc, column, value, type='DATA', units=None, allow_duplicates=F
self.columns[loc][0] = column
self.columns[loc][1] = type
self.columns[loc][2] = units
-
-class ResultSet(DataSet):
- data_column_types = ['input', 'output']
+
\ No newline at end of file
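The new constructor and the `standardize` keywords combine into a fit/transform pattern. A minimal sketch with hypothetical column names and values, assuming `data_to_numpy()` skips METADATA columns as elsewhere in this file:

```python
import numpy as np
from summit.data import DataSet

# Metadata columns are tagged METADATA and excluded from numeric work.
ds = DataSet([[1.0, 2.0, 'a'], [3.0, 4.0, 'b'], [5.0, 6.0, 'c']],
             columns=['x1', 'x2', 'label'],
             metadata_columns=['label'])

# Standardize the data columns and keep the statistics for reuse.
X_std, mean, std = ds.standardize(return_mean=True, return_std=True)

# Apply the same scaling to new data, e.g. at prediction time.
new = DataSet([[2.0, 3.0, 'd']], columns=['x1', 'x2', 'label'],
              metadata_columns=['label'])
X_new_std = new.standardize(mean=mean, std=std)
```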
diff --git a/summit/domain.py b/summit/domain.py
index d1c123d8..4848ed70 100644
--- a/summit/domain.py
+++ b/summit/domain.py
@@ -176,6 +176,11 @@ def from_dict(variable_dict):
description=variable_dict['description'],
bounds=variable_dict['bounds'],
is_objective=variable_dict['is_objective'])
+
+    def __add__(self, other):
+        if isinstance(other, self.__class__):
+            return [self, other]
+        return NotImplemented
+
class DiscreteVariable(Variable):
"""Representation of a discrete variable
@@ -335,6 +340,24 @@ def from_dict(variable_dict):
def _html_table_rows(self):
return self._make_html_table_rows(f"{self.num_examples} examples of {self.num_descriptors} descriptors")
+
+class Constraint:
+ def __init__(self, expression):
+ self._expression = expression
+
+ @property
+ def expression(self):
+ return self._expression
+
+ def _html_table_rows(self):
+ columns = []
+ columns.append("") #name column
+ columns.append("constraint") #type column
+ columns.append(self.expression) #description columns
+ columns.append("") #value column
+        return ''.join([f"<td>{column}</td>" for column in columns])
+
+
class Domain:
"""Representation of the optimization domain
@@ -353,8 +376,9 @@ class Domain:
>>> domain += ContinuousVariable('temperature', 'reaction temperature', [1, 100])
"""
- def __init__(self, variables:Optional[List[Type[Variable]]]=[]):
+ def __init__(self, variables=[], constraints=[]):
self._variables = variables
+ self._constraints = constraints
#Check that all the output variables continuous
self.raise_noncontinuous_outputs()
@@ -363,6 +387,20 @@ def variables(self):
"""[List[Type[Variable]]]: List of variables in the domain"""
return self._variables
+ @property
+ def constraints(self):
+ return self._constraints
+
+ # def __getitem__(self, key):
+ # '''For accessing variables like a dictionary'''
+ # for v in self.variables:
+ # if v.name == key:
+ # return v
+ # raise KeyError(f'No variable {key} found in the domain.')
+
+ # def _ipython_key_completions_(self):
+ # return [v.name for v in self.variables]
+
@property
def input_variables(self):
input_variables = []
@@ -482,10 +520,16 @@ def from_dict(domain_dict):
return Domain(variables)
- def __add__(self, var):
- if var.is_objective and var.variable_type != 'continuous':
- DomainError("Output variables must be continuous")
- return Domain(self._variables + [var])
+ def __add__(self, obj):
+        #TODO: make this work with adding lists of variables or constraints
+ if isinstance(obj, Variable):
+ if obj.is_objective and obj.variable_type != 'continuous':
+ raise DomainError("Output variables must be continuous")
+ return Domain(variables=self._variables + [obj], constraints=self.constraints)
+ elif isinstance(obj, Constraint):
+            return Domain(variables=self.variables, constraints=self.constraints + [obj])
+ else:
+ raise RuntimeError('Not a supported domain object.')
def _repr_html_(self):
"""Build html string for table display in jupyter notebooks.
@@ -510,7 +554,9 @@ def _repr_html_(self):
return ''.join(html)
def _html_table_rows(self):
- return ''.join(map(lambda l: l._html_table_rows(), self._variables))
+ variables = ''.join([v._html_table_rows() for v in self.variables])
+ constraints = ''.join([c._html_table_rows() for c in self.constraints])
+ return variables + constraints
class DomainError(Exception):
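With the overloaded `__add__`, variables and constraints now share one construction idiom. A short sketch (the bounds and constraint expression are illustrative; per the NSGAII changes below, a point is feasible when the expression evaluates to <= 0):

```python
from summit.domain import Domain, ContinuousVariable, Constraint

domain = Domain()
domain += ContinuousVariable('temperature', 'reaction temperature', [30, 100])
domain += ContinuousVariable('flowrate', 'total flowrate', [0.1, 1.0])

# Constraint expressions are evaluated with DataFrame.eval against the
# input columns; feasible points satisfy expression <= 0.
domain += Constraint('temperature + 100 * flowrate - 150')

print(len(domain.variables), len(domain.constraints))  # 2 1
```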
diff --git a/summit/models.py b/summit/models.py
index 474b3211..3bd76ba0 100644
--- a/summit/models.py
+++ b/summit/models.py
@@ -1,12 +1,12 @@
-from summit.initial_design.latin_designer import lhs
+from summit.data import DataSet
from GPy.models import GPRegression
from GPy.kern import Matern52
import numpy as np
-from numpy import matlib
import scipy
from abc import ABC, abstractmethod
+from sklearn.base import BaseEstimator, RegressorMixin
class Model(ABC):
@@ -23,7 +23,32 @@ def fit(self, X, Y):
def predict(self, X):
pass
-class GPyModel(Model):
+class ModelGroup:
+ def __init__(self, models: dict):
+ self._models = models
+
+ @property
+ def models(self):
+ return self._models
+
+ def fit(self, X, y, **kwargs):
+ for column_name, model in self.models.items():
+ model.fit(X, y[[column_name]])
+
+ def predict(self, X, **kwargs):
+ """
+ Note
+ -----
+        This makes the assumption that each model's predict method
+        returns an n_samples x 1 array.
+ """
+ result = [model.predict(X)[:, 0] for model in self.models.values()]
+ return np.array(result).T
+
+ def __getitem__(self, key):
+ return self.models[key]
+
+class GPyModel(BaseEstimator, RegressorMixin):
''' A Gaussian Process Regression model from GPy
This is implemented as an alternative to the sklearn
@@ -54,7 +79,8 @@ def __init__(self, kernel=None, input_dim=None,noise_var=1.0, optimizer=None):
else:
if not input_dim:
raise ValueError('input_dim must be specified if no kernel is specified.')
- self._kernel = Matern52(input_dim = input_dim, ARD=True)
+ self.input_dim = input_dim
+ self._kernel = Matern52(input_dim = self.input_dim, ARD=True)
self._noise_var = noise_var
self._optimizer = optimizer
self._model = None
@@ -63,10 +89,10 @@ def fit(self, X, y, num_restarts=10, max_iters=2000, parallel=False):
"""Fit Gaussian process regression model.
Parameters
----------
- X : array-like, shape = (n_samples, n_features)
- Training data
- y : array-like, shape = (n_samples, [n_output_dims])
- Target values
+ X : DataSet
+ The data columns will be used as inputs for fitting the model
+        y : DataSet
+ The data columns will be used as outputs for fitting the model
num_restarts : int, optional (default=10)
The number of random restarts of the optimizer.
max_iters : int, optional (default=2000)
@@ -79,7 +105,25 @@ def fit(self, X, y, num_restarts=10, max_iters=2000, parallel=False):
self : returns an instance of self.
-----
"""
- self._model = GPRegression(X,y, self._kernel, noise_var=self._noise_var)
+ #Standardize inputs and outputs
+ if isinstance(X, DataSet):
+ X_std, self.input_mean, self.input_std = X.standardize(return_mean=True, return_std=True)
+ elif isinstance(X, np.ndarray):
+ self.input_mean = np.mean(X,axis=0)
+ self.input_std = np.std(X, axis=0)
+ X_std = (X-self.input_mean)/self.input_std
+ X_std[abs(X_std) < 1e-5] = 0.0
+
+ if isinstance(y, DataSet):
+ y_std, self.output_mean, self.output_std = y.standardize(return_mean=True, return_std=True)
+ elif isinstance(y, np.ndarray):
+ self.output_mean = np.mean(y,axis=0)
+ self.output_std = np.std(y, axis=0)
+ y_std = (y-self.output_mean)/self.output_std
+ y_std[abs(y_std) < 1e-5] = 0.0
+
+ #Initialize and fit model
+ self._model = GPRegression(X_std,y_std, self._kernel, noise_var=self._noise_var)
if self._optimizer:
self._model.optimize_restarts(num_restarts = num_restarts,
verbose=False,
@@ -120,17 +164,24 @@ def predict(self, X,
raise RuntimeError(
"Not returning standard deviation of predictions when "
"returning full covariance.")
-
- m, v = self._model.predict(X)
-
- if return_cov:
- result = m, v
- elif return_std:
- result = m, self._model.Kdiag(X)
- else:
- result = m
- return result
+ if isinstance(X, np.ndarray):
+ X_std = (X-self.input_mean)/self.input_std
+ X_std[abs(X_std) < 1e-5] = 0.0
+ elif isinstance(X, DataSet):
+ X_std = X.standardize(mean=self.input_mean, std=self.input_std)
+
+ m_std, v_std = self._model.predict(X_std)
+ m = m_std*self.output_std + self.output_mean
+
+ # if return_cov:
+ # result = m, v
+ # elif return_std:
+ # result = m, self._model.Kdiag(X)
+ # else:
+ # result = m
+
+ return m
class AnalyticalModel(Model):
''' An analytical model instead of statistical model
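A sketch of how `ModelGroup` and the standardizing `GPyModel` fit together; the data and column names are placeholders, and GPy must be installed:

```python
import numpy as np
from summit.data import DataSet
from summit.models import GPyModel, ModelGroup

# Hypothetical training data: two inputs and two objectives.
X = DataSet(np.random.rand(20, 2), columns=['x1', 'x2'])
y = DataSet(np.random.rand(20, 2), columns=['yield', 'cost'])

# One single-output GP per objective behind a common fit/predict interface.
models = ModelGroup({'yield': GPyModel(input_dim=2),
                     'cost': GPyModel(input_dim=2)})
models.fit(X, y)

# predict stacks each model's n_samples x 1 output into an n_samples x 2 array.
preds = models.predict(DataSet(np.random.rand(5, 2), columns=['x1', 'x2']))
```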
diff --git a/summit/optimizers.py b/summit/optimizers.py
index 0defc248..de0c3d28 100644
--- a/summit/optimizers.py
+++ b/summit/optimizers.py
@@ -7,7 +7,7 @@
from typing import List
from summit.domain import Domain, DomainError
from summit.initial_design import RandomDesigner
-# from .objective import ObjectiveWrapper
+from summit.data import DataSet
from abc import ABC, abstractmethod
import numpy as np
@@ -62,12 +62,17 @@ def is_multiobjective(self):
class NSGAII(Optimizer):
def __init__(self, domain: Domain):
Optimizer.__init__(self, domain)
+ #Set up platypus problem
self.problem = pp.Problem(nvars=self.domain.num_variables(),
- nobjs=len(self.domain.output_variables))
+ nobjs=len(self.domain.output_variables),
+ nconstrs=len(self.domain.constraints))
+ #Set maximization or minimization for each objective
+ j = 0
for i, v in enumerate(self.domain.variables):
if v.is_objective:
- continue
- if v.variable_type == "continuous":
+ direction = self.problem.MAXIMIZE if v.maximize else self.problem.MINIMIZE
+                self.problem.directions[j] = direction
+                j += 1
+ elif v.variable_type == "continuous":
self.problem.types[i] = pp.Real(v.lower_bound, v.upper_bound)
elif v.variable_type == "discrete":
#Select a subset of one of the available options
@@ -78,23 +83,36 @@ def __init__(self, domain: Domain):
else:
raise DomainError(f'{v.variable_type} is not a valid variable type.')
- def _optimize(self, objective, **kwargs):
+ #Set up constraints
+ self.problem.constraints[:] = "<=0"
+
+
+ def _optimize(self, models, **kwargs):
+ input_columns = [v.name for v in self.domain.variables if not v.is_objective]
+ output_columns = [v.name for v in self.domain.variables if v.is_objective]
def problem_wrapper(X):
- np.atleast_2d(X)
- result = objective(X)
- return result.tolist()
+ X = DataSet(np.atleast_2d(X),
+ columns=input_columns)
+ result = models.predict(X)
+
+ constraint_res = [X.eval(c.expression, resolvers=[X])
+ for c in self.domain.constraints]
+ constraint_res = [c.tolist()[0] for c in constraint_res]
+ return result[0, :].tolist(), constraint_res
+
+ #Run optimization
self.problem.function = problem_wrapper
- self.problem.directions[:] = pp.Problem.MAXIMIZE
algorithm = pp.NSGAII(self.problem)
iterations = kwargs.get('iterations', 10000)
algorithm.run(iterations)
+
x = [[s.variables[i] for i in range(self.domain.num_variables())]
for s in algorithm.result]
- x = np.array(x)
+ x = DataSet(x, columns = input_columns)
y =[[s.objectives[i] for i in range(len(self.domain.output_variables))]
for s in algorithm.result]
- y = np.array(y)
+ y = DataSet(y, columns=output_columns)
return OptimizeResult(x=x, fun=y, success=True)
class MCOptimizer(Optimizer):
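Assuming `ContinuousVariable` accepts `is_objective` and `maximize` keyword arguments (as the direction-setting loop implies) and that `num_variables()` counts only inputs, a constrained `NSGAII` could be set up as below; the names and bounds are illustrative:

```python
from summit.domain import Domain, ContinuousVariable, Constraint
from summit.optimizers import NSGAII

domain = Domain()
domain += ContinuousVariable('x1', 'first input', [0, 1])
domain += ContinuousVariable('x2', 'second input', [0, 1])
domain += ContinuousVariable('y1', 'first objective', [0, 1],
                             is_objective=True, maximize=True)
domain += ContinuousVariable('y2', 'second objective', [0, 1],
                             is_objective=True, maximize=True)
domain += Constraint('x1 + x2 - 1.5')  # feasible when <= 0

optimizer = NSGAII(domain)
# `models` would be a fitted ModelGroup (see summit/models.py above):
# result = optimizer.optimize(models, iterations=1000)
```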
diff --git a/summit/strategies.py b/summit/strategies.py
index 035e7f81..dd112cda 100644
--- a/summit/strategies.py
+++ b/summit/strategies.py
@@ -1,13 +1,13 @@
from summit.data import DataSet
+from summit.models import ModelGroup
from summit.domain import Domain, DomainError
from summit.acquisition import HvI
from summit.optimizers import NSGAII
+from summit.utils import pareto_efficient
import GPy
import numpy as np
-import warnings
-import logging
class Strategy:
def __init__(self, domain:Domain):
@@ -70,10 +70,6 @@ class TSEMO2(Strategy):
maximize: bool, optional
Whether optimization should be treated as a maximization or minimization problem.
Defaults to maximization.
- acquisition: summit.acquistion.Acquisition, optional
- The acquisition function used to select the next set of points from the pareto front
- (see optimizer). Defaults to hypervolume improvement with the reference point set
- as the upper bounds of the outputs in the specified domain and random rate 0.0
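+    reference: array-like, optional
+        The reference point used when calculating hypervolume improvement.
+        Defaults to [0, 0].
+    random_rate: float, optional
+        The fraction of each batch selected at random instead of by maximum
+        hypervolume improvement. Defaults to 0.0.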
optimizer: summit.optimizers.Optimizer, optional
The internal optimizer for estimating the pareto front prior to maximization
of the acquisition function. By default, NSGAII will be used if there is a combination
@@ -105,81 +101,122 @@ class TSEMO2(Strategy):
normalize_inputs=True)
'''
- def __init__(self, domain, models, acquisition=None, optimizer=None):
+ def __init__(self, domain, models, optimizer=None, **kwargs):
Strategy.__init__(self, domain)
- self.models = models
- if acquisition is None:
- reference = [v.upper_bound for v in self.domain.output_variables]
- self.acquisition = HvI(reference, random_rate=0.0)
- else:
- self.acquisition = acquisition
+
+ if isinstance(models, ModelGroup):
+ self.models = models
+ elif isinstance(models, dict):
+ self.models = ModelGroup(models)
+ else:
+ raise TypeError('models must be a ModelGroup or a dictionary of models.')
+
if not optimizer:
self.optimizer = NSGAII(self.domain)
else:
self.optimizer = optimizer
- def generate_experiments(self, previous_results: DataSet, num_experiments,
- normalize_inputs=False, no_repeats=True, maximize=True):
- #Get inputs and outputs + standardize if needed
+ self._reference = kwargs.get('reference', [0,0])
+ self._random_rate = kwargs.get('random_rate', 0.0)
+
+ def generate_experiments(self, previous_results: DataSet, num_experiments):
+ #Get inputs and outputs
inputs, outputs = self.get_inputs_outputs(previous_results)
- if normalize_inputs:
- self.x = inputs.standardize() #TODO: get this to work for discrete variables
+
+ #Fit models to new data
+ self.models.fit(inputs, outputs)
+
+ internal_res = self.optimizer.optimize(self.models)
+
+ hv_imp, indices = self.select_max_hvi(outputs, internal_res.fun, num_experiments)
+ result = internal_res.x.join(internal_res.fun)
+
+ return result.iloc[indices, :]
+
+ def select_max_hvi(self, y, samples, num_evals=1):
+        ''' Returns the point(s) that maximize hypervolume improvement
+
+ Parameters
+ ----------
+ samples: np.ndarray
+        y: DataSet
+            The previous results, used to compute the current hypervolume
+        samples: DataSet
+            The samples on which hypervolume improvement is calculated
+ The number of points to return (with top hypervolume improvement)
+
+ Returns
+ -------
+ hv_imp, index
+ Returns a tuple with lists of the best hypervolume improvement
+ and the indices of the corresponding points in samples
+
+ '''
+ #Get the reference point, r
+ # r = self._reference + 0.01*(np.max(samples, axis=0)-np.min(samples, axis=0))
+ r = self._reference
+
+ #Set up maximization and minimization
+ for v in self.domain.variables:
+ if v.is_objective and v.maximize:
+ y[v.name] = -1.0 * y[v.name]
+ samples[v.name] = -1.0 * samples[v.name]
+
+ Ynew = y.data_to_numpy()
+ samples = samples.data_to_numpy()
+ index = []
+ n = samples.shape[1]
+ mask = np.ones(samples.shape[0], dtype=bool)
+
+ #Set up random selection
+        if not 0.0 <= self._random_rate <= 1.0:
+            raise ValueError('Random rate must be between 0 and 1.')
+
+ if self._random_rate>0:
+ num_random = round(self._random_rate*num_evals)
+ random_selects = np.random.randint(0, num_evals, size=num_random)
else:
- self.x = inputs.data_to_numpy()
-
- self.y = outputs.data_to_numpy()
-
- #Update surrogate models with new data
- for i, model in enumerate(self.models):
- Y = self.y[:, i]
- Y = np.atleast_2d(Y).T
- logging.debug(f'Fitting model {i+1}')
- model.fit(self.x, Y, num_restarts=3, max_iters=100,parallel=True)
-
- logging.debug("Running internal optimization")
-
- #If the domain consists of one descriptors variables, evaluate every candidate
- check_descriptors = [True if v.variable_type =='descriptors' else False
- for v in self.domain.input_variables]
- if all(check_descriptors) and len(check_descriptors)==1:
- descriptor_arr = self.domain.variables[0].ds.data_to_numpy()
- if no_repeats:
- points = self._mask_previous_points(self.x, descriptor_arr)
+ random_selects = np.array([])
+
+ for i in range(num_evals):
+ masked_samples = samples[mask, :]
+ Yfront, _ = pareto_efficient(Ynew, maximize=True)
+ if len(Yfront) == 0:
+ raise ValueError('Pareto front length too short')
+
+ hv_improvement = []
+            hvY = HvI.hypervolume(Yfront, r)
+ #Determine hypervolume improvement by including
+            #each point from samples (masking previously selected points)
+ for sample in masked_samples:
+ sample = sample.reshape(1,n)
+ A = np.append(Ynew, sample, axis=0)
+ Afront, _ = pareto_efficient(A, maximize=True)
+                hv = HvI.hypervolume(Afront, r)
+ hv_improvement.append(hv-hvY)
+
+ hvY0 = hvY if i==0 else hvY0
+
+ if i in random_selects:
+ masked_index = np.random.randint(0, masked_samples.shape[0])
else:
- points = self.x
- predictions = np.zeros([points.shape[0], len(self.models)])
- for i, model in enumerate(self.models):
- predictions[:, i] = model.predict(points)[0][:,0]
+ #Choose the point that maximizes hypervolume improvement
+ masked_index = hv_improvement.index(max(hv_improvement))
- self.acquisition.data = self.y
- self.acq_vals, indices = self.acquisition.select_max(predictions, num_evals=num_experiments)
- indices = [np.where((descriptor_arr == points[ix]).all(axis=1))[0][0]
- for ix in indices]
- result = self.domain.variables[0].ds.iloc[indices, :]
- #Else use modified nsgaII
+ samples_index = np.where((samples == masked_samples[masked_index, :]).all(axis=1))[0][0]
+ new_point = samples[samples_index, :].reshape(1, n)
+ Ynew = np.append(Ynew, new_point, axis=0)
+ mask[samples_index] = False
+ index.append(samples_index)
+
+        if len(hv_improvement) == 0:
+ hv_imp = 0
+ elif len(index) == 0:
+ index = []
+ hv_imp = 0
else:
- def problem(x):
- x = np.array(x)
- x = np.atleast_2d(x)
- y = [model.predict(x)
- for model in self.models]
- y = np.array([yo[0,0] for yo in y])
- return y
- int_result = self.optimizer.optimize(problem)
- self.acquisition.data = self.y
- self.acq_vals, indices = self.acquisition.select_max(int_result.fun,
- num_evals=num_experiments)
- result = int_result.x[indices, :]
- return result
-
- def _mask_previous_points(self, x, descriptor_arr):
- descriptor_mask = np.ones(descriptor_arr.shape[0], dtype=bool)
- for point in x:
- try:
- index = np.where(descriptor_arr==point)[0][0]
- descriptor_mask[index] = False
- except IndexError:
- continue
- return descriptor_arr[descriptor_mask, :]
+ #Total hypervolume improvement
+ #Includes all points added to batch (hvY + last hv_improvement)
+ #Subtracts hypervolume without any points added (hvY0)
+            hv_imp = hv_improvement[masked_index] + hvY - hvY0
+ return hv_imp, index
\ No newline at end of file
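End to end, the reworked `TSEMO2` consumes and returns `DataSet`s. A hedged usage sketch with illustrative names, bounds, and reference point, again assuming `ContinuousVariable` accepts `is_objective` and `maximize` keywords:

```python
import numpy as np
from summit.data import DataSet
from summit.domain import Domain, ContinuousVariable
from summit.models import GPyModel
from summit.strategies import TSEMO2

domain = Domain()
domain += ContinuousVariable('x1', 'input 1', [0, 1])
domain += ContinuousVariable('x2', 'input 2', [0, 1])
domain += ContinuousVariable('y1', 'objective 1', [0, 1],
                             is_objective=True, maximize=True)
domain += ContinuousVariable('y2', 'objective 2', [0, 1],
                             is_objective=True, maximize=True)

strategy = TSEMO2(domain,
                  {'y1': GPyModel(input_dim=2), 'y2': GPyModel(input_dim=2)},
                  reference=[0, 0], random_rate=0.0)

# Previous experiments: inputs and measured objectives in one DataSet.
previous = DataSet(np.random.rand(10, 4), columns=['x1', 'x2', 'y1', 'y2'])
next_experiments = strategy.generate_experiments(previous, num_experiments=4)
```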
diff --git a/summit/utils.py b/summit/utils.py
new file mode 100644
index 00000000..9c31a9d9
--- /dev/null
+++ b/summit/utils.py
@@ -0,0 +1,24 @@
+import numpy as np
+
+def pareto_efficient(costs, maximize=True):
+    """Find the pareto-efficient points
+
+    :param costs: An (n_points, n_costs) array
+    :param maximize: If True (default), larger values are better; else smaller
+    :return: A tuple of the pareto-efficient points and their indices in costs
+    """
+ original_costs = costs
+ is_efficient = np.arange(costs.shape[0])
+ n_points = costs.shape[0]
+ next_point_index = 0 # Next index in the is_efficient array to search for
+    while next_point_index < len(costs):
+        if maximize:
+            nondominated_point_mask = np.any(costs > costs[next_point_index], axis=1)
+        else:
+            nondominated_point_mask = np.any(costs < costs[next_point_index], axis=1)
+        nondominated_point_mask[next_point_index] = True
+        #Remove dominated points
+        is_efficient = is_efficient[nondominated_point_mask]
+        costs = costs[nondominated_point_mask]
+        next_point_index = np.sum(nondominated_point_mask[:next_point_index]) + 1
+    return original_costs[is_efficient], is_efficient
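A quick check of the helper on a toy array (values arbitrary):

```python
import numpy as np
from summit.utils import pareto_efficient

points = np.array([[1.0, 4.0], [2.0, 3.0], [3.0, 1.0], [1.5, 2.0]])

# Maximization: [1.5, 2.0] is dominated by [2.0, 3.0] and is dropped.
front, indices = pareto_efficient(points, maximize=True)
print(front)    # [[1. 4.] [2. 3.] [3. 1.]]
print(indices)  # [0 1 2]
```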