diff --git a/summit/acquisition.py b/summit/acquisition.py
index dd950d36..ba3a23c0 100644
--- a/summit/acquisition.py
+++ b/summit/acquisition.py
@@ -101,7 +101,7 @@ def select_max(self, samples, num_evals=1):
 
         for i in range(num_evals):
             masked_samples = samples[mask, :]
-            Yfront, _ = pareto_efficient(Ynew)
+            Yfront, _ = pareto_efficient(Ynew, maximize=True)
             if len(Yfront) == 0:
                 raise ValueError('Pareto front length too short')
 
@@ -112,7 +112,7 @@ def select_max(self, samples, num_evals=1):
             for sample in masked_samples:
                 sample = sample.reshape(1,n)
                 A = np.append(Ynew, sample, axis=0)
-                Afront, _ = pareto_efficient(A)
+                Afront, _ = pareto_efficient(A, maximize=True)
                 hv = HvI.hypervolume(-Afront, [0,0])
                 hv_improvement.append(hv-hvY)
 
@@ -147,8 +147,6 @@ def hypervolume(pointset, ref):
     """Compute the absolute hypervolume of a *pointset* according to the
     reference point *ref*.
     """
-    warnings.warn("Falling back to the python version of hypervolume "
-                  "module. Expect this to be very slow.", RuntimeWarning)
     hv = _HyperVolume(ref)
     return hv.compute(pointset)
 
diff --git a/summit/data/dataset.py b/summit/data/dataset.py
index 1cb9cd3c..200b6b6b 100644
--- a/summit/data/dataset.py
+++ b/summit/data/dataset.py
@@ -14,6 +14,24 @@ class DataSet(pd.core.frame.DataFrame):
     ----
     Based on https://notes.mikejarrett.ca/storing-metadata-in-pandas-dataframes/
     """
+    def __init__(self, data=None, index=None, columns=None, metadata_columns=[], units=None, dtype=None, copy=False):
+        if isinstance(columns, pd.MultiIndex):
+            pass
+        elif columns is not None:
+            column_names = columns
+            if metadata_columns:
+                types = ['METADATA' if x in metadata_columns else 'DATA' for x in column_names]
+            else:
+                types = ['DATA' for _ in range(len(column_names))]
+            arrays = [column_names, types]
+            levels = ['NAME', 'TYPE']
+            if units:
+                arrays.append(units)
+                levels.append('UNITS')
+            tuples = list(zip(*arrays))
+            columns = pd.MultiIndex.from_tuples(tuples, names=levels)
+        pd.core.frame.DataFrame.__init__(self, data=data, index=index, columns=columns, dtype=dtype, copy=copy)
+
     @staticmethod
     def from_df(df: pd.DataFrame, metadata_columns: List=[], units: List = []):
 
@@ -83,7 +101,8 @@ def zero_to_one(self, small_tol=1.0e-5) -> np.ndarray:
         scaled[abs(scaled) < small_tol] = 0.0
         return scaled
 
-    def standardize(self, small_tol=1.0e-5) -> np.ndarray:
+    def standardize(self, small_tol=1.0e-5,
+                    return_mean=False, return_std=False, **kwargs) -> np.ndarray:
         """Standardize data columns by removing the mean and scaling to unit variance
 
         The standard score of each data column is calculated as:
@@ -97,6 +116,16 @@ def standardize(self, small_tol=1.0e-5) -> np.ndarray:
            The minimum value of any value in the final scaled array.
            This is used to prevent very small values that will cause issues
            in later calculations. Defaults to 1e-5.
+        return_mean: bool, optional
+            Return an array with the mean of each column in the DataSet
+        return_std: bool, optional
+            Return an array with the standard deviation of each column
+            in the DataSet
+        mean: array, optional
+            Pass a precalculated array of means for the columns
+        std: array, optional
+            Pass a precalculated array of standard deviations
+            for the columns
 
        Returns
        -------
@@ -110,11 +139,21 @@ def standardize(self, small_tol=1.0e-5) -> np.ndarray:
        """
        values = self.data_to_numpy()
        values = values.astype(np.float64)
-       mean = np.mean(values, axis=0)
-       sigma = np.std(values, axis=0)
+
+       mean = kwargs.get('mean',
+                         np.mean(values, axis=0))
+       sigma = kwargs.get('std',
+                          np.std(values, axis=0))
        standard = (values-mean)/sigma
        standard[abs(standard) < small_tol] = 0.0
-       return standard
+       if return_mean and return_std:
+           return standard, mean, sigma
+       elif return_mean:
+           return standard, mean
+       elif return_std:
+           return standard, sigma
+       else:
+           return standard
 
     @property
     def _constructor(self):
@@ -182,6 +221,4 @@ def insert(self, loc, column, value, type='DATA', units=None, allow_duplicates=False):
         self.columns[loc][0] = column
         self.columns[loc][1] = type
         self.columns[loc][2] = units
-
-class ResultSet(DataSet):
-    data_column_types = ['input', 'output']
+
\ No newline at end of file
diff --git a/summit/domain.py b/summit/domain.py
index d1c123d8..4848ed70 100644
--- a/summit/domain.py
+++ b/summit/domain.py
@@ -176,6 +176,12 @@ def from_dict(variable_dict):
                                   description=variable_dict['description'],
                                   bounds=variable_dict['bounds'],
                                   is_objective=variable_dict['is_objective'])
+
+    def __add__(self, other):
+        if isinstance(other, self.__class__):
+            return [self, other]
+        return NotImplemented
+
 
 class DiscreteVariable(Variable):
     """Representation of a discrete variable
 
@@ -335,6 +340,24 @@ def from_dict(variable_dict):
 
     def _html_table_rows(self):
         return self._make_html_table_rows(f"{self.num_examples} examples of {self.num_descriptors} descriptors")
+
+class Constraint:
+    def __init__(self, expression):
+        self._expression = expression
+
+    @property
+    def expression(self):
+        return self._expression
+
+    def _html_table_rows(self):
+        columns = []
+        columns.append("")  #name column
+        columns.append("constraint")  #type column
+        columns.append(self.expression)  #description column
+        columns.append("")  #value column
+        return ''.join([f"<td>{column}</td>" for column in columns])
+
+
 class Domain:
     """Representation of the optimization domain
 
@@ -353,8 +376,9 @@ class Domain:
 
     >>> domain += ContinuousVariable('temperature', 'reaction temperature', [1, 100])
     """
-    def __init__(self, variables:Optional[List[Type[Variable]]]=[]):
+    def __init__(self, variables=[], constraints=[]):
         self._variables = variables
+        self._constraints = constraints
         #Check that all the output variables are continuous
         self.raise_noncontinuous_outputs()
 
@@ -363,6 +387,20 @@ def variables(self):
         """[List[Type[Variable]]]: List of variables in the domain"""
         return self._variables
 
+    @property
+    def constraints(self):
+        return self._constraints
+
+    # def __getitem__(self, key):
+    #     '''For accessing variables like a dictionary'''
+    #     for v in self.variables:
+    #         if v.name == key:
+    #             return v
+    #     raise KeyError(f'No variable {key} found in the domain.')
+
+    # def _ipython_key_completions_(self):
+    #     return [v.name for v in self.variables]
+
     @property
     def input_variables(self):
         input_variables = []
@@ -482,10 +520,16 @@ def from_dict(domain_dict):
 
         return Domain(variables)
 
-    def __add__(self, var):
-        if var.is_objective and var.variable_type != 'continuous':
-            DomainError("Output variables must be continuous")
-        return Domain(self._variables + [var])
+    def __add__(self, obj):
+        #TODO: make this work with adding arrays of variables or constraints
+        if isinstance(obj, Variable):
+            if obj.is_objective and obj.variable_type != 'continuous':
+                raise DomainError("Output variables must be continuous")
+            return Domain(variables=self._variables + [obj], constraints=self.constraints)
+        elif isinstance(obj, Constraint):
+            return Domain(variables=self.variables, constraints=self.constraints + [obj])
+        else:
+            raise RuntimeError('Not a supported domain object.')
 
     def _repr_html_(self):
         """Build html string for table display in jupyter notebooks.
 
@@ -510,7 +554,9 @@ def _repr_html_(self):
         return ''.join(html)
 
     def _html_table_rows(self):
-        return ''.join(map(lambda l: l._html_table_rows(), self._variables))
+        variables = ''.join([v._html_table_rows() for v in self.variables])
+        constraints = ''.join([c._html_table_rows() for c in self.constraints])
+        return variables + constraints
 
 
 class DomainError(Exception):
diff --git a/summit/models.py b/summit/models.py
index 474b3211..3bd76ba0 100644
--- a/summit/models.py
+++ b/summit/models.py
@@ -1,12 +1,12 @@
-from summit.initial_design.latin_designer import lhs
+from summit.data import DataSet
 
 from GPy.models import GPRegression
 from GPy.kern import Matern52
 
 import numpy as np
-from numpy import matlib
 import scipy
 from abc import ABC, abstractmethod
+from sklearn.base import BaseEstimator, RegressorMixin
 
 class Model(ABC):
 
@@ -23,7 +23,32 @@ def fit(self, X, Y):
     def predict(self, X):
         pass
 
-class GPyModel(Model):
+class ModelGroup:
+    def __init__(self, models: dict):
+        self._models = models
+
+    @property
+    def models(self):
+        return self._models
+
+    def fit(self, X, y, **kwargs):
+        for column_name, model in self.models.items():
+            model.fit(X, y[[column_name]])
+
+    def predict(self, X, **kwargs):
+        """
+        Note
+        -----
+        This makes the assumption that each model returns an n_samples x 1 array
+        from the predict method.
+        """
+        result = [model.predict(X)[:, 0] for model in self.models.values()]
+        return np.array(result).T
+
+    def __getitem__(self, key):
+        return self.models[key]
+
+class GPyModel(BaseEstimator, RegressorMixin):
     ''' A Gaussian Process Regression model from GPy
 
     This is implemented as an alternative to the sklearn
@@ -54,7 +79,8 @@ def __init__(self, kernel=None, input_dim=None,noise_var=1.0, optimizer=None):
         else:
             if not input_dim:
                 raise ValueError('input_dim must be specified if no kernel is specified.')
-            self._kernel = Matern52(input_dim = input_dim, ARD=True)
+            self.input_dim = input_dim
+            self._kernel = Matern52(input_dim=self.input_dim, ARD=True)
         self._noise_var = noise_var
         self._optimizer = optimizer
         self._model = None
 
@@ -63,10 +89,10 @@ def fit(self, X, y, num_restarts=10, max_iters=2000, parallel=False):
         """Fit Gaussian process regression model.
 
         Parameters
        ----------
-        X : array-like, shape = (n_samples, n_features)
-            Training data
-        y : array-like, shape = (n_samples, [n_output_dims])
-            Target values
+        X : DataSet
+            The data columns will be used as inputs for fitting the model
+        y : DataSet
+            The data columns will be used as outputs for fitting the model
        num_restarts : int, optional (default=10)
            The number of random restarts of the optimizer.
        max_iters : int, optional (default=2000)
@@ -79,7 +105,25 @@ def fit(self, X, y, num_restarts=10, max_iters=2000, parallel=False):
        self : returns an instance of self.
        -----
        """
-        self._model = GPRegression(X,y, self._kernel, noise_var=self._noise_var)
+        #Standardize inputs and outputs
+        if isinstance(X, DataSet):
+            X_std, self.input_mean, self.input_std = X.standardize(return_mean=True, return_std=True)
+        elif isinstance(X, np.ndarray):
+            self.input_mean = np.mean(X, axis=0)
+            self.input_std = np.std(X, axis=0)
+            X_std = (X-self.input_mean)/self.input_std
+            X_std[abs(X_std) < 1e-5] = 0.0
+
+        if isinstance(y, DataSet):
+            y_std, self.output_mean, self.output_std = y.standardize(return_mean=True, return_std=True)
+        elif isinstance(y, np.ndarray):
+            self.output_mean = np.mean(y, axis=0)
+            self.output_std = np.std(y, axis=0)
+            y_std = (y-self.output_mean)/self.output_std
+            y_std[abs(y_std) < 1e-5] = 0.0
+
+        #Initialize and fit model
+        self._model = GPRegression(X_std, y_std, self._kernel, noise_var=self._noise_var)
         if self._optimizer:
             self._model.optimize_restarts(num_restarts=num_restarts,
                                           verbose=False,
@@ -120,17 +164,24 @@ def predict(self, X,
             raise RuntimeError(
                 "Not returning standard deviation of predictions when "
                 "returning full covariance.")
 
-        m, v = self._model.predict(X)
-
-        if return_cov:
-            result = m, v
-        elif return_std:
-            result = m, self._model.Kdiag(X)
-        else:
-            result = m
-
-        return result
+        if isinstance(X, np.ndarray):
+            X_std = (X-self.input_mean)/self.input_std
+            X_std[abs(X_std) < 1e-5] = 0.0
+        elif isinstance(X, DataSet):
+            X_std = X.standardize(mean=self.input_mean, std=self.input_std)
+
+        m_std, v_std = self._model.predict(X_std)
+        m = m_std*self.output_std + self.output_mean
+
+        # if return_cov:
+        #     result = m, v
+        # elif return_std:
+        #     result = m, self._model.Kdiag(X)
+        # else:
+        #     result = m
+
+        return m
 
 class AnalyticalModel(Model):
     ''' An analytical model instead of statistical model
diff --git a/summit/optimizers.py b/summit/optimizers.py
index 0defc248..de0c3d28 100644
--- a/summit/optimizers.py
+++ b/summit/optimizers.py
@@ -7,7 +7,7 @@
 from typing import List
 from summit.domain import Domain, DomainError
 from summit.initial_design import RandomDesigner
-# from .objective import ObjectiveWrapper
+from summit.data import DataSet
 from abc import ABC, abstractmethod
 
 import numpy as np
@@ -62,12 +62,18 @@ def is_multiobjective(self):
 class NSGAII(Optimizer):
     def __init__(self, domain: Domain):
         Optimizer.__init__(self, domain)
+        #Set up platypus problem
         self.problem = pp.Problem(nvars=self.domain.num_variables(),
-                                  nobjs=len(self.domain.output_variables))
+                                  nobjs=len(self.domain.output_variables),
+                                  nconstrs=len(self.domain.constraints))
+        #Set maximization or minimization for each objective
+        j = 0
         for i, v in enumerate(self.domain.variables):
             if v.is_objective:
-                continue
-            if v.variable_type == "continuous":
+                direction = self.problem.MAXIMIZE if v.maximize else self.problem.MINIMIZE
+                self.problem.directions[j] = direction
+                j += 1
+            elif v.variable_type == "continuous":
                 self.problem.types[i] = pp.Real(v.lower_bound, v.upper_bound)
             elif v.variable_type == "discrete":
                 #Select a subset of one of the available options
@@ -78,23 +83,36 @@ def __init__(self, domain: Domain):
             else:
                 raise DomainError(f'{v.variable_type} is not a valid variable type.')
 
-    def _optimize(self, objective, **kwargs):
+        #Set up constraints
+        self.problem.constraints[:] = "<=0"
+
+
+    def _optimize(self, models, **kwargs):
+        input_columns = [v.name for v in self.domain.variables if not v.is_objective]
+        output_columns = [v.name for v in self.domain.variables if v.is_objective]
         def problem_wrapper(X):
-            np.atleast_2d(X)
-            result = objective(X)
-            return result.tolist()
+            X = DataSet(np.atleast_2d(X),
+                        columns=input_columns)
+            result = models.predict(X)
+
+            constraint_res = [X.eval(c.expression, resolvers=[X])
+                              for c in self.domain.constraints]
+            constraint_res = [c.tolist()[0] for c in constraint_res]
+
+            return result[0, :].tolist(), constraint_res
+
+        #Run optimization
         self.problem.function = problem_wrapper
-        self.problem.directions[:] = pp.Problem.MAXIMIZE
         algorithm = pp.NSGAII(self.problem)
         iterations = kwargs.get('iterations', 10000)
         algorithm.run(iterations)
+
         x = [[s.variables[i] for i in range(self.domain.num_variables())]
              for s in algorithm.result]
-        x = np.array(x)
+        x = DataSet(x, columns=input_columns)
         y = [[s.objectives[i] for i in range(len(self.domain.output_variables))]
              for s in algorithm.result]
-        y = np.array(y)
+        y = DataSet(y, columns=output_columns)
         return OptimizeResult(x=x, fun=y, success=True)
 
 class MCOptimizer(Optimizer):
diff --git a/summit/strategies.py b/summit/strategies.py
index 035e7f81..dd112cda 100644
--- a/summit/strategies.py
+++ b/summit/strategies.py
@@ -1,13 +1,13 @@
 from summit.data import DataSet
+from summit.models import ModelGroup
 from summit.domain import Domain, DomainError
 from summit.acquisition import HvI
 from summit.optimizers import NSGAII
+from summit.utils import pareto_efficient
 
 import GPy
 import numpy as np
-import warnings
-import logging
 
 class Strategy:
     def __init__(self, domain:Domain):
@@ -70,10 +70,6 @@ class TSEMO2(Strategy):
     maximize: bool, optional
         Whether optimization should be treated as a maximization or minimization
        problem. Defaults to maximization.
-    acquisition: summit.acquistion.Acquisition, optional
-        The acquisition function used to select the next set of points from the pareto front
-        (see optimizer). Defaults to hypervolume improvement with the reference point set
-        as the upper bounds of the outputs in the specified domain and random rate 0.0
     optimizer: summit.optimizers.Optimizer, optional
        The internal optimizer for estimating the pareto front prior to maximization of the acquisition function.
        By default, NSGAII will be used if there is a combination
@@ -105,81 +101,122 @@ class TSEMO2(Strategy):
                                                 normalize_inputs=True)
     '''
-    def __init__(self, domain, models, acquisition=None, optimizer=None):
+    def __init__(self, domain, models, optimizer=None, **kwargs):
         Strategy.__init__(self, domain)
-        self.models = models
-        if acquisition is None:
-            reference = [v.upper_bound for v in self.domain.output_variables]
-            self.acquisition = HvI(reference, random_rate=0.0)
-        else:
-            self.acquisition = acquisition
+
+        if isinstance(models, ModelGroup):
+            self.models = models
+        elif isinstance(models, dict):
+            self.models = ModelGroup(models)
+        else:
+            raise TypeError('models must be a ModelGroup or a dictionary of models.')
+
         if not optimizer:
             self.optimizer = NSGAII(self.domain)
         else:
             self.optimizer = optimizer
 
-    def generate_experiments(self, previous_results: DataSet, num_experiments,
-                             normalize_inputs=False, no_repeats=True, maximize=True):
-        #Get inputs and outputs + standardize if needed
+        self._reference = kwargs.get('reference', [0, 0])
+        self._random_rate = kwargs.get('random_rate', 0.0)
+
+    def generate_experiments(self, previous_results: DataSet, num_experiments):
+        #Get inputs and outputs
         inputs, outputs = self.get_inputs_outputs(previous_results)
-        if normalize_inputs:
-            self.x = inputs.standardize() #TODO: get this to work for discrete variables
+
+        #Fit models to new data
+        self.models.fit(inputs, outputs)
+
+        internal_res = self.optimizer.optimize(self.models)
+
+        hv_imp, indices = self.select_max_hvi(outputs, internal_res.fun, num_experiments)
+        result = internal_res.x.join(internal_res.fun)
+
+        return result.iloc[indices, :]
+
+    def select_max_hvi(self, y, samples, num_evals=1):
+        ''' Returns the point(s) that maximize hypervolume improvement
+
+        Parameters
+        ----------
+        samples: np.ndarray
+            The samples on which hypervolume improvement is calculated
+        num_evals: `int`
+            The number of points to return (with top hypervolume improvement)
+
+        Returns
+        -------
+        hv_imp, index
+            Returns a tuple with lists of the best hypervolume improvement
+            and the indices of the corresponding points in samples
+
+        '''
+        #Get the reference point, r
+        # r = self._reference + 0.01*(np.max(samples, axis=0)-np.min(samples, axis=0))
+        r = self._reference
+
+        #Set up maximization and minimization
+        for v in self.domain.variables:
+            if v.is_objective and v.maximize:
+                y[v.name] = -1.0 * y[v.name]
+                samples[v.name] = -1.0 * samples[v.name]
+
+        Ynew = y.data_to_numpy()
+        samples = samples.data_to_numpy()
+        index = []
+        n = samples.shape[1]
+        mask = np.ones(samples.shape[0], dtype=bool)
+
+        #Set up random selection
+        if not 0.0 <= self._random_rate <= 1.0:
+            raise ValueError('random_rate must be between 0 and 1.')
+
+        if self._random_rate > 0:
+            num_random = round(self._random_rate*num_evals)
+            random_selects = np.random.randint(0, num_evals, size=num_random)
         else:
-            self.x = inputs.data_to_numpy()
-
-        self.y = outputs.data_to_numpy()
-
-        #Update surrogate models with new data
-        for i, model in enumerate(self.models):
-            Y = self.y[:, i]
-            Y = np.atleast_2d(Y).T
-            logging.debug(f'Fitting model {i+1}')
-            model.fit(self.x, Y, num_restarts=3, max_iters=100,parallel=True)
-
-        logging.debug("Running internal optimization")
-
-        #If the domain consists of one descriptors variables, evaluate every candidate
-        check_descriptors = [True if v.variable_type =='descriptors' else False
-                             for v in self.domain.input_variables]
-        if all(check_descriptors) and len(check_descriptors)==1:
-            descriptor_arr = self.domain.variables[0].ds.data_to_numpy()
-            if no_repeats:
-                points = self._mask_previous_points(self.x, descriptor_arr)
+            random_selects = np.array([])
+
+        for i in range(num_evals):
+            masked_samples = samples[mask, :]
+            Yfront, _ = pareto_efficient(Ynew, maximize=True)
+            if len(Yfront) == 0:
+                raise ValueError('Pareto front length too short')
+
+            hv_improvement = []
+            hvY = HvI.hypervolume(Yfront, [0, 0])
+            #Determine hypervolume improvement by including
+            #each point from samples (masking previously selected points)
+            for sample in masked_samples:
+                sample = sample.reshape(1, n)
+                A = np.append(Ynew, sample, axis=0)
+                Afront, _ = pareto_efficient(A, maximize=True)
+                hv = HvI.hypervolume(Afront, [0, 0])
+                hv_improvement.append(hv-hvY)
+
+            hvY0 = hvY if i == 0 else hvY0
+
+            if i in random_selects:
+                masked_index = np.random.randint(0, masked_samples.shape[0])
             else:
-                points = self.x
-            predictions = np.zeros([points.shape[0], len(self.models)])
-            for i, model in enumerate(self.models):
-                predictions[:, i] = model.predict(points)[0][:,0]
+                #Choose the point that maximizes hypervolume improvement
+                masked_index = hv_improvement.index(max(hv_improvement))
 
-            self.acquisition.data = self.y
-            self.acq_vals, indices = self.acquisition.select_max(predictions, num_evals=num_experiments)
-            indices = [np.where((descriptor_arr == points[ix]).all(axis=1))[0][0]
-                       for ix in indices]
-            result = self.domain.variables[0].ds.iloc[indices, :]
-        #Else use modified nsgaII
+            samples_index = np.where((samples == masked_samples[masked_index, :]).all(axis=1))[0][0]
+            new_point = samples[samples_index, :].reshape(1, n)
+            Ynew = np.append(Ynew, new_point, axis=0)
+            mask[samples_index] = False
+            index.append(samples_index)
+
+        if len(hv_improvement) == 0:
+            hv_imp = 0
+        elif len(index) == 0:
+            index = []
+            hv_imp = 0
         else:
-            def problem(x):
-                x = np.array(x)
-                x = np.atleast_2d(x)
-                y = [model.predict(x)
-                     for model in self.models]
-                y = np.array([yo[0,0] for yo in y])
-                return y
-            int_result = self.optimizer.optimize(problem)
-            self.acquisition.data = self.y
-            self.acq_vals, indices = self.acquisition.select_max(int_result.fun,
-                                                                 num_evals=num_experiments)
-            result = int_result.x[indices, :]
-        return result
-
-    def _mask_previous_points(self, x, descriptor_arr):
-        descriptor_mask = np.ones(descriptor_arr.shape[0], dtype=bool)
-        for point in x:
-            try:
-                index = np.where(descriptor_arr==point)[0][0]
-                descriptor_mask[index] = False
-            except IndexError:
-                continue
-        return descriptor_arr[descriptor_mask, :]
+            #Total hypervolume improvement
+            #Includes all points added to batch (hvY + last hv_improvement)
+            #Subtracts hypervolume without any points added (hvY0)
+            hv_imp = hv_improvement[masked_index] + hvY - hvY0
+        return hv_imp, index
\ No newline at end of file
diff --git a/summit/utils.py b/summit/utils.py
new file mode 100644
index 00000000..9c31a9d9
--- /dev/null
+++ b/summit/utils.py
@@ -0,0 +1,24 @@
+import numpy as np
+
+def pareto_efficient(costs, maximize=True):
+    """
+    Find the pareto-efficient points
+    :param costs: An (n_points, n_costs) array
+    :param maximize: Find the pareto maximum if True, the pareto minimum if False
+    :return: A tuple of the pareto-efficient points and their indices in costs
+    """
+    original_costs = costs
+    is_efficient = np.arange(costs.shape[0])
+    n_points = costs.shape[0]
+    next_point_index = 0  # Next index in the is_efficient array to search for
+    while next_point_index < len(costs):
+        if maximize:
+            nondominated_point_mask = np.any(costs > costs[next_point_index], axis=1)
+        else:
+            nondominated_point_mask = np.any(costs < costs[next_point_index], axis=1)
+        nondominated_point_mask[next_point_index] = True
+        is_efficient = is_efficient[nondominated_point_mask]
+        costs = costs[nondominated_point_mask]
+        next_point_index = np.sum(nondominated_point_mask[:next_point_index]) + 1
+
+    return original_costs[is_efficient], is_efficient
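
The sketches below are not part of the patch; they are hedged illustrations of how the changed APIs fit together, with all column names and data values invented. First, the new `DataSet` constructor together with the `return_mean`/`return_std` flags and the precalculated `mean`/`std` keywords on `standardize`:

```python
import numpy as np
from summit.data import DataSet

# The constructor added above builds the NAME/TYPE(/UNITS) MultiIndex from
# plain column names, so a DataSet no longer has to go through from_df.
ds = DataSet(np.array([[30.0, 0.1],
                       [50.0, 0.3],
                       [70.0, 0.5]]),
             columns=['temperature', 'flowrate'],
             units=['degC', 'mL/min'])

# Capture the column statistics so they can be reused on later data.
standard, mean, std = ds.standardize(return_mean=True, return_std=True)

# Scale new points with the precalculated statistics instead of their own
# (a single row standardized against itself would collapse to zeros).
new_ds = DataSet(np.array([[40.0, 0.2]]),
                 columns=['temperature', 'flowrate'],
                 units=['degC', 'mL/min'])
new_standard = new_ds.standardize(mean=mean, std=std)
```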
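`GPyModel.fit` and `predict` now standardize internally and accept either a `DataSet` or a plain ndarray. A minimal sketch with random data; the `'lbfgs'` string is an assumption (any truthy `optimizer` enables `optimize_restarts`, and `'lbfgs'` is GPy's default optimizer name):

```python
import numpy as np
from summit.models import GPyModel

X = np.random.uniform(0.0, 1.0, size=(20, 2))
y = (X[:, [0]] - 0.5) ** 2 + X[:, [1]]  # toy response, shape (20, 1)

model = GPyModel(input_dim=2, optimizer='lbfgs')
model.fit(X, y, num_restarts=3, max_iters=500)

# predict() standardizes X with the stored input statistics and
# de-standardizes the returned mean, so this is in the original units.
mean_prediction = model.predict(X[:5])  # shape (5, 1)
```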
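A hypothetical end-to-end wiring of `Domain`, `Constraint`, `ModelGroup`, and the refactored `TSEMO2`. Variable names, bounds, and the contents of `previous_results` are made up; the `maximize` keyword on `ContinuousVariable` is assumed from the `v.maximize` references in `NSGAII` and `select_max_hvi`, and two data points are far too few for a real fit:

```python
import numpy as np
from summit.data import DataSet
from summit.domain import Domain, ContinuousVariable, Constraint
from summit.models import GPyModel, ModelGroup
from summit.strategies import TSEMO2

domain = Domain()
domain += ContinuousVariable('temperature', 'reaction temperature', [30, 100])
domain += ContinuousVariable('flowrate', 'reactor flowrate', [0.1, 0.5])
domain += ContinuousVariable('yld', 'product yield', [0, 100],
                             is_objective=True, maximize=True)
domain += ContinuousVariable('de', 'diastereomeric excess', [0, 100],
                             is_objective=True, maximize=True)
# Constraint expressions are evaluated against the input columns and
# treated as "<= 0" by the NSGAII wrapper above.
domain += Constraint('temperature + 100 * flowrate - 130')

# One single-output GP per objective column.
models = ModelGroup({'yld': GPyModel(input_dim=2),
                     'de': GPyModel(input_dim=2)})
strategy = TSEMO2(domain, models, random_rate=0.0, reference=[0, 0])

previous_results = DataSet(np.array([[60.0, 0.2, 50.0, 80.0],
                                     [80.0, 0.4, 70.0, 60.0]]),
                           columns=['temperature', 'flowrate', 'yld', 'de'])
next_experiments = strategy.generate_experiments(previous_results,
                                                 num_experiments=2)
```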
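Finally, a sanity check of the new `pareto_efficient` helper as reconstructed above; with `maximize=True`, a point dominated in every objective should fall off the front:

```python
import numpy as np
from summit.utils import pareto_efficient

costs = np.array([[1.0, 3.0],
                  [2.0, 2.0],
                  [3.0, 1.0],
                  [1.0, 1.0]])  # [1., 1.] is dominated by all other points

front, indices = pareto_efficient(costs, maximize=True)
print(front)    # [[1. 3.] [2. 2.] [3. 1.]]
print(indices)  # [0 1 2]
```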