
Commit

Add constraint capabilities, change tsemoa and optimizers to work with datasets
marcosfelt committed Aug 19, 2019
1 parent fcfe253 commit 8a24566
Showing 7 changed files with 278 additions and 113 deletions.
6 changes: 2 additions & 4 deletions summit/acquisition.py
@@ -101,7 +101,7 @@ def select_max(self, samples, num_evals=1):

        for i in range(num_evals):
            masked_samples = samples[mask, :]
-            Yfront, _ = pareto_efficient(Ynew)
+            Yfront, _ = pareto_efficient(Ynew, maximize=True)
            if len(Yfront) == 0:
                raise ValueError('Pareto front length too short')
@@ -112,7 +112,7 @@ def select_max(self, samples, num_evals=1):
            for sample in masked_samples:
                sample = sample.reshape(1, n)
                A = np.append(Ynew, sample, axis=0)
-                Afront, _ = pareto_efficient(A)
+                Afront, _ = pareto_efficient(A, maximize=True)
                hv = HvI.hypervolume(-Afront, [0, 0])
                hv_improvement.append(hv - hvY)

@@ -147,8 +147,6 @@ def hypervolume(pointset, ref):
    """Compute the absolute hypervolume of a *pointset* according to the
    reference point *ref*.
    """
-    warnings.warn("Falling back to the python version of hypervolume "
-                  "module. Expect this to be very slow.", RuntimeWarning)
    hv = _HyperVolume(ref)
    return hv.compute(pointset)
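
To see what the updated acquisition is computing, here is a minimal sketch of the hypervolume-improvement step, assuming pareto_efficient(Y, maximize=True) returns the Pareto front and its indices, and that fronts are negated so the reference point [0, 0] is dominated (the toy arrays are hypothetical):

import numpy as np

# Hypothetical maximization objectives; pareto_efficient and HvI come from summit
Ynew = np.array([[0.2, 0.9], [0.6, 0.6], [0.9, 0.1]])
Yfront, _ = pareto_efficient(Ynew, maximize=True)
hvY = HvI.hypervolume(-Yfront, [0, 0])

# Hypervolume improvement from adding one candidate point
sample = np.array([[0.8, 0.8]])
Afront, _ = pareto_efficient(np.append(Ynew, sample, axis=0), maximize=True)
improvement = HvI.hypervolume(-Afront, [0, 0]) - hvY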
51 changes: 44 additions & 7 deletions summit/data/dataset.py
@@ -14,6 +14,24 @@ class DataSet(pd.core.frame.DataFrame):
    ----
    Based on https://notes.mikejarrett.ca/storing-metadata-in-pandas-dataframes/
    """
+    def __init__(self, data=None, index=None, columns=None, metadata_columns=[], units=None, dtype=None, copy=False):
+        if isinstance(columns, pd.MultiIndex):
+            pass
+        elif columns is not None:
+            column_names = columns
+            if metadata_columns:
+                types = ['METADATA' if x in metadata_columns else 'DATA' for x in column_names]
+            else:
+                types = ['DATA' for _ in range(len(column_names))]
+            arrays = [column_names, types]
+            levels = ['NAME', 'TYPE']
+            if units:
+                arrays.append(units)
+                levels.append('UNITS')
+            tuples = list(zip(*arrays))
+            columns = pd.MultiIndex.from_tuples(tuples, names=levels)
+        pd.core.frame.DataFrame.__init__(self, data=data, index=index, columns=columns, dtype=dtype, copy=copy)
+
    @staticmethod
    def from_df(df: pd.DataFrame, metadata_columns: List = [],
                units: List = []):
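
With the new constructor, a DataSet can be built directly rather than via from_df; a short sketch (the column names, units, and values are hypothetical):

ds = DataSet([[0.5, 60.0, 'expt-1']],
             columns=['conversion', 'temperature', 'experiment_id'],
             metadata_columns=['experiment_id'],
             units=['%', 'degC', ''])
# Columns become a MultiIndex with NAME, TYPE (and optionally UNITS) levels;
# 'experiment_id' is tagged METADATA, the others DATA.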
@@ -83,7 +101,8 @@ def zero_to_one(self, small_tol=1.0e-5) -> np.ndarray:
        scaled[abs(scaled) < small_tol] = 0.0
        return scaled

-    def standardize(self, small_tol=1.0e-5) -> np.ndarray:
+    def standardize(self, small_tol=1.0e-5,
+                    return_mean=False, return_std=False, **kwargs) -> np.ndarray:
        """Standardize data columns by removing the mean and scaling to unit variance
        The standard score of each data column is calculated as:
@@ -97,6 +116,16 @@ def standardize(self, small_tol=1.0e-5) -> np.ndarray:
            The minimum value of any value in the final scaled array.
            This is used to prevent very small values that will cause
            issues in later calculations. Defaults to 1e-5.
+        return_mean: bool, optional
+            Return an array with the mean of each column in the DataSet
+        return_std: bool, optional
+            Return an array with the standard deviation of each column
+            in the DataSet
+        mean: array, optional
+            Pass a precalculated array of means for the columns
+        std: array, optional
+            Pass a precalculated array of standard deviations
+            for the columns
        Returns
        -------
@@ -110,11 +139,21 @@ def standardize(self, small_tol=1.0e-5) -> np.ndarray:
        """
        values = self.data_to_numpy()
        values = values.astype(np.float64)
-        mean = np.mean(values, axis=0)
-        sigma = np.std(values, axis=0)
+        mean = kwargs.get('mean', np.mean(values, axis=0))
+        sigma = kwargs.get('std', np.std(values, axis=0))
        standard = (values - mean) / sigma
        standard[abs(standard) < small_tol] = 0.0
-        return standard
+        if return_mean and return_std:
+            return standard, mean, sigma
+        elif return_mean:
+            return standard, mean
+        elif return_std:
+            return standard, sigma
+        else:
+            return standard

    @property
    def _constructor(self):
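
The new keywords let callers standardize training data once and then apply the same statistics to later data; a minimal sketch (ds_train and ds_new are hypothetical DataSets):

train_std, mean, std = ds_train.standardize(return_mean=True, return_std=True)
new_std = ds_new.standardize(mean=mean, std=std)  # reuse the training statistics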
@@ -182,6 +221,4 @@ def insert(self, loc, column, value, type='DATA', units=None, allow_duplicates=False):
        self.columns[loc][0] = column
        self.columns[loc][1] = type
        self.columns[loc][2] = units
-
-class ResultSet(DataSet):
-    data_column_types = ['input', 'output']

2 changes: 1 addition & 1 deletion summit/domain.py
@@ -357,7 +357,6 @@ def _html_table_rows(self):
        columns.append("")  # value column
        return ''.join([f"<td>{column}</td>" for column in columns])

-
class Domain:
    """Representation of the optimization domain
@@ -522,6 +521,7 @@ def from_dict(domain_dict):


    def __add__(self, obj):
+        # TODO: make this work with adding arrays of variables or constraints
        if isinstance(obj, Variable):
            if obj.is_objective and obj.variable_type != 'continuous':
                raise DomainError("Output variables must be continuous")
89 changes: 70 additions & 19 deletions summit/models.py
@@ -1,12 +1,12 @@
from summit.initial_design.latin_designer import lhs
from summit.data import DataSet

from GPy.models import GPRegression
from GPy.kern import Matern52
import numpy as np
from numpy import matlib
import scipy

from abc import ABC, abstractmethod
from sklearn.base import BaseEstimator, RegressorMixin


class Model(ABC):
@@ -23,7 +23,32 @@ def fit(self, X, Y):
    @abstractmethod
    def predict(self, X):
        pass

-class GPyModel(Model):
+class ModelGroup:
+    def __init__(self, models: dict):
+        self._models = models
+
+    @property
+    def models(self):
+        return self._models
+
+    def fit(self, X, y, **kwargs):
+        for column_name, model in self.models.items():
+            model.fit(X, y[[column_name]])
+
+    def predict(self, X, **kwargs):
+        """
+        Note
+        ----
+        This makes the assumption that each model returns an n_samples x 1 array
+        from its predict method.
+        """
+        result = [model.predict(X)[:, 0] for model in self.models.values()]
+        return np.array(result).T
+
+    def __getitem__(self, key):
+        return self.models[key]
+
+class GPyModel(BaseEstimator, RegressorMixin):
    ''' A Gaussian Process Regression model from GPy
    This is implemented as an alternative to the sklearn
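
A short usage sketch for ModelGroup, fitting one GP per output column (the DataSets X and y and the column names are hypothetical):

models = ModelGroup({'yield': GPyModel(input_dim=2),
                     'cost': GPyModel(input_dim=2)})
models.fit(X, y)           # each model is fit against its own output column
preds = models.predict(X)  # predictions stacked into an (n_samples, n_models) array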
@@ -54,7 +79,8 @@ def __init__(self, kernel=None, input_dim=None, noise_var=1.0, optimizer=None):
        else:
            if not input_dim:
                raise ValueError('input_dim must be specified if no kernel is specified.')
-            self._kernel = Matern52(input_dim=input_dim, ARD=True)
+            self.input_dim = input_dim
+            self._kernel = Matern52(input_dim=self.input_dim, ARD=True)
        self._noise_var = noise_var
        self._optimizer = optimizer
        self._model = None
@@ -63,10 +89,10 @@ def fit(self, X, y, num_restarts=10, max_iters=2000, parallel=False):
        """Fit Gaussian process regression model.
        Parameters
        ----------
-        X : array-like, shape = (n_samples, n_features)
-            Training data
-        y : array-like, shape = (n_samples, [n_output_dims])
-            Target values
+        X : DataSet
+            The data columns will be used as inputs for fitting the model
+        y : DataSet
+            The data columns will be used as outputs for fitting the model
        num_restarts : int, optional (default=10)
            The number of random restarts of the optimizer.
        max_iters : int, optional (default=2000)
@@ -79,7 +105,25 @@ def fit(self, X, y, num_restarts=10, max_iters=2000, parallel=False):
        self : returns an instance of self.
        -----
        """
-        self._model = GPRegression(X, y, self._kernel, noise_var=self._noise_var)
+        # Standardize inputs and outputs
+        if isinstance(X, DataSet):
+            X_std, self.input_mean, self.input_std = X.standardize(return_mean=True, return_std=True)
+        elif isinstance(X, np.ndarray):
+            self.input_mean = np.mean(X, axis=0)
+            self.input_std = np.std(X, axis=0)
+            X_std = (X - self.input_mean) / self.input_std
+            X_std[abs(X_std) < 1e-5] = 0.0
+
+        if isinstance(y, DataSet):
+            y_std, self.output_mean, self.output_std = y.standardize(return_mean=True, return_std=True)
+        elif isinstance(y, np.ndarray):
+            self.output_mean = np.mean(y, axis=0)
+            self.output_std = np.std(y, axis=0)
+            y_std = (y - self.output_mean) / self.output_std
+            y_std[abs(y_std) < 1e-5] = 0.0
+
+        # Initialize and fit model
+        self._model = GPRegression(X_std, y_std, self._kernel, noise_var=self._noise_var)
        if self._optimizer:
            self._model.optimize_restarts(num_restarts=num_restarts,
                                          verbose=False,
@@ -120,17 +164,24 @@ def predict(self, X,
            raise RuntimeError(
                "Not returning standard deviation of predictions when "
                "returning full covariance.")

-        m, v = self._model.predict(X)
-
-        if return_cov:
-            result = m, v
-        elif return_std:
-            result = m, self._model.Kdiag(X)
-        else:
-            result = m
-
-        return result
+        if isinstance(X, np.ndarray):
+            X_std = (X - self.input_mean) / self.input_std
+            X_std[abs(X_std) < 1e-5] = 0.0
+        elif isinstance(X, DataSet):
+            X_std = X.standardize(mean=self.input_mean, std=self.input_std)
+
+        m_std, v_std = self._model.predict(X_std)
+        m = m_std * self.output_std + self.output_mean
+
+        # if return_cov:
+        #     result = m, v
+        # elif return_std:
+        #     result = m, self._model.Kdiag(X)
+        # else:
+        #     result = m
+
+        return m

class AnalyticalModel(Model):
    ''' An analytical model instead of statistical model
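
Putting fit and predict together: standardization is now handled internally, so callers stay on the original scale throughout (X_train, y_train, and X_test are hypothetical DataSets):

model = GPyModel(input_dim=2)
model.fit(X_train, y_train, num_restarts=5)
y_pred = model.predict(X_test)  # means are de-standardized before being returned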
40 changes: 29 additions & 11 deletions summit/optimizers.py
@@ -7,7 +7,7 @@
from typing import List
from summit.domain import Domain, DomainError
from summit.initial_design import RandomDesigner
-# from .objective import ObjectiveWrapper
+from summit.data import DataSet

from abc import ABC, abstractmethod
import numpy as np
@@ -62,12 +62,17 @@ def is_multiobjective(self):
class NSGAII(Optimizer):
    def __init__(self, domain: Domain):
        Optimizer.__init__(self, domain)
+        # Set up platypus problem
        self.problem = pp.Problem(nvars=self.domain.num_variables(),
-                                 nobjs=len(self.domain.output_variables))
+                                 nobjs=len(self.domain.output_variables),
+                                 nconstrs=len(self.domain.constraints))
+        # Set maximization or minimization for each objective
+        j = 0
        for i, v in enumerate(self.domain.variables):
            if v.is_objective:
-                continue
-            if v.variable_type == "continuous":
+                direction = self.problem.MAXIMIZE if v.maximize else self.problem.MINIMIZE
+                self.problem.directions[j] = direction
+            elif v.variable_type == "continuous":
                self.problem.types[i] = pp.Real(v.lower_bound, v.upper_bound)
            elif v.variable_type == "discrete":
                # Select a subset of one of the available options
@@ -78,23 +83,36 @@ def __init__(self, domain: Domain):
            else:
                raise DomainError(f'{v.variable_type} is not a valid variable type.')

-    def _optimize(self, objective, **kwargs):
+        # Set up constraints
+        self.problem.constraints[:] = "<=0"
+
+    def _optimize(self, models, **kwargs):
+        input_columns = [v.name for v in self.domain.variables if not v.is_objective]
+        output_columns = [v.name for v in self.domain.variables if v.is_objective]
+
        def problem_wrapper(X):
-            np.atleast_2d(X)
-            result = objective(X)
-            return result.tolist()
+            X = DataSet(np.atleast_2d(X),
+                        columns=input_columns)
+            result = models.predict(X)
+
+            constraint_res = [X.eval(c.expression, resolvers=[X])
+                              for c in self.domain.constraints]
+            constraint_res = [c.tolist()[0] for c in constraint_res]
+
+            return result[0, :].tolist(), constraint_res

+        # Run optimization
        self.problem.function = problem_wrapper
-        self.problem.directions[:] = pp.Problem.MAXIMIZE
        algorithm = pp.NSGAII(self.problem)
        iterations = kwargs.get('iterations', 10000)
        algorithm.run(iterations)

        x = [[s.variables[i] for i in range(self.domain.num_variables())]
             for s in algorithm.result]
        x = np.array(x)
+        x = DataSet(x, columns=input_columns)
        y = [[s.objectives[i] for i in range(len(self.domain.output_variables))]
             for s in algorithm.result]
        y = np.array(y)
+        y = DataSet(y, columns=output_columns)
        return OptimizeResult(x=x, fun=y, success=True)

class MCOptimizer(Optimizer):
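
End to end, the optimizer now consumes models and a constrained domain; a sketch assuming the base Optimizer exposes a public optimize method wrapping _optimize (the domain and models objects are hypothetical):

optimizer = NSGAII(domain)              # picks up domain.constraints via nconstrs
result = optimizer.optimize(models, iterations=5000)
pareto_inputs = result.x                # DataSet of non-dominated input settings
pareto_outputs = result.fun             # DataSet of the corresponding objectives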