Merge pull request #3 from sustainable-processes/datasets
Datasets enabling easier problem specification
marcosfelt authored Aug 19, 2019
2 parents c4e8ccf + 8a24566 commit 804c85b
Showing 7 changed files with 329 additions and 118 deletions.
6 changes: 2 additions & 4 deletions summit/acquisition.py
@@ -101,7 +101,7 @@ def select_max(self, samples, num_evals=1):

for i in range(num_evals):
masked_samples = samples[mask, :]
-            Yfront, _ = pareto_efficient(Ynew)
+            Yfront, _ = pareto_efficient(Ynew, maximize=True)
if len(Yfront) ==0:
raise ValueError('Pareto front length too short')

@@ -112,7 +112,7 @@ def select_max(self, samples, num_evals=1):
for sample in masked_samples:
sample = sample.reshape(1,n)
A = np.append(Ynew, sample, axis=0)
-                Afront, _ = pareto_efficient(A)
+                Afront, _ = pareto_efficient(A, maximize=True)
hv = HvI.hypervolume(-Afront, [0,0])
hv_improvement.append(hv-hvY)

@@ -147,8 +147,6 @@ def hypervolume(pointset, ref):
"""Compute the absolute hypervolume of a *pointset* according to the
reference point *ref*.
"""
-    warnings.warn("Falling back to the python version of hypervolume "
-                  "module. Expect this to be very slow.", RuntimeWarning)
hv = _HyperVolume(ref)
return hv.compute(pointset)
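
A minimal sketch of calling this wrapper (assuming HvI is importable from summit.acquisition and exposes hypervolume as a static method, per the call in select_max above; the front is illustrative):

import numpy as np
from summit.acquisition import HvI  # assumed import path

# An illustrative maximizing Pareto front, negated as in select_max above
# so that the hypervolume is computed for minimization against ref = [0, 0]
front = np.array([[0.8, 0.2], [0.5, 0.5], [0.1, 0.9]])
hv = HvI.hypervolume(-front, [0, 0])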

51 changes: 44 additions & 7 deletions summit/data/dataset.py
@@ -14,6 +14,24 @@ class DataSet(pd.core.frame.DataFrame):
----
Based on https://notes.mikejarrett.ca/storing-metadata-in-pandas-dataframes/
"""
def __init__(self, data=None, index=None, columns=None, metadata_columns=[], units=None, dtype=None, copy=False):
if isinstance(columns, pd.MultiIndex):
pass
elif columns is not None:
column_names = columns
if metadata_columns:
types = ['METADATA' if x in metadata_columns else 'DATA' for x in column_names]
else:
types = ['DATA' for _ in range(len(column_names))]
arrays = [column_names, types]
levels = ['NAME', 'TYPE']
if units:
arrays.append(units)
levels.append('UNITS')
tuples=list(zip(*arrays))
columns = pd.MultiIndex.from_tuples(tuples, names=levels)
pd.core.frame.DataFrame.__init__(self, data=data, index=index, columns=columns, dtype=dtype, copy=copy)
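
A minimal sketch of what the new constructor enables (the column names, metadata flag, and units are illustrative):

from summit.data import DataSet

ds = DataSet(data=[[30.0, 0.1, 'run_1'], [50.0, 0.2, 'run_2']],
             columns=['temperature', 'flowrate', 'label'],
             metadata_columns=['label'],
             units=['degC', 'mL/min', ''])
# ds.columns is a MultiIndex with NAME, TYPE and UNITS levels;
# 'label' is tagged METADATA and the other columns DATA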

@staticmethod
def from_df(df: pd.DataFrame, metadata_columns: List=[],
units: List = []):
@@ -83,7 +101,8 @@ def zero_to_one(self, small_tol=1.0e-5) -> np.ndarray:
scaled[abs(scaled) < small_tol] = 0.0
return scaled

-    def standardize(self, small_tol=1.0e-5) -> np.ndarray:
+    def standardize(self, small_tol=1.0e-5,
+                    return_mean=False, return_std=False, **kwargs) -> np.ndarray:
"""Standardize data columns by removing the mean and scaling to unit variance
The standard score of each data column is calculated as:
@@ -97,6 +116,16 @@ def standardize(self, small_tol=1.0e-5) -> np.ndarray:
The minimum value of any value in the final scaled array.
This is used to prevent very small values that will cause
issues in later calculations. Defaults to 1e-5.
return_mean: bool, optional
Return an array with the mean of each column in the DataSet
return_std: bool, optional
Return an array with the standard deviation of each column
in the DataSet
mean: array, optional
Pass a precalculated array of means for the columns
std: array, optional
Pass a precalculated array of standard deviations
for the columns
Returns
-------
@@ -110,11 +139,21 @@ def standardize(self, small_tol=1.0e-5) -> np.ndarray:
"""
values = self.data_to_numpy()
values = values.astype(np.float64)
-        mean = np.mean(values, axis=0)
-        sigma = np.std(values, axis=0)
+        mean = kwargs.get('mean',
+                          np.mean(values, axis=0))
+        sigma = kwargs.get('std',
+                           np.std(values, axis=0))
standard = (values-mean)/sigma
standard[abs(standard) < small_tol] = 0.0
-        return standard
+        if return_mean and return_std:
+            return standard, mean, sigma
+        elif return_mean:
+            return standard, mean
+        elif return_std:
+            return standard, sigma
+        else:
+            return standard
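
A sketch of the intended round trip: standardize training data once, keep the statistics, and reuse them on new data via the new keyword arguments (ds_train and ds_new are illustrative DataSets):

train_std, mean, std = ds_train.standardize(return_mean=True, return_std=True)
new_std = ds_new.standardize(mean=mean, std=std)  # reuse the training statistics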

@property
def _constructor(self):
@@ -182,6 +221,4 @@ def insert(self, loc, column, value, type='DATA', units=None, allow_duplicates=F
self.columns[loc][0] = column
self.columns[loc][1] = type
self.columns[loc][2] = units

class ResultSet(DataSet):
data_column_types = ['input', 'output']

58 changes: 52 additions & 6 deletions summit/domain.py
@@ -176,6 +176,11 @@ def from_dict(variable_dict):
description=variable_dict['description'],
bounds=variable_dict['bounds'],
is_objective=variable_dict['is_objective'])

def __add__(self, other):
if isinstance(other, self.__class__):
return [self, other]
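
A sketch of what this enables (names and bounds are illustrative); note that adding variables of different classes falls through and returns None:

t = ContinuousVariable('temperature', 'reaction temperature', [1, 100])
p = ContinuousVariable('pressure', 'reaction pressure', [1, 5])
pair = t + p  # [t, p]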


class DiscreteVariable(Variable):
"""Representation of a discrete variable
@@ -335,6 +340,24 @@ def from_dict(variable_dict):
def _html_table_rows(self):
return self._make_html_table_rows(f"{self.num_examples} examples of {self.num_descriptors} descriptors")


class Constraint:
def __init__(self, expression):
self._expression = expression

@property
def expression(self):
return self._expression

def _html_table_rows(self):
columns = []
columns.append("") #name column
columns.append("constraint") #type column
columns.append(self.expression) #description columns
columns.append("") #value column
return ''.join([f"<td>{column}</td>" for column in columns])
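
A sketch of constructing a constraint; treating the expression as a string over variable names is an assumption based on how it is rendered in the HTML table:

con = Constraint('temperature + 2*flowrate < 100')
con.expression  # 'temperature + 2*flowrate < 100'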


class Domain:
"""Representation of the optimization domain
@@ -353,8 +376,9 @@ class Domain:
>>> domain += ContinuousVariable('temperature', 'reaction temperature', [1, 100])
"""
-    def __init__(self, variables:Optional[List[Type[Variable]]]=[]):
+    def __init__(self, variables=[], constraints=[]):
        self._variables = variables
+        self._constraints = constraints
#Check that all the output variables continuous
self.raise_noncontinuous_outputs()

@@ -363,6 +387,20 @@ def variables(self):
"""[List[Type[Variable]]]: List of variables in the domain"""
return self._variables

@property
def constraints(self):
return self._constraints

# def __getitem__(self, key):
# '''For accessing variables like a dictionary'''
# for v in self.variables:
# if v.name == key:
# return v
# raise KeyError(f'No variable {key} found in the domain.')

# def _ipython_key_completions_(self):
# return [v.name for v in self.variables]

@property
def input_variables(self):
input_variables = []
@@ -482,10 +520,16 @@ def from_dict(domain_dict):
return Domain(variables)


-    def __add__(self, var):
-        if var.is_objective and var.variable_type != 'continuous':
-            DomainError("Output variables must be continuous")
-        return Domain(self._variables + [var])
+    def __add__(self, obj):
+        #TODO: make this work with adding arrays of variable or constraints
+        if isinstance(obj, Variable):
+            if obj.is_objective and obj.variable_type != 'continuous':
+                raise DomainError("Output variables must be continuous")
+            return Domain(variables=self._variables + [obj], constraints=self.constraints)
+        elif isinstance(obj, Constraint):
+            return Domain(variables=self.variables, constraints = self.constraints + [obj])
+        else:
+            raise RuntimeError('Not a supported domain object.')
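
A sketch of composing a domain from variables and a constraint with the extended __add__ (the names and the expression are illustrative):

domain = Domain()
domain += ContinuousVariable('temperature', 'reaction temperature', [1, 100])
domain += ContinuousVariable('flowrate', 'reagent flowrate', [0.1, 1.0])
domain += Constraint('temperature + 100*flowrate < 150')
# domain.variables now holds two variables and domain.constraints one constraint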

def _repr_html_(self):
"""Build html string for table display in jupyter notebooks.
@@ -510,7 +554,9 @@ def _repr_html_(self):
return ''.join(html)

def _html_table_rows(self):
-        return ''.join(map(lambda l: l._html_table_rows(), self._variables))
+        variables = ''.join([v._html_table_rows() for v in self.variables])
+        constraints = ''.join([c._html_table_rows() for c in self.constraints])
+        return variables + constraints


class DomainError(Exception):
89 changes: 70 additions & 19 deletions summit/models.py
@@ -1,12 +1,12 @@
from summit.initial_design.latin_designer import lhs
from summit.data import DataSet

from GPy.models import GPRegression
from GPy.kern import Matern52
import numpy as np
from numpy import matlib
import scipy

from abc import ABC, abstractmethod
from sklearn.base import BaseEstimator, RegressorMixin


class Model(ABC):
@@ -23,7 +23,32 @@ def fit(self, X, Y):
def predict(self, X):
pass

-class GPyModel(Model):
class ModelGroup:
def __init__(self, models: dict):
self._models = models

@property
def models(self):
return self._models

def fit(self, X, y, **kwargs):
for column_name, model in self.models.items():
model.fit(X, y[[column_name]])

def predict(self, X, **kwargs):
"""
Note
-----
This makes the assumption that each model returns an n_samples x 1 array
from the predict method.
"""
result = [model.predict(X)[:, 0] for model in self.models.values()]
return np.array(result).T

def __getitem__(self, key):
return self.models[key]
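
A sketch of fitting one surrogate per objective column (the column names and data are illustrative; X_train and y_train are DataSets whose DATA columns feed the models):

group = ModelGroup({'yield': GPyModel(input_dim=2),
                    'cost': GPyModel(input_dim=2)})
group.fit(X_train, y_train)    # each model is fit against its own output column
preds = group.predict(X_test)  # shape (n_samples, 2): one column per model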

+class GPyModel(BaseEstimator, RegressorMixin):
''' A Gaussian Process Regression model from GPy
This is implemented as an alternative to the sklearn
@@ -54,7 +79,8 @@ def __init__(self, kernel=None, input_dim=None,noise_var=1.0, optimizer=None):
else:
if not input_dim:
raise ValueError('input_dim must be specified if no kernel is specified.')
-            self._kernel = Matern52(input_dim = input_dim, ARD=True)
+            self.input_dim = input_dim
+            self._kernel = Matern52(input_dim = self.input_dim, ARD=True)
self._noise_var = noise_var
self._optimizer = optimizer
self._model = None
@@ -63,10 +89,10 @@ def fit(self, X, y, num_restarts=10, max_iters=2000, parallel=False):
"""Fit Gaussian process regression model.
Parameters
----------
-        X : array-like, shape = (n_samples, n_features)
-            Training data
-        y : array-like, shape = (n_samples, [n_output_dims])
-            Target values
+        X : DataSet
+            The data columns will be used as inputs for fitting the model
+        y : DataSet
+            The data columns will be used as outputs for fitting the model
num_restarts : int, optional (default=10)
The number of random restarts of the optimizer.
max_iters : int, optional (default=2000)
@@ -79,7 +105,25 @@ def fit(self, X, y, num_restarts=10, max_iters=2000, parallel=False):
self : returns an instance of self.
-----
"""
-        self._model = GPRegression(X,y, self._kernel, noise_var=self._noise_var)
#Standardize inputs and outputs
if isinstance(X, DataSet):
X_std, self.input_mean, self.input_std = X.standardize(return_mean=True, return_std=True)
elif isinstance(X, np.ndarray):
self.input_mean = np.mean(X,axis=0)
self.input_std = np.std(X, axis=0)
X_std = (X-self.input_mean)/self.input_std
X_std[abs(X_std) < 1e-5] = 0.0

if isinstance(y, DataSet):
y_std, self.output_mean, self.output_std = y.standardize(return_mean=True, return_std=True)
elif isinstance(y, np.ndarray):
self.output_mean = np.mean(y,axis=0)
self.output_std = np.std(y, axis=0)
y_std = (y-self.output_mean)/self.output_std
y_std[abs(y_std) < 1e-5] = 0.0

#Initialize and fit model
self._model = GPRegression(X_std,y_std, self._kernel, noise_var=self._noise_var)
if self._optimizer:
self._model.optimize_restarts(num_restarts = num_restarts,
verbose=False,
@@ -120,17 +164,24 @@ def predict(self, X,
raise RuntimeError(
"Not returning standard deviation of predictions when "
"returning full covariance.")

-        m, v = self._model.predict(X)
-
-        if return_cov:
-            result = m, v
-        elif return_std:
-            result = m, self._model.Kdiag(X)
-        else:
-            result = m
-
-        return result
if isinstance(X, np.ndarray):
X_std = (X-self.input_mean)/self.input_std
X_std[abs(X_std) < 1e-5] = 0.0
elif isinstance(X, DataSet):
X_std = X.standardize(mean=self.input_mean, std=self.input_std)

m_std, v_std = self._model.predict(X_std)
m = m_std*self.output_std + self.output_mean

# if return_cov:
# result = m, v
# elif return_std:
# result = m, self._model.Kdiag(X)
# else:
# result = m

return m
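
A sketch of the round trip this gives: fit on raw arrays, predict back on the original scale (the data is synthetic and illustrative):

import numpy as np
from summit.models import GPyModel  # assumed import path

X = np.random.uniform(0, 100, size=(20, 2))
y = (X[:, [0]] - X[:, [1]]) / 50.0
model = GPyModel(input_dim=2)
model.fit(X, y)                # inputs and outputs standardized internally
y_pred = model.predict(X[:5])  # mean predictions, de-standardized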

class AnalyticalModel(Model):
''' An analytical model instead of statistical model