Test for Bayesian Optimization Algo #406

Merged
14 commits merged on Mar 11, 2019
3 changes: 3 additions & 0 deletions .gitignore
@@ -1,6 +1,9 @@
# python ignore files
__pycache__/
.idea/
.coverage
.pytest_cache
*.egg-info

# Project specific ignore files
*.swp
36 changes: 36 additions & 0 deletions pkg/suggestion/bayesianoptimization/src/acquisition_func.py
@@ -0,0 +1,36 @@
""" module for acquisition function"""
import numpy as np
from scipy.stats import norm


class AcquisitionFunc:
"""
Class for acquisition function with options for expected improvement,
probability of improvement, or lower confident bound.
"""

def __init__(self, model, current_optimal, mode="ei", trade_off=0.01):
"""
:param mode: pi: probability of improvement, ei: expected improvement, lcb: lower confident bound
:param trade_off: a parameter to control the trade off between exploiting and exploring
:param model_type: gp: gaussian process, rf: random forest
"""
self.model = model
self.current_optimal = current_optimal
self.mode = mode
self.trade_off = trade_off

def compute(self, X_test):
y_mean, y_std, y_variance = self.model.predict(X_test)

z = (y_mean - self.current_optimal - self.trade_off) / y_std

if self.mode == "ei":
if y_std.any() < 0.000001:
return 0, y_mean, y_variance
result = y_std * (z * norm.cdf(z) + norm.pdf(z))
elif self.mode == "pi":
result = norm.cdf(z)
else:
result = - (y_mean - self.trade_off * y_std)
return np.squeeze(result), np.squeeze(y_mean), np.squeeze(y_variance)
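
For illustration, a minimal usage sketch of this class (hypothetical data; import paths follow this PR's layout):

import numpy as np
from pkg.suggestion.bayesianoptimization.src.model.gp import GaussianProcessModel
from pkg.suggestion.bayesianoptimization.src.acquisition_func import AcquisitionFunc

# toy observations: 20 points in a 2-D search space
X_train = np.random.rand(20, 2)
y_train = np.random.rand(20)

model = GaussianProcessModel()  # defaults: Matern kernel, nu=1.5
model.fit(X_train, y_train)

# score 5 candidate points by expected improvement over the incumbent best
acq = AcquisitionFunc(model, current_optimal=y_train.max(), mode="ei", trade_off=0.01)
ei, y_mean, y_variance = acq.compute(np.random.rand(5, 2))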

This file was deleted.

35 changes: 13 additions & 22 deletions pkg/suggestion/bayesianoptimization/src/algorithm_manager.py
@@ -1,39 +1,31 @@
""" module for algorithm manager """

import numpy as np

from pkg.api.python import api_pb2
import logging
from logging import getLogger, StreamHandler, INFO, DEBUG

from .utils import get_logger


def deal_with_discrete(feasible_values, current_value):
""" function to embed the current values to the feasible discrete space"""
diff = np.subtract(feasible_values, current_value)
diff = np.absolute(diff)
return feasible_values[np.argmin(diff)]


def deal_with_categorical(feasible_values, one_hot_values):
""" function to do the one hot encoding of the categorical values """
#index = np.argmax(one_hot_values)
index = one_hot_values.argmax()
index = np.argmax(one_hot_values)
#index = one_hot_values.argmax()
jdplatt marked this conversation as resolved.
Show resolved Hide resolved
return feasible_values[int(index)]
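
As a quick, hypothetical illustration of the two helpers above (values are made up):

import numpy as np
from pkg.suggestion.bayesianoptimization.src.algorithm_manager import (
    deal_with_discrete, deal_with_categorical)

feasible = np.array([1, 2, 4, 8])
print(deal_with_discrete(feasible, 5.2))  # -> 4, the closest feasible value

values = ["sgd", "adam", "ftrl"]
print(deal_with_categorical(values, np.array([0.1, 0.8, 0.1])))  # -> "adam", the one-hot argmax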


class AlgorithmManager:
""" class for the algorithm manager
provide some helper functions
"""
def __init__(self, study_id, study_config, X_train, y_train, logger=None):
if logger == None:
self.logger = getLogger(__name__)
FORMAT = '%(asctime)-15s StudyID %(studyid)s %(message)s'
logging.basicConfig(format=FORMAT)
handler = StreamHandler()
handler.setLevel(DEBUG)
self.logger.setLevel(DEBUG)
self.logger.addHandler(handler)
self.logger.propagate = False
else:
self.logger = logger
self.logger = logger if (logger is not None) else get_logger()
self._study_id = study_id
self._study_config = study_config
self._goal = self._study_config.optimization_type
@@ -82,7 +74,7 @@ def lower_bound(self):

    @property
    def upper_bound(self):
        """ return the ipper bound of all the parameters """
        """ return the upper bound of all the parameters """
        return self._upperbound

    @property
@@ -118,10 +110,10 @@ def y_train(self):
    def _parse_config(self):
        """ extract info from the study configuration """
        for i, param in enumerate(self._study_config.parameter_configs.configs):
            self._name_id[param.name]=i
            self._name_id[param.name] = i
            self._types.append(param.parameter_type)
            self._names.append(param.name)
            if param.parameter_type == api_pb2.DOUBLE or param.parameter_type == api_pb2.INT:
            if param.parameter_type in [api_pb2.DOUBLE, api_pb2.INT]:
                self._dim = self._dim + 1
                self._lowerbound.append(float(param.feasible.min))
                self._upperbound.append(float(param.feasible.max))
@@ -158,15 +150,15 @@ def _mapping_params(self, parameters_list):
            for p in parameters:
                self.logger.debug("mapping: %r", p, extra={"StudyID": self._study_id})
                map_id = self._name_id[p.name]
                if self._types[map_id] == api_pb2.DOUBLE or self._types[map_id] == api_pb2.INT or self._types[map_id] == api_pb2.DISCRETE:
                if self._types[map_id] in [api_pb2.DOUBLE, api_pb2.INT, api_pb2.DISCRETE]:
                    maplist[map_id] = float(p.value)
                elif self._types[map_id] == api_pb2.CATEGORICAL:
                    for ci in self._categorical_info:
                        if ci["name"] == p.name:
                            maplist[map_id] = np.zeros(ci["number"])
                            for i, v in enumerate(ci["values"]):
                                if v == p.value:
                                    maplist[map_id][i]=1
                                    maplist[map_id][i] = 1
                                    break
            self.logger.debug("mapped: %r", maplist, extra={"StudyID": self._study_id})
            ret.append(np.hstack(maplist))
@@ -234,4 +226,3 @@ def convert_to_dict(self, x_next):
            })
            result.append(tmp)
        return result

@@ -2,7 +2,7 @@
import numpy as np
from sklearn.preprocessing import MinMaxScaler

from pkg.suggestion.bayesianoptimization.src.global_optimizer.global_optimizer import GlobalOptimizer
from .global_optimizer import GlobalOptimizer


class BOAlgorithm:
@@ -54,7 +54,7 @@ def get_suggestion(self, request_num):
        x_next_list = []
        if self.X_train is None and self.y_train is None and self.current_optimal is None:
            # randomly pick a point as the first trial
            for i in range(request_num):
            for _ in range(request_num):
                x_next_list.append(np.random.uniform(self.lowerbound, self.upperbound, size=(1, self.dim)))
        else:
            _, x_next_list_que = self.optimizer.direct(request_num)
@@ -2,12 +2,13 @@
DIRECT algorithm is used in this case
"""
import copy

import numpy as np
from collections import deque

from pkg.suggestion.bayesianoptimization.src.acquisition_func.acquisition_func import AcquisitionFunc
import logging
from logging import getLogger, StreamHandler, INFO, DEBUG
from .acquisition_func import AcquisitionFunc
from .model.gp import GaussianProcessModel
from .model.rf import RandomForestModel
from .utils import get_logger


class RectPack:
@@ -74,37 +75,31 @@ class GlobalOptimizer:

    def __init__(self, N, l, u, scaler, X_train, y_train, current_optimal, mode, trade_off, length_scale,
                 noise, nu, kernel_type, n_estimators, max_features, model_type, logger=None):
        if logger == None:
            self.logger = getLogger(__name__)
            FORMAT = '%(asctime)-15s StudyID %(studyid)s %(message)s'
            logging.basicConfig(format=FORMAT)
            handler = StreamHandler()
            handler.setLevel(INFO)
            self.logger.setLevel(INFO)
            self.logger.addHandler(handler)
            self.logger.propagate = False
        else:
            self.logger = logger

        self.logger = logger if (logger is not None) else get_logger()
        self.N = N
        self.l = l
        self.u = u
        self.scaler = scaler
        self.buckets = []
        self.dim = None
        if model_type == "gp":
            model = GaussianProcessModel(
                length_scale=length_scale,
                noise=noise,
                nu=nu,
                kernel_type=kernel_type,
            )
        else:
            model = RandomForestModel(
                n_estimators=n_estimators,
                max_features=max_features,
            )
        model.fit(X_train, y_train)
        self.aq_func = AcquisitionFunc(
            X_train=X_train,
            y_train=y_train,
            model=model,
            current_optimal=current_optimal,
            mode=mode,
            trade_off=trade_off,
            length_scale=length_scale,
            noise=noise,
            nu=nu,
            kernel_type=kernel_type,
            n_estimators=n_estimators,
            max_features=max_features,
            model_type=model_type,
        )

    def potential_opt(self, f_min):
@@ -174,7 +169,7 @@ def direct(self, request_num):
        x_next = first_rect.center
        ei_min.append(f_min)

        for t in range(self.N):
        for _ in range(self.N):
            opt_set = self.potential_opt(f_min)

            # for bucket in self.buckets:
@@ -215,7 +210,7 @@ def sample_buckets(self, request_num):
            fc_sum -= a.fc
            bucket_index.append([-a.fc, a.center])
        bucket_index = sorted(bucket_index, key=lambda x: x[0])
        for i in range(request_num):
        for _ in range(request_num):
            sample = np.random.rand()
            stick = 0.0
            for b in bucket_index:
22 changes: 13 additions & 9 deletions pkg/suggestion/bayesianoptimization/src/model/gp.py
@@ -5,7 +5,8 @@

class GaussianProcessModel:
""" use the gaussian process as a prior """
def __init__(self, length_scale, noise, nu, kernel_type):
def __init__(self, length_scale=0.5, noise=0.00005,
nu=1.5, kernel_type="matern"):
"""
:param length_scale: the larger the length_scale is, the smoother the gaussian prior is. If a float,
an isotropic kernel is used. If an array, an anisotropic kernel is used where each dimension of it defines
@@ -15,20 +16,23 @@ def __init__(self, length_scale, noise, nu, kernel_type):
        approximate function is.
        :param kernel_type: "rbf": squared exponential kernel, "matern": Matern kernel.
        """

        length_scale = length_scale or 0.5
        noise = noise or 0.00005
        nu = nu or 1.5
        kernel_type = kernel_type or "matern"

        if kernel_type == "rbf":
            kernel = RBF(length_scale=length_scale)
        else:
        elif kernel_type == "matern":
            kernel = Matern(length_scale=length_scale, nu=nu)

        else:
            raise Exception("kernel_type must be 'rbf' or 'matern'")
        self.gp = GaussianProcessRegressor(
            kernel=kernel,
            alpha=noise,
            random_state=0,
            optimizer=None,
        )

    def fit(self, X_train, y_train):
        self.gp.fit(X_train, y_train)

    def predict(self, X_test):
        y_mean, y_std = self.gp.predict(X_test, return_std=True)
        y_variance = y_std ** 2
        return y_mean, y_std, y_variance
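
A short, hedged sketch of this surrogate on a toy 1-D problem (data is hypothetical):

import numpy as np
from pkg.suggestion.bayesianoptimization.src.model.gp import GaussianProcessModel

X_train = np.linspace(0, 1, 10).reshape(-1, 1)
y_train = np.sin(4 * X_train).ravel()

gp = GaussianProcessModel(kernel_type="rbf", length_scale=0.3)
gp.fit(X_train, y_train)

y_mean, y_std, y_variance = gp.predict(np.array([[0.25], [0.75]]))  # y_variance == y_std ** 2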
19 changes: 16 additions & 3 deletions pkg/suggestion/bayesianoptimization/src/model/rf.py
@@ -1,11 +1,24 @@
import numpy as np
import forestci as fci
from sklearn.ensemble import RandomForestRegressor


class RandomForestModel:
    def __init__(self, n_estimators, max_features):
        n_estimators = n_estimators or 50
        max_features = max_features or "auto"

    def __init__(self, n_estimators=50, max_features="auto"):
        self.rf = RandomForestRegressor(
            n_estimators=n_estimators,
            max_features=max_features,
        )
        self.X_train = None

    def fit(self, X_train, y_train):
        print(X_train.shape, y_train.shape)
        self.X_train = X_train
        self.rf.fit(X_train, y_train)

    def predict(self, X_test):
        y_mean = self.rf.predict(X_test)
        y_variance = fci.random_forest_error(self.rf, self.X_train, X_test)
        y_std = np.sqrt(y_variance)
        return y_mean, y_std, y_variance
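
A hedged usage sketch (hypothetical data); forestci estimates the predictive variance from the training set, which is why fit keeps a reference to X_train:

import numpy as np
from pkg.suggestion.bayesianoptimization.src.model.rf import RandomForestModel

X_train = np.random.rand(50, 3)
y_train = np.random.rand(50)

rf = RandomForestModel()  # defaults: 50 trees, max_features="auto"
rf.fit(X_train, y_train)

y_mean, y_std, y_variance = rf.predict(np.random.rand(5, 3))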
17 changes: 17 additions & 0 deletions pkg/suggestion/bayesianoptimization/src/utils.py
@@ -0,0 +1,17 @@
import os
import logging
from logging import getLogger, StreamHandler


FORMAT = '%(asctime)-15s StudyID %(studyid)s %(message)s'
LOG_LEVEL = os.environ.get("LOG_LEVEL", "INFO")


def get_logger(name=__name__):
    logger = getLogger(name)
    logging.basicConfig(format=FORMAT)
    handler = StreamHandler()
    logger.setLevel(LOG_LEVEL)
    logger.addHandler(handler)
    logger.propagate = False
    return logger
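
A hedged usage sketch; note that the format string above expects a lowercase "studyid" key supplied through the extra dict:

from pkg.suggestion.bayesianoptimization.src.utils import get_logger

logger = get_logger(__name__)
logger.info("starting suggestion loop", extra={"studyid": "abc123"})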