paper.bib

% Review article on Bayesian optimization, Proceedings of the IEEE 104(1).
% NOTE(review): the exporter-injected arXiv id (1011.1669v3) and PMID did not
% belong to this paper and were dropped; the ISSN had been duplicated in `isbn`.
@article{Shahriari2016,
abstract = {Big data applications are typically associated with systems involving large numbers of users, massive complex software systems, and large-scale heterogeneous computing and storage architectures. The construction of such systems involves many distributed design choices. The end products (e.g., recommendation systems, medical analysis tools, real-time game engines, speech recognizers) thus involve many tunable configuration parameters. These parameters are often specified and hard-coded into the software by various developers or teams. If optimized jointly, these parameters can result in significant improvements. Bayesian optimization is a powerful tool for the joint optimization of design choices that is gaining great popularity in recent years. It promises greater automation so as to increase both product quality and human productivity. This review paper introduces Bayesian optimization, highlights some of its methodological aspects, and showcases a wide range of applications.},
author = {Shahriari, Bobak and Swersky, Kevin and Wang, Ziyu and Adams, Ryan P. and de Freitas, Nando},
doi = {10.1109/JPROC.2015.2494218},
file = {:home/jose/BayesOptLoop (1).pdf:pdf},
issn = {0018-9219},
journal = {Proceedings of the IEEE},
keywords = {decision making,design of experiments,genomic medicine,optimization,response surface methodology,statistical learning},
number = {1},
pages = {148--175},
title = {Taking the Human Out of the Loop: A Review of {Bayesian} Optimization},
volume = {104},
year = {2016}
}
% Conference paper (NIPS 2012) on Bayesian optimization of ML hyperparameters.
% NOTE(review): the former `doi` value was an ADS bibcode, not a DOI, and the
% PMID was exporter junk; both were dropped. Page range should be confirmed
% against the published proceedings.
@inproceedings{Snoek2012,
abstract = {Machine learning algorithms frequently require careful tuning of model hyperparameters, regularization terms, and optimization parameters. Unfortunately, this tuning is often a ``black art'' that requires expert experience, unwritten rules of thumb, or sometimes brute-force {\ldots}},
archivePrefix = {arXiv},
arxivId = {1206.2944},
author = {Snoek, Jasper and Larochelle, Hugo and Adams, Ryan P.},
booktitle = {Advances in Neural Information Processing Systems 25},
eprint = {1206.2944},
file = {:home/jose/1206.2944.pdf:pdf},
isbn = {9781627480031},
issn = {1049-5258},
pages = {2951--2959},
title = {Practical {Bayesian} Optimization of Machine Learning Algorithms},
url = {http://papers.nips.cc/paper/4522-practical-bayesian-optimization-of-machine-learning-algorithms},
year = {2012}
}
% Journal article (JMLR vol. 13) comparing random search with grid search.
% NOTE(review): the exported DOI, arXiv id and PMID pointed at unrelated works
% and were dropped; author names were repaired from a mangled export.
@article{Bergstra2012,
abstract = {Grid search and manual search are the most widely used strategies for hyper-parameter optimization. This paper shows empirically and theoretically that randomly chosen trials are more efficient for hyper-parameter optimization than trials on a grid. Empirical evidence comes from a comparison with a large previous study that used grid search and manual search to configure neural networks and deep belief networks. Compared with neural networks configured by a pure grid search, we find that random search over the same domain is able to find models that are as good or better within a small fraction of the computation time. Granting random search the same computational budget, random search finds better models by effectively searching a larger, less promising configuration space. Compared with deep belief networks configured by a thoughtful combination of manual search and grid search, purely random search over the same 32-dimensional configuration space found statistically equal performance on four of seven data sets, and superior performance on one of seven. A Gaussian process analysis of the function from hyper-parameters to validation set performance reveals that for most data sets only a few of the hyper-parameters really matter, but that different hyper-parameters are important on different data sets. This phenomenon makes grid search a poor choice for configuring algorithms for new data sets. Our analysis casts some light on why recent ``High Throughput'' methods achieve surprising success---they appear to search through a large number of hyper-parameters because most hyper-parameters do not matter much. We anticipate that growing interest in large hierarchical models will place an increasing burden on techniques for hyper-parameter optimization; this work shows that random search is a natural baseline against which to judge progress in the development of adaptive (sequential) hyper-parameter optimization algorithms.},
author = {Bergstra, James and Bengio, Yoshua},
issn = {1532-4435},
journal = {Journal of Machine Learning Research},
keywords = {deep learning,global optimization,model selection,neural networks,response surface modeling},
pages = {281--305},
title = {Random Search for Hyper-Parameter Optimization},
url = {http://www.jmlr.org/papers/v13/bergstra12a.html},
volume = {13},
year = {2012}
}
% Spearmint software repository (HIPS/Spearmint on GitHub).
@misc{SpearmintSnoek2012,
  author   = {Snoek, Jasper},
  title    = {Spearmint},
  year     = 2012,
  url      = {https://github.com/HIPS/Spearmint},
  abstract = {Spearmint}
}
% scikit-optimize software repository.
% The author is a collective: the extra brace pair keeps BibTeX from splitting
% it into given/family name parts, and the project name keeps its lowercase form.
@misc{scikitoptimize,
abstract = {scikit-optimize},
author = {{The scikit-optimize team}},
title = {{scikit-optimize}},
url = {https://github.com/scikit-optimize/scikit-optimize},
year = {2016}
}

% MOE software repository published by Yelp.
% The acronym title is brace-protected so sentence-casing styles do not render
% it as "Moe"; the corporate author is braced to stay a single name.
@misc{yelpmoe,
abstract = {MOE},
author = {{Yelp}},
title = {{MOE}},
url = {https://github.com/Yelp/MOE},
year = {2014}
}

% Conference paper introducing Predictive Entropy Search (PES).
% NOTE(review): the 2014 NIPS volume is 27 (the export said 28); the exported
% page range 1--9 was a PDF page count -- confirm 918--926 against the proceedings.
@inproceedings{Hernandez-Lobato2014,
abstract = {We propose a novel information-theoretic approach for Bayesian optimization called Predictive Entropy Search (PES). At each iteration, PES selects the next evaluation point that maximizes the expected information gained with respect to the global maximum. PES codifies this intractable acquisition function in terms of the expected reduction in the differential entropy of the predictive distribution. This reformulation allows PES to obtain approximations that are both more accurate and efficient than other alternatives such as Entropy Search (ES). Furthermore, PES can easily perform a fully Bayesian treatment of the model hyperparameters while ES cannot. We evaluate PES in both synthetic and real-world applications, including optimization problems in machine learning, finance, biotechnology, and robotics. We show that the increased accuracy of PES leads to significant gains in optimization performance.},
archivePrefix = {arXiv},
arxivId = {1406.2541},
author = {Hern{\'{a}}ndez-Lobato, Jos{\'{e}} Miguel and Hoffman, Matthew W. and Ghahramani, Zoubin},
booktitle = {Advances in Neural Information Processing Systems 27},
eprint = {1406.2541},
issn = {1049-5258},
pages = {918--926},
title = {Predictive Entropy Search for Efficient Global Optimization of Black-Box Functions},
url = {https://jmhldotorg.files.wordpress.com/2014/10/pes-final.pdf},
year = {2014}
}

% Conference paper (ICML 2014) on Bayesian optimization under inequality constraints.
% The proceedings title lives in `booktitle`; `volume` is the volume number of
% the proceedings series (NOTE(review): series name assumed to be PMLR -- confirm).
@inproceedings{Gardner2014,
abstract = {Bayesian optimization is a powerful framework for minimizing expensive objective functions while using very few function evaluations. It has been successfully applied to a variety of problems, including hyperparameter tuning and experimental design. However, this framework has not been extended to the inequality-constrained optimization setting, particularly the setting in which evaluating feasibility is just as expensive as evaluating the objective. Here we present constrained Bayesian optimization, which places a prior distribution on both the objective and the constraint functions. We evaluate our method on simulated and real data, demonstrating that constrained Bayesian optimization can quickly find optimal and feasible points, even when small feasible regions cause standard methods to fail.},
author = {Gardner, Jacob R. and Kusner, Matt J. and Xu, Zhixiang Eddie and Weinberger, Kilian Q. and Cunningham, John P.},
booktitle = {Proceedings of the 31st International Conference on Machine Learning},
isbn = {9781634393973},
pages = {937--945},
series = {Proceedings of Machine Learning Research},
title = {{Bayesian} Optimization with Inequality Constraints},
volume = {32},
year = {2014}
}

% Conference paper (NIPS 2011) with an empirical study of Thompson sampling.
% The page range uses a proper en-dash (`--`); the export had four hyphens.
@inproceedings{Chapelle2011,
abstract = {Thompson sampling is one of oldest heuristic to address the exploration/exploitation trade-off, but it is surprisingly unpopular in the literature. We present here some empirical results using Thompson sampling on simulated and real data, and show that it is highly competitive. And since this heuristic is very easy to implement, we argue that it should be part of the standard baselines to compare against.},
author = {Chapelle, Olivier and Li, Lihong},
booktitle = {Advances in Neural Information Processing Systems 24},
isbn = {9781618395993},
pages = {2249--2257},
title = {An Empirical Evaluation of {Thompson} Sampling},
url = {http://explo.cs.ucl.ac.uk/wp-content/uploads/2011/05/An-Empirical-Evaluation-of-Thompson-Sampling-Chapelle-Li-2011.pdf},
year = {2011}
}

% Conference paper (RecSys '14) on ranking recommendation lists with Gaussian-process bandits.
% Acronyms and proper nouns are brace-protected so sentence-casing styles keep them.
@inproceedings{Vanchinathan2014,
abstract = {We address the challenge of ranking recommendation lists based on click feedback by efficiently encoding similarities among users and among items. The key challenges are threefold: (1) combinatorial number of lists; (2) sparse feedback and (3) context dependent recommendations. We propose the CGPRANK algorithm, which exploits prior information specified in terms of a Gaussian process kernel function, which allows to share feedback in three ways: Between positions in a list, between items, and between contexts. Under our model, we provide strong performance guarantees and empirically evaluate our algorithm on data from two large scale recommendation tasks: Yahoo! news article recommendation, and Google books. In our experiments, CGPRANK significantly outperforms state-of-the-art multi-armed bandit and learning-to-rank methods, with an 18{\%} increase in clicks.},
author = {Vanchinathan, Hastagiri P. and Nikolic, Isidor and {De Bona}, Fabio and Krause, Andreas},
booktitle = {Proceedings of the 8th {ACM} Conference on Recommender Systems ({RecSys} '14)},
doi = {10.1145/2645710.2645733},
isbn = {9781450326681},
pages = {225--232},
publisher = {ACM},
title = {Explore-Exploit in Top-{N} Recommender Systems via {Gaussian} Processes},
url = {http://dl.acm.org/citation.cfm?doid=2645710.2645733},
year = {2014}
}


% Journal article (PeerJ Computer Science) introducing the PyMC3 package.
% The DOI is taken from the existing doi.org URL; `pages` holds the article
% number, as this venue has no page ranges.
@article{Salvatier2016,
abstract = {Probabilistic programming allows for automatic Bayesian inference on user-defined probabilistic models. Recent advances in Markov chain Monte Carlo (MCMC) sampling allow inference on increasingly complex models. This class of MCMC, known as Hamiltonian Monte Carlo, requires gradient information which is often not readily available. PyMC3 is a new open source probabilistic programming framework written in Python that uses Theano to compute gradients via automatic differentiation as well as compile probabilistic programs on-the-fly to C for increased speed. Contrary to other probabilistic programming languages, PyMC3 allows model specification directly in Python code. The lack of a domain specific language allows for great flexibility and direct interaction with the model. This paper is a tutorial-style introduction to this software package.},
author = {Salvatier, John and Wiecki, Thomas V. and Fonnesbeck, Christopher},
doi = {10.7717/peerj-cs.55},
journal = {PeerJ Computer Science},
pages = {e55},
title = {Probabilistic Programming in {Python} Using {PyMC3}},
url = {https://doi.org/10.7717/peerj-cs.55},
volume = {2},
year = {2016}
}


% Textbook "Gaussian Processes for Machine Learning" (MIT Press, ISBN 026218253X).
% NOTE(review): the exported record had merged this book with a 2004 journal
% article of the same title (DOI 10.1142/S0129065704001899). The article-only
% fields (booktitle, doi, issn, volume, number, pages, pmid), the mixed abstract
% and the ISBN stuffed into the arXiv fields were dropped. The citation key is
% kept for compatibility even though the book's year is 2006.
@book{Rasmussen2004,
address = {Cambridge, MA},
author = {Rasmussen, Carl Edward and Williams, Christopher K. I.},
isbn = {026218253X},
keywords = {gaussian processes,machine learning},
publisher = {The MIT Press},
series = {Adaptive Computation and Machine Learning},
title = {{Gaussian} Processes for Machine Learning},
url = {http://www.gaussianprocess.org/gpml/chapters/RW.pdf},
year = {2006}
}