From 7ab94e6bba68d4f75c864503fdb269a70def59e6 Mon Sep 17 00:00:00 2001 From: Nikita Titov Date: Thu, 14 Mar 2019 05:51:48 +0300 Subject: [PATCH 1/5] [examples] updated tree index with categorical feature (#2044) * updated gitignore * updated tree index with cat feature --- .gitignore | 4 ++++ examples/python-guide/plot_example.py | 8 ++++---- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/.gitignore b/.gitignore index 4b9a81ee0578..1e5df998a9a8 100644 --- a/.gitignore +++ b/.gitignore @@ -393,3 +393,7 @@ lightgbm.Rcheck/ # Files generated by aspell **/*.bak + +# GraphViz artifacts +*.gv +*.gv.* diff --git a/examples/python-guide/plot_example.py b/examples/python-guide/plot_example.py index eca896982fa9..664a1543006f 100644 --- a/examples/python-guide/plot_example.py +++ b/examples/python-guide/plot_example.py @@ -50,10 +50,10 @@ ax = lgb.plot_importance(gbm, max_num_features=10) plt.show() -print('Plotting 84th tree...') # one tree use categorical feature to split -ax = lgb.plot_tree(gbm, tree_index=83, figsize=(20, 8), show_info=['split_gain']) +print('Plotting 54th tree...') # one tree use categorical feature to split +ax = lgb.plot_tree(gbm, tree_index=53, figsize=(15, 15), show_info=['split_gain']) plt.show() -print('Plotting 84th tree with graphviz...') -graph = lgb.create_tree_digraph(gbm, tree_index=83, name='Tree84') +print('Plotting 54th tree with graphviz...') +graph = lgb.create_tree_digraph(gbm, tree_index=53, name='Tree54') graph.render(view=True) From ffb134cc31cc6ec7a456257fcdae9054ded54559 Mon Sep 17 00:00:00 2001 From: Nikita Titov Date: Thu, 14 Mar 2019 05:52:13 +0300 Subject: [PATCH 2/5] [python] disabled split value histogram for categorical features (#2045) * disabled split value histogram for categorical features * updated test for cat. feature * updated docs --- python-package/lightgbm/basic.py | 10 +++++++++- tests/python_package_test/test_engine.py | 14 ++++++++------ 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/python-package/lightgbm/basic.py b/python-package/lightgbm/basic.py index 67fb218a1ce9..b0c0f1563c23 100644 --- a/python-package/lightgbm/basic.py +++ b/python-package/lightgbm/basic.py @@ -2437,6 +2437,11 @@ def get_split_value_histogram(self, feature, bins=None, xgboost_style=False): The feature name or index the histogram is calculated for. If int, interpreted as index. If string, interpreted as name. + + Note + ---- + Categorical features are not supported. + bins : int, string or None, optional (default=None) The maximum number of bins. 
If None, or int and > number of unique split values and ``xgboost_style=True``, @@ -2464,7 +2469,10 @@ def add(root): else: split_feature = root['split_feature'] if split_feature == feature: - values.append(root['threshold']) + if isinstance(root['threshold'], string_type): + raise LightGBMError('Cannot compute split value histogram for the categorical feature') + else: + values.append(root['threshold']) add(root['left_child']) add(root['right_child']) diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index 04f911e00182..c5c6e0ef7f0b 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -1245,17 +1245,17 @@ def test_model_size(self): def test_get_split_value_histogram(self): X, y = load_boston(True) - lgb_train = lgb.Dataset(X, y) + lgb_train = lgb.Dataset(X, y, categorical_feature=[2]) gbm = lgb.train({'verbose': -1}, lgb_train, num_boost_round=20) # test XGBoost-style return value params = {'feature': 0, 'xgboost_style': True} - self.assertTupleEqual(gbm.get_split_value_histogram(**params).shape, (10, 2)) - self.assertTupleEqual(gbm.get_split_value_histogram(bins=999, **params).shape, (10, 2)) + self.assertTupleEqual(gbm.get_split_value_histogram(**params).shape, (9, 2)) + self.assertTupleEqual(gbm.get_split_value_histogram(bins=999, **params).shape, (9, 2)) self.assertTupleEqual(gbm.get_split_value_histogram(bins=-1, **params).shape, (1, 2)) self.assertTupleEqual(gbm.get_split_value_histogram(bins=0, **params).shape, (1, 2)) self.assertTupleEqual(gbm.get_split_value_histogram(bins=1, **params).shape, (1, 2)) self.assertTupleEqual(gbm.get_split_value_histogram(bins=2, **params).shape, (2, 2)) - self.assertTupleEqual(gbm.get_split_value_histogram(bins=6, **params).shape, (6, 2)) + self.assertTupleEqual(gbm.get_split_value_histogram(bins=6, **params).shape, (5, 2)) self.assertTupleEqual(gbm.get_split_value_histogram(bins=7, **params).shape, (6, 2)) if lgb.compat.PANDAS_INSTALLED: np.testing.assert_almost_equal( @@ -1277,8 +1277,8 @@ def test_get_split_value_histogram(self): ) # test numpy-style return value hist, bins = gbm.get_split_value_histogram(0) - self.assertEqual(len(hist), 22) - self.assertEqual(len(bins), 23) + self.assertEqual(len(hist), 23) + self.assertEqual(len(bins), 24) hist, bins = gbm.get_split_value_histogram(0, bins=999) self.assertEqual(len(hist), 999) self.assertEqual(len(bins), 1000) @@ -1316,3 +1316,5 @@ def test_get_split_value_histogram(self): mask = hist_vals > 0 np.testing.assert_array_equal(hist_vals[mask], hist[:, 1]) np.testing.assert_almost_equal(bin_edges[1:][mask], hist[:, 0]) + # test histogram is disabled for categorical features + self.assertRaises(lgb.basic.LightGBMError, gbm.get_split_value_histogram, 2) From 74ce2cfe9418b0def315c5e17f6244eefac64a60 Mon Sep 17 00:00:00 2001 From: Nikita Titov Date: Thu, 14 Mar 2019 05:56:50 +0300 Subject: [PATCH 3/5] added examples for multiple custom metrics (#2021) --- examples/python-guide/advanced_example.py | 21 +++++++++++++++++++++ examples/python-guide/sklearn_example.py | 16 ++++++++++++++++ 2 files changed, 37 insertions(+) diff --git a/examples/python-guide/advanced_example.py b/examples/python-guide/advanced_example.py index cef8ad6c944a..e6c9f3bdb637 100644 --- a/examples/python-guide/advanced_example.py +++ b/examples/python-guide/advanced_example.py @@ -163,6 +163,27 @@ def binary_error(preds, train_data): print('Finished 40 - 50 rounds with self-defined objective function and eval metric...') + +# another 
self-defined eval metric +# f(preds: array, train_data: Dataset) -> name: string, eval_result: float, is_higher_better: bool +# accuracy +def accuracy(preds, train_data): + labels = train_data.get_label() + return 'accuracy', np.mean(labels == (preds > 0.5)), True + + +gbm = lgb.train(params, + lgb_train, + num_boost_round=10, + init_model=gbm, + fobj=loglikelihood, + feval=lambda preds, train_data: [binary_error(preds, train_data), + accuracy(preds, train_data)], + valid_sets=lgb_eval) + +print('Finished 50 - 60 rounds with self-defined objective function ' + 'and multiple self-defined eval metrics...') + print('Starting a new training job...') diff --git a/examples/python-guide/sklearn_example.py b/examples/python-guide/sklearn_example.py index f6a2fc2c2170..b9dcb580ce07 100644 --- a/examples/python-guide/sklearn_example.py +++ b/examples/python-guide/sklearn_example.py @@ -51,11 +51,27 @@ def rmsle(y_true, y_pred): eval_metric=rmsle, early_stopping_rounds=5) + +# another self-defined eval metric +# f(y_true: array, y_pred: array) -> name: string, eval_result: float, is_higher_better: bool +# Relative Absolute Error (RAE) +def rae(y_true, y_pred): + return 'RAE', np.sum(np.abs(y_pred - y_true)) / np.sum(np.abs(np.mean(y_true) - y_true)), False + + +print('Starting training with multiple custom eval functions...') +# train +gbm.fit(X_train, y_train, + eval_set=[(X_test, y_test)], + eval_metric=lambda y_true, y_pred: [rmsle(y_true, y_pred), rae(y_true, y_pred)], + early_stopping_rounds=5) + print('Starting predicting...') # predict y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration_) # eval print('The rmsle of prediction is:', rmsle(y_test, y_pred)[1]) +print('The rae of prediction is:', rae(y_test, y_pred)[1]) # other scikit-learn modules estimator = lgb.LGBMRegressor(num_leaves=31) From b020a25d3dbfcd9f1128352ca1582abdf67ff87b Mon Sep 17 00:00:00 2001 From: Nikita Titov Date: Thu, 14 Mar 2019 13:45:48 +0300 Subject: [PATCH 4/5] [ci] compatibility hotfix for notebook execution (#2048) * ci fix * ci fix for Appveyor * actually firx Appveyor --- .appveyor.yml | 2 +- .ci/test.sh | 2 +- .ci/test_windows.ps1 | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.appveyor.yml b/.appveyor.yml index 27481011927a..b1f25d356fb9 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -53,5 +53,5 @@ test_script: if (!$?) 
{ $host.SetShouldExit(-1) } } # run all examples - cd %APPVEYOR_BUILD_FOLDER%\examples\python-guide\notebooks - - conda install -q -y -n test-env ipywidgets notebook + - conda install -q -y -n test-env ipywidgets notebook "tornado=5.1.1" - jupyter nbconvert --ExecutePreprocessor.timeout=180 --to notebook --execute --inplace *.ipynb # run all notebooks diff --git a/.ci/test.sh b/.ci/test.sh index e5633673fe7a..4c4bef36c911 100755 --- a/.ci/test.sh +++ b/.ci/test.sh @@ -139,6 +139,6 @@ matplotlib.use\(\"Agg\"\)\ sed -i'.bak' 's/graph.render(view=True)/graph.render(view=False)/' plot_example.py for f in *.py; do python $f || exit -1; done # run all examples cd $BUILD_DIRECTORY/examples/python-guide/notebooks - conda install -q -y -n $CONDA_ENV ipywidgets notebook + conda install -q -y -n $CONDA_ENV ipywidgets notebook "tornado=5.1.1" jupyter nbconvert --ExecutePreprocessor.timeout=180 --to notebook --execute --inplace *.ipynb || exit -1 # run all notebooks fi diff --git a/.ci/test_windows.ps1 b/.ci/test_windows.ps1 index bae9c25964ae..23963cc6c4df 100644 --- a/.ci/test_windows.ps1 +++ b/.ci/test_windows.ps1 @@ -38,6 +38,6 @@ if ($env:TASK -eq "regular") { python $file ; Check-Output $? } # run all examples cd $env:BUILD_SOURCESDIRECTORY/examples/python-guide/notebooks - conda install -q -y -n $env:CONDA_ENV ipywidgets notebook + conda install -q -y -n $env:CONDA_ENV ipywidgets notebook "tornado=5.1.1" jupyter nbconvert --ExecutePreprocessor.timeout=180 --to notebook --execute --inplace *.ipynb ; Check-Output $? # run all notebooks } From 95246cdaec4feb739f3725e3d0fbf2efeb37bc85 Mon Sep 17 00:00:00 2001 From: Ilya Matiach Date: Sat, 16 Mar 2019 02:29:41 -0400 Subject: [PATCH 5/5] lightgbm SWIG Java wrapper changes needed to add early stopping in mmlspark (#2047) * lightgbm SWIG Java wrapper changes needed to add early stopping in mmlspark * updated based on comments --- swig/lightgbmlib.i | 31 +++++++++++++++++++++++++++---- 1 file changed, 27 insertions(+), 4 deletions(-) diff --git a/swig/lightgbmlib.i b/swig/lightgbmlib.i index 5eb045c114e7..902a6f64be37 100644 --- a/swig/lightgbmlib.i +++ b/swig/lightgbmlib.i @@ -1,10 +1,12 @@ /* lightgbmlib.i */ %module lightgbmlib %ignore LGBM_BoosterSaveModelToString; +%ignore LGBM_BoosterGetEvalNames; %{ /* Includes the header in the wrapper code */ #include "../include/LightGBM/export.h" #include "../include/LightGBM/utils/log.h" +#include "../include/LightGBM/utils/common.h" #include "../include/LightGBM/c_api.h" %} @@ -16,12 +18,32 @@ %inline %{ char * LGBM_BoosterSaveModelToStringSWIG(BoosterHandle handle, - int start_iteration, - int num_iteration, - int64_t buffer_len, - int64_t* out_len) { + int start_iteration, + int num_iteration, + int64_t buffer_len, + int64_t* out_len) { char* dst = new char[buffer_len]; int result = LGBM_BoosterSaveModelToString(handle, start_iteration, num_iteration, buffer_len, out_len, dst); + // Reallocate to use larger length + if (*out_len > buffer_len) { + delete [] dst; + int64_t realloc_len = *out_len; + dst = new char[realloc_len]; + result = LGBM_BoosterSaveModelToString(handle, start_iteration, num_iteration, realloc_len, out_len, dst); + } + if (result != 0) { + return nullptr; + } + return dst; + } + + char ** LGBM_BoosterGetEvalNamesSWIG(BoosterHandle handle, + int eval_counts) { + char** dst = new char*[eval_counts]; + for (int i = 0; i < eval_counts; ++i) { + dst[i] = new char[128]; + } + int result = LGBM_BoosterGetEvalNames(handle, &eval_counts, dst); if (result != 0) { return nullptr; } @@ -49,6 +71,7 
@@ %array_functions(float, floatArray) %array_functions(int, intArray) %array_functions(long, longArray) +%array_functions(char *, stringArray) /* Custom pointer manipulation template */ %define %pointer_manipulation(TYPE,NAME)
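
A minimal Python sketch of the behaviour changed in PATCH 2/5, mirroring the new assertion in test_engine.py (the Boston dataset and column index 2 are simply the setup that test uses): get_split_value_histogram() now raises LightGBMError as soon as it meets a string-typed, i.e. categorical, threshold for the requested feature, instead of mixing such thresholds into the histogram.

    from sklearn.datasets import load_boston
    import lightgbm as lgb

    X, y = load_boston(True)
    # declare the third column as categorical so trees may split on it categorically
    lgb_train = lgb.Dataset(X, y, categorical_feature=[2])
    gbm = lgb.train({'verbose': -1}, lgb_train, num_boost_round=20)

    hist, bins = gbm.get_split_value_histogram(0)   # numerical feature: works as before
    try:
        gbm.get_split_value_histogram(2)            # categorical feature: now rejected
    except lgb.basic.LightGBMError as err:
        print(err)  # 'Cannot compute split value histogram for the categorical feature'

Note that the error is raised only once a categorical split on the requested feature is actually found in the dumped model.

PATCH 3/5 adds examples rather than new API: both the Booster feval hook and the sklearn-wrapper eval_metric argument already accept a callable that returns a list of (name, value, is_higher_better) tuples. A condensed sketch of that pattern, assuming metric_a and metric_b are hypothetical functions of the form f(preds, train_data) -> (name, value, is_higher_better) and that params, lgb_train and lgb_eval are defined as in advanced_example.py:

    # metric_a / metric_b are placeholders for any custom eval functions
    gbm = lgb.train(params, lgb_train,
                    num_boost_round=10,
                    feval=lambda preds, train_data: [metric_a(preds, train_data),
                                                     metric_b(preds, train_data)],
                    valid_sets=lgb_eval)

In the sklearn wrapper the same is done through eval_metric with callables of the form f(y_true, y_pred) -> (name, value, is_higher_better), as shown in sklearn_example.py above.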