diff --git a/.appveyor.yml b/.appveyor.yml index 27481011927a..b1f25d356fb9 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -53,5 +53,5 @@ test_script: if (!$?) { $host.SetShouldExit(-1) } } # run all examples - cd %APPVEYOR_BUILD_FOLDER%\examples\python-guide\notebooks - - conda install -q -y -n test-env ipywidgets notebook + - conda install -q -y -n test-env ipywidgets notebook "tornado=5.1.1" - jupyter nbconvert --ExecutePreprocessor.timeout=180 --to notebook --execute --inplace *.ipynb # run all notebooks diff --git a/.ci/test.sh b/.ci/test.sh index e5633673fe7a..4c4bef36c911 100755 --- a/.ci/test.sh +++ b/.ci/test.sh @@ -139,6 +139,6 @@ matplotlib.use\(\"Agg\"\)\ sed -i'.bak' 's/graph.render(view=True)/graph.render(view=False)/' plot_example.py for f in *.py; do python $f || exit -1; done # run all examples cd $BUILD_DIRECTORY/examples/python-guide/notebooks - conda install -q -y -n $CONDA_ENV ipywidgets notebook + conda install -q -y -n $CONDA_ENV ipywidgets notebook "tornado=5.1.1" jupyter nbconvert --ExecutePreprocessor.timeout=180 --to notebook --execute --inplace *.ipynb || exit -1 # run all notebooks fi diff --git a/.ci/test_windows.ps1 b/.ci/test_windows.ps1 index bae9c25964ae..23963cc6c4df 100644 --- a/.ci/test_windows.ps1 +++ b/.ci/test_windows.ps1 @@ -38,6 +38,6 @@ if ($env:TASK -eq "regular") { python $file ; Check-Output $? } # run all examples cd $env:BUILD_SOURCESDIRECTORY/examples/python-guide/notebooks - conda install -q -y -n $env:CONDA_ENV ipywidgets notebook + conda install -q -y -n $env:CONDA_ENV ipywidgets notebook "tornado=5.1.1" jupyter nbconvert --ExecutePreprocessor.timeout=180 --to notebook --execute --inplace *.ipynb ; Check-Output $? # run all notebooks } diff --git a/.gitignore b/.gitignore index 4b9a81ee0578..1e5df998a9a8 100644 --- a/.gitignore +++ b/.gitignore @@ -393,3 +393,7 @@ lightgbm.Rcheck/ # Files generated by aspell **/*.bak + +# GraphViz artifacts +*.gv +*.gv.* diff --git a/examples/python-guide/advanced_example.py b/examples/python-guide/advanced_example.py index cef8ad6c944a..e6c9f3bdb637 100644 --- a/examples/python-guide/advanced_example.py +++ b/examples/python-guide/advanced_example.py @@ -163,6 +163,27 @@ def binary_error(preds, train_data): print('Finished 40 - 50 rounds with self-defined objective function and eval metric...') + +# another self-defined eval metric +# f(preds: array, train_data: Dataset) -> name: string, eval_result: float, is_higher_better: bool +# accuracy +def accuracy(preds, train_data): + labels = train_data.get_label() + return 'accuracy', np.mean(labels == (preds > 0.5)), True + + +gbm = lgb.train(params, + lgb_train, + num_boost_round=10, + init_model=gbm, + fobj=loglikelihood, + feval=lambda preds, train_data: [binary_error(preds, train_data), + accuracy(preds, train_data)], + valid_sets=lgb_eval) + +print('Finished 50 - 60 rounds with self-defined objective function ' + 'and multiple self-defined eval metrics...') + print('Starting a new training job...') diff --git a/examples/python-guide/plot_example.py b/examples/python-guide/plot_example.py index eca896982fa9..664a1543006f 100644 --- a/examples/python-guide/plot_example.py +++ b/examples/python-guide/plot_example.py @@ -50,10 +50,10 @@ ax = lgb.plot_importance(gbm, max_num_features=10) plt.show() -print('Plotting 84th tree...') # one tree use categorical feature to split -ax = lgb.plot_tree(gbm, tree_index=83, figsize=(20, 8), show_info=['split_gain']) +print('Plotting 54th tree...') # one tree use categorical feature to split +ax = lgb.plot_tree(gbm, tree_index=53, figsize=(15, 15), show_info=['split_gain']) plt.show() -print('Plotting 84th tree with graphviz...') -graph = lgb.create_tree_digraph(gbm, tree_index=83, name='Tree84') +print('Plotting 54th tree with graphviz...') +graph = lgb.create_tree_digraph(gbm, tree_index=53, name='Tree54') graph.render(view=True) diff --git a/examples/python-guide/sklearn_example.py b/examples/python-guide/sklearn_example.py index f6a2fc2c2170..b9dcb580ce07 100644 --- a/examples/python-guide/sklearn_example.py +++ b/examples/python-guide/sklearn_example.py @@ -51,11 +51,27 @@ def rmsle(y_true, y_pred): eval_metric=rmsle, early_stopping_rounds=5) + +# another self-defined eval metric +# f(y_true: array, y_pred: array) -> name: string, eval_result: float, is_higher_better: bool +# Relative Absolute Error (RAE) +def rae(y_true, y_pred): + return 'RAE', np.sum(np.abs(y_pred - y_true)) / np.sum(np.abs(np.mean(y_true) - y_true)), False + + +print('Starting training with multiple custom eval functions...') +# train +gbm.fit(X_train, y_train, + eval_set=[(X_test, y_test)], + eval_metric=lambda y_true, y_pred: [rmsle(y_true, y_pred), rae(y_true, y_pred)], + early_stopping_rounds=5) + print('Starting predicting...') # predict y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration_) # eval print('The rmsle of prediction is:', rmsle(y_test, y_pred)[1]) +print('The rae of prediction is:', rae(y_test, y_pred)[1]) # other scikit-learn modules estimator = lgb.LGBMRegressor(num_leaves=31) diff --git a/python-package/lightgbm/basic.py b/python-package/lightgbm/basic.py index 67fb218a1ce9..b0c0f1563c23 100644 --- a/python-package/lightgbm/basic.py +++ b/python-package/lightgbm/basic.py @@ -2437,6 +2437,11 @@ def get_split_value_histogram(self, feature, bins=None, xgboost_style=False): The feature name or index the histogram is calculated for. If int, interpreted as index. If string, interpreted as name. + + Note + ---- + Categorical features are not supported. + bins : int, string or None, optional (default=None) The maximum number of bins. If None, or int and > number of unique split values and ``xgboost_style=True``, @@ -2464,7 +2469,10 @@ def add(root): else: split_feature = root['split_feature'] if split_feature == feature: - values.append(root['threshold']) + if isinstance(root['threshold'], string_type): + raise LightGBMError('Cannot compute split value histogram for the categorical feature') + else: + values.append(root['threshold']) add(root['left_child']) add(root['right_child']) diff --git a/swig/lightgbmlib.i b/swig/lightgbmlib.i index 5eb045c114e7..902a6f64be37 100644 --- a/swig/lightgbmlib.i +++ b/swig/lightgbmlib.i @@ -1,10 +1,12 @@ /* lightgbmlib.i */ %module lightgbmlib %ignore LGBM_BoosterSaveModelToString; +%ignore LGBM_BoosterGetEvalNames; %{ /* Includes the header in the wrapper code */ #include "../include/LightGBM/export.h" #include "../include/LightGBM/utils/log.h" +#include "../include/LightGBM/utils/common.h" #include "../include/LightGBM/c_api.h" %} @@ -16,12 +18,32 @@ %inline %{ char * LGBM_BoosterSaveModelToStringSWIG(BoosterHandle handle, - int start_iteration, - int num_iteration, - int64_t buffer_len, - int64_t* out_len) { + int start_iteration, + int num_iteration, + int64_t buffer_len, + int64_t* out_len) { char* dst = new char[buffer_len]; int result = LGBM_BoosterSaveModelToString(handle, start_iteration, num_iteration, buffer_len, out_len, dst); + // Reallocate to use larger length + if (*out_len > buffer_len) { + delete [] dst; + int64_t realloc_len = *out_len; + dst = new char[realloc_len]; + result = LGBM_BoosterSaveModelToString(handle, start_iteration, num_iteration, realloc_len, out_len, dst); + } + if (result != 0) { + return nullptr; + } + return dst; + } + + char ** LGBM_BoosterGetEvalNamesSWIG(BoosterHandle handle, + int eval_counts) { + char** dst = new char*[eval_counts]; + for (int i = 0; i < eval_counts; ++i) { + dst[i] = new char[128]; + } + int result = LGBM_BoosterGetEvalNames(handle, &eval_counts, dst); if (result != 0) { return nullptr; } @@ -49,6 +71,7 @@ %array_functions(float, floatArray) %array_functions(int, intArray) %array_functions(long, longArray) +%array_functions(char *, stringArray) /* Custom pointer manipulation template */ %define %pointer_manipulation(TYPE,NAME) diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index 9688fe409e71..86cb403bfd6a 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -1246,17 +1246,17 @@ def test_model_size(self): def test_get_split_value_histogram(self): X, y = load_boston(True) - lgb_train = lgb.Dataset(X, y) + lgb_train = lgb.Dataset(X, y, categorical_feature=[2]) gbm = lgb.train({'verbose': -1}, lgb_train, num_boost_round=20) # test XGBoost-style return value params = {'feature': 0, 'xgboost_style': True} - self.assertTupleEqual(gbm.get_split_value_histogram(**params).shape, (10, 2)) - self.assertTupleEqual(gbm.get_split_value_histogram(bins=999, **params).shape, (10, 2)) + self.assertTupleEqual(gbm.get_split_value_histogram(**params).shape, (9, 2)) + self.assertTupleEqual(gbm.get_split_value_histogram(bins=999, **params).shape, (9, 2)) self.assertTupleEqual(gbm.get_split_value_histogram(bins=-1, **params).shape, (1, 2)) self.assertTupleEqual(gbm.get_split_value_histogram(bins=0, **params).shape, (1, 2)) self.assertTupleEqual(gbm.get_split_value_histogram(bins=1, **params).shape, (1, 2)) self.assertTupleEqual(gbm.get_split_value_histogram(bins=2, **params).shape, (2, 2)) - self.assertTupleEqual(gbm.get_split_value_histogram(bins=6, **params).shape, (6, 2)) + self.assertTupleEqual(gbm.get_split_value_histogram(bins=6, **params).shape, (5, 2)) self.assertTupleEqual(gbm.get_split_value_histogram(bins=7, **params).shape, (6, 2)) if lgb.compat.PANDAS_INSTALLED: np.testing.assert_almost_equal( @@ -1278,8 +1278,8 @@ def test_get_split_value_histogram(self): ) # test numpy-style return value hist, bins = gbm.get_split_value_histogram(0) - self.assertEqual(len(hist), 22) - self.assertEqual(len(bins), 23) + self.assertEqual(len(hist), 23) + self.assertEqual(len(bins), 24) hist, bins = gbm.get_split_value_histogram(0, bins=999) self.assertEqual(len(hist), 999) self.assertEqual(len(bins), 1000) @@ -1317,6 +1317,8 @@ def test_get_split_value_histogram(self): mask = hist_vals > 0 np.testing.assert_array_equal(hist_vals[mask], hist[:, 1]) np.testing.assert_almost_equal(bin_edges[1:][mask], hist[:, 0]) + # test histogram is disabled for categorical features + self.assertRaises(lgb.basic.LightGBMError, gbm.get_split_value_histogram, 2) def test_early_stopping_for_only_first_metric(self): X, y = load_boston(True)