diff --git a/include/LightGBM/boosting.h b/include/LightGBM/boosting.h index 31bb430f0aed..c329f5e12186 100644 --- a/include/LightGBM/boosting.h +++ b/include/LightGBM/boosting.h @@ -166,10 +166,11 @@ class LIGHTGBM_EXPORT Boosting { * \brief Feature contributions for the model's prediction of one record * \param feature_values Feature value on this record * \param output Prediction result for this record - * \param early_stop Early stopping instance. If nullptr, no early stopping is applied and all models are evaluated. */ - virtual void PredictContrib(const double* features, double* output, - const PredictionEarlyStopInstance* early_stop) const = 0; + virtual void PredictContrib(const double* features, double* output) const = 0; + + virtual void PredictContribByMap(const std::unordered_map<int, double>& features, + std::vector<std::unordered_map<int, double>>& output) const = 0; /*! * \brief Dump model to json format string diff --git a/include/LightGBM/c_api.h b/include/LightGBM/c_api.h index 6a30fce495c5..c7bbd4d13c95 100644 --- a/include/LightGBM/c_api.h +++ b/include/LightGBM/c_api.h @@ -727,6 +727,45 @@ LIGHTGBM_C_EXPORT int LGBM_BoosterPredictForCSR(BoosterHandle handle, int64_t* out_len, double* out_result); +/*! + * \brief Make sparse prediction for a new dataset in CSR format. Currently only used for feature contributions. + * \note + * The outputs are pre-allocated, as they can vary for each invocation, but the shape should be the same: + * - for feature contributions, the shape of sparse matrix will be ``num_class * num_data * (num_feature + 1)``. 
+ * \param handle Handle of booster + * \param indptr Pointer to row headers + * \param indptr_type Type of ``indptr``, can be ``C_API_DTYPE_INT32`` or ``C_API_DTYPE_INT64`` + * \param indices Pointer to column indices + * \param data Pointer to the data space + * \param data_type Type of ``data`` pointer, can be ``C_API_DTYPE_FLOAT32`` or ``C_API_DTYPE_FLOAT64`` + * \param nindptr Number of rows in the matrix + 1 + * \param nelem Number of nonzero elements in the matrix + * \param num_col Number of columns + * \param predict_type What should be predicted, only feature contributions supported currently + * - ``C_API_PREDICT_CONTRIB``: feature contributions (SHAP values) + * \param num_iteration Number of iterations for prediction, <= 0 means no limit + * \param parameter Other parameters for prediction, e.g. early stopping for prediction + * \param[out] out_len Length of output result + * \param[out] out_indices Pointer to sparse indices + * \param[out] out_data Pointer to sparse data space + * \return 0 when succeed, -1 when failure happens + */ +LIGHTGBM_C_EXPORT int LGBM_BoosterPredictSparseForCSR(BoosterHandle handle, + const void* indptr, + int indptr_type, + const int32_t* indices, + const void* data, + int data_type, + int64_t nindptr, + int64_t nelem, + int64_t num_col, + int predict_type, + int num_iteration, + const char* parameter, + int64_t* out_len, + int32_t** out_indices, + void** out_data); + /*! * \brief Make prediction for a new dataset in CSR format. This method re-uses the internal predictor structure * from previous calls and is optimized for single row invocation. @@ -812,6 +851,45 @@ LIGHTGBM_C_EXPORT int LGBM_BoosterPredictForCSC(BoosterHandle handle, int64_t* out_len, double* out_result); +/*! + * \brief Make sparse prediction for a new dataset in CSC format. Currently only used for feature contributions. 
+ * \note + * The outputs are pre-allocated, as they can vary for each invocation, but the shape should be the same: + * - for feature contributions, the shape of sparse matrix will be ``num_class * num_data * (num_feature + 1)``. + * \param handle Handle of booster + * \param col_ptr Pointer to column headers + * \param col_ptr_type Type of ``col_ptr``, can be ``C_API_DTYPE_INT32`` or ``C_API_DTYPE_INT64`` + * \param indices Pointer to row indices + * \param data Pointer to the data space + * \param data_type Type of ``data`` pointer, can be ``C_API_DTYPE_FLOAT32`` or ``C_API_DTYPE_FLOAT64`` + * \param ncol_ptr Number of columns in the matrix + 1 + * \param nelem Number of nonzero elements in the matrix + * \param num_row Number of rows + * \param predict_type What should be predicted + * - ``C_API_PREDICT_CONTRIB``: feature contributions (SHAP values) + * \param num_iteration Number of iteration for prediction, <= 0 means no limit + * \param parameter Other parameters for prediction, e.g. early stopping for prediction + * \param[out] out_len Length of output result + * \param[out] out_indices Pointer to sparse indices + * \param[out] out_data Pointer to sparse data space + * \return 0 when succeed, -1 when failure happens + */ +LIGHTGBM_C_EXPORT int LGBM_BoosterPredictSparseForCSC(BoosterHandle handle, + const void* col_ptr, + int col_ptr_type, + const int32_t* indices, + const void* data, + int data_type, + int64_t ncol_ptr, + int64_t nelem, + int64_t num_row, + int predict_type, + int num_iteration, + const char* parameter, + int64_t* out_len, + int32_t** out_indices, + void** out_data); + /*! * \brief Make prediction for a new dataset. 
* \note diff --git a/include/LightGBM/meta.h b/include/LightGBM/meta.h index b15b1ba4b378..7edb0c850a49 100644 --- a/include/LightGBM/meta.h +++ b/include/LightGBM/meta.h @@ -11,6 +11,7 @@ #include <string> #include <utility> #include <vector> +#include <unordered_map> #if (defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_AMD64))) || defined(__INTEL_COMPILER) || MM_PREFETCH #include <xmmintrin.h> @@ -58,6 +59,9 @@ typedef int32_t comm_size_t; using PredictFunction = std::function<void(const std::vector<std::pair<int, double>>&, double* output)>; +using PredictSparseFunction = +std::function<void(const std::vector<std::pair<int, double>>&, std::vector<std::unordered_map<int, double>>& output)>; + typedef void(*ReduceFunction)(const char* input, char* output, int type_size, comm_size_t array_size); diff --git a/include/LightGBM/tree.h b/include/LightGBM/tree.h index 047215231fc6..b37e26af55b4 100644 --- a/include/LightGBM/tree.h +++ b/include/LightGBM/tree.h @@ -135,6 +135,8 @@ class Tree { inline int PredictLeafIndexByMap(const std::unordered_map<int, double>& feature_values) const; inline void PredictContrib(const double* feature_values, int num_features, double* output); + inline void PredictContribByMap(const std::unordered_map<int, double>& feature_values, + int num_features, std::unordered_map<int, double>& output); /*! \brief Get Number of leaves*/ inline int num_leaves() const { return num_leaves_; } @@ -382,6 +384,12 @@ class Tree { PathElement *parent_unique_path, double parent_zero_fraction, double parent_one_fraction, int parent_feature_index) const; + void TreeSHAPByMap(const std::unordered_map<int, double>& feature_values, + std::unordered_map<int, double>& phi, + int node, int unique_depth, + PathElement *parent_unique_path, double parent_zero_fraction, + double parent_one_fraction, int parent_feature_index) const; + /*!
\brief Extend our decision path with a fraction of one and zero extensions for TreeSHAP*/ static void ExtendPath(PathElement *unique_path, int unique_depth, double zero_fraction, double one_fraction, int feature_index); @@ -525,6 +533,18 @@ inline void Tree::PredictContrib(const double* feature_values, int num_features, } } +inline void Tree::PredictContribByMap(const std::unordered_map<int, double>& feature_values, + int num_features, std::unordered_map<int, double>& output) { + output[num_features] += ExpectedValue(); + // Run the recursion with preallocated space for the unique path data + if (num_leaves_ > 1) { + CHECK_GE(max_depth_, 0); + const int max_path_len = max_depth_ + 1; + std::vector<PathElement> unique_path_data(max_path_len*(max_path_len + 1) / 2); + TreeSHAPByMap(feature_values, output, 0, 0, unique_path_data.data(), 1, 1, -1); + } +} + inline void Tree::RecomputeLeafDepths(int node, int depth) { if (node == 0) leaf_depth_.resize(num_leaves()); if (node < 0) { diff --git a/src/application/predictor.hpp b/src/application/predictor.hpp index 1c56cfa5eb2c..025843bcce47 100644 --- a/src/application/predictor.hpp +++ b/src/application/predictor.hpp @@ -87,13 +87,19 @@ class Predictor { predict_fun_ = [=](const std::vector<std::pair<int, double>>& features, double* output) { int tid = omp_get_thread_num(); - CopyToPredictBuffer(predict_buf_[tid].data(), features); - // get result for leaf index - boosting_->PredictContrib(predict_buf_[tid].data(), output, - &early_stop_); - ClearPredictBuffer(predict_buf_[tid].data(), predict_buf_[tid].size(), - features); + CopyToPredictBuffer(predict_buf_[tid].data(), features); + // get feature importances + boosting_->PredictContrib(predict_buf_[tid].data(), output); + ClearPredictBuffer(predict_buf_[tid].data(), predict_buf_[tid].size(), + features); + }; + predict_sparse_fun_ = [=](const std::vector<std::pair<int, double>>& features, + std::vector<std::unordered_map<int, double>>& output) { + auto buf = CopyToPredictMap(features); + // get sparse feature importances + boosting_->PredictContribByMap(buf, output); }; + } else { if 
(is_raw_score) { predict_fun_ = [=](const std::vector<std::pair<int, double>>& features, @@ -140,6 +146,11 @@ class Predictor { return predict_fun_; } + + inline const PredictSparseFunction& GetPredictSparseFunction() const { + return predict_sparse_fun_; + } + /*! * \brief predicting on data, then saving result to disk * \param data_filename Filename of data @@ -275,6 +286,7 @@ class Predictor { const Boosting* boosting_; /*! \brief function for prediction */ PredictFunction predict_fun_; + PredictSparseFunction predict_sparse_fun_; PredictionEarlyStopInstance early_stop_; int num_feature_; int num_pred_one_row_; diff --git a/src/boosting/gbdt.cpp b/src/boosting/gbdt.cpp index 6a2e3e27c791..9750ead51802 100644 --- a/src/boosting/gbdt.cpp +++ b/src/boosting/gbdt.cpp @@ -571,8 +571,7 @@ const double* GBDT::GetTrainingScore(int64_t* out_len) { return train_score_updater_->score(); } -void GBDT::PredictContrib(const double* features, double* output, const PredictionEarlyStopInstance* early_stop) const { - int early_stop_round_counter = 0; +void GBDT::PredictContrib(const double* features, double* output) const { // set zero const int num_features = max_feature_idx_ + 1; std::memset(output, 0, sizeof(double) * num_tree_per_iteration_ * (num_features + 1)); @@ -581,17 +580,20 @@ void GBDT::PredictContrib(const double* features, double* output, const Predicti for (int k = 0; k < num_tree_per_iteration_; ++k) { models_[i * num_tree_per_iteration_ + k]->PredictContrib(features, num_features, output + k*(num_features + 1)); } - // check early stopping - ++early_stop_round_counter; - if (early_stop->round_period == early_stop_round_counter) { - if (early_stop->callback_function(output, num_tree_per_iteration_)) { - return; - } - early_stop_round_counter = 0; - } } } +void GBDT::PredictContribByMap(const std::unordered_map<int, double>& features, + std::vector<std::unordered_map<int, double>>& output) const { + const int num_features = max_feature_idx_ + 1; + for (int i = 0; i < num_iteration_for_pred_; ++i) { + // predict all the trees for 
one iteration + for (int k = 0; k < num_tree_per_iteration_; ++k) { + models_[i * num_tree_per_iteration_ + k]->PredictContribByMap(features, num_features, output[k]); + } + } +} + void GBDT::GetPredictAt(int data_idx, double* out_result, int64_t* out_len) { CHECK(data_idx >= 0 && data_idx <= static_cast<int>(valid_score_updater_.size())); diff --git a/src/boosting/gbdt.h b/src/boosting/gbdt.h index 67c30c86be2e..2069708ff1e8 100644 --- a/src/boosting/gbdt.h +++ b/src/boosting/gbdt.h @@ -240,8 +240,10 @@ class GBDT : public GBDTBase { void PredictLeafIndexByMap(const std::unordered_map<int, double>& features, double* output) const override; - void PredictContrib(const double* features, double* output, - const PredictionEarlyStopInstance* earlyStop) const override; + void PredictContrib(const double* features, double* output) const override; + + void PredictContribByMap(const std::unordered_map<int, double>& features, + std::vector<std::unordered_map<int, double>>& output) const override; /*! * \brief Dump model to json format string diff --git a/src/io/tree.cpp b/src/io/tree.cpp index be928b7e3124..08c21eb767ef 100644 --- a/src/io/tree.cpp +++ b/src/io/tree.cpp @@ -727,6 +727,56 @@ void Tree::TreeSHAP(const double *feature_values, double *phi, } } +// recursive sparse computation of SHAP values for a decision tree +void Tree::TreeSHAPByMap(const std::unordered_map<int, double>& feature_values, std::unordered_map<int, double>& phi, + int node, int unique_depth, + PathElement *parent_unique_path, double parent_zero_fraction, + double parent_one_fraction, int parent_feature_index) const { + // extend the unique path + PathElement* unique_path = parent_unique_path + unique_depth; + if (unique_depth > 0) std::copy(parent_unique_path, parent_unique_path + unique_depth, unique_path); + ExtendPath(unique_path, unique_depth, parent_zero_fraction, + parent_one_fraction, parent_feature_index); + + // leaf node + if (node < 0) { + for (int i = 1; i <= unique_depth; ++i) { + const double w = UnwoundPathSum(unique_path, unique_depth, i); + const PathElement &el = 
unique_path[i]; + phi[el.feature_index] += w*(el.one_fraction - el.zero_fraction)*leaf_value_[~node]; + } + + // internal node + } else { + const int hot_index = Decision(feature_values.count(split_feature_[node]) > 0 ? feature_values.at(split_feature_[node]) : 0.0f, node); + const int cold_index = (hot_index == left_child_[node] ? right_child_[node] : left_child_[node]); + const double w = data_count(node); + const double hot_zero_fraction = data_count(hot_index) / w; + const double cold_zero_fraction = data_count(cold_index) / w; + double incoming_zero_fraction = 1; + double incoming_one_fraction = 1; + + // see if we have already split on this feature, + // if so we undo that split so we can redo it for this node + int path_index = 0; + for (; path_index <= unique_depth; ++path_index) { + if (unique_path[path_index].feature_index == split_feature_[node]) break; + } + if (path_index != unique_depth + 1) { + incoming_zero_fraction = unique_path[path_index].zero_fraction; + incoming_one_fraction = unique_path[path_index].one_fraction; + UnwindPath(unique_path, unique_depth, path_index); + unique_depth -= 1; + } + + TreeSHAPByMap(feature_values, phi, hot_index, unique_depth + 1, unique_path, + hot_zero_fraction*incoming_zero_fraction, incoming_one_fraction, split_feature_[node]); + + TreeSHAPByMap(feature_values, phi, cold_index, unique_depth + 1, unique_path, + cold_zero_fraction*incoming_zero_fraction, 0, split_feature_[node]); + } +} + double Tree::ExpectedValue() const { if (num_leaves_ == 1) return LeafOutput(0); const double total_count = internal_count_[0];