Skip to content

Commit

Permalink
adding sparse support to TreeSHAP in lightgbm
Browse files Browse the repository at this point in the history
  • Loading branch information
imatiach-msft committed Apr 20, 2020
1 parent 0d3e204 commit 39e14fc
Show file tree
Hide file tree
Showing 8 changed files with 190 additions and 21 deletions.
7 changes: 4 additions & 3 deletions include/LightGBM/boosting.h
Original file line number Diff line number Diff line change
Expand Up @@ -166,10 +166,11 @@ class LIGHTGBM_EXPORT Boosting {
* \brief Feature contributions for the model's prediction of one record
* \param features Feature values on this record
* \param output Prediction result for this record
*/
virtual void PredictContrib(const double* features, double* output,
const PredictionEarlyStopInstance* early_stop) const = 0;
virtual void PredictContrib(const double* features, double* output) const = 0;

virtual void PredictContribByMap(const std::unordered_map<int, double>& features,
std::vector<std::unordered_map<int, double>>& output) const = 0;

/*!
* \brief Dump model to json format string
Expand Down
78 changes: 78 additions & 0 deletions include/LightGBM/c_api.h
Original file line number Diff line number Diff line change
Expand Up @@ -727,6 +727,45 @@ LIGHTGBM_C_EXPORT int LGBM_BoosterPredictForCSR(BoosterHandle handle,
int64_t* out_len,
double* out_result);

/*!
* \brief Make sparse prediction for a new dataset in CSR format. Currently only used for feature contributions.
* \note
* The outputs are pre-allocated, as they can vary for each invocation, but the shape should be the same:
* - for feature contributions, the shape of sparse matrix will be ``num_class * num_data * (num_feature + 1)``.
* \param handle Handle of booster
* \param indptr Pointer to row headers
* \param indptr_type Type of ``indptr``, can be ``C_API_DTYPE_INT32`` or ``C_API_DTYPE_INT64``
* \param indices Pointer to column indices
* \param data Pointer to the data space
* \param data_type Type of ``data`` pointer, can be ``C_API_DTYPE_FLOAT32`` or ``C_API_DTYPE_FLOAT64``
* \param nindptr Number of rows in the matrix + 1
* \param nelem Number of nonzero elements in the matrix
* \param num_col Number of columns
* \param predict_type What should be predicted, only feature contributions supported currently
* - ``C_API_PREDICT_CONTRIB``: feature contributions (SHAP values)
* \param num_iteration Number of iterations for prediction, <= 0 means no limit
* \param parameter Other parameters for prediction, e.g. early stopping for prediction
* \param[out] out_len Length of output result
* \param[out] out_indices Pointer to sparse indices
* \param[out] out_data Pointer to sparse data space
* \return 0 when succeed, -1 when failure happens
*/
LIGHTGBM_C_EXPORT int LGBM_BoosterPredictSparseForCSR(BoosterHandle handle,
const void* indptr,
int indptr_type,
const int32_t* indices,
const void* data,
int data_type,
int64_t nindptr,
int64_t nelem,
int64_t num_col,
int predict_type,
int num_iteration,
const char* parameter,
int64_t* out_len,
int32_t** out_indices,
void** out_data);

/*!
* \brief Make prediction for a new dataset in CSR format. This method re-uses the internal predictor structure
* from previous calls and is optimized for single row invocation.
Expand Down Expand Up @@ -812,6 +851,45 @@ LIGHTGBM_C_EXPORT int LGBM_BoosterPredictForCSC(BoosterHandle handle,
int64_t* out_len,
double* out_result);

/*!
* \brief Make sparse prediction for a new dataset in CSC format. Currently only used for feature contributions.
* \note
* The outputs are pre-allocated, as they can vary for each invocation, but the shape should be the same:
* - for feature contributions, the shape of sparse matrix will be ``num_class * num_data * (num_feature + 1)``.
* \param handle Handle of booster
* \param col_ptr Pointer to column headers
* \param col_ptr_type Type of ``col_ptr``, can be ``C_API_DTYPE_INT32`` or ``C_API_DTYPE_INT64``
* \param indices Pointer to row indices
* \param data Pointer to the data space
* \param data_type Type of ``data`` pointer, can be ``C_API_DTYPE_FLOAT32`` or ``C_API_DTYPE_FLOAT64``
* \param ncol_ptr Number of columns in the matrix + 1
* \param nelem Number of nonzero elements in the matrix
* \param num_row Number of rows
* \param predict_type What should be predicted
* - ``C_API_PREDICT_CONTRIB``: feature contributions (SHAP values)
* \param num_iteration Number of iterations for prediction, <= 0 means no limit
* \param parameter Other parameters for prediction, e.g. early stopping for prediction
* \param[out] out_len Length of output result
* \param[out] out_indices Pointer to sparse indices
* \param[out] out_data Pointer to sparse data space
* \return 0 when succeed, -1 when failure happens
*/
LIGHTGBM_C_EXPORT int LGBM_BoosterPredictSparseForCSC(BoosterHandle handle,
const void* col_ptr,
int col_ptr_type,
const int32_t* indices,
const void* data,
int data_type,
int64_t ncol_ptr,
int64_t nelem,
int64_t num_row,
int predict_type,
int num_iteration,
const char* parameter,
int64_t* out_len,
int32_t** out_indices,
void** out_data);

/*!
* \brief Make prediction for a new dataset.
* \note
Expand Down
4 changes: 4 additions & 0 deletions include/LightGBM/meta.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
#include <memory>
#include <utility>
#include <vector>
#include <unordered_map>

#if (defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_AMD64))) || defined(__INTEL_COMPILER) || MM_PREFETCH
#include <xmmintrin.h>
Expand Down Expand Up @@ -58,6 +59,9 @@ typedef int32_t comm_size_t;
using PredictFunction =
std::function<void(const std::vector<std::pair<int, double>>&, double* output)>;

using PredictSparseFunction =
std::function<void(const std::vector<std::pair<int, double>>&, std::vector<std::unordered_map<int, double>>& output)>;

typedef void(*ReduceFunction)(const char* input, char* output, int type_size, comm_size_t array_size);


Expand Down
20 changes: 20 additions & 0 deletions include/LightGBM/tree.h
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,8 @@ class Tree {
inline int PredictLeafIndexByMap(const std::unordered_map<int, double>& feature_values) const;

inline void PredictContrib(const double* feature_values, int num_features, double* output);
inline void PredictContribByMap(const std::unordered_map<int, double>& feature_values,
int num_features, std::unordered_map<int, double>& output);

/*! \brief Get Number of leaves*/
inline int num_leaves() const { return num_leaves_; }
Expand Down Expand Up @@ -382,6 +384,12 @@ class Tree {
PathElement *parent_unique_path, double parent_zero_fraction,
double parent_one_fraction, int parent_feature_index) const;

void TreeSHAPByMap(const std::unordered_map<int, double>& feature_values,
std::unordered_map<int, double>& phi,
int node, int unique_depth,
PathElement *parent_unique_path, double parent_zero_fraction,
double parent_one_fraction, int parent_feature_index) const;

/*! \brief Extend our decision path with a fraction of one and zero extensions for TreeSHAP*/
static void ExtendPath(PathElement *unique_path, int unique_depth,
double zero_fraction, double one_fraction, int feature_index);
Expand Down Expand Up @@ -525,6 +533,18 @@ inline void Tree::PredictContrib(const double* feature_values, int num_features,
}
}

inline void Tree::PredictContribByMap(const std::unordered_map<int, double>& feature_values,
int num_features, std::unordered_map<int, double>& output) {
output[num_features] += ExpectedValue();
// Run the recursion with preallocated space for the unique path data
if (num_leaves_ > 1) {
CHECK_GE(max_depth_, 0);
const int max_path_len = max_depth_ + 1;
std::vector<PathElement> unique_path_data(max_path_len*(max_path_len + 1) / 2);
TreeSHAPByMap(feature_values, output, 0, 0, unique_path_data.data(), 1, 1, -1);
}
}

inline void Tree::RecomputeLeafDepths(int node, int depth) {
if (node == 0) leaf_depth_.resize(num_leaves());
if (node < 0) {
Expand Down
24 changes: 18 additions & 6 deletions src/application/predictor.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -87,13 +87,19 @@ class Predictor {
predict_fun_ = [=](const std::vector<std::pair<int, double>>& features,
double* output) {
int tid = omp_get_thread_num();
CopyToPredictBuffer(predict_buf_[tid].data(), features);
// get result for leaf index
boosting_->PredictContrib(predict_buf_[tid].data(), output,
&early_stop_);
ClearPredictBuffer(predict_buf_[tid].data(), predict_buf_[tid].size(),
features);
CopyToPredictBuffer(predict_buf_[tid].data(), features);
// get feature importances
boosting_->PredictContrib(predict_buf_[tid].data(), output);
ClearPredictBuffer(predict_buf_[tid].data(), predict_buf_[tid].size(),
features);
};
predict_sparse_fun_ = [=](const std::vector<std::pair<int, double>>& features,
std::vector<std::unordered_map<int, double>>& output) {
auto buf = CopyToPredictMap(features);
// get sparse feature importances
boosting_->PredictContribByMap(buf, output);
};

} else {
if (is_raw_score) {
predict_fun_ = [=](const std::vector<std::pair<int, double>>& features,
Expand Down Expand Up @@ -140,6 +146,11 @@ class Predictor {
return predict_fun_;
}


inline const PredictSparseFunction& GetPredictSparseFunction() const {
return predict_sparse_fun_;
}

/*!
* \brief predicting on data, then saving result to disk
* \param data_filename Filename of data
Expand Down Expand Up @@ -275,6 +286,7 @@ class Predictor {
const Boosting* boosting_;
/*! \brief function for prediction */
PredictFunction predict_fun_;
PredictSparseFunction predict_sparse_fun_;
PredictionEarlyStopInstance early_stop_;
int num_feature_;
int num_pred_one_row_;
Expand Down
22 changes: 12 additions & 10 deletions src/boosting/gbdt.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -571,8 +571,7 @@ const double* GBDT::GetTrainingScore(int64_t* out_len) {
return train_score_updater_->score();
}

void GBDT::PredictContrib(const double* features, double* output, const PredictionEarlyStopInstance* early_stop) const {
int early_stop_round_counter = 0;
void GBDT::PredictContrib(const double* features, double* output) const {
// set zero
const int num_features = max_feature_idx_ + 1;
std::memset(output, 0, sizeof(double) * num_tree_per_iteration_ * (num_features + 1));
Expand All @@ -581,17 +580,20 @@ void GBDT::PredictContrib(const double* features, double* output, const Predicti
for (int k = 0; k < num_tree_per_iteration_; ++k) {
models_[i * num_tree_per_iteration_ + k]->PredictContrib(features, num_features, output + k*(num_features + 1));
}
// check early stopping
++early_stop_round_counter;
if (early_stop->round_period == early_stop_round_counter) {
if (early_stop->callback_function(output, num_tree_per_iteration_)) {
return;
}
early_stop_round_counter = 0;
}
}
}

/*!
 * \brief Sparse SHAP feature contributions for one record across all trees.
 * \param features Sparse feature map for this record (absent features are implicit zeros)
 * \param[out] output One sparse contribution map per tree-per-iteration (class),
 *             accumulated in place by each tree
 */
void GBDT::PredictContribByMap(const std::unordered_map<int, double>& features,
                               std::vector<std::unordered_map<int, double>>& output) const {
  const int num_features = max_feature_idx_ + 1;
  for (int iter = 0; iter < num_iteration_for_pred_; ++iter) {
    // Trees are stored iteration-major; hoist the offset of this iteration's block.
    const int base = iter * num_tree_per_iteration_;
    for (int tree_idx = 0; tree_idx < num_tree_per_iteration_; ++tree_idx) {
      models_[base + tree_idx]->PredictContribByMap(features, num_features, output[tree_idx]);
    }
  }
}

void GBDT::GetPredictAt(int data_idx, double* out_result, int64_t* out_len) {
CHECK(data_idx >= 0 && data_idx <= static_cast<int>(valid_score_updater_.size()));

Expand Down
6 changes: 4 additions & 2 deletions src/boosting/gbdt.h
Original file line number Diff line number Diff line change
Expand Up @@ -240,8 +240,10 @@ class GBDT : public GBDTBase {

void PredictLeafIndexByMap(const std::unordered_map<int, double>& features, double* output) const override;

void PredictContrib(const double* features, double* output,
const PredictionEarlyStopInstance* earlyStop) const override;
void PredictContrib(const double* features, double* output) const override;

void PredictContribByMap(const std::unordered_map<int, double>& features,
std::vector<std::unordered_map<int, double>>& output) const override;

/*!
* \brief Dump model to json format string
Expand Down
50 changes: 50 additions & 0 deletions src/io/tree.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -727,6 +727,56 @@ void Tree::TreeSHAP(const double *feature_values, double *phi,
}
}

// Recursive sparse computation of SHAP values for a decision tree.
// Mirrors Tree::TreeSHAP, but reads the record from a sparse feature map
// (missing entries are treated as 0.0) and accumulates phi into a sparse map.
void Tree::TreeSHAPByMap(const std::unordered_map<int, double>& feature_values, std::unordered_map<int, double>& phi,
                         int node, int unique_depth,
                         PathElement *parent_unique_path, double parent_zero_fraction,
                         double parent_one_fraction, int parent_feature_index) const {
  // extend the unique path: copy the parent's path prefix, then append this split
  PathElement* unique_path = parent_unique_path + unique_depth;
  if (unique_depth > 0) std::copy(parent_unique_path, parent_unique_path + unique_depth, unique_path);
  ExtendPath(unique_path, unique_depth, parent_zero_fraction,
             parent_one_fraction, parent_feature_index);

  // leaf node (negative node ids encode leaves as ~node)
  if (node < 0) {
    for (int i = 1; i <= unique_depth; ++i) {
      const double w = UnwoundPathSum(unique_path, unique_depth, i);
      const PathElement &el = unique_path[i];
      phi[el.feature_index] += w*(el.one_fraction - el.zero_fraction)*leaf_value_[~node];
    }

  // internal node
  } else {
    // Single find() instead of the count()+at() pair: one hash lookup on this
    // hot recursive path, and the sparse default is a double literal (0.0),
    // not a float (0.0f).
    const auto split_it = feature_values.find(split_feature_[node]);
    const double split_value = (split_it == feature_values.end()) ? 0.0 : split_it->second;
    const int hot_index = Decision(split_value, node);
    const int cold_index = (hot_index == left_child_[node] ? right_child_[node] : left_child_[node]);
    const double w = data_count(node);
    const double hot_zero_fraction = data_count(hot_index) / w;
    const double cold_zero_fraction = data_count(cold_index) / w;
    double incoming_zero_fraction = 1;
    double incoming_one_fraction = 1;

    // see if we have already split on this feature,
    // if so we undo that split so we can redo it for this node
    int path_index = 0;
    for (; path_index <= unique_depth; ++path_index) {
      if (unique_path[path_index].feature_index == split_feature_[node]) break;
    }
    if (path_index != unique_depth + 1) {
      incoming_zero_fraction = unique_path[path_index].zero_fraction;
      incoming_one_fraction = unique_path[path_index].one_fraction;
      UnwindPath(unique_path, unique_depth, path_index);
      unique_depth -= 1;
    }

    // recurse down the branch the record actually takes (one_fraction kept)...
    TreeSHAPByMap(feature_values, phi, hot_index, unique_depth + 1, unique_path,
                  hot_zero_fraction*incoming_zero_fraction, incoming_one_fraction, split_feature_[node]);

    // ...and the branch it does not take (one_fraction forced to 0)
    TreeSHAPByMap(feature_values, phi, cold_index, unique_depth + 1, unique_path,
                  cold_zero_fraction*incoming_zero_fraction, 0, split_feature_[node]);
  }
}

double Tree::ExpectedValue() const {
if (num_leaves_ == 1) return LeafOutput(0);
const double total_count = internal_count_[0];
Expand Down

0 comments on commit 39e14fc

Please sign in to comment.