Skip to content

Commit

Permalink
merge from master
Browse files Browse the repository at this point in the history
  • Loading branch information
guolinke committed Jan 25, 2017
2 parents d085c16 + 4f23257 commit 66bf4c2
Show file tree
Hide file tree
Showing 20 changed files with 415 additions and 196 deletions.
6 changes: 4 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -111,5 +111,7 @@ endif()

install(TARGETS lightgbm _lightgbm
RUNTIME DESTINATION ${CMAKE_INSTALL_PREFIX}/bin
LIBRARY DESTINATION ${CMAKE_INSTALL_PREFIX}/lib)
install(DIRECTORY ${LightGBM_HEADER_DIR}/LightGBM DESTINATION ${CMAKE_INSTALL_PREFIX}/include)
LIBRARY DESTINATION ${CMAKE_INSTALL_PREFIX}/lib
ARCHIVE DESTINATION ${CMAKE_INSTALL_PREFIX}/lib)

install(DIRECTORY ${LightGBM_HEADER_DIR}/LightGBM DESTINATION ${CMAKE_INSTALL_PREFIX}/include)
14 changes: 7 additions & 7 deletions R-package/DESCRIPTION
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,8 @@ Description: LightGBM is a gradient boosting framework that uses tree based lear
1.Faster training speed and higher efficiency.
2.Lower memory usage.
3.Better accuracy.
4.Parallel learning supported
5. Capable of handling large-scale data
4.Parallel learning supported.
5. Capable of handling large-scale data.
License: The MIT License (MIT) | file LICENSE
URL: https://github.com/Microsoft/LightGBM
BugReports: https://github.com/Microsoft/LightGBM/issues
Expand All @@ -25,14 +25,14 @@ Suggests:
vcd (>= 1.3),
testthat,
igraph (>= 1.0.1),
methods,
data.table (>= 1.9.6),
magrittr (>= 1.5),
stringi (>= 0.5.2)
Depends:
R (>= 3.0),
R6
Imports:
methods,
Matrix (>= 1.1-0),
methods
RoxygenNote: 5.0.1
data.table (>= 1.9.6),
magrittr (>= 1.5),
jsonlite
RoxygenNote: 5.0.1
5 changes: 5 additions & 0 deletions R-package/NAMESPACE
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,17 @@ export(lgb.Dataset.set.reference)
export(lgb.cv)
export(lgb.dump)
export(lgb.get.eval.result)
export(lgb.importance)
export(lgb.load)
export(lgb.model.dt.tree)
export(lgb.save)
export(lgb.train)
export(lightgbm)
export(setinfo)
export(slice)
import(methods)
importFrom(R6,R6Class)
importFrom(data.table,":=")
importFrom(magrittr,"%>%")
importFrom(magrittr,"%T>%")
useDynLib(lightgbm)
55 changes: 55 additions & 0 deletions R-package/R/lgb.importance.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
#' Compute feature importance in a model
#'
#' Creates a \code{data.table} of feature importances in a model.
#'
#' @param model object of class \code{lgb.Booster}.
#' @param percentage whether to show importance in relative percentage.
#'
#' @return
#'
#' For a tree model, a \code{data.table} with the following columns:
#' \itemize{
#' \item \code{Feature} Feature names in the model.
#' \item \code{Gain} The total gain of this feature's splits.
#'   \item \code{Cover} The number of observations related to this feature.
#'   \item \code{Frequency} The number of times the feature was used in splits across trees.
#' }
#'
#' @examples
#'
#' data(agaricus.train, package = 'lightgbm')
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#'
#' params = list(objective = "binary",
#' learning_rate = 0.01, num_leaves = 63, max_depth = -1,
#' min_data_in_leaf = 1, min_sum_hessian_in_leaf = 1)
#' model <- lgb.train(params, dtrain, 20)
#' model <- lgb.train(params, dtrain, 20)
#'
#' tree_imp1 <- lgb.importance(model, percentage = TRUE)
#' tree_imp2 <- lgb.importance(model, percentage = FALSE)
#'
#' @importFrom magrittr %>% %T>%
#' @importFrom data.table :=
#' @export

# Compute per-feature importance (Gain / Cover / Frequency) from a trained
# lgb.Booster, optionally normalized so each column sums to 1.
#
# Args:
#   model: object of class lgb.Booster.
#   percentage: if TRUE, report each measure as a fraction of its total.
# Returns: a data.table with columns Feature, Gain, Cover, Frequency,
#   sorted by Gain in decreasing order.
lgb.importance <- function(model, percentage = TRUE) {
  # Idiomatic S3 class test; any(class(x) == ...) misses inheritance order
  # subtleties and is the non-idiomatic form of the same check.
  if (!inherits(model, "lgb.Booster")) {
    stop("'model' has to be an object of class lgb.Booster")
  }

  # Flatten the booster into one row per tree node/leaf.
  tree_dt <- lgb.model.dt.tree(model)

  # Keep only internal split nodes (!is.na(split_index)), aggregate the
  # split statistics per feature, rename the key column, then order the
  # features by total gain, largest first.
  tree_imp <- tree_dt %>%
    magrittr::extract(.,
                      i = !is.na(split_index),
                      j = .(Gain = sum(split_gain), Cover = sum(internal_count), Frequency = .N),
                      by = "split_feature") %T>%
    data.table::setnames(., old = "split_feature", new = "Feature") %>%
    magrittr::extract(., i = order(Gain, decreasing = TRUE))

  # Normalize in place (data.table := modifies by reference).
  if (percentage) {
    tree_imp[, ":="(Gain = Gain / sum(Gain),
                    Cover = Cover / sum(Cover),
                    Frequency = Frequency / sum(Frequency))]
  }

  return(tree_imp)
}
95 changes: 95 additions & 0 deletions R-package/R/lgb.model.dt.tree.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
#' Parse a LightGBM model json dump
#'
#' Parse a LightGBM model json dump into a \code{data.table} structure.
#'
#' @param model object of class \code{lgb.Booster}
#'
#' @return
#' A \code{data.table} with detailed information about model trees' nodes and leafs.
#'
#' The columns of the \code{data.table} are:
#'
#' \itemize{
#' \item \code{tree_index}: ID of a tree in a model (integer)
#' \item \code{split_index}: ID of a node in a tree (integer)
#' \item \code{split_feature}: for a node, it's a feature name (character);
#' for a leaf, it simply labels it as \code{'NA'}
#' \item \code{node_parent}: ID of the parent node for current node (integer)
#' \item \code{leaf_index}: ID of a leaf in a tree (integer)
#' \item \code{leaf_parent}: ID of the parent node for current leaf (integer)
#' \item \code{split_gain}: Split gain of a node
#' \item \code{threshold}: Splitting threshold value of a node
#' \item \code{decision_type}: Decision type of a node
#' \item \code{internal_value}: Node value
#' \item \code{internal_count}: The number of observations collected by a node
#' \item \code{leaf_value}: Leaf value
#' \item \code{leaf_count}: The number of observations collected by a leaf
#' }
#'
#' @examples
#'
#' data(agaricus.train, package = 'lightgbm')
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#'
#' params = list(objective = "binary",
#' learning_rate = 0.01, num_leaves = 63, max_depth = -1,
#' min_data_in_leaf = 1, min_sum_hessian_in_leaf = 1)
#' model <- lgb.train(params, dtrain, 20)
#' model <- lgb.train(params, dtrain, 20)
#'
#' tree_dt <- lgb.model.dt.tree(model)
#'
#' @importFrom magrittr %>%
#' @importFrom data.table :=
#' @export

# Parse a fitted lgb.Booster into a data.table with one row per tree
# node and per leaf.
#
# Args:
#   model: object of class lgb.Booster.
#   num_iteration: number of boosting iterations to include in the JSON
#     dump; NULL (default) includes all iterations. (Undocumented in the
#     roxygen header — NOTE(review): add @param there as well.)
# Returns: a data.table; see the roxygen header for the column list.
lgb.model.dt.tree <- function(model, num_iteration = NULL) {
  # Fail fast on invalid input, consistent with the validation performed
  # in lgb.importance().
  if (!inherits(model, "lgb.Booster")) {
    stop("'model' has to be an object of class lgb.Booster")
  }

  # Dump to JSON and parse WITHOUT data.frame/matrix simplification so the
  # recursive tree structure survives as nested lists.
  json_model <- lgb.dump(model, num_iteration = num_iteration)
  parsed_json_model <- jsonlite::fromJSON(json_model,
                                          simplifyVector = TRUE,
                                          simplifyDataFrame = FALSE,
                                          simplifyMatrix = FALSE,
                                          flatten = FALSE)

  # Flatten each tree independently, then stack all trees into one table.
  tree_list <- lapply(parsed_json_model$tree_info, single.tree.parse)
  tree_dt <- data.table::rbindlist(tree_list, use.names = TRUE)

  # Map 0-based feature indices from the dump to human-readable names
  # (data.table := updates the column by reference).
  tree_dt[, split_feature := Lookup(split_feature,
                                    seq(0, parsed_json_model$max_feature_idx, by = 1),
                                    parsed_json_model$feature_names)]

  return(tree_dt)
}

# Flatten one tree (nested-list form from the parsed JSON dump) into a
# data.table: one row per internal split node, one row per leaf.
#
# Split rows carry split_index/split_feature/split_gain/threshold/
# decision_type/internal_value/internal_count plus node_parent; leaf rows
# carry leaf_index/leaf_parent/leaf_value/leaf_count. Columns absent for a
# given row type are filled with NA (rbindlist fill = TRUE). Row order is
# the pre-order traversal order of the tree.
single.tree.parse <- function(lgb_tree) {
# Seed with a zero-row table so rbindlist keeps a stable set of columns
# even for degenerate trees.
single_tree_dt <- data.table::data.table(tree_index = integer(0),
split_index = integer(0), split_feature = integer(0), node_parent = integer(0),
leaf_index = integer(0), leaf_parent = integer(0),
split_gain = numeric(0), threshold = numeric(0), decision_type = character(0),
internal_value = integer(0), internal_count = integer(0),
leaf_value = integer(0), leaf_count = integer(0))
# Depth-first walk: emit the current node's row, then recurse left, then
# right. Accumulates into single_tree_dt in the enclosing scope via <<-.
pre_order_traversal <- function(tree_node_leaf, parent_index = NA) {
if (!is.null(tree_node_leaf$split_index)) {
# Internal node: copy its split statistics and record the parent's id.
single_tree_dt <<- data.table::rbindlist(l = list(single_tree_dt,
c(tree_node_leaf[c("split_index", "split_feature",
"split_gain", "threshold", "decision_type",
"internal_value", "internal_count")],
"node_parent" = parent_index)),
use.names = TRUE, fill = TRUE)
pre_order_traversal(tree_node_leaf$left_child, parent_index = tree_node_leaf$split_index)
pre_order_traversal(tree_node_leaf$right_child, parent_index = tree_node_leaf$split_index)
} else if (!is.null(tree_node_leaf$leaf_index)) {
# Leaf: the dump already includes leaf_parent, so no parent bookkeeping.
single_tree_dt <<- data.table::rbindlist(l = list(single_tree_dt,
tree_node_leaf[c("leaf_index", "leaf_parent",
"leaf_value", "leaf_count")]),
use.names = TRUE, fill = TRUE)
}
}
pre_order_traversal(lgb_tree$tree_structure)
# Tag every row of this tree with the tree's id (by-reference update).
single_tree_dt[, tree_index := lgb_tree$tree_index]
return(single_tree_dt)
}

# Vectorized table lookup: for each element of `key`, find its position in
# `key_lookup` and return the corresponding element of `value_lookup`.
# Keys with no match yield `missing` (NA by default).
Lookup <- function(key, key_lookup, value_lookup, missing = NA) {
  positions <- match(key, key_lookup)
  values <- value_lookup[positions]
  values[is.na(values)] <- missing
  values
}
42 changes: 42 additions & 0 deletions R-package/man/lgb.importance.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

52 changes: 52 additions & 0 deletions R-package/man/lgb.model.dt.tree.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

11 changes: 2 additions & 9 deletions include/LightGBM/bin.h
Original file line number Diff line number Diff line change
Expand Up @@ -126,10 +126,6 @@ class BinMapper {
return 0;
}
}

inline uint32_t GetMaxHeavyBin() const {
return max_heavy_bin_;
}
/*!
* \brief Construct feature value to bin mapper according feature values
* \param column_name name of this column
Expand Down Expand Up @@ -193,8 +189,6 @@ class BinMapper {
double min_val_;
/*! \brief maximum feature value */
double max_val_;
/*! \brief default bin value */
uint32_t max_heavy_bin_;
};

/*! \brief Iterator for one bin column */
Expand Down Expand Up @@ -310,7 +304,7 @@ class Bin {
*/
static Bin* CreateBin(data_size_t num_data, int num_bin,
double sparse_rate, bool is_enable_sparse,
bool* is_sparse, uint32_t default_bin, uint32_t max_heavy_bin, BinType bin_type);
bool* is_sparse, uint32_t default_bin, BinType bin_type);

/*!
* \brief Create object for bin data of one feature, used for dense feature
Expand All @@ -328,12 +322,11 @@ class Bin {
* \param num_data Total number of data
* \param num_bin Number of bin
* \param default_bin Default bin for zeros value
* \param max_heavy_bin max heavy bin value
* \param bin_type type of bin
* \return The bin data object
*/
static Bin* CreateSparseBin(data_size_t num_data,
int num_bin, uint32_t default_bin, uint32_t max_heavy_bin, BinType bin_type);
int num_bin, uint32_t default_bin, BinType bin_type);
};

inline unsigned int BinMapper::ValueToBin(double value) const {
Expand Down
5 changes: 3 additions & 2 deletions include/LightGBM/dataset.h
Original file line number Diff line number Diff line change
Expand Up @@ -88,8 +88,6 @@ class Metadata {

void SetQuery(const data_size_t* query, data_size_t len);

void SetQueryId(const data_size_t* query_id, data_size_t len);

/*!
* \brief Set initial scores
* \param init_score Initial scores, this class will manage memory for init_score.
Expand Down Expand Up @@ -244,6 +242,9 @@ class Metadata {
std::vector<data_size_t> queries_;
/*! \brief mutex for threading safe call */
std::mutex mutex_;
bool weight_load_from_file_;
bool query_load_from_file_;
bool init_score_load_from_file_;
};


Expand Down
4 changes: 2 additions & 2 deletions include/LightGBM/dataset_loader.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,6 @@ class DatasetLoader {

LIGHTGBM_EXPORT Dataset* LoadFromFileAlignWithOtherDataset(const char* filename, const Dataset* train_data);

LIGHTGBM_EXPORT Dataset* LoadFromBinFile(const char* data_filename, const char* bin_filename, int rank, int num_machines);

LIGHTGBM_EXPORT Dataset* CostructFromSampleData(std::vector<std::vector<double>>& sample_values, size_t total_sample_size, data_size_t num_data);

/*! \brief Disable copy */
Expand All @@ -31,6 +29,8 @@ class DatasetLoader {

private:

Dataset* LoadFromBinFile(const char* data_filename, const char* bin_filename, int rank, int num_machines, int* num_global_data, std::vector<data_size_t>* used_data_indices);

void SetHeader(const char* filename);

void CheckDataset(const Dataset* dataset);
Expand Down
Loading

0 comments on commit 66bf4c2

Please sign in to comment.