Skip to content

Commit

Permalink
merge from master
Browse files Browse the repository at this point in the history
  • Loading branch information
guolinke committed Jan 25, 2017
2 parents d085c16 + 4f23257 commit 66bf4c2
Show file tree
Hide file tree
Showing 20 changed files with 415 additions and 196 deletions.
6 changes: 4 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -111,5 +111,7 @@ endif()

install(TARGETS lightgbm _lightgbm
RUNTIME DESTINATION ${CMAKE_INSTALL_PREFIX}/bin
LIBRARY DESTINATION ${CMAKE_INSTALL_PREFIX}/lib)
install(DIRECTORY ${LightGBM_HEADER_DIR}/LightGBM DESTINATION ${CMAKE_INSTALL_PREFIX}/include)
LIBRARY DESTINATION ${CMAKE_INSTALL_PREFIX}/lib
ARCHIVE DESTINATION ${CMAKE_INSTALL_PREFIX}/lib)

install(DIRECTORY ${LightGBM_HEADER_DIR}/LightGBM DESTINATION ${CMAKE_INSTALL_PREFIX}/include)
14 changes: 7 additions & 7 deletions R-package/DESCRIPTION
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,8 @@ Description: LightGBM is a gradient boosting framework that uses tree based lear
1.Faster training speed and higher efficiency.
2.Lower memory usage.
3.Better accuracy.
4.Parallel learning supported
5. Capable of handling large-scale data
4.Parallel learning supported.
5. Capable of handling large-scale data.
License: The MIT License (MIT) | file LICENSE
URL: https://github.com/Microsoft/LightGBM
BugReports: https://github.com/Microsoft/LightGBM/issues
Expand All @@ -25,14 +25,14 @@ Suggests:
vcd (>= 1.3),
testthat,
igraph (>= 1.0.1),
methods,
data.table (>= 1.9.6),
magrittr (>= 1.5),
stringi (>= 0.5.2)
Depends:
R (>= 3.0),
R6
Imports:
methods,
Matrix (>= 1.1-0),
methods
RoxygenNote: 5.0.1
data.table (>= 1.9.6),
magrittr (>= 1.5),
jsonlite
RoxygenNote: 5.0.1
5 changes: 5 additions & 0 deletions R-package/NAMESPACE
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,17 @@ export(lgb.Dataset.set.reference)
export(lgb.cv)
export(lgb.dump)
export(lgb.get.eval.result)
export(lgb.importance)
export(lgb.load)
export(lgb.model.dt.tree)
export(lgb.save)
export(lgb.train)
export(lightgbm)
export(setinfo)
export(slice)
import(methods)
importFrom(R6,R6Class)
importFrom(data.table,":=")
importFrom(magrittr,"%>%")
importFrom(magrittr,"%T>%")
useDynLib(lightgbm)
55 changes: 55 additions & 0 deletions R-package/R/lgb.importance.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
#' Compute feature importance in a model
#'
#' Creates a \code{data.table} of feature importances in a model.
#'
#' @param model object of class \code{lgb.Booster}.
#' @param percentage whether to show importance in relative percentage.
#'
#' @return
#'
#' For a tree model, a \code{data.table} with the following columns:
#' \itemize{
#' \item \code{Feature} Feature names in the model.
#' \item \code{Gain} The total gain of this feature's splits.
#'   \item \code{Cover} The number of observations related to this feature.
#'   \item \code{Frequency} The number of times the feature was used in splits across trees.
#' }
#'
#' @examples
#'
#' data(agaricus.train, package = 'lightgbm')
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#'
#' params = list(objective = "binary",
#' learning_rate = 0.01, num_leaves = 63, max_depth = -1,
#' min_data_in_leaf = 1, min_sum_hessian_in_leaf = 1)
#' model <- lgb.train(params, dtrain, 20)
#' model <- lgb.train(params, dtrain, 20)
#'
#' tree_imp1 <- lgb.importance(model, percentage = TRUE)
#' tree_imp2 <- lgb.importance(model, percentage = FALSE)
#'
#' @importFrom magrittr %>% %T>%
#' @importFrom data.table :=
#' @export

# Compute per-feature importance (Gain / Cover / Frequency) from a trained
# lgb.Booster, optionally normalized so each column sums to 1.
#
# Args:
#   model: object of class lgb.Booster.
#   percentage: if TRUE, report each measure as a fraction of its total.
# Returns: a data.table with columns Feature, Gain, Cover, Frequency,
#   sorted by Gain in decreasing order.
lgb.importance <- function(model, percentage = TRUE) {
  # Idiomatic S3 class test; any(class(x) == ...) misses inheritance order
  # subtleties and is the non-idiomatic form of the same check.
  if (!inherits(model, "lgb.Booster")) {
    stop("'model' has to be an object of class lgb.Booster")
  }

  # Flatten the booster into one row per tree node/leaf.
  tree_dt <- lgb.model.dt.tree(model)

  # Keep only internal split nodes (!is.na(split_index)), aggregate the
  # split statistics per feature, rename the key column, then order the
  # features by total gain, largest first.
  tree_imp <- tree_dt %>%
    magrittr::extract(.,
                      i = !is.na(split_index),
                      j = .(Gain = sum(split_gain), Cover = sum(internal_count), Frequency = .N),
                      by = "split_feature") %T>%
    data.table::setnames(., old = "split_feature", new = "Feature") %>%
    magrittr::extract(., i = order(Gain, decreasing = TRUE))

  # Normalize in place (data.table := modifies by reference).
  if (percentage) {
    tree_imp[, ":="(Gain = Gain / sum(Gain),
                    Cover = Cover / sum(Cover),
                    Frequency = Frequency / sum(Frequency))]
  }

  return(tree_imp)
}
95 changes: 95 additions & 0 deletions R-package/R/lgb.model.dt.tree.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
#' Parse a LightGBM model json dump
#'
#' Parse a LightGBM model json dump into a \code{data.table} structure.
#'
#' @param model object of class \code{lgb.Booster}
#'
#' @return
#' A \code{data.table} with detailed information about model trees' nodes and leafs.
#'
#' The columns of the \code{data.table} are:
#'
#' \itemize{
#' \item \code{tree_index}: ID of a tree in a model (integer)
#' \item \code{split_index}: ID of a node in a tree (integer)
#' \item \code{split_feature}: for a node, it's a feature name (character);
#' for a leaf, it simply labels it as \code{'NA'}
#' \item \code{node_parent}: ID of the parent node for current node (integer)
#' \item \code{leaf_index}: ID of a leaf in a tree (integer)
#' \item \code{leaf_parent}: ID of the parent node for current leaf (integer)
#' \item \code{split_gain}: Split gain of a node
#' \item \code{threshold}: Splitting threshold value of a node
#' \item \code{decision_type}: Decision type of a node
#' \item \code{internal_value}: Node value
#' \item \code{internal_count}: The number of observations collected by a node
#' \item \code{leaf_value}: Leaf value
#' \item \code{leaf_count}: The number of observations collected by a leaf
#' }
#'
#' @examples
#'
#' data(agaricus.train, package = 'lightgbm')
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#'
#' params = list(objective = "binary",
#' learning_rate = 0.01, num_leaves = 63, max_depth = -1,
#' min_data_in_leaf = 1, min_sum_hessian_in_leaf = 1)
#' model <- lgb.train(params, dtrain, 20)
#' model <- lgb.train(params, dtrain, 20)
#'
#' tree_dt <- lgb.model.dt.tree(model)
#'
#' @importFrom magrittr %>%
#' @importFrom data.table :=
#' @export

# Parse a fitted lgb.Booster into a data.table with one row per tree
# node and per leaf.
#
# Args:
#   model: object of class lgb.Booster.
#   num_iteration: number of boosting iterations to include in the JSON
#     dump; NULL (default) includes all iterations. (Undocumented in the
#     roxygen header — NOTE(review): add @param there as well.)
# Returns: a data.table; see the roxygen header for the column list.
lgb.model.dt.tree <- function(model, num_iteration = NULL) {
  # Fail fast on invalid input, consistent with the validation performed
  # in lgb.importance().
  if (!inherits(model, "lgb.Booster")) {
    stop("'model' has to be an object of class lgb.Booster")
  }

  # Dump to JSON and parse WITHOUT data.frame/matrix simplification so the
  # recursive tree structure survives as nested lists.
  json_model <- lgb.dump(model, num_iteration = num_iteration)
  parsed_json_model <- jsonlite::fromJSON(json_model,
                                          simplifyVector = TRUE,
                                          simplifyDataFrame = FALSE,
                                          simplifyMatrix = FALSE,
                                          flatten = FALSE)

  # Flatten each tree independently, then stack all trees into one table.
  tree_list <- lapply(parsed_json_model$tree_info, single.tree.parse)
  tree_dt <- data.table::rbindlist(tree_list, use.names = TRUE)

  # Map 0-based feature indices from the dump to human-readable names
  # (data.table := updates the column by reference).
  tree_dt[, split_feature := Lookup(split_feature,
                                    seq(0, parsed_json_model$max_feature_idx, by = 1),
                                    parsed_json_model$feature_names)]

  return(tree_dt)
}

# Flatten one tree (nested-list form from the parsed JSON dump) into a
# data.table: one row per internal split node, one row per leaf.
#
# Split rows carry split_index/split_feature/split_gain/threshold/
# decision_type/internal_value/internal_count plus node_parent; leaf rows
# carry leaf_index/leaf_parent/leaf_value/leaf_count. Columns absent for a
# given row type are filled with NA (rbindlist fill = TRUE). Row order is
# the pre-order traversal order of the tree.
single.tree.parse <- function(lgb_tree) {
# Seed with a zero-row table so rbindlist keeps a stable set of columns
# even for degenerate trees.
single_tree_dt <- data.table::data.table(tree_index = integer(0),
split_index = integer(0), split_feature = integer(0), node_parent = integer(0),
leaf_index = integer(0), leaf_parent = integer(0),
split_gain = numeric(0), threshold = numeric(0), decision_type = character(0),
internal_value = integer(0), internal_count = integer(0),
leaf_value = integer(0), leaf_count = integer(0))
# Depth-first walk: emit the current node's row, then recurse left, then
# right. Accumulates into single_tree_dt in the enclosing scope via <<-.
pre_order_traversal <- function(tree_node_leaf, parent_index = NA) {
if (!is.null(tree_node_leaf$split_index)) {
# Internal node: copy its split statistics and record the parent's id.
single_tree_dt <<- data.table::rbindlist(l = list(single_tree_dt,
c(tree_node_leaf[c("split_index", "split_feature",
"split_gain", "threshold", "decision_type",
"internal_value", "internal_count")],
"node_parent" = parent_index)),
use.names = TRUE, fill = TRUE)
pre_order_traversal(tree_node_leaf$left_child, parent_index = tree_node_leaf$split_index)
pre_order_traversal(tree_node_leaf$right_child, parent_index = tree_node_leaf$split_index)
} else if (!is.null(tree_node_leaf$leaf_index)) {
# Leaf: the dump already includes leaf_parent, so no parent bookkeeping.
single_tree_dt <<- data.table::rbindlist(l = list(single_tree_dt,
tree_node_leaf[c("leaf_index", "leaf_parent",
"leaf_value", "leaf_count")]),
use.names = TRUE, fill = TRUE)
}
}
pre_order_traversal(lgb_tree$tree_structure)
# Tag every row of this tree with the tree's id (by-reference update).
single_tree_dt[, tree_index := lgb_tree$tree_index]
return(single_tree_dt)
}

# Vectorized table lookup: for each element of `key`, find its position in
# `key_lookup` and return the corresponding element of `value_lookup`.
# Keys with no match yield `missing` (NA by default).
Lookup <- function(key, key_lookup, value_lookup, missing = NA) {
  positions <- match(key, key_lookup)
  values <- value_lookup[positions]
  values[is.na(values)] <- missing
  values
}
42 changes: 42 additions & 0 deletions R-package/man/lgb.importance.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

52 changes: 52 additions & 0 deletions R-package/man/lgb.model.dt.tree.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

11 changes: 2 additions & 9 deletions include/LightGBM/bin.h
Original file line number Diff line number Diff line change
Expand Up @@ -126,10 +126,6 @@ class BinMapper {
return 0;
}
}

inline uint32_t GetMaxHeavyBin() const {
return max_heavy_bin_;
}
/*!
* \brief Construct feature value to bin mapper according feature values
* \param column_name name of this column
Expand Down Expand Up @@ -193,8 +189,6 @@ class BinMapper {
double min_val_;
/*! \brief maximum feature value */
double max_val_;
/*! \brief default bin value */
uint32_t max_heavy_bin_;
};

/*! \brief Iterator for one bin column */
Expand Down Expand Up @@ -310,7 +304,7 @@ class Bin {
*/
static Bin* CreateBin(data_size_t num_data, int num_bin,
double sparse_rate, bool is_enable_sparse,
bool* is_sparse, uint32_t default_bin, uint32_t max_heavy_bin, BinType bin_type);
bool* is_sparse, uint32_t default_bin, BinType bin_type);

/*!
* \brief Create object for bin data of one feature, used for dense feature
Expand All @@ -328,12 +322,11 @@ class Bin {
* \param num_data Total number of data
* \param num_bin Number of bin
* \param default_bin Default bin for zeros value
* \param max_heavy_bin max heavy bin value
* \param bin_type type of bin
* \return The bin data object
*/
static Bin* CreateSparseBin(data_size_t num_data,
int num_bin, uint32_t default_bin, uint32_t max_heavy_bin, BinType bin_type);
int num_bin, uint32_t default_bin, BinType bin_type);
};

inline unsigned int BinMapper::ValueToBin(double value) const {
Expand Down
5 changes: 3 additions & 2 deletions include/LightGBM/dataset.h
Original file line number Diff line number Diff line change
Expand Up @@ -88,8 +88,6 @@ class Metadata {

void SetQuery(const data_size_t* query, data_size_t len);

void SetQueryId(const data_size_t* query_id, data_size_t len);

/*!
* \brief Set initial scores
* \param init_score Initial scores, this class will manage memory for init_score.
Expand Down Expand Up @@ -244,6 +242,9 @@ class Metadata {
std::vector<data_size_t> queries_;
/*! \brief mutex for threading safe call */
std::mutex mutex_;
bool weight_load_from_file_;
bool query_load_from_file_;
bool init_score_load_from_file_;
};


Expand Down
4 changes: 2 additions & 2 deletions include/LightGBM/dataset_loader.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,6 @@ class DatasetLoader {

LIGHTGBM_EXPORT Dataset* LoadFromFileAlignWithOtherDataset(const char* filename, const Dataset* train_data);

LIGHTGBM_EXPORT Dataset* LoadFromBinFile(const char* data_filename, const char* bin_filename, int rank, int num_machines);

LIGHTGBM_EXPORT Dataset* CostructFromSampleData(std::vector<std::vector<double>>& sample_values, size_t total_sample_size, data_size_t num_data);

/*! \brief Disable copy */
Expand All @@ -31,6 +29,8 @@ class DatasetLoader {

private:

Dataset* LoadFromBinFile(const char* data_filename, const char* bin_filename, int rank, int num_machines, int* num_global_data, std::vector<data_size_t>* used_data_indices);

void SetHeader(const char* filename);

void CheckDataset(const Dataset* dataset);
Expand Down
Loading

0 comments on commit 66bf4c2

Please sign in to comment.