diff --git a/doc/model.schema b/doc/model.schema index 6c7bd9c32fd6..d90049caa2c9 100644 --- a/doc/model.schema +++ b/doc/model.schema @@ -88,6 +88,12 @@ "type": "number" } }, + "split_type": { + "type": "array", + "items": { + "type": "integer" + } + }, "default_left": { "type": "array", "items": { @@ -247,6 +253,18 @@ "learner": { "type": "object", "properties": { + "feature_names": { + "type": "array", + "items": { + "type": "string" + } + }, + "feature_types": { + "type": "array", + "items": { + "type": "string" + } + }, "gradient_booster": { "oneOf": [ { diff --git a/include/xgboost/c_api.h b/include/xgboost/c_api.h index 60c77bb38674..e618369ceb4c 100644 --- a/include/xgboost/c_api.h +++ b/include/xgboost/c_api.h @@ -1132,4 +1132,46 @@ XGB_DLL int XGBoosterSetAttr(BoosterHandle handle, XGB_DLL int XGBoosterGetAttrNames(BoosterHandle handle, bst_ulong* out_len, const char*** out); + +/*! + * \brief Set string encoded feature info in Booster, similar to the feature + * info in DMatrix. + * + * Accepted fields are: + * - feature_name + * - feature_type + * + * \param handle An instance of Booster + * \param field Field name + * \param features Pointer to array of strings. + * \param size Size of `features` pointer (number of strings passed in). + * + * \return 0 when success, -1 when failure happens + */ +XGB_DLL int XGBoosterSetStrFeatureInfo(BoosterHandle handle, const char *field, + const char **features, + const bst_ulong size); + +/*! + * \brief Get string encoded feature info from Booster, similar to feature info + * in DMatrix. + * + * Accepted fields are: + * - feature_name + * - feature_type + * + * Caller is responsible for copying out the data, before next call to any API + * function of XGBoost. + * + * \param handle An instance of Booster + * \param field Field name + * \param len Size of output pointer `features` (number of strings returned). + * \param out_features Address of a pointer to array of strings. 
Result is stored in + * thread local memory. + * + * \return 0 when success, -1 when failure happens + */ +XGB_DLL int XGBoosterGetStrFeatureInfo(BoosterHandle handle, const char *field, + bst_ulong *len, + const char ***out_features); #endif  // XGBOOST_C_API_H_ diff --git a/include/xgboost/learner.h b/include/xgboost/learner.h index 8676e5a250b0..7e7ad2a9ea3a 100644 --- a/include/xgboost/learner.h +++ b/include/xgboost/learner.h @@ -213,6 +213,27 @@ class Learner : public Model, public Configurable, public dmlc::Serializable { * \return vector of attribute name strings. */ virtual std::vector GetAttrNames() const = 0; + /*! + * \brief Set the feature names for current booster. + * \param fn Input feature names + */ + virtual void SetFeatureNames(std::vector const& fn) = 0; + /*! + * \brief Get the feature names for current booster. + * \param fn Output feature names + */ + virtual void GetFeatureNames(std::vector* fn) const = 0; + /*! + * \brief Set the feature types for current booster. + * \param ft Input feature types. + */ + virtual void SetFeatureTypes(std::vector const& ft) = 0; + /*! + * \brief Get the feature types for current booster. + * \param ft Output feature types + */ + virtual void GetFeatureTypes(std::vector* ft) const = 0; + /*! + * \return whether the model allow lazy checkpoint in rabit. 
*/ diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py index 0514f0c27f76..6a9a1dd486e6 100644 --- a/python-package/xgboost/core.py +++ b/python-package/xgboost/core.py @@ -77,7 +77,7 @@ def from_pystr_to_cstr(data: Union[str, List[str]]): raise TypeError() -def from_cstr_to_pystr(data, length): +def from_cstr_to_pystr(data, length) -> List[str]: """Revert C pointer to Python str Parameters @@ -869,7 +869,7 @@ def feature_names(self) -> List[str]: ) feature_names = from_cstr_to_pystr(sarr, length) if not feature_names: - feature_names = ["f{0}".format(i) for i in range(self.num_col())] + return None return feature_names @feature_names.setter @@ -1167,9 +1167,6 @@ class Booster(object): training, prediction and evaluation. """ - feature_names = None - feature_types = None - def __init__(self, params=None, cache=(), model_file=None): # pylint: disable=invalid-name """ @@ -1185,12 +1182,15 @@ def __init__(self, params=None, cache=(), model_file=None): for d in cache: if not isinstance(d, DMatrix): raise TypeError('invalid cache item: {}'.format(type(d).__name__), cache) - self._validate_features(d) dmats = c_array(ctypes.c_void_p, [d.handle for d in cache]) self.handle = ctypes.c_void_p() _check_call(_LIB.XGBoosterCreate(dmats, c_bst_ulong(len(cache)), ctypes.byref(self.handle))) + for d in cache: + # Validate feature only after the feature names are saved into booster. 
+ self._validate_features(d) + params = params or {} params = self._configure_metrics(params.copy()) if isinstance(params, list): @@ -1400,6 +1400,60 @@ def set_attr(self, **kwargs): _check_call(_LIB.XGBoosterSetAttr( self.handle, c_str(key), value)) + def _get_feature_info(self, field: str): + length = c_bst_ulong() + sarr = ctypes.POINTER(ctypes.c_char_p)() + if not hasattr(self, "handle") or self.handle is None: + return None + _check_call( + _LIB.XGBoosterGetStrFeatureInfo( + self.handle, c_str(field), ctypes.byref(length), ctypes.byref(sarr), + ) + ) + feature_info = from_cstr_to_pystr(sarr, length) + return feature_info if feature_info else None + + @property + def feature_types(self) -> Optional[List[str]]: + """Feature types for this booster. Can be directly set by input data or by + assignment. + + """ + return self._get_feature_info("feature_type") + + @property + def feature_names(self) -> Optional[List[str]]: + """Feature names for this booster. Can be directly set by input data or by + assignment. + + """ + return self._get_feature_info("feature_name") + + def _set_feature_info(self, features: Optional[List[str]], field: str) -> None: + if features is not None: + assert isinstance(features, list) + c_feature_info = [bytes(f, encoding="utf-8") for f in features] + c_feature_info = (ctypes.c_char_p * len(c_feature_info))(*c_feature_info) + _check_call( + _LIB.XGBoosterSetStrFeatureInfo( + self.handle, c_str(field), c_feature_info, c_bst_ulong(len(features)) + ) + ) + else: + _check_call( + _LIB.XGBoosterSetStrFeatureInfo( + self.handle, c_str(field), None, c_bst_ulong(0) + ) + ) + + @feature_names.setter + def feature_names(self, features: Optional[List[str]]) -> None: + self._set_feature_info(features, "feature_name") + + @feature_types.setter + def feature_types(self, features: Optional[List[str]]) -> None: + self._set_feature_info(features, "feature_type") + def set_param(self, params, value=None): """Set parameters into the Booster. 
@@ -1859,9 +1913,10 @@ def inplace_predict( def save_model(self, fname): """Save the model to a file. - The model is saved in an XGBoost internal format which is universal - among the various XGBoost interfaces. Auxiliary attributes of the - Python Booster object (such as feature_names) will not be saved. See: + The model is saved in an XGBoost internal format which is universal among the + various XGBoost interfaces. Auxiliary attributes of the Python Booster object + (such as feature_names) will not be saved when using binary format. To save those + attributes, use JSON instead. See: https://xgboost.readthedocs.io/en/latest/tutorials/saving_model.html @@ -1898,9 +1953,10 @@ def load_model(self, fname): """Load the model from a file or bytearray. Path to file can be local or as an URI. - The model is loaded from XGBoost format which is universal among the - various XGBoost interfaces. Auxiliary attributes of the Python Booster - object (such as feature_names) will not be loaded. See: + The model is loaded from XGBoost format which is universal among the various + XGBoost interfaces. Auxiliary attributes of the Python Booster object (such as + feature_names) will not be loaded when using binary format. To save those + attributes, use JSON instead. See: https://xgboost.readthedocs.io/en/latest/tutorials/saving_model.html @@ -2249,7 +2305,7 @@ def trees_to_dataframe(self, fmap=''): # pylint: disable=no-member return df.sort(['Tree', 'Node']).reset_index(drop=True) - def _validate_features(self, data): + def _validate_features(self, data: DMatrix): """ Validate Booster and data's feature_names are identical. 
Set feature_names and feature_types from DMatrix @@ -2260,24 +2316,27 @@ def _validate_features(self, data): if self.feature_names is None: self.feature_names = data.feature_names self.feature_types = data.feature_types - else: - # Booster can't accept data with different feature names - if self.feature_names != data.feature_names: - dat_missing = set(self.feature_names) - set(data.feature_names) - my_missing = set(data.feature_names) - set(self.feature_names) + if data.feature_names is None and self.feature_names is not None: + raise ValueError( + "training data did not have the following fields: " + + ", ".join(self.feature_names) + ) + # Booster can't accept data with different feature names + if self.feature_names != data.feature_names: + dat_missing = set(self.feature_names) - set(data.feature_names) + my_missing = set(data.feature_names) - set(self.feature_names) - msg = 'feature_names mismatch: {0} {1}' + msg = 'feature_names mismatch: {0} {1}' - if dat_missing: - msg += ('\nexpected ' + ', '.join( - str(s) for s in dat_missing) + ' in input data') + if dat_missing: + msg += ('\nexpected ' + ', '.join( + str(s) for s in dat_missing) + ' in input data') - if my_missing: - msg += ('\ntraining data did not have the following fields: ' + - ', '.join(str(s) for s in my_missing)) + if my_missing: + msg += ('\ntraining data did not have the following fields: ' + + ', '.join(str(s) for s in my_missing)) - raise ValueError(msg.format(self.feature_names, - data.feature_names)) + raise ValueError(msg.format(self.feature_names, data.feature_names)) def get_split_value_histogram(self, feature, fmap='', bins=None, as_pandas=True): diff --git a/python-package/xgboost/sklearn.py b/python-package/xgboost/sklearn.py index f9b32e0687c4..c27281517ce5 100644 --- a/python-package/xgboost/sklearn.py +++ b/python-package/xgboost/sklearn.py @@ -958,9 +958,13 @@ def feature_importances_(self): raise AttributeError( 'Feature importance is not defined for Booster type {}' 
.format(self.booster)) - b = self.get_booster() + b: Booster = self.get_booster() score = b.get_score(importance_type=self.importance_type) - all_features = [score.get(f, 0.) for f in b.feature_names] + if b.feature_names is None: + feature_names = ["f{0}".format(i) for i in range(self.n_features_in_)] + else: + feature_names = b.feature_names + all_features = [score.get(f, 0.) for f in feature_names] all_features = np.array(all_features, dtype=np.float32) total = all_features.sum() if total == 0: diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index c81fddf2c084..135634557977 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -1022,5 +1022,50 @@ XGB_DLL int XGBoosterGetAttrNames(BoosterHandle handle, API_END(); } +XGB_DLL int XGBoosterSetStrFeatureInfo(BoosterHandle handle, const char *field, + const char **features, + const xgboost::bst_ulong size) { + API_BEGIN(); + CHECK_HANDLE(); + auto *learner = static_cast(handle); + std::vector feature_info; + for (size_t i = 0; i < size; ++i) { + feature_info.emplace_back(features[i]); + } + if (!std::strcmp(field, "feature_name")) { + learner->SetFeatureNames(feature_info); + } else if (!std::strcmp(field, "feature_type")) { + learner->SetFeatureTypes(feature_info); + } else { + LOG(FATAL) << "Unknown field for Booster feature info:" << field; + } + API_END(); +} + +XGB_DLL int XGBoosterGetStrFeatureInfo(BoosterHandle handle, const char *field, + xgboost::bst_ulong *len, + const char ***out_features) { + API_BEGIN(); + CHECK_HANDLE(); + auto const *learner = static_cast(handle); + std::vector &charp_vecs = + learner->GetThreadLocal().ret_vec_charp; + std::vector &str_vecs = learner->GetThreadLocal().ret_vec_str; + if (!std::strcmp(field, "feature_name")) { + learner->GetFeatureNames(&str_vecs); + } else if (!std::strcmp(field, "feature_type")) { + learner->GetFeatureTypes(&str_vecs); + } else { + LOG(FATAL) << "Unknown field for Booster feature info:" << field; + } + charp_vecs.resize(str_vecs.size()); + 
for (size_t i = 0; i < str_vecs.size(); ++i) { + charp_vecs[i] = str_vecs[i].c_str(); + } + *out_features = dmlc::BeginPtr(charp_vecs); + *len = static_cast(charp_vecs.size()); + API_END(); +} + // force link rabit static DMLC_ATTRIBUTE_UNUSED int XGBOOST_LINK_RABIT_C_API_ = RabitLinkTag(); diff --git a/src/learner.cc b/src/learner.cc index c9e816c13465..019603261bbf 100644 --- a/src/learner.cc +++ b/src/learner.cc @@ -256,6 +256,11 @@ class LearnerConfiguration : public Learner { std::map cfg_; // Stores information like best-iteration for early stopping. std::map attributes_; + // Name of each feature, usually set from DMatrix. + std::vector feature_names_; + // Type of each feature, usually set from DMatrix. + std::vector feature_types_; + common::Monitor monitor_; LearnerModelParamLegacy mparam_; LearnerModelParam learner_model_param_; @@ -460,6 +465,23 @@ class LearnerConfiguration : public Learner { return true; } + void SetFeatureNames(std::vector const& fn) override { + feature_names_ = fn; + } + + void GetFeatureNames(std::vector* fn) const override { + *fn = feature_names_; + } + + void SetFeatureTypes(std::vector const& ft) override { + this->feature_types_ = ft; + } + + void GetFeatureTypes(std::vector* p_ft) const override { + auto& ft = *p_ft; + ft = this->feature_types_; + } + std::vector GetAttrNames() const override { std::vector out; for (auto const& kv : attributes_) { @@ -666,6 +688,25 @@ class LearnerIO : public LearnerConfiguration { attributes_[kv.first] = get(kv.second); } + // feature names and types are saved in xgboost 1.4 + auto it = learner.find("feature_names"); + if (it != learner.cend()) { + auto const &feature_names = get(it->second); + feature_names_.clear(); + for (auto const &name : feature_names) { + feature_names_.emplace_back(get(name)); + } + } + it = learner.find("feature_types"); + if (it != learner.cend()) { + auto const &feature_types = get(it->second); + feature_types_.clear(); + for (auto const &name : feature_types) { 
+ auto type = get(name); + feature_types_.emplace_back(type); + } + } + this->need_configuration_ = true; } @@ -691,6 +732,17 @@ class LearnerIO : public LearnerConfiguration { for (auto const& kv : attributes_) { learner["attributes"][kv.first] = String(kv.second); } + + learner["feature_names"] = Array(); + auto& feature_names = get(learner["feature_names"]); + for (auto const& name : feature_names_) { + feature_names.emplace_back(name); + } + learner["feature_types"] = Array(); + auto& feature_types = get(learner["feature_types"]); + for (auto const& type : feature_types_) { + feature_types.emplace_back(type); + } } // About to be deprecated by JSON format void LoadModel(dmlc::Stream* fi) override { diff --git a/src/tree/tree_model.cc b/src/tree/tree_model.cc index 3b7cb9daa1ae..1d02278de5c2 100644 --- a/src/tree/tree_model.cc +++ b/src/tree/tree_model.cc @@ -385,7 +385,7 @@ class JsonGenerator : public TreeGenerator { std::string PlainNode(RegTree const& tree, int32_t nid, uint32_t depth) const override { auto cond = tree[nid].SplitCond(); static std::string const kNodeTemplate = - R"I( "nodeid": {nid}, "depth": {depth}, "split": {fname}, )I" + R"I( "nodeid": {nid}, "depth": {depth}, "split": "{fname}", )I" R"I("split_condition": {cond}, "yes": {left}, "no": {right}, )I" R"I("missing": {missing})I"; return SplitNodeImpl(tree, nid, kNodeTemplate, SuperT::ToStr(cond), depth); diff --git a/tests/cpp/test_learner.cc b/tests/cpp/test_learner.cc index 19a78645d7e3..237cb559cc94 100644 --- a/tests/cpp/test_learner.cc +++ b/tests/cpp/test_learner.cc @@ -360,4 +360,60 @@ TEST(Learner, ConstantSeed) { CHECK_EQ(v_0, v_2); } } + +TEST(Learner, FeatureInfo) { + size_t constexpr kCols = 10; + auto m = RandomDataGenerator{10, kCols, 0}.GenerateDMatrix(true); + std::vector names(kCols); + for (size_t i = 0; i < kCols; ++i) { + names[i] = ("f" + std::to_string(i)); + } + + std::vector types(kCols); + for (size_t i = 0; i < kCols; ++i) { + types[i] = "q"; + } + types[8] = "f"; + 
types[0] = "int"; + types[3] = "i"; + types[7] = "i"; + + std::vector c_names(kCols); + for (size_t i = 0; i < names.size(); ++i) { + c_names[i] = names[i].c_str(); + } + std::vector c_types(kCols); + for (size_t i = 0; i < types.size(); ++i) { + c_types[i] = names[i].c_str(); + } + + std::vector out_names; + std::vector out_types; + + Json model{Object()}; + { + std::unique_ptr learner{Learner::Create({m})}; + learner->Configure(); + learner->SetFeatureNames(names); + learner->GetFeatureNames(&out_names); + + learner->SetFeatureTypes(types); + learner->GetFeatureTypes(&out_types); + + ASSERT_TRUE(std::equal(out_names.begin(), out_names.end(), names.begin())); + ASSERT_TRUE(std::equal(out_types.begin(), out_types.end(), types.begin())); + + learner->SaveModel(&model); + } + + { + std::unique_ptr learner{Learner::Create({m})}; + learner->LoadModel(model); + + learner->GetFeatureNames(&out_names); + learner->GetFeatureTypes(&out_types); + ASSERT_TRUE(std::equal(out_names.begin(), out_names.end(), names.begin())); + ASSERT_TRUE(std::equal(out_types.begin(), out_types.end(), types.begin())); + } +} } // namespace xgboost diff --git a/tests/python/test_basic_models.py b/tests/python/test_basic_models.py index beef2f331c82..120022552d92 100644 --- a/tests/python/test_basic_models.py +++ b/tests/python/test_basic_models.py @@ -217,8 +217,8 @@ def test_feature_names_validation(self): X = np.random.random((10, 3)) y = np.random.randint(2, size=(10,)) - dm1 = xgb.DMatrix(X, y) - dm2 = xgb.DMatrix(X, y, feature_names=("a", "b", "c")) + dm1 = xgb.DMatrix(X, y, feature_names=("a", "b", "c")) + dm2 = xgb.DMatrix(X, y) bst = xgb.train([], dm1) bst.predict(dm1) # success @@ -228,9 +228,6 @@ def test_feature_names_validation(self): bst = xgb.train([], dm2) bst.predict(dm2) # success - with pytest.raises(ValueError): - bst.predict(dm1) - bst.predict(dm2) # success def test_model_binary_io(self): model_path = 'test_model_binary_io.bin' @@ -458,3 +455,31 @@ def test_slice(self, 
booster): merged = predt_0 + predt_1 - 0.5 single = booster[1:7].predict(dtrain, output_margin=True) np.testing.assert_allclose(merged, single, atol=1e-6) + + @pytest.mark.skipif(**tm.no_pandas()) + def test_feature_info(self): + import pandas as pd + rows = 100 + cols = 10 + X = rng.randn(rows, cols) + y = rng.randn(rows) + feature_names = ["test_feature_" + str(i) for i in range(cols)] + X_pd = pd.DataFrame(X, columns=feature_names) + X_pd.iloc[:, 3] = X_pd.iloc[:, 3].astype(np.int) + + Xy = xgb.DMatrix(X_pd, y) + assert Xy.feature_types[3] == "int" + booster = xgb.train({}, dtrain=Xy, num_boost_round=1) + + assert booster.feature_names == Xy.feature_names + assert booster.feature_names == feature_names + assert booster.feature_types == Xy.feature_types + + with tempfile.TemporaryDirectory() as tmpdir: + path = tmpdir + "model.json" + booster.save_model(path) + booster = xgb.Booster() + booster.load_model(path) + + assert booster.feature_names == Xy.feature_names + assert booster.feature_types == Xy.feature_types diff --git a/tests/python/test_cli.py b/tests/python/test_cli.py index 9ff7ad04c044..26860129a88d 100644 --- a/tests/python/test_cli.py +++ b/tests/python/test_cli.py @@ -95,6 +95,11 @@ def test_cli_model(self): } data = xgboost.DMatrix(data_path) booster = xgboost.train(parameters, data, num_boost_round=10) + + # CLI model doesn't contain feature info. + booster.feature_names = None + booster.feature_types = None + booster.save_model(model_out_py) py_predt = booster.predict(data) diff --git a/tests/python/test_dmatrix.py b/tests/python/test_dmatrix.py index df046c33a4dc..1d201ece9281 100644 --- a/tests/python/test_dmatrix.py +++ b/tests/python/test_dmatrix.py @@ -180,7 +180,7 @@ def test_feature_names_slice(self): # reset dm.feature_names = None - assert dm.feature_names == ['f0', 'f1', 'f2', 'f3', 'f4'] + assert dm.feature_names is None assert dm.feature_types is None def test_feature_names(self):