Skip to content

Commit

Permalink
[breaking] Save booster feature info in JSON, remove feature name gen…
Browse files Browse the repository at this point in the history
…eration. (#6605)

* Save feature info in booster in JSON model.
* [breaking] Remove automatic feature name generation in `DMatrix`.

This PR is to enable reliable feature validation in Python package.
  • Loading branch information
trivialfis authored Feb 25, 2021
1 parent b6167cd commit 9da2287
Show file tree
Hide file tree
Showing 12 changed files with 363 additions and 36 deletions.
18 changes: 18 additions & 0 deletions doc/model.schema
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,12 @@
"type": "number"
}
},
"split_type": {
"type": "array",
"items": {
"type": "integer"
}
},
"default_left": {
"type": "array",
"items": {
Expand Down Expand Up @@ -247,6 +253,18 @@
"learner": {
"type": "object",
"properties": {
"feature_names": {
"type": "array",
"items": {
"type": "string"
}
},
"feature_types": {
"type": "array",
"items": {
"type": "string"
}
},
"gradient_booster": {
"oneOf": [
{
Expand Down
42 changes: 42 additions & 0 deletions include/xgboost/c_api.h
Original file line number Diff line number Diff line change
Expand Up @@ -1132,4 +1132,46 @@ XGB_DLL int XGBoosterSetAttr(BoosterHandle handle,
XGB_DLL int XGBoosterGetAttrNames(BoosterHandle handle,
bst_ulong* out_len,
const char*** out);

/*!
* \brief Set string encoded feature info in Booster, similar to the feature
* info in DMatrix.
*
* Accepted fields are:
* - feature_name
* - feature_type
*
* \param handle An instance of Booster
* \param field Field name
* \param features Pointer to array of strings.
* \param size Size of `features` pointer (number of strings passed in).
*
* \return 0 when success, -1 when failure happens
*/
XGB_DLL int XGBoosterSetStrFeatureInfo(BoosterHandle handle, const char *field,
const char **features,
const bst_ulong size);

/*!
* \brief Get string encoded feature info from Booster, similar to feature info
* in DMatrix.
*
* Accepted fields are:
* - feature_name
* - feature_type
*
* Caller is responsible for copying out the data, before next call to any API
* function of XGBoost.
*
* \param handle An instance of Booster
* \param field Field name
* \param len Size of output pointer `out_features` (number of strings returned).
* \param out_features Address of a pointer to array of strings. Result is stored in
* thread local memory.
*
* \return 0 when success, -1 when failure happens
*/
XGB_DLL int XGBoosterGetStrFeatureInfo(BoosterHandle handle, const char *field,
bst_ulong *len,
const char ***out_features);
#endif // XGBOOST_C_API_H_
21 changes: 21 additions & 0 deletions include/xgboost/learner.h
Original file line number Diff line number Diff line change
Expand Up @@ -213,6 +213,27 @@ class Learner : public Model, public Configurable, public dmlc::Serializable {
* \return vector of attribute name strings.
*/
virtual std::vector<std::string> GetAttrNames() const = 0;
/*!
* \brief Set the feature names for current booster.
* \param fn Input feature names
*/
virtual void SetFeatureNames(std::vector<std::string> const& fn) = 0;
/*!
* \brief Get the feature names for current booster.
* \param fn Output feature names
*/
virtual void GetFeatureNames(std::vector<std::string>* fn) const = 0;
/*!
* \brief Set the feature types for current booster.
* \param ft Input feature types.
*/
virtual void SetFeatureTypes(std::vector<std::string> const& ft) = 0;
/*!
* \brief Get the feature types for current booster.
* \param ft Output feature types
*/
virtual void GetFeatureTypes(std::vector<std::string>* ft) const = 0;

/*!
* \return whether the model allow lazy checkpoint in rabit.
*/
Expand Down
113 changes: 86 additions & 27 deletions python-package/xgboost/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ def from_pystr_to_cstr(data: Union[str, List[str]]):
raise TypeError()


def from_cstr_to_pystr(data, length):
def from_cstr_to_pystr(data, length) -> List[str]:
"""Revert C pointer to Python str
Parameters
Expand Down Expand Up @@ -869,7 +869,7 @@ def feature_names(self) -> List[str]:
)
feature_names = from_cstr_to_pystr(sarr, length)
if not feature_names:
feature_names = ["f{0}".format(i) for i in range(self.num_col())]
return None
return feature_names

@feature_names.setter
Expand Down Expand Up @@ -1167,9 +1167,6 @@ class Booster(object):
training, prediction and evaluation.
"""

feature_names = None
feature_types = None

def __init__(self, params=None, cache=(), model_file=None):
# pylint: disable=invalid-name
"""
Expand All @@ -1185,12 +1182,15 @@ def __init__(self, params=None, cache=(), model_file=None):
for d in cache:
if not isinstance(d, DMatrix):
raise TypeError('invalid cache item: {}'.format(type(d).__name__), cache)
self._validate_features(d)

dmats = c_array(ctypes.c_void_p, [d.handle for d in cache])
self.handle = ctypes.c_void_p()
_check_call(_LIB.XGBoosterCreate(dmats, c_bst_ulong(len(cache)),
ctypes.byref(self.handle)))
for d in cache:
# Validate feature only after the feature names are saved into booster.
self._validate_features(d)

params = params or {}
params = self._configure_metrics(params.copy())
if isinstance(params, list):
Expand Down Expand Up @@ -1400,6 +1400,60 @@ def set_attr(self, **kwargs):
_check_call(_LIB.XGBoosterSetAttr(
self.handle, c_str(key), value))

def _get_feature_info(self, field: str):
    """Fetch a string feature-info field of this booster from the native library.

    Parameters
    ----------
    field :
        Field name forwarded to ``XGBoosterGetStrFeatureInfo``.

    Returns
    -------
    The list produced by ``from_cstr_to_pystr``, or ``None`` when the booster
    has no handle yet or the converted result is empty.
    """
    # Nothing can be queried until the native booster handle exists.
    if getattr(self, "handle", None) is None:
        return None

    n_strings = c_bst_ulong()
    c_strings = ctypes.POINTER(ctypes.c_char_p)()
    _check_call(
        _LIB.XGBoosterGetStrFeatureInfo(
            self.handle,
            c_str(field),
            ctypes.byref(n_strings),
            ctypes.byref(c_strings),
        )
    )
    # An empty result is reported as None rather than as an empty list.
    return from_cstr_to_pystr(c_strings, n_strings) or None

@property
def feature_types(self) -> Optional[List[str]]:
    """Feature types for this booster.

    Can be set directly by input data or by assignment.
    """
    field = "feature_type"
    return self._get_feature_info(field)

@property
def feature_names(self) -> Optional[List[str]]:
    """Feature names for this booster.

    Can be set directly by input data or by assignment.
    """
    field = "feature_name"
    return self._get_feature_info(field)

def _set_feature_info(self, features: Optional[List[str]], field: str) -> None:
    """Send a string feature-info field for this booster to the native library.

    Parameters
    ----------
    features :
        List of strings to store. ``None`` results in a call with a null
        pointer and a length of zero.
    field :
        Field name forwarded to ``XGBoosterSetStrFeatureInfo``.
    """
    if features is None:
        # No values: pass a null array together with a zero length.
        _check_call(
            _LIB.XGBoosterSetStrFeatureInfo(
                self.handle, c_str(field), None, c_bst_ulong(0)
            )
        )
        return

    assert isinstance(features, list)
    encoded = [bytes(item, encoding="utf-8") for item in features]
    # Pack the UTF-8 encoded strings into a C array of char pointers.
    c_array_type = ctypes.c_char_p * len(encoded)
    c_values = c_array_type(*encoded)
    _check_call(
        _LIB.XGBoosterSetStrFeatureInfo(
            self.handle, c_str(field), c_values, c_bst_ulong(len(features))
        )
    )

@feature_names.setter
def feature_names(self, features: Optional[List[str]]) -> None:
    """Assign feature names by forwarding them to ``_set_feature_info``."""
    field = "feature_name"
    self._set_feature_info(features, field)

@feature_types.setter
def feature_types(self, features: Optional[List[str]]) -> None:
    """Assign feature types by forwarding them to ``_set_feature_info``."""
    field = "feature_type"
    self._set_feature_info(features, field)

def set_param(self, params, value=None):
"""Set parameters into the Booster.
Expand Down Expand Up @@ -1859,9 +1913,10 @@ def inplace_predict(
def save_model(self, fname):
"""Save the model to a file.
The model is saved in an XGBoost internal format which is universal
among the various XGBoost interfaces. Auxiliary attributes of the
Python Booster object (such as feature_names) will not be saved. See:
The model is saved in an XGBoost internal format which is universal among the
various XGBoost interfaces. Auxiliary attributes of the Python Booster object
(such as feature_names) will not be saved when using binary format. To save those
attributes, use JSON instead. See:
https://xgboost.readthedocs.io/en/latest/tutorials/saving_model.html
Expand Down Expand Up @@ -1898,9 +1953,10 @@ def load_model(self, fname):
"""Load the model from a file or bytearray. Path to file can be local
or as an URI.
The model is loaded from XGBoost format which is universal among the
various XGBoost interfaces. Auxiliary attributes of the Python Booster
object (such as feature_names) will not be loaded. See:
The model is loaded from XGBoost format which is universal among the various
XGBoost interfaces. Auxiliary attributes of the Python Booster object (such as
feature_names) will not be loaded when using binary format. To load those
attributes, use JSON instead. See:
https://xgboost.readthedocs.io/en/latest/tutorials/saving_model.html
Expand Down Expand Up @@ -2249,7 +2305,7 @@ def trees_to_dataframe(self, fmap=''):
# pylint: disable=no-member
return df.sort(['Tree', 'Node']).reset_index(drop=True)

def _validate_features(self, data):
def _validate_features(self, data: DMatrix):
"""
Validate Booster and data's feature_names are identical.
Set feature_names and feature_types from DMatrix
Expand All @@ -2260,24 +2316,27 @@ def _validate_features(self, data):
if self.feature_names is None:
self.feature_names = data.feature_names
self.feature_types = data.feature_types
else:
# Booster can't accept data with different feature names
if self.feature_names != data.feature_names:
dat_missing = set(self.feature_names) - set(data.feature_names)
my_missing = set(data.feature_names) - set(self.feature_names)
if data.feature_names is None and self.feature_names is not None:
raise ValueError(
"training data did not have the following fields: " +
", ".join(self.feature_names)
)
# Booster can't accept data with different feature names
if self.feature_names != data.feature_names:
dat_missing = set(self.feature_names) - set(data.feature_names)
my_missing = set(data.feature_names) - set(self.feature_names)

msg = 'feature_names mismatch: {0} {1}'
msg = 'feature_names mismatch: {0} {1}'

if dat_missing:
msg += ('\nexpected ' + ', '.join(
str(s) for s in dat_missing) + ' in input data')
if dat_missing:
msg += ('\nexpected ' + ', '.join(
str(s) for s in dat_missing) + ' in input data')

if my_missing:
msg += ('\ntraining data did not have the following fields: ' +
', '.join(str(s) for s in my_missing))
if my_missing:
msg += ('\ntraining data did not have the following fields: ' +
', '.join(str(s) for s in my_missing))

raise ValueError(msg.format(self.feature_names,
data.feature_names))
raise ValueError(msg.format(self.feature_names, data.feature_names))

def get_split_value_histogram(self, feature, fmap='', bins=None,
as_pandas=True):
Expand Down
8 changes: 6 additions & 2 deletions python-package/xgboost/sklearn.py
Original file line number Diff line number Diff line change
Expand Up @@ -958,9 +958,13 @@ def feature_importances_(self):
raise AttributeError(
'Feature importance is not defined for Booster type {}'
.format(self.booster))
b = self.get_booster()
b: Booster = self.get_booster()
score = b.get_score(importance_type=self.importance_type)
all_features = [score.get(f, 0.) for f in b.feature_names]
if b.feature_names is None:
feature_names = ["f{0}".format(i) for i in range(self.n_features_in_)]
else:
feature_names = b.feature_names
all_features = [score.get(f, 0.) for f in feature_names]
all_features = np.array(all_features, dtype=np.float32)
total = all_features.sum()
if total == 0:
Expand Down
45 changes: 45 additions & 0 deletions src/c_api/c_api.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1022,5 +1022,50 @@ XGB_DLL int XGBoosterGetAttrNames(BoosterHandle handle,
API_END();
}

// Store string-encoded feature info (feature names or feature types) in the
// Learner behind `handle`.
//
// \param handle   Booster handle; used as a `Learner *` after CHECK_HANDLE().
// \param field    "feature_name" or "feature_type"; anything else is an error.
// \param features Array of `size` C strings. May be null only when `size` is 0,
//                 which stores an empty list.
// \param size     Number of strings in `features`.
//
// Invalid arguments are raised with LOG(FATAL) inside the API_BEGIN()/API_END()
// wrapper, the same way the unknown-field case is reported.
XGB_DLL int XGBoosterSetStrFeatureInfo(BoosterHandle handle, const char *field,
                                       const char **features,
                                       const xgboost::bst_ulong size) {
  API_BEGIN();
  CHECK_HANDLE();
  // Validate raw pointers up front: std::strcmp on a null `field` or reading
  // from a null `features` array would be undefined behavior.
  if (field == nullptr) {
    LOG(FATAL) << "Invalid pointer argument: field";
  }
  if (size > 0 && features == nullptr) {
    LOG(FATAL) << "Invalid pointer argument: features";
  }
  auto *learner = static_cast<Learner *>(handle);
  std::vector<std::string> feature_info;
  feature_info.reserve(size);
  for (xgboost::bst_ulong i = 0; i < size; ++i) {
    // std::string cannot be constructed from a null char pointer.
    if (features[i] == nullptr) {
      LOG(FATAL) << "Invalid pointer argument: features[" << i << "]";
    }
    feature_info.emplace_back(features[i]);
  }
  if (!std::strcmp(field, "feature_name")) {
    learner->SetFeatureNames(feature_info);
  } else if (!std::strcmp(field, "feature_type")) {
    learner->SetFeatureTypes(feature_info);
  } else {
    LOG(FATAL) << "Unknown field for Booster feature info:" << field;
  }
  API_END();
}

// Read string-encoded feature info (feature names or feature types) from the
// Learner behind `handle`.
//
// \param handle       Booster handle; used as a `Learner const *` after
//                     CHECK_HANDLE().
// \param field        "feature_name" or "feature_type"; anything else is an
//                     error.
// \param len          Receives the number of strings returned.
// \param out_features Receives a pointer to an array of C strings. The array
//                     and the strings live in the learner's thread-local
//                     return buffers (`ret_vec_charp` / `ret_vec_str`), which
//                     this function overwrites on every call.
//
// Invalid arguments are raised with LOG(FATAL) inside the API_BEGIN()/API_END()
// wrapper, the same way the unknown-field case is reported.
XGB_DLL int XGBoosterGetStrFeatureInfo(BoosterHandle handle, const char *field,
                                       xgboost::bst_ulong *len,
                                       const char ***out_features) {
  API_BEGIN();
  CHECK_HANDLE();
  // Validate raw pointers up front: std::strcmp on a null `field` or writing
  // through a null output pointer would be undefined behavior.
  if (field == nullptr) {
    LOG(FATAL) << "Invalid pointer argument: field";
  }
  if (len == nullptr) {
    LOG(FATAL) << "Invalid pointer argument: len";
  }
  if (out_features == nullptr) {
    LOG(FATAL) << "Invalid pointer argument: out_features";
  }
  auto const *learner = static_cast<Learner const *>(handle);
  std::vector<const char *> &charp_vecs =
      learner->GetThreadLocal().ret_vec_charp;
  std::vector<std::string> &str_vecs = learner->GetThreadLocal().ret_vec_str;
  if (!std::strcmp(field, "feature_name")) {
    learner->GetFeatureNames(&str_vecs);
  } else if (!std::strcmp(field, "feature_type")) {
    learner->GetFeatureTypes(&str_vecs);
  } else {
    LOG(FATAL) << "Unknown field for Booster feature info:" << field;
  }
  // Expose the strings as a contiguous array of C pointers backed by the
  // thread-local string storage.
  charp_vecs.resize(str_vecs.size());
  for (size_t i = 0; i < str_vecs.size(); ++i) {
    charp_vecs[i] = str_vecs[i].c_str();
  }
  *out_features = dmlc::BeginPtr(charp_vecs);
  *len = static_cast<xgboost::bst_ulong>(charp_vecs.size());
  API_END();
}

// force link rabit
static DMLC_ATTRIBUTE_UNUSED int XGBOOST_LINK_RABIT_C_API_ = RabitLinkTag();
Loading

0 comments on commit 9da2287

Please sign in to comment.