Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Handle special characters in JSON model dump. #9474

Merged
merged 1 commit into from
Aug 14, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
63 changes: 46 additions & 17 deletions src/common/common.cc
Original file line number Diff line number Diff line change
@@ -1,16 +1,17 @@
/*!
* Copyright 2015-2019 by Contributors
* \file common.cc
* \brief Enable all kinds of global variables in common.
/**
* Copyright 2015-2023 by Contributors
*/
#include <dmlc/thread_local.h>
#include <xgboost/logging.h>

#include "common.h"
#include "./random.h"

namespace xgboost {
namespace common {
#include <dmlc/thread_local.h> // for ThreadLocalStore

#include <cstdint> // for uint8_t
#include <cstdio> // for snprintf, size_t
#include <string> // for string

#include "./random.h" // for GlobalRandomEngine, GlobalRandom

namespace xgboost::common {
/*! \brief thread local entry for random. */
struct RandomThreadLocalEntry {
/*! \brief the random engine instance. */
Expand All @@ -19,15 +20,43 @@ struct RandomThreadLocalEntry {

using RandomThreadLocalStore = dmlc::ThreadLocalStore<RandomThreadLocalEntry>;

GlobalRandomEngine& GlobalRandom() {
return RandomThreadLocalStore::Get()->engine;
GlobalRandomEngine &GlobalRandom() { return RandomThreadLocalStore::Get()->engine; }

void EscapeU8(std::string const &string, std::string *p_buffer) {
auto &buffer = *p_buffer;
for (size_t i = 0; i < string.length(); i++) {
const auto ch = string[i];
if (ch == '\\') {
if (i < string.size() && string[i + 1] == 'u') {
buffer += "\\";
} else {
buffer += "\\\\";
}
} else if (ch == '"') {
buffer += "\\\"";
} else if (ch == '\b') {
buffer += "\\b";
} else if (ch == '\f') {
buffer += "\\f";
} else if (ch == '\n') {
buffer += "\\n";
} else if (ch == '\r') {
buffer += "\\r";
} else if (ch == '\t') {
buffer += "\\t";
} else if (static_cast<uint8_t>(ch) <= 0x1f) {
// Unit separator
char buf[8];
snprintf(buf, sizeof buf, "\\u%04x", ch);
buffer += buf;
} else {
buffer += ch;
}
}
}

#if !defined(XGBOOST_USE_CUDA)
int AllVisibleGPUs() {
return 0;
}
int AllVisibleGPUs() { return 0; }
#endif // !defined(XGBOOST_USE_CUDA)

} // namespace common
} // namespace xgboost
} // namespace xgboost::common
43 changes: 17 additions & 26 deletions src/common/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,20 +6,19 @@
#ifndef XGBOOST_COMMON_COMMON_H_
#define XGBOOST_COMMON_COMMON_H_

#include <xgboost/base.h>
#include <xgboost/logging.h>
#include <xgboost/span.h>

#include <algorithm>
#include <exception>
#include <functional>
#include <limits>
#include <numeric>
#include <sstream>
#include <string>
#include <type_traits>
#include <utility>
#include <vector>
#include <algorithm> // for max
#include <array> // for array
#include <cmath> // for ceil
#include <cstddef> // for size_t
#include <cstdint> // for int32_t, int64_t
#include <sstream> // for basic_istream, operator<<, istringstream
#include <string> // for string, basic_string, getline, char_traits
#include <tuple> // for make_tuple
#include <utility> // for forward, index_sequence, make_index_sequence
#include <vector> // for vector

#include "xgboost/base.h" // for XGBOOST_DEVICE
#include "xgboost/logging.h" // for LOG, LOG_FATAL, LogMessageFatal

#if defined(__CUDACC__)
#include <thrust/system/cuda/error.h>
Expand Down Expand Up @@ -52,8 +51,7 @@ inline cudaError_t ThrowOnCudaError(cudaError_t code, const char *file,
#endif // defined(__CUDACC__)
} // namespace dh

namespace xgboost {
namespace common {
namespace xgboost::common {
/*!
* \brief Split a string by delimiter
* \param s String to be split.
Expand All @@ -69,19 +67,13 @@ inline std::vector<std::string> Split(const std::string& s, char delim) {
return ret;
}

void EscapeU8(std::string const &string, std::string *p_buffer);

template <typename T>
XGBOOST_DEVICE T Max(T a, T b) {
return a < b ? b : a;
}

// simple routine to convert any data to string
template<typename T>
inline std::string ToString(const T& data) {
std::ostringstream os;
os << data;
return os.str();
}

template <typename T1, typename T2>
XGBOOST_DEVICE T1 DivRoundUp(const T1 a, const T2 b) {
return static_cast<T1>(std::ceil(static_cast<double>(a) / b));
Expand Down Expand Up @@ -195,6 +187,5 @@ template <typename Indexable>
XGBOOST_DEVICE size_t LastOf(size_t group, Indexable const &indptr) {
return indptr[group + 1] - 1;
}
} // namespace common
} // namespace xgboost
} // namespace xgboost::common
#endif // XGBOOST_COMMON_COMMON_H_
92 changes: 35 additions & 57 deletions src/common/json.cc
Original file line number Diff line number Diff line change
@@ -1,23 +1,29 @@
/*!
* Copyright (c) by Contributors 2019-2022
/**
* Copyright 2019-2023, XGBoost Contributors
*/
#include "xgboost/json.h"

#include <dmlc/endian.h>

#include <cctype>
#include <cmath>
#include <cstddef>
#include <iterator>
#include <limits>
#include <sstream>

#include "./math.h"
#include "charconv.h"
#include "xgboost/base.h"
#include "xgboost/json_io.h"
#include "xgboost/logging.h"
#include "xgboost/string_view.h"
#include <array> // for array
#include <cctype> // for isdigit
#include <cmath> // for isinf, isnan
#include <cstdio> // for EOF
#include <cstdlib> // for size_t, strtof
#include <cstring> // for memcpy
#include <initializer_list> // for initializer_list
#include <iterator> // for distance
#include <limits> // for numeric_limits
#include <memory> // for allocator
#include <sstream> // for operator<<, basic_ostream, operator&, ios, stringstream
#include <system_error> // for errc

#include "./math.h" // for CheckNAN
#include "charconv.h" // for to_chars, NumericLimits, from_chars, to_chars_result
#include "common.h" // for EscapeU8
#include "xgboost/base.h" // for XGBOOST_EXPECT
#include "xgboost/intrusive_ptr.h" // for IntrusivePtr
#include "xgboost/json_io.h" // for JsonReader, UBJReader, UBJWriter, JsonWriter, ToBigEn...
#include "xgboost/logging.h" // for LOG, LOG_FATAL, LogMessageFatal, LogCheck_NE, CHECK
#include "xgboost/string_view.h" // for StringView, operator<<

namespace xgboost {

Expand Down Expand Up @@ -57,12 +63,12 @@ void JsonWriter::Visit(JsonObject const* obj) {
}

void JsonWriter::Visit(JsonNumber const* num) {
char number[NumericLimits<float>::kToCharsSize];
auto res = to_chars(number, number + sizeof(number), num->GetNumber());
std::array<char, NumericLimits<float>::kToCharsSize> number;
auto res = to_chars(number.data(), number.data() + number.size(), num->GetNumber());
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it possible to use std::to_chars, now that we are using C++17? Not blocking this release, let's consider using std::to_chars in the future.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The implementation is unspecified by the standard. MSVC can produce different result than GCC.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Even for integers?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not for interger. I will do some cleanups after 2.0

auto end = res.ptr;
auto ori_size = stream_->size();
stream_->resize(stream_->size() + end - number);
std::memcpy(stream_->data() + ori_size, number, end - number);
stream_->resize(stream_->size() + end - number.data());
std::memcpy(stream_->data() + ori_size, number.data(), end - number.data());
}

void JsonWriter::Visit(JsonInteger const* num) {
Expand All @@ -88,43 +94,15 @@ void JsonWriter::Visit(JsonNull const* ) {
}

void JsonWriter::Visit(JsonString const* str) {
std::string buffer;
buffer += '"';
auto const& string = str->GetString();
for (size_t i = 0; i < string.length(); i++) {
const char ch = string[i];
if (ch == '\\') {
if (i < string.size() && string[i+1] == 'u') {
buffer += "\\";
} else {
buffer += "\\\\";
}
} else if (ch == '"') {
buffer += "\\\"";
} else if (ch == '\b') {
buffer += "\\b";
} else if (ch == '\f') {
buffer += "\\f";
} else if (ch == '\n') {
buffer += "\\n";
} else if (ch == '\r') {
buffer += "\\r";
} else if (ch == '\t') {
buffer += "\\t";
} else if (static_cast<uint8_t>(ch) <= 0x1f) {
// Unit separator
char buf[8];
snprintf(buf, sizeof buf, "\\u%04x", ch);
buffer += buf;
} else {
buffer += ch;
}
}
buffer += '"';
std::string buffer;
buffer += '"';
auto const& string = str->GetString();
common::EscapeU8(string, &buffer);
buffer += '"';

auto s = stream_->size();
stream_->resize(s + buffer.size());
std::memcpy(stream_->data() + s, buffer.data(), buffer.size());
auto s = stream_->size();
stream_->resize(s + buffer.size());
std::memcpy(stream_->data() + s, buffer.data(), buffer.size());
}

void JsonWriter::Visit(JsonBoolean const* boolean) {
Expand Down
1 change: 1 addition & 0 deletions src/common/numeric.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
#include <cstddef> // for size_t
#include <cstdint> // for int32_t
#include <iterator> // for iterator_traits
#include <numeric> // for accumulate
#include <vector>

#include "common.h" // AssertGPUSupport
Expand Down
4 changes: 2 additions & 2 deletions src/learner.cc
Original file line number Diff line number Diff line change
Expand Up @@ -797,7 +797,7 @@ class LearnerConfiguration : public Learner {
bool has_nc {cfg_.find("num_class") != cfg_.cend()};
// Inject num_class into configuration.
// FIXME(jiamingy): Remove the duplicated parameter in softmax
cfg_["num_class"] = common::ToString(mparam_.num_class);
cfg_["num_class"] = std::to_string(mparam_.num_class);
auto& args = *p_args;
args = {cfg_.cbegin(), cfg_.cend()}; // renew
obj_->Configure(args);
Expand Down Expand Up @@ -1076,7 +1076,7 @@ class LearnerIO : public LearnerConfiguration {
mparam_.major_version = std::get<0>(Version::Self());
mparam_.minor_version = std::get<1>(Version::Self());

cfg_["num_feature"] = common::ToString(mparam_.num_feature);
cfg_["num_feature"] = std::to_string(mparam_.num_feature);

auto n = tparam_.__DICT__();
cfg_.insert(n.cbegin(), n.cend());
Expand Down
11 changes: 8 additions & 3 deletions src/tree/tree_model.cc
Original file line number Diff line number Diff line change
Expand Up @@ -398,11 +398,14 @@ class JsonGenerator : public TreeGenerator {
static std::string const kIndicatorTemplate =
R"ID( "nodeid": {nid}, "depth": {depth}, "split": "{fname}", "yes": {yes}, "no": {no})ID";
auto split_index = tree[nid].SplitIndex();
auto fname = fmap_.Name(split_index);
std::string qfname; // quoted
common::EscapeU8(fname, &qfname);
auto result = SuperT::Match(
kIndicatorTemplate,
{{"{nid}", std::to_string(nid)},
{"{depth}", std::to_string(depth)},
{"{fname}", fmap_.Name(split_index)},
{"{fname}", qfname},
{"{yes}", std::to_string(nyes)},
{"{no}", std::to_string(tree[nid].DefaultChild())}});
return result;
Expand Down Expand Up @@ -430,12 +433,14 @@ class JsonGenerator : public TreeGenerator {
std::string const &template_str, std::string cond,
uint32_t depth) const {
auto split_index = tree[nid].SplitIndex();
auto fname = split_index < fmap_.Size() ? fmap_.Name(split_index) : std::to_string(split_index);
std::string qfname; // quoted
common::EscapeU8(fname, &qfname);
std::string const result = SuperT::Match(
template_str,
{{"{nid}", std::to_string(nid)},
{"{depth}", std::to_string(depth)},
{"{fname}", split_index < fmap_.Size() ? fmap_.Name(split_index) :
std::to_string(split_index)},
{"{fname}", qfname},
{"{cond}", cond},
{"{left}", std::to_string(tree[nid].LeftChild())},
{"{right}", std::to_string(tree[nid].RightChild())},
Expand Down
20 changes: 20 additions & 0 deletions tests/python/test_basic_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -439,6 +439,26 @@ def validate_model(parameters):
'objective': 'multi:softmax'}
validate_model(parameters)

def test_special_model_dump_characters(self):
params = {"objective": "reg:squarederror", "max_depth": 3}
feature_names = ['"feature 0"', "\tfeature\n1", "feature 2"]
X, y, w = tm.make_regression(n_samples=128, n_features=3, use_cupy=False)
Xy = xgb.DMatrix(X, label=y, feature_names=feature_names)
booster = xgb.train(params, Xy, num_boost_round=3)
json_dump = booster.get_dump(dump_format="json")
assert len(json_dump) == 3

def validate(obj: dict) -> None:
for k, v in obj.items():
if k == "split":
assert v in feature_names
elif isinstance(v, dict):
validate(v)

for j_tree in json_dump:
loaded = json.loads(j_tree)
validate(loaded)

def test_categorical_model_io(self):
X, y = tm.make_categorical(256, 16, 71, False)
Xy = xgb.DMatrix(X, y, enable_categorical=True)
Expand Down