Skip to content

Commit

Permalink
Cleanup set info. (#10139)
Browse files Browse the repository at this point in the history
- Use the array interface internally.
- Deprecate `XGDMatrixSetDenseInfo`.
- Deprecate `XGDMatrixSetUIntInfo`.
- Move the handling of `DataType` into the deprecated C function.

---------

Co-authored-by: Philip Hyunsu Cho <chohyu01@cs.washington.edu>
  • Loading branch information
trivialfis and hcho3 authored Mar 26, 2024
1 parent 6a7c6a8 commit 230010d
Show file tree
Hide file tree
Showing 37 changed files with 245 additions and 267 deletions.
8 changes: 4 additions & 4 deletions .github/workflows/r_tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ jobs:
name: Test R package on Debian
runs-on: ubuntu-latest
container:
image: rhub/debian-gcc-devel
image: rhub/debian-gcc-release

steps:
- name: Install system dependencies
Expand All @@ -130,12 +130,12 @@ jobs:
- name: Install dependencies
shell: bash -l {0}
run: |
/tmp/R-devel/bin/Rscript -e "source('./R-package/tests/helper_scripts/install_deps.R')"
Rscript -e "source('./R-package/tests/helper_scripts/install_deps.R')"
- name: Test R
shell: bash -l {0}
run: |
python3 tests/ci_build/test_r_package.py --r=/tmp/R-devel/bin/R --build-tool=autotools --task=check
python3 tests/ci_build/test_r_package.py --r=/usr/bin/R --build-tool=autotools --task=check
- uses: dorny/paths-filter@v2
id: changes
Expand All @@ -147,4 +147,4 @@ jobs:
- name: Run document check
if: steps.changes.outputs.r_package == 'true'
run: |
python3 tests/ci_build/test_r_package.py --r=/tmp/R-devel/bin/R --task=doc
python3 tests/ci_build/test_r_package.py --r=/usr/bin/R --task=doc
60 changes: 12 additions & 48 deletions include/xgboost/c_api.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/**
* Copyright 2015~2023 by XGBoost Contributors
* Copyright 2015-2024, XGBoost Contributors
* \file c_api.h
* \author Tianqi Chen
* \brief C API of XGBoost, used for interfacing to other languages.
Expand Down Expand Up @@ -639,21 +639,14 @@ XGB_DLL int XGDMatrixSetInfoFromInterface(DMatrixHandle handle,
* \param len length of array
* \return 0 when success, -1 when failure happens
*/
XGB_DLL int XGDMatrixSetFloatInfo(DMatrixHandle handle,
const char *field,
const float *array,
XGB_DLL int XGDMatrixSetFloatInfo(DMatrixHandle handle, const char *field, const float *array,
bst_ulong len);
/*!
* \brief set uint32 vector to a content in info
* \param handle a instance of data matrix
* \param field field name
* \param array pointer to unsigned int vector
* \param len length of array
* \return 0 when success, -1 when failure happens
/**
* @deprecated since 2.1.0
*
* Use @ref XGDMatrixSetInfoFromInterface instead.
*/
XGB_DLL int XGDMatrixSetUIntInfo(DMatrixHandle handle,
const char *field,
const unsigned *array,
XGB_DLL int XGDMatrixSetUIntInfo(DMatrixHandle handle, const char *field, const unsigned *array,
bst_ulong len);

/*!
Expand Down Expand Up @@ -725,42 +718,13 @@ XGB_DLL int XGDMatrixGetStrFeatureInfo(DMatrixHandle handle, const char *field,
bst_ulong *size,
const char ***out_features);

/*!
* \brief Set meta info from dense matrix. Valid field names are:
*
* - label
* - weight
* - base_margin
* - group
* - label_lower_bound
* - label_upper_bound
* - feature_weights
/**
* @deprecated since 2.1.0
*
* \param handle An instance of data matrix
* \param field Field name
* \param data Pointer to consecutive memory storing data.
* \param size Size of the data, this is relative to size of type. (Meaning NOT number
* of bytes.)
* \param type Indicator of data type. This is defined in xgboost::DataType enum class.
* - float = 1
* - double = 2
* - uint32_t = 3
* - uint64_t = 4
* \return 0 when success, -1 when failure happens
*/
XGB_DLL int XGDMatrixSetDenseInfo(DMatrixHandle handle, const char *field,
void const *data, bst_ulong size, int type);

/*!
* \brief (deprecated) Use XGDMatrixSetUIntInfo instead. Set group of the training matrix
* \param handle a instance of data matrix
* \param group pointer to group size
* \param len length of array
* \return 0 when success, -1 when failure happens
* Use @ref XGDMatrixSetInfoFromInterface instead.
*/
XGB_DLL int XGDMatrixSetGroup(DMatrixHandle handle,
const unsigned *group,
bst_ulong len);
XGB_DLL int XGDMatrixSetDenseInfo(DMatrixHandle handle, const char *field, void const *data,
bst_ulong size, int type);

/*!
* \brief get float info vector from matrix.
Expand Down
13 changes: 0 additions & 13 deletions include/xgboost/data.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
#include <algorithm>
#include <limits>
#include <memory>
#include <numeric>
#include <string>
#include <utility>
#include <vector>
Expand Down Expand Up @@ -137,14 +136,6 @@ class MetaInfo {
* \param fo The output stream.
*/
void SaveBinary(dmlc::Stream* fo) const;
/*!
* \brief Set information in the meta info.
* \param key The key of the information.
* \param dptr The data pointer of the source array.
* \param dtype The type of the source data.
* \param num Number of elements in the source array.
*/
void SetInfo(Context const& ctx, const char* key, const void* dptr, DataType dtype, size_t num);
/*!
* \brief Set information in the meta info with array interface.
* \param key The key of the information.
Expand Down Expand Up @@ -517,10 +508,6 @@ class DMatrix {
DMatrix() = default;
/*! \brief meta information of the dataset */
virtual MetaInfo& Info() = 0;
virtual void SetInfo(const char* key, const void* dptr, DataType dtype, size_t num) {
auto const& ctx = *this->Ctx();
this->Info().SetInfo(ctx, key, dptr, dtype, num);
}
virtual void SetInfo(const char* key, std::string const& interface_str) {
auto const& ctx = *this->Ctx();
this->Info().SetInfo(ctx, key, StringView{interface_str});
Expand Down
15 changes: 12 additions & 3 deletions include/xgboost/linalg.h
Original file line number Diff line number Diff line change
Expand Up @@ -190,13 +190,14 @@ constexpr auto ArrToTuple(T (&arr)[N]) {
// uint division optimization inspired by the CIndexer in cupy. Division operation is
// slow on both CPU and GPU, especially 64 bit integer. So here we first try to avoid 64
// bit when the index is smaller, then try to avoid division when it's exp of 2.
template <typename I, int32_t D>
template <typename I, std::int32_t D>
LINALG_HD auto UnravelImpl(I idx, common::Span<size_t const, D> shape) {
size_t index[D]{0};
std::size_t index[D]{0};
static_assert(std::is_signed<decltype(D)>::value,
"Don't change the type without changing the for loop.");
auto const sptr = shape.data();
for (int32_t dim = D; --dim > 0;) {
auto s = static_cast<std::remove_const_t<std::remove_reference_t<I>>>(shape[dim]);
auto s = static_cast<std::remove_const_t<std::remove_reference_t<I>>>(sptr[dim]);
if (s & (s - 1)) {
auto t = idx / s;
index[dim] = idx - t * s;
Expand Down Expand Up @@ -745,6 +746,14 @@ auto ArrayInterfaceStr(TensorView<T, D> const &t) {
return str;
}

template <typename T>
auto Make1dInterface(T const *vec, std::size_t len) {
Context ctx;
auto t = linalg::MakeTensorView(&ctx, common::Span{vec, len}, len);
auto str = linalg::ArrayInterfaceStr(t);
return str;
}

/**
* \brief A tensor storage. To use it for other functionality like slicing one needs to
* obtain a view first. This way we can use it on both host and device.
Expand Down
10 changes: 4 additions & 6 deletions include/xgboost/span.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,8 @@
#define XGBOOST_SPAN_H_

#include <xgboost/base.h>
#include <xgboost/logging.h>

#include <cinttypes> // size_t
#include <cstddef> // size_t
#include <cstdio>
#include <iterator>
#include <limits> // numeric_limits
Expand Down Expand Up @@ -73,8 +72,7 @@

#endif // defined(_MSC_VER) && _MSC_VER < 1910

namespace xgboost {
namespace common {
namespace xgboost::common {

#if defined(__CUDA_ARCH__)
// Usual logging facility is not available inside device code.
Expand Down Expand Up @@ -707,8 +705,8 @@ class IterSpan {
return it_ + size();
}
};
} // namespace common
} // namespace xgboost
} // namespace xgboost::common


#if defined(_MSC_VER) &&_MSC_VER < 1910
#undef constexpr
Expand Down
10 changes: 6 additions & 4 deletions jvm-packages/xgboost4j/src/native/xgboost4j.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -408,7 +408,8 @@ JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGDMatrixSetFloatI

jfloat* array = jenv->GetFloatArrayElements(jarray, NULL);
bst_ulong len = (bst_ulong)jenv->GetArrayLength(jarray);
int ret = XGDMatrixSetFloatInfo(handle, field, (float const *)array, len);
auto str = xgboost::linalg::Make1dInterface(array, len);
int ret = XGDMatrixSetInfoFromInterface(handle, field, str.c_str());
JVM_CHECK_CALL(ret);
//release
if (field) jenv->ReleaseStringUTFChars(jfield, field);
Expand All @@ -427,7 +428,8 @@ JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGDMatrixSetUIntIn
const char* field = jenv->GetStringUTFChars(jfield, 0);
jint* array = jenv->GetIntArrayElements(jarray, NULL);
bst_ulong len = (bst_ulong)jenv->GetArrayLength(jarray);
int ret = XGDMatrixSetUIntInfo(handle, (char const *)field, (unsigned int const *)array, len);
auto str = xgboost::linalg::Make1dInterface(array, len);
int ret = XGDMatrixSetInfoFromInterface(handle, field, str.c_str());
JVM_CHECK_CALL(ret);
//release
if (field) jenv->ReleaseStringUTFChars(jfield, (const char *)field);
Expand Down Expand Up @@ -730,8 +732,8 @@ JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGBoosterPredictFr
if (jmargin) {
margin = jenv->GetFloatArrayElements(jmargin, nullptr);
JVM_CHECK_CALL(XGProxyDMatrixCreate(&proxy));
JVM_CHECK_CALL(
XGDMatrixSetFloatInfo(proxy, "base_margin", margin, jenv->GetArrayLength(jmargin)));
auto str = xgboost::linalg::Make1dInterface(margin, jenv->GetArrayLength(jmargin));
JVM_CHECK_CALL(XGDMatrixSetInfoFromInterface(proxy, "base_margin", str.c_str()));
}

bst_ulong const *out_shape;
Expand Down
62 changes: 48 additions & 14 deletions src/c_api/c_api.cc
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/**
* Copyright 2014-2024 by XGBoost Contributors
* Copyright 2014-2024, XGBoost Contributors
*/
#include "xgboost/c_api.h"

Expand Down Expand Up @@ -614,8 +614,8 @@ XGB_DLL int XGDMatrixSetFloatInfo(DMatrixHandle handle, const char *field, const
API_BEGIN();
CHECK_HANDLE();
xgboost_CHECK_C_ARG_PTR(field);
auto const& p_fmat = *static_cast<std::shared_ptr<DMatrix> *>(handle);
p_fmat->SetInfo(field, info, xgboost::DataType::kFloat32, len);
auto const &p_fmat = *static_cast<std::shared_ptr<DMatrix> *>(handle);
p_fmat->SetInfo(field, linalg::Make1dInterface(info, len));
API_END();
}

Expand All @@ -634,8 +634,9 @@ XGB_DLL int XGDMatrixSetUIntInfo(DMatrixHandle handle, const char *field, const
API_BEGIN();
CHECK_HANDLE();
xgboost_CHECK_C_ARG_PTR(field);
LOG(WARNING) << error::DeprecatedFunc(__func__, "2.1.0", "XGDMatrixSetInfoFromInterface");
auto const &p_fmat = *static_cast<std::shared_ptr<DMatrix> *>(handle);
p_fmat->SetInfo(field, info, xgboost::DataType::kUInt32, len);
p_fmat->SetInfo(field, linalg::Make1dInterface(info, len));
API_END();
}

Expand Down Expand Up @@ -679,19 +680,52 @@ XGB_DLL int XGDMatrixSetDenseInfo(DMatrixHandle handle, const char *field, void
xgboost::bst_ulong size, int type) {
API_BEGIN();
CHECK_HANDLE();
LOG(WARNING) << error::DeprecatedFunc(__func__, "2.1.0", "XGDMatrixSetInfoFromInterface");
auto const &p_fmat = *static_cast<std::shared_ptr<DMatrix> *>(handle);
CHECK(type >= 1 && type <= 4);
xgboost_CHECK_C_ARG_PTR(field);
p_fmat->SetInfo(field, data, static_cast<DataType>(type), size);
API_END();
}

XGB_DLL int XGDMatrixSetGroup(DMatrixHandle handle, const unsigned *group, xgboost::bst_ulong len) {
API_BEGIN();
CHECK_HANDLE();
LOG(WARNING) << "XGDMatrixSetGroup is deprecated, use `XGDMatrixSetUIntInfo` instead.";
auto const &p_fmat = *static_cast<std::shared_ptr<DMatrix> *>(handle);
p_fmat->SetInfo("group", group, xgboost::DataType::kUInt32, len);
Context ctx;
auto dtype = static_cast<DataType>(type);
std::string str;
auto proc = [&](auto cast_d_ptr) {
using T = std::remove_pointer_t<decltype(cast_d_ptr)>;
auto t = linalg::TensorView<T, 1>(
common::Span<T>{cast_d_ptr, static_cast<typename common::Span<T>::index_type>(size)},
{size}, DeviceOrd::CPU());
CHECK(t.CContiguous());
Json interface{linalg::ArrayInterface(t)};
CHECK(ArrayInterface<1>{interface}.is_contiguous);
str = Json::Dump(interface);
return str;
};

// Legacy code using XGBoost dtype, which is a small subset of array interface types.
switch (dtype) {
case xgboost::DataType::kFloat32: {
auto cast_ptr = reinterpret_cast<const float *>(data);
p_fmat->Info().SetInfo(ctx, field, proc(cast_ptr));
break;
}
case xgboost::DataType::kDouble: {
auto cast_ptr = reinterpret_cast<const double *>(data);
p_fmat->Info().SetInfo(ctx, field, proc(cast_ptr));
break;
}
case xgboost::DataType::kUInt32: {
auto cast_ptr = reinterpret_cast<const uint32_t *>(data);
p_fmat->Info().SetInfo(ctx, field, proc(cast_ptr));
break;
}
case xgboost::DataType::kUInt64: {
auto cast_ptr = reinterpret_cast<const uint64_t *>(data);
p_fmat->Info().SetInfo(ctx, field, proc(cast_ptr));
break;
}
default:
LOG(FATAL) << "Unknown data type" << static_cast<uint8_t>(dtype);
}

API_END();
}

Expand Down Expand Up @@ -987,7 +1021,7 @@ XGB_DLL int XGBoosterBoostOneIter(BoosterHandle handle, DMatrixHandle dtrain, bs
bst_float *hess, xgboost::bst_ulong len) {
API_BEGIN();
CHECK_HANDLE();
error::DeprecatedFunc(__func__, "2.1.0", "XGBoosterTrainOneIter");
LOG(WARNING) << error::DeprecatedFunc(__func__, "2.1.0", "XGBoosterTrainOneIter");
auto *learner = static_cast<Learner *>(handle);
auto ctx = learner->Ctx()->MakeCPU();

Expand Down
19 changes: 10 additions & 9 deletions src/c_api/c_api_utils.h
Original file line number Diff line number Diff line change
@@ -1,17 +1,18 @@
/**
* Copyright 2021-2023, XGBoost Contributors
* Copyright 2021-2024, XGBoost Contributors
*/
#ifndef XGBOOST_C_API_C_API_UTILS_H_
#define XGBOOST_C_API_C_API_UTILS_H_

#include <algorithm>
#include <cstddef>
#include <functional>
#include <memory> // for shared_ptr
#include <string> // for string
#include <tuple> // for make_tuple
#include <utility> // for move
#include <vector>
#include <algorithm> // for min
#include <cstddef> // for size_t
#include <functional> // for multiplies
#include <memory> // for shared_ptr
#include <numeric> // for accumulate
#include <string> // for string
#include <tuple> // for make_tuple
#include <utility> // for move
#include <vector> // for vector

#include "../common/json_utils.h" // for TypeCheck
#include "xgboost/c_api.h"
Expand Down
2 changes: 2 additions & 0 deletions src/collective/nccl_device_communicator.cu
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
* Copyright 2023 XGBoost contributors
*/
#if defined(XGBOOST_USE_NCCL)
#include <numeric> // for accumulate

#include "comm.cuh"
#include "nccl_device_communicator.cuh"

Expand Down
2 changes: 1 addition & 1 deletion src/common/error_msg.cc
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
#include "xgboost/logging.h"

namespace xgboost::error {
std::string DeprecatedFunc(StringView old, StringView since, StringView replacement) {
[[nodiscard]] std::string DeprecatedFunc(StringView old, StringView since, StringView replacement) {
std::stringstream ss;
ss << "`" << old << "` is deprecated since" << since << ", use `" << replacement << "` instead.";
return ss.str();
Expand Down
2 changes: 1 addition & 1 deletion src/common/error_msg.h
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ void WarnDeprecatedGPUId();

void WarnEmptyDataset();

std::string DeprecatedFunc(StringView old, StringView since, StringView replacement);
[[nodiscard]] std::string DeprecatedFunc(StringView old, StringView since, StringView replacement);

constexpr StringView InvalidCUDAOrdinal() {
return "Invalid device. `device` is required to be CUDA and there must be at least one GPU "
Expand Down
Loading

0 comments on commit 230010d

Please sign in to comment.