From e29396cef615d6236dbe5227445c76726f4ab3af Mon Sep 17 00:00:00 2001
From: jiamingy
Date: Sun, 20 Aug 2023 04:34:36 +0800
Subject: [PATCH 1/5] [WIP] Drop support for loading remote files.

---
 CMakeLists.txt                       |  4 --
 doc/jvm/xgboost4j_spark_tutorial.rst | 33 -----------------
 python-package/xgboost/core.py       |  6 +--
 src/common/io.cc                     | 55 ++++++++++------------------
 src/common/io.h                      | 12 +++---
 tests/cpp/common/test_io.cc          | 14 +++----
 6 files changed, 31 insertions(+), 93 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3bffa6b07dcd..89151a2d5350 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -71,10 +71,6 @@
 option(USE_NCCL "Build with NCCL to enable distributed GPU support." OFF)
 option(BUILD_WITH_SHARED_NCCL "Build with shared NCCL library." OFF)
 set(GPU_COMPUTE_VER "" CACHE STRING "Semicolon separated list of compute versions to be built against, e.g. '35;61'")
-## Copied From dmlc
-option(USE_HDFS "Build with HDFS support" OFF)
-option(USE_AZURE "Build with AZURE support" OFF)
-option(USE_S3 "Build with S3 support" OFF)
 ## Sanitizers
 option(USE_SANITIZER "Use santizer flags" OFF)
 option(SANITIZER_PATH "Path to sanitizes.")
diff --git a/doc/jvm/xgboost4j_spark_tutorial.rst b/doc/jvm/xgboost4j_spark_tutorial.rst
index 1cf3ba2c8272..90859dfba2d9 100644
--- a/doc/jvm/xgboost4j_spark_tutorial.rst
+++ b/doc/jvm/xgboost4j_spark_tutorial.rst
@@ -390,39 +390,6 @@ Then we can load this model with single node Python XGBoost:
   bst = xgb.Booster({'nthread': 4})
   bst.load_model(nativeModelPath)
 
-.. note:: Using HDFS and S3 for exporting the models with nativeBooster.saveModel()
-
-  When interacting with other language bindings, XGBoost also supports saving-models-to and loading-models-from file systems other than the local one. You can use HDFS and S3 by prefixing the path with ``hdfs://`` and ``s3://`` respectively. However, for this capability, you must do **one** of the following:
-
-  1. Build XGBoost4J-Spark with the steps described in :ref:`here `, but turning `USE_HDFS `_ (or USE_S3, etc. in the same place) switch on. With this approach, you can reuse the above code example by replacing "nativeModelPath" with a HDFS path.
-
-    - However, if you build with USE_HDFS, etc. you have to ensure that the involved shared object file, e.g. libhdfs.so, is put in the LIBRARY_PATH of your cluster. To avoid the complicated cluster environment configuration, choose the other option.
-
-  2. Use bindings of HDFS, S3, etc. to pass model files around. Here are the steps (taking HDFS as an example):
-
-    - Create a new file with
-
-      .. code-block:: scala
-
-        val outputStream = fs.create("hdfs_path")
-
-      where "fs" is an instance of `org.apache.hadoop.fs.FileSystem `_ class in Hadoop.
-
-    - Pass the returned OutputStream in the first step to nativeBooster.saveModel():
-
-      .. code-block:: scala
-
-        xgbClassificationModel.nativeBooster.saveModel(outputStream)
-
-    - Download file in other languages from HDFS and load with the pre-built (without the requirement of libhdfs.so) version of XGBoost. (The function "download_from_hdfs" is a helper function to be implemented by the user)
-
-      .. code-block:: python
-
-        import xgboost as xgb
-        bst = xgb.Booster({'nthread': 4})
-        local_path = download_from_hdfs("hdfs_path")
-        bst.load_model(local_path)
-
 .. note:: Consistency issue between XGBoost4J-Spark and other bindings
 
   There is a consistency issue between XGBoost4J-Spark and other language bindings of XGBoost.
diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py
index 27d62cdf2ee2..d59d2f1d17f1 100644
--- a/python-package/xgboost/core.py
+++ b/python-package/xgboost/core.py
@@ -505,8 +505,7 @@ class DataIter(ABC):  # pylint: disable=too-many-instance-attributes
     Parameters
     ----------
     cache_prefix :
-        Prefix to the cache files, only used in external memory. It can be either an
-        URI or a file path.
+        Prefix to the cache files, only used in external memory.
     release_data :
         Whether the iterator should release the data during reset. Set it to True if
         the data transformation (converting data to np.float32 type) is expensive.
@@ -2558,8 +2557,7 @@ def save_raw(self, raw_format: str = "deprecated") -> bytearray:
         return ctypes2buffer(cptr, length.value)
 
     def load_model(self, fname: ModelIn) -> None:
-        """Load the model from a file or bytearray. Path to file can be local
-        or as an URI.
+        """Load the model from a file or a bytearray.
 
         The model is loaded from XGBoost format which is universal among the various
         XGBoost interfaces. Auxiliary attributes of the Python Booster object (such as
diff --git a/src/common/io.cc b/src/common/io.cc
index 8dbeba935689..8d0d4acc0499 100644
--- a/src/common/io.cc
+++ b/src/common/io.cc
@@ -139,7 +139,7 @@ auto SystemErrorMsg() {
   }
 }  // anonymous namespace
 
-std::string LoadSequentialFile(std::string uri, bool stream) {
+std::string LoadSequentialFile(std::string uri) {
   auto OpenErr = [&uri]() {
     std::string msg;
     msg = "Opening " + uri + " failed: ";
@@ -148,44 +148,27 @@ std::string LoadSequentialFile(std::string uri, bool stream) {
   };
 
   auto parsed = dmlc::io::URI(uri.c_str());
+  CHECK((parsed.protocol == "file://" || parsed.protocol.length() == 0))
+      << "Only local file is supported.";
   // Read from file.
-  if ((parsed.protocol == "file://" || parsed.protocol.length() == 0) && !stream) {
-    std::string buffer;
-    // Open in binary mode so that correct file size can be computed with
-    // seekg(). This accommodates Windows platform:
-    // https://docs.microsoft.com/en-us/cpp/standard-library/basic-istream-class?view=vs-2019#seekg
-    auto path = std::filesystem::weakly_canonical(std::filesystem::u8path(uri));
-    std::ifstream ifs(path, std::ios_base::binary | std::ios_base::in);
-    if (!ifs) {
-      // https://stackoverflow.com/a/17338934
-      OpenErr();
-    }
-
-    ifs.seekg(0, std::ios_base::end);
-    const size_t file_size = static_cast<size_t>(ifs.tellg());
-    ifs.seekg(0, std::ios_base::beg);
-    buffer.resize(file_size + 1);
-    ifs.read(&buffer[0], file_size);
-    buffer.back() = '\0';
-
-    return buffer;
-  }
-
-  // Read from remote.
-  std::unique_ptr<dmlc::Stream> fs{dmlc::Stream::Create(uri.c_str(), "r")};
   std::string buffer;
-  size_t constexpr kInitialSize = 4096;
-  size_t size {kInitialSize}, total {0};
-  while (true) {
-    buffer.resize(total + size);
-    size_t read = fs->Read(&buffer[total], size);
-    total += read;
-    if (read < size) {
-      break;
-    }
-    size *= 2;
+  // Open in binary mode so that correct file size can be computed with
+  // seekg(). This accommodates Windows platform:
+  // https://docs.microsoft.com/en-us/cpp/standard-library/basic-istream-class?view=vs-2019#seekg
+  auto path = std::filesystem::weakly_canonical(std::filesystem::u8path(uri));
+  std::ifstream ifs(path, std::ios_base::binary | std::ios_base::in);
+  if (!ifs) {
+    // https://stackoverflow.com/a/17338934
+    OpenErr();
   }
-  buffer.resize(total);
+
+  ifs.seekg(0, std::ios_base::end);
+  const size_t file_size = static_cast<size_t>(ifs.tellg());
+  ifs.seekg(0, std::ios_base::beg);
+  buffer.resize(file_size + 1);
+  ifs.read(&buffer[0], file_size);
+  buffer.back() = '\0';
+
   return buffer;
 }
diff --git a/src/common/io.h b/src/common/io.h
index 95971abae116..e7ef9ed51b99 100644
--- a/src/common/io.h
+++ b/src/common/io.h
@@ -84,16 +84,14 @@ class FixedSizeStream : public PeekableInStream {
   std::string buffer_;
 };
 
-/*!
- * \brief Helper function for loading consecutive file to avoid dmlc Stream when possible.
+/**
+ * @brief Helper function for loading consecutive file.
  *
- * \param uri URI or file name to file.
- * \param stream Use dmlc Stream unconditionally if set to true. Used for running test
- *   without remote filesystem.
+ * @param uri URI or file name to file.
  *
- * \return File content.
+ * @return File content.
  */
-std::string LoadSequentialFile(std::string uri, bool stream = false);
+std::string LoadSequentialFile(std::string uri);
 
 /**
  * \brief Get file extension from file name.
diff --git a/tests/cpp/common/test_io.cc b/tests/cpp/common/test_io.cc
index 8bc12698bd9d..b8ebc13a1ace 100644
--- a/tests/cpp/common/test_io.cc
+++ b/tests/cpp/common/test_io.cc
@@ -63,31 +63,27 @@ TEST(IO, LoadSequentialFile) {
 
   // Generate a JSON file.
   size_t constexpr kRows = 1000, kCols = 100;
-  std::shared_ptr<DMatrix> p_dmat{
-      RandomDataGenerator{kRows, kCols, 0}.GenerateDMatrix(true)};
-  std::unique_ptr<Learner> learner { Learner::Create({p_dmat}) };
+  std::shared_ptr<DMatrix> p_dmat{RandomDataGenerator{kRows, kCols, 0}.GenerateDMatrix(true)};
+  std::unique_ptr<Learner> learner{Learner::Create({p_dmat})};
   learner->SetParam("tree_method", "hist");
   learner->Configure();
 
   for (int32_t iter = 0; iter < 10; ++iter) {
     learner->UpdateOneIter(iter, p_dmat);
   }
-  Json out { Object() };
+  Json out{Object()};
   learner->SaveModel(&out);
   std::string str;
   Json::Dump(out, &str);
 
   std::string tmpfile = tempdir.path + "/model.json";
   {
-    std::unique_ptr<dmlc::Stream> fo(
-        dmlc::Stream::Create(tmpfile.c_str(), "w"));
+    std::unique_ptr<dmlc::Stream> fo(dmlc::Stream::Create(tmpfile.c_str(), "w"));
     fo->Write(str.c_str(), str.size());
   }
 
-  auto loaded = LoadSequentialFile(tmpfile, true);
+  auto loaded = LoadSequentialFile(tmpfile);
   ASSERT_EQ(loaded, str);
-
-  ASSERT_THROW(LoadSequentialFile("non-exist", true), dmlc::Error);
 }
 
 TEST(IO, Resource) {

From 401726b9a15eae93e83ad2f4817f778e8ec0cb70 Mon Sep 17 00:00:00 2001
From: Jiaming Yuan
Date: Mon, 21 Aug 2023 20:14:33 +0800
Subject: [PATCH 2/5] Don't use string as buffer.
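
LoadSequentialFile now returns std::vector<char> instead of std::string, so the buffer no longer carries a terminating '\0' and callers pass an explicit (pointer, length) pair. A minimal caller-side sketch of the intended pattern, illustrative only, mirroring the c_api.cc and cli_main.cc hunks below and assuming the in-tree headers and namespaces:

    #include <string>
    #include "common/io.h"            // xgboost::common::LoadSequentialFile
    #include "xgboost/json.h"         // xgboost::Json
    #include "xgboost/learner.h"      // xgboost::Learner
    #include "xgboost/string_view.h"  // xgboost::StringView

    void LoadJsonModel(std::string const& path, xgboost::Learner* learner) {
      // The returned buffer is std::vector<char>: not NUL-terminated, so the
      // length is passed explicitly instead of relying on c_str().
      auto buffer = xgboost::common::LoadSequentialFile(path);
      xgboost::Json in{
          xgboost::Json::Load(xgboost::StringView{buffer.data(), buffer.size()})};
      learner->LoadModel(in);
    }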
---
 src/c_api/c_api.cc                         | 8 ++++----
 src/cli_main.cc                            | 8 ++++----
 src/common/io.cc                           | 6 ++----
 src/common/io.h                            | 2 +-
 tests/cpp/common/test_io.cc                | 4 ++--
 tests/cpp/common/test_json.cc              | 8 ++++----
 tests/cpp/data/test_sparse_page_dmatrix.cc | 4 ++--
 tests/cpp/test_learner.cc                  | 2 +-
 8 files changed, 20 insertions(+), 22 deletions(-)

diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc
index 0c98c0198ac8..5b49d136f06f 100644
--- a/src/c_api/c_api.cc
+++ b/src/c_api/c_api.cc
@@ -1220,12 +1220,12 @@ XGB_DLL int XGBoosterLoadModel(BoosterHandle handle, const char* fname) {
     return str;
   };
   if (common::FileExtension(fname) == "json") {
-    auto str = read_file();
-    Json in{Json::Load(StringView{str})};
+    auto buffer = read_file();
+    Json in{Json::Load(StringView{buffer.data(), buffer.size()})};
     static_cast<Learner*>(handle)->LoadModel(in);
   } else if (common::FileExtension(fname) == "ubj") {
-    auto str = read_file();
-    Json in = Json::Load(StringView{str}, std::ios::binary);
+    auto buffer = read_file();
+    Json in = Json::Load(StringView{buffer.data(), buffer.size()}, std::ios::binary);
     static_cast<Learner*>(handle)->LoadModel(in);
   } else {
     std::unique_ptr<dmlc::Stream> fi(dmlc::Stream::Create(fname, "r"));
diff --git a/src/cli_main.cc b/src/cli_main.cc
index 9507e6e893b9..546edc48314b 100644
--- a/src/cli_main.cc
+++ b/src/cli_main.cc
@@ -345,10 +345,10 @@ class CLI {
 
   void LoadModel(std::string const& path, Learner* learner) const {
     if (common::FileExtension(path) == "json") {
-      auto str = common::LoadSequentialFile(path);
-      CHECK_GT(str.size(), 2);
-      CHECK_EQ(str[0], '{');
-      Json in{Json::Load({str.c_str(), str.size()})};
+      auto buffer = common::LoadSequentialFile(path);
+      CHECK_GT(buffer.size(), 2);
+      CHECK_EQ(buffer[0], '{');
+      Json in{Json::Load({buffer.data(), buffer.size()})};
       learner->LoadModel(in);
     } else {
       std::unique_ptr<dmlc::Stream> fi(dmlc::Stream::Create(path.c_str(), "r"));
diff --git a/src/common/io.cc b/src/common/io.cc
index 8d0d4acc0499..902cd71335c7 100644
--- a/src/common/io.cc
+++ b/src/common/io.cc
@@ -139,7 +139,7 @@ auto SystemErrorMsg() {
   }
 }  // anonymous namespace
 
-std::string LoadSequentialFile(std::string uri) {
+std::vector<char> LoadSequentialFile(std::string uri) {
   auto OpenErr = [&uri]() {
     std::string msg;
     msg = "Opening " + uri + " failed: ";
@@ -151,7 +151,6 @@ std::vector<char> LoadSequentialFile(std::string uri) {
   CHECK((parsed.protocol == "file://" || parsed.protocol.length() == 0))
       << "Only local file is supported.";
   // Read from file.
-  std::string buffer;
   // Open in binary mode so that correct file size can be computed with
   // seekg(). This accommodates Windows platform:
   // https://docs.microsoft.com/en-us/cpp/standard-library/basic-istream-class?view=vs-2019#seekg
@@ -165,9 +164,8 @@ std::vector<char> LoadSequentialFile(std::string uri) {
   ifs.seekg(0, std::ios_base::end);
   const size_t file_size = static_cast<size_t>(ifs.tellg());
   ifs.seekg(0, std::ios_base::beg);
-  buffer.resize(file_size + 1);
+  std::vector<char> buffer(file_size);
   ifs.read(&buffer[0], file_size);
-  buffer.back() = '\0';
 
   return buffer;
 }
diff --git a/src/common/io.h b/src/common/io.h
index e7ef9ed51b99..07bb60787272 100644
--- a/src/common/io.h
+++ b/src/common/io.h
@@ -91,7 +91,7 @@ class FixedSizeStream : public PeekableInStream {
  *
  * @return File content.
  */
-std::string LoadSequentialFile(std::string uri);
+std::vector<char> LoadSequentialFile(std::string uri);
 
 /**
  * \brief Get file extension from file name.
diff --git a/tests/cpp/common/test_io.cc b/tests/cpp/common/test_io.cc
index b8ebc13a1ace..e4d65c1f4567 100644
--- a/tests/cpp/common/test_io.cc
+++ b/tests/cpp/common/test_io.cc
@@ -73,13 +73,13 @@ TEST(IO, LoadSequentialFile) {
   }
   Json out{Object()};
   learner->SaveModel(&out);
-  std::string str;
+  std::vector<char> str;
   Json::Dump(out, &str);
 
   std::string tmpfile = tempdir.path + "/model.json";
   {
     std::unique_ptr<dmlc::Stream> fo(dmlc::Stream::Create(tmpfile.c_str(), "w"));
-    fo->Write(str.c_str(), str.size());
+    fo->Write(str.data(), str.size());
   }
 
   auto loaded = LoadSequentialFile(tmpfile);
diff --git a/tests/cpp/common/test_json.cc b/tests/cpp/common/test_json.cc
index 1b4ed76ec688..4d498ffd5eb5 100644
--- a/tests/cpp/common/test_json.cc
+++ b/tests/cpp/common/test_json.cc
@@ -418,7 +418,7 @@ TEST(Json, AssigningString) {
 
 TEST(Json, LoadDump) {
   std::string ori_buffer = GetModelStr();
-  Json origin {Json::Load(StringView{ori_buffer.c_str(), ori_buffer.size()})};
+  Json origin{Json::Load(StringView{ori_buffer.c_str(), ori_buffer.size()})};
 
   dmlc::TemporaryDirectory tempdir;
   auto const& path = tempdir.path + "test_model_dump";
@@ -430,9 +430,9 @@ TEST(Json, LoadDump) {
   ASSERT_TRUE(fout);
   fout << out << std::flush;
 
-  std::string new_buffer = common::LoadSequentialFile(path);
+  std::vector<char> new_buffer = common::LoadSequentialFile(path);
 
-  Json load_back {Json::Load(StringView(new_buffer.c_str(), new_buffer.size()))};
+  Json load_back{Json::Load(StringView(new_buffer.data(), new_buffer.size()))};
   ASSERT_EQ(load_back, origin);
 }
 
@@ -651,7 +651,7 @@ TEST(UBJson, Basic) {
   }
 
   auto data = common::LoadSequentialFile("test.ubj");
-  UBJReader reader{StringView{data}};
+  UBJReader reader{StringView{data.data(), data.size()}};
   json = reader.Load();
   return json;
 };
diff --git a/tests/cpp/data/test_sparse_page_dmatrix.cc b/tests/cpp/data/test_sparse_page_dmatrix.cc
index 839ea762e970..efd323e7792e 100644
--- a/tests/cpp/data/test_sparse_page_dmatrix.cc
+++ b/tests/cpp/data/test_sparse_page_dmatrix.cc
@@ -250,7 +250,7 @@ auto TestSparsePageDMatrixDeterminism(int32_t threads) {
   auto cache_name =
       data::MakeId(filename, dynamic_cast<data::SparsePageDMatrix *>(sparse.get())) + ".row.page";
-  std::string cache = common::LoadSequentialFile(cache_name);
+  auto cache = common::LoadSequentialFile(cache_name);
 
   return cache;
 }
@@ -258,7 +258,7 @@ TEST(SparsePageDMatrix, Determinism) {
 #if defined(_MSC_VER)
   return;
 #endif  // defined(_MSC_VER)
-  std::vector<std::string> caches;
+  std::vector<std::vector<char>> caches;
   for (size_t i = 1; i < 18; i += 2) {
     caches.emplace_back(TestSparsePageDMatrixDeterminism(i));
   }
diff --git a/tests/cpp/test_learner.cc b/tests/cpp/test_learner.cc
index 48fd2d8e96c0..5a31ce1bd98f 100644
--- a/tests/cpp/test_learner.cc
+++ b/tests/cpp/test_learner.cc
@@ -184,7 +184,7 @@ TEST(Learner, JsonModelIO) {
   fout.close();
 
   auto loaded_str = common::LoadSequentialFile(tmpdir.path + "/model.json");
-  Json loaded = Json::Load(StringView{loaded_str.c_str(), loaded_str.size()});
+  Json loaded = Json::Load(StringView{loaded_str.data(), loaded_str.size()});
 
   learner->LoadModel(loaded);
   learner->Configure();

From 4c5fb1a51a857d873ec14357089670fbfc081d32 Mon Sep 17 00:00:00 2001
From: Jiaming Yuan
Date: Mon, 21 Aug 2023 21:50:49 +0800
Subject: [PATCH 3/5] fix.
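
The C API round-trip test assumed the old behaviour where the loaded buffer carried one extra terminating byte. After patch 2 the buffer is the raw file content, so its size equals the dumped JSON text and the last byte of the test's model file is '}' rather than '\0'. A rough sketch of the new invariant, illustrative only and not part of the diff, assuming the in-tree common/io.h:

    #include <cassert>
    #include <filesystem>
    #include <string>
    #include "common/io.h"  // xgboost::common::LoadSequentialFile

    void CheckRoundTrip(std::string const& model_json_path) {
      auto loaded = xgboost::common::LoadSequentialFile(model_json_path);
      // No trailing '\0' is appended any more: the buffer size equals the
      // on-disk size, and the JSON written by the test ends with '}'.
      assert(loaded.size() == std::filesystem::file_size(model_json_path));
      assert(!loaded.empty() && loaded.back() == '}');
    }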
---
 tests/cpp/c_api/test_c_api.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/cpp/c_api/test_c_api.cc b/tests/cpp/c_api/test_c_api.cc
index 205e5f561919..3bf03c95581a 100644
--- a/tests/cpp/c_api/test_c_api.cc
+++ b/tests/cpp/c_api/test_c_api.cc
@@ -216,8 +216,8 @@ TEST(CAPI, JsonModelIO) {
   std::string buffer;
   Json::Dump(Json::Load(l, std::ios::binary), &buffer);
 
-  ASSERT_EQ(model_str_0.size() - 1, buffer.size());
-  ASSERT_EQ(model_str_0.back(), '\0');
+  ASSERT_EQ(model_str_0.size(), buffer.size());
+  ASSERT_EQ(model_str_0.back(), '}');
   ASSERT_TRUE(std::equal(model_str_0.begin(), model_str_0.end() - 1, buffer.begin()));
 
   ASSERT_EQ(XGBoosterSaveModelToBuffer(handle, R"({})", &len, &data), -1);

From f1473b7a2d541f2a99ba811e333b1d0767501765 Mon Sep 17 00:00:00 2001
From: Jiaming Yuan
Date: Mon, 21 Aug 2023 21:54:35 +0800
Subject: [PATCH 4/5] Use file system.

---
 src/common/io.cc | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/src/common/io.cc b/src/common/io.cc
index 902cd71335c7..39909c782fe5 100644
--- a/src/common/io.cc
+++ b/src/common/io.cc
@@ -161,9 +161,7 @@ std::vector<char> LoadSequentialFile(std::string uri) {
     OpenErr();
   }
 
-  ifs.seekg(0, std::ios_base::end);
-  const size_t file_size = static_cast<size_t>(ifs.tellg());
-  ifs.seekg(0, std::ios_base::beg);
+  auto file_size = std::filesystem::file_size(path);
   std::vector<char> buffer(file_size);
   ifs.read(&buffer[0], file_size);

From 472bf8cdafbea4088fd8e65c0eaac2122181040a Mon Sep 17 00:00:00 2001
From: Jiaming Yuan
Date: Mon, 21 Aug 2023 21:55:09 +0800
Subject: [PATCH 5/5] cleanup comment.

---
 src/common/io.cc | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/src/common/io.cc b/src/common/io.cc
index 39909c782fe5..1715669b091a 100644
--- a/src/common/io.cc
+++ b/src/common/io.cc
@@ -151,9 +151,6 @@ std::vector<char> LoadSequentialFile(std::string uri) {
   CHECK((parsed.protocol == "file://" || parsed.protocol.length() == 0))
       << "Only local file is supported.";
   // Read from file.
-  // Open in binary mode so that correct file size can be computed with
-  // seekg(). This accommodates Windows platform:
-  // https://docs.microsoft.com/en-us/cpp/standard-library/basic-istream-class?view=vs-2019#seekg
   auto path = std::filesystem::weakly_canonical(std::filesystem::u8path(uri));
   std::ifstream ifs(path, std::ios_base::binary | std::ios_base::in);
   if (!ifs) {
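
Pieced together from the hunks above, LoadSequentialFile ends up roughly as follows after the series. This is a condensed sketch, not the verbatim final source: the error path is shortened, and the surrounding io.cc includes (dmlc/io.h plus the XGBoost CHECK/LOG macros) are assumed.

    #include <filesystem>
    #include <fstream>
    #include <string>
    #include <vector>

    std::vector<char> LoadSequentialFile(std::string uri) {
      // Remote protocols (hdfs://, s3://, ...) are no longer accepted.
      auto parsed = dmlc::io::URI(uri.c_str());
      CHECK((parsed.protocol == "file://" || parsed.protocol.length() == 0))
          << "Only local file is supported.";
      // Read from file.
      auto path = std::filesystem::weakly_canonical(std::filesystem::u8path(uri));
      std::ifstream ifs(path, std::ios_base::binary | std::ios_base::in);
      if (!ifs) {
        LOG(FATAL) << "Opening " << uri << " failed";  // condensed error path
      }
      // File size comes from std::filesystem instead of the seekg() dance,
      // and the buffer is returned without a trailing '\0'.
      auto file_size = std::filesystem::file_size(path);
      std::vector<char> buffer(file_size);
      ifs.read(&buffer[0], file_size);
      return buffer;
    }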