1. Add a unit test for distributed training on more than 2^31 (~2B) examples.

2. Improve the error message when training on more than 2^31 examples while YDF is compiled with 32-bit example index precision (the default). The previous error message did not explain how to enable training on more than 2^31 examples (see the sketch after this list).

3. Improve (~2x) dataset loading speed when a column filter is set.
An optional<vector<int>> was copied instead of passed by reference in one of the methods (illustrated after this list).

4. Add a buffer for all file writing operations.
This notably speeds up local file writing (see the sketch after this list).

5. Minor: Move types.h from the learner/ to the dataset/ directory.
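
For item 2, here is a simplified sketch of the new check (the real CheckNumExamples in dataset/types.cc returns a longer message): the example count is compared against the maximum of SignedExampleIdx, which is int32_t by default and int64_t when YDF is built with --define=ydf_example_idx_num_bits=64.

#include <cstddef>
#include <cstdint>
#include <limits>

#include "absl/status/status.h"
#include "absl/strings/substitute.h"

// Selected by the Bazel define ydf_example_idx_num_bits (32 by default).
#if defined(YGGDRASIL_EXAMPLE_IDX_64_BITS)
typedef int64_t SignedExampleIdx;  // Up to ~10^19 examples.
#else
typedef int32_t SignedExampleIdx;  // Up to 2'147'483'647 examples.
#endif

// Simplified version of the check; the real error message also warns that a
// 64-bit example index can increase RAM usage by up to 2x.
absl::Status CheckNumExamples(std::size_t num_examples) {
  const auto max = std::numeric_limits<SignedExampleIdx>::max();
  if (num_examples > static_cast<std::size_t>(max)) {
    return absl::InvalidArgumentError(absl::Substitute(
        "The dataset contains too many examples ($0 > $1). Recompile with "
        "--define=ydf_example_idx_num_bits=64 to train on more examples.",
        num_examples, max));
  }
  return absl::OkStatus();
}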
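
For item 3, a minimal illustration of the signature change behind the speedup. The function names below are made up, but the parameter types mirror the ones changed in example_reader.h, vertical_dataset.h, and vertical_dataset_io.h: taking absl::optional<std::vector<int>> by value copies the column filter on every call (including the per-example AppendExample path), while a const reference does not.

#include <vector>

#include "absl/types/optional.h"

// Before: the optional and the vector it wraps are copied at every call.
void OpenReaderCopy(absl::optional<std::vector<int>> required_columns) {
  // ... read only the listed columns ...
}

// After: the caller's filter is borrowed; no allocation, no copy.
void OpenReaderNoCopy(
    const absl::optional<std::vector<int>>& required_columns) {
  // ... read only the listed columns ...
}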
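
For item 4, the file-writing change itself is not visible in the diff excerpt below, so the following is only a minimal sketch of the general buffering idea (accumulate small writes in memory and flush them in large chunks); the BufferedFileWriter class and its interface are hypothetical, not the YDF API.

#include <cstddef>
#include <cstdio>
#include <string>

class BufferedFileWriter {
 public:
  explicit BufferedFileWriter(std::FILE* file, std::size_t buffer_size = 1 << 20)
      : file_(file), capacity_(buffer_size) {
    buffer_.reserve(capacity_);
  }

  // Appends "data" to the in-memory buffer; flushes first if it would overflow.
  void Write(const std::string& data) {
    if (buffer_.size() + data.size() > capacity_) Flush();
    buffer_.append(data);
  }

  // Writes the buffered bytes to the underlying file in a single call.
  void Flush() {
    if (!buffer_.empty()) {
      std::fwrite(buffer_.data(), 1, buffer_.size(), file_);
      buffer_.clear();
    }
  }

  ~BufferedFileWriter() { Flush(); }

 private:
  std::FILE* file_;
  std::string buffer_;
  std::size_t capacity_;
};

Buffering helps most for writers that emit many small records, where each unbuffered write would otherwise turn into its own filesystem call.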

PiperOrigin-RevId: 558111770
achoum authored and copybara-github committed Aug 18, 2023
1 parent 7207188 commit f930e1f
Showing 50 changed files with 319 additions and 124 deletions.
11 changes: 11 additions & 0 deletions CHANGELOG.md
@@ -2,6 +2,15 @@

## HEAD

### Feature

- Improve speed of dataset reading and writing.

### Fix

- Proper error message when using distributed training on more than 2^31
(i.e., ~2B) examples while YDF is compiled with a 32-bit example index.

## 1.5.0 - 2023-07-03

### Feature
@@ -24,6 +33,8 @@
- Fix buggy restriction for SelGB sampling
- Improve documentation.

### Change

## 1.4.0 - 2023-03-20

### Features
8 changes: 4 additions & 4 deletions yggdrasil_decision_forests/BUILD
@@ -56,15 +56,15 @@ config_setting(
# contains zero-weighted examples).
#
# Possible values:
# (default) --define=example_idx_num_bits=32 => Up to 2'147'483'647 training
# (default) --define=ydf_example_idx_num_bits=32 => Up to 2'147'483'647 training
# examples.
# --define=example_idx_num_bits=64 => Up to 10^19 training examples.
# --define=ydf_example_idx_num_bits=64 => Up to 10^19 training examples.
config_setting(
name = "example_idx_32bits",
values = {"define": "example_idx_num_bits=32"},
values = {"define": "ydf_example_idx_num_bits=32"},
)

config_setting(
name = "example_idx_64bits",
values = {"define": "example_idx_num_bits=64"},
values = {"define": "ydf_example_idx_num_bits=64"},
)
16 changes: 16 additions & 0 deletions yggdrasil_decision_forests/dataset/BUILD
@@ -139,6 +139,7 @@ cc_library_ydf(
":data_spec",
":data_spec_cc_proto",
":example_cc_proto",
":types",
"//yggdrasil_decision_forests/utils:logging",
"//yggdrasil_decision_forests/utils:status_macros",
"@com_google_absl//absl/memory",
@@ -327,6 +328,21 @@
}),
)

cc_library_ydf(
name = "types",
srcs = ["types.cc"],
hdrs = ["types.h"],
defines = select({
"//yggdrasil_decision_forests:example_idx_32bits": ["YGGDRASIL_EXAMPLE_IDX_32_BITS"],
"//yggdrasil_decision_forests:example_idx_64bits": ["YGGDRASIL_EXAMPLE_IDX_64_BITS"],
"//conditions:default": ["YGGDRASIL_EXAMPLE_IDX_32_BITS"],
}),
deps = [
"@com_google_absl//absl/status",
"@com_google_absl//absl/strings",
],
)

# Proto
# ========

6 changes: 3 additions & 3 deletions yggdrasil_decision_forests/dataset/csv_example_reader.cc
@@ -42,7 +42,7 @@ using proto::ColumnType;

CsvExampleReader::Implementation::Implementation(
const proto::DataSpecification& data_spec,
const absl::optional<std::vector<int>> required_columns)
const absl::optional<std::vector<int>>& required_columns)
: data_spec_(data_spec), required_columns_(required_columns) {}

absl::Status CsvExampleReader::Implementation::OpenShard(
@@ -79,8 +79,8 @@ absl::StatusOr<bool> CsvExampleReader::Implementation::NextInShard(
if (!has_row) {
return false;
}
RETURN_IF_ERROR(CsvRowToExample({row->begin(), row->end()}, data_spec_,
col_idx_to_field_idx_, example));
RETURN_IF_ERROR(
CsvRowToExample(*row, data_spec_, col_idx_to_field_idx_, example));
return true;
}

5 changes: 3 additions & 2 deletions yggdrasil_decision_forests/dataset/csv_example_reader.h
@@ -59,8 +59,9 @@ class CsvExampleReader final : public ExampleReaderInterface {
private:
class Implementation final : public utils::ShardedReader<proto::Example> {
public:
explicit Implementation(const proto::DataSpecification& data_spec,
absl::optional<std::vector<int>> required_columns);
explicit Implementation(
const proto::DataSpecification& data_spec,
const absl::optional<std::vector<int>>& required_columns);

protected:
// Opens the .csv file at "path", and check that the header is as expected.
11 changes: 9 additions & 2 deletions yggdrasil_decision_forests/dataset/data_spec.cc
@@ -127,7 +127,7 @@ int32_t CategoricalStringToValue(const std::string& value,
}

absl::StatusOr<int32_t> CategoricalStringToValueWithStatus(
const std::string& value, const proto::Column& col_spec) {
const absl::string_view value, const proto::Column& col_spec) {
if (col_spec.categorical().is_already_integerized()) {
int32_t int_value;
if (!absl::SimpleAtoi(value, &int_value)) {
@@ -148,6 +148,13 @@ absl::StatusOr<int32_t> CategoricalStringToValueWithStatus(
}
}

// TODO: Remove this version when external protobuffer supports dictionary
// query with absl::string_view.
absl::StatusOr<int32_t> CategoricalStringToValueWithStatus(
const std::string& value, const proto::Column& col_spec) {
return CategoricalStringToValueWithStatus(absl::string_view{value}, col_spec);
}

absl::Status BuildColIdxToFeatureLabelIdx(
const proto::DataSpecification& data_spec,
const std::vector<std::string>& fields,
@@ -270,7 +277,7 @@ bool HasColumn(absl::string_view name,
return false;
}

absl::Status CsvRowToExample(const std::vector<std::string>& csv_fields,
absl::Status CsvRowToExample(const std::vector<absl::string_view>& csv_fields,
const proto::DataSpecification& data_spec,
const std::vector<int>& col_idx_to_field_idx,
proto::Example* example) {
6 changes: 5 additions & 1 deletion yggdrasil_decision_forests/dataset/data_spec.h
@@ -86,7 +86,7 @@ absl::Status GetSingleColumnIdxFromName(
// Converts a single row from a csv into an Example.
// If col_idx_to_field_idx[i] == -1, all the values of the i-th column are
// replaced by empty values.
absl::Status CsvRowToExample(const std::vector<std::string>& csv_fields,
absl::Status CsvRowToExample(const std::vector<absl::string_view>& csv_fields,
const proto::DataSpecification& data_spec,
const std::vector<int>& col_idx_to_field_idx,
proto::Example* example);
@@ -127,6 +127,10 @@ std::string PrintHumanReadable(const proto::DataSpecification& data_spec,

// Returns the integer representation of a categorical value provided as a
// string.
absl::StatusOr<int32_t> CategoricalStringToValueWithStatus(
absl::string_view value, const proto::Column& col_spec);
// TODO: Remove this version when the external protobuf supports map
// queries with absl::string_view.
absl::StatusOr<int32_t> CategoricalStringToValueWithStatus(
const std::string& value, const proto::Column& col_spec);

2 changes: 1 addition & 1 deletion yggdrasil_decision_forests/dataset/example_reader.cc
@@ -39,7 +39,7 @@ namespace dataset {
absl::StatusOr<std::unique_ptr<ExampleReaderInterface>> CreateExampleReader(
const absl::string_view typed_path,
const proto::DataSpecification& data_spec,
const absl::optional<std::vector<int>> required_columns) {
const absl::optional<std::vector<int>>& required_columns) {
std::string sharded_path;
proto::DatasetFormat format;
ASSIGN_OR_RETURN(std::tie(sharded_path, format),
2 changes: 1 addition & 1 deletion yggdrasil_decision_forests/dataset/example_reader.h
@@ -63,7 +63,7 @@ namespace dataset {
//
absl::StatusOr<std::unique_ptr<ExampleReaderInterface>> CreateExampleReader(
absl::string_view typed_path, const proto::DataSpecification& data_spec,
absl::optional<std::vector<int>> required_columns = {});
const absl::optional<std::vector<int>>& required_columns = {});

// Checks if the format of a typed dataset is supported i.e. a dataset reader is
// registered for this format. Returns true, if the format is supported. Returns
yggdrasil_decision_forests/{learner → dataset}/types.cc
@@ -13,24 +13,28 @@
* limitations under the License.
*/

#include "yggdrasil_decision_forests/learner/types.h"
#include "yggdrasil_decision_forests/dataset/types.h"

#include "absl/status/status.h"
#include "absl/strings/substitute.h"

namespace yggdrasil_decision_forests {
namespace model {
namespace dataset {

absl::Status CheckNumExamples(size_t num_examples) {
const auto max = std::numeric_limits<SignedExampleIdx>::max();
if (num_examples > max) {
return absl::InvalidArgumentError(
absl::Substitute("Too many training example ($0 > $1). Recompile the "
"binary with --define=example_idx_num_bits=64.",
num_examples, max));
return absl::InvalidArgumentError(absl::Substitute(
"The dataset contains too many examples ($0 > $1). To train on more "
"examples, compile Yggdrasil Decision Forests with 64-bit example index "
"support using the flag --define=ydf_example_idx_num_bits=64. Warning: a "
"64-bit example index can increase the RAM usage of YDF by up to 2x. "
"Don't use it for datasets with fewer than 2^31 (i.e., ~2B) examples.",
num_examples, max));
}
return absl::OkStatus();
}

} // namespace model
} // namespace yggdrasil_decision_forests
} // namespace dataset
} // namespace yggdrasil_decision_forests
yggdrasil_decision_forests/{learner → dataset}/types.h
@@ -13,21 +13,21 @@
* limitations under the License.
*/

#ifndef YGGDRASIL_DECISION_FORESTS_LEARNER_TYPES_H_
#define YGGDRASIL_DECISION_FORESTS_LEARNER_TYPES_H_
#ifndef YGGDRASIL_DECISION_FORESTS_DATASET_TYPES_H_
#define YGGDRASIL_DECISION_FORESTS_DATASET_TYPES_H_

#include <cstdint>

#include "absl/status/status.h"

namespace yggdrasil_decision_forests {
namespace model {
namespace dataset {

// "ExampleIdx" is a signed integer able to store the number of examples in a
// training dataset.
//
// ExampleIdx is controlled by the --define=example_idx_num_bits={32,64} flag.
// See the documentation of this flag for more details.
// ExampleIdx is controlled by the --define=ydf_example_idx_num_bits={32,64}
// flag. See the documentation of this flag for more details.
#if defined(YGGDRASIL_EXAMPLE_IDX_32_BITS)
typedef int32_t SignedExampleIdx;
typedef uint32_t UnsignedExampleIdx;
@@ -42,7 +42,14 @@ typedef uint64_t UnsignedExampleIdx;
// "SignedExampleIdx".
absl::Status CheckNumExamples(size_t num_examples);

} // namespace dataset

namespace model {
// Alias in "model" namespace.
typedef dataset::SignedExampleIdx SignedExampleIdx;
typedef dataset::UnsignedExampleIdx UnsignedExampleIdx;
} // namespace model

} // namespace yggdrasil_decision_forests

#endif // YGGDRASIL_DECISION_FORESTS_LEARNER_TYPES_H_
#endif // YGGDRASIL_DECISION_FORESTS_DATASET_TYPES_H_
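
A side note on item 5 (see the types.h diff just above): the move keeps existing learner code compiling because the relocated header re-declares the example index types under the old model namespace. A small compile-time check, assuming the YDF sources are on the include path:

#include <type_traits>

#include "yggdrasil_decision_forests/dataset/types.h"

namespace ydf = yggdrasil_decision_forests;

// The alias in the "model" namespace resolves to the same type as the
// canonical definition in the "dataset" namespace.
static_assert(std::is_same_v<ydf::model::SignedExampleIdx,
                             ydf::dataset::SignedExampleIdx>,
              "model::SignedExampleIdx aliases dataset::SignedExampleIdx");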
6 changes: 3 additions & 3 deletions yggdrasil_decision_forests/dataset/vertical_dataset.cc
@@ -133,14 +133,14 @@ int VerticalDataset::ColumnNameToColumnIdx(absl::string_view name) const {

void VerticalDataset::AppendExample(
const proto::Example& example,
const absl::optional<std::vector<int>> load_columns) {
const absl::optional<std::vector<int>>& load_columns) {
// TODO: Update.
CHECK_OK(AppendExampleWithStatus(example, load_columns));
}

absl::Status VerticalDataset::AppendExampleWithStatus(
const proto::Example& example,
const absl::optional<std::vector<int>> load_columns) {
const absl::optional<std::vector<int>>& load_columns) {
DCHECK_EQ(columns_.size(), example.attributes_size());
if (load_columns.has_value()) {
for (int col_idx : load_columns.value()) {
@@ -905,7 +905,7 @@ void MapExampleToProtoExample(
absl::Status MapExampleToProtoExampleWithStatus(
const std::unordered_map<std::string, std::string>& src,
const proto::DataSpecification& data_spec, proto::Example* dst) {
std::vector<std::string> flat_values;
std::vector<absl::string_view> flat_values;
std::vector<int> col_idx_to_field_idx(data_spec.columns_size(), -1);
for (const auto& src_value : src) {
const int col_idx = GetColumnIdxFromName(src_value.first, data_spec);
28 changes: 16 additions & 12 deletions yggdrasil_decision_forests/dataset/vertical_dataset.h
@@ -37,6 +37,7 @@
#include "yggdrasil_decision_forests/dataset/data_spec.h"
#include "yggdrasil_decision_forests/dataset/data_spec.pb.h"
#include "yggdrasil_decision_forests/dataset/example.pb.h"
#include "yggdrasil_decision_forests/dataset/types.h"
#include "yggdrasil_decision_forests/utils/logging.h"
#include "yggdrasil_decision_forests/utils/status_macros.h"

@@ -48,7 +49,7 @@ namespace dataset {
class VerticalDataset {
public:
// Row index type.
typedef int64_t row_t;
typedef SignedExampleIdx row_t;

// Abstract representation of a column.
class AbstractColumn {
@@ -110,8 +111,9 @@ class VerticalDataset {
virtual absl::Status ExtractAndAppend(const std::vector<row_t>& indices,
AbstractColumn* dst) const = 0;

virtual absl::Status ExtractAndAppend(const std::vector<uint32_t>& indices,
AbstractColumn* dst) const = 0;
virtual absl::Status ExtractAndAppend(
const std::vector<UnsignedExampleIdx>& indices,
AbstractColumn* dst) const = 0;

// Converts the content of a column to another dataspec.
virtual absl::Status ConvertToGivenDataspec(
@@ -180,8 +182,9 @@
absl::Status ExtractAndAppend(const std::vector<row_t>& indices,
AbstractColumn* dst) const override;

absl::Status ExtractAndAppend(const std::vector<uint32_t>& indices,
AbstractColumn* dst) const override;
absl::Status ExtractAndAppend(
const std::vector<UnsignedExampleIdx>& indices,
AbstractColumn* dst) const override;

std::pair<uint64_t, uint64_t> memory_usage() const override {
return std::pair<uint64_t, uint64_t>(values_.size() * sizeof(T),
@@ -235,8 +238,9 @@
absl::Status ExtractAndAppend(const std::vector<row_t>& indices,
AbstractColumn* dst) const override;

absl::Status ExtractAndAppend(const std::vector<uint32_t>& indices,
AbstractColumn* dst) const override;
absl::Status ExtractAndAppend(
const std::vector<UnsignedExampleIdx>& indices,
AbstractColumn* dst) const override;

const std::vector<std::pair<size_t, size_t>>& values() const {
return values_;
@@ -721,9 +725,9 @@ class VerticalDataset {
// If "load_columns" is set, only the columns specified in it will be loaded.
absl::Status AppendExampleWithStatus(
const proto::Example& example,
const absl::optional<std::vector<int>> load_columns = {});
const absl::optional<std::vector<int>>& load_columns = {});
void AppendExample(const proto::Example& example,
const absl::optional<std::vector<int>> load_columns = {});
const absl::optional<std::vector<int>>& load_columns = {});

absl::Status AppendExampleWithStatus(
const std::unordered_map<std::string, std::string>& example);
@@ -842,7 +846,7 @@ absl::Status VerticalDataset::TemplateScalarStorage<T>::ExtractAndAppend(

template <typename T>
absl::Status VerticalDataset::TemplateScalarStorage<T>::ExtractAndAppend(
const std::vector<uint32_t>& indices, AbstractColumn* dst) const {
const std::vector<UnsignedExampleIdx>& indices, AbstractColumn* dst) const {
auto* cast_dst =
dynamic_cast<VerticalDataset::TemplateScalarStorage<T>*>(dst);
STATUS_CHECK(cast_dst != nullptr);
@@ -891,12 +895,12 @@ absl::Status VerticalDataset::TemplateMultiValueStorage<T>::ExtractAndAppend(

template <typename T>
absl::Status VerticalDataset::TemplateMultiValueStorage<T>::ExtractAndAppend(
const std::vector<uint32_t>& indices, AbstractColumn* dst) const {
const std::vector<UnsignedExampleIdx>& indices, AbstractColumn* dst) const {
auto* cast_dst =
dynamic_cast<VerticalDataset::TemplateMultiValueStorage<T>*>(dst);
STATUS_CHECK(cast_dst != nullptr);
if (values_.empty() && !indices.empty()) {
YDF_LOG(FATAL) << "ExtractAndAppend on an empty column";
return absl::InvalidArgumentError("ExtractAndAppend on an empty column");
}
cast_dst->Reserve(dst->nrows() + indices.size());
for (const auto row_idx : indices) {
4 changes: 2 additions & 2 deletions yggdrasil_decision_forests/dataset/vertical_dataset_io.cc
@@ -46,7 +46,7 @@ namespace {
absl::Status LoadVerticalDatasetSingleThread(
const absl::string_view typed_path,
const proto::DataSpecification& data_spec, VerticalDataset* dataset,
absl::optional<std::vector<int>> required_columns,
const absl::optional<std::vector<int>>& required_columns,
const LoadConfig& config) {
// Initialize dataset.
dataset->set_data_spec(data_spec);
@@ -117,7 +117,7 @@ absl::StatusOr<std::unique_ptr<BlockOfExamples>> LoadShard(
absl::Status LoadVerticalDataset(
const absl::string_view typed_path,
const proto::DataSpecification& data_spec, VerticalDataset* dataset,
absl::optional<std::vector<int>> required_columns,
const absl::optional<std::vector<int>>& required_columns,
const LoadConfig& config) {
// Extract the shards from the dataset path.
std::string path, prefix;
2 changes: 1 addition & 1 deletion yggdrasil_decision_forests/dataset/vertical_dataset_io.h
@@ -67,7 +67,7 @@ struct LoadConfig {
absl::Status LoadVerticalDataset(
absl::string_view typed_path, const proto::DataSpecification& data_spec,
VerticalDataset* dataset,
absl::optional<std::vector<int>> required_columns = {},
const absl::optional<std::vector<int>>& required_columns = {},
const LoadConfig& config = {});

// Save the dataset to a file (or a set of files). If
