Skip to content

Commit

Permalink
Check max_bin, etc. match config when using binary (#3592)
Browse files Browse the repository at this point in the history
* Check max_bin, etc. match config when using binary.

* Check max_bin_by_feature, bin_construct_sample_cnt matching config.
  • Loading branch information
cyfdecyf authored Dec 5, 2020
1 parent d83b973 commit 2c958dd
Show file tree
Hide file tree
Showing 2 changed files with 39 additions and 3 deletions.
2 changes: 1 addition & 1 deletion include/LightGBM/dataset_loader.h
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ class DatasetLoader {

void SetHeader(const char* filename);

void CheckDataset(const Dataset* dataset);
void CheckDataset(const Dataset* dataset, bool is_load_from_binary);

std::vector<std::string> LoadTextDataToMemory(const char* filename, const Metadata& metadata, int rank, int num_machines, int* num_global_data, std::vector<data_size_t>* used_data_indices);

Expand Down
40 changes: 38 additions & 2 deletions src/io/dataset_loader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,7 @@ Dataset* DatasetLoader::LoadFromFile(const char* filename, int rank, int num_mac
data_size_t num_global_data = 0;
std::vector<data_size_t> used_data_indices;
auto bin_filename = CheckCanLoadFromBin(filename);
bool is_load_from_binary = false;
if (bin_filename.size() == 0) {
auto parser = std::unique_ptr<Parser>(Parser::CreateParser(filename, config_.header, 0, label_idx_));
if (parser == nullptr) {
Expand Down Expand Up @@ -229,12 +230,15 @@ Dataset* DatasetLoader::LoadFromFile(const char* filename, int rank, int num_mac
}
} else {
// load data from binary file
is_load_from_binary = true;
Log::Info("Load from binary file %s", bin_filename.c_str());
dataset.reset(LoadFromBinFile(filename, bin_filename.c_str(), rank, num_machines, &num_global_data, &used_data_indices));
}
// check meta data
dataset->metadata_.CheckOrPartition(num_global_data, used_data_indices);
// need to check training data
CheckDataset(dataset.get());
CheckDataset(dataset.get(), is_load_from_binary);

return dataset.release();
}

Expand Down Expand Up @@ -707,7 +711,7 @@ Dataset* DatasetLoader::ConstructFromSampleData(double** sample_values,

// ---- private functions ----

void DatasetLoader::CheckDataset(const Dataset* dataset) {
void DatasetLoader::CheckDataset(const Dataset* dataset, bool is_load_from_binary) {
if (dataset->num_data_ <= 0) {
Log::Fatal("Data file %s is empty", dataset->data_filename_.c_str());
}
Expand Down Expand Up @@ -736,6 +740,38 @@ void DatasetLoader::CheckDataset(const Dataset* dataset) {
if (!is_feature_order_by_group) {
Log::Fatal("Features in dataset should be ordered by group");
}

// Extra sanity checks that only apply when the dataset was loaded from a
// binary file: each parameter stored on the dataset is compared with the
// current config, and the first mismatch is reported through Log::Fatal.
if (is_load_from_binary) {
  if (dataset->max_bin_ != config_.max_bin) {
    Log::Fatal("Dataset max_bin %d != config %d", dataset->max_bin_, config_.max_bin);
  }
  if (dataset->min_data_in_bin_ != config_.min_data_in_bin) {
    Log::Fatal("Dataset min_data_in_bin %d != config %d", dataset->min_data_in_bin_, config_.min_data_in_bin);
  }
  if (dataset->use_missing_ != config_.use_missing) {
    Log::Fatal("Dataset use_missing %d != config %d", dataset->use_missing_, config_.use_missing);
  }
  if (dataset->zero_as_missing_ != config_.zero_as_missing) {
    Log::Fatal("Dataset zero_as_missing %d != config %d", dataset->zero_as_missing_, config_.zero_as_missing);
  }
  if (dataset->bin_construct_sample_cnt_ != config_.bin_construct_sample_cnt) {
    Log::Fatal("Dataset bin_construct_sample_cnt %d != config %d", dataset->bin_construct_sample_cnt_, config_.bin_construct_sample_cnt);
  }
  // The size comparison must come first: the three-iterator std::equal reads
  // as many elements from the second range as the first range contains.
  if ((dataset->max_bin_by_feature_.size() != config_.max_bin_by_feature.size()) ||
      !std::equal(dataset->max_bin_by_feature_.begin(), dataset->max_bin_by_feature_.end(),
                  config_.max_bin_by_feature.begin())) {
    Log::Fatal("Dataset max_bin_by_feature does not match with config");
  }

  // The label index is only compared when AtoiAndCheck accepts
  // config_.label_column (presumably: when it is a plain integer); otherwise
  // the check is skipped and a hint is logged instead.
  int label_idx = -1;
  if (Common::AtoiAndCheck(config_.label_column.c_str(), &label_idx)) {
    if (dataset->label_idx_ != label_idx) {
      // Report the two values that were actually compared. The previous
      // message printed the zero_as_missing flags here by mistake.
      Log::Fatal("Dataset label_idx %d != config %d", dataset->label_idx_, label_idx);
    }
  } else {
    Log::Info("Recommend use integer for label index when loading data from binary for sanity check.");
  }
}
}

std::vector<std::string> DatasetLoader::LoadTextDataToMemory(const char* filename, const Metadata& metadata,
Expand Down

0 comments on commit 2c958dd

Please sign in to comment.