Fix add features #2754

Merged

merged 28 commits into master from fix-add-features on Oct 26, 2020

Changes from 6 commits

Commits (28)
a9974b6 fix subset bug (guolinke, Feb 7, 2020)
1e9af5b typo (guolinke, Feb 7, 2020)
b6bb36c add fixme tag (guolinke, Feb 7, 2020)
3741002 bin mapper (guolinke, Feb 7, 2020)
acccd9f fix test (guolinke, Feb 7, 2020)
bd33f8a fix add_features_from (guolinke, Feb 8, 2020)
5d7dbf1 fixed conflict (StrikerRUS, Feb 8, 2020)
706d1dd Update dataset.cpp (guolinke, Feb 10, 2020)
85a9e3f Merge branch 'master' into fix-add-features (guolinke, Feb 10, 2020)
fe506a0 fix merge bug (guolinke, Feb 10, 2020)
0f57239 Merge branch 'master' into fix-add-features (guolinke, Feb 15, 2020)
a086c0d Merge branch 'master' into fix-add-features (guolinke, Feb 19, 2020)
604c41e Merge branch 'master' into fix-add-features (guolinke, Feb 23, 2020)
278031b Merge branch 'master' into fix-add-features (guolinke, Mar 4, 2020)
6a07005 added Python merge code (StrikerRUS, Mar 4, 2020)
16dae64 Merge branch 'master' into fix-add-features (StrikerRUS, Mar 6, 2020)
3cb29ad added test for add_features (StrikerRUS, Apr 19, 2020)
f6b440a merge from master (guolinke, Apr 25, 2020)
7799f9a Merge branch 'master' into fix-add-features (StrikerRUS, Jun 11, 2020)
98ed3e5 Update dataset.cpp (guolinke, Jul 7, 2020)
2874ba3 Merge branch 'master' into fix-add-features (StrikerRUS, Jul 7, 2020)
4b20942 Update src/io/dataset.cpp (guolinke, Jul 12, 2020)
1427314 Merge branch 'master' into fix-add-features (guolinke, Aug 5, 2020)
26dfd5e Merge branch 'master' into fix-add-features (StrikerRUS, Sep 6, 2020)
79666fb Merge branch 'master' into fix-add-features (StrikerRUS, Oct 5, 2020)
32526e5 continue implementing (StrikerRUS, Oct 5, 2020)
833b351 warn users about categorical features (StrikerRUS, Oct 5, 2020)
c2c3bf3 Merge branch 'master' into fix-add-features (guolinke, Oct 26, 2020)
231 changes: 153 additions & 78 deletions include/LightGBM/feature_group.h
@@ -1,6 +1,7 @@
/*!
* Copyright (c) 2017 Microsoft Corporation. All rights reserved.
* Licensed under the MIT License. See LICENSE file in the project root for license information.
* Licensed under the MIT License. See LICENSE file in the project root for
* license information.
*/
#ifndef LIGHTGBM_FEATURE_GROUP_H_
#define LIGHTGBM_FEATURE_GROUP_H_
@@ -17,22 +18,27 @@ namespace LightGBM {

class Dataset;
class DatasetLoader;
/*! \brief Using to store data and providing some operations on one feature group*/
/*! \brief Using to store data and providing some operations on one feature
* group*/
class FeatureGroup {
public:
friend Dataset;
friend DatasetLoader;
/*!
* \brief Constructor
* \param num_feature number of features of this group
* \param bin_mappers Bin mapper for features
* \param num_data Total number of data
* \param is_enable_sparse True if enable sparse feature
* \param sparse_threshold Threshold for treating a feature as a sparse feature
*/
* \brief Constructor
* \param num_feature number of features of this group
* \param bin_mappers Bin mapper for features
* \param num_data Total number of data
* \param is_enable_sparse True if enable sparse feature
* \param sparse_threshold Threshold for treating a feature as a sparse
* feature
*/
FeatureGroup(int num_feature, bool is_multi_val,
std::vector<std::unique_ptr<BinMapper>>* bin_mappers,
data_size_t num_data) : num_feature_(num_feature), is_multi_val_(is_multi_val), is_sparse_(false) {
std::vector<std::unique_ptr<BinMapper>>* bin_mappers,
data_size_t num_data)
: num_feature_(num_feature),
is_multi_val_(is_multi_val),
is_sparse_(false) {
CHECK(static_cast<int>(bin_mappers->size()) == num_feature);
// use bin at zero to store most_freq_bin
num_total_bin_ = 1;
@@ -46,23 +52,26 @@ class FeatureGroup {
num_total_bin_ += num_bin;
bin_offsets_.emplace_back(num_total_bin_);
}
if (is_multi_val_) {
multi_bin_data_.clear();
for (int i = 0; i < num_feature_; ++i) {
int addi = bin_mappers_[i]->GetMostFreqBin() == 0 ? 0 : 1;
if (bin_mappers_[i]->sparse_rate() >= kSparseThreshold) {
multi_bin_data_.emplace_back(Bin::CreateSparseBin(num_data, bin_mappers_[i]->num_bin() + addi));
} else {
multi_bin_data_.emplace_back(Bin::CreateDenseBin(num_data, bin_mappers_[i]->num_bin() + addi));
}
}
} else {
bin_data_.reset(Bin::CreateDenseBin(num_data, num_total_bin_));
CreateBinData(num_data, is_multi_val_, true, false);
}

FeatureGroup(const FeatureGroup& other, int num_data) {
num_feature_ = other.num_feature_;
is_multi_val_ = other.is_multi_val_;
is_sparse_ = other.is_sparse_;
num_total_bin_ = other.num_total_bin_;
bin_offsets_ = other.bin_offsets_;

bin_mappers_.reserve(other.bin_mappers_.size());
for (auto& bin_mapper : other.bin_mappers_) {
bin_mappers_.emplace_back(new BinMapper(*bin_mapper));
}
CreateBinData(num_data, is_multi_val_, !is_sparse_, is_sparse_);
}

FeatureGroup(std::vector<std::unique_ptr<BinMapper>>* bin_mappers,
data_size_t num_data) : num_feature_(1), is_multi_val_(false) {
data_size_t num_data)
: num_feature_(1), is_multi_val_(false) {
CHECK(static_cast<int>(bin_mappers->size()) == 1);
// use bin at zero to store default_bin
num_total_bin_ = 1;
@@ -76,23 +85,17 @@ class FeatureGroup {
num_total_bin_ += num_bin;
bin_offsets_.emplace_back(num_total_bin_);
}
if (bin_mappers_[0]->sparse_rate() >= kSparseThreshold) {
is_sparse_ = true;
bin_data_.reset(Bin::CreateSparseBin(num_data, num_total_bin_));
} else {
is_sparse_ = false;
bin_data_.reset(Bin::CreateDenseBin(num_data, num_total_bin_));
}
CreateBinData(num_data, false, false, false);
}

/*!
* \brief Constructor from memory
* \param memory Pointer of memory
* \param num_all_data Number of global data
* \param local_used_indices Local used indices, empty means using all data
*/
* \brief Constructor from memory
* \param memory Pointer of memory
* \param num_all_data Number of global data
* \param local_used_indices Local used indices, empty means using all data
*/
FeatureGroup(const void* memory, data_size_t num_all_data,
const std::vector<data_size_t>& local_used_indices) {
const std::vector<data_size_t>& local_used_indices) {
const char* memory_ptr = reinterpret_cast<const char*>(memory);
// get is_sparse
is_multi_val_ = *(reinterpret_cast<const bool*>(memory_ptr));
@@ -125,9 +128,11 @@ class FeatureGroup {
for (int i = 0; i < num_feature_; ++i) {
int addi = bin_mappers_[i]->GetMostFreqBin() == 0 ? 0 : 1;
if (bin_mappers_[i]->sparse_rate() >= kSparseThreshold) {
multi_bin_data_.emplace_back(Bin::CreateSparseBin(num_data, bin_mappers_[i]->num_bin() + addi));
multi_bin_data_.emplace_back(Bin::CreateSparseBin(
num_data, bin_mappers_[i]->num_bin() + addi));
} else {
multi_bin_data_.emplace_back(Bin::CreateDenseBin(num_data, bin_mappers_[i]->num_bin() + addi));
multi_bin_data_.emplace_back(
Bin::CreateDenseBin(num_data, bin_mappers_[i]->num_bin() + addi));
}
multi_bin_data_.back()->LoadFromMemory(memory_ptr, local_used_indices);
memory_ptr += multi_bin_data_.back()->SizesInByte();
@@ -143,18 +148,20 @@ class FeatureGroup {
}
}
/*! \brief Destructor */
~FeatureGroup() {
}
~FeatureGroup() {}

/*!
* \brief Push one record, will auto convert to bin and push to bin data
* \param tid Thread id
* \param idx Index of record
* \param value feature value of record
*/
inline void PushData(int tid, int sub_feature_idx, data_size_t line_idx, double value) {
* \brief Push one record, will auto convert to bin and push to bin data
* \param tid Thread id
* \param idx Index of record
* \param value feature value of record
*/
inline void PushData(int tid, int sub_feature_idx, data_size_t line_idx,
double value) {
uint32_t bin = bin_mappers_[sub_feature_idx]->ValueToBin(value);
if (bin == bin_mappers_[sub_feature_idx]->GetMostFreqBin()) { return; }
if (bin == bin_mappers_[sub_feature_idx]->GetMostFreqBin()) {
return;
}
if (bin_mappers_[sub_feature_idx]->GetMostFreqBin() == 0) {
bin -= 1;
}
@@ -166,16 +173,47 @@ class FeatureGroup {
}
}

inline void CopySubset(const FeatureGroup* full_feature, const data_size_t* used_indices, data_size_t num_used_indices) {
void ReSize(int num_data) {
if (!is_multi_val_) {
bin_data_->CopySubset(full_feature->bin_data_.get(), used_indices, num_used_indices);
bin_data_->ReSize(num_data);
} else {
for (int i = 0; i < num_feature_; ++i) {
multi_bin_data_[i]->CopySubset(full_feature->multi_bin_data_[i].get(), used_indices, num_used_indices);
multi_bin_data_[i]->ReSize(num_data);
}
}
}

inline void CopySubset(const FeatureGroup* full_feature,
const data_size_t* used_indices,
data_size_t num_used_indices) {
if (!is_multi_val_) {
bin_data_->CopySubset(full_feature->bin_data_.get(), used_indices,
num_used_indices);
} else {
for (int i = 0; i < num_feature_; ++i) {
multi_bin_data_[i]->CopySubset(full_feature->multi_bin_data_[i].get(),
used_indices, num_used_indices);
}
}
}

void AddFeaturesFrom(const FeatureGroup* other) {
CHECK(is_multi_val_);
CHECK(other->is_multi_val_);
for (int i = 0; i < other->num_feature_; ++i) {
const auto& other_bin_mapper = other->bin_mappers_[i];
bin_mappers_.emplace_back(new BinMapper(*other_bin_mapper));
auto num_bin = other_bin_mapper->num_bin();
if (other_bin_mapper->GetMostFreqBin() == 0) {
num_bin -= 1;
}
num_total_bin_ += num_bin;
bin_offsets_.emplace_back(num_total_bin_);
multi_bin_data_.emplace_back(other->multi_bin_data_[i]->Clone());
}
num_feature_ += other->num_feature_;
}

inline BinIterator* SubFeatureIterator(int sub_feature) {
uint32_t most_freq_bin = bin_mappers_[sub_feature]->GetMostFreqBin();
if (!is_multi_val_) {
@@ -186,14 +224,15 @@ class FeatureGroup {
int addi = bin_mappers_[sub_feature]->GetMostFreqBin() == 0 ? 0 : 1;
uint32_t min_bin = 1;
uint32_t max_bin = bin_mappers_[sub_feature]->num_bin() - 1 + addi;
return multi_bin_data_[sub_feature]->GetIterator(min_bin, max_bin, most_freq_bin);
return multi_bin_data_[sub_feature]->GetIterator(min_bin, max_bin,
most_freq_bin);
}
}

inline void FinishLoad() {
if (is_multi_val_) {
OMP_INIT_EX();
#pragma omp parallel for schedule(guided)
#pragma omp parallel for schedule(guided)
for (int i = 0; i < num_feature_; ++i) {
OMP_LOOP_EX_BEGIN();
multi_bin_data_[i]->FinishLoad();
@@ -220,51 +259,57 @@ class FeatureGroup {
return bin_data_->GetIterator(min_bin, max_bin, most_freq_bin);
}

inline data_size_t Split(
int sub_feature,
const uint32_t* threshold,
int num_threshold,
bool default_left,
data_size_t* data_indices, data_size_t num_data,
data_size_t* lte_indices, data_size_t* gt_indices) const {
inline data_size_t Split(int sub_feature, const uint32_t* threshold,
int num_threshold, bool default_left,
data_size_t* data_indices, data_size_t num_data,
data_size_t* lte_indices,
data_size_t* gt_indices) const {
uint32_t default_bin = bin_mappers_[sub_feature]->GetDefaultBin();
uint32_t most_freq_bin = bin_mappers_[sub_feature]->GetMostFreqBin();
if (!is_multi_val_) {
uint32_t min_bin = bin_offsets_[sub_feature];
uint32_t max_bin = bin_offsets_[sub_feature + 1] - 1;
if (bin_mappers_[sub_feature]->bin_type() == BinType::NumericalBin) {
auto missing_type = bin_mappers_[sub_feature]->missing_type();
return bin_data_->Split(min_bin, max_bin, default_bin, most_freq_bin, missing_type, default_left,
*threshold, data_indices, num_data, lte_indices, gt_indices);
return bin_data_->Split(min_bin, max_bin, default_bin, most_freq_bin,
missing_type, default_left, *threshold,
data_indices, num_data, lte_indices,
gt_indices);
} else {
return bin_data_->SplitCategorical(min_bin, max_bin, most_freq_bin, threshold, num_threshold, data_indices, num_data, lte_indices, gt_indices);
return bin_data_->SplitCategorical(
min_bin, max_bin, most_freq_bin, threshold, num_threshold,
data_indices, num_data, lte_indices, gt_indices);
}
} else {
int addi = bin_mappers_[sub_feature]->GetMostFreqBin() == 0 ? 0 : 1;
uint32_t min_bin = 1;
uint32_t max_bin = bin_mappers_[sub_feature]->num_bin() - 1 + addi;
if (bin_mappers_[sub_feature]->bin_type() == BinType::NumericalBin) {
auto missing_type = bin_mappers_[sub_feature]->missing_type();
return multi_bin_data_[sub_feature]->Split(min_bin, max_bin, default_bin, most_freq_bin, missing_type, default_left,
*threshold, data_indices, num_data, lte_indices, gt_indices);
return multi_bin_data_[sub_feature]->Split(
min_bin, max_bin, default_bin, most_freq_bin, missing_type,
default_left, *threshold, data_indices, num_data, lte_indices,
gt_indices);
} else {
return multi_bin_data_[sub_feature]->SplitCategorical(min_bin, max_bin, most_freq_bin, threshold, num_threshold, data_indices, num_data, lte_indices, gt_indices);
return multi_bin_data_[sub_feature]->SplitCategorical(
min_bin, max_bin, most_freq_bin, threshold, num_threshold,
data_indices, num_data, lte_indices, gt_indices);
}
}
}
/*!
* \brief From bin to feature value
* \param bin
* \return FeatureGroup value of this bin
*/
* \brief From bin to feature value
* \param bin
* \return FeatureGroup value of this bin
*/
inline double BinToValue(int sub_feature_idx, uint32_t bin) const {
return bin_mappers_[sub_feature_idx]->BinToValue(bin);
}

/*!
* \brief Save binary data to file
* \param file File want to write
*/
* \brief Save binary data to file
* \param file File want to write
*/
void SaveBinaryToFile(const VirtualFileWriter* writer) const {
writer->Write(&is_multi_val_, sizeof(is_multi_val_));
writer->Write(&is_sparse_, sizeof(is_sparse_));
@@ -281,10 +326,11 @@ class FeatureGroup {
}
}
/*!
* \brief Get sizes in byte of this object
*/
* \brief Get sizes in byte of this object
*/
size_t SizesInByte() const {
size_t ret = sizeof(is_multi_val_) + sizeof(is_sparse_) + sizeof(num_feature_);
size_t ret =
sizeof(is_multi_val_) + sizeof(is_sparse_) + sizeof(num_feature_);
for (int i = 0; i < num_feature_; ++i) {
ret += bin_mappers_[i]->SizesInByte();
}
@@ -297,6 +343,7 @@ class FeatureGroup {
}
return ret;
}

/*! \brief Disable copy */
FeatureGroup& operator=(const FeatureGroup&) = delete;
/*! \brief Deep copy */
@@ -322,6 +369,35 @@ class FeatureGroup {
}

private:
void CreateBinData(int num_data, bool is_multi_val, bool force_dense,
bool force_sparse) {
if (is_multi_val) {
multi_bin_data_.clear();
for (int i = 0; i < num_feature_; ++i) {
int addi = bin_mappers_[i]->GetMostFreqBin() == 0 ? 0 : 1;
if (bin_mappers_[i]->sparse_rate() >= kSparseThreshold) {
multi_bin_data_.emplace_back(Bin::CreateSparseBin(
num_data, bin_mappers_[i]->num_bin() + addi));
} else {
multi_bin_data_.emplace_back(
Bin::CreateDenseBin(num_data, bin_mappers_[i]->num_bin() + addi));
}
}
is_multi_val_ = true;
} else {
if (force_sparse ||
(!force_dense && num_feature_ == 1 &&
bin_mappers_[0]->sparse_rate() >= kSparseThreshold)) {
is_sparse_ = true;
bin_data_.reset(Bin::CreateSparseBin(num_data, num_total_bin_));
} else {
is_sparse_ = false;
bin_data_.reset(Bin::CreateDenseBin(num_data, num_total_bin_));
}
is_multi_val_ = false;
}
}

/*! \brief Number of features */
int num_feature_;
/*! \brief Bin mapper for sub features */
@@ -337,7 +413,6 @@ class FeatureGroup {
int num_total_bin_;
};


} // namespace LightGBM

#endif // LIGHTGBM_FEATURE_GROUP_H_
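
Note on the header changes above: AddFeaturesFrom appends the other group's bin mappers and extends bin_offsets_ while keeping bin 0 reserved for the most frequent bin, and the new CreateBinData helper centralizes the dense/sparse storage choice that the constructors previously duplicated. The following minimal Python sketch models only the offset bookkeeping; ToyFeatureGroup and its (num_bin, most_freq_bin) tuples are hypothetical stand-ins for illustration, not LightGBM API.

class ToyFeatureGroup:
    """Toy model of FeatureGroup's bin-offset bookkeeping (illustrative only)."""

    def __init__(self, bin_mappers):
        # bin_mappers: list of (num_bin, most_freq_bin) pairs
        self.bin_mappers = list(bin_mappers)
        self.num_total_bin = 1  # bin 0 stores the most frequent bin
        self.bin_offsets = [self.num_total_bin]
        for num_bin, most_freq_bin in self.bin_mappers:
            if most_freq_bin == 0:
                num_bin -= 1  # the most frequent bin is folded into bin 0
            self.num_total_bin += num_bin
            self.bin_offsets.append(self.num_total_bin)

    def add_features_from(self, other):
        # Mirrors the C++ loop: copy each mapper, then extend the offsets.
        for num_bin, most_freq_bin in other.bin_mappers:
            self.bin_mappers.append((num_bin, most_freq_bin))
            if most_freq_bin == 0:
                num_bin -= 1
            self.num_total_bin += num_bin
            self.bin_offsets.append(self.num_total_bin)

a = ToyFeatureGroup([(10, 0), (5, 3)])
b = ToyFeatureGroup([(4, 0)])
a.add_features_from(b)
print(a.bin_offsets)  # [1, 10, 15, 18]

This preserves the range convention the non-multi-val path of Split uses above: sub-feature i spans bins bin_offsets_[i] through bin_offsets_[i + 1] - 1.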
5 changes: 5 additions & 0 deletions python-package/lightgbm/basic.py
@@ -1640,6 +1640,11 @@ def add_features_from(self, other):
        if self.handle is None or other.handle is None:
            raise ValueError('Both source and target Datasets must be constructed before adding features')
        _safe_call(_LIB.LGBM_DatasetAddFeaturesFrom(self.handle, other.handle))
        if other.data is None:
            self.data = None
        if self.data is not None:
            # FIXME: concat two dataset
            self.data = [self.data, other.data]
        return self

    def _dump_text(self, filename):
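
For context, a hedged usage sketch of the behavior this hunk implements: both Datasets must be constructed before merging (hence the ValueError), and at this commit the merged raw data is only stashed as a list pending the FIXME above. The keyword arguments follow the mainline lightgbm API and are assumptions for this revision.

import numpy as np
import lightgbm as lgb

X1 = np.random.rand(100, 3)
X2 = np.random.rand(100, 2)
y = np.random.rand(100)

# free_raw_data=False keeps .data around, which the hunk above merges
d1 = lgb.Dataset(X1, label=y, free_raw_data=False).construct()
d2 = lgb.Dataset(X2, free_raw_data=False).construct()

d1.add_features_from(d2)
print(d1.num_feature())  # 5 features after the merge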