diff --git a/c/extras/dt_ftrl.cc b/c/extras/dt_ftrl.cc
index ad35b8fd80..5ef0f6d953 100644
--- a/c/extras/dt_ftrl.cc
+++ b/c/extras/dt_ftrl.cc
@@ -54,8 +54,8 @@ Ftrl::Ftrl(FtrlParams params_in) :
 void Ftrl::fit(const DataTable* dt_X, const DataTable* dt_y) {
   define_features(dt_X->ncols);
-  if (!is_dt_valid(dt_model, params.d, 2)) create_model();
-  if (!is_dt_valid(dt_fi, nfeatures, 1)) create_fi();
+  is_dt_valid(dt_model, params.d, 2)? init_weights() : create_model();
+  is_dt_valid(dt_fi, nfeatures, 1)? init_fi() : create_fi();
 
   // Create column hashers.
   create_hashers(dt_X);
 
@@ -63,11 +63,10 @@ void Ftrl::fit(const DataTable* dt_X, const DataTable* dt_y) {
   // Get the target column.
   auto c_y = static_cast<BoolColumn*>(dt_y->columns[0]);
   auto d_y = c_y->elements_r();
+  const RowIndex ri_y = c_y->rowindex();
 
   // Do training for `nepochs`.
-  for (size_t i = 0; i < params.nepochs; ++i) {
-    double total_loss = 0;
-
+  for (size_t e = 0; e < params.nepochs; ++e) {
     #pragma omp parallel num_threads(config::nthreads)
     {
       // Array to store hashed column values and their interactions.
@@ -75,21 +74,14 @@ void Ftrl::fit(const DataTable* dt_X, const DataTable* dt_y) {
       size_t ith = static_cast<size_t>(omp_get_thread_num());
       size_t nth = static_cast<size_t>(omp_get_num_threads());
 
-      for (size_t j = ith; j < dt_X->nrows; j += nth) {
-        if (ISNA(d_y[j])) continue;
-        bool y = d_y[j];
-        hash_row(x, j);
-        double p = predict_row(x);
-        update(x, p, y);
-
-        double loss = logloss(p, y);
-        #pragma omp atomic update
-        total_loss += loss;
-        if ((j+1) % REPORT_FREQUENCY == 0) {
-          printf("Training epoch: %zu\tRow: %zu\tPrediction: %f\t"
-                 "Current loss: %f\tAverage loss: %f\n",
-                 i, j+1, p, loss, total_loss / (j+1));
-        }
+      for (size_t i = ith; i < dt_X->nrows; i += nth) {
+        size_t j = ri_y[i];
+        if (j != RowIndex::NA && !ISNA(d_y[j])) {
+          bool y = d_y[j];
+          hash_row(x, i);
+          double p = predict_row(x);
+          update(x, p, y);
+        }
       }
     }
   }
@@ -104,7 +96,8 @@ void Ftrl::fit(const DataTable* dt_X, const DataTable* dt_y) {
 dtptr Ftrl::predict(const DataTable* dt_X) {
   xassert(model_trained);
   define_features(dt_X->ncols);
-  if (is_dt_valid(dt_fi, nfeatures, 1)) create_fi();
+  init_weights();
+  is_dt_valid(dt_fi, nfeatures, 1)? init_fi() : create_fi();
 
   // Re-create hashers as stypes for training dataset and predictions
   // may be different
@@ -122,12 +115,9 @@ dtptr Ftrl::predict(const DataTable* dt_X) {
     size_t ith = static_cast<size_t>(omp_get_thread_num());
     size_t nth = static_cast<size_t>(omp_get_num_threads());
 
-    for (size_t j = ith; j < dt_X->nrows; j += nth) {
-      hash_row(x, j);
-      d_y[j] = predict_row(x);
-      if ((j+1) % REPORT_FREQUENCY == 0) {
-        printf("Row: %zu\tPrediction: %f\n", j+1, d_y[j]);
-      }
+    for (size_t i = ith; i < dt_X->nrows; i += nth) {
+      hash_row(x, i);
+      d_y[i] = predict_row(x);
     }
   }
   return dt_y;
@@ -175,12 +165,13 @@ void Ftrl::create_model() {
   Column* col_z = Column::new_data_column(SType::FLOAT64, params.d);
   Column* col_n = Column::new_data_column(SType::FLOAT64, params.d);
   dt_model = dtptr(new DataTable({col_z, col_n}, model_colnames));
-  init_weights();
+  w = doubleptr(new double[params.d]());
   reset_model();
 }
 
 
 void Ftrl::reset_model() {
+  init_weights();
   if (z == nullptr || n == nullptr) return;
   std::memset(z, 0, params.d * sizeof(double));
   std::memset(n, 0, params.d * sizeof(double));
@@ -192,14 +183,12 @@ void Ftrl::init_weights() {
   if (dt_model == nullptr) return;
   z = static_cast<double*>(dt_model->columns[0]->data_w());
   n = static_cast<double*>(dt_model->columns[1]->data_w());
-  w = doubleptr(new double[params.d]());
 }
 
 
 void Ftrl::create_fi() {
   Column* col_fi = Column::new_data_column(SType::FLOAT64, nfeatures);
   dt_fi = dtptr(new DataTable({col_fi}, {"feature_importance"}));
-  init_fi();
   reset_fi();
 }
 
@@ -211,6 +200,7 @@ void Ftrl::init_fi() {
 
 
 void Ftrl::reset_fi() {
+  init_fi();
   if (fi == nullptr) return;
   std::memset(fi, 0, nfeatures * sizeof(double));
 }
@@ -422,6 +412,7 @@ size_t Ftrl::get_nepochs() {
 void Ftrl::set_model(DataTable* dt_model_in) {
   dt_model = dtptr(dt_model_in->copy());
   set_d(dt_model->nrows);
+  w = doubleptr(new double[params.d]());
   init_weights();
   ncols = 0;
   nfeatures = 0;
diff --git a/c/extras/dt_ftrl.h b/c/extras/dt_ftrl.h
index 83202fe2ed..546973282c 100644
--- a/c/extras/dt_ftrl.h
+++ b/c/extras/dt_ftrl.h
@@ -28,7 +28,6 @@
 using hashptr = std::unique_ptr<Hash>;
 using doubleptr = std::unique_ptr<double[]>;
 using uint64ptr = std::unique_ptr<uint64_t[]>;
-#define REPORT_FREQUENCY 1000
 
 
 namespace dt {
diff --git a/c/extras/hash.cc b/c/extras/hash.cc
index ad331bb54a..a73281bb84 100644
--- a/c/extras/hash.cc
+++ b/c/extras/hash.cc
@@ -21,15 +21,17 @@
 //------------------------------------------------------------------------------
 #include "extras/hash.h"
 
 
+Hash::Hash(const Column* col) : ri(col->rowindex()) {}
 Hash::~Hash() {}
-
-HashBool::HashBool(const Column* col) {
+HashBool::HashBool(const Column* col) : Hash(col) {
   values = dynamic_cast<const BoolColumn*>(col)->elements_r();
 }
 
 
 uint64_t HashBool::hash(size_t row) const {
-  uint64_t h = static_cast<uint64_t>(values[row]);
+  size_t i = ri[row];
+  bool value = (i == RowIndex::NA)? GETNA<bool>() : values[i];
+  uint64_t h = static_cast<uint64_t>(value);
   return h;
 }
diff --git a/c/extras/hash.h b/c/extras/hash.h
index 0991a57e69..22a2172cf0 100644
--- a/c/extras/hash.h
+++ b/c/extras/hash.h
@@ -30,7 +30,10 @@
  */
 class Hash {
   public:
+    explicit Hash(const Column*);
    virtual ~Hash();
+
+    const RowIndex ri;
    virtual uint64_t hash(size_t row) const = 0;
 };
 
@@ -61,14 +64,16 @@ class HashInt : public Hash {
 
 
 template <typename T>
-HashInt<T>::HashInt(const Column* col) {
+HashInt<T>::HashInt(const Column* col) : Hash(col) {
   values = dynamic_cast<const IntColumn<T>*>(col)->elements_r();
 }
 
 
 template <typename T>
 uint64_t HashInt<T>::hash(size_t row) const {
-  uint64_t h = static_cast<uint64_t>(values[row]);
+  size_t i = ri[row];
+  T value = (i == RowIndex::NA)? GETNA<T>() : values[i];
+  uint64_t h = static_cast<uint64_t>(value);
   return h;
 }
 
@@ -87,14 +92,16 @@ class HashFloat : public Hash {
 
 
 template <typename T>
-HashFloat<T>::HashFloat(const Column* col) {
+HashFloat<T>::HashFloat(const Column* col) : Hash(col){
   values = dynamic_cast<const RealColumn<T>*>(col)->elements_r();
 }
 
 
 template <typename T>
 uint64_t HashFloat<T>::hash(size_t row) const {
-  auto x = static_cast<double>(values[row]);
+  size_t i = ri[row];
+  T value = (i == RowIndex::NA)? GETNA<T>() : values[i];
+  auto x = static_cast<double>(value);
   uint64_t* h = reinterpret_cast<uint64_t*>(&x);
   return *h;
 }
@@ -115,7 +122,7 @@ class HashString : public Hash {
 
 
 template <typename T>
-HashString<T>::HashString(const Column* col) {
+HashString<T>::HashString(const Column* col) : Hash(col){
   auto scol = dynamic_cast<const StringColumn<T>*>(col);
   strdata = scol->strdata();
   offsets = scol->offsets();
@@ -124,11 +131,18 @@ HashString<T>::HashString(const Column* col) {
 
 template <typename T>
 uint64_t HashString<T>::hash(size_t row) const {
-  const T strstart = offsets[row - 1] & ~GETNA<T>();
-  const char* c_str = strdata + strstart;
-  T len = offsets[row] - strstart;
-  uint64_t h = hash_murmur2(c_str, len * sizeof(char), 0);
-  return h;
+  size_t i = ri[row];
+  if (i == RowIndex::NA) {
+    return 0; // Return 0 hash for NA rowindex
+  } else {
+    const T strstart = offsets[i - 1] & ~GETNA<T>();
+    const char* c_str = strdata + strstart;
+    if (ISNA(offsets[i])) {
+      return 0; // Return 0 hash for NA strings
+    }
+    T len = offsets[i] - strstart;
+    return hash_murmur2(c_str, len * sizeof(char), 0);
+  }
 }
 
 #endif
diff --git a/tests/extras/test_ftrl.py b/tests/extras/test_ftrl.py
index bbc9c1c3fb..5d4454f5ef 100644
--- a/tests/extras/test_ftrl.py
+++ b/tests/extras/test_ftrl.py
@@ -576,8 +576,8 @@ def test_ftrl_fit_predict_float():
 
 def test_ftrl_fit_predict_string():
     ft = Ftrl(alpha = 0.1, nepochs = 10000)
-    df_train = dt.Frame([["Monday", "Tuesday"]])
-    df_target = dt.Frame([[True, False]])
+    df_train = dt.Frame([["Monday", None, "", "Tuesday"]])
+    df_target = dt.Frame([[True, False, False, True]])
     ft.fit(df_train, df_target)
     df_target = ft.predict(df_train[:,0])
     assert df_target[0, 0] <= 1
@@ -606,6 +606,29 @@ def test_ftrl_fit_predict_from_setters():
     assert_equals(target1, target2)
 
 
+def test_ftrl_fit_predict_view():
+    ft = Ftrl(d=100)
+    # Generate unique numbers, so that this test can be run in parallel
+    df_train = dt.Frame(random.sample(range(ft.d), ft.d))
+    df_target = dt.Frame([bool(random.getrandbits(1)) for _ in range(ft.d)])
+    rows = range(ft.d//2, ft.d)
+
+    # Train model and predict on a view
+    ft.fit(df_train[rows,:], df_target[rows,:])
+    predictions = ft.predict(df_train[rows,:])
+    model = dt.Frame(ft.model.to_dict())
+
+    # Train model and predict on a frame
+    ft.reset()
+    df_train_range = dt.Frame(df_train[rows,:].to_list())
+    df_target_range = dt.Frame(df_target[rows,:].to_list())
+    ft.fit(df_train_range, df_target_range)
+    predictions_half = ft.predict(df_train_range)
+
+    assert_equals(model, ft.model)
+    assert_equals(predictions, predictions_half)
+
+
 #-------------------------------------------------------------------------------
 # Test feature importance
 #-------------------------------------------------------------------------------